pax_global_header00006660000000000000000000000064146670725410014526gustar00rootroot0000000000000052 comment=04b24da13be581ef9e827f46c634671bb652a445 pystow-0.5.5/000077500000000000000000000000001466707254100131025ustar00rootroot00000000000000pystow-0.5.5/.bumpversion.cfg000066400000000000000000000015251466707254100162150ustar00rootroot00000000000000[bumpversion] current_version = 0.5.5 commit = True tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(?:-(?P[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*))?(?:\+(?P[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*))? serialize = {major}.{minor}.{patch}-{release}+{build} {major}.{minor}.{patch}+{build} {major}.{minor}.{patch}-{release} {major}.{minor}.{patch} [bumpversion:part:release] optional_value = production first_value = dev values = dev production [bumpverion:part:build] values = [0-9A-Za-z-]+ [bumpversion:file:setup.cfg] search = version = {current_version} replace = version = {new_version} [bumpversion:file:docs/source/conf.py] search = release = '{current_version}' replace = release = '{new_version}' [bumpversion:file:src/pystow/version.py] search = VERSION = "{current_version}" replace = VERSION = "{new_version}" pystow-0.5.5/.github/000077500000000000000000000000001466707254100144425ustar00rootroot00000000000000pystow-0.5.5/.github/workflows/000077500000000000000000000000001466707254100164775ustar00rootroot00000000000000pystow-0.5.5/.github/workflows/tests.yml000066400000000000000000000042401466707254100203640ustar00rootroot00000000000000name: Tests on: push: branches: [ main ] pull_request: branches: [ main ] jobs: lint: name: Lint runs-on: ubuntu-latest strategy: matrix: python-version: [ "3.12", "3.8" ] steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: pip install tox tox-uv - name: Check manifest run: tox -e manifest - name: Check code quality with flake8 run: tox -e flake8 - name: Check 
package metadata with Pyroma run: tox -e pyroma - name: Check static typing with MyPy run: tox -e mypy docs: name: Documentation runs-on: ubuntu-latest strategy: matrix: python-version: [ "3.12", "3.8" ] steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: pip install tox tox-uv - name: Check RST conformity with doc8 run: tox -e doc8 - name: Check docstring coverage run: tox -e docstr-coverage - name: Check documentation build with Sphinx run: tox -e docs-test tests: name: Tests runs-on: ${{ matrix.os }} strategy: matrix: os: [ ubuntu-latest, windows-latest, macos-latest ] python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: pip install tox tox-uv - name: Test with pytest run: tox -e py - name: Build codecov file if: success() run: tox -e coverage-xml - name: Upload coverage report to codecov uses: codecov/codecov-action@v1 if: success() with: file: coverage.xml pystow-0.5.5/.gitignore000066400000000000000000000034571466707254100151030ustar00rootroot00000000000000# Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # Auto-generated docs docs/source/api/ pystow-0.5.5/.readthedocs.yml000066400000000000000000000004161466707254100161710ustar00rootroot00000000000000# See: https://docs.readthedocs.io/en/latest/config-file/v2.html version: 2 build: image: latest python: version: "3.8" install: - method: pip path: . 
extra_requirements: - docs - rdf - pandas - xml - aws pystow-0.5.5/LICENSE000066400000000000000000000020641466707254100141110ustar00rootroot00000000000000MIT License Copyright (c) 2021 Charles Tapley Hoyt Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. pystow-0.5.5/MANIFEST.in000066400000000000000000000006361466707254100146450ustar00rootroot00000000000000graft src graft tests recursive-include docs Makefile recursive-include docs/source *.py recursive-include docs/source *.rst recursive-include docs/source *.png global-exclude *.py[cod] __pycache__ *.so *.dylib .DS_Store *.gpickle exclude .appveyor.yml .bumpversion.cfg .coveragerc .flake8 .travis.yml .readthedocs.yml tox.ini .pre-commit-config.yaml exclude .appveyor_on_request.yml include LICENSE *.md *.rst pystow-0.5.5/README.md000066400000000000000000000156611466707254100143720ustar00rootroot00000000000000

PyStow

Build status PyPI - Python Version License Documentation Status DOI Code style: black

👜 Easily pick a place to store data for your python code. ## 🚀 Getting Started Get a directory for your application. ```python import pystow # Get a directory (as a pathlib.Path) for ~/.data/pykeen pykeen_directory = pystow.join('pykeen') # Get a subdirectory (as a pathlib.Path) for ~/.data/pykeen/experiments pykeen_experiments_directory = pystow.join('pykeen', 'experiments') # You can go as deep as you want pykeen_deep_directory = pystow.join('pykeen', 'experiments', 'a', 'b', 'c') ``` If you reuse the same directory structure a lot, you can save them in a module: ```python import pystow pykeen_module = pystow.module("pykeen") # Access the module's directory with .base assert pystow.join("pykeen") == pystow.module("pykeen").base # Get a subdirectory (as a pathlib.Path) for ~/.data/pykeen/experiments pykeen_experiments_directory = pykeen_module.join('experiments') # You can go as deep as you want past the original "pykeen" module pykeen_deep_directory = pykeen_module.join('experiments', 'a', 'b', 'c') ``` Get a file path for your application by adding the `name` keyword argument. This is made explicit so PyStow knows which parent directories to automatically create. This works with `pystow` or any module you create with `pystow.module`. 
```python import pystow # Get a directory (as a pathlib.Path) for ~/.data/indra/database.tsv indra_database_path = pystow.join('indra', 'database', name='database.tsv') ``` Ensure a file from the internet is available in your application's directory: ```python import pystow url = 'https://raw.githubusercontent.com/pykeen/pykeen/master/src/pykeen/datasets/nations/test.txt' path = pystow.ensure('pykeen', 'datasets', 'nations', url=url) ``` Ensure a tabular data file from the internet and load it for usage (requires `pip install pandas`): ```python import pystow import pandas as pd url = 'https://raw.githubusercontent.com/pykeen/pykeen/master/src/pykeen/datasets/nations/test.txt' df: pd.DataFrame = pystow.ensure_csv('pykeen', 'datasets', 'nations', url=url) ``` Ensure a comma-separated tabular data file from the internet and load it for usage (requires `pip install pandas`): ```python import pystow import pandas as pd url = 'https://raw.githubusercontent.com/cthoyt/pystow/main/tests/resources/test_1.csv' df: pd.DataFrame = pystow.ensure_csv('pykeen', 'datasets', 'nations', url=url, read_csv_kwargs=dict(sep=",")) ``` Ensure a RDF file from the internet and load it for usage (requires `pip install rdflib`) ```python import pystow import rdflib url = 'https://ftp.expasy.org/databases/rhea/rdf/rhea.rdf.gz' rdf_graph: rdflib.Graph = pystow.ensure_rdf('rhea', url=url) ``` Also see `pystow.ensure_excel()`, `pystow.ensure_rdf()`, `pystow.ensure_zip_df()`, and `pystow.ensure_tar_df()`. 
If your data comes with a lot of different files in an archive, you can ensure the archive is downloaded and get specific files from it: ```python import numpy as np import pystow url = "https://cloud.enterprise.informatik.uni-leipzig.de/index.php/s/LHPbMCre7SLqajB/download/MultiKE_D_Y_15K_V1.zip" # the path inside the archive to the file you want inner_path = "MultiKE/D_Y_15K_V1/721_5fold/1/20210219183115/ent_embeds.npy" with pystow.ensure_open_zip("kiez", url=url, inner_path=inner_path) as file: emb = np.load(file) ``` Also see `pystow.module.ensure_open_lzma()`, `pystow.module.ensure_open_tarfile()` and `pystow.module.ensure_open_gz()`. ## ⚙️️ Configuration By default, data is stored in the `$HOME/.data` directory. By default, the `` app will create the `$HOME/.data/` folder. If you want to use an alternate folder name to `.data` inside the home directory, you can set the `PYSTOW_NAME` environment variable. For example, if you set `PYSTOW_NAME=mydata`, then the following code for the `pykeen` app will create the `$HOME/mydata/pykeen/` directory: ```python import os import pystow # Only for demonstration purposes. You should set environment # variables either with your .bashrc or in the command line REPL. os.environ['PYSTOW_NAME'] = 'mydata' # Get a directory (as a pathlib.Path) for ~/mydata/pykeen pykeen_directory = pystow.join('pykeen') ``` If you want to specify a completely custom directory that isn't relative to your home directory, you can set the `PYSTOW_HOME` environment variable. For example, if you set `PYSTOW_HOME=/usr/local/`, then the following code for the `pykeen` app will create the `/usr/local/pykeen/` directory: ```python import os import pystow # Only for demonstration purposes. You should set environment # variables either with your .bashrc or in the command line REPL. 
os.environ['PYSTOW_HOME'] = '/usr/local/' # Get a directory (as a pathlib.Path) for /usr/local/pykeen pykeen_directory = pystow.join('pykeen') ``` Note: if you set `PYSTOW_HOME`, then `PYSTOW_NAME` is disregarded. ### X Desktop Group (XDG) Compatibility While PyStow's main goal is to make application data less opaque and less hidden, some users might want to use the [XDG specifications](http://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html) for storing their app data. If you set the environment variable `PYSTOW_USE_APPDIRS` to `true` or `True`, then the [`appdirs`](https://pypi.org/project/appdirs/) package will be used to choose the base directory based on the `user data dir` option. This can still be overridden by `PYSTOW_HOME`. ## 🚀 Installation The most recent release can be installed from [PyPI](https://pypi.org/project/pystow/) with: ```bash $ pip install pystow ``` Note, as of v0.3.0, Python 3.6 isn't officially supported (its end-of-life was in December 2021). For the time being, `pystow` might still work on py36, but this is only coincidental. The most recent code and data can be installed directly from GitHub with: ```bash $ pip install git+https://github.com/cthoyt/pystow.git ``` To install in development mode, use the following: ```bash $ git clone git+https://github.com/cthoyt/pystow.git $ cd pystow $ pip install -e . ``` ## ⚖️ License The code in this package is licensed under the MIT License. pystow-0.5.5/docs/000077500000000000000000000000001466707254100140325ustar00rootroot00000000000000pystow-0.5.5/docs/Makefile000066400000000000000000000011351466707254100154720ustar00rootroot00000000000000# Minimal makefile for Sphinx documentation # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build SPHINXPROJ = PyStow SOURCEDIR = source BUILDDIR = build # Put it first so that "make" without argument is like "make help". 
help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)pystow-0.5.5/docs/source/000077500000000000000000000000001466707254100153325ustar00rootroot00000000000000pystow-0.5.5/docs/source/cli.rst000066400000000000000000000003231466707254100166310ustar00rootroot00000000000000Command Line Interface ====================== pystow automatically installs the command :code:`pystow`. See :code:`pystow --help` for usage details. .. click:: pystow.cli:main :prog: pystow :show-nested: pystow-0.5.5/docs/source/conf.py000066400000000000000000000154421466707254100166370ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Configuration file for the Sphinx documentation builder. # # This file does only contain a selection of the most common options. For a # full list see the documentation: # http://www.sphinx-doc.org/en/master/config # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # import os import re import sys from datetime import date sys.path.insert(0, os.path.abspath('../../src')) # -- Project information ----------------------------------------------------- project = 'pystow' copyright = f'{date.today().year}, Charles Tapley Hoyt' author = 'Charles Tapley Hoyt' # The full version, including alpha/beta/rc tags. release = '0.5.5' # The short X.Y version. 
parsed_version = re.match( '(?P\d+)\.(?P\d+)\.(?P\d+)(?:-(?P[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*))?(?:\+(?P[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*))?', release, ) version = parsed_version.expand('\g.\g.\g') if parsed_version.group('release'): tags.add('prerelease') # -- General configuration --------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. # # needs_sphinx = '1.0' # If true, the current module name will be prepended to all description # unit titles (such as .. function::). add_module_names = False # A list of prefixes that are ignored when creating the module index. (new in Sphinx 0.6) modindex_common_prefix = ["pystow."] # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ 'sphinx.ext.autosummary', 'sphinx.ext.autodoc', 'sphinx.ext.coverage', 'sphinx.ext.intersphinx', "sphinx.ext.todo", 'sphinx.ext.mathjax', 'sphinx.ext.viewcode', 'sphinx_autodoc_typehints', 'sphinx_click.ext', 'sphinx_automodapi.automodapi', ] # generate autosummary pages # autosummary_generate = True # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] source_suffix = '.rst' # The master toctree document. master_doc = 'index' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. language = "en" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. 
exclude_patterns = [] # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = 'sphinx_rtd_theme' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # # html_theme_options = {} # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". # html_static_path = ['_static'] # Custom sidebar templates, must be a dictionary that maps document names # to template names. # # The default sidebars (for documents that don't match any pattern) are # defined by theme itself. Builtin themes are using these templates by # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', # 'searchbox.html']``. # # html_sidebars = {} # The name of an image file (relative to this directory) to place at the top # of the sidebar. # if os.path.exists('logo.png'): html_logo = 'logo.png' # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. htmlhelp_basename = 'PyStowdoc' # -- Options for LaTeX output ------------------------------------------------ # latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', # # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', # # Additional stuff for the LaTeX preamble. # # 'preamble': '', # # Latex figure (float) alignment # # 'figure_align': 'htbp', # } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). 
# latex_documents = [ # ( # master_doc, # 'pystow.tex', # 'PyStow Documentation', # author, # 'manual', # ), # ] # -- Options for manual page output ------------------------------------------ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ ( master_doc, 'pystow', 'PyStow Documentation', [author], 1, ), ] # -- Options for Texinfo output ---------------------------------------------- # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ ( master_doc, 'pystow', 'PyStow Documentation', author, 'Charles Tapley Hoyt', '👜 Easily pick a place to store data for your python package.', 'Miscellaneous', ), ] # -- Options for Epub output ------------------------------------------------- # Bibliographic Dublin Core info. # epub_title = project # The unique identifier of the text. This can be a ISBN number # or the project homepage. # # epub_identifier = '' # A unique identification for the text. # # epub_uid = '' # A list of files that should not be packed into the epub file. # epub_exclude_files = ['search.html'] # -- Extension configuration ------------------------------------------------- # -- Options for intersphinx extension --------------------------------------- # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = { "python": ('https://docs.python.org/3/', None), 'rdflib': ('https://rdflib.readthedocs.io/en/latest/', None), 'pandas': ('https://pandas.pydata.org/pandas-docs/dev', None), } autoclass_content = 'both' autodoc_member_order = 'bysource' pystow-0.5.5/docs/source/index.rst000066400000000000000000000043431466707254100171770ustar00rootroot00000000000000PyStow |release| Documentation ============================== If you've ever written the following few lines of code, :mod:`pystow` is for you: .. 
code-block:: python import os home = os.path.expanduser('~') project_name = 'adeft' envvar_name = f'{project_name.upper()}_HOME' if envvar_name in os.environ: ADEFT_HOME = os.environ[envvar_name] else: ADEFT_HOME = os.path.join(home, f'.{project_name}') os.makedirs(ADEFT_HOME, exist_ok=True) Many projects (let's use `Adeft `_ as an example) create a folder in the home directory as a dot-file such as ``$HOME/.adeft``. I found that I had so many of these that I started grouping them inside a ``$HOME/.data`` folder. It's also the case that every time you create one of these folders, you need to ensure its existence. :mod:`pystow` takes care of these things. You can replace the previous code with: .. code-block:: python import pystow ADEFT_HOME = pystow.join('adeft') First, it takes the name of the module, uppercases it, and postpends ``_HOME`` on to it (e.g., ``ADEFT_HOME``) and looks in the environment. If this variable is available, it uses that as the directory. It ensures it exists, then returns a :class:`pathlib.Path` pointing to it. If ``ADEFT_HOME`` (or more generally, ``_HOME`` is not available in the environment, it picks the path as ``$HOME/.data/``. Normally, ``$HOME`` is specified in your OS. However, if you want to pick another location to stick the data, you can override using ``$HOME`` by setting ``$PYSTOW_HOME`` in the environment. If you want to go more directories deep inside the adeft default directory, you can just keep using more positional arguments (the same semantics as :func:`os.path.join`). These directories automatically get created as well. .. code-block:: python >>> import pystow >>> from pathlib import Path >>> # already set somewhere >>> __version__ = ... >>> ADEFT_VERSION_HOME: Path = pystow.join('adeft', __version__) .. 
toctree:: :maxdepth: 2 :caption: Getting Started :name: start installation usage utils cli Indices and Tables ------------------ * :ref:`genindex` * :ref:`modindex` * :ref:`search` pystow-0.5.5/docs/source/installation.rst000066400000000000000000000052321466707254100205670ustar00rootroot00000000000000Installation ============ The most recent release can be installed from `PyPI `_ with: .. code-block:: shell $ pip install pystow The most recent code and data can be installed directly from GitHub with: .. code-block:: shell $ pip install git+https://github.com/cthoyt/pystow.git To install in development mode, use the following: .. code-block:: shell $ git clone git+https://github.com/cthoyt/pystow.git $ cd pystow $ pip install -e . Configuration ============= By default, data is stored in the ``$HOME/.data`` directory. By default, the ```` app will create the ``$HOME/.data/`` folder. If you want to use an alternate folder name to ``.data`` inside the home directory, you can set the ``PYSTOW_NAME`` environment variable. For example, if you set ``PYSTOW_NAME=mydata``, then the following code for the ``pykeen`` app will create the ``$HOME/mydata/pykeen/`` directory: .. code-block:: python import os import pystow # Only for demonstration purposes. You should set environment # variables either with your .bashrc or in the command line REPL. os.environ['PYSTOW_NAME'] = 'mydata' # Get a directory (as a pathlib.Path) for ~/mydata/pykeen pykeen_directory = pystow.join('pykeen') If you want to specify a completely custom directory that isn't relative to your home directory, you can set the ``PYSTOW_HOME`` environment variable. For example, if you set ``PYSTOW_HOME=/usr/local/``, then the following code for the ``pykeen`` app will create the ``/usr/local/pykeen/`` directory: .. code-block:: python import os import pystow # Only for demonstration purposes. You should set environment # variables either with your .bashrc or in the command line REPL. 
os.environ['PYSTOW_HOME'] = '/usr/local/' # Get a directory (as a pathlib.Path) for /usr/local/pykeen pykeen_directory = pystow.join('pykeen') .. warning:: If you set ``PYSTOW_HOME``, then ``PYSTOW_NAME`` is disregarded. X Desktop Group (XDG) Compatibility ----------------------------------- While PyStow's main goal is to make application data less opaque and less hidden, some users might want to use the `XDG specifications `_ for storing their app data. If you set the environment variable ``PYSTOW_USE_APPDIRS`` to ``true`` or ``True``, then the `appdirs `_ package will be used to choose the base directory based on the ``user data dir`` option. .. warning:: If you use this setting, make sure you first do ``pip install appdirs`` .. note:: This can still be overridden by ``PYSTOW_HOME``. pystow-0.5.5/docs/source/usage.rst000066400000000000000000000003741466707254100171740ustar00rootroot00000000000000Usage ===== .. automodapi:: pystow :no-inheritance-diagram: :no-heading: :headings: -- :skip: Module :no-main-docstr: .. automodapi:: pystow.impl :no-inheritance-diagram: :no-heading: :headings: -- :no-main-docstr: pystow-0.5.5/docs/source/utils.rst000066400000000000000000000001611466707254100172220ustar00rootroot00000000000000Utilities ========= .. 
automodapi:: pystow.utils :no-inheritance-diagram: :no-heading: :headings: -- pystow-0.5.5/pyproject.toml000066400000000000000000000005601466707254100160170ustar00rootroot00000000000000# See https://setuptools.readthedocs.io/en/latest/build_meta.html [build-system] requires = ["setuptools", "wheel"] build-backend = "setuptools.build_meta:__legacy__" [tool.black] line-length = 100 target-version = ["py38", "py39", "py310", "py311", "py312"] [tool.isort] profile = "black" multi_line_output = 3 include_trailing_comma = true reverse_relative = true pystow-0.5.5/setup.cfg000066400000000000000000000056531466707254100147340ustar00rootroot00000000000000########################## # Setup.py Configuration # ########################## # Configuring setup() [metadata] name = pystow version = 0.5.5 description = Easily pick a place to store data for your python package. long_description = file: README.md long_description_content_type = text/markdown # Links url = https://github.com/cthoyt/pystow download_url = https://github.com/cthoyt/pystow/releases project_urls = Bug Tracker = https://github.com/cthoyt/pystow/issues # Author information author = Charles Tapley Hoyt author_email = cthoyt@gmail.com maintainer = Charles Tapley Hoyt maintainer_email = cthoyt@gmail.com # License information license = MIT license_file = LICENSE # Search tags classifiers = Development Status :: 5 - Production/Stable Environment :: Console License :: OSI Approved :: MIT License Operating System :: OS Independent Programming Language :: Python Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 Programming Language :: Python :: 3.11 Programming Language :: Python :: 3.12 Programming Language :: Python :: 3 :: Only keywords = caching file management [options] install_requires = pickle5; python_version < "3.8" click requests tqdm zip_safe = false python_requires = >=3.7 # Where is my code packages = find: package_dir = = src 
[options.packages.find] where = src [options.extras_require] rdf = rdflib xml = lxml pandas = pandas aws = boto3 tests = coverage pytest requests_file docs = sphinx<8.0 sphinx-rtd-theme sphinx-click sphinx-autodoc-typehints sphinx_automodapi [options.entry_points] console_scripts = pystow = pystow.cli:main ###################### # Doc8 Configuration # # (doc8.ini) # ###################### [doc8] max-line-length = 120 ########################## # Coverage Configuration # # (.coveragerc) # ########################## [coverage:run] branch = True source = pystow omit = tests/* docs/* src/pystow/cli.py src/pystow/__main__.py [coverage:paths] source = src/pystow .tox/*/lib/python*/site-packages/pystow [coverage:report] show_missing = True exclude_lines = def __str__ def __repr__ ########################## # Darglint Configuration # ########################## [darglint] docstring_style = sphinx strictness = full # enable = DAR104 ######################### # Flake8 Configuration # # (.flake8) # ######################### [flake8] ignore = # pickle S403 # pickle S301 # line break before binary operator W503 S410 S320 exclude = .tox, .git, __pycache__, docs/source/conf.py, build, dist, tests/fixtures/*, *.pyc, *.egg-info, .cache, .eggs, data max-line-length = 120 max-complexity = 20 import-order-style = pycharm application-import-names = pystow tests pystow-0.5.5/src/000077500000000000000000000000001466707254100136715ustar00rootroot00000000000000pystow-0.5.5/src/pystow/000077500000000000000000000000001466707254100152365ustar00rootroot00000000000000pystow-0.5.5/src/pystow/__init__.py000066400000000000000000000020101466707254100173400ustar00rootroot00000000000000# -*- coding: utf-8 -*- """PyStow: Easily pick a place to store data for your python package.""" from .api import ( # noqa dump_df, dump_json, dump_pickle, dump_rdf, dump_xml, ensure, ensure_csv, ensure_custom, ensure_excel, ensure_from_google, ensure_from_s3, ensure_gunzip, ensure_json, ensure_json_bz2, ensure_open, 
ensure_open_bz2, ensure_open_gz, ensure_open_lzma, ensure_open_sqlite, ensure_open_sqlite_gz, ensure_open_tarfile, ensure_open_zip, ensure_pickle, ensure_pickle_gz, ensure_rdf, ensure_tar_df, ensure_tar_xml, ensure_untar, ensure_xml, ensure_zip_df, ensure_zip_np, join, joinpath_sqlite, load_df, load_json, load_pickle, load_pickle_gz, load_rdf, load_xml, module, open, open_gz, ) from .config_api import ConfigError, get_config, write_config # noqa from .impl import Module # noqa from .utils import ensure_readme # noqa ensure_readme() pystow-0.5.5/src/pystow/__main__.py000066400000000000000000000002171466707254100173300ustar00rootroot00000000000000# -*- coding: utf-8 -*- # type: ignore """Command line interface for PyStow.""" from .cli import main if __name__ == "__main__": main() pystow-0.5.5/src/pystow/api.py000066400000000000000000001565441466707254100164000ustar00rootroot00000000000000# -*- coding: utf-8 -*- """API functions for PyStow.""" import sqlite3 from contextlib import contextmanager from pathlib import Path from typing import TYPE_CHECKING, Any, Generator, Mapping, Optional, Sequence, Union from .constants import JSON, BytesOpener, Opener, Provider from .impl import Module if TYPE_CHECKING: import lxml.etree import numpy.typing import pandas as pd import rdflib __all__ = [ "module", "join", "joinpath_sqlite", # Opener functions "open", "open_gz", # Loader functions "load_df", "load_json", "load_pickle", "load_pickle_gz", "load_rdf", "load_xml", # Dump functions "dump_df", "dump_json", "dump_pickle", "dump_rdf", "dump_xml", # Downloader functions "ensure", "ensure_from_s3", "ensure_from_google", # Downloader functions with postprocessing "ensure_untar", "ensure_gunzip", # Downloader + opener functions "ensure_open", "ensure_open_gz", "ensure_open_bz2", "ensure_open_lzma", "ensure_open_tarfile", "ensure_open_zip", "ensure_open_sqlite", "ensure_open_sqlite_gz", # Processors "ensure_csv", "ensure_custom", "ensure_json", "ensure_json_bz2", "ensure_pickle", 
"ensure_pickle_gz", "ensure_excel", "ensure_xml", "ensure_rdf", "ensure_tar_df", "ensure_tar_xml", "ensure_zip_df", "ensure_zip_np", ] def module(key: str, *subkeys: str, ensure_exists: bool = True) -> Module: """Return a module for the application. :param key: The name of the module. No funny characters. The envvar _HOME where key is uppercased is checked first before using the default home directory. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param ensure_exists: Should all directories be created automatically? Defaults to true. :return: The module object that manages getting and ensuring """ return Module.from_key(key, *subkeys, ensure_exists=ensure_exists) def join(key: str, *subkeys: str, name: Optional[str] = None, ensure_exists: bool = True) -> Path: """Return the home data directory for the given module. :param key: The name of the module. No funny characters. The envvar _HOME where key is uppercased is checked first before using the default home directory. :param subkeys: A sequence of additional strings to join :param name: The name of the file (optional) inside the folder :param ensure_exists: Should all directories be created automatically? Defaults to true. :return: The path of the directory or subdirectory for the given module. """ _module = Module.from_key(key, ensure_exists=ensure_exists) return _module.join(*subkeys, name=name, ensure_exists=ensure_exists) @contextmanager def open( key: str, *subkeys: str, name: str, mode: str = "r", open_kwargs: Optional[Mapping[str, Any]] = None, ) -> Opener: """Open a file that exists already. :param key: The name of the module. No funny characters. The envvar _HOME where key is uppercased is checked first before using the default home directory. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. 
:param name: The name of the file to open :param mode: The read mode, passed to :func:`open` :param open_kwargs: Additional keyword arguments passed to :func:`open` :yields: An open file object """ _module = Module.from_key(key, ensure_exists=True) with _module.open(*subkeys, name=name, mode=mode, open_kwargs=open_kwargs) as file: yield file @contextmanager def open_gz( key: str, *subkeys: str, name: str, mode: str = "rt", open_kwargs: Optional[Mapping[str, Any]] = None, ) -> Opener: """Open a gzipped file that exists already. :param key: The name of the module. No funny characters. The envvar _HOME where key is uppercased is checked first before using the default home directory. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param name: The name of the file to open :param mode: The read mode, passed to :func:`gzip.open` :param open_kwargs: Additional keyword arguments passed to :func:`gzip.open` :yields: An open file object """ _module = Module.from_key(key, ensure_exists=True) with _module.open_gz(*subkeys, name=name, mode=mode, open_kwargs=open_kwargs) as file: yield file def ensure( key: str, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, ) -> Path: """Ensure a file is downloaded. :param key: The name of the module. No funny characters. The envvar _HOME where key is uppercased is checked first before using the default home directory. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. 
:return: The path of the file that has been downloaded (or already exists) """ _module = Module.from_key(key, ensure_exists=True) return _module.ensure( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs ) def ensure_custom( key: str, *subkeys: str, name: str, force: bool = False, provider: Provider, **kwargs: Any, ) -> Path: """Ensure a file is present, and run a custom create function otherwise. :param key: The name of the module. No funny characters. The envvar _HOME where key is uppercased is checked first before using the default home directory. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param name: The file name. :param force: Should the file be re-created, even if the path already exists? :param provider: The file provider. Will be run with the path as the first positional argument, if the file needs to be generated. :param kwargs: Additional keyword-based parameters passed to the provider. :return: The path of the file that has been created (or already exists) """ _module = Module.from_key(key, ensure_exists=True) return _module.ensure_custom(*subkeys, name=name, force=force, provider=provider, **kwargs) def ensure_untar( key: str, *subkeys: str, url: str, name: Optional[str] = None, directory: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, extract_kwargs: Optional[Mapping[str, Any]] = None, ) -> Path: """Ensure a file is downloaded and untarred. :param key: The name of the module. No funny characters. The envvar _HOME where key is uppercased is checked first before using the default home directory. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. 
:param directory: Overrides the name of the directory into which the tar archive is extracted. If none given, will use the stem of the file name that gets downloaded. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param extract_kwargs: Keyword arguments to pass to :meth:`tarfile.TarFile.extract_all`. :return: The path of the directory where the file that has been downloaded gets extracted to """ _module = Module.from_key(key, ensure_exists=True) return _module.ensure_untar( *subkeys, url=url, name=name, directory=directory, force=force, download_kwargs=download_kwargs, extract_kwargs=extract_kwargs, ) def ensure_gunzip( key: str, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, autoclean: bool = True, download_kwargs: Optional[Mapping[str, Any]] = None, ) -> Path: """Ensure a file is downloaded and gunzipped. :param key: The name of the module. No funny characters. The envvar _HOME where key is uppercased is checked first before using the default home directory. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param autoclean: Should the zipped file be deleted? :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. 
:return: The path of the directory where the file that has been downloaded gets extracted to """ _module = Module.from_key(key, ensure_exists=True) return _module.ensure_gunzip( *subkeys, url=url, name=name, force=force, autoclean=autoclean, download_kwargs=download_kwargs, ) @contextmanager def ensure_open( key: str, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, mode: str = "r", open_kwargs: Optional[Mapping[str, Any]] = None, ) -> Opener: """Ensure a file is downloaded and open it. :param key: The name of the module. No funny characters. The envvar `_HOME` where key is uppercased is checked first before using the default home directory. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param mode: The read mode, passed to :func:`lzma.open` :param open_kwargs: Additional keyword arguments passed to :func:`lzma.open` :yields: An open file object """ _module = Module.from_key(key, ensure_exists=True) with _module.ensure_open( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs, mode=mode, open_kwargs=open_kwargs, ) as yv: yield yv @contextmanager def ensure_open_zip( key: str, *subkeys: str, url: str, inner_path: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, mode: str = "r", open_kwargs: Optional[Mapping[str, Any]] = None, ) -> BytesOpener: """Ensure a file is downloaded then open it with :mod:`zipfile`. :param key: The name of the module. No funny characters. 
The envvar `_HOME` where key is uppercased is checked first before using the default home directory. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param inner_path: The relative path to the file inside the archive :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param mode: The read mode, passed to :func:`zipfile.open` :param open_kwargs: Additional keyword arguments passed to :func:`zipfile.open` :yields: An open file object """ _module = Module.from_key(key, ensure_exists=True) with _module.ensure_open_zip( *subkeys, url=url, inner_path=inner_path, name=name, force=force, download_kwargs=download_kwargs, mode=mode, open_kwargs=open_kwargs, ) as yv: yield yv @contextmanager def ensure_open_lzma( key: str, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, mode: str = "r", open_kwargs: Optional[Mapping[str, Any]] = None, ) -> Opener: """Ensure a LZMA-compressed file is downloaded and open a file inside it. :param key: The name of the module. No funny characters. The envvar `_HOME` where key is uppercased is checked first before using the default home directory. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. 
:param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param mode: The read mode, passed to :func:`lzma.open` :param open_kwargs: Additional keyword arguments passed to :func:`lzma.open` :yields: An open file object """ _module = Module.from_key(key, ensure_exists=True) with _module.ensure_open_lzma( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs, mode=mode, open_kwargs=open_kwargs, ) as yv: yield yv @contextmanager def ensure_open_tarfile( key: str, *subkeys: str, url: str, inner_path: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, mode: str = "r", open_kwargs: Optional[Mapping[str, Any]] = None, ) -> BytesOpener: """Ensure a tar file is downloaded and open a file inside it. :param key: The name of the module. No funny characters. The envvar `_HOME` where key is uppercased is checked first before using the default home directory. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param inner_path: The relative path to the file inside the archive :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. 
:param mode: The read mode, passed to :func:`tarfile.open` :param open_kwargs: Additional keyword arguments passed to :func:`tarfile.open` :yields: An open file object """ _module = Module.from_key(key, ensure_exists=True) with _module.ensure_open_tarfile( *subkeys, url=url, inner_path=inner_path, name=name, force=force, download_kwargs=download_kwargs, mode=mode, open_kwargs=open_kwargs, ) as yv: yield yv @contextmanager def ensure_open_gz( key: str, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, mode: str = "rb", open_kwargs: Optional[Mapping[str, Any]] = None, ) -> Opener: """Ensure a gzipped file is downloaded and open a file inside it. :param key: The name of the module. No funny characters. The envvar `_HOME` where key is uppercased is checked first before using the default home directory. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. 
:param mode: The read mode, passed to :func:`gzip.open` :param open_kwargs: Additional keyword arguments passed to :func:`gzip.open` :yields: An open file object """ _module = Module.from_key(key, ensure_exists=True) with _module.ensure_open_gz( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs, mode=mode, open_kwargs=open_kwargs, ) as yv: yield yv @contextmanager def ensure_open_bz2( key: str, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, mode: str = "rb", open_kwargs: Optional[Mapping[str, Any]] = None, ) -> Opener: """Ensure a BZ2-compressed file is downloaded and open a file inside it. :param key: The name of the module. No funny characters. The envvar `_HOME` where key is uppercased is checked first before using the default home directory. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. 
:param mode: The read mode, passed to :func:`bz2.open` :param open_kwargs: Additional keyword arguments passed to :func:`bz2.open` :yields: An open file object """ _module = Module.from_key(key, ensure_exists=True) with _module.ensure_open_bz2( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs, mode=mode, open_kwargs=open_kwargs, ) as yv: yield yv def ensure_csv( key: str, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, read_csv_kwargs: Optional[Mapping[str, Any]] = None, ) -> "pd.DataFrame": """Download a CSV and open as a dataframe with :mod:`pandas`. :param key: The module name :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param read_csv_kwargs: Keyword arguments to pass through to :func:`pandas.read_csv`. .. note:: It is assumed that the CSV uses tab separators, as this is the only safe option. For more information, see `Wikipedia `_ and `Issue #51 `_. To override this behavior and load using the comma separator, specify ``read_csv_kwargs=dict(sep=",")``. 
:return: A pandas DataFrame Example usage:: >>> import pystow >>> import pandas as pd >>> url = 'https://raw.githubusercontent.com/pykeen/pykeen/master/src/pykeen/datasets/nations/test.txt' >>> df: pd.DataFrame = pystow.ensure_csv('pykeen', 'datasets', 'nations', url=url) """ _module = Module.from_key(key, ensure_exists=True) return _module.ensure_csv( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs, read_csv_kwargs=read_csv_kwargs, ) def load_df( key: str, *subkeys: str, name: str, read_csv_kwargs: Optional[Mapping[str, Any]] = None, ) -> "pd.DataFrame": """Open a pre-existing CSV as a dataframe with :mod:`pandas`. :param key: The module name :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param read_csv_kwargs: Keyword arguments to pass through to :func:`pandas.read_csv`. :return: A pandas DataFrame Example usage:: >>> import pystow >>> import pandas as pd >>> url = 'https://raw.githubusercontent.com/pykeen/pykeen/master/src/pykeen/datasets/nations/test.txt' >>> pystow.ensure_csv('pykeen', 'datasets', 'nations', url=url) >>> df: pd.DataFrame = pystow.load_df('pykeen', 'datasets', 'nations', name='test.txt') """ _module = Module.from_key(key, ensure_exists=True) return _module.load_df( *subkeys, name=name, read_csv_kwargs=read_csv_kwargs, ) def dump_df( key: str, *subkeys: str, name: str, obj: "pd.DataFrame", sep: str = "\t", index: bool = False, to_csv_kwargs: Optional[Mapping[str, Any]] = None, ) -> None: """Dump a dataframe to a TSV file with :mod:`pandas`. :param key: The module name :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param name: Overrides the name of the file at the end of the URL, if given. 
Also useful for URLs that don't have proper filenames with extensions. :param obj: The dataframe to dump :param sep: The separator to use, defaults to a tab :param index: Should the index be dumped? Defaults to false. :param to_csv_kwargs: Keyword arguments to pass through to :meth:`pandas.DataFrame.to_csv`. """ _module = Module.from_key(key, ensure_exists=True) _module.dump_df( *subkeys, name=name, obj=obj, sep=sep, index=index, to_csv_kwargs=to_csv_kwargs, ) def ensure_json( key: str, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, open_kwargs: Optional[Mapping[str, Any]] = None, json_load_kwargs: Optional[Mapping[str, Any]] = None, ) -> JSON: """Download JSON and open with :mod:`json`. :param key: The module name :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param open_kwargs: Additional keyword arguments passed to :func:`open` :param json_load_kwargs: Keyword arguments to pass through to :func:`json.load`. :returns: A JSON object (list, dict, etc.) 
Example usage:: >>> import pystow >>> url = 'https://maayanlab.cloud/CREEDS/download/single_gene_perturbations-v1.0.json' >>> perturbations = pystow.ensure_json('bio', 'creeds', '1.0', url=url) """ _module = Module.from_key(key, ensure_exists=True) return _module.ensure_json( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs, open_kwargs=open_kwargs, json_load_kwargs=json_load_kwargs, ) def ensure_json_bz2( key: str, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, open_kwargs: Optional[Mapping[str, Any]] = None, json_load_kwargs: Optional[Mapping[str, Any]] = None, ) -> JSON: """Download BZ2-compressed JSON and open with :mod:`json`. :param key: The module name :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param open_kwargs: Additional keyword arguments passed to :func:`bz2.open` :param json_load_kwargs: Keyword arguments to pass through to :func:`json.load`. :returns: A JSON object (list, dict, etc.) 
Example usage:: >>> import pystow >>> url = 'https://github.com/hetio/hetionet/raw/master/hetnet/json/hetionet-v1.0.json.bz2' >>> hetionet = pystow.ensure_json_bz2('bio', 'hetionet', '1.0', url=url) """ _module = Module.from_key(key, ensure_exists=True) return _module.ensure_json_bz2( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs, open_kwargs=open_kwargs, json_load_kwargs=json_load_kwargs, ) def load_json( key: str, *subkeys: str, name: str, json_load_kwargs: Optional[Mapping[str, Any]] = None, ) -> JSON: """Open a JSON file :mod:`json`. :param key: The module name :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param name: The name of the file to open :param json_load_kwargs: Keyword arguments to pass through to :func:`json.load`. :returns: A JSON object (list, dict, etc.) """ _module = Module.from_key(key, ensure_exists=True) return _module.load_json(*subkeys, name=name, json_load_kwargs=json_load_kwargs) def dump_json( key: str, *subkeys: str, name: str, obj: JSON, open_kwargs: Optional[Mapping[str, Any]] = None, json_dump_kwargs: Optional[Mapping[str, Any]] = None, ) -> None: """Dump an object to a file with :mod:`json`. :param key: The module name :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param name: The name of the file to open :param obj: The object to dump :param open_kwargs: Additional keyword arguments passed to :func:`open` :param json_dump_kwargs: Keyword arguments to pass through to :func:`json.dump`. 
""" _module = Module.from_key(key, ensure_exists=True) _module.dump_json( *subkeys, name=name, obj=obj, open_kwargs=open_kwargs, json_dump_kwargs=json_dump_kwargs ) def ensure_pickle( key: str, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, mode: str = "rb", open_kwargs: Optional[Mapping[str, Any]] = None, pickle_load_kwargs: Optional[Mapping[str, Any]] = None, ) -> Any: """Download a pickle file and open with :mod:`pickle`. :param key: The module name :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param mode: The read mode, passed to :func:`open` :param open_kwargs: Additional keyword arguments passed to :func:`open` :param pickle_load_kwargs: Keyword arguments to pass through to :func:`pickle.load`. :returns: Any object """ _module = Module.from_key(key, ensure_exists=True) return _module.ensure_pickle( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs, mode=mode, open_kwargs=open_kwargs, pickle_load_kwargs=pickle_load_kwargs, ) def load_pickle( key: str, *subkeys: str, name: str, mode: str = "rb", open_kwargs: Optional[Mapping[str, Any]] = None, pickle_load_kwargs: Optional[Mapping[str, Any]] = None, ) -> Any: """Open a pickle file with :mod:`pickle`. :param key: The module name :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. 
:param name: The name of the file to open :param mode: The read mode, passed to :func:`open` :param open_kwargs: Additional keyword arguments passed to :func:`open` :param pickle_load_kwargs: Keyword arguments to pass through to :func:`pickle.load`. :returns: Any object """ _module = Module.from_key(key, ensure_exists=True) return _module.load_pickle( *subkeys, name=name, mode=mode, open_kwargs=open_kwargs, pickle_load_kwargs=pickle_load_kwargs, ) def dump_pickle( key: str, *subkeys: str, name: str, obj: Any, mode: str = "wb", open_kwargs: Optional[Mapping[str, Any]] = None, pickle_dump_kwargs: Optional[Mapping[str, Any]] = None, ) -> None: """Dump an object to a file with :mod:`pickle`. :param key: The module name :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param name: The name of the file to open :param obj: The object to dump :param mode: The read mode, passed to :func:`open` :param open_kwargs: Additional keyword arguments passed to :func:`open` :param pickle_dump_kwargs: Keyword arguments to pass through to :func:`pickle.dump`. """ _module = Module.from_key(key, ensure_exists=True) _module.dump_pickle( *subkeys, name=name, obj=obj, mode=mode, open_kwargs=open_kwargs, pickle_dump_kwargs=pickle_dump_kwargs, ) def ensure_pickle_gz( key: str, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, mode: str = "rb", open_kwargs: Optional[Mapping[str, Any]] = None, pickle_load_kwargs: Optional[Mapping[str, Any]] = None, ) -> Any: """Download a gzipped pickle file and open with :mod:`pickle`. :param key: The module name :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. 
:param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param mode: The read mode, passed to :func:`gzip.open` :param open_kwargs: Additional keyword arguments passed to :func:`gzip.open` :param pickle_load_kwargs: Keyword arguments to pass through to :func:`pickle.load`. :returns: Any object """ _module = Module.from_key(key, ensure_exists=True) return _module.ensure_pickle_gz( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs, mode=mode, open_kwargs=open_kwargs, pickle_load_kwargs=pickle_load_kwargs, ) def load_pickle_gz( key: str, *subkeys: str, name: str, mode: str = "rb", open_kwargs: Optional[Mapping[str, Any]] = None, pickle_load_kwargs: Optional[Mapping[str, Any]] = None, ) -> Any: """Open a gzipped pickle file with :mod:`pickle`. :param key: The module name :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param name: The name of the file to open :param mode: The read mode, passed to :func:`open` :param open_kwargs: Additional keyword arguments passed to :func:`gzip.open` :param pickle_load_kwargs: Keyword arguments to pass through to :func:`pickle.load`. :returns: Any object """ _module = Module.from_key(key, ensure_exists=True) return _module.load_pickle_gz( *subkeys, name=name, mode=mode, open_kwargs=open_kwargs, pickle_load_kwargs=pickle_load_kwargs, ) def ensure_xml( key: str, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, parse_kwargs: Optional[Mapping[str, Any]] = None, ) -> "lxml.etree.ElementTree": """Download an XML file and open it with :mod:`lxml`. :param key: The module name :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. 
:param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param parse_kwargs: Keyword arguments to pass through to :func:`lxml.etree.parse`. :returns: An ElementTree object .. warning:: If you have lots of files to read in the same archive, it's better just to unzip first. """ _module = Module.from_key(key, ensure_exists=True) return _module.ensure_xml( *subkeys, name=name, url=url, force=force, download_kwargs=download_kwargs, parse_kwargs=parse_kwargs, ) def load_xml( key: str, *subkeys: str, name: str, parse_kwargs: Optional[Mapping[str, Any]] = None, ) -> "lxml.etree.ElementTree": """Load an XML file with :mod:`lxml`. :param key: The module name :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param name: The name of the file to open :param parse_kwargs: Keyword arguments to pass through to :func:`lxml.etree.parse`. :returns: An ElementTree object .. warning:: If you have lots of files to read in the same archive, it's better just to unzip first. """ _module = Module.from_key(key, ensure_exists=True) return _module.load_xml( *subkeys, name=name, parse_kwargs=parse_kwargs, ) def dump_xml( key: str, *subkeys: str, name: str, obj: "lxml.etree.ElementTree", open_kwargs: Optional[Mapping[str, Any]] = None, write_kwargs: Optional[Mapping[str, Any]] = None, ) -> None: """Dump an XML element tree to a file with :mod:`lxml`. :param key: The module name :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. 
:param name: The name of the file to open :param obj: The object to dump :param open_kwargs: Additional keyword arguments passed to :func:`open` :param write_kwargs: Keyword arguments to pass through to :func:`lxml.etree.ElementTree.write`. """ _module = Module.from_key(key, ensure_exists=True) _module.dump_xml( *subkeys, name=name, obj=obj, open_kwargs=open_kwargs, write_kwargs=write_kwargs, ) def ensure_excel( key: str, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, read_excel_kwargs: Optional[Mapping[str, Any]] = None, ) -> "pd.DataFrame": """Download an excel file and open as a dataframe with :mod:`pandas`. :param key: The module name :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param read_excel_kwargs: Keyword arguments to pass through to :func:`pandas.read_excel`. :return: A pandas DataFrame """ _module = Module.from_key(key, ensure_exists=True) return _module.ensure_excel( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs, read_excel_kwargs=read_excel_kwargs, ) def ensure_tar_df( key: str, *subkeys: str, url: str, inner_path: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, read_csv_kwargs: Optional[Mapping[str, Any]] = None, ) -> "pd.DataFrame": """Download a tar file and open an inner file as a dataframe with :mod:`pandas`. :param key: The module name :param subkeys: A sequence of additional strings to join. 
If none are given, returns the directory for this module. :param url: The URL to download. :param inner_path: The relative path to the file inside the archive :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param read_csv_kwargs: Keyword arguments to pass through to :func:`pandas.read_csv`. :returns: A dataframe .. warning:: If you have lots of files to read in the same archive, it's better just to unzip first. """ _module = Module.from_key(key, ensure_exists=True) return _module.ensure_tar_df( *subkeys, url=url, name=name, force=force, inner_path=inner_path, download_kwargs=download_kwargs, read_csv_kwargs=read_csv_kwargs, ) def ensure_tar_xml( key: str, *subkeys: str, url: str, inner_path: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, parse_kwargs: Optional[Mapping[str, Any]] = None, ) -> "lxml.etree.ElementTree": """Download a tar file and open an inner file as an XML with :mod:`lxml`. :param key: The module name :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param inner_path: The relative path to the file inside the archive :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param parse_kwargs: Keyword arguments to pass through to :func:`lxml.etree.parse`. :returns: An ElementTree object .. 
warning:: If you have lots of files to read in the same archive, it's better just to unzip first. """ _module = Module.from_key(key, ensure_exists=True) return _module.ensure_tar_xml( *subkeys, url=url, name=name, force=force, inner_path=inner_path, download_kwargs=download_kwargs, parse_kwargs=parse_kwargs, ) def ensure_zip_df( key: str, *subkeys: str, url: str, inner_path: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, read_csv_kwargs: Optional[Mapping[str, Any]] = None, ) -> "pd.DataFrame": """Download a zip file and open an inner file as a dataframe with :mod:`pandas`. :param key: The module name :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param inner_path: The relative path to the file inside the archive :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param read_csv_kwargs: Keyword arguments to pass through to :func:`pandas.read_csv`. :return: A pandas DataFrame """ _module = Module.from_key(key, ensure_exists=True) return _module.ensure_zip_df( *subkeys, url=url, name=name, force=force, inner_path=inner_path, download_kwargs=download_kwargs, read_csv_kwargs=read_csv_kwargs, ) def ensure_zip_np( key: str, *subkeys: str, url: str, inner_path: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, load_kwargs: Optional[Mapping[str, Any]] = None, ) -> "numpy.typing.ArrayLike": """Download a zip file and open an inner file as an array-like with :mod:`numpy`. :param key: The module name :param subkeys: A sequence of additional strings to join. 
If none are given, returns the directory for this module. :param url: The URL to download. :param inner_path: The relative path to the file inside the archive :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param load_kwargs: Additional keyword arguments that are passed through to :func:`read_zip_np` and transitively to :func:`numpy.load`. :returns: An array-like object """ _module = Module.from_key(key, ensure_exists=True) return _module.ensure_zip_np( *subkeys, url=url, name=name, force=force, inner_path=inner_path, download_kwargs=download_kwargs, load_kwargs=load_kwargs, ) def ensure_rdf( key: str, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, precache: bool = True, parse_kwargs: Optional[Mapping[str, Any]] = None, ) -> "rdflib.Graph": """Download a RDF file and open with :mod:`rdflib`. :param key: The module name :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param precache: Should the parsed :class:`rdflib.Graph` be stored as a pickle for fast loading? :param parse_kwargs: Keyword arguments to pass through to :func:`pystow.utils.read_rdf` and transitively to :func:`rdflib.Graph.parse`. 
:return: An RDF graph Example usage:: >>> import pystow >>> import rdflib >>> url = 'https://ftp.expasy.org/databases/rhea/rdf/rhea.rdf.gz' >>> rdf_graph: rdflib.Graph = pystow.ensure_rdf('rhea', url=url) If :mod:`rdflib` fails to guess the format, you can explicitly specify it using the `parse_kwargs` argument: >>> import pystow >>> import rdflib >>> url = "http://oaei.webdatacommons.org/tdrs/testdata/persistent/knowledgegraph" \ ... "/v3/suite/memoryalpha-stexpanded/component/reference.xml" >>> rdf_graph: rdflib.Graph = pystow.ensure_rdf("memoryalpha-stexpanded", url=url, parse_kwargs={"format": "xml"}) """ _module = Module.from_key(key, ensure_exists=True) return _module.ensure_rdf( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs, precache=precache, parse_kwargs=parse_kwargs, ) def load_rdf( key: str, *subkeys: str, name: Optional[str] = None, parse_kwargs: Optional[Mapping[str, Any]] = None, ) -> "rdflib.Graph": """Open an RDF file with :mod:`rdflib`. :param key: The name of the module. No funny characters. The envvar _HOME where key is uppercased is checked first before using the default home directory. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param name: The name of the file to open :param parse_kwargs: Keyword arguments to pass through to :func:`pystow.utils.read_rdf` and transitively to :func:`rdflib.Graph.parse`. :return: An RDF graph """ _module = Module.from_key(key, ensure_exists=True) return _module.load_rdf(*subkeys, name=name, parse_kwargs=parse_kwargs) def dump_rdf( key: str, *subkeys: str, name: str, obj: "rdflib.Graph", format: str = "turtle", serialize_kwargs: Optional[Mapping[str, Any]] = None, ) -> None: """Dump an RDF graph to a file with :mod:`rdflib`. :param key: The name of the module. No funny characters. The envvar _HOME where key is uppercased is checked first before using the default home directory. 
:param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param name: The name of the file to open :param obj: The object to dump :param format: The format to dump in :param serialize_kwargs: Keyword arguments to through to :func:`rdflib.Graph.serialize`. """ _module = Module.from_key(key, ensure_exists=True) _module.dump_rdf(*subkeys, name=name, obj=obj, format=format, serialize_kwargs=serialize_kwargs) def ensure_from_s3( key: str, *subkeys: str, s3_bucket: str, s3_key: Union[str, Sequence[str]], name: Optional[str] = None, force: bool = False, **kwargs: Any, ) -> Path: """Ensure a file is downloaded. :param key: The name of the module. No funny characters. The envvar _HOME where key is uppercased is checked first before using the default home directory. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param s3_bucket: The S3 bucket name :param s3_key: The S3 key name :param name: Overrides the name of the file at the end of the S3 key, if given. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param kwargs: Remaining kwargs to forwrad to :class:`Module.ensure_from_s3`. :return: The path of the file that has been downloaded (or already exists) Example downloading ProtMapper 0.0.21: >>> version = '0.0.21' >>> ensure_from_s3('test', version, s3_bucket='bigmech', s3_key=f'protmapper/{version}/refseq_uniprot.csv') """ _module = Module.from_key(key, ensure_exists=True) return _module.ensure_from_s3( *subkeys, s3_bucket=s3_bucket, s3_key=s3_key, name=name, force=force, **kwargs ) def ensure_from_google( key: str, *subkeys: str, name: str, file_id: str, force: bool = False, ) -> Path: """Ensure a file is downloaded from google drive. :param key: The name of the module. No funny characters. The envvar _HOME where key is uppercased is checked first before using the default home directory. 
:param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param name: The name of the file :param file_id: The file identifier of the google file. If your share link is https://drive.google.com/file/d/1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z/view, then your file id is ``1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z``. :param force: Should the download be done again, even if the path already exists? Defaults to false. :return: The path of the file that has been downloaded (or already exists) Example downloading the WK3l-15k dataset as motivated by https://github.com/pykeen/pykeen/pull/403: >>> ensure_from_google('test', name='wk3l15k.zip', file_id='1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z') """ _module = Module.from_key(key, ensure_exists=True) return _module.ensure_from_google(*subkeys, name=name, file_id=file_id, force=force) def joinpath_sqlite(key: str, *subkeys: str, name: str) -> str: """Get an SQLite database connection string. :param key: The name of the module. No funny characters. The envvar `_HOME` where key is uppercased is checked first before using the default home directory. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param name: The name of the database file. :return: A SQLite path string. """ _module = Module.from_key(key, ensure_exists=True) return _module.joinpath_sqlite(*subkeys, name=name) @contextmanager def ensure_open_sqlite( key: str, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, ) -> Generator[sqlite3.Connection, None, None]: """Ensure and connect to a SQLite database. :param key: The name of the module. No funny characters. The envvar `_HOME` where key is uppercased is checked first before using the default home directory. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. 
:param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :yields: An instance of :class:`sqlite3.Connection` from :func:`sqlite3.connect` Example usage: >>> import pystow >>> import pandas as pd >>> url = "https://s3.amazonaws.com/bbop-sqlite/hp.db" >>> sql = "SELECT * FROM entailed_edge LIMIT 10" >>> with pystow.ensure_open_sqlite("test", url=url) as conn: >>> df = pd.read_sql(sql, conn) """ _module = Module.from_key(key, ensure_exists=True) with _module.ensure_open_sqlite( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs ) as yv: yield yv @contextmanager def ensure_open_sqlite_gz( key: str, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, ) -> Generator[sqlite3.Connection, None, None]: """Ensure and connect to a gzipped SQLite database. :param key: The name of the module. No funny characters. The envvar `_HOME` where key is uppercased is checked first before using the default home directory. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. 
:yields: An instance of :class:`sqlite3.Connection` from :func:`sqlite3.connect` Example usage: >>> import pystow >>> import pandas as pd >>> url = "https://s3.amazonaws.com/bbop-sqlite/hp.db.gz" >>> sql = "SELECT * FROM entailed_edge LIMIT 10" >>> with pystow.ensure_open_sqlite_gz("test", url=url) as conn: >>> df = pd.read_sql(sql, conn) """ _module = Module.from_key(key, ensure_exists=True) with _module.ensure_open_sqlite_gz( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs ) as yv: yield yv pystow-0.5.5/src/pystow/cache.py000066400000000000000000000132321466707254100166540ustar00rootroot00000000000000# -*- coding: utf-8 -*- """Utilities for caching files.""" import functools import json import logging from abc import ABC, abstractmethod from pathlib import Path from typing import ( TYPE_CHECKING, Any, Callable, Dict, Generic, List, MutableMapping, Optional, TypeVar, Union, cast, ) try: import pickle5 as pickle except ImportError: import pickle if TYPE_CHECKING: import pandas as pd __all__ = [ # Classses "Cached", "CachedPickle", "CachedJSON", "CachedCollection", "CachedDataFrame", # Types "Getter", ] logger = logging.getLogger(__name__) JSONType = Union[ Dict[str, Any], List[Any], ] X = TypeVar("X") Getter = Callable[[], X] class Cached(Generic[X], ABC): """Caching decorator.""" def __init__( self, path: Union[str, Path], force: bool = False, ) -> None: """Instantiate the decorator. :param path: The path to the cache for the file :param force: Should a pre-existing file be disregared/overwritten? """ self.path = Path(path) self.force = force def __call__(self, func: Getter[X]) -> Getter[X]: """Apply this instance as a decorator. 
:param func: The function to wrap :return: A wrapped function """ @functools.wraps(func) def _wrapped() -> X: if self.path.is_file() and not self.force: return self.load() logger.debug("no cache found at %s", self.path) rv = func() logger.debug("writing cache to %s", self.path) self.dump(rv) return rv return _wrapped @abstractmethod def load(self) -> X: """Load data from the cache (typically by opening a file at the given path).""" @abstractmethod def dump(self, rv: X) -> None: """Dump data to the cache (typically by opening a file at the given path). :param rv: The data to dump """ class CachedJSON(Cached[JSONType]): """Make a function lazily cache its return value as JSON.""" def load(self) -> JSONType: """Load data from the cache as JSON. :returns: A python object with JSON-like data from the cache """ with open(self.path) as file: return cast(JSONType, json.load(file)) def dump(self, rv: JSONType) -> None: """Dump data to the cache as JSON. :param rv: The JSON data to dump """ with open(self.path, "w") as file: json.dump(rv, file, indent=2) class CachedPickle(Cached[Any]): """Make a function lazily cache its return value as a pickle.""" def load(self) -> Any: """Load data from the cache as a pickle. :returns: A python object loaded from the cache """ with open(self.path, "rb") as file: return pickle.load(file) def dump(self, rv: Any) -> None: """Dump data to the cache as a pickle. :param rv: The arbitrary python object to dump """ with open(self.path, "wb") as file: pickle.dump(rv, file, protocol=pickle.HIGHEST_PROTOCOL) class CachedCollection(Cached[List[str]]): """Make a function lazily cache its return value as file.""" def load(self) -> List[str]: """Load data from the cache as a list of strings. :returns: A list of strings loaded from the cache """ with open(self.path) as file: return [line.strip() for line in file] def dump(self, rv: List[str]) -> None: """Dump data to the cache as a list of strings. 
:param rv: The list of strings to dump """ with open(self.path, "w") as file: for line in rv: print(line, file=file) # noqa:T001,T201 class CachedDataFrame(Cached["pd.DataFrame"]): """Make a function lazily cache its return value as a dataframe.""" def __init__( self, path: Union[str, Path], force: bool = False, sep: Optional[str] = None, dtype: Optional[Any] = None, read_csv_kwargs: Optional[MutableMapping[str, Any]] = None, ) -> None: """Instantiate the decorator. :param path: The path to the cache for the file :param force: Should a pre-existing file be disregared/overwritten? :param sep: The separator. Defaults to TSV, since this is the only reasonable default. :param dtype: A shortcut for setting the dtype :param read_csv_kwargs: Additional kwargs to pass to :func:`pd.read_csv`. :raises ValueError: if sep is given as a kwarg and also in ``read_csv_kwargs``. """ super().__init__(path=path, force=force) self.read_csv_kwargs = read_csv_kwargs or {} if "sep" not in self.read_csv_kwargs: self.sep = sep or "\t" elif sep is not None: raise ValueError else: self.sep = self.read_csv_kwargs.pop("sep") if dtype is not None: if "dtype" in self.read_csv_kwargs: raise ValueError self.read_csv_kwargs["dtype"] = dtype self.read_csv_kwargs.setdefault("keep_default_na", False) def load(self) -> "pd.DataFrame": """Load data from the cache as a dataframe. :returns: A dataframe loaded from the cache. """ import pandas as pd return pd.read_csv( self.path, sep=self.sep, **self.read_csv_kwargs, ) def dump(self, rv: "pd.DataFrame") -> None: """Dump data to the cache as a dataframe. 
:param rv: The dataframe to dump """ rv.to_csv(self.path, sep=self.sep, index=False) pystow-0.5.5/src/pystow/cli.py000066400000000000000000000026761466707254100163720ustar00rootroot00000000000000# -*- coding: utf-8 -*- # flake8: noqa # type: ignore """Command line interface for PyStow.""" import os from typing import Optional, Sequence import click @click.group() def main() -> None: """Run the PyStow CLI.""" @main.command() @click.argument("keys", nargs=-1) @click.option("--name") def join(keys: Sequence[str], name: Optional[str]): """List a directory.""" from . import api click.echo(api.join(*keys, name=name)) @main.command() @click.argument("keys", nargs=-1) def ls(keys: Sequence[str]): """List a directory.""" from . import api directory = api.join(*keys) _ls(directory) @main.command() @click.argument("keys", nargs=-1) @click.option("--url", required=True) @click.option("--name") @click.option("--force", is_flag=True) def ensure(keys: Sequence[str], url: str, name: Optional[str], force: bool): """Ensure a file is downloaded.""" from . 
import api path = api.ensure(*keys, url=url, name=name, force=force) _ls(path.parent) def _ls(directory): command = f"ls -al {directory}" click.secho(f"[pystow] {command}", fg="cyan", bold=True) os.system(command) # noqa:S605 @main.command(name="set") @click.argument("module") @click.argument("key") @click.argument("value") def set_config(module: str, key: str, value: str): """Set a configuration value.""" from .config_api import write_config write_config(module, key, value) if __name__ == "__main__": main() pystow-0.5.5/src/pystow/config_api.py000066400000000000000000000160111466707254100177050ustar00rootroot00000000000000# -*- coding: utf-8 -*- """Configuration handling.""" import os from configparser import ConfigParser from functools import lru_cache from pathlib import Path from textwrap import dedent from typing import Any, Callable, Optional, Type, TypeVar, Union from .utils import getenv_path __all__ = [ "get_config", "write_config", ] X = TypeVar("X") CONFIG_NAME_ENVVAR = "PYSTOW_CONFIG_NAME" CONFIG_HOME_ENVVAR = "PYSTOW_CONFIG_HOME" CONFIG_NAME_DEFAULT = ".config" class ConfigError(ValueError): """Raised when configuration can not be looked up.""" def __init__(self, module: str, key: str): """Initialize the configuration error. :param module: Name of the module, e.g., ``bioportal`` :param key: Name of the key inside the module, e.g., ``api_key`` """ self.module = module self.key = key def __str__(self) -> str: path = get_home().joinpath(self.module).with_suffix(".ini") return dedent( f"""\ Could not look up {self.module}/{self.key} and no default given. This can be solved with one of the following: 1. 
Set the {self.module.upper()}_{self.key.upper()} environment variable - Windows, via GUI: https://www.computerhope.com/issues/ch000549.htm - Windows, via CLI: https://learn.microsoft.com/en-us/windows-server/administration/windows-commands/set_1 - Mac OS: https://apple.stackexchange.com/questions/106778/how-do-i-set-environment-variables-on-os-x - Linux: https://www.freecodecamp.org/news/how-to-set-an-environment-variable-in-linux/ 2. Use the PyStow CLI from the command line to set the configuration like so: $ pystow set {self.module} {self.key} This creates an INI file in {path} with the configuration in the right place. 3. Create/edit an INI file in {path} and manually fill it in by 1) creating a section inside it called [{self.module}] and 2) setting a value for {self.key} = that looks like: # {path} [{self.module}] {self.key} = See https://github.com/cthoyt/pystow#%EF%B8%8F%EF%B8%8F-configuration for more information. """ ) def get_name() -> str: """Get the config home directory name. :returns: The name of the pystow home directory, either loaded from the :data:`CONFIG_NAME_ENVVAR`` environment variable or given by the default value :data:`CONFIG_NAME_DEFAULT`. """ return os.getenv(CONFIG_NAME_ENVVAR, default=CONFIG_NAME_DEFAULT) def get_home(ensure_exists: bool = True) -> Path: """Get the config home directory. :param ensure_exists: If true, ensures the directory is created :returns: A path object representing the pystow home directory, as one of: 1. :data:`CONFIG_HOME_ENVVAR` environment variable or 2. The default directory constructed in the user's home directory plus what's returned by :func:`get_name`. 
""" default = Path.home().joinpath(get_name()).expanduser() return getenv_path(CONFIG_HOME_ENVVAR, default, ensure_exists=ensure_exists) @lru_cache(maxsize=1) def _get_cfp(module: str) -> ConfigParser: cfp = ConfigParser() directory = get_home() # If a multi-part module was given like "zenodo:sandbox", # then only look for the first part "zenodo" as the file name if ":" in module: module = module.split(":", 1)[0] filenames = [ os.path.join(directory, "config.cfg"), os.path.join(directory, "config.ini"), os.path.join(directory, "pystow.cfg"), os.path.join(directory, "pystow.ini"), os.path.join(directory, f"{module}.cfg"), os.path.join(directory, f"{module}.ini"), os.path.join(directory, module, f"{module}.cfg"), os.path.join(directory, module, f"{module}.ini"), os.path.join(directory, module, "conf.ini"), os.path.join(directory, module, "config.ini"), os.path.join(directory, module, "conf.cfg"), os.path.join(directory, module, "config.cfg"), ] cfp.read(filenames) return cfp def get_config( module: str, key: str, *, passthrough: Optional[X] = None, default: Optional[X] = None, dtype: Optional[Type[X]] = None, raise_on_missing: bool = False, ) -> Any: """Get a configuration value. :param module: Name of the module (e.g., ``pybel``) to get configuration for :param key: Name of the key (e.g., ``connection``) :param passthrough: If this is not none, will get returned :param default: If the environment and configuration files don't contain anything, this is returned. :param dtype: The datatype to parse out. Can either be :func:`int`, :func:`float`, :func:`bool`, or :func:`str`. If none, defaults to :func:`str`. :param raise_on_missing: If true, will raise a value error if no data is found and no default is given :returns: The config value or the default. 
:raises ConfigError: If ``raise_on_missing`` conditions are met """ if passthrough is not None: return _cast(passthrough, dtype) rv = os.getenv(f"{module.upper()}_{key.upper()}") if rv is not None: return _cast(rv, dtype) rv = _get_cfp(module).get(module, key, fallback=None) if rv is None: if default is None and raise_on_missing: raise ConfigError(module=module, key=key) return default return _cast(rv, dtype) def _cast(rv: Any, dtype: Union[None, Callable[..., Any]]) -> Any: if not isinstance(rv, str): # if it's not a string, it doesn't need munging return rv if dtype in (None, str): # no munging necessary return rv if dtype in (int, float): return dtype(rv) if dtype is bool: if rv.lower() in ("t", "true", "yes", "1", 1, True): return True elif rv.lower() in ("f", "false", "no", "0", 0, False): return False else: raise ValueError(f"value can not be coerced into bool: {rv}") raise TypeError(f"dtype is invalid: {dtype}") def write_config(module: str, key: str, value: str) -> None: """Write a configuration value. :param module: The name of the app (e.g., ``indra``) :param key: The key of the configuration in the app :param value: The value of the configuration in the app """ _get_cfp.cache_clear() cfp = ConfigParser() # If there's a multi-part module such as "zenodo:sandbox", # then write to zenodo.ini with section [zenodo:sandbox] fname = module.split(":", 1)[0] if ":" in module else module path = get_home().joinpath(fname).with_suffix(".ini") cfp.read(path) # If the file did not exist, then this section will be empty # and running set() would raise a configparser.NoSectionError. 
if not cfp.has_section(module): cfp.add_section(module) # Note that the section duplicates the file name cfp.set(section=module, option=key, value=value) with path.open("w") as file: cfp.write(file) pystow-0.5.5/src/pystow/constants.py000066400000000000000000000042371466707254100176320ustar00rootroot00000000000000# -*- coding: utf-8 -*- """PyStow constants.""" from io import StringIO from textwrap import dedent from typing import IO, Any, Callable, Generator __all__ = [ "PYSTOW_NAME_ENVVAR", "PYSTOW_HOME_ENVVAR", "PYSTOW_USE_APPDIRS", "PYSTOW_NAME_DEFAULT", "README_TEXT", "Opener", "JSON", "Provider", ] PYSTOW_NAME_ENVVAR = "PYSTOW_NAME" PYSTOW_HOME_ENVVAR = "PYSTOW_HOME" PYSTOW_USE_APPDIRS = "PYSTOW_USE_APPDIRS" PYSTOW_NAME_DEFAULT = ".data" README_TEXT = dedent( """\ # PyStow Data Directory This directory is used by [`pystow`](https://github.com/cthoyt/pystow) as a reproducible location to store and access data. ### ⚙️️ Configuration By default, data is stored in the `$HOME/.data` directory. By default, the `` app will create the `$HOME/.data/` folder. If you want to use an alternate folder name to `.data` inside the home directory, you can set the `PYSTOW_NAME` environment variable. For example, if you set `PYSTOW_NAME=mydata`, then the following code for the `pykeen` app will create the `$HOME/mydata/pykeen/` directory: ```python import os import pystow # Only for demonstration purposes. You should set environment # variables either with your .bashrc or in the command line REPL. os.environ['PYSTOW_NAME'] = 'mydata' # Get a directory (as a pathlib.Path) for ~/mydata/pykeen pykeen_directory = pystow.join('pykeen') ``` If you want to specify a completely custom directory that isn't relative to your home directory, you can set the `PYSTOW_HOME` environment variable. 
For example, if you set `PYSTOW_HOME=/usr/local/`, then the following code for the `pykeen` app will create the `/usr/local/pykeen/` directory: ```python import os import pystow # Only for demonstration purposes. You should set environment # variables either with your .bashrc or in the command line REPL. os.environ['PYSTOW_HOME'] = '/usr/local/' # Get a directory (as a pathlib.Path) for /usr/local/pykeen pykeen_directory = pystow.join('pykeen') ``` Note: if you set `PYSTOW_HOME`, then `PYSTOW_NAME` is disregarded. """ ) Opener = Generator[StringIO, None, None] BytesOpener = Generator[IO[bytes], None, None] JSON = Any Provider = Callable[..., None] pystow-0.5.5/src/pystow/impl.py000066400000000000000000001616411466707254100165620ustar00rootroot00000000000000# -*- coding: utf-8 -*- """Module implementation.""" import bz2 import gzip import json import logging import lzma import sqlite3 import tarfile import zipfile from contextlib import closing, contextmanager from pathlib import Path from typing import ( TYPE_CHECKING, Any, Dict, Generator, Mapping, Optional, Sequence, Union, ) from . import utils from .constants import JSON, BytesOpener, Opener, Provider from .utils import ( base_from_gzip_name, download_from_google, download_from_s3, get_base, gunzip, mkdir, name_from_s3_key, name_from_url, path_to_sqlite, read_rdf, read_tarfile_csv, read_tarfile_xml, read_zip_np, read_zipfile_csv, ) try: import pickle5 as pickle except ImportError: import pickle if TYPE_CHECKING: import botocore.client import lxml.etree import numpy import pandas as pd import rdflib __all__ = ["Module"] logger = logging.getLogger(__name__) class Module: """The class wrapping the directory lookup implementation.""" def __init__(self, base: Union[str, Path], ensure_exists: bool = True) -> None: """Initialize the module. :param base: The base directory for the module :param ensure_exists: Should the base directory be created automatically? Defaults to true. 
""" self.base = Path(base) mkdir(self.base, ensure_exists=ensure_exists) @classmethod def from_key(cls, key: str, *subkeys: str, ensure_exists: bool = True) -> "Module": """Get a module for the given directory or one of its subdirectories. :param key: The name of the module. No funny characters. The envvar _HOME where key is uppercased is checked first before using the default home directory. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param ensure_exists: Should all directories be created automatically? Defaults to true. :return: A module """ base = get_base(key, ensure_exists=False) rv = cls(base=base, ensure_exists=ensure_exists) if subkeys: rv = rv.module(*subkeys, ensure_exists=ensure_exists) return rv def module(self, *subkeys: str, ensure_exists: bool = True) -> "Module": """Get a module for a subdirectory of the current module. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param ensure_exists: Should all directories be created automatically? Defaults to true. :return: A module representing the subdirectory based on the given ``subkeys``. """ base = self.join(*subkeys, ensure_exists=False) return Module(base=base, ensure_exists=ensure_exists) def join( self, *subkeys: str, name: Optional[str] = None, ensure_exists: bool = True, ) -> Path: """Get a subdirectory of the current module. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param ensure_exists: Should all directories be created automatically? Defaults to true. :param name: The name of the file (optional) inside the folder :return: The path of the directory or subdirectory for the given module. 
""" rv = self.base if subkeys: rv = rv.joinpath(*subkeys) mkdir(rv, ensure_exists=ensure_exists) if name: rv = rv.joinpath(name) return rv def joinpath_sqlite(self, *subkeys: str, name: str) -> str: """Get an SQLite database connection string. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param name: The name of the database file. :return: A SQLite path string. """ path = self.join(*subkeys, name=name, ensure_exists=True) return path_to_sqlite(path) def ensure( self, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, ) -> Path: """Ensure a file is downloaded. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :return: The path of the file that has been downloaded (or already exists) """ if name is None: name = name_from_url(url) path = self.join(*subkeys, name=name, ensure_exists=True) utils.download( url=url, path=path, force=force, **(download_kwargs or {}), ) return path def ensure_custom( self, *subkeys: str, name: str, force: bool = False, provider: Provider, **kwargs: Any, ) -> Path: """Ensure a file is present, and run a custom create function otherwise. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param name: The file name. :param force: Should the file be re-created, even if the path already exists? :param provider: The file provider. 
Will be run with the path as the first positional argument, if the file needs to be generated. :param kwargs: Additional keyword-based parameters passed to the provider. :raises ValueError: If the provider was called but the file was not created by it. :return: The path of the file that has been created (or already exists) """ path = self.join(*subkeys, name=name, ensure_exists=True) if path.is_file() and not force: return path provider(path, **kwargs) if not path.is_file(): raise ValueError(f"Provider {provider} did not create the file at {path}!") return path def ensure_untar( self, *subkeys: str, url: str, name: Optional[str] = None, directory: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, extract_kwargs: Optional[Mapping[str, Any]] = None, ) -> Path: """Ensure a tar file is downloaded and unarchived. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param directory: Overrides the name of the directory into which the tar archive is extracted. If none given, will use the stem of the file name that gets downloaded. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param extract_kwargs: Keyword arguments to pass to :meth:`tarfile.TarFile.extract_all`. 
:return: The path of the directory where the file that has been downloaded gets extracted to """ path = self.ensure( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs ) if directory is None: # rhea-rxn.tar.gz -> rhea-rxn suffixes_len = sum(len(suffix) for suffix in path.suffixes) directory = path.name[:-suffixes_len] unzipped_path = path.parent.joinpath(directory) if unzipped_path.is_dir() and not force: return unzipped_path unzipped_path.mkdir(exist_ok=True, parents=True) with tarfile.open(path) as tar_file: tar_file.extractall(unzipped_path, **(extract_kwargs or {})) # noqa:S202 return unzipped_path def ensure_gunzip( self, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, autoclean: bool = True, download_kwargs: Optional[Mapping[str, Any]] = None, ) -> Path: """Ensure a tar.gz file is downloaded and unarchived. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param autoclean: Should the zipped file be deleted? :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. 
:return: The path of the directory where the file that has been downloaded gets extracted to """ if name is None: name = name_from_url(url) gunzipped_name = base_from_gzip_name(name) gunzipped_path = self.join(*subkeys, name=gunzipped_name, ensure_exists=True) if gunzipped_path.is_file() and not force: return gunzipped_path path = self.ensure( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs, ) gunzip(path, gunzipped_path) if autoclean: logger.info("removing original gzipped file %s", path) path.unlink() return gunzipped_path @contextmanager def ensure_open( self, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, mode: str = "r", open_kwargs: Optional[Mapping[str, Any]] = None, ) -> Opener: """Ensure a file is downloaded and open it. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param mode: The read mode, passed to :func:`open` :param open_kwargs: Additional keyword arguments passed to :func:`open` :yields: An open file object """ path = self.ensure( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs ) open_kwargs = {} if open_kwargs is None else dict(open_kwargs) open_kwargs.setdefault("mode", mode) with path.open(**open_kwargs) as file: yield file @contextmanager def open( self, *subkeys: str, name: str, mode: str = "r", open_kwargs: Optional[Mapping[str, Any]] = None, ensure_exists: bool = False, ) -> Opener: """Open a file that exists already. :param subkeys: A sequence of additional strings to join. 
If none are given, returns the directory for this module. :param name: The name of the file to open :param mode: The read mode, passed to :func:`open` :param open_kwargs: Additional keyword arguments passed to :func:`open` :param ensure_exists: Should the file be made? Set to true on write operations. :yields: An open file object """ path = self.join(*subkeys, name=name, ensure_exists=ensure_exists) open_kwargs = {} if open_kwargs is None else dict(open_kwargs) open_kwargs.setdefault("mode", mode) with path.open(**open_kwargs) as file: yield file @contextmanager def open_gz( self, *subkeys: str, name: str, mode: str = "rt", open_kwargs: Optional[Mapping[str, Any]] = None, ensure_exists: bool = False, ) -> Opener: """Open a gzipped file that exists already. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param name: The name of the file to open :param mode: The read mode, passed to :func:`gzip.open` :param open_kwargs: Additional keyword arguments passed to :func:`gzip.open` :param ensure_exists: Should the file be made? Set to true on write operations. :yields: An open file object """ path = self.join(*subkeys, name=name, ensure_exists=ensure_exists) open_kwargs = {} if open_kwargs is None else dict(open_kwargs) open_kwargs.setdefault("mode", mode) with gzip.open(path, **open_kwargs) as file: yield file @contextmanager def ensure_open_lzma( self, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, mode: str = "rt", open_kwargs: Optional[Mapping[str, Any]] = None, ) -> Opener: """Ensure a LZMA-compressed file is downloaded and open a file inside it. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. 
Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param mode: The read mode, passed to :func:`lzma.open` :param open_kwargs: Additional keyword arguments passed to :func:`lzma.open` :yields: An open file object """ path = self.ensure( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs ) open_kwargs = {} if open_kwargs is None else dict(open_kwargs) open_kwargs.setdefault("mode", mode) with lzma.open(path, **open_kwargs) as file: yield file @contextmanager def ensure_open_tarfile( self, *subkeys: str, url: str, inner_path: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, mode: str = "r", open_kwargs: Optional[Mapping[str, Any]] = None, ) -> BytesOpener: """Ensure a tar file is downloaded and open a file inside it. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param inner_path: The relative path to the file inside the archive :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. 
:param mode: The read mode, passed to :func:`tarfile.open` :param open_kwargs: Additional keyword arguments passed to :func:`tarfile.open` :yields: An open file object """ path = self.ensure( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs ) open_kwargs = {} if open_kwargs is None else dict(open_kwargs) open_kwargs.setdefault("mode", mode) with tarfile.open(path, **open_kwargs) as tar_file: with tar_file.extractfile(inner_path) as file: # type:ignore yield file @contextmanager def ensure_open_zip( self, *subkeys: str, url: str, inner_path: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, mode: str = "r", open_kwargs: Optional[Mapping[str, Any]] = None, ) -> BytesOpener: """Ensure a file is downloaded then open it with :mod:`zipfile`. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param inner_path: The relative path to the file inside the archive :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. 
:param mode: The read mode, passed to :func:`zipfile.open` :param open_kwargs: Additional keyword arguments passed to :func:`zipfile.open` :yields: An open file object """ path = self.ensure( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs ) open_kwargs = {} if open_kwargs is None else dict(open_kwargs) open_kwargs.setdefault("mode", mode) with zipfile.ZipFile(file=path) as zip_file: with zip_file.open(inner_path) as file: yield file @contextmanager def ensure_open_gz( self, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, mode: str = "rb", open_kwargs: Optional[Mapping[str, Any]] = None, ) -> Opener: """Ensure a gzipped file is downloaded and open a file inside it. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. 
:param mode: The read mode, passed to :func:`gzip.open` :param open_kwargs: Additional keyword arguments passed to :func:`gzip.open` :yields: An open file object """ path = self.ensure( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs ) open_kwargs = {} if open_kwargs is None else dict(open_kwargs) open_kwargs.setdefault("mode", mode) with gzip.open(path, **open_kwargs) as file: yield file @contextmanager def ensure_open_bz2( self, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, mode: str = "rb", open_kwargs: Optional[Mapping[str, Any]] = None, ) -> Opener: """Ensure a BZ2-compressed file is downloaded and open a file inside it. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param mode: The read mode, passed to :func:`bz2.open` :param open_kwargs: Additional keyword arguments passed to :func:`bz2.open` :yields: An open file object """ path = self.ensure( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs ) open_kwargs = {} if open_kwargs is None else dict(open_kwargs) open_kwargs.setdefault("mode", mode) with bz2.open(path, **open_kwargs) as file: yield file def ensure_csv( self, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, read_csv_kwargs: Optional[Mapping[str, Any]] = None, ) -> "pd.DataFrame": """Download a CSV and open as a dataframe with :mod:`pandas`. :param subkeys: A sequence of additional strings to join. 
If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param read_csv_kwargs: Keyword arguments to pass through to :func:`pandas.read_csv`. :return: A pandas DataFrame :rtype: pandas.DataFrame """ import pandas as pd path = self.ensure( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs ) return pd.read_csv(path, **_clean_csv_kwargs(read_csv_kwargs)) def load_df( self, *subkeys: str, name: str, read_csv_kwargs: Optional[Mapping[str, Any]] = None, ) -> "pd.DataFrame": """Open a pre-existing CSV as a dataframe with :mod:`pandas`. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param read_csv_kwargs: Keyword arguments to pass through to :func:`pandas.read_csv`. :return: A pandas DataFrame """ import pandas as pd with self.open(*subkeys, name=name) as file: return pd.read_csv(file, **_clean_csv_kwargs(read_csv_kwargs)) def dump_df( self, *subkeys: str, name: str, obj: "pd.DataFrame", sep: str = "\t", index: bool = False, to_csv_kwargs: Optional[Mapping[str, Any]] = None, ) -> None: """Dump a dataframe to a TSV file with :mod:`pandas`. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. 
:param obj: The dataframe to dump :param sep: The separator to use, defaults to a tab :param index: Should the index be dumped? Defaults to false. :param to_csv_kwargs: Keyword arguments to pass through to :meth:`pandas.DataFrame.to_csv`. """ to_csv_kwargs = {} if to_csv_kwargs is None else dict(to_csv_kwargs) to_csv_kwargs.setdefault("sep", sep) to_csv_kwargs.setdefault("index", index) # should this use unified opener instead? Pandas is pretty smart... path = self.join(*subkeys, name=name) obj.to_csv(path, **to_csv_kwargs) def ensure_json( self, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, open_kwargs: Optional[Mapping[str, Any]] = None, json_load_kwargs: Optional[Mapping[str, Any]] = None, ) -> JSON: """Download JSON and open with :mod:`json`. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param open_kwargs: Additional keyword arguments passed to :func:`open` :param json_load_kwargs: Keyword arguments to pass through to :func:`json.load`. :returns: A JSON object (list, dict, etc.) 
""" with self.ensure_open( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs, open_kwargs=open_kwargs, ) as file: return json.load(file, **(json_load_kwargs or {})) def ensure_json_bz2( self, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, open_kwargs: Optional[Mapping[str, Any]] = None, json_load_kwargs: Optional[Mapping[str, Any]] = None, ) -> JSON: """Download BZ2-compressed JSON and open with :mod:`json`. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param open_kwargs: Additional keyword arguments passed to :func:`bz2.open` :param json_load_kwargs: Keyword arguments to pass through to :func:`json.load`. :returns: A JSON object (list, dict, etc.) """ with self.ensure_open_bz2( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs, open_kwargs=open_kwargs, ) as file: return json.load(file, **(json_load_kwargs or {})) def load_json( self, *subkeys: str, name: str, open_kwargs: Optional[Mapping[str, Any]] = None, json_load_kwargs: Optional[Mapping[str, Any]] = None, ) -> JSON: """Open a JSON file :mod:`json`. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param name: The name of the file to open :param open_kwargs: Additional keyword arguments passed to :func:`open` :param json_load_kwargs: Keyword arguments to pass through to :func:`json.load`. :returns: A JSON object (list, dict, etc.) 
""" with self.open( *subkeys, name=name, mode="r", open_kwargs=open_kwargs, ensure_exists=True ) as file: return json.load(file, **(json_load_kwargs or {})) def dump_json( self, *subkeys: str, name: str, obj: JSON, open_kwargs: Optional[Mapping[str, Any]] = None, json_dump_kwargs: Optional[Mapping[str, Any]] = None, ) -> None: """Dump an object to a file with :mod:`json`. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param name: The name of the file to open :param obj: The object to dump :param open_kwargs: Additional keyword arguments passed to :func:`open` :param json_dump_kwargs: Keyword arguments to pass through to :func:`json.dump`. """ with self.open( *subkeys, name=name, mode="w", open_kwargs=open_kwargs, ensure_exists=True ) as file: json.dump(obj, file, **(json_dump_kwargs or {})) def ensure_pickle( self, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, mode: str = "rb", open_kwargs: Optional[Mapping[str, Any]] = None, pickle_load_kwargs: Optional[Mapping[str, Any]] = None, ) -> Any: """Download a pickle file and open with :mod:`pickle`. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param mode: The read mode, passed to :func:`open` :param open_kwargs: Additional keyword arguments passed to :func:`open` :param pickle_load_kwargs: Keyword arguments to pass through to :func:`pickle.load`. 
:returns: Any object """ with self.ensure_open( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs, mode=mode, open_kwargs=open_kwargs, ) as file: return pickle.load(file, **(pickle_load_kwargs or {})) def load_pickle( self, *subkeys: str, name: str, mode: str = "rb", open_kwargs: Optional[Mapping[str, Any]] = None, pickle_load_kwargs: Optional[Mapping[str, Any]] = None, ) -> Any: """Open a pickle file with :mod:`pickle`. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param name: The name of the file to open :param mode: The read mode, passed to :func:`open` :param open_kwargs: Additional keyword arguments passed to :func:`open` :param pickle_load_kwargs: Keyword arguments to pass through to :func:`pickle.load`. :returns: Any object """ with self.open( *subkeys, name=name, mode=mode, open_kwargs=open_kwargs, ) as file: return pickle.load(file, **(pickle_load_kwargs or {})) def dump_pickle( self, *subkeys: str, name: str, obj: Any, mode: str = "wb", open_kwargs: Optional[Mapping[str, Any]] = None, pickle_dump_kwargs: Optional[Mapping[str, Any]] = None, ) -> None: """Dump an object to a file with :mod:`pickle`. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param name: The name of the file to open :param obj: The object to dump :param mode: The read mode, passed to :func:`open` :param open_kwargs: Additional keyword arguments passed to :func:`open` :param pickle_dump_kwargs: Keyword arguments to pass through to :func:`pickle.dump`. 
""" with self.open( *subkeys, name=name, mode=mode, open_kwargs=open_kwargs, ) as file: pickle.dump(obj, file, **(pickle_dump_kwargs or {})) def ensure_pickle_gz( self, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, mode: str = "rb", open_kwargs: Optional[Mapping[str, Any]] = None, pickle_load_kwargs: Optional[Mapping[str, Any]] = None, ) -> Any: """Download a gzipped pickle file and open with :mod:`pickle`. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param mode: The read mode, passed to :func:`gzip.open` :param open_kwargs: Additional keyword arguments passed to :func:`gzip.open` :param pickle_load_kwargs: Keyword arguments to pass through to :func:`pickle.load`. :returns: Any object """ with self.ensure_open_gz( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs, mode=mode, open_kwargs=open_kwargs, ) as file: return pickle.load(file, **(pickle_load_kwargs or {})) def load_pickle_gz( self, *subkeys: str, name: str, mode: str = "rb", open_kwargs: Optional[Mapping[str, Any]] = None, pickle_load_kwargs: Optional[Mapping[str, Any]] = None, ) -> Any: """Open a gzipped pickle file with :mod:`pickle`. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. 
:param name: The name of the file to open :param mode: The read mode, passed to :func:`open` :param open_kwargs: Additional keyword arguments passed to :func:`gzip.open` :param pickle_load_kwargs: Keyword arguments to pass through to :func:`pickle.load`. :returns: Any object """ with self.open_gz( *subkeys, name=name, mode=mode, open_kwargs=open_kwargs, ) as file: return pickle.load(file, **(pickle_load_kwargs or {})) def ensure_excel( self, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, read_excel_kwargs: Optional[Mapping[str, Any]] = None, ) -> "pd.DataFrame": """Download an excel file and open as a dataframe with :mod:`pandas`. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param read_excel_kwargs: Keyword arguments to pass through to :func:`pandas.read_excel`. :return: A pandas DataFrame """ import pandas as pd path = self.ensure( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs ) return pd.read_excel(path, **(read_excel_kwargs or {})) def ensure_tar_df( self, *subkeys: str, url: str, inner_path: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, read_csv_kwargs: Optional[Mapping[str, Any]] = None, ) -> "pd.DataFrame": """Download a tar file and open an inner file as a dataframe with :mod:`pandas`. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. 
:param inner_path: The relative path to the file inside the archive :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param read_csv_kwargs: Keyword arguments to pass through to :func:`pandas.read_csv`. :returns: A dataframe .. warning:: If you have lots of files to read in the same archive, it's better just to unzip first. """ path = self.ensure( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs ) return read_tarfile_csv( path=path, inner_path=inner_path, **_clean_csv_kwargs(read_csv_kwargs) ) def ensure_xml( self, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, parse_kwargs: Optional[Mapping[str, Any]] = None, ) -> "lxml.etree.ElementTree": """Download an XML file and open it with :mod:`lxml`. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param parse_kwargs: Keyword arguments to pass through to :func:`lxml.etree.parse`. :returns: An ElementTree object .. warning:: If you have lots of files to read in the same archive, it's better just to unzip first. 
""" from lxml import etree path = self.ensure( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs ) return etree.parse(path, **(parse_kwargs or {})) def load_xml( self, *subkeys: str, name: str, parse_kwargs: Optional[Mapping[str, Any]] = None, ) -> "lxml.etree.ElementTree": """Load an XML file with :mod:`lxml`. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param name: The name of the file to open :param parse_kwargs: Keyword arguments to pass through to :func:`lxml.etree.parse`. :returns: An ElementTree object .. warning:: If you have lots of files to read in the same archive, it's better just to unzip first. """ from lxml import etree with self.open(*subkeys, name=name, ensure_exists=False) as file: return etree.parse(file, **(parse_kwargs or {})) def dump_xml( self, *subkeys: str, name: str, obj: "lxml.etree.ElementTree", open_kwargs: Optional[Mapping[str, Any]] = None, write_kwargs: Optional[Mapping[str, Any]] = None, ) -> None: """Dump an XML element tree to a file with :mod:`lxml`. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param name: The name of the file to open :param obj: The object to dump :param open_kwargs: Additional keyword arguments passed to :func:`open` :param write_kwargs: Keyword arguments to pass through to :func:`lxml.etree.ElementTree.write`. """ with self.open( *subkeys, name=name, mode="wb", open_kwargs=open_kwargs, ensure_exists=True ) as file: obj.write(file, **(write_kwargs or {})) def ensure_tar_xml( self, *subkeys: str, url: str, inner_path: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, parse_kwargs: Optional[Mapping[str, Any]] = None, ) -> "lxml.etree.ElementTree": """Download a tar file and open an inner file as an XML with :mod:`lxml`. :param subkeys: A sequence of additional strings to join. 
If none are given, returns the directory for this module. :param url: The URL to download. :param inner_path: The relative path to the file inside the archive :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param parse_kwargs: Keyword arguments to pass through to :func:`lxml.etree.parse`. :returns: An ElementTree object .. warning:: If you have lots of files to read in the same archive, it's better just to unzip first. """ path = self.ensure( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs ) return read_tarfile_xml(path=path, inner_path=inner_path, **(parse_kwargs or {})) def ensure_zip_df( self, *subkeys: str, url: str, inner_path: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, read_csv_kwargs: Optional[Mapping[str, Any]] = None, ) -> "pd.DataFrame": """Download a zip file and open an inner file as a dataframe with :mod:`pandas`. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param inner_path: The relative path to the file inside the archive :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param read_csv_kwargs: Keyword arguments to pass through to :func:`pandas.read_csv`. 
:return: A pandas DataFrame :rtype: pandas.DataFrame """ path = self.ensure( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs ) return read_zipfile_csv( path=path, inner_path=inner_path, **_clean_csv_kwargs(read_csv_kwargs) ) def ensure_zip_np( self, *subkeys: str, url: str, inner_path: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, load_kwargs: Optional[Mapping[str, Any]] = None, ) -> "numpy.typing.ArrayLike": """Download a zip file and open an inner file as an array-like with :mod:`numpy`. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param inner_path: The relative path to the file inside the archive :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param load_kwargs: Additional keyword arguments that are passed through to :func:`read_zip_np` and transitively to :func:`numpy.load`. :returns: An array-like object :rtype: numpy.typing.ArrayLike """ path = self.ensure( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs ) return read_zip_np(path=path, inner_path=inner_path, **(load_kwargs or {})) def ensure_rdf( self, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, precache: bool = True, parse_kwargs: Optional[Mapping[str, Any]] = None, ) -> "rdflib.Graph": """Download a RDF file and open with :mod:`rdflib`. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. 
:param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :param precache: Should the parsed :class:`rdflib.Graph` be stored as a pickle for fast loading? :param parse_kwargs: Keyword arguments to pass through to :func:`pystow.utils.read_rdf` and transitively to :func:`rdflib.Graph.parse`. :return: An RDF graph :rtype: rdflib.Graph """ path = self.ensure( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs ) if not precache: return read_rdf(path=path, **(parse_kwargs or {})) cache_path = path.with_suffix(path.suffix + ".pickle.gz") if cache_path.exists() and not force: with gzip.open(cache_path, "rb") as file: return pickle.load(file) rv = read_rdf(path=path, **(parse_kwargs or {})) with gzip.open(cache_path, "wb") as file: pickle.dump(rv, file, protocol=pickle.HIGHEST_PROTOCOL) return rv def load_rdf( self, *subkeys: str, name: Optional[str] = None, parse_kwargs: Optional[Mapping[str, Any]] = None, ) -> "rdflib.Graph": """Open an RDF file with :mod:`rdflib`. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param name: The name of the file to open :param parse_kwargs: Keyword arguments to pass through to :func:`pystow.utils.read_rdf` and transitively to :func:`rdflib.Graph.parse`. :return: An RDF graph """ path = self.join(*subkeys, name=name, ensure_exists=False) return read_rdf(path=path, **(parse_kwargs or {})) def dump_rdf( self, *subkeys: str, name: str, obj: "rdflib.Graph", format: str = "turtle", serialize_kwargs: Optional[Mapping[str, Any]] = None, ) -> None: """Dump an RDF graph to a file with :mod:`rdflib`. :param subkeys: A sequence of additional strings to join. 
If none are given, returns the directory for this module. :param name: The name of the file to open :param obj: The object to dump :param format: The format to dump in :param serialize_kwargs: Keyword arguments to through to :func:`rdflib.Graph.serialize`. """ path = self.join(*subkeys, name=name, ensure_exists=False) serialize_kwargs = {} if serialize_kwargs is None else dict(serialize_kwargs) serialize_kwargs.setdefault("format", format) obj.serialize(path, **serialize_kwargs) def ensure_from_s3( self, *subkeys: str, s3_bucket: str, s3_key: Union[str, Sequence[str]], name: Optional[str] = None, client: Optional["botocore.client.BaseClient"] = None, client_kwargs: Optional[Mapping[str, Any]] = None, download_file_kwargs: Optional[Mapping[str, Any]] = None, force: bool = False, ) -> Path: """Ensure a file is downloaded. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param s3_bucket: The S3 bucket name :param s3_key: The S3 key name :param name: Overrides the name of the file at the end of the S3 key, if given. :param client: A botocore client. If none given, one will be created automatically :param client_kwargs: Keyword arguments to be passed to the client on instantiation. :param download_file_kwargs: Keyword arguments to be passed to :func:`boto3.s3.transfer.S3Transfer.download_file` :param force: Should the download be done again, even if the path already exists? Defaults to false. 
:return: The path of the file that has been downloaded (or already exists) """ if not isinstance(s3_key, str): s3_key = "/".join(s3_key) # join sequence if name is None: name = name_from_s3_key(s3_key) path = self.join(*subkeys, name=name, ensure_exists=True) download_from_s3( s3_bucket=s3_bucket, s3_key=s3_key, path=path, client=client, client_kwargs=client_kwargs, force=force, download_file_kwargs=download_file_kwargs, ) return path def ensure_from_google( self, *subkeys: str, name: str, file_id: str, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, ) -> Path: """Ensure a file is downloaded from Google Drive. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param name: The name of the file :param file_id: The file identifier of the google file. If your share link is https://drive.google.com/file/d/1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z/view, then your file id is ``1AsPPU4ka1Rc9u-XYMGWtvV65hF3egi0z``. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download_from_google`. :return: The path of the file that has been downloaded (or already exists) """ path = self.join(*subkeys, name=name, ensure_exists=True) download_from_google(file_id, path, force=force, **(download_kwargs or {})) return path @contextmanager def ensure_open_sqlite( self, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, ) -> Generator[sqlite3.Connection, None, None]: """Ensure and connect to a SQLite database. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. 
:param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. :yields: An instance of :class:`sqlite3.Connection` from :func:`sqlite3.connect` Example usage: >>> import pystow >>> import pandas as pd >>> url = "https://s3.amazonaws.com/bbop-sqlite/hp.db" >>> sql = "SELECT * FROM entailed_edge LIMIT 10" >>> module = pystow.module("test") >>> with module.ensure_open_sqlite(url=url) as conn: >>> df = pd.read_sql(sql, conn) """ path = self.ensure( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs ) with closing(sqlite3.connect(path.as_posix())) as conn: yield conn @contextmanager def ensure_open_sqlite_gz( self, *subkeys: str, url: str, name: Optional[str] = None, force: bool = False, download_kwargs: Optional[Mapping[str, Any]] = None, ) -> Generator[sqlite3.Connection, None, None]: """Ensure and connect to a SQLite database that's gzipped. Unfortunately, it's a paid feature to directly read gzipped sqlite files, so this automatically gunzips it first. :param subkeys: A sequence of additional strings to join. If none are given, returns the directory for this module. :param url: The URL to download. :param name: Overrides the name of the file at the end of the URL, if given. Also useful for URLs that don't have proper filenames with extensions. :param force: Should the download be done again, even if the path already exists? Defaults to false. :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`. 
:yields: An instance of :class:`sqlite3.Connection` from :func:`sqlite3.connect` Example usage: >>> import pystow >>> import pandas as pd >>> url = "https://s3.amazonaws.com/bbop-sqlite/hp.db.gz" >>> module = pystow.module("test") >>> sql = "SELECT * FROM entailed_edge LIMIT 10" >>> with module.ensure_open_sqlite_gz(url=url) as conn: >>> df = pd.read_sql(sql, conn) """ path = self.ensure_gunzip( *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs ) with closing(sqlite3.connect(path.as_posix())) as conn: yield conn def _clean_csv_kwargs(read_csv_kwargs: Union[None, Mapping[str, Any]]) -> Dict[str, Any]: read_csv_kwargs = {} if read_csv_kwargs is None else dict(read_csv_kwargs) read_csv_kwargs.setdefault("sep", "\t") return read_csv_kwargs pystow-0.5.5/src/pystow/py.typed000066400000000000000000000000001466707254100167230ustar00rootroot00000000000000pystow-0.5.5/src/pystow/utils.py000066400000000000000000001070031466707254100167510ustar00rootroot00000000000000# -*- coding: utf-8 -*- """Utilities.""" import contextlib import gzip import hashlib import logging import lzma import os import pickle import shutil import tarfile import tempfile import zipfile from collections import namedtuple from functools import partial from io import BytesIO, StringIO from pathlib import Path, PurePosixPath from subprocess import check_output # noqa: S404 from typing import ( TYPE_CHECKING, Any, Collection, Iterable, Iterator, Mapping, Optional, Union, ) from urllib.parse import urlparse from urllib.request import urlretrieve from uuid import uuid4 import requests from tqdm.auto import tqdm from .constants import ( PYSTOW_HOME_ENVVAR, PYSTOW_NAME_DEFAULT, PYSTOW_NAME_ENVVAR, PYSTOW_USE_APPDIRS, README_TEXT, ) if TYPE_CHECKING: import botocore.client import lxml.etree import numpy.typing import pandas import rdflib __all__ = [ # Data Structures "HexDigestMismatch", # Exceptions "HexDigestError", "UnexpectedDirectory", # Functions "get_offending_hexdigests", 
"get_hashes", "raise_on_digest_mismatch", "get_hexdigests_remote", "download", "name_from_url", "name_from_s3_key", "mkdir", "mock_envvar", "mock_home", "getenv_path", "n", # Bytes generators "get_df_io", "get_np_io", # LZMA utilities "write_lzma_csv", "gunzip", # Zipfile utilities "write_zipfile_csv", "read_zipfile_csv", "write_zipfile_xml", "read_zipfile_xml", "write_zipfile_np", "read_zip_np", "read_zipfile_rdf", # Tarfile utilities "write_tarfile_csv", "read_tarfile_csv", "read_tarfile_xml", # GZ utilities "write_pickle_gz", # Standard readers "read_rdf", # Downloaders "download_from_google", "download_from_s3", # Misc "get_commit", "get_home", "get_name", "get_base", "path_to_sqlite", ] logger = logging.getLogger(__name__) # Since we're python 3.6 compatible, we can't do from __future__ import annotations and use hashlib._Hash Hash = Any HexDigestMismatch = namedtuple("HexDigestMismatch", "name actual expected") class HexDigestError(ValueError): """Thrown if the hashsums do not match expected hashsums.""" def __init__(self, offending_hexdigests: Collection[HexDigestMismatch]): """Instantiate the exception. :param offending_hexdigests: The result from :func:`get_offending_hexdigests` """ self.offending_hexdigests = offending_hexdigests def __str__(self) -> str: # noqa:D105 return "\n".join( ( "Hexdigest of downloaded file does not match the expected ones!", *( f"\t{name} actual: {actual} vs. expected: {expected}" for name, actual, expected in self.offending_hexdigests ), ) ) class UnexpectedDirectory(FileExistsError): """Thrown if a directory path is given where file path should have been.""" def __init__(self, path: Path): """Instantiate the exception. :param path: The path to a directory that should have been a file. 
""" self.path = path def __str__(self) -> str: # noqa:D105 return f"got directory instead of file: {self.path}" def get_hexdigests_remote( hexdigests_remote: Optional[Mapping[str, str]], hexdigests_strict: bool = False ) -> Mapping[str, str]: """Process hexdigests via URLs. :param hexdigests_remote: The expected hexdigests as (algorithm_name, url to file with expected hex digest) pairs. :param hexdigests_strict: Set this to alse to stop automatically checking for the `algorithm(filename)=hash` format :returns: A mapping of algorithms to hexdigests """ rv = {} for key, url in (hexdigests_remote or {}).items(): text = requests.get(url, timeout=15).text if not hexdigests_strict and "=" in text: text = text.rsplit("=", 1)[-1].strip() rv[key] = text return rv def get_offending_hexdigests( path: Union[str, Path], chunk_size: Optional[int] = None, hexdigests: Optional[Mapping[str, str]] = None, hexdigests_remote: Optional[Mapping[str, str]] = None, hexdigests_strict: bool = False, ) -> Collection[HexDigestMismatch]: """ Check a file for hash sums. :param path: The file path. :param chunk_size: The chunk size for reading the file. :param hexdigests: The expected hexdigests as (algorithm_name, expected_hex_digest) pairs. :param hexdigests_remote: The expected hexdigests as (algorithm_name, url to file with expected hexdigest) pairs. :param hexdigests_strict: Set this to false to stop automatically checking for the `algorithm(filename)=hash` format :return: A collection of observed / expected hexdigests where the digests do not match. 
""" hexdigests = dict( **(hexdigests or {}), **get_hexdigests_remote(hexdigests_remote, hexdigests_strict=hexdigests_strict), ) # If there aren't any keys in the combine dictionaries, # then there won't be any mismatches if not hexdigests: return [] logger.info(f"Checking hash sums for file: {path}") # instantiate algorithms algorithms = get_hashes(path=path, names=set(hexdigests), chunk_size=chunk_size) # Compare digests mismatches = [] for alg, expected_digest in hexdigests.items(): observed_digest = algorithms[alg].hexdigest() if observed_digest != expected_digest: logger.error(f"{alg} expected {expected_digest} but got {observed_digest}.") mismatches.append(HexDigestMismatch(alg, observed_digest, expected_digest)) else: logger.debug(f"Successfully checked with {alg}.") return mismatches def get_hashes( path: Union[str, Path], names: Iterable[str], *, chunk_size: Optional[int] = None, ) -> Mapping[str, Hash]: """Calculate several hexdigests of hash algorithms for a file concurrently. :param path: The file path. :param names: Names of the hash algorithms in :mod:`hashlib` :param chunk_size: The chunk size for reading the file. :return: A collection of observed hexdigests """ path = Path(path).resolve() if chunk_size is None: chunk_size = 64 * 2**10 # instantiate hash algorithms algorithms: Mapping[str, Hash] = {name: hashlib.new(name) for name in names} # calculate hash sums of file incrementally buffer = memoryview(bytearray(chunk_size)) with path.open("rb", buffering=0) as file: for this_chunk_size in iter(lambda: file.readinto(buffer), 0): for alg in algorithms.values(): alg.update(buffer[:this_chunk_size]) return algorithms def raise_on_digest_mismatch( *, path: Path, hexdigests: Optional[Mapping[str, str]] = None, hexdigests_remote: Optional[Mapping[str, str]] = None, hexdigests_strict: bool = False, ) -> None: """Raise a HexDigestError if the digests do not match. :param path: The file path. 
:param hexdigests: The expected hexdigests as (algorithm_name, expected_hex_digest) pairs. :param hexdigests_remote: The expected hexdigests as (algorithm_name, url to file with expected hexdigest) pairs. :param hexdigests_strict: Set this to false to stop automatically checking for the `algorithm(filename)=hash` format :raises HexDigestError: if there are any offending hex digests The expected hexdigests as (algorithm_name, url to file with expected hexdigest) pairs. """ offending_hexdigests = get_offending_hexdigests( path=path, hexdigests=hexdigests, hexdigests_remote=hexdigests_remote, hexdigests_strict=hexdigests_strict, ) if offending_hexdigests: raise HexDigestError(offending_hexdigests) class TqdmReportHook(tqdm): # type:ignore """A custom progress bar that can be used with urllib. Based on https://gist.github.com/leimao/37ff6e990b3226c2c9670a2cd1e4a6f5 """ def update_to( self, blocks: int = 1, block_size: int = 1, total_size: Optional[int] = None, ) -> None: """Update the internal state based on a urllib report hook. :param blocks: Number of blocks transferred so far :param block_size: Size of each block (in tqdm units) :param total_size: Total size (in tqdm units). If [default: None] remains unchanged. """ if total_size is not None: self.total = total_size self.update(blocks * block_size - self.n) # will also set self.n = b * bsize def download( url: str, path: Union[str, Path], force: bool = True, clean_on_failure: bool = True, backend: str = "urllib", hexdigests: Optional[Mapping[str, str]] = None, hexdigests_remote: Optional[Mapping[str, str]] = None, hexdigests_strict: bool = False, progress_bar: bool = True, tqdm_kwargs: Optional[Mapping[str, Any]] = None, **kwargs: Any, ) -> None: """Download a file from a given URL. :param url: URL to download :param path: Path to download the file to :param force: If false and the file already exists, will not re-download. 
:param clean_on_failure: If true, will delete the file on any exception raised during download :param backend: The downloader to use. Choose 'urllib' or 'requests' :param hexdigests: The expected hexdigests as (algorithm_name, expected_hex_digest) pairs. :param hexdigests_remote: The expected hexdigests as (algorithm_name, url to file with expected hexdigest) pairs. :param hexdigests_strict: Set this to false to stop automatically checking for the `algorithm(filename)=hash` format :param progress_bar: Set to true to show a progress bar while downloading :param tqdm_kwargs: Override the default arguments passed to :class:`tadm.tqdm` when progress_bar is True. :param kwargs: The keyword arguments to pass to :func:`urllib.request.urlretrieve` or to `requests.get` depending on the backend chosen. If using 'requests' backend, `stream` is set to True by default. :raises Exception: Thrown if an error besides a keyboard interrupt is thrown during download :raises KeyboardInterrupt: If a keyboard interrupt is thrown during download :raises UnexpectedDirectory: If a directory is given for the ``path`` argument :raises ValueError: If an invalid backend is chosen """ path = Path(path).resolve() if path.is_dir(): raise UnexpectedDirectory(path) if path.is_file() and not force: raise_on_digest_mismatch( path=path, hexdigests=hexdigests, hexdigests_remote=hexdigests_remote, hexdigests_strict=hexdigests_strict, ) logger.debug("did not re-download %s from %s", path, url) return _tqdm_kwargs = dict( unit="B", unit_scale=True, unit_divisor=1024, miniters=1, disable=not progress_bar, desc=f"Downloading {path.name}", leave=False, ) if tqdm_kwargs: _tqdm_kwargs.update(tqdm_kwargs) try: if backend == "urllib": logger.info("downloading with urllib from %s to %s", url, path) with TqdmReportHook(**_tqdm_kwargs) as t: urlretrieve(url, path, reporthook=t.update_to, **kwargs) # noqa:S310 elif backend == "requests": kwargs.setdefault("stream", True) # see 
https://requests.readthedocs.io/en/master/user/quickstart/#raw-response-content # pattern from https://stackoverflow.com/a/39217788/5775947 with requests.get(url, **kwargs) as response, path.open("wb") as file: # noqa:S113 logger.info( "downloading (stream=%s) with requests from %s to %s", kwargs["stream"], url, path, ) # Solution for progres bar from https://stackoverflow.com/a/63831344/5775947 total_size = int(response.headers.get("Content-Length", 0)) # Decompress if needed response.raw.read = partial(response.raw.read, decode_content=True) # type:ignore with tqdm.wrapattr(response.raw, "read", total=total_size, **_tqdm_kwargs) as fsrc: shutil.copyfileobj(fsrc, file) else: raise ValueError(f'Invalid backend: {backend}. Use "requests" or "urllib".') except (Exception, KeyboardInterrupt): if clean_on_failure: _unlink(path) raise raise_on_digest_mismatch( path=path, hexdigests=hexdigests, hexdigests_remote=hexdigests_remote, hexdigests_strict=hexdigests_strict, ) def name_from_url(url: str) -> str: """Get the filename from the end of the URL. :param url: A URL :return: The name of the file at the end of the URL """ parse_result = urlparse(url) path = PurePosixPath(parse_result.path) name = path.name return name def base_from_gzip_name(name: str) -> str: """Get the base name for a file after stripping the gz ending. :param name: The name of the gz file :returns: The cleaned name of the file, with no gz ending :raises ValueError: if the file does not end with ".gz" """ if not name.endswith(".gz"): raise ValueError(f"Name does not end with .gz: {name}") return name[: -len(".gz")] def name_from_s3_key(key: str) -> str: """Get the filename from the S3 key. :param key: A S3 path :returns: The name of the file """ return key.split("/")[-1] def mkdir(path: Path, ensure_exists: bool = True) -> None: """Make a directory (or parent directory if a file is given) if flagged with ``ensure_exists``. 
:param path: The path to a directory :param ensure_exists: Should the directories leading to the path be created if they don't already exist? """ if ensure_exists: path.mkdir(exist_ok=True, parents=True) @contextlib.contextmanager def mock_envvar(envvar: str, value: str) -> Iterator[None]: """Mock the environment variable then delete it after the test is over. :param envvar: The environment variable to mock :param value: The value to temporarily put in the environment variable during this mock. :yield: None, since this just mocks the environment variable for the time being. """ original_value = os.environ.get(envvar) os.environ[envvar] = value yield if original_value is None: del os.environ[envvar] else: os.environ[envvar] = original_value @contextlib.contextmanager def mock_home() -> Iterator[Path]: """Mock the PyStow home environment variable, yields the directory name. :yield: The path to the temporary directory. """ with tempfile.TemporaryDirectory() as directory: with mock_envvar(PYSTOW_HOME_ENVVAR, directory): yield Path(directory) def getenv_path(envvar: str, default: Path, ensure_exists: bool = True) -> Path: """Get an environment variable representing a path, or use the default. :param envvar: The environmental variable name to check :param default: The default path to return if the environmental variable is not set :param ensure_exists: Should the directories leading to the path be created if they don't already exist? :return: A path either specified by the environmental variable or by the default. """ rv = Path(os.getenv(envvar, default=default)).expanduser() mkdir(rv, ensure_exists=ensure_exists) return rv def n() -> str: """Get a random string for testing. :returns: A random string for testing purposes. """ return str(uuid4()) def get_df_io( df: "pandas.DataFrame", sep: str = "\t", index: bool = False, **kwargs: Any ) -> BytesIO: """Get the dataframe as bytes. :param df: A dataframe :param sep: The separator in the dataframe. 
Overrides Pandas default to use a tab. :param index: Should the index be output? Overrides the Pandas default to be false. :param kwargs: Additional kwargs to pass to :func:`pandas.DataFrame.to_csv`. :return: A bytes object that can be used as a file. """ sio = StringIO() df.to_csv(sio, sep=sep, index=index, **kwargs) sio.seek(0) bio = BytesIO(sio.read().encode("utf-8")) return bio def get_np_io(arr: "numpy.typing.ArrayLike", **kwargs: Any) -> BytesIO: """Get the numpy object as bytes. :param arr: Array-like :param kwargs: Additional kwargs to pass to :func:`numpy.save`. :return: A bytes object that can be used as a file. """ import numpy as np bio = BytesIO() np.save(bio, arr, **kwargs) bio.seek(0) return bio def write_pickle_gz( obj: Any, path: Union[str, Path], **kwargs: Any, ) -> None: """Write an object to a gzipped pickle. :param obj: The object to write :param path: The path of the file to write to :param kwargs: Additional kwargs to pass to :func:`pickle.dump` """ with gzip.open(path, mode="wb") as file: pickle.dump(obj, file, **kwargs) def write_lzma_csv( df: "pandas.DataFrame", path: Union[str, Path], sep: str = "\t", index: bool = False, **kwargs: Any, ) -> None: """Write a dataframe as an lzma-compressed file. :param df: A dataframe :type df: pandas.DataFrame :param path: The path to the resulting LZMA compressed dataframe file :param sep: The separator in the dataframe. Overrides Pandas default to use a tab. :param index: Should the index be output? Overrides the Pandas default to be false. :param kwargs: Additional kwargs to pass to :func:`get_df_io` and transitively to :func:`pandas.DataFrame.to_csv`. """ bytes_io = get_df_io(df, sep=sep, index=index, **kwargs) with lzma.open(path, "wb") as file: file.write(bytes_io.read()) def write_zipfile_csv( df: "pandas.DataFrame", path: Union[str, Path], inner_path: str, sep: str = "\t", index: bool = False, **kwargs: Any, ) -> None: """Write a dataframe to an inner CSV file to a zip archive. 
:param df: A dataframe :type df: pandas.DataFrame :param path: The path to the resulting zip archive :param inner_path: The path inside the zip archive to write the dataframe :param sep: The separator in the dataframe. Overrides Pandas default to use a tab. :param index: Should the index be output? Overrides the Pandas default to be false. :param kwargs: Additional kwargs to pass to :func:`get_df_io` and transitively to :func:`pandas.DataFrame.to_csv`. """ bytes_io = get_df_io(df, sep=sep, index=index, **kwargs) with zipfile.ZipFile(file=path, mode="w") as zip_file: with zip_file.open(inner_path, mode="w") as file: file.write(bytes_io.read()) def read_zipfile_csv( path: Union[str, Path], inner_path: str, sep: str = "\t", **kwargs: Any ) -> "pandas.DataFrame": """Read an inner CSV file from a zip archive. :param path: The path to the zip archive :param inner_path: The path inside the zip archive to the dataframe :param sep: The separator in the dataframe. Overrides Pandas default to use a tab. :param kwargs: Additional kwargs to pass to :func:`pandas.read_csv`. :return: A dataframe """ import pandas as pd with zipfile.ZipFile(file=path) as zip_file: with zip_file.open(inner_path) as file: return pd.read_csv(file, sep=sep, **kwargs) def write_zipfile_xml( element_tree: "lxml.etree.ElementTree", path: Union[str, Path], inner_path: str, **kwargs: Any, ) -> None: """Write an XML element tree to an inner XML file to a zip archive. 
:param element_tree: An XML element tree :param path: The path to the resulting zip archive :param inner_path: The path inside the zip archive to write the dataframe :param kwargs: Additional kwargs to pass to :func:`tostring` """ from lxml import etree kwargs.setdefault("pretty_print", True) with zipfile.ZipFile(file=path, mode="w") as zip_file: with zip_file.open(inner_path, mode="w") as file: file.write(etree.tostring(element_tree, **kwargs)) def read_zipfile_xml( path: Union[str, Path], inner_path: str, **kwargs: Any ) -> "lxml.etree.ElementTree": """Read an inner XML file from a zip archive. :param path: The path to the zip archive :param inner_path: The path inside the zip archive to the xml file :param kwargs: Additional kwargs to pass to :func:`lxml.etree.parse` :return: An XML element tree """ from lxml import etree with zipfile.ZipFile(file=path) as zip_file: with zip_file.open(inner_path) as file: return etree.parse(file, **kwargs) def write_zipfile_np( arr: "numpy.typing.ArrayLike", path: Union[str, Path], inner_path: str, **kwargs: Any, ) -> None: """Write a dataframe to an inner CSV file to a zip archive. :param arr: Array-like :param path: The path to the resulting zip archive :param inner_path: The path inside the zip archive to write the dataframe :param kwargs: Additional kwargs to pass to :func:`get_np_io` and transitively to :func:`numpy.save`. """ bytes_io = get_np_io(arr, **kwargs) with zipfile.ZipFile(file=path, mode="w") as zip_file: with zip_file.open(inner_path, mode="w") as file: file.write(bytes_io.read()) def read_zip_np(path: Union[str, Path], inner_path: str, **kwargs: Any) -> "numpy.typing.ArrayLike": """Read an inner numpy array-like from a zip archive. :param path: The path to the zip archive :param inner_path: The path inside the zip archive to the dataframe :param kwargs: Additional kwargs to pass to :func:`numpy.load`. 
# NOTE(review): this chunk begins inside ``read_zip_np``; the signature below is
# reconstructed from the visible tail (docstring end and body are complete here)
# — confirm against the part of the file above this chunk.
def read_zip_np(path: Union[str, Path], inner_path: str, **kwargs: Any):
    """Read an inner numpy file from a zip archive.

    :param path: The path to the zip archive
    :param inner_path: The path inside the zip archive to the numpy object
    :param kwargs: Additional kwargs to pass to :func:`numpy.load`.
    :return: A numpy array or other object
    """
    import numpy as np

    with zipfile.ZipFile(file=path) as zip_file:
        with zip_file.open(inner_path) as file:
            return np.load(file, **kwargs)


def read_zipfile_rdf(path: Union[str, Path], inner_path: str, **kwargs: Any) -> "rdflib.Graph":
    """Read an inner RDF file from a zip archive.

    :param path: The path to the zip archive
    :param inner_path: The path inside the zip archive to the RDF document
    :param kwargs: Additional kwargs to pass to :meth:`rdflib.Graph.parse`.
    :return: A parsed RDF graph
    """
    import rdflib

    graph = rdflib.Graph()
    with zipfile.ZipFile(file=path) as zip_file:
        with zip_file.open(inner_path) as file:
            # ``Graph.load`` was deprecated and removed in rdflib 6.x;
            # ``parse`` is the supported equivalent on all rdflib versions.
            graph.parse(file, **kwargs)
    return graph


def write_tarfile_csv(
    df: "pandas.DataFrame",
    path: Union[str, Path],
    inner_path: str,
    sep: str = "\t",
    index: bool = False,
    **kwargs: Any,
) -> None:
    """Write a dataframe to an inner CSV file from a tar archive.

    :param df: A dataframe
    :type df: pandas.DataFrame
    :param path: The path to the resulting tar archive
    :param inner_path: The path inside the tar archive to write the dataframe
    :param sep: The separator in the dataframe. Overrides Pandas default to use a tab.
    :param index: Should the index be output? Overrides the Pandas default to be false.
    :param kwargs: Additional kwargs to pass to :func:`pandas.DataFrame.to_csv`.
    """
    data = df.to_csv(sep=sep, index=index, **kwargs).encode("utf-8")
    tarinfo = tarfile.TarInfo(name=inner_path)
    # The member size must be the *byte* length of the encoded payload.
    # ``len()`` of the unencoded string under-counts for non-ASCII content
    # and would silently truncate the archived member.
    tarinfo.size = len(data)
    with tarfile.TarFile(path, mode="w") as tar_file:
        tar_file.addfile(tarinfo, BytesIO(data))


def read_tarfile_csv(
    path: Union[str, Path], inner_path: str, sep: str = "\t", **kwargs: Any
) -> "pandas.DataFrame":
    """Read an inner CSV file from a tar archive.

    :param path: The path to the tar archive
    :param inner_path: The path inside the tar archive to the dataframe
    :param sep: The separator in the dataframe. Overrides Pandas default to use a tab.
    :param kwargs: Additional kwargs to pass to :func:`pandas.read_csv`.
    :return: A dataframe
    """
    import pandas as pd

    with tarfile.open(path) as tar_file:
        with tar_file.extractfile(inner_path) as file:  # type: ignore
            return pd.read_csv(file, sep=sep, **kwargs)


def read_tarfile_xml(
    path: Union[str, Path], inner_path: str, **kwargs: Any
) -> "lxml.etree.ElementTree":
    """Read an inner XML file from a tar archive.

    :param path: The path to the tar archive
    :param inner_path: The path inside the tar archive to the xml file
    :param kwargs: Additional kwargs to pass to :func:`lxml.etree.parse`
    :return: An XML element tree
    :rtype: lxml.etree.ElementTree
    """
    from lxml import etree

    with tarfile.open(path) as tar_file:
        with tar_file.extractfile(inner_path) as file:  # type: ignore
            return etree.parse(file, **kwargs)


def read_rdf(path: Union[str, Path], **kwargs: Any) -> "rdflib.Graph":
    """Read an RDF file with :mod:`rdflib`.

    :param path: The path to the RDF file
    :param kwargs: Additional kwargs to pass to :func:`rdflib.Graph.parse`
    :return: A parsed RDF graph
    """
    import rdflib

    # Normalize up front; the ``isinstance`` re-check the original did after
    # this conversion was always true and has been dropped.
    path = Path(path)
    graph = rdflib.Graph()
    # Transparently decompress gzipped files before handing bytes to rdflib.
    with (gzip.open(path, "rb") if path.suffix == ".gz" else open(path)) as file:
        graph.parse(file, **kwargs)
    return graph


def write_sql(df: "pandas.DataFrame", name: str, path: Union[str, Path], **kwargs: Any) -> None:
    """Write a dataframe as a SQL table.

    :param df: A dataframe
    :type df: pandas.DataFrame
    :param name: The table the database to write to
    :param path: The path to the resulting SQLite database file
    :param kwargs: Additional keyword arguments to pass to :meth:`pandas.DataFrame.to_sql`
    """
    import sqlite3

    # ``contextlib.closing`` guarantees the connection is closed even when
    # ``to_sql`` raises.
    with contextlib.closing(sqlite3.connect(path)) as conn:
        df.to_sql(name, conn, **kwargs)


def get_commit(org: str, repo: str, provider: str = "git") -> str:
    """Get last commit hash for the given repo.

    :param org: The GitHub organization or owner
    :param repo: The GitHub repository name
    :param provider: The method for getting the most recent commit
        (``"git"`` uses ``git ls-remote``, ``"github"`` uses the GitHub API)
    :raises ValueError: if an invalid provider is given
    :returns: A commit hash's hex digest as a string
    """
    if provider == "git":
        # ``ls-remote`` output is "<hash>\t<ref>" per line; pick the HEAD ref.
        output = check_output(["git", "ls-remote", f"https://github.com/{org}/{repo}"])  # noqa
        lines = (line.strip().split("\t") for line in output.decode("utf8").splitlines())
        rv = next(line[0] for line in lines if line[1] == "HEAD")
    elif provider == "github":
        # NOTE(review): this assumes the default branch is named "master" —
        # repositories whose default branch is "main" will not resolve here.
        res = requests.get(f"https://api.github.com/repos/{org}/{repo}/branches/master", timeout=15)
        res_json = res.json()
        rv = res_json["commit"]["sha"]
    else:
        raise ValueError(f"invalid implementation: {provider}")
    return rv


#: Number of bytes streamed per chunk when downloading from Google Drive.
CHUNK_SIZE = 32768
#: Google Drive direct-download endpoint.
DOWNLOAD_URL = "https://docs.google.com/uc?export=download"
#: Cookie-key prefix that signals Google's virus-scan confirmation step.
TOKEN_KEY = "download_warning"  # noqa:S105


def download_from_google(
    file_id: str,
    path: Union[str, Path],
    force: bool = True,
    clean_on_failure: bool = True,
    hexdigests: Optional[Mapping[str, str]] = None,
) -> None:
    """Download a file from google drive.

    Implementation inspired by https://github.com/ndrplz/google-drive-downloader.

    :param file_id: The google file identifier
    :param path: The place to write the file
    :param force: If false and the file already exists, will not re-download.
    :param clean_on_failure: If true, will delete the file on any exception raised during download
    :param hexdigests: The expected hexdigests as (algorithm_name, expected_hex_digest) pairs.
    :raises Exception: Thrown if an error besides a keyboard interrupt is thrown during download
    :raises KeyboardInterrupt: If a keyboard interrupt is thrown during download
    :raises UnexpectedDirectory: If a directory is given for the ``path`` argument
    """
    path = Path(path).resolve()
    if path.is_dir():
        raise UnexpectedDirectory(path)
    if path.is_file() and not force:
        # Still validate the pre-existing file against the expected digests.
        raise_on_digest_mismatch(path=path, hexdigests=hexdigests)
        logger.debug("did not re-download %s from Google ID %s", path, file_id)
        return
    try:
        with requests.Session() as sess:
            # First request obtains the confirmation token, second one streams
            # the actual payload.
            res = sess.get(DOWNLOAD_URL, params={"id": file_id}, stream=True)
            token = _get_confirm_token(res)
            res = sess.get(DOWNLOAD_URL, params={"id": file_id, "confirm": token}, stream=True)
            with path.open("wb") as file:
                for chunk in tqdm(res.iter_content(CHUNK_SIZE), desc="writing", unit="chunk"):
                    if chunk:  # filter out keep-alive new chunks
                        file.write(chunk)
    except (Exception, KeyboardInterrupt):
        # Avoid leaving a partially-written file behind.
        if clean_on_failure:
            _unlink(path)
        raise
    raise_on_digest_mismatch(path=path, hexdigests=hexdigests)


def _get_confirm_token(res: "requests.Response") -> str:
    """Extract Google Drive's virus-scan confirmation token from the cookies.

    :param res: The response from the initial download request
    :return: The confirmation token value
    :raises ValueError: if no cookie with the expected key prefix is present
    """
    for key, value in res.cookies.items():
        if key.startswith(TOKEN_KEY):
            return value
    raise ValueError(f"no token found with key {TOKEN_KEY} in cookies: {res.cookies}")


def download_from_s3(
    s3_bucket: str,
    s3_key: str,
    path: Union[str, Path],
    client: Union[None, "botocore.client.BaseClient"] = None,
    client_kwargs: Optional[Mapping[str, Any]] = None,
    download_file_kwargs: Optional[Mapping[str, Any]] = None,
    force: bool = True,
    clean_on_failure: bool = True,
) -> None:
    """Download a file from S3.

    :param s3_bucket: The key inside the S3 bucket name
    :param s3_key: The key inside the S3 bucket
    :param path: The place to write the file
    :param client: A botocore client. If none given, one will be created automatically
    :type client: Optional[botocore.client.BaseClient]
    :param client_kwargs: Keyword arguments to be passed to the client on instantiation.
    :param download_file_kwargs: Keyword arguments to be passed to
        :func:`boto3.s3.transfer.S3Transfer.download_file`
    :param force: If false and the file already exists, will not re-download.
    :param clean_on_failure: If true, will delete the file on any exception raised during download
    :raises Exception: Thrown if an error besides a keyboard interrupt is thrown during download
    :raises KeyboardInterrupt: If a keyboard interrupt is thrown during download
    :raises UnexpectedDirectory: If a directory is given for the ``path`` argument
    """
    path = Path(path).resolve()
    if path.is_dir():
        raise UnexpectedDirectory(path)
    if path.is_file() and not force:
        logger.debug("did not re-download %s from %s %s", path, s3_bucket, s3_key)
        return
    try:
        import boto3.s3.transfer

        if client is None:
            import boto3
            import botocore.client

            # Default to anonymous (unsigned) access so public buckets work
            # without credentials.
            client_kwargs = {} if client_kwargs is None else dict(client_kwargs)
            client_kwargs.setdefault(
                "config", botocore.client.Config(signature_version=botocore.UNSIGNED)
            )
            client = boto3.client("s3", **client_kwargs)
        download_file_kwargs = {} if download_file_kwargs is None else dict(download_file_kwargs)
        download_file_kwargs.setdefault(
            "Config", boto3.s3.transfer.TransferConfig(use_threads=False)
        )
        client.download_file(s3_bucket, s3_key, path.as_posix(), **download_file_kwargs)
    except (Exception, KeyboardInterrupt):
        if clean_on_failure:
            _unlink(path)
        raise


def _unlink(path: Union[str, Path]) -> None:
    """Remove a file, silently ignoring the case where it does not exist."""
    # ``contextlib.suppress`` replaces the original try/except-pass idiom;
    # if the file can't be deleted then no problem.
    with contextlib.suppress(OSError):
        os.remove(path)


def get_name() -> str:
    """Get the PyStow home directory name.

    :returns: The name of the pystow home directory, either loaded from
        the :data:`PYSTOW_NAME_ENVVAR`` environment variable or given by the default
        value :data:`PYSTOW_NAME_DEFAULT`.
    """
    return os.getenv(PYSTOW_NAME_ENVVAR, default=PYSTOW_NAME_DEFAULT)


def use_appdirs() -> bool:
    """Check if X Desktop Group (XDG) compatibility is requested.

    :returns: If the :data:`PYSTOW_USE_APPDIRS` is set to ``true`` in the environment.
    """
    # Intentionally only accepts "true"/"True" (not "TRUE" etc.) to preserve
    # the documented behavior.
    return os.getenv(PYSTOW_USE_APPDIRS) in {"true", "True"}


def get_home(ensure_exists: bool = True) -> Path:
    """Get the PyStow home directory.

    :param ensure_exists: If true, ensures the directory is created
    :returns: A path object representing the pystow home directory, as one of:

        1. :data:`PYSTOW_HOME_ENVVAR` environment variable or
        2. The user data directory defined by :mod:`appdirs` if the
           :data:`PYSTOW_USE_APPDIRS` environment variable is set to ``true`` or
        3. The default directory constructed in the user's home directory plus what's
           returned by :func:`get_name`.
    """
    if use_appdirs():
        from appdirs import user_data_dir

        default = Path(user_data_dir())
    else:
        default = Path.home() / get_name()
    return getenv_path(PYSTOW_HOME_ENVVAR, default, ensure_exists=ensure_exists)


def get_base(key: str, ensure_exists: bool = True) -> Path:
    """Get the base directory for a module.

    :param key: The name of the module. No funny characters. The envvar <key>_HOME
        where key is uppercased is checked first before using the default home directory.
    :param ensure_exists: Should all directories be created automatically? Defaults to true.
    :returns: The path to the given module's base directory
    :raises ValueError: if the key is invalid (e.g., has a dot in it)
    """
    if "." in key:
        raise ValueError(f"The module should not have a dot in it: {key}")
    envvar = f"{key.upper()}_HOME"
    if use_appdirs():
        from appdirs import user_data_dir

        default = Path(user_data_dir(appname=key))
    else:
        default = get_home(ensure_exists=False) / key
    return getenv_path(envvar, default, ensure_exists=ensure_exists)


def ensure_readme() -> None:
    """Ensure there's a README in the PyStow data directory.

    :raises PermissionError: If the script calling this function does not have
        adequate permissions to write a file into the PyStow home directory.
    """
    try:
        readme_path = get_home(ensure_exists=True).joinpath("README.md")
    except PermissionError as e:
        # ``readme_path`` is unbound when ``get_home`` raises, so recompute the
        # intended location without touching the filesystem for the message.
        # (The original referenced ``readme_path.parent`` here, which raised an
        # UnboundLocalError that masked this explanation.)
        home = get_home(ensure_exists=False)
        raise PermissionError(
            f"PyStow was not able to create its home directory in {home} due to a lack of "
            "permissions. This can happen, e.g., if you're working on a server where you don't have full "
            "rights. See https://pystow.readthedocs.io/en/latest/installation.html#configuration for instructions "
            "on choosing a different home folder location for PyStow to somewhere where you have write permissions."
        ) from e
    if readme_path.is_file():
        return
    with readme_path.open("w", encoding="utf8") as file:
        print(README_TEXT, file=file)  # noqa:T001,T201


def path_to_sqlite(path: Union[str, Path]) -> str:
    """Convert a path to a SQLite connection string.

    :param path: A path to a SQLite database file
    :returns: A standard connection string to the database
    """
    path = Path(path).expanduser().resolve()
    return f"sqlite:///{path.as_posix()}"


def gunzip(source: Union[str, Path], target: Union[str, Path]) -> None:
    """Unzip a file in the source to the target.

    :param source: The path to an input file
    :param target: The path to an output file
    """
    # Stream the decompression so arbitrarily large files don't load into memory.
    with gzip.open(source, "rb") as in_file, open(target, "wb") as out_file:
        shutil.copyfileobj(in_file, out_file)


# --- tar archive member boundaries (non-code residue in the original dump) ---
# pystow-0.5.5/src/pystow/version.py: defines __all__ = ["VERSION"]; VERSION = "0.5.5"
# pystow-0.5.5/tests/resources/*: small fixture files (test.txt, *.md5, test_1.{csv,json,pkl,tsv})
# pystow-0.5.5/tests/test_api.py begins at the end of this chunk.
import inspect
import unittest

import pandas as pd
import rdflib
from lxml import etree

import pystow
from pystow import Module

# Methods on Module that are deliberately not mirrored in the top-level API.
SKIP = {"__init__"}


def _df_equal(a: pd.DataFrame, b: pd.DataFrame, msg=None) -> bool:
    # Cell-by-cell comparison so unittest can register this as a type equality function.
    return a.values.tolist() == b.values.tolist()


def _rdf_equal(a: rdflib.Graph, b: rdflib.Graph, msg=None) -> bool:
    # Two graphs are equal when they contain the same set of triples.
    return {tuple(t) for t in a} == {tuple(t) for t in b}


def _etree_equal(a: etree.ElementTree, b: etree.ElementTree, msg=None) -> bool:
    # Compare XML trees by their serialized bytes.
    return etree.tostring(a) == etree.tostring(b)


class TestExposed(unittest.TestCase):
    """Test API exposure."""

    def setUp(self) -> None:
        """Set up the test case."""
        # Register custom equality functions so assertEqual works on these types.
        self.addTypeEqualityFunc(pd.DataFrame, _df_equal)
        self.addTypeEqualityFunc(rdflib.Graph, _rdf_equal)
        self.addTypeEqualityFunc(type(etree.ElementTree()), _etree_equal)

    def assert_io(self, obj, ext: str, dump, load):
        """Test an object can be dumped and loaded.

        :param obj: The object to dump
        :param ext: The extension to use
        :param dump: The dump function
        :param load: The load function
        """
        name = f"test.{ext}"
        path = pystow.join("test", name=name)
        # Start from a clean slate so the dump is actually exercised.
        if path.is_file():
            path.unlink()
        self.assertFalse(path.is_file())

        dump("test", name=name, obj=obj)
        self.assertTrue(path.is_file())
        self.assertEqual(obj, load("test", name=name))

    def test_exposed(self):
        """Test that all module-level functions also have a counterpart in the top-level API."""
        for name, func in Module.__dict__.items():
            if not inspect.isfunction(func) or name in SKIP:
                continue
            with self.subTest(name=name):
                self.assertIn(
                    name,
                    pystow.api.__all__,
                    msg=f"Module.{name} should be included in from `pystow.api.__all__`.",
                )
                self.assertTrue(
                    hasattr(pystow.api, name),
                    msg=f"`Module.{name} should be exposed as a top-level function in `pystow.api`.",
                )
                self.assertTrue(
                    hasattr(pystow, name),
                    msg=f"`pystow.api.{name}` should be imported in `pystow.__init__`.",
                )

    def test_io(self):
        """Test IO functions."""
        obj = ["a", "b", "c"]
        for ext, dump, load in [
            ("json", pystow.dump_json, pystow.load_json),
            ("pkl", pystow.dump_pickle, pystow.load_pickle),
        ]:
            with self.subTest(ext=ext):
                self.assert_io(obj, ext=ext, dump=dump, load=load)

    def test_pd_io(self):
        """Test pandas IO."""
        columns = list("abc")
        data = [(1, 2, 3), (4, 5, 6)]
        df = pd.DataFrame(data, columns=columns)
        self.assert_io(df, ext="tsv", load=pystow.load_df, dump=pystow.dump_df)

    def test_rdf_io(self):
        """Test RDFlib IO."""
        graph = rdflib.Graph()
        graph.add(
            (
                rdflib.URIRef("http://example.com/subject"),
                rdflib.URIRef("http://example.com/predicate"),
                rdflib.URIRef("http://example.com/object"),
            )
        )
        self.assertEqual(1, len(graph))
        self.assert_io(graph, ext="ttl", dump=pystow.dump_rdf, load=pystow.load_rdf)

    def test_xml_io(self):
        """Test XML I/O."""
        root = etree.Element("root")
        root.set("interesting", "somewhat")
        etree.SubElement(root, "test")
        my_tree = etree.ElementTree(root)
        self.assert_io(my_tree, ext="xml", dump=pystow.dump_xml, load=pystow.load_xml)


# --- tar archive member boundary: pystow-0.5.5/tests/test_caching.py ---
# -*- coding: utf-8 -*-

"""Tests for caching."""

import os
import tempfile
import unittest
from pathlib import Path

from pystow.cache import CachedPickle

# Sentinel values returned by the cached test functions below.
EXPECTED = 5
EXPECTED_2 = 6


class TestCache(unittest.TestCase):
    """Tests for caches."""

    def setUp(self) -> None:
        """Set up the test case with a temporary directory."""
        self.tmpdir = tempfile.TemporaryDirectory()
        self.directory = Path(self.tmpdir.name)

    def tearDown(self) -> None:
        """Tear down the test case's temporary directory."""
        self.tmpdir.cleanup()

    def test_cache_exception(self):
        """Test that exceptions aren't swallowed."""
        path = self.directory.joinpath("test.pkl")
        self.assertFalse(path.is_file())

        @CachedPickle(path=path)
        def _f1():
            raise NotImplementedError

        self.assertFalse(path.is_file(), msg="function has not been called")
        with self.assertRaises(NotImplementedError):
            _f1()
        self.assertFalse(
            path.is_file(),
            msg="file should not have been created if an exception was thrown by the function",
        )

    # NOTE(review): ``test_cache_pickle`` continues in the next chunk of this dump.
    def test_cache_pickle(self):
        """Test caching a pickle."""
        path = self.directory.joinpath("test.pkl")
        self.assertFalse(
            path.is_file(),
            msg="the file should not exist at the beginning of the test",
        )

        # The closed-over flag lets us toggle whether the wrapped function raises,
        # which reveals whether the cache or the function served the result.
        raise_flag = True

        @CachedPickle(path=path)
        def _f1():
            if raise_flag:
                raise ValueError
            return EXPECTED

        self.assertFalse(path.is_file(), msg="the file should not exist until function is called")
        with self.assertRaises(ValueError):
            _f1()
        self.assertFalse(
            path.is_file(),
            msg="the function should throw an exception because of the flag, and no file should be created",
        )

        raise_flag = False
        actual = _f1()
        self.assertEqual(EXPECTED, actual)
        self.assertTrue(path.is_file(), msg="a file should have been created")

        raise_flag = True
        actual_2 = _f1()  # if raises, the caching mechanism didn't work
        self.assertEqual(EXPECTED, actual_2)
        self.assertTrue(path.is_file())

        # Removing the cache file must route the next call through the function again.
        os.unlink(path)
        self.assertFalse(path.is_file())
        with self.assertRaises(ValueError):
            _f1()

        @CachedPickle(path=path, force=True)
        def _f2():
            return EXPECTED_2

        self.assertEqual(EXPECTED_2, _f2())  # overwrites the file
        self.assertEqual(EXPECTED_2, _f1())


# --- tar archive member boundary: pystow-0.5.5/tests/test_config.py ---
# -*- coding: utf-8 -*-

"""Test configuration loading."""

import tempfile
import unittest
from pathlib import Path

import pystow
from pystow.config_api import CONFIG_HOME_ENVVAR, _get_cfp
from pystow.utils import mock_envvar


class TestConfig(unittest.TestCase):
    """Test configuration."""

    @classmethod
    def setUpClass(cls) -> None:
        """Set up the class for testing."""
        cls.test_section = "test"
        cls.test_option = "option"
        cls.test_value = "value"
        # Seed the shared config parser with one known section/option pair.
        cls.cfp = _get_cfp(cls.test_section)
        cls.cfp.add_section(cls.test_section)
        cls.cfp.set(
            section=cls.test_section,
            option=cls.test_option,
            value=cls.test_value,
        )

    def test_env_cast(self):
        """Test casting works properly when getting from the environment."""
        with mock_envvar("TEST_VAR", "1234"):
            self.assertEqual("1234", pystow.get_config("test", "var"))
            self.assertEqual("1234", pystow.get_config("test", "var", dtype=str))
            self.assertEqual(1234, pystow.get_config("test", "var", dtype=int))
            # "1234" is not a recognized boolean token.
            with self.assertRaises(ValueError):
                pystow.get_config("test", "var", dtype=bool)
            # Arbitrary dtypes are rejected outright.
            with self.assertRaises(TypeError):
                pystow.get_config("test", "var", dtype=object)

    def test_get_config(self):
        """Test lookup not existing."""
        self.assertIsNone(pystow.get_config(self.test_section, "key"))
        self.assertEqual("1234", pystow.get_config(self.test_section, "key", default="1234"))
        # ``passthrough`` short-circuits the configured value.
        value = "not_value"
        self.assertEqual(
            value, pystow.get_config(self.test_section, self.test_option, passthrough=value)
        )
        self.assertEqual(1, pystow.get_config(self.test_section, self.test_option, passthrough=1))
        self.assertEqual(
            1, pystow.get_config(self.test_section, self.test_option, passthrough="1", dtype=int)
        )
        self.assertEqual(
            True,
            pystow.get_config(self.test_section, self.test_option, passthrough="1", dtype=bool),
        )
        self.assertEqual(
            True,
            pystow.get_config(self.test_section, self.test_option, passthrough="yes", dtype=bool),
        )
        self.assertEqual(
            True,
            pystow.get_config(self.test_section, self.test_option, passthrough="Yes", dtype=bool),
        )
        self.assertEqual(
            True,
            pystow.get_config(self.test_section, self.test_option, passthrough="YES", dtype=bool),
        )
        self.assertEqual(
            True,
            pystow.get_config(self.test_section, self.test_option, passthrough="True", dtype=bool),
        )
        self.assertEqual(
            True,
            pystow.get_config(self.test_section, self.test_option, passthrough="TRUE", dtype=bool),
        )
        self.assertEqual(
            True,
            pystow.get_config(self.test_section, self.test_option, passthrough="T", dtype=bool),
        )
        self.assertEqual(
            True,
            pystow.get_config(self.test_section, self.test_option, passthrough="t", dtype=bool),
        )
        self.assertEqual(
            True,
            pystow.get_config(self.test_section, self.test_option, passthrough=True, dtype=bool),
        )
        self.assertEqual(
            True, pystow.get_config(self.test_section, self.test_option, passthrough=1, dtype=bool)
        )

    def test_subsection(self):
        """Test subsections."""
        with tempfile.TemporaryDirectory() as directory, mock_envvar(CONFIG_HOME_ENVVAR, directory):
            directory = Path(directory)
            # "test:subtest" sections are persisted in a file named after the prefix.
            path = directory.joinpath("test.ini")
            self.assertFalse(path.is_file(), msg="file should not already exist")

            self.assertIsNone(pystow.get_config("test:subtest", "key"))
            self.assertFalse(path.is_file(), msg="getting config should not create a file")

            pystow.write_config("test:subtest", "key", "value")
            self.assertTrue(path.is_file(), msg=f"{list(directory.iterdir())}")
            self.assertEqual("value", pystow.get_config("test:subtest", "key"))


# --- tar archive member boundary: pystow-0.5.5/tests/test_module.py ---
# -*- coding: utf-8 -*-

"""Tests for PyStow."""

import bz2
import contextlib
import itertools as itt
import json
import lzma
import os
import pickle
import shutil
import tempfile
import unittest
from pathlib import Path
from typing import ContextManager, Mapping, Union
from unittest import mock

import pandas as pd

import pystow
from pystow import join
from pystow.constants import PYSTOW_HOME_ENVVAR, PYSTOW_NAME_ENVVAR
from pystow.impl import Module
from pystow.utils import (
    get_home,
    get_name,
    mock_envvar,
    n,
    write_pickle_gz,
    write_sql,
    write_tarfile_csv,
    write_zipfile_csv,
)

HERE = Path(__file__).parent.resolve()
RESOURCES = HERE.joinpath("resources")

# Fixture names/URLs; ``n()`` generates a random URL-safe prefix so the
# "remote" URLs are unique per test run.
TSV_NAME = "test_1.tsv"
TSV_URL = f"{n()}/{TSV_NAME}"

SQLITE_NAME = "test_1.db"
SQLITE_URL = f"{n()}/{SQLITE_NAME}"
SQLITE_PATH = RESOURCES / SQLITE_NAME
SQLITE_TABLE = "testtable"

JSON_NAME = "test_1.json"
JSON_URL = f"{n()}/{JSON_NAME}"
JSON_PATH = RESOURCES / JSON_NAME

PICKLE_NAME = "test_1.pkl"
PICKLE_URL = f"{n()}/{PICKLE_NAME}"
PICKLE_PATH = RESOURCES / PICKLE_NAME

PICKLE_GZ_NAME = "test_1.pkl.gz"
PICKLE_GZ_URL = f"{n()}/{PICKLE_GZ_NAME}"
PICKLE_GZ_PATH = RESOURCES / PICKLE_GZ_NAME

JSON_BZ2_NAME = "test_1.json.bz2"
JSON_BZ2_URL = f"{n()}/{JSON_BZ2_NAME}"
JSON_BZ2_PATH = RESOURCES / JSON_BZ2_NAME

# NOTE(review): the ``MOCK_FILES`` mapping continues in the next chunk of this dump.
# Maps the mock "remote" URLs onto the local fixture files they stand in for.
MOCK_FILES: Mapping[str, Path] = {
    TSV_URL: RESOURCES / TSV_NAME,
    JSON_URL: JSON_PATH,
    JSON_BZ2_URL: JSON_BZ2_PATH,
    PICKLE_URL: PICKLE_PATH,
    PICKLE_GZ_URL: PICKLE_GZ_PATH,
    SQLITE_URL: SQLITE_PATH,
}

TEST_TSV_ROWS = [
    ("h1", "h2", "h3"),
    ("v1_1", "v1_2", "v1_3"),
    ("v2_1", "v2_2", "v2_3"),
]
TEST_DF = pd.DataFrame(TEST_TSV_ROWS)
TEST_JSON = {"key": "value"}

# Make the pickle file (and the other fixtures) lazily, only if missing.
if not PICKLE_PATH.is_file():
    PICKLE_PATH.write_bytes(pickle.dumps(TEST_TSV_ROWS))
if not SQLITE_PATH.is_file():
    write_sql(TEST_DF, name=SQLITE_TABLE, path=SQLITE_PATH, index=False)
if not JSON_PATH.is_file():
    JSON_PATH.write_text(json.dumps(TEST_JSON))
if not JSON_BZ2_PATH.is_file():
    with bz2.open(JSON_BZ2_PATH, mode="wt") as file:
        json.dump(TEST_JSON, file, indent=2)


class TestMocks(unittest.TestCase):
    """Tests for :mod:`pystow` mocks and context managers."""

    def test_mock_home(self):
        """Test that home can be properly mocked."""
        name = n()
        with tempfile.TemporaryDirectory() as d:
            expected_path = Path(d) / name
            self.assertFalse(expected_path.exists())
            with mock_envvar(PYSTOW_HOME_ENVVAR, expected_path.as_posix()):
                # ensure_exists=False must not create the directory as a side effect.
                self.assertFalse(expected_path.exists())
                self.assertEqual(expected_path, get_home(ensure_exists=False))
                self.assertFalse(expected_path.exists())

    def test_mock_name(self):
        """Test that the name can be properly mocked."""
        name = n()
        expected_path = Path.home() / name
        self.assertFalse(expected_path.exists())
        with mock_envvar(PYSTOW_NAME_ENVVAR, name):
            self.assertEqual(name, get_name())
            self.assertFalse(expected_path.exists())
            self.assertEqual(expected_path, get_home(ensure_exists=False))
            self.assertFalse(expected_path.exists())


class TestGet(unittest.TestCase):
    """Tests for :mod:`pystow`."""

    def setUp(self) -> None:
        """Set up the test case."""
        self.directory = tempfile.TemporaryDirectory()

    def tearDown(self) -> None:
        """Tear down the test case."""
        self.directory.cleanup()

    @contextlib.contextmanager
    def mock_directory(self) -> ContextManager[Path]:
        """Use this test case's temporary directory as a mock environment variable.

        :yield: The mock directory's path
        """
        with mock_envvar(PYSTOW_HOME_ENVVAR, self.directory.name):
            yield Path(self.directory.name)

    @staticmethod
    def mock_download():
        """Mock connection to the internet using local resource files.

        :return: A patch object that can be applied to the pystow download function
        """

        def _mock_get_data(url: str, path: Union[str, Path], **_kwargs) -> Path:
            # Serve the fixture registered for this URL instead of hitting the network.
            return shutil.copy(MOCK_FILES[url], path)

        return mock.patch("pystow.utils.download", side_effect=_mock_get_data)

    @staticmethod
    def mock_download_once(local_path: Union[str, Path]):
        """Mock connection to the internet using local resource files.

        :param local_path: the path to the file to mock
        :return: A patch object that can be applied to the pystow download function
        """

        def _mock_get_data(path: Union[str, Path], **_kwargs) -> Path:
            return shutil.copy(local_path, path)

        return mock.patch("pystow.utils.download", side_effect=_mock_get_data)

    def join(self, *parts: str) -> Path:
        """Help join the parts to this test case's temporary directory.

        :param parts: The file path parts that are joined with this test case's directory
        :return: A path to the file
        """
        return Path(os.path.join(self.directory.name, *parts))

    def test_mock(self):
        """Test that mocking the directory works properly for this test case."""
        with self.mock_directory():
            self.assertEqual(os.getenv(PYSTOW_HOME_ENVVAR), self.directory.name)

    def test_get(self):
        """Test the :func:`get` function."""
        parts_examples = [
            [n()],
            [n(), n()],
            [n(), n(), n()],
        ]
        with self.mock_directory():
            for parts in parts_examples:
                with self.subTest(parts=parts):
                    self.assertEqual(self.join(*parts), join(*parts))

    def test_ensure(self):
        """Test ensuring various files."""
        write_pickle_gz(TEST_TSV_ROWS, path=PICKLE_GZ_PATH)
        with self.mock_directory(), self.mock_download():
            with self.subTest(type="tsv"):
                df = pystow.ensure_csv("test", url=TSV_URL)
                self.assertEqual(3, len(df.columns))
                df2 = pystow.load_df("test", name=TSV_NAME)
                self.assertEqual(df.values.tolist(), df2.values.tolist())

            with self.subTest(type="json"):
                j = pystow.ensure_json("test", url=JSON_URL)
                self.assertEqual(TEST_JSON, j)
                j2 = pystow.load_json("test", name=JSON_NAME)
                self.assertEqual(j, j2)

            with self.subTest(type="pickle"):
                p = pystow.ensure_pickle("test", url=PICKLE_URL)
                self.assertEqual(3, len(p))
                p2 = pystow.load_pickle("test", name=PICKLE_NAME)
                self.assertEqual(p, p2)

            with self.subTest(type="pickle_gz"):
                p = pystow.ensure_pickle_gz("test", url=PICKLE_GZ_URL)
                self.assertEqual(3, len(p))
                p2 = pystow.load_pickle_gz("test", name=PICKLE_GZ_NAME)
                self.assertEqual(p, p2)

            with self.subTest(type="json_bz2"):
                p = pystow.ensure_json_bz2("test", url=JSON_BZ2_URL)
                self.assertEqual(TEST_JSON, p)

    def test_open_fail(self):
        """Test opening a missing file."""
        with self.assertRaises(FileNotFoundError):
            with pystow.open("nope", name="nope"):
                pass

        with self.assertRaises(FileNotFoundError):
            pystow.load_json("nope", name="nope")

    # NOTE(review): ``test_ensure_open_lzma`` continues in the next chunk of this dump.
    # NOTE(review): the ``def`` line and docstring of this method are split across
    # chunk boundaries in the dump; they are reproduced here from the visible text.
    def test_ensure_open_lzma(self):
        """Test opening lzma-encoded files."""
        with tempfile.TemporaryDirectory() as directory, self.mock_directory():
            path = Path(directory) / n()
            with self.mock_download_once(path):
                # Write the fixture in LZMA text mode, then ensure pystow can stream it back.
                with lzma.open(path, "wt") as file:
                    for row in TEST_TSV_ROWS:
                        print(*row, sep="\t", file=file)  # noqa:T001,T201
                with pystow.ensure_open_lzma("test", url=n()) as file:
                    df = pd.read_csv(file, sep="\t")
                    self.assertEqual(3, len(df.columns))

    def test_ensure_open_zip(self):
        """Test opening tar-encoded files."""
        with tempfile.TemporaryDirectory() as directory, self.mock_directory():
            path = Path(directory) / n()
            inner_path = n()
            with self.mock_download_once(path):
                write_zipfile_csv(TEST_DF, path, inner_path)
                with pystow.ensure_open_zip("test", url=n(), inner_path=inner_path) as file:
                    df = pd.read_csv(file, sep="\t")
                    self.assertEqual(3, len(df.columns))

    def test_ensure_open_tarfile(self):
        """Test opening tarfile-encoded files."""
        with tempfile.TemporaryDirectory() as directory, self.mock_directory():
            path = Path(directory) / n()
            inner_path = n()
            with self.mock_download_once(path):
                write_tarfile_csv(TEST_DF, path, inner_path)
                with pystow.ensure_open_tarfile("test", url=n(), inner_path=inner_path) as file:
                    df = pd.read_csv(file, sep="\t")
                    self.assertEqual(3, len(df.columns))

    def test_ensure_module(self):
        """Test that the ``ensure_exist`` argument in :meth:`Module.from_key` works properly."""
        parts_examples = [
            [n()],
            [n(), n()],
            [n(), n(), n()],
        ]
        ensure_examples = [False, True]

        # Exercise every combination of nesting depth and the ensure_exists flag.
        for ensure_exists, parts in itt.product(ensure_examples, parts_examples):
            with self.subTest(ensure_exists=ensure_exists, parts=parts), self.mock_directory():
                expected_directory = self.join(*parts)
                module = Module.from_key(*parts, ensure_exists=ensure_exists)
                self.assertEqual(expected_directory, module.base)
                self.assertIs(
                    expected_directory.exists(),
                    ensure_exists,
                    msg=f'{expected_directory} should{"" if ensure_exists else " not"} exist.',
                )

    def test_ensure_custom(self):
        """Test ensure with custom provider."""
        with self.mock_directory():
            # create a minimal provider
            def touch_file(path: Path, **_kwargs):
                """
                Create a file.

                :param path: the file path
                :param _kwargs: ignored keywords
                """
                path.touch()

            # wrap to record calls
            provider = mock.Mock(wraps=touch_file)
            # the keyword-based parameters for the provider
            kwargs = {"a": 4, "c": {0: 1, 5: 7}}
            # call first time
            name = n()
            path = pystow.ensure_custom("test", name=name, provider=provider, **kwargs)
            self.assertTrue(path.is_file())
            # call a second time
            path = pystow.ensure_custom("test", name=name, provider=provider, **kwargs)
            # ensure that the provider was only called once with the given parameters
            provider.assert_called_once_with(path, **kwargs)

    def test_ensure_open_sqlite(self):
        """Test caching SQLite."""
        with self.mock_directory(), self.mock_download():
            with pystow.ensure_open_sqlite("test", url=SQLITE_URL) as conn:
                df = pd.read_sql(f"SELECT * from {SQLITE_TABLE}", conn)  # noqa:S608
                self.assertEqual(3, len(df.columns))


# --- tar archive member boundary: pystow-0.5.5/tests/test_utils.py ---
# -*- coding: utf-8 -*-

"""Tests for utilities."""

import hashlib
import os
import tempfile
import unittest
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from lxml import etree
from requests_file import FileAdapter

from pystow.utils import (
    HexDigestError,
    download,
    get_hexdigests_remote,
    getenv_path,
    mkdir,
    mock_envvar,
    n,
    name_from_url,
    read_tarfile_csv,
    read_zip_np,
    read_zipfile_csv,
    read_zipfile_xml,
    write_tarfile_csv,
    write_zipfile_csv,
    write_zipfile_np,
    write_zipfile_xml,
)

HERE = Path(__file__).resolve().parent
# Local fixture files used to simulate remote downloads via file:// URLs.
TEST_TXT = HERE.joinpath("resources", "test.txt")
TEST_TXT_MD5 = HERE.joinpath("resources", "test.txt.md5")
TEST_TXT_VERBOSE_MD5 = HERE.joinpath("resources", "test_verbose.txt.md5")
TEST_TXT_WRONG_MD5 = HERE.joinpath("resources", "test_wrong.txt.md5")

skip_on_windows = unittest.skipIf(
    os.name == "nt",
    reason="Funny stuff happens in requests with a file adapter on windows that adds line breaks",
)

# NOTE(review): ``class _Session`` continues in the next chunk of this dump.
class _Session(requests.sessions.Session):
    """A mock session."""

    def __init__(self):
        """Instantiate the patched session with an additional file adapter."""
        super().__init__()
        self.mount("file://", FileAdapter())


# Globally patch requests so file:// URLs resolve to local files in these tests.
requests.sessions.Session = _Session


class TestUtils(unittest.TestCase):
    """Test utility functions."""

    def test_name_from_url(self):
        """Test :func:`name_from_url`."""
        data = [
            ("test.tsv", "https://example.com/test.tsv"),
            ("test.tsv", "https://example.com/deeper/test.tsv"),
            ("test.tsv.gz", "https://example.com/deeper/test.tsv.gz"),
        ]
        for name, url in data:
            with self.subTest(name=name, url=url):
                self.assertEqual(name, name_from_url(url))

    @skip_on_windows
    def test_file_values(self):
        """Test encodings."""
        for url, value in [
            (TEST_TXT, "this is a test file\n"),
            (TEST_TXT_MD5, "4221d002ceb5d3c9e9137e495ceaa647"),
            (TEST_TXT_VERBOSE_MD5, "MD5(text.txt)=4221d002ceb5d3c9e9137e495ceaa647"),
            (TEST_TXT_WRONG_MD5, "yolo"),
        ]:
            with self.subTest(name=url.name):
                self.assertEqual(value, requests.get(url.as_uri(), timeout=15).text)

    def test_mkdir(self):
        """Test for ensuring a directory."""
        with tempfile.TemporaryDirectory() as directory:
            directory = Path(directory)
            subdirectory = directory / "sd1"
            self.assertFalse(subdirectory.exists())

            # ensure_exists=False is a no-op; only True actually creates it.
            mkdir(subdirectory, ensure_exists=False)
            self.assertFalse(subdirectory.exists())

            mkdir(subdirectory, ensure_exists=True)
            self.assertTrue(subdirectory.exists())

    def test_mock_envvar(self):
        """Test that environment variables can be mocked properly."""
        name, value = n(), n()

        self.assertNotIn(name, os.environ)
        with mock_envvar(name, value):
            self.assertIn(name, os.environ)
            self.assertEqual(value, os.getenv(name))
        self.assertNotIn(name, os.environ)

    def test_getenv_path(self):
        """Test that :func:`getenv_path` works properly."""
        envvar = n()

        with tempfile.TemporaryDirectory() as directory:
            directory = Path(directory)
            value = directory / n()
            default = directory / n()

            self.assertEqual(default, getenv_path(envvar, default))
            with mock_envvar(envvar, value.as_posix()):
                self.assertEqual(value, getenv_path(envvar, default))
            # Check that it goes back
            self.assertEqual(default, getenv_path(envvar, default))

    def test_compressed_io(self):
        """Test that the read/write to compressed folder functions work."""
        rows = [[1, 2], [3, 4], [5, 6]]
        columns = ["A", "B"]
        df = pd.DataFrame(rows, columns=columns)
        inner_path = "okay.tsv"

        data = [
            ("test.zip", write_zipfile_csv, read_zipfile_csv),
            ("test.tar.gz", write_tarfile_csv, read_tarfile_csv),
        ]
        for name, writer, reader in data:
            with self.subTest(name=name), tempfile.TemporaryDirectory() as directory:
                directory = Path(directory)
                path = directory / name
                self.assertFalse(path.exists())
                writer(df, path=path, inner_path=inner_path)
                self.assertTrue(path.exists())
                new_df = reader(path=path, inner_path=inner_path)
                self.assertEqual(list(df.columns), list(new_df.columns))
                self.assertEqual(df.values.tolist(), new_df.values.tolist())

    # NOTE(review): ``test_xml_io`` continues in the next chunk of this dump.
second = etree.SubElement(level1, "Tokens") level2 = etree.SubElement(second, "Token", word="low") level3 = etree.SubElement(level2, "Morph") second1 = etree.SubElement(level3, "Lemma") second1.text = "sdfs" second1 = etree.SubElement(level3, "info") second1.text = "qw" level4 = etree.SubElement(level3, "Aff") second1 = etree.SubElement(level4, "Type") second1.text = "sdfs" second1 = etree.SubElement(level4, "Suf") second1.text = "qw" tree = etree.ElementTree(root) inner_path = "okay.tsv" data = [ ("test.zip", write_zipfile_xml, read_zipfile_xml), ] for name, writer, reader in data: with self.subTest(name=name), tempfile.TemporaryDirectory() as directory: directory = Path(directory) path = directory / name self.assertFalse(path.exists()) writer(tree, path=path, inner_path=inner_path) self.assertTrue(path.exists()) new_tree = reader(path=path, inner_path=inner_path) self.assertEqual( etree.tostring(tree, pretty_print=True), etree.tostring(new_tree, pretty_print=True), ) def test_numpy_io(self): """Test IO with numpy.""" arr = np.array([[0, 1], [2, 3]]) inner_path = "okay.npz" with tempfile.TemporaryDirectory() as directory: directory = Path(directory) path = directory / "test.zip" write_zipfile_np(arr, inner_path=inner_path, path=path) reloaded_arr = read_zip_np(path=path, inner_path=inner_path) self.assertTrue(np.array_equal(arr, reloaded_arr)) class TestHashing(unittest.TestCase): """Tests for hexdigest checking.""" def setUp(self) -> None: """Set up a test.""" self.directory = tempfile.TemporaryDirectory() self.path = Path(self.directory.name).joinpath("test.tsv") md5 = hashlib.md5() # noqa:S303,S324 with TEST_TXT.open("rb") as file: md5.update(file.read()) self.expected_md5 = md5.hexdigest() self.mismatching_md5_hexdigest = "yolo" self.assertNotEqual(self.mismatching_md5_hexdigest, self.expected_md5) def tearDown(self) -> None: """Tear down a test.""" self.directory.cleanup() def test_hash_success(self): """Test checking actually works.""" 
self.assertFalse(self.path.exists()) download( url=TEST_TXT.as_uri(), path=self.path, hexdigests={ "md5": self.expected_md5, }, ) @skip_on_windows def test_hash_remote_success(self): """Test checking actually works.""" self.assertFalse(self.path.exists()) download( url=TEST_TXT.as_uri(), path=self.path, hexdigests_remote={ "md5": TEST_TXT_MD5.as_uri(), }, hexdigests_strict=True, ) self.assertTrue(self.path.exists()) @skip_on_windows def test_hash_remote_verbose_success(self): """Test checking actually works.""" self.assertFalse(self.path.exists()) download( url=TEST_TXT.as_uri(), path=self.path, hexdigests_remote={ "md5": TEST_TXT_VERBOSE_MD5.as_uri(), }, hexdigests_strict=False, ) self.assertTrue(self.path.exists()) def test_hash_remote_verbose_failure(self): """Test checking actually works.""" self.assertFalse(self.path.exists()) with self.assertRaises(HexDigestError): download( url=TEST_TXT.as_uri(), path=self.path, hexdigests_remote={ "md5": TEST_TXT_VERBOSE_MD5.as_uri(), }, hexdigests_strict=True, ) def test_hash_error(self): """Test hash error on download.""" self.assertFalse(self.path.exists()) with self.assertRaises(HexDigestError): download( url=TEST_TXT.as_uri(), path=self.path, hexdigests={ "md5": self.mismatching_md5_hexdigest, }, ) def test_hash_remote_error(self): """Test hash error on download.""" self.assertFalse(self.path.exists()) with self.assertRaises(HexDigestError): download( url=TEST_TXT.as_uri(), path=self.path, hexdigests_remote={ "md5": TEST_TXT_WRONG_MD5.as_uri(), }, hexdigests_strict=True, ) def test_override_hash_error(self): """Test hash error on download.""" self.path.write_text("test file content") self.assertTrue(self.path.exists()) with self.assertRaises(HexDigestError): download( url=TEST_TXT.as_uri(), path=self.path, hexdigests={ "md5": self.expected_md5, }, force=False, ) def test_override_hash_remote_error(self): """Test hash error on download.""" self.path.write_text("test file content") self.assertTrue(self.path.exists()) 
with self.assertRaises(HexDigestError): download( url=TEST_TXT.as_uri(), path=self.path, hexdigests_remote={ "md5": TEST_TXT_MD5.as_uri(), }, hexdigests_strict=True, force=False, ) def test_force(self): """Test overwriting wrong file.""" # now if force=True it should not bother with the hash check self.path.write_text("test file content") self.assertTrue(self.path.exists()) download( url=TEST_TXT.as_uri(), path=self.path, hexdigests={ "md5": self.expected_md5, }, force=True, ) @skip_on_windows def test_remote_force(self): """Test overwriting wrong file.""" # now if force=True it should not bother with the hash check self.path.write_text("test file content") self.assertTrue(self.path.exists()) download( url=TEST_TXT.as_uri(), path=self.path, hexdigests_remote={ "md5": TEST_TXT_MD5.as_uri(), }, hexdigests_strict=True, force=True, ) def test_hexdigest_urls(self): """Test getting hex digests from URLs.""" for url, strict in [ (TEST_TXT_MD5, True), (TEST_TXT_MD5, False), (TEST_TXT_VERBOSE_MD5, False), ]: hexdigests = get_hexdigests_remote( {"md5": url.as_uri()}, hexdigests_strict=strict, ) self.assertEqual( "4221d002ceb5d3c9e9137e495ceaa647", hexdigests["md5"], ) hexdigests = get_hexdigests_remote( {"md5": TEST_TXT_VERBOSE_MD5.as_uri()}, hexdigests_strict=True ) self.assertNotEqual( "4221d002ceb5d3c9e9137e495ceaa647", hexdigests["md5"], ) # Live test case # hexdigests = get_hexdigests_remote( # {"md5": "https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed22n0001.xml.gz.md5"}, # hexdigests_strict=False, # ) # self.assertEqual( # { # "md5": "0f08d8f3947dde1f3bced5e1f450c0da", # }, # hexdigests, # ) pystow-0.5.5/tox.ini000066400000000000000000000115161466707254100144210ustar00rootroot00000000000000# Tox (http://tox.testrun.org/) is a tool for running tests # in multiple virtualenvs. This configuration file will run the # test suite on all supported python versions. To use it, "pip install tox" # and then run "tox" from this directory. 
[tox]
# To use a PEP 517 build-backend you are required to configure tox to use an isolated_build:
# https://tox.readthedocs.io/en/latest/example/package.html
isolated_build = True
# These environments are run in order if you just use `tox`:
envlist =
    # always keep coverage-clean first
    #coverage-clean
    # code linters/stylers
    manifest
    lint
    flake8
    pyroma
    mypy
    # documentation linters/checkers
    doc8
    docstr-coverage
    docs-test
    # the actual tests
    py
    # always keep coverage-report last
    #coverage-report

[testenv]
commands =
    coverage run -p -m pytest --durations=20 {posargs:tests}
    coverage combine
    coverage xml
passenv =
    HOME
extras =
    tests
    pandas
    rdf
    xml
allowlist_externals =
    /bin/cat
    /bin/cp
    /bin/mkdir
    /usr/bin/cat
    /usr/bin/cp
    /usr/bin/mkdir
    /usr/bin/git
    /usr/local/bin/git

[testenv:coverage-xml]
deps = coverage
skip_install = true
commands =
    coverage xml

[testenv:coverage-clean]
deps = coverage
skip_install = true
commands = coverage erase

[testenv:manifest]
deps = check-manifest
skip_install = true
commands = check-manifest

[testenv:pre-commit]
skip_install = true
deps =
    pre-commit
commands = pre-commit run -a
usedevelop = true
description = Run the pre-commit tool

[testenv:flake8]
skip_install = true
deps =
    flake8
    flake8-bandit
    flake8-colors
    flake8-docstrings
    flake8-isort
    flake8-bugbear
    flake8-broken-line
    flake8-black
    flake8-print
    pep8-naming
    pydocstyle
    darglint
commands =
    flake8 src/pystow/ tests/
description = Run the flake8 tool with several plugins (bandit, docstrings, import order, pep8 naming).

[testenv:lint]
deps =
    black
    isort
skip_install = true
commands =
    black src/ tests/
    isort src/ tests/
description = Run linters.

[testenv:mypy]
deps =
    mypy
    types-requests
skip_install = true
commands = mypy --install-types --non-interactive --ignore-missing-imports --strict src/pystow/
description = Run the mypy tool to check static typing on the project.

[testenv:pyroma]
deps =
    pygments
    pyroma
skip_install = true
commands = pyroma --min=10 .
description = Run the pyroma tool to check the package friendliness of the project.

[testenv:doc8]
skip_install = true
deps =
    sphinx
    doc8
commands =
    doc8 docs/source/
description = Run the doc8 tool to check the style of the RST files in the project docs.

[testenv:docstr-coverage]
skip_install = true
deps =
    docstr-coverage
commands =
    docstr-coverage src/pystow/ tests/ --skip-private --skip-magic
description = Run the docstr-coverage tool to check documentation coverage

[testenv:docs]
description = Build the documentation locally.
extras =
    docs
    rdf
    xml
    pandas
    aws
commands =
    python -m sphinx -W -b html -d docs/build/doctrees docs/source docs/build/html

[testenv:docs-test]
description = Test building the documentation in an isolated environment.
changedir = docs
extras =
    {[testenv:docs]extras}
commands =
    mkdir -p {envtmpdir}
    cp -r source {envtmpdir}/source
    python -m sphinx -W -b html -d {envtmpdir}/build/doctrees {envtmpdir}/source {envtmpdir}/build/html
    python -m sphinx -W -b coverage -d {envtmpdir}/build/doctrees {envtmpdir}/source {envtmpdir}/build/coverage
    cat {envtmpdir}/build/coverage/c.txt
    cat {envtmpdir}/build/coverage/python.txt
allowlist_externals =
    /bin/cp
    /bin/cat
    /bin/mkdir
    # for compatibility on GitHub actions
    /usr/bin/cp
    /usr/bin/cat
    /usr/bin/mkdir

[testenv:coverage-report]
deps = coverage
skip_install = true
commands =
    coverage combine
    coverage report

####################
# Deployment tools #
####################

[testenv:bumpversion]
commands = bump2version {posargs}
skip_install = true
passenv = HOME
deps =
    bump2version

[testenv:bumpversion-release]
commands = bump2version release --tag
skip_install = true
passenv = HOME
deps =
    bump2version

[testenv:build]
skip_install = true
deps =
    wheel
    build
    setuptools
commands =
    python -m build --sdist --wheel --no-isolation

[testenv:release]
skip_install = true
passenv =
    TWINE_USERNAME
    TWINE_PASSWORD
deps =
    {[testenv:build]deps}
    twine >= 1.5.0
commands =
    {[testenv:build]commands}
    twine upload --non-interactive --skip-existing dist/*

[testenv:finish]
skip_install = true
passenv =
    HOME
    TWINE_USERNAME
    TWINE_PASSWORD
deps =
    {[testenv:release]deps}
    bump2version
commands =
    bump2version release --tag
    {[testenv:release]commands}
    git push --tags
    bump2version patch
    git push
allowlist_externals =
    /usr/bin/git