pax_global_header00006660000000000000000000000064135675333710014527gustar00rootroot0000000000000052 comment=4daa59f43a621702cd2ecb38648c33507cd81524 filesystem_spec-0.6.1/000077500000000000000000000000001356753337100147315ustar00rootroot00000000000000filesystem_spec-0.6.1/.coveragerc000066400000000000000000000004001356753337100170440ustar00rootroot00000000000000[run] omit = */test_*.py fsspec/_version.py source = fsspec [report] # Regexes for lines to exclude from consideration exclude_lines = pragma: no cover raise AssertionError raise NotImplementedError pass ignore_errors = True filesystem_spec-0.6.1/.gitattributes000066400000000000000000000000401356753337100176160ustar00rootroot00000000000000fsspec/_version.py export-subst filesystem_spec-0.6.1/.gitignore000066400000000000000000000022721356753337100167240ustar00rootroot00000000000000# Dask dask-worker-space # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg pip-wheel-metadata/ # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # dotenv .env # virtualenv .venv venv/ ENV/ .idea/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ filesystem_spec-0.6.1/.pre-commit-config.yaml000066400000000000000000000005541356753337100212160ustar00rootroot00000000000000exclude: > (?x)^( \.tox/.* )$ default_language_version: python: python3.7 repos: - repo: local hooks: - id: black name: black entry: black language: python require_serial: true types: [python] - repo: https://github.com/pre-commit/pre-commit-hooks rev: v2.3.0 hooks: - id: flake8 filesystem_spec-0.6.1/.travis.yml000066400000000000000000000004341356753337100170430ustar00rootroot00000000000000sudo: required dist: xenial os: - linux services: - docker language: generic env: - TOXENV=py35 - TOXENV=py36 - TOXENV=py37 - TOXENV=coverage - TOXENV=lint - TOXENV=s3fs - TOXENV=gcsfs install: - source ci/install.sh script: - tox -v notifications: email: false filesystem_spec-0.6.1/LICENSE000066400000000000000000000027511356753337100157430ustar00rootroot00000000000000BSD 3-Clause License Copyright (c) 2018, Martin Durant All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. filesystem_spec-0.6.1/MANIFEST.in000066400000000000000000000001561356753337100164710ustar00rootroot00000000000000include versioneer.py include fsspec/_version.py include LICENSE include README.rst include requirements.txt filesystem_spec-0.6.1/README.md000066400000000000000000000061701356753337100162140ustar00rootroot00000000000000# filesystem_spec [![Build Status](https://travis-ci.org/intake/filesystem_spec.svg?branch=master)](https://travis-ci.org/martindurant/filesystem_spec) [![Docs](https://readthedocs.org/projects/filesystem-spec/badge/?version=latest)](https://filesystem-spec.readthedocs.io/en/latest/?badge=latest) A specification for pythonic filesystems. 
## Install ```bash pip install fsspec ``` or ```bash conda install -c conda-forge fsspec ``` ## Purpose To produce a template or specification for a file-system interface, that specific implementations should follow, so that applications making use of them can rely on a common behaviour and not have to worry about the specific internal implementation decisions with any given backend. Many such implementations are included in this package, or in sister projects such as `s3fs` and `gcsfs`. In addition, if this is well-designed, then additional functionality, such as a key-value store or FUSE mounting of the file-system implementation may be available for all implementations "for free". ## Documentation Please refer to [RTD](https://filesystem-spec.readthedocs.io/en/latest/?badge=latest) ## Develop fsspec uses [tox](https://tox.readthedocs.io/en/latest/) and [tox-conda](https://github.com/tox-dev/tox-conda) to manage dev and test environments. First, install conda with tox and tox-conda in a base environment (e.g. `conda install -c conda-forge tox tox-conda`). Calls to `tox` can then be used to configure a development environment and run tests. First, set up a development conda environment via `tox -e dev`. This will install fsspec dependencies, test & dev tools, and install fsspec in develop mode. Then, activate the dev environment under `.tox/dev` via `conda activate .tox/dev`. ### Testing Tests can be run directly in the activated dev environment via `pytest fsspec`. The full fsspec test suite can be run via `tox`, which will set up and execute tests against multiple dependency versions in isolated environments. Run `tox -av` to list available test environments, select environments via `tox -e <env>`. The full fsspec suite requires a system-level docker, docker-compose, and fuse installation. See `ci/install.sh` for a detailed installation example. 
### Code Formatting fsspec uses [Black](https://black.readthedocs.io/en/stable) to ensure a consistent code format throughout the project. ``black`` is automatically installed in the tox dev env, activated via `conda activate .tox/dev`. Then, run `black fsspec` from the root of the filesystem_spec repository to auto-format your code. Additionally, many editors have plugins that will apply `black` as you edit files. Optionally, you may wish to setup [pre-commit hooks](https://pre-commit.com) to automatically run `black` when you make a git commit. ``black`` is automatically installed in the tox dev env, activated via `conda activate .tox/dev`. Then, run `pre-commit install --install-hooks` from the root of the filesystem_spec repository to setup pre-commit hooks. `black` will now be run before you commit, reformatting any changed files. You can format without committing via `pre-commit run` or skip these checks with `git commit --no-verify`. filesystem_spec-0.6.1/ci/000077500000000000000000000000001356753337100153245ustar00rootroot00000000000000filesystem_spec-0.6.1/ci/install.sh000077500000000000000000000014621356753337100173340ustar00rootroot00000000000000#!/usr/bin/env bash # https://docs.travis-ci.com/user/docker/#using-docker-compose DOCKER_COMPOSE_VERSION=${DOCKER_COMPOSE_VERSION:-1.23.2} # Install docker curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" sudo apt-get update sudo apt-get -y -o Dpkg::Options::="--force-confnew" install docker-ce # Update docker-compose sudo rm /usr/local/bin/docker-compose curl -L https://github.com/docker/compose/releases/download/${DOCKER_COMPOSE_VERSION}/docker-compose-`uname -s`-`uname -m` > docker-compose chmod +x docker-compose sudo mv docker-compose /usr/local/bin # install FUSE sudo apt-get install libfuse-dev # install conda source $(dirname $BASH_SOURCE)/install_conda.sh 
filesystem_spec-0.6.1/ci/install_conda.sh000077500000000000000000000005031356753337100204730ustar00rootroot00000000000000#!/usr/bin/env bash # Install conda wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh bash miniconda.sh -b -p $HOME/miniconda export PATH="$HOME/miniconda/bin:$PATH" conda config --set always_yes yes --set changeps1 no conda update conda conda install -c conda-forge tox tox-conda filesystem_spec-0.6.1/docs/000077500000000000000000000000001356753337100156615ustar00rootroot00000000000000filesystem_spec-0.6.1/docs/Makefile000066400000000000000000000011401356753337100173150ustar00rootroot00000000000000# Minimal makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build SPHINXPROJ = fsspec SOURCEDIR = source BUILDDIR = build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) filesystem_spec-0.6.1/docs/README.md000066400000000000000000000003621356753337100171410ustar00rootroot00000000000000# Building Documentation A basic python environment with packages listed in `./requirements.txt` is required to build the docs, see ``environment.yml``. 
To make HTML documentation: ```bash make html ``` Outputs to `build/html/index.html` filesystem_spec-0.6.1/docs/environment.yml000066400000000000000000000001701356753337100207460ustar00rootroot00000000000000name: fsspec channels: - defaults - conda-forge dependencies: - python=3.6 - paramiko - requests - numpydoc filesystem_spec-0.6.1/docs/make.bat000066400000000000000000000014121356753337100172640ustar00rootroot00000000000000@ECHO OFF pushd %~dp0 REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set SOURCEDIR=source set BUILDDIR=build set SPHINXPROJ=fsspec if "%1" == "" goto help %SPHINXBUILD% >NUL 2>NUL if errorlevel 9009 ( echo. echo.The 'sphinx-build' command was not found. Make sure you have Sphinx echo.installed, then set the SPHINXBUILD environment variable to point echo.to the full path of the 'sphinx-build' executable. Alternatively you echo.may add the Sphinx directory to PATH. echo. echo.If you don't have Sphinx installed, grab it from echo.http://sphinx-doc.org/ exit /b 1 ) %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% goto end :help %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% :end popd filesystem_spec-0.6.1/docs/source/000077500000000000000000000000001356753337100171615ustar00rootroot00000000000000filesystem_spec-0.6.1/docs/source/api.rst000066400000000000000000000052461356753337100204730ustar00rootroot00000000000000API Reference ============= .. currentmodule:: fsspec User Functions -------------- .. autosummary:: fsspec.open_files fsspec.open fsspec.filesystem fsspec.get_filesystem_class fsspec.get_mapper fsspec.fuse.run .. autofunction:: fsspec.open_files .. autofunction:: fsspec.open .. autofunction:: fsspec.filesystem .. autofunction:: fsspec.get_filesystem_class .. autofunction:: fsspec.get_mapper .. autofunction:: fsspec.fuse.run Base Classes ------------ .. 
autosummary:: fsspec.spec.AbstractFileSystem fsspec.spec.Transaction fsspec.spec.AbstractBufferedFile fsspec.FSMap fsspec.core.OpenFile fsspec.core.BaseCache .. autoclass:: fsspec.spec.AbstractFileSystem .. autoclass:: fsspec.spec.Transaction :members: .. autoclass:: fsspec.spec.AbstractBufferedFile :members: .. autoclass:: fsspec.FSMap :members: .. autoclass:: fsspec.core.OpenFile :members: .. autoclass:: fsspec.core.BaseCache :members: .. _implementations: Built-in Implementations ------------------------ .. autosummary:: fsspec.implementations.ftp.FTPFileSystem fsspec.implementations.hdfs.PyArrowHDFS fsspec.implementations.http.HTTPFileSystem fsspec.implementations.local.LocalFileSystem fsspec.implementations.memory.MemoryFileSystem fsspec.implementations.sftp.SFTPFileSystem fsspec.implementations.webhdfs.WebHDFS fsspec.implementations.zip.ZipFileSystem fsspec.implementations.cached.CachingFileSystem fsspec.implementations.cached.WholeFileCacheFileSystem .. autoclass:: fsspec.implementations.ftp.FTPFileSystem :members: __init__ .. autoclass:: fsspec.implementations.hdfs.PyArrowHDFS :members: __init__ .. autoclass:: fsspec.implementations.http.HTTPFileSystem :members: __init__ .. autoclass:: fsspec.implementations.local.LocalFileSystem :members: .. autoclass:: fsspec.implementations.memory.MemoryFileSystem :members: __init__ .. autoclass:: fsspec.implementations.sftp.SFTPFileSystem :members: __init__ .. autoclass:: fsspec.implementations.webhdfs.WebHDFS :members: __init__ .. autoclass:: fsspec.implementations.zip.ZipFileSystem :members: __init__ .. autoclass:: fsspec.implementations.cached.CachingFileSystem :members: __init__ .. autoclass:: fsspec.implementations.cached.WholeFileCacheFileSystem .. _readbuffering: Read Buffering -------------- .. autosummary:: fsspec.caching.ReadAheadCache fsspec.caching.BytesCache fsspec.caching.MMapCache fsspec.caching.BlockCache .. autoclass:: fsspec.caching.ReadAheadCache :members: .. 
autoclass:: fsspec.caching.BytesCache :members: .. autoclass:: fsspec.caching.MMapCache :members: .. autoclass:: fsspec.caching.BlockCache :members: filesystem_spec-0.6.1/docs/source/changelog.rst000066400000000000000000000032561356753337100216500ustar00rootroot00000000000000Changelog ========= Version 0.6.1 ------------- * ``LocalFileSystem`` is now considered a filestore by pyarrow (:pr:`211`) * Fixed bug in HDFS filesystem with ``cache_options`` (:pr:`202`) * Fixed instance caching bug with multiple instances (:pr:`203`) Version 0.6.0 ------------- * Fixed issues with filesystem instance caching. This was causing authorization errors in downstream libraries like ``gcsfs`` and ``s3fs`` in multi-threaded code (:pr:`155`, :pr:`181`) * Changed the default file caching strategy to :class:`fsspec.caching.ReadAheadCache` (:pr:`193`) * Moved file caches to the new ``fsspec.caching`` module. They're still available from their old location in ``fsspec.core``, but we recommend using the new location for new code (:pr:`195`) * Added a new file caching strategy, :class:`fsspec.caching.BlockCache` for fetching and caching file reads in blocks (:pr:`191`). * Fixed equality checks for file system instance to return ``False`` when compared to objects other than file systems (:pr:`192`) * Fixed a bug in :meth:`fsspec.FSMap.keys` returning a generator, which was consumed upon iteration (:pr:`189`). * Removed the magic addition of aliases in ``AbstractFileSystem.__init__``. Now alias methods are always present (:pr:`177`) * Deprecated passing ``trim`` to :class:`fsspec.spec.AbstractBufferedFile`. 
Pass it in ``storage_options`` instead (:pr:`188`) * Improved handling of requests for :class:`fsspec.implementations.http.HTTPFileSystem` when the HTTP server responds with an (incorrect) content-length of 0 (:pr:`163`) * Added a ``detail=True`` parameter to :meth:`fsspec.spec.AbstractFileSystem.ls` (:pr:`168`) * Fixed handling of UNC/DFS paths (:issue:`154`)filesystem_spec-0.6.1/docs/source/conf.py000066400000000000000000000123741356753337100204670ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # fsspec documentation build configuration file, created by # sphinx-quickstart on Mon Jan 15 18:11:02 2018. # # This file is execfile()d with the current directory set to its # containing dir. # # Note that not all possible configuration values are present in this # autogenerated file. # # All configuration values have a default; values that are commented out # serve to show the default. # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # import os import sys sys.path.insert(0, os.path.abspath("../..")) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. # # needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ "sphinx.ext.autodoc", "sphinx.ext.viewcode", "sphinx.ext.autosummary", "sphinx.ext.extlinks", "numpydoc", ] # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] source_suffix = ".rst" # The master toctree document. master_doc = "index" # General information about the project. 
project = "fsspec" copyright = "2018, Martin Durant" author = "Martin Durant" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. import fsspec version = fsspec.__version__ # The full version, including alpha/beta/rc tags. release = fsspec.__version__ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. language = None # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path exclude_patterns = [] # The name of the Pygments (syntax highlighting) style to use. pygments_style = "sphinx" # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = "sphinx_rtd_theme" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # # html_theme_options = {} # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = [] # Custom sidebar templates, must be a dictionary that maps document names # to template names. 
# # This is required for the alabaster theme # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars html_sidebars = { "**": [ "relations.html", # needs 'show_related': True theme option to display "searchbox.html", ] } # -- Options for HTMLHelp output ------------------------------------------ # Output file base name for HTML help builder. htmlhelp_basename = "fsspecdoc" # -- Options for LaTeX output --------------------------------------------- latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', # Additional stuff for the LaTeX preamble. # # 'preamble': '', # Latex figure (float) alignment # # 'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ (master_doc, "fsspec.tex", "fsspec Documentation", "Joseph Crail", "manual") ] # -- Options for manual page output --------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [(master_doc, "fsspec", "fsspec Documentation", [author], 1)] # -- Options for Texinfo output ------------------------------------------- # Grouping the document tree into Texinfo files. 
List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ ( master_doc, "fsspec", "fsspec Documentation", author, "fsspec", "One line description of project.", "Miscellaneous", ) ] extlinks = { "issue": ("https://github.com/intake/filesystem_spec/issues/%s", "GH#"), "pr": ("https://github.com/intake/filesystem_spec/pull/%s", "GH#"), } filesystem_spec-0.6.1/docs/source/features.rst000066400000000000000000000254541356753337100215430ustar00rootroot00000000000000Features of fsspec ================== Consistent API to many different storage backends. The general API and functionality were proven with the projects `s3fs`_ and `gcsfs`_ (along with `hdfs3`_ and `adlfs`_), within the context of Dask and independently. These have been tried and tested by many users and shown their usefulness over some years. ``fsspec`` aims to build on these and unify their models, as well as extract out file-system handling code from Dask which does not so comfortably fit within a library designed for task-graph creation and their scheduling. .. _s3fs: https://s3fs.readthedocs.io/en/latest/ .. _gcsfs: https://gcsfs.readthedocs.io/en/latest/ .. _hdfs3: https://hdfs3.readthedocs.io/en/latest/ .. _adlfs: https://azure-datalake-store.readthedocs.io/en/latest/ Here follows a brief description of some features of note of ``fsspec`` that promise to make it an interesting project beyond some other file-system abstractions. Serialisability --------------- Coming out of the Dask stable, it was an important design decision that file-system instances be serialisable, so that they could be created in one process (e.g., the client) and used in other processes (typically the workers). These other processes may even be on other machines, so in many cases they would need to be able to re-establish credentials, ideally without passing sensitive tokens in the pickled binary data. 
``fsspec`` instances, generally speaking, abide by these rules, do not include locks, files and other thread-local material, and where possible, use local credentials (such as a token file) for re-establishing sessions upon de-serialisation. (While making use of cached instances, where they exist, see below). ``OpenFile`` instances ---------------------- The :func:`fsspec.core.OpenFile` class provides a convenient way to prescribe the manner to open some file (local, remote, in a compressed store, etc.) which is portable, and can also apply any compression and text-mode to the file. These instances are also serialisable, because they do not contain any open files. The way to work with ``OpenFile`` s is to isolate interaction with them in a ``with`` context. It is the initiation of the context which actually does the work of creating file-like instances. .. code-block:: python of = fsspec.open(url, ...) # of is just a place-holder with of as f: # f is now a real file-like object holding resources f.read(...) Random Access and Buffering --------------------------- The :func:`fsspec.spec.AbstractBufferedFile` class is provided as an easy way to build file-like interfaces to some service which is capable of providing blocks of bytes. This class is derived from in a number of the existing implementations. A subclass of ``AbstractBufferedFile`` provides random access for the underlying file-like data (without downloading the whole thing) and configurable read-ahead buffers to minimise the number of the read operations that need to be performed on the back-end storage. This is also a critical feature in the big-data access model, where each sub-task of an operation may need only a small part of a file, and does not, therefore, want to be forced into downloading the whole thing. 
Transparent text-mode and compression ------------------------------------- As mentioned above, the ``OpenFile`` class allows for the opening of files on a binary store, which appear to be in text mode and/or allow for a compression/decompression layer between the caller and the back-end storage system. From the user's point of view, this is achieved simply by passing arguments to the :func:`fsspec.open_files` or :func:`fsspec.open` functions, and thereafter happens transparently. Key-value stores ---------------- File-systems are naturally dict-like key-value mappings: each (string) path corresponds to some binary data on the storage back-end. For some use-cases, it is very convenient to be able to view some path within the file-system as a dict-like store, and the function :func:`fsspec.get_mapper` gives a one-stop way to return such an object. This has become useful, for example, in the context of the `zarr`_ project, which stores its array chunks in keys in any arbitrary mapping-like object. .. code-block:: python mapper = fsspec.get_mapper('protocol://server/path', args) list(mapper) mapper[k] = b'some data' .. _zarr: https://zarr.readthedocs.io/en/stable/ PyArrow integration ------------------- `pyarrow`_ has its own internal idea of what a file-system is (``pyarrow.filesystem.FileSystem``), and some functions, particularly the loading of parquet, require that the target be compatible. As it happens, the design of the file-system interface in ``pyarrow`` *is* compatible with ``fsspec`` (this is not by accident). Therefore at import time, ``fsspec`` checks for the existence of ``pyarrow``, and, if found, adds it to the superclasses of the spec base-class. In this manner, all ``fsspec``-derived file-systems are also pyarrow file-systems, and can be used by pyarrow functions. .. 
_pyarrow: https://arrow.apache.org/docs/python/ Transactions ------------ ``fsspec`` supports *transactions*, during which writes to files on a remote store are deferred (typically put into a temporary location) until the transaction is over, whereupon the whole transaction is finalised in a semi-atomic way, and all the files are moved/committed to their final destination. The implementation of the details is file-system specific (and not all support it yet), but the idea is that all files should get written or none, to mitigate against data corruption. The feature can be used like .. code-block:: python fs = fsspec.filesystem(...) with fs.transaction: with fs.open('file1', 'wb') as f: f.write(b'some data') with fs.open('file2', 'wb') as f: f.write(b'more data') Here, files 1 and 2 do not get moved to the target location until the transaction context finishes. If the context finishes due to an (uncaught) exception, then the files are discarded and the file target locations untouched. The class :func:`fsspec.spec.Transaction` allows for fine-tuning of the operation, and every ``fsspec`` instance has an instance of this as an attribute ``.transaction`` to give access. Note that synchronising transactions across multiple instances, perhaps across a cluster, is a harder problem to solve, and the implementation described here is only part of the solution. Mount anything with FUSE ------------------------ Any path of any file-system can be mapped to a local directory using pyfuse and :func:`fsspec.fuse.run`. This feature is experimental, but basic file listing with details, and read/write should generally be available to the extent that the remote file-system provides enough information. Naturally, if a file-system is read-only, then write operations will fail - but they will tend to fail late and with obscure error messages such as "bad address". Some specific quirks of some file-systems may cause confusion for FUSE. 
For example, it is possible for a given path on s3 to be both a valid key (i.e., containing binary data, like a file) and a valid prefix (i.e., can be listed to find subkeys, like a directory). Since this breaks the assumptions of a normal file-system, it may not be possible to reach all paths on the remote. Instance Caching ---------------- If a file-system implementation class is marked as *cachable* (attribute ``.cachable``), then its instances will get stored in a class attribute, to enable quick look-up instead of needing to regenerate potentially expensive connections and sessions. The key in the cache is a tokenisation of the arguments used to create the instance. The cache itself (attribute ``._cache``) is currently a simple dict, but could in the future be LRU, or something more complicated, to fine-tune instance lifetimes. Since files can hold on to write caches and read buffers, the instance cache may cause excessive memory usage in some situations; but normally, files will get closed, and the data discarded. Only when there is also an unfinalised transaction or captured traceback might this be expected to become a problem. File Buffering -------------- Most implementations create file objects which derive from ``fsspec.spec.AbstractBufferedFile``, and have many behaviours in common. These files offer buffering of both read and write operations, so that communication with the remote resource is limited. The size of the buffer is generally configured with the ``blocksize=`` kwarg at open time, although the implementation may have some minimum or maximum sizes that need to be respected. For reading, a number of buffering schemes are available, listed in ``fsspec.caching.caches`` (see :ref:`readbuffering`), or "none" for no buffering at all, e.g., for a simple read-ahead buffer, you can do .. code-block:: python fs = fsspec.filesystem(...) 
with fs.open(path, mode='rb', cache_type='readahead') as f: use_for_something(f) Caching Files Locally --------------------- ``fsspec`` allows you to access data on remote file systems; that is its purpose. However, such access can often be rather slow compared to local storage, so as well as buffering (see above), the option exists to copy files locally when you first access them, and thereafter to use the local data. This local cache of data might be temporary (i.e., attached to the process and discarded when the process ends) or at some specific location in your local storage. Two mechanisms are provided, and both involve wrapping a `target` filesystem. The following example creates a file-based cache. .. code-block:: python fs = fsspec.filesystem("filecache", target_protocol='s3', target_options={'anon': True}, cache_storage='/tmp/files/') Each time you open a remote file on S3, it will first copy it to a local temporary directory, and then all further access will use the local file. Since we specify a particular local location, the files will persist and can be reused from future sessions, although you can also set policies to have cached files expire after some time, or to check the remote file system on each open, to see if the target file has changed since it was copied. With the "blockcache" variant, data is downloaded block-wise: only the specific parts of the remote file which are accessed. This means that the local copy of the file might end up being much smaller than the remote one, if only certain parts of it are required. Whereas "filecache" works for all file system implementations, and provides a real local file for other libraries to use, "blockcache" has restrictions: that you have a storage/OS combination which supports sparse files, that the backend implementation uses files which derive from ``AbstractBufferedFile``, and that the library you pass the resultant object to accepts generic python file-like objects. 
You should not mix block- and file-caches in the same directory. filesystem_spec-0.6.1/docs/source/index.rst000066400000000000000000000042501356753337100210230ustar00rootroot00000000000000fsspec's: python filesystem interfaces ====================================== Filesystem Spec is a project to unify various projects and classes to work with remote filesystems and file-system-like abstractions using a standard pythonic interface. .. _highlight: Highlights ---------- - based on s3fs and gcsfs - ``fsspec`` instances are serializable and can be passed between processes/machines - the ``OpenFiles`` file-like instances are also serializable - implementations provide random access, to enable only the part of a file required to be read; plus a template to base other file-like classes on - file access can use transparent compression and text-mode - any file-system directory can be viewed as a key-value/mapping store - if installed, all file-system classes also subclass from ``pyarrow.filesystem.FileSystem``, so can work with any arrow function expecting such an instance - writes can be transactional: stored in a temporary location and only moved to the final destination when the transaction is committed - FUSE: mount any path from any backend to a point on your file-system - cached instances tokenised on the instance parameters These are described further in the :doc:`features` section. Installation ------------ pip install fsspec or conda install -c conda-forge fsspec Implementations --------------- This repo contains several file-system implementations, see :ref:`implementations`. However, the external projects ``s3fs`` and ``gcsfs`` depend on ``fsspec`` and share the same behaviours. ``Dask`` and ``Intake`` use ``fsspec`` internally for their IO needs. The current list of known implementations can be found as follows .. 
code-block:: python from fsspec.registry import known_implementations known_implementations These are only imported on request, which may fail if a required dependency is missing. The dictionary ``fsspec.registry`` contains all imported implementations, and can be mutated by user code, if necessary. .. toctree:: :maxdepth: 2 :caption: Contents: intro.rst usage.rst features.rst api.rst changelog.rst Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` filesystem_spec-0.6.1/docs/source/intro.rst000066400000000000000000000113771356753337100210570ustar00rootroot00000000000000Introduction ============ To get stuck into using the package, rather than reading about its philosophy and history, you can skip to :doc:`usage`. Background ---------- Python provides a standard interface for open files, so that alternate implementations of file-like object can work seamlessly with many function which rely only on the methods of that standard interface. A number of libraries have implemented a similar concept for file-systems, where file operations can be performed on a logical file-system which may be local, structured data store or some remote service. This repository is intended to be a place to define a standard interface that such file-systems should adhere to, such that code using them should not have to know the details of the implementation in order to operate on any of a number of backends. With hope, the community can come together to define an interface that is the best for the highest number of users, and having the specification, makes developing other file-system implementations simpler. History ------- I (Martin Durant) have been involved in building a number of remote-data file-system implementations, principally in the context of the `Dask`_ project. In particular, several are listed in `docs`_ with links to the specific repositories. 
With common authorship, there is much that is similar between the implementations, for example posix-like naming of the operations, and this has allowed Dask to be able to interact with the various backends and parse generic URLs in order to select amongst them. However, *some* extra code was required in each case to adapt the peculiarities of each implementation with the generic usage that Dask demanded. People may find the `code`_ which parses URLs and creates file-system instances interesting. .. _Dask: http://dask.pydata.org/en/latest/ .. _docs: http://dask.pydata.org/en/latest/remote-data-services.html .. _code: https://github.com/dask/dask/blob/master/dask/bytes/core.py#L266 At the same time, the Apache `Arrow`_ project was also concerned with a similar problem, particularly a common interface to local and HDFS files, for example the `hdfs`_ interface (which actually communicated with HDFS with a choice of driver). These are mostly used internally within Arrow, but Dask was modified in order to be able to use the alternate HDFS interface (which solves some security issues with `hdfs3`). In the process, a `conversation`_ was started, and I invite all interested parties to continue the conversation in this location. .. _Arrow: https://arrow.apache.org/ .. _hdfs: https://arrow.apache.org/docs/python/filesystems.html .. _conversation: https://github.com/dask/dask/issues/2880 There is a good argument that this type of code has no place in Dask, which is concerned with making graphs representing computations, and executing those graphs on a scheduler. Indeed, the file-systems are generally useful, and each has a user-base wider than just those that work via Dask. 
Influences ---------- The following are places to consider when choosing the definitions of how we would like the file-system specification to look: - python's `os`_ module and its `path` namespace; also other file-connected functionality in the standard library - posix/bash method naming conventions that linux/unix/osx users are familiar with; or perhaps their Windows variants - the existing implementations for the various backends (e.g., `gcsfs`_ or Arrow's `hdfs`_) - `pyfilesystems`_, an attempt to do something similar, with a plugin architecture. This conception has several types of local file-system, and a lot of well-thought-out validation code. .. _os: https://docs.python.org/3/library/os.html .. _gcsfs: http://gcsfs.readthedocs.io/en/latest/api.html#gcsfs.core.GCSFileSystem .. _pyfilesystems: https://docs.pyfilesystem.org/en/latest/index.html Not pyfilesystems? ------------------ It might have been conceivable to reuse code in ``pyfilesystems``, which has an established interface and several implementations of its own. However, it supports none of the :ref:`highlight`, critical to cloud and parallel access, and would not be easy to coerce. Following on the success of ``s3fs`` and ``gcsfs``, and their use within Dask, it seemed best to have an interface as close to those as possible. See a `discussion`_ on the topic. .. _discussion: https://github.com/intake/filesystem_spec/issues/5 Structure of the package ------------------------ The best place to get a feel for the contents of ``fsspec`` is by looking through the :doc:`usage` and :doc:`api` sections. In addition, the source code will be interesting for those who wish to subclass and develop new file-system implementations. ``fsspec/spec.py`` contains the main abstract file-system class to derive from, ``AbstractFileSystem``. ..
_zarr: https://zarr.readthedocs.io filesystem_spec-0.6.1/docs/source/usage.rst000066400000000000000000000067071356753337100210310ustar00rootroot00000000000000Usage ===== This is quick-start documentation to help people get familiar with the layout and functioning of ``fsspec``. Instantiate a file-system ------------------------- ``fsspec`` provides an abstract file-system interface as a template for other filesystems. In this context, "interface" means an API for working with files on the given file-system, which can mean files on some remote store, local files, files within some wrapper, or anything else that is capable of producing file-like objects. Some concrete implementations are bundled with ``fsspec`` and others can be installed separately. They can be instantiated directly, or the `registry` can be used to find them. Direct instantiation: .. code-block:: python from fsspec.implementations.local import LocalFileSystem fs = LocalFileSystem() Look-up via registry: .. code-block:: python import fsspec fs = fsspec.filesystem('file') Many filesystems also take extra parameters, some of which may be options - see :doc:`api`. .. code-block:: python import fsspec fs = fsspec.filesystem('ftp', host=host, port=port, username=user, password=pw) Use a file-system ----------------- File-system instances offer a large number of methods for getting information about and manipulating files for the given back-end. Although some specific implementations may not offer all features (e.g., ``http`` is read-only), generally all normal operations, such as ``ls``, ``rm``, should be expected to work (see the full list: :class:`fsspec.spec.AbstractFileSystem`). Note that this quick-start will prefer posix-style naming, but many common operations are aliased: ``cp()`` and ``copy()`` are identical, for instance. Functionality is generally chosen to be as close to the builtin ``os`` module's working for things like ``glob`` as possible. 
The ``open()`` method will return a file-like object which can be passed to any other library that expects to work with python files. These will normally be binary-mode only, but may implement internal buffering in order to limit the number of reads from a remote source. They respect the use of ``with`` contexts. If you have ``pandas`` installed, for example, you can do the following: .. code-block:: python with fs.open('https://raw.githubusercontent.com/dask/' 'fastparquet/master/test-data/nation.csv') as f: df = pd.read_csv(f, sep='|', header=None) Higher-level ------------ For many situations, the only function that will be needed is :func:`fsspec.open_files()`, which will return :class:`fsspec.core.OpenFile` instances created from a single URL and parameters to pass to the backend. This supports text-mode and compression on the fly, and the objects can be serialized for passing between processes or machines (so long as each has access to the same backend file-system). The protocol (i.e., backend) is inferred from the URL passed, and glob characters are expanded in read mode (search for files) or write mode (create names). Critically, the file on the backend system is not actually opened until the ``OpenFile`` instance is used in a ``with`` context. For the example above: .. code-block:: python of = fsspec.open('https://raw.githubusercontent.com/dask/' 'fastparquet/master/test-data/nation.csv', mode='r') # files is a not-yet-open OpenFile object. 
The "with" context actually opens it with of as f: # now f is a text-mode file df = pd.read_csv(f, sep='|', header=None) filesystem_spec-0.6.1/fsspec/000077500000000000000000000000001356753337100162145ustar00rootroot00000000000000filesystem_spec-0.6.1/fsspec/__init__.py000066400000000000000000000010061356753337100203220ustar00rootroot00000000000000from ._version import get_versions from .spec import AbstractFileSystem from .registry import get_filesystem_class, registry, filesystem from .mapping import FSMap, get_mapper from .core import open_files, get_fs_token_paths, open from . import caching __version__ = get_versions()["version"] del get_versions __all__ = [ "AbstractFileSystem", "FSMap", "filesystem", "get_filesystem_class", "get_fs_token_paths", "get_mapper", "open", "open_files", "registry", "caching", ] filesystem_spec-0.6.1/fsspec/_version.py000066400000000000000000000441171356753337100204210ustar00rootroot00000000000000# This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build # directories (produced by setup.py build) will contain a much shorter file # that just contains the computed version number. # This file is released into the public domain. Generated by # versioneer-0.18 (https://github.com/warner/python-versioneer) """Git implementation of _version.py.""" import errno import os import re import subprocess import sys def get_keywords(): """Get the keywords needed to look up the version information.""" # these strings will be replaced by git during git-archive. # setup.py/versioneer.py will grep for the variable names, so they must # each be defined on a line of their own. _version.py will just call # get_keywords(). 
def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None):
    """Start the first launchable executable in ``commands`` and capture stdout.

    Every entry of ``commands`` is tried in order with ``args`` appended. An
    entry that cannot be found (``ENOENT``) is skipped in favour of the next
    one; any other launch failure aborts immediately.

    Returns a ``(stdout, returncode)`` pair:

    * ``(None, None)`` when no executable could be started;
    * ``(None, returncode)`` when the process exits with a non-zero status;
    * ``(stdout, 0)`` on success, where ``stdout`` is stripped and, on
      Python 3, decoded to ``str``.

    With ``verbose`` set, failures are reported through ``print``. With
    ``hide_stderr`` set, the child's stderr is piped (and its content dropped)
    rather than inherited from this process.
    """
    assert isinstance(commands, list)
    stderr_target = subprocess.PIPE if hide_stderr else None
    process = None
    for executable in commands:
        argv = [executable] + args
        dispcmd = str(argv)
        try:
            # remember shell=False, so use git.cmd on windows, not just git
            process = subprocess.Popen(
                argv, cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=stderr_target
            )
        except EnvironmentError as err:
            if err.errno == errno.ENOENT:
                # this candidate does not exist; try the next one
                continue
            if verbose:
                print("unable to run %s" % dispcmd)
                print(err)
            return None, None
        break

    if process is None:
        # every candidate was missing (or the list was empty)
        if verbose:
            print("unable to find command, tried %s" % (commands,))
        return None, None

    captured = process.communicate()[0].strip()
    if sys.version_info[0] >= 3:
        captured = captured.decode()
    if process.returncode == 0:
        return captured, process.returncode

    if verbose:
        print("unable to run %s (error)" % dispcmd)
        print("stdout was %s" % captured)
    return None, process.returncode
@register_vcs_handler("git", "get_keywords")
def git_get_keywords(versionfile_abs):
    """Extract version information from the given file.

    The file is scanned for lines whose stripped text starts with
    ``git_refnames =``, ``git_full =`` or ``git_date =``; the double-quoted
    value on each such line is stored under ``"refnames"``, ``"full"`` or
    ``"date"`` respectively.

    Parameters
    ----------
    versionfile_abs: str
        Path of the ``_version.py`` file to scan.

    Returns
    -------
    dict
        The keywords that were found. Empty if the file could not be read.
    """
    # the code embedded in _version.py can just fetch the value of these
    # keywords. When used from setup.py, we don't want to import _version.py,
    # so we do it with a regexp instead. This function is not used from
    # _version.py.
    assignments = (
        ("git_refnames =", "refnames"),
        ("git_full =", "full"),
        ("git_date =", "date"),
    )
    keywords = {}
    try:
        # The context manager closes the handle even when reading fails;
        # previously a failed read skipped f.close() and leaked the file.
        with open(versionfile_abs, "r") as f:
            lines = f.readlines()
    except EnvironmentError:
        # best effort: an unreadable file simply yields no keywords
        return keywords

    for line in lines:
        stripped = line.strip()
        for prefix, key in assignments:
            if stripped.startswith(prefix):
                mo = re.search(r'=\s*"(.*)"', line)
                if mo:
                    keywords[key] = mo.group(1)
    return keywords
By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". tags = set([r for r in refs if re.search(r"\d", r)]) if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: print("likely tags: %s" % ",".join(sorted(tags))) for ref in sorted(tags): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): r = ref[len(tag_prefix) :] if verbose: print("picking %s" % r) return { "version": r, "full-revisionid": keywords["full"].strip(), "dirty": False, "error": None, "date": date, } # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") return { "version": "0+unknown", "full-revisionid": keywords["full"].strip(), "dirty": False, "error": "no suitable tags", "date": None, } @register_vcs_handler("git", "pieces_from_vcs") def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): """Get version from 'git describe' in the root of the source tree. This only gets called if the git-archive 'subst' keywords were *not* expanded, and _version.py hasn't already been rewritten with a short version string, meaning we're inside a checked out source tree. 
""" GITS = ["git"] if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) if rc != 0: if verbose: print("Directory %s not under git control" % root) raise NotThisMethod("'git rev-parse --git-dir' returned error") # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) describe_out, rc = run_command( GITS, [ "describe", "--tags", "--dirty", "--always", "--long", "--match", "%s*" % tag_prefix, ], cwd=root, ) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") describe_out = describe_out.strip() full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) if full_out is None: raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() pieces = {} pieces["long"] = full_out pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] # TAG might have hyphens. git_describe = describe_out # look for -dirty suffix dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: git_describe = git_describe[: git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? 
def plus_or_dot(pieces):
    """Return a + if we don't already have one, else return a .

    The separator is chosen by looking for "+" in ``pieces["closest-tag"]``.
    A missing key and a key explicitly set to ``None`` are both treated as
    "no tag", so "+" is returned instead of raising ``TypeError`` on
    ``"+" in None``.
    """
    closest_tag = pieces.get("closest-tag") or ""
    return "." if "+" in closest_tag else "+"
def render_pep440_old(pieces):
    """Render ``pieces`` as TAG[.postDISTANCE[.dev0]].

    A trailing ".dev0" marks a dirty tree.

    Exceptions:
    1: no tags. The result is 0.postDISTANCE[.dev0]
    """
    tag = pieces["closest-tag"]

    if not tag:
        # exception #1
        version = "0.post%d" % pieces["distance"]
        return (version + ".dev0") if pieces["dirty"] else version

    if not (pieces["distance"] or pieces["dirty"]):
        # exactly on a tag with a clean tree: the tag is the version
        return tag

    version = tag + ".post%d" % pieces["distance"]
    if pieces["dirty"]:
        version += ".dev0"
    return version
def render(pieces, style):
    """Turn version ``pieces`` into a version dict using the given ``style``.

    If ``pieces["error"]`` is set, a dict with version "unknown" carrying that
    error is returned and ``style`` is ignored. Otherwise the renderer that
    matches ``style`` produces the version string; a false ``style`` or the
    literal "default" selects "pep440". Any other unrecognised style raises
    ``ValueError``.
    """
    if pieces["error"]:
        return {
            "version": "unknown",
            "full-revisionid": pieces.get("long"),
            "dirty": None,
            "error": pieces["error"],
            "date": None,
        }

    if not style or style == "default":
        style = "pep440"  # the default

    # (style name, thunk) pairs; a thunk only runs for the matching style
    renderers = (
        ("pep440", lambda: render_pep440(pieces)),
        ("pep440-pre", lambda: render_pep440_pre(pieces)),
        ("pep440-post", lambda: render_pep440_post(pieces)),
        ("pep440-old", lambda: render_pep440_old(pieces)),
        ("git-describe", lambda: render_git_describe(pieces)),
        ("git-describe-long", lambda: render_git_describe_long(pieces)),
    )
    for name, produce in renderers:
        if style == name:
            version = produce()
            break
    else:
        raise ValueError("unknown style '%s'" % style)

    return {
        "version": version,
        "full-revisionid": pieces["long"],
        "dirty": pieces["dirty"],
        "error": None,
        "date": pieces.get("date"),
    }
class BaseCache(object):
    """Pass-through cache: stores nothing and asks the fetcher on every read

    Serves as the base class for the other caching strategies.

    Parameters
    ----------
    blocksize: int
        How far to read ahead in numbers of bytes
    fetcher: func
        Function of the form f(start, end) which gets bytes from remote as
        specified
    size: int
        How big this file is
    """

    def __init__(self, blocksize, fetcher, size):
        self.size = size
        self.fetcher = fetcher
        self.blocksize = blocksize

    def _fetch(self, start, end):
        # no caching at this level: delegate straight to the fetcher
        return self.fetcher(start, end)

    def __getitem__(self, item: slice):
        """Return the bytes for a contiguous slice via ``_fetch``.

        Open endpoints default to 0 and ``size``; negative endpoints are
        counted back from ``size``. Raises ``TypeError`` for non-slice indices
        and ``ValueError`` for a step other than 1.
        """
        if not isinstance(item, slice):
            raise TypeError(
                "Cache indices must be a contiguous slice. Got {} instead.".format(
                    type(item)
                )
            )
        if item.step and item.step != 1:
            raise ValueError(
                "Cache indices must be a contiguous slice. 'item' has step={}".format(
                    item.step
                )
            )

        # handle endpoints
        first, last = item.start, item.stop
        if first is None:
            first = 0
        elif first < 0:
            first = self.size + first
        if last is None:
            last = self.size
        elif last < 0:
            last = self.size + last
        return self._fetch(first, last)
class BlockCache(BaseCache):
    """
    Cache holding memory as a set of blocks.

    Requests are only ever made `blocksize` at a time, and are stored in an
    LRU cache. The least recently accessed block is discarded when more than
    `maxblocks` are stored.

    Parameters
    ----------
    blocksize : int
        The number of bytes to store in each block.
        Requests are only ever made for `blocksize`, so this
        should balance the overhead of making a request against
        the granularity of the blocks.
    fetcher : Callable
    size : int
        The total size of the file being cached.
    maxblocks : int
        The maximum number of blocks to cache for. The maximum memory
        use for this cache is then ``blocksize * maxblocks``.
    """

    def __init__(self, blocksize, fetcher, size, maxblocks=32):
        super().__init__(blocksize, fetcher, size)
        self.nblocks = math.ceil(size / blocksize)
        self.maxblocks = maxblocks
        # per-instance LRU wrapper around the uncached block fetch
        self._fetch_block_cached = functools.lru_cache(maxblocks)(self._fetch_block)

    def __repr__(self):
        return "<BlockCache blocksize={}, size={}, nblocks={}>".format(
            self.blocksize, self.size, self.nblocks
        )

    def cache_info(self):
        """
        The statistics on the block cache.

        Returns
        -------
        NamedTuple
            Returned directly from the LRU Cache used internally.
        """
        return self._fetch_block_cached.cache_info()

    def __getstate__(self):
        # Work on a copy: deleting from ``self.__dict__`` itself would strip
        # the LRU wrapper from the live instance and break it after pickling.
        state = self.__dict__.copy()
        del state["_fetch_block_cached"]
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        # the LRU wrapper is not part of the state, so rebuild it (empty)
        self._fetch_block_cached = functools.lru_cache(state["maxblocks"])(
            self._fetch_block
        )

    def _fetch(self, start, end):
        """
        Return the bytes in ``[start, end)``, reading through the block cache.

        Raises
        ------
        ValueError
            If ``end`` is smaller than ``start`` or larger than the file size.
        """
        if end < start:
            raise ValueError(
                "'end' ({}) is smaller than 'start' ({}).".format(end, start)
            )

        if end > self.size:
            raise ValueError("'end={}' larger than size ('{}')".format(end, self.size))

        # byte position -> block numbers
        start_block_number = start // self.blocksize
        end_block_number = end // self.blocksize

        # these are cached, so safe to do multiple calls for the same start and end.
        # (go through the LRU wrapper; calling ``_fetch_block`` directly would
        # hit the remote here and then again in ``_read_cache``)
        for block_number in range(start_block_number, end_block_number + 1):
            self._fetch_block_cached(block_number)

        return self._read_cache(
            start,
            end,
            start_block_number=start_block_number,
            end_block_number=end_block_number,
        )

    def _fetch_block(self, block_number):
        """
        Fetch the block of data for `block_number`.

        This is the uncached fetch; reads should normally go through
        ``_fetch_block_cached``.
        """
        # ``block_number == nblocks`` is allowed: ``_fetch`` asks for it when
        # ``end`` falls exactly on a block boundary.
        if block_number > self.nblocks:
            raise ValueError(
                "'block_number={}' is greater than the number of blocks ({})".format(
                    block_number, self.nblocks
                )
            )

        start = block_number * self.blocksize
        end = start + self.blocksize
        logger.info("BlockCache fetching block %d", block_number)
        block_contents = super()._fetch(start, end)
        return block_contents

    def _read_cache(self, start, end, start_block_number, end_block_number):
        """
        Read from our block cache.

        Parameters
        ----------
        start, end : int
            The start and end byte positions.
        start_block_number, end_block_number : int
            The start and end block numbers.
        """
        start_pos = start % self.blocksize
        end_pos = end % self.blocksize

        if start_block_number == end_block_number:
            block = self._fetch_block_cached(start_block_number)
            return block[start_pos:end_pos]

        else:
            # read from the initial
            out = []
            out.append(self._fetch_block_cached(start_block_number)[start_pos:])

            # intermediate blocks
            # Note: it'd be nice to combine these into one big request. However
            # that doesn't play nicely with our LRU cache.
            for block_number in range(start_block_number + 1, end_block_number):
                out.append(self._fetch_block_cached(block_number))

            # final block
            out.append(self._fetch_block_cached(end_block_number)[:end_pos])

            return b"".join(out)
def register_compression(name, callback, extensions, force=False):
    """Make a compression scheme available by name and by file extension.

    Once registered, the scheme can be requested explicitly by ``name`` or
    looked up from a file's extension through the extension table kept in
    ``fsspec.utils.compressions``.

    Args:
        name: (str) Name of the compression type, e.g. "gzip".
        callback: Callable of the form (infile, mode, **kwargs) -> file-like
            that wraps an already-open file object.
        extensions: (str, Iterable[str]) One extension, or several, that
            should map to this compression, e.g. "gz".
        force: (bool) Overwrite an existing registration of the name or of
            any of the extensions instead of failing.

    Raises:
        ValueError: If the name or one of the extensions is already
            registered and ``force`` is not set.
    """
    # A lone string means a single extension, not an iterable of characters.
    if isinstance(extensions, str):
        suffixes = [extensions]
    else:
        suffixes = extensions

    suffix_table = fsspec.utils.compressions

    # Check everything before touching either registry, so that a rejected
    # call leaves both of them unchanged.
    if not force and name in compr:
        raise ValueError("Duplicate compression registration: %s" % name)

    for suffix in suffixes:
        if not force and suffix in suffix_table:
            raise ValueError(
                "Duplicate compression file extension: %s (%s)" % (suffix, name)
            )

    compr[name] = callback

    for suffix in suffixes:
        suffix_table[suffix] = name
try:
    import zstandard as zstd
except ImportError:
    # zstandard is an optional dependency; without it "zstd" is simply not
    # registered.
    pass
else:

    def zstandard_file(infile, mode="rb"):
        """Wrap *infile* in a zstandard stream chosen by *mode*.

        A mode containing "r" yields a decompressing stream reader; any other
        mode yields a compressing stream writer (compression level 10).
        """
        if "r" not in mode:
            compressor = zstd.ZstdCompressor(level=10)
            return compressor.stream_writer(infile)
        decompressor = zstd.ZstdDecompressor()
        return decompressor.stream_reader(infile)

    register_compression("zstd", zstandard_file, "zst")
class OpenFile(object):
    """
    File-like object to be used in a context

    Can layer (buffered) text-mode and compression over any file-system, which
    are typically binary-only.

    These instances are safe to serialize, as the low-level file object
    is not created until invoked using `with`.

    Parameters
    ----------
    fs: FileSystem
        The file system to use for opening the file. Should match the interface
        of ``dask.bytes.local.LocalFileSystem``.
    path: str
        Location to open
    mode: str like 'rb', optional
        Mode of the opened file
    compression: str or None, optional
        Compression to apply
    encoding: str or None, optional
        The encoding to use if opened in text mode.
    errors: str or None, optional
        How to handle encoding errors if opened in text mode.
    newline: None or str
        Passed to TextIOWrapper in text mode, how to handle line endings.
    """

    def __init__(
        self,
        fs,
        path,
        mode="rb",
        compression=None,
        encoding=None,
        errors=None,
        newline=None,
    ):
        # Set first: ``close`` (also reached from ``__del__``) reads this
        # attribute, and the compression lookup below may raise.
        self.fobjects = []
        self.fs = fs
        self.path = path
        self.mode = mode
        self.compression = get_compression(path, compression)
        self.encoding = encoding
        self.errors = errors
        self.newline = newline

    def __reduce__(self):
        # Every constructor argument is included (``newline`` too) so that a
        # pickled/unpickled instance opens the file the same way.
        return (
            OpenFile,
            (
                self.fs,
                self.path,
                self.mode,
                self.compression,
                self.encoding,
                self.errors,
                self.newline,
            ),
        )

    def __repr__(self):
        return "<OpenFile '{}'>".format(self.path)

    def __fspath__(self):
        return self.path

    def __enter__(self):
        """Open the layered file objects and return the outermost one."""
        # The backend is always opened in binary mode; text handling is
        # layered on top further down.
        mode = self.mode.replace("t", "").replace("b", "") + "b"

        f = self.fs.open(self.path, mode=mode)

        self.fobjects = [f]

        if self.compression is not None:
            compress = compr[self.compression]
            # Compression wrappers get only the single-letter mode ("r", "w", ...).
            f = compress(f, mode=mode[0])
            self.fobjects.append(f)

        if "b" not in self.mode:
            # assume, for example, that 'r' is equivalent to 'rt' as in builtin
            f = io.TextIOWrapper(
                f, encoding=self.encoding, errors=self.errors, newline=self.newline
            )
            self.fobjects.append(f)

        return self.fobjects[-1]

    def __exit__(self, *args):
        self.close()

    def __del__(self):
        self.close()

    def open(self):
        """Materialise this as a real open file without context

        The file should be explicitly closed to avoid enclosed open file
        instances persisting
        """
        return self.__enter__()

    def close(self):
        """Close all encapsulated file objects"""
        # Outermost wrapper first, so buffered data is pushed down the stack
        # before the underlying file is closed.
        for f in reversed(self.fobjects):
            if "r" not in self.mode and not f.closed:
                f.flush()
            f.close()
        self.fobjects = []
def open(
    urlpath,
    mode="rb",
    compression=None,
    encoding="utf8",
    errors=None,
    protocol=None,
    newline=None,
    **kwargs
):
    """ Given a path or paths, return one ``OpenFile`` object.

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath. Prefix with a protocol like ``s3://``
        to read from alternative filesystems. Should not include glob
        character(s).
    mode: 'rb', 'wt', etc.
    compression: string
        Compression to use.  See ``dask.bytes.compression.files`` for options.
    encoding: str
        For text mode only
    errors: None or str
        Passed to TextIOWrapper in text mode
    protocol: str or None
        If given, overrides the protocol found in the URL.
    newline: bytes or None
        Used for line terminator in text mode. If None, uses system default;
        if blank, uses no translation.
    **kwargs: dict
        Extra options that make sense to a particular storage connection, e.g.
        host, port, username, password, etc.

    Examples
    --------
    >>> openfile = open('2015-01-01.csv')  # doctest: +SKIP
    >>> openfile = open(
    ...     's3://bucket/2015-01-01.csv.gz',
    ...     compression='gzip'
    ... )  # doctest: +SKIP
    >>> with openfile as f:
    ...     df = pd.read_csv(f)  # doctest: +SKIP

    Returns
    -------
    ``OpenFile`` object.
    """
    # ``protocol`` must go by keyword: the sixth positional parameter of
    # ``open_files`` is ``name_function``, so passing it positionally would
    # silently drop the protocol override.
    return open_files(
        [urlpath],
        mode,
        compression,
        encoding,
        errors,
        protocol=protocol,
        newline=newline,
        **kwargs
    )[0]
:return: list of paths """ expanded_paths = [] paths = list(paths) if "w" in mode and sum([1 for p in paths if "*" in p]) > 1: raise ValueError("When writing data, only one filename mask can be specified.") elif "w" in mode: num = max(num, len(paths)) for curr_path in paths: if "*" in curr_path: if "w" in mode: # expand using name_function expanded_paths.extend(_expand_paths(curr_path, name_function, num)) else: # expand using glob expanded_paths.extend(fs.glob(curr_path)) else: expanded_paths.append(curr_path) # if we generated more paths that asked for, trim the list if "w" in mode and len(expanded_paths) > num: expanded_paths = expanded_paths[:num] return expanded_paths def get_fs_token_paths( urlpath, mode="rb", num=1, name_function=None, storage_options=None, protocol=None ): """Filesystem, deterministic token, and paths from a urlpath and options. Parameters ---------- urlpath: string or iterable Absolute or relative filepath, URL (may include protocols like ``s3://``), or globstring pointing to data. mode: str, optional Mode in which to open files. num: int, optional If opening in writing mode, number of files we expect to create. name_function: callable, optional If opening in writing mode, this callable is used to generate path names. Names are generated for each partition by ``urlpath.replace('*', name_function(partition_index))``. storage_options: dict, optional Additional keywords to pass to the filesystem class. 
protocol: str or None To override the protocol specifier in the URL """ if isinstance(urlpath, (list, tuple)): if not urlpath: raise ValueError("empty urlpath sequence") protocols, paths = zip(*map(split_protocol, urlpath)) protocol = protocol or protocols[0] if not all(p == protocol for p in protocols): raise ValueError( "When specifying a list of paths, all paths must " "share the same protocol" ) cls = get_filesystem_class(protocol) optionss = list(map(cls._get_kwargs_from_urls, urlpath)) paths = [cls._strip_protocol(u) for u in urlpath] options = optionss[0] if not all(o == options for o in optionss): raise ValueError( "When specifying a list of paths, all paths must " "share the same file-system options" ) update_storage_options(options, storage_options) fs = cls(**options) paths = expand_paths_if_needed(paths, mode, num, fs, name_function) elif isinstance(urlpath, str) or hasattr(urlpath, "name"): protocols, path = split_protocol(urlpath) protocol = protocol or protocols cls = get_filesystem_class(protocol) options = cls._get_kwargs_from_urls(urlpath) path = cls._strip_protocol(urlpath) update_storage_options(options, storage_options) fs = cls(**options) if "w" in mode: paths = _expand_paths(path, name_function, num) elif "*" in path: paths = sorted(fs.glob(path)) else: paths = [path] else: raise TypeError("url type not understood: %s" % urlpath) return fs, fs._fs_token, paths def _expand_paths(path, name_function, num): if isinstance(path, str): if path.count("*") > 1: raise ValueError("Output path spec must contain exactly one '*'.") elif "*" not in path: path = os.path.join(path, "*.part") if name_function is None: name_function = build_name_function(num - 1) paths = [path.replace("*", name_function(i)) for i in range(num)] if paths != sorted(paths): logger.warning( "In order to preserve order between partitions" " paths created with ``name_function`` should " "sort to partition order" ) elif isinstance(path, (tuple, list)): assert len(path) == num paths = 
list(path) else: raise ValueError( "Path should be either\n" "1. A list of paths: ['foo.json', 'bar.json', ...]\n" "2. A directory: 'foo/\n" "3. A path with a '*' in it: 'foo.*.json'" ) return paths filesystem_spec-0.6.1/fsspec/fuse.py000066400000000000000000000112151356753337100175300ustar00rootroot00000000000000from __future__ import print_function import os import stat from errno import ENOENT, EIO from fuse import Operations, FuseOSError import threading import time from fuse import FUSE class FUSEr(Operations): def __init__(self, fs, path): self.fs = fs self.cache = {} self.root = path.rstrip("/") + "/" self.counter = 0 def getattr(self, path, fh=None): path = "".join([self.root, path.lstrip("/")]).rstrip("/") try: info = self.fs.info(path) except FileNotFoundError: raise FuseOSError(ENOENT) data = {"st_uid": 1000, "st_gid": 1000} perm = 0o777 if info["type"] != "file": data["st_mode"] = stat.S_IFDIR | perm data["st_size"] = 0 data["st_blksize"] = 0 else: data["st_mode"] = stat.S_IFREG | perm data["st_size"] = info["size"] data["st_blksize"] = 5 * 2 ** 20 data["st_nlink"] = 1 data["st_atime"] = time.time() data["st_ctime"] = time.time() data["st_mtime"] = time.time() return data def readdir(self, path, fh): path = "".join([self.root, path.lstrip("/")]) files = self.fs.ls(path, False) files = [os.path.basename(f.rstrip("/")) for f in files] return [".", ".."] + files def mkdir(self, path, mode): path = "".join([self.root, path.lstrip("/")]) self.fs.mkdir(path) return 0 def rmdir(self, path): path = "".join([self.root, path.lstrip("/")]) self.fs.rmdir(path) return 0 def read(self, path, size, offset, fh): f = self.cache[fh] f.seek(offset) out = f.read(size) return out def write(self, path, data, offset, fh): f = self.cache[fh] f.write(data) return len(data) def create(self, path, flags, fi=None): fn = "".join([self.root, path.lstrip("/")]) f = self.fs.open(fn, "wb") self.cache[self.counter] = f self.counter += 1 return self.counter - 1 def open(self, path, 
flags): fn = "".join([self.root, path.lstrip("/")]) if flags % 2 == 0: # read mode = "rb" else: # write/create mode = "wb" self.cache[self.counter] = self.fs.open(fn, mode) self.counter += 1 return self.counter - 1 def truncate(self, path, length, fh=None): fn = "".join([self.root, path.lstrip("/")]) if length != 0: raise NotImplementedError # maybe should be no-op since open with write sets size to zero anyway self.fs.touch(fn) def unlink(self, path): fn = "".join([self.root, path.lstrip("/")]) try: self.fs.rm(fn, False) except (IOError, FileNotFoundError): raise FuseOSError(EIO) def release(self, path, fh): try: if fh in self.cache: f = self.cache[fh] f.close() self.cache.pop(fh) except Exception as e: print(e) return 0 def chmod(self, path, mode): raise NotImplementedError def run(fs, path, mount_point, foreground=True, threads=False): """ Mount stuff in a local directory This uses fusepy to make it appear as if a given path on an fsspec instance is in fact resident within the local file-system. This requires that fusepy by installed, and that FUSE be available on the system (typically requiring a package to be installed with apt, yum, brew, etc.). Parameters ---------- fs: file-system instance From one of the compatible implementations path: str Location on that file-system to regard as the root directory to mount. Note that you typically should include the terminating "/" character. mount_point: str An empty directory on the local file-system where the contents of the remote path will appear foreground: bool Whether or not calling this function will block. Operation will typically be more stable if True. threads: bool Whether or not to create threads when responding to file operations within the mounter directory. Operation will typically be more stable if False. 
class CachingFileSystem(AbstractFileSystem):
    """Locally caching filesystem, layer over any other FS

    This class implements chunk-wise local storage of remote files, for quick
    access after the initial download. The files are stored in a given
    directory, each named by the SHA-256 hash of its path. If no directory is
    given, a temporary one is used, which should be cleaned up by the OS after
    the process ends. The files themselves are sparse (as implemented in
    MMapCache), so only the data which is accessed takes up space.

    Restrictions:

    - the block-size must be the same for each access of a given file, unless
      all blocks of the file have already been read
    - caching can only be applied to file-systems which produce files
      derived from fsspec.spec.AbstractBufferedFile ; LocalFileSystem is also
      allowed, for testing
    """

    protocol = ("blockcache", "cached")

    def __init__(
        self,
        target_protocol=None,
        cache_storage="TMP",
        cache_check=10,
        check_files=False,
        expiry_time=604800,
        target_options=None,
        **kwargs
    ):
        """

        Parameters
        ----------
        target_protocol: str
            Target filesystem protocol
        cache_storage: str or list(str)
            Location to store files. If "TMP", this is a temporary directory,
            and will be cleaned up by the OS when this process ends (or later).
            If a list, each location will be tried in the order given, but
            only the last will be considered writable.
        cache_check: int
            Number of seconds between reload of cache metadata
        check_files: bool
            Whether to explicitly see if the UID of the remote file matches
            the stored one before using. Warning: some file systems such as
            HTTP cannot reliably give a unique hash of the contents of some
            path, so be sure to set this option to False.
        expiry_time: int
            The time in seconds after which a local copy is considered useless.
            Set to falsy to prevent expiry. The default is equivalent to one
            week.
        target_options: dict or None
            Passed to the instantiation of the FS, if fs is None.
        """
        if self._cached:
            return
        super().__init__(**kwargs)
        if cache_storage == "TMP":
            storage = [tempfile.mkdtemp()]
        else:
            if isinstance(cache_storage, str):
                storage = [cache_storage]
            else:
                storage = cache_storage
        # Only the last location is written to, so it must exist up front.
        os.makedirs(storage[-1], exist_ok=True)
        self.storage = storage
        self.kwargs = target_options or {}
        self.cache_check = cache_check
        self.check_files = check_files
        self.expiry = expiry_time
        self.load_cache()
        if isinstance(target_protocol, AbstractFileSystem):
            # An already-built filesystem instance was passed in place of a
            # protocol name.
            self.fs = target_protocol
            self.protocol = self.fs.protocol
        else:
            self.protocol = target_protocol
            self.fs = filesystem(target_protocol, **self.kwargs)

    def __reduce_ex__(self, *_):
        return (
            self.__class__,
            (
                self.protocol,
                self.storage,
                self.cache_check,
                self.check_files,
                self.expiry,
                self.kwargs or None,
            ),
        )

    def load_cache(self):
        """Read set of stored blocks from file"""
        cached_files = []
        for storage in self.storage:
            fn = os.path.join(storage, "cache")
            if os.path.exists(fn):
                # NOTE(review): pickle.load runs arbitrary code from the file;
                # the cache directories must only be writable by trusted users.
                with open(fn, "rb") as f:
                    # TODO: consolidate blocks here
                    cached_files.append(pickle.load(f))
            else:
                os.makedirs(storage, exist_ok=True)
                cached_files.append({})
        self.cached_files = cached_files or [{}]
        self.last_cache = time.time()

    def save_cache(self):
        """Save set of stored blocks from file"""
        fn = os.path.join(self.storage[-1], "cache")
        # TODO: a file lock could be used to ensure file does not change
        #  between re-read and write; but occasional duplicated reads ok.
        cache = self.cached_files[-1]
        if os.path.exists(fn):
            # NOTE(review): pickle.load on the cache file - see load_cache.
            with open(fn, "rb") as f:
                cached_files = pickle.load(f)
            for k, c in cached_files.items():
                if k not in cache:
                    # Entry recorded on disk that this instance has not
                    # loaded; keep it as it is.
                    continue
                if c["blocks"] is not True:
                    if cache[k]["blocks"] is True:
                        c["blocks"] = True
                    else:
                        c["blocks"] = set(c["blocks"]).union(cache[k]["blocks"])
            # Files opened since the metadata was last written exist only in
            # memory; without this they would be dropped from the saved file.
            for k, c in cache.items():
                if k not in cached_files:
                    cached_files[k] = c
        else:
            cached_files = cache
        # Shallow-copy each entry so converting sets to lists for pickling
        # does not alter the live, shared ``blocks`` sets.
        cache = {k: v.copy() for k, v in cached_files.items()}
        for c in cache.values():
            if isinstance(c["blocks"], set):
                c["blocks"] = list(c["blocks"])
        with open(fn + ".temp", "wb") as f:
            pickle.dump(cache, f)
        # Swap the finished file into place, overwriting any existing one.
        os.replace(fn + ".temp", fn)

    def _check_cache(self):
        """Reload caches if time elapsed or any disappeared"""
        if not self.cache_check:
            # explicitly told not to bother checking
            return
        timecond = time.time() - self.last_cache > self.cache_check
        existcond = all(os.path.exists(storage) for storage in self.storage)
        if timecond or not existcond:
            self.load_cache()

    def _check_file(self, path):
        """Is path in cache and still valid

        Returns ``(detail, local_filename)`` for the first storage location
        holding a usable copy, or ``(False, None)`` if there is none.
        """
        self._check_cache()
        for storage, cache in zip(self.storage, self.cached_files):
            if path not in cache:
                continue
            detail = cache[path].copy()
            if self.check_files:
                if detail["uid"] != self.fs.ukey(path):
                    continue
            if self.expiry:
                # Age of the entry is "now" minus the time it was recorded.
                if time.time() - detail["time"] > self.expiry:
                    continue
            fn = os.path.join(storage, detail["fn"])
            if os.path.exists(fn):
                return detail, fn
        return False, None

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs
    ):
        """Wrap the target _open

        If the whole file exists in the cache, just open it locally and
        return that.

        Otherwise, open the file on the target FS, and make it have a mmap
        cache pointing to the location which we determine, in our cache.
        The ``blocks`` instance is shared, so as the mmap cache instance
        updates, so does the entry in our ``cached_files`` attribute.
        We monkey-patch this file, so that when it closes, we call
        ``close_and_update`` to save the state of the blocks.
        """
        path = self._strip_protocol(path)
        # NOTE(review): the concatenation below needs ``self.protocol`` to be
        # a str; confirm target filesystems never expose a tuple here.
        if not path.startswith(self.protocol):
            path = self.protocol + "://" + path
        if mode != "rb":
            # Only binary reads are cached; everything else goes straight to
            # the target filesystem.
            return self.fs._open(
                path,
                mode=mode,
                block_size=block_size,
                autocommit=autocommit,
                cache_options=cache_options,
                **kwargs
            )
        detail, fn = self._check_file(path)
        if detail:
            # file is in cache
            blocks = detail["blocks"]
            if blocks is True:
                # stored file is complete
                logger.debug("Opening local copy of %s" % path)
                return open(fn, "rb")
            # TODO: action where partial file exists in read-only cache
            logger.debug("Opening partially cached copy of %s" % path)
        else:
            name_hash = hashlib.sha256(path.encode()).hexdigest()
            fn = os.path.join(self.storage[-1], name_hash)
            blocks = set()
            detail = {
                "fn": name_hash,
                "blocks": blocks,
                "time": time.time(),
                "uid": self.fs.ukey(path),
            }
            self.cached_files[-1][path] = detail
            logger.debug("Creating local sparse file for %s" % path)

        # call target filesystems open
        f = self.fs._open(
            path,
            mode=mode,
            block_size=block_size,
            autocommit=autocommit,
            cache_options=cache_options,
            cache_type=None,
            **kwargs
        )
        if "blocksize" in detail:
            if detail["blocksize"] != f.blocksize:
                raise ValueError(
                    "Cached file must be reopened with same block"
                    "size as original (old: %i, new %i)"
                    "" % (detail["blocksize"], f.blocksize)
                )
        else:
            detail["blocksize"] = f.blocksize
        f.cache = MMapCache(f.blocksize, f._fetch_range, f.size, fn, blocks)
        # Wrap ``close`` so the block bookkeeping is saved when the file closes.
        close = f.close
        f.close = lambda: self.close_and_update(f, close)
        return f

    def close_and_update(self, f, close):
        """Called when a file is closing, so store the set of blocks"""
        if f.path.startswith(self.protocol):
            path = f.path
        else:
            path = self.protocol + "://" + f.path
        c = self.cached_files[-1][path]
        # Once the recorded blocks cover the whole file, mark it complete.
        if c["blocks"] is not True and len(c["blocks"]) * f.blocksize >= f.size:
            c["blocks"] = True
        self.save_cache()
        close()

    def __getattribute__(self, item):
        if item in [
            "load_cache",
            "_open",
            "save_cache",
            "close_and_update",
            "__init__",
            "__getattribute__",
            "__reduce_ex__",
            "open",
            "cat",
            "get",
            "read_block",
            "tail",
            "head",
            "_check_file",
            "_check_cache",
        ]:
            # all the methods defined in this class. Note `open` here, since
            # it calls `_open`, but is actually in superclass
            return lambda *args, **kw: getattr(type(self), item)(self, *args, **kw)
        if item == "__class__":
            return type(self)
        d = object.__getattribute__(self, "__dict__")
        fs = d.get("fs", None)  # fs is not immediately defined
        if item in d:
            return d[item]
        elif fs is not None:
            if item in fs.__dict__:
                # attribute of instance
                return fs.__dict__[item]
            # attributed belonging to the target filesystem
            cls = type(fs)
            m = getattr(cls, item)
            if inspect.isfunction(m) and (
                not hasattr(m, "__self__") or m.__self__ is None
            ):
                # instance method
                return m.__get__(fs, cls)
            return m  # class method or attribute
        else:
            # attributes of the superclass, while target is being set up
            return super().__getattribute__(item)
""" protocol = "filecache" def _open(self, path, mode="rb", **kwargs): path = self._strip_protocol(path) if not path.startswith(self.protocol): path = self.protocol + "://" + path if mode != "rb": return self.fs._open(path, mode=mode, **kwargs) detail, fn = self._check_file(path) if detail: hash, blocks = detail["fn"], detail["blocks"] if blocks is True: logger.debug("Opening local copy of %s" % path) return open(fn, "rb") else: raise ValueError( "Attempt to open partially cached file %s" "as a wholly cached file" % path ) else: hash = hashlib.sha256(path.encode()).hexdigest() fn = os.path.join(self.storage[-1], hash) blocks = True detail = { "fn": hash, "blocks": blocks, "time": time.time(), "uid": self.fs.ukey(path), } self.cached_files[-1][path] = detail logger.debug("Copying %s to local cache" % path) kwargs["mode"] = mode # call target filesystems open # TODO: why not just use fs.get ?? f = self.fs._open(path, **kwargs) with open(fn, "wb") as f2: if isinstance(f, AbstractBufferedFile): # want no type of caching if just downloading whole thing f.cache = BaseCache(0, f.cache.fetcher, f.size) if getattr(f, "blocksize", 0) and f.size: # opportunity to parallelise here data = True while data: data = f.read(f.blocksize) f2.write(data) else: # this only applies to HTTP, should instead use streaming f2.write(f.read()) self.save_cache() return self._open(path, mode) filesystem_spec-0.6.1/fsspec/implementations/dask.py000066400000000000000000000070641356753337100227270ustar00rootroot00000000000000from distributed.worker import get_worker from distributed.client import _get_global_client import dask from fsspec.spec import AbstractFileSystem, AbstractBufferedFile from fsspec import filesystem def make_instance(cls, args, kwargs): inst = cls(*args, **kwargs) inst._determine_worker() return inst class DaskWorkerFileSystem(AbstractFileSystem): """View files accessible to a worker as any other remote file-system When instances are run on the worker, uses the real filesystem. 
When run on the client, they call the worker to provide information or data. **Warning** this implementation is experimental, and read-only for now. """ def __init__(self, remote_protocol, remote_options=None, **kwargs): super().__init__(**kwargs) self.protocol = remote_protocol self.remote_options = remote_options self.worker = None self.client = None self.fs = None # What is the type here? self._determine_worker() def _determine_worker(self): try: get_worker() self.worker = True self.fs = filesystem(self.protocol, **(self.remote_options or {})) except ValueError: self.worker = False self.client = _get_global_client() self.rfs = dask.delayed(self) def __reduce__(self): return make_instance, (type(self), self.storage_args, self.storage_options) def mkdir(self, *args, **kwargs): if self.worker: self.fs.mkdir(*args, **kwargs) else: self.rfs.mkdir(*args, **kwargs).compute() def rm(self, *args, **kwargs): if self.worker: self.fs.rm(*args, **kwargs) else: self.rfs.rm(*args, **kwargs).compute() def copy(self, *args, **kwargs): if self.worker: self.fs.copy(*args, **kwargs) else: self.rfs.copy(*args, **kwargs).compute() def mv(self, *args, **kwargs): if self.worker: self.fs.mv(*args, **kwargs) else: self.rfs.mv(*args, **kwargs).compute() def ls(self, *args, **kwargs): if self.worker: return self.fs.ls(*args, **kwargs) else: return self.rfs.ls(*args, **kwargs).compute() def _open( self, path, mode="rb", block_size=None, autocommit=True, cache_options=None, **kwargs ): if self.worker: return self.fs._open( path, mode=mode, block_size=block_size, autocommit=autocommit, cache_options=cache_options, **kwargs ) else: return DaskFile( self, path, mode, block_size=block_size, autocommit=autocommit, cache_options=cache_options, **kwargs ) def fetch_range(self, path, mode, start, end): if self.worker: with self._open(path, mode) as f: f.seek(start) return f.read(end - start) else: return self.rfs.fetch_range(path, mode, start, end).compute() class DaskFile(AbstractBufferedFile): def 
_upload_chunk(self, final=False): pass def _initiate_upload(self): """ Create remote file/upload """ pass def _fetch_range(self, start, end): """Get the specified set of bytes from remote""" return self.fs.fetch_range(self.path, self.mode, start, end) filesystem_spec-0.6.1/fsspec/implementations/ftp.py000066400000000000000000000204101356753337100225640ustar00rootroot00000000000000from ftplib import FTP, Error, error_perm from socket import timeout import uuid from ..spec import AbstractBufferedFile, AbstractFileSystem from ..utils import infer_storage_options class FTPFileSystem(AbstractFileSystem): """A filesystem over classic """ root_marker = "/" cachable = False def __init__( self, host, port=21, username=None, password=None, acct=None, block_size=None, tempdir="/tmp", timeout=30, **kwargs ): """ You can use _get_kwargs_from_urls to get some kwargs from a reasonable FTP url. Authentication will be anonymous if username/password are not given. Parameters ---------- host: str The remote server name/ip to connect to port: int Port to connect with username: str or None If authenticating, the user's identifier password: str of None User's password on the server, if using acct: str or None Some servers also need an "account" string for auth block_size: int or None If given, the read-ahead or write buffer size. 
tempdir: str Directory on remote to put temporary files when in a transaction """ super(FTPFileSystem, self).__init__(**kwargs) self.host = host self.port = port self.tempdir = tempdir self.cred = username, password, acct self.timeout = timeout if block_size is not None: self.blocksize = block_size else: self.blocksize = 2 ** 16 self._connect() def _connect(self): self.ftp = FTP(timeout=self.timeout) self.ftp.connect(self.host, self.port) self.ftp.login(*self.cred) @classmethod def _strip_protocol(cls, path): return "/" + infer_storage_options(path)["path"].lstrip("/").rstrip("/") @staticmethod def _get_kwargs_from_urls(urlpath): out = infer_storage_options(urlpath) out.pop("path", None) out.pop("protocol", None) return out def invalidate_cache(self, path=None): if path is not None: self.dircache.pop(path, None) else: self.dircache.clear() def ls(self, path, detail=True): path = self._strip_protocol(path) out = [] if path not in self.dircache: try: try: out = [ (fn, details) for (fn, details) in self.ftp.mlsd(path) if fn not in [".", ".."] and details["type"] not in ["pdir", "cdir"] ] except error_perm: out = _mlsd2(self.ftp, path) # Not platform independent for fn, details in out: if path == "/": path = "" # just for forming the names, below details["name"] = "/".join([path, fn.lstrip("/")]) if details["type"] == "file": details["size"] = int(details["size"]) else: details["size"] = 0 self.dircache[path] = out except Error: try: info = self.info(path) if info["type"] == "file": out = [(path, info)] except (Error, IndexError): raise FileNotFoundError files = self.dircache.get(path, out) if not detail: return sorted([fn for fn, details in files]) return [details for fn, details in files] def info(self, path, **kwargs): # implement with direct method path = self._strip_protocol(path) files = self.ls(self._parent(path).lstrip("/"), True) try: out = [f for f in files if f["name"] == path][0] except IndexError: raise FileNotFoundError(path) return out def _open( self, 
def mkdir(self, path, **kwargs):
    """Create a directory on the server.

    Parameters
    ----------
    path: str
        Directory to create; any protocol prefix is stripped first.
    kwargs:
        Accepted for interface compatibility and ignored.
    """
    path = self._strip_protocol(path)
    self.ftp.mkd(path)
    # The cached listing of the parent no longer reflects the server, so
    # drop it, as ``mv`` does for the directories it touches.
    self.invalidate_cache(self._parent(path))
""" out = [] total = [0] def callback(x): total[0] += len(x) if total[0] > end - start: out.append(x[: (end - start) - total[0]]) raise TransferDone else: out.append(x) if total[0] == end - start: raise TransferDone try: self.fs.ftp.retrbinary( "RETR %s" % self.path, blocksize=self.blocksize, rest=start, callback=callback, ) except TransferDone: try: self.fs.ftp.abort() self.fs.ftp.voidresp() except timeout: self.fs._connect() return b"".join(out) def _upload_chunk(self, final=False): self.buffer.seek(0) self.fs.ftp.storbinary( "STOR " + self.path, self.buffer, blocksize=self.blocksize, rest=self.offset ) return True def _mlsd2(ftp, path="."): """ Fall back to using `dir` instead of `mlsd` if not supported. This parses a Linux style `ls -l` response to `dir`, but the response may be platform dependent. Parameters ---------- ftp: ftplib.FTP path: str Expects to be given path, but defaults to ".". """ lines = [] minfo = [] ftp.dir(path, lines.append) for line in lines: line = line.split() this = ( line[-1], { "modify": " ".join(line[5:8]), "unix.owner": line[2], "unix.group": line[3], "unix.mode": line[0], "size": line[4], }, ) if "d" == this[1]["unix.mode"][0]: this[1]["type"] = "dir" else: this[1]["type"] = "file" minfo.append(this) return minfo filesystem_spec-0.6.1/fsspec/implementations/github.py000066400000000000000000000047731356753337100232730ustar00rootroot00000000000000import io import requests from ..spec import AbstractFileSystem class GithubFileSystem(AbstractFileSystem): """[Experimental] interface to files in github An instance of this class provides the files residing within a remote github repository. You may specify a point in the repos history, by SHA, branch or tag (default is current master). Given that code files tend to be small, and that github does not support retrieving partial content, we always fetch whole files. 
""" url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}" rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}" protocol = "github" def __init__(self, org, repo, sha="master", **kwargs): super().__init__(**kwargs) self.org = org self.repo = repo self.root = sha self.ls("") def ls(self, path, detail=False, sha=None, **kwargs): if path == "": sha = self.root if sha is None: parts = path.rstrip("/").split("/") so_far = "" sha = self.root for part in parts: out = self.ls(so_far, True, sha=sha) so_far += "/" + part if so_far else part out = [o for o in out if o["name"] == so_far][0] if out["type"] == "file": if detail: return [out] else: return path sha = out["sha"] if path not in self.dircache: r = requests.get(self.url.format(org=self.org, repo=self.repo, sha=sha)) self.dircache[path] = [ { "name": path + "/" + f["path"] if path else f["path"], "mode": f["mode"], "type": {"blob": "file", "tree": "directory"}[f["type"]], "size": f.get("size", 0), "sha": f["sha"], } for f in r.json()["tree"] ] if detail: return self.dircache[path] else: return sorted([f["name"] for f in self.dircache[path]]) def _open( self, path, mode="rb", block_size=None, autocommit=True, cache_options=None, **kwargs ): if mode != "rb": raise NotImplementedError url = self.rurl.format(org=self.org, repo=self.repo, path=path, sha=self.root) r = requests.get(url) return io.BytesIO(r.content) filesystem_spec-0.6.1/fsspec/implementations/hdfs.py000066400000000000000000000137131356753337100227270ustar00rootroot00000000000000from ..spec import AbstractFileSystem from ..utils import infer_storage_options from pyarrow.hdfs import HadoopFileSystem class PyArrowHDFS(AbstractFileSystem): """Adapted version of Arrow's HadoopFileSystem This is a very simple wrapper over pa.hdfs.HadoopFileSystem, which passes on all calls to the underlying class. 
""" def __init__( self, host="default", port=0, user=None, kerb_ticket=None, driver="libhdfs", extra_conf=None, **kwargs ): """ Parameters ---------- host: str Hostname, IP or "default" to try to read from Hadoop config port: int Port to connect on, or default from Hadoop config if 0 user: str or None If given, connect as this username kerb_ticket: str or None If given, use this ticket for authentication driver: 'libhdfs' or 'libhdfs3' Binary driver; libhdfs if the JNI library and default extra_conf: None or dict Passed on to HadoopFileSystem """ if self._cached: return AbstractFileSystem.__init__(self, **kwargs) self.pars = (host, port, user, kerb_ticket, driver, extra_conf) self.pahdfs = HadoopFileSystem( host=host, port=port, user=user, kerb_ticket=kerb_ticket, driver=driver, extra_conf=extra_conf, ) def _open( self, path, mode="rb", block_size=None, autocommit=True, cache_options=None, **kwargs ): """ Parameters ---------- path: str Location of file; should start with '/' mode: str block_size: int Hadoop block size, e.g., 2**26 autocommit: True Transactions are not yet implemented for HDFS; errors if not True kwargs: dict or None Hadoop config parameters Returns ------- HDFSFile file-like instance """ return HDFSFile( self, path, mode, block_size=block_size, autocommit=autocommit, cache_options=cache_options, **kwargs ) def __reduce_ex__(self, protocol): return PyArrowHDFS, self.pars def ls(self, path, detail=True): out = self.pahdfs.ls(path, detail) if detail: for p in out: p["type"] = p["kind"] p["name"] = self._strip_protocol(p["name"]) else: out = [self._strip_protocol(p) for p in out] return out @staticmethod def _get_kwargs_from_urls(paths): ops = infer_storage_options(paths) out = {} if ops.get("host", None): out["host"] = ops["host"] if ops.get("username", None): out["user"] = ops["username"] if ops.get("port", None): out["port"] = ops["port"] return out @classmethod def _strip_protocol(cls, path): ops = infer_storage_options(path) return ops["path"] 
def __getattribute__(self, item): if item in [ "_open", "__init__", "__getattribute__", "__reduce_ex__", "open", "ls", "makedirs", ]: # all the methods defined in this class. Note `open` here, since # it calls `_open`, but is actually in superclass return lambda *args, **kw: getattr(PyArrowHDFS, item)(self, *args, **kw) if item == "__class__": return PyArrowHDFS d = object.__getattribute__(self, "__dict__") pahdfs = d.get("pahdfs", None) # fs is not immediately defined if pahdfs is not None and item in [ "chmod", "chown", "user", "df", "disk_usage", "download", "driver", "exists", "extra_conf", "get_capacity", "get_space_used", "host", "is_open", "kerb_ticket", "strip_protocol", "mkdir", "mv", "port", "get_capacity", "get_space_used", "df", "chmod", "chown", "disk_usage", "download", "upload", "_get_kwargs_from_urls", "read_parquet", "rm", "stat", "upload", ]: return getattr(pahdfs, item) else: # attributes of the superclass, while target is being set up return super().__getattribute__(item) class HDFSFile(object): """Wrapper around arrow's HdfsFile Allows seek beyond EOF and (eventually) commit/discard """ def __init__( self, fs, path, mode, block_size, autocommit=True, cache_type="readahead", cache_options=None, **kwargs ): # TODO: Inherit from AbstractBufferedFile? if not autocommit: raise NotImplementedError( "HDFSFile cannot be opened with 'autocommit=False'." 
# https://stackoverflow.com/a/15926317/3821154
# ``ex`` finds the href of HTML anchor tags; each match is a
# (quote character, link target) tuple, so the target is element 1.
ex = re.compile(r"""<a\s+(?:[^>]*?\s+)?href=(["'])(.*?)\1""")
# ``ex2`` finds anything that looks like an absolute http(s) URL.
ex2 = re.compile(r"""(http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""")
def cat(self, url):
    """Fetch the whole content at ``url`` as bytes.

    The request goes through ``self.session`` with the stored request
    options (``self.kwargs``), like the other calls of this class.

    Raises
    ------
    Whatever ``raise_for_status`` raises for an unsuccessful response.
    """
    r = self.session.get(url, **self.kwargs)
    r.raise_for_status()
    return r.content
cache_options=None, **kwargs ): """Make a file-like object Parameters ---------- path: str Full URL with protocol mode: string must be "rb" block_size: int or None Bytes to download in one request; use instance value if None. If zero, will return a streaming Requests file-like instance. kwargs: key-value Any other parameters, passed to requests calls """ if mode != "rb": raise NotImplementedError block_size = block_size if block_size is not None else self.block_size kw = self.kwargs.copy() kw.update(kwargs) # this does nothing? if block_size: return HTTPFile( self, path, self.session, block_size, mode=mode, cache_options=cache_options, **kw ) else: kw["stream"] = True r = self.session.get(path, **kw) r.raise_for_status() r.raw.decode_content = True return r.raw def ukey(self, url): """Unique identifier; assume HTTP files are static, unchanging""" return tokenize(url, self.kwargs, self.protocol) def info(self, url, **kwargs): """Get info of URL Tries to access location via HEAD, and then GET methods, but does not fetch the data. It is possible that the server does not supply any size information, in which case size will be given as None (and certain operations on the corresponding file will not work). """ size = False for policy in ["head", "get"]: try: size = file_size(url, self.session, policy, **self.kwargs) if size: break except Exception: pass else: # get failed, so conclude URL does not exist if size is False: raise FileNotFoundError(url) return {"name": url, "size": size or None, "type": "file"} class HTTPFile(AbstractBufferedFile): """ A file-like object pointing to a remove HTTP(S) resource Supports only reading, with read-ahead of a predermined block-size. In the case that the server does not supply the filesize, only reading of the complete file in one go is supported. 
Parameters ---------- url: str Full URL of the remote resource, including the protocol session: requests.Session or None All calls will be made within this session, to avoid restarting connections where the server allows this block_size: int or None The amount of read-ahead to do, in bytes. Default is 5MB, or the value configured for the FileSystem creating this file size: None or int If given, this is the size of the file in bytes, and we don't attempt to call the server to find the value. kwargs: all other key-values are passed to requests calls. """ def __init__( self, fs, url, session=None, block_size=None, mode="rb", cache_type="bytes", cache_options=None, size=None, **kwargs ): if mode != "rb": raise NotImplementedError("File mode not supported") self.url = url self.session = session if session is not None else requests.Session() if size is not None: self.details = {"name": url, "size": size, "type": "file"} super().__init__( fs=fs, path=url, mode=mode, block_size=block_size, cache_type=cache_type, cache_options=cache_options, **kwargs ) self.cache.size = self.size or self.blocksize def read(self, length=-1): """Read bytes from file Parameters ---------- length: int Read up to this many bytes. If negative, read all content to end of file. If the server has not supplied the filesize, attempting to read only part of the data will raise a ValueError. """ if ( (length < 0 and self.loc == 0) or (length > (self.size or length)) # explicit read all or ( # read more than there is self.size and self.size < self.blocksize ) # all fits in one block anyway ): self._fetch_all() if self.size is None: if length < 0: self._fetch_all() else: length = min(self.size - self.loc, length) return super().read(length) def _fetch_all(self): """Read whole file in one shot, without caching This is only called when position is still at zero, and read() is called without a byte-count. 
""" if not isinstance(self.cache, AllBytes): r = self.session.get(self.url, **self.kwargs) r.raise_for_status() out = r.content self.cache = AllBytes(out) self.size = len(out) def _fetch_range(self, start, end): """Download a block of data The expectation is that the server returns only the requested bytes, with HTTP code 206. If this is not the case, we first check the headers, and then stream the output - if the data size is bigger than we requested, an exception is raised. """ kwargs = self.kwargs.copy() headers = kwargs.pop("headers", {}) headers["Range"] = "bytes=%i-%i" % (start, end - 1) r = self.session.get(self.url, headers=headers, stream=True, **kwargs) if r.status_code == 416: # range request outside file return b"" r.raise_for_status() if r.status_code == 206: # partial content, as expected out = r.content elif "Content-Length" in r.headers: cl = int(r.headers["Content-Length"]) if cl <= end - start: # data size OK out = r.content else: raise ValueError( "Got more bytes (%i) than requested (%i)" % (cl, end - start) ) else: cl = 0 out = [] for chunk in r.iter_content(chunk_size=2 ** 20): # data size unknown, let's see if it goes too big if chunk: out.append(chunk) cl += len(chunk) if cl > end - start: raise ValueError( "Got more bytes so far (>%i) than requested (%i)" % (cl, end - start) ) else: break out = b"".join(out) return out def file_size(url, session=None, size_policy="head", **kwargs): """Call HEAD on the server to get file size Default operation is to explicitly allow redirects and use encoding 'identity' (no compression) to get the true size of the target. 
""" kwargs = kwargs.copy() ar = kwargs.pop("allow_redirects", True) head = kwargs.get("headers", {}).copy() head["Accept-Encoding"] = "identity" session = session or requests.Session() if size_policy == "head": r = session.head(url, allow_redirects=ar, **kwargs) elif size_policy == "get": kwargs["stream"] = True r = session.get(url, allow_redirects=ar, **kwargs) else: raise TypeError('size_policy must be "head" or "get", got %s' "" % size_policy) if "Content-Length" in r.headers: return int(r.headers["Content-Length"]) elif "Content-Range" in r.headers: return int(r.headers["Content-Range"].split("/")[1]) class AllBytes(object): """Cache entire contents of a remote URL""" def __init__(self, data): self.data = data def _fetch(self, start, end): return self.data[start:end] filesystem_spec-0.6.1/fsspec/implementations/local.py000066400000000000000000000166361356753337100231040ustar00rootroot00000000000000import io import os import shutil import posixpath import re import tempfile from fsspec import AbstractFileSystem from fsspec.utils import stringify_path class LocalFileSystem(AbstractFileSystem): """Interface to files on local storage Parameters ---------- auto_mkdirs: bool Whether, when opening a file, the directory containing it should be created (if it doesn't already exist). This is assumed by pyarrow code. 
""" root_marker = "/" def __init__(self, auto_mkdir=True, **kwargs): super().__init__(**kwargs) self.auto_mkdir = auto_mkdir def mkdir(self, path, create_parents=True, **kwargs): path = self._strip_protocol(path) if create_parents: self.makedirs(path, exist_ok=True) else: os.mkdir(path, **kwargs) def makedirs(self, path, exist_ok=False): path = self._strip_protocol(path) os.makedirs(path, exist_ok=exist_ok) def rmdir(self, path): os.rmdir(path) def ls(self, path, detail=False): path = self._strip_protocol(path) paths = [posixpath.join(path, f) for f in os.listdir(path)] if detail: return [self.info(f) for f in paths] else: return paths def glob(self, path, **kargs): path = self._strip_protocol(path) return super().glob(path) def info(self, path, **kwargs): path = self._strip_protocol(path) out = os.stat(path, follow_symlinks=False) dest = False if os.path.islink(path): t = "link" dest = os.readlink(path) elif os.path.isdir(path): t = "directory" elif os.path.isfile(path): t = "file" else: t = "other" result = {"name": path, "size": out.st_size, "type": t, "created": out.st_ctime} for field in ["mode", "uid", "gid", "mtime"]: result[field] = getattr(out, "st_" + field) if dest: result["destination"] = dest try: out2 = os.stat(path, follow_symlinks=True) result["size"] = out2.st_size except IOError: result["size"] = 0 return result def copy(self, path1, path2, **kwargs): shutil.copyfile(path1, path2) def get(self, path1, path2, **kwargs): if kwargs.get("recursive"): return super(LocalFileSystem, self).get(path1, path2, **kwargs) else: return self.copy(path1, path2, **kwargs) def put(self, path1, path2, **kwargs): if kwargs.get("recursive"): return super(LocalFileSystem, self).put(path1, path2, **kwargs) else: return self.copy(path1, path2, **kwargs) def mv(self, path1, path2, **kwargs): os.rename(path1, path2) def rm(self, path, recursive=False, maxdepth=None): if recursive and self.isdir(path): shutil.rmtree(path) else: os.remove(path) def _open(self, path, 
mode="rb", block_size=None, **kwargs): path = self._strip_protocol(path) if self.auto_mkdir: self.makedirs(self._parent(path), exist_ok=True) return LocalFileOpener(path, mode, fs=self, **kwargs) def touch(self, path, **kwargs): path = self._strip_protocol(path) if self.exists(path): os.utime(path, None) else: open(path, "a").close() @classmethod def _parent(cls, path): path = cls._strip_protocol(path).rstrip("/") if "/" in path: return path.rsplit("/", 1)[0] else: return cls.root_marker @classmethod def _strip_protocol(cls, path): path = stringify_path(path) if path.startswith("file://"): path = path[7:] return make_path_posix(path) def _isfilestore(self): # Inheriting from DaskFileSystem makes this False (S3, etc. were) # the original motivation. But we are a posix-like file system. # See https://github.com/dask/dask/issues/5526 return True def make_path_posix(path, sep=os.sep): """ Make path generic """ if re.match("/[A-Za-z]:", path): # for windows file URI like "file:///C:/folder/file" # or "file:///C:\\dir\\file" path = path[1:] if path.startswith("\\\\"): # special case for windows UNC/DFS-style paths, do nothing, # jsut flip the slashes around (case below does not work!) 
def __setstate__(self, state):
    """Restore from pickled state, reopening read-mode files.

    Parameters
    ----------
    state: dict
        Instance attributes as produced by ``__getstate__``: no "f" entry,
        and for read modes a "loc" entry holding the position to seek to.
    """
    state = dict(state)  # do not modify the caller's mapping
    loc = state.pop("loc", 0)
    # Attributes must be in place before ``_open`` runs, since it reads
    # them; ``f`` is not part of the pickled state.
    self.__dict__.update(state)
    self.f = None
    if "r" in state["mode"]:
        self._open()
        self.f.seek(loc)
for fsspec implementations, this is a real path return self.path def __getattr__(self, item): return getattr(self.f, item) def __enter__(self): self._incontext = True return self.f.__enter__() def __exit__(self, exc_type, exc_value, traceback): self._incontext = False self.f.__exit__(exc_type, exc_value, traceback) filesystem_spec-0.6.1/fsspec/implementations/memory.py000066400000000000000000000115301356753337100233060ustar00rootroot00000000000000from __future__ import print_function, division, absolute_import from io import BytesIO from fsspec import AbstractFileSystem import logging logger = logging.Logger("fsspec.memoryfs") class MemoryFileSystem(AbstractFileSystem): """A filesystem based on a dict of BytesIO objects""" store = {} # global pseudo_dirs = [] protocol = "memory" root_marker = "" def ls(self, path, detail=False): if path in self.store: # there is a key with this exact name, but could also be directory out = [ { "name": path, "size": self.store[path].getbuffer().nbytes, "type": "file", } ] else: out = [] path = path.strip("/").lstrip("/") paths = set() for p2 in self.store: has_slash = "/" if p2.startswith("/") else "" p = p2.lstrip("/") if "/" in p: root = p.rsplit("/", 1)[0] else: root = "" if root == path: out.append( { "name": has_slash + p, "size": self.store[p2].getbuffer().nbytes, "type": "file", } ) elif path and all( (a == b) for a, b in zip(path.split("/"), p.strip("/").split("/")) ): # implicit directory ppath = "/".join(p.split("/")[: len(path.split("/")) + 1]) if ppath not in paths: out.append( { "name": has_slash + ppath + "/", "size": 0, "type": "directory", } ) paths.add(ppath) elif all( (a == b) for a, b in zip(path.split("/"), [""] + p.strip("/").split("/")) ): # root directory entry ppath = p.rstrip("/").split("/", 1)[0] if ppath not in paths: out.append( { "name": has_slash + ppath + "/", "size": 0, "type": "directory", } ) paths.add(ppath) for p2 in self.pseudo_dirs: if self._parent(p2).strip("/").rstrip("/") == path: 
class MemoryFile(BytesIO):
    """A BytesIO which can't close and works as a context manager

    Can initialise with data

    No need to provide fs, path if auto-committing (default)
    """

    def __init__(self, fs, path, data=None):
        self.fs = fs
        self.path = path
        # always defined, so ``size`` can be read on a new, empty file
        self.size = 0
        if data:
            # ``write`` returns the number of bytes written
            self.size = self.write(data)
            self.seek(0)

    def __enter__(self):
        return self

    def close(self):
        # Never really closes: only records the current length, leaving
        # the position at the end of the buffer.
        self.size = self.seek(0, 2)

    def discard(self):
        pass

    def commit(self):
        # publish this buffer in the owning filesystem's store
        self.fs.store[self.path] = self
import AbstractFileSystem from ..utils import infer_storage_options class SFTPFileSystem(AbstractFileSystem): """Files over SFTP/SSH Peer-to-peer filesystem over SSH using paramiko. """ protocol = "sftp", "ssh" def __init__(self, host, **ssh_kwargs): """ Parameters ---------- host: str Hostname or IP as a string temppath: str Location on the server to put files, when within a transaction ssh_kwargs: dict Parameters passed on to connection. See details in http://docs.paramiko.org/en/2.4/api/client.html#paramiko.client.SSHClient.connect May include port, username, password... """ if self._cached: return super(SFTPFileSystem, self).__init__(**ssh_kwargs) self.temppath = ssh_kwargs.pop("temppath", "/tmp") self.host = host self.ssh_kwargs = ssh_kwargs self._connect() def _connect(self): self.client = paramiko.SSHClient() self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) self.client.connect(self.host, **self.ssh_kwargs) self.ftp = self.client.open_sftp() @classmethod def _strip_protocol(cls, path): return infer_storage_options(path)["path"] @staticmethod def _get_kwargs_from_urls(urlpath): out = infer_storage_options(urlpath) out.pop("path", None) out.pop("protocol", None) return out def mkdir(self, path, mode=511): self.ftp.mkdir(path, mode) def makedirs(self, path, exist_ok=False, mode=511): if self.exists(path) and not exist_ok: raise FileExistsError("File exists: {}".format(path)) parts = path.split("/") path = "" for part in parts: path += "/" + part if not self.exists(path): self.mkdir(path, mode) def rmdir(self, path): self.ftp.rmdir(path) def info(self, path): s = self.ftp.stat(path) if S_ISDIR(s.st_mode): t = "directory" elif S_ISLNK(s.st_mode): t = "link" else: t = "file" return { "name": path + "/" if t == "directory" else path, "size": s.st_size, "type": t, "uid": s.st_uid, "gui": s.st_gid, "time": s.st_atime, "mtime": s.st_mtime, } def ls(self, path, detail=False): out = ["/".join([path.rstrip("/"), p]) for p in self.ftp.listdir(path)] out = 
[self.info(o) for o in out] if detail: return out return sorted([p["name"] for p in out]) def put(self, lpath, rpath): self.ftp.put(lpath, rpath) def get(self, rpath, lpath): self.ftp.get(rpath, lpath) def _open(self, path, mode="rb", block_size=None, **kwargs): """ block_size: int or None If 0, no buffering, if 1, line buffering, if >1, buffer that many bytes, if None use default from paramiko. """ if kwargs.get("autocommit", True) is False: # writes to temporary file, move on commit path2 = "{}/{}".format(self.temppath, uuid.uuid4()) f = self.ftp.open(path2, mode, bufsize=block_size if block_size else -1) f.temppath = path2 f.targetpath = path f.fs = self f.commit = types.MethodType(commit_a_file, f) f.discard = types.MethodType(discard_a_file, f) else: f = self.ftp.open(path, mode, bufsize=block_size if block_size else -1) return f def _rm(self, path): if self.isdir(path): self.ftp.rmdir(path) else: self.ftp.remove(path) def mv(self, old, new): self.ftp.posix_rename(old, new) def commit_a_file(self): self.fs.mv(self.temppath, self.targetpath) def discard_a_file(self): self.fs._rm(self.temppath) filesystem_spec-0.6.1/fsspec/implementations/tests/000077500000000000000000000000001356753337100225665ustar00rootroot00000000000000filesystem_spec-0.6.1/fsspec/implementations/tests/__init__.py000066400000000000000000000000001356753337100246650ustar00rootroot00000000000000filesystem_spec-0.6.1/fsspec/implementations/tests/test_cached.py000066400000000000000000000220311356753337100254040ustar00rootroot00000000000000import os import shutil import pickle import pytest import fsspec from fsspec.implementations.cached import CachingFileSystem from .test_ftp import FTPFileSystem @pytest.fixture def local_filecache(): import tempfile original_location = tempfile.mkdtemp() cache_location = tempfile.mkdtemp() original_file = os.path.join(original_location, "afile") data = b"test data" with open(original_file, "wb") as f: f.write(data) # we can access the file and read it fs = 
fsspec.filesystem( "filecache", target_protocol="file", cache_storage=cache_location ) return (data, original_file, cache_location, fs) def test_idempotent(): fs = CachingFileSystem("file") fs2 = CachingFileSystem("file") assert fs2 is fs fs3 = pickle.loads(pickle.dumps(fs)) assert fs3.storage == fs.storage def test_workflow(ftp_writable): host, port, user, pw = ftp_writable fs = FTPFileSystem(host, port, user, pw) with fs.open("/out", "wb") as f: f.write(b"test") fs = fsspec.filesystem( "cached", target_protocol="ftp", target_options={"host": host, "port": port, "username": user, "password": pw}, ) assert os.listdir(fs.storage[-1]) == [] with fs.open("/out") as f: assert os.listdir(fs.storage[-1]) assert f.read() == b"test" assert fs.cached_files[-1]["ftp:///out"]["blocks"] assert fs.cat("/out") == b"test" assert fs.cached_files[-1]["ftp:///out"]["blocks"] is True with fs.open("/out", "wb") as f: f.write(b"changed") assert fs.cat("/out") == b"test" # old value def test_blocksize(ftp_writable): host, port, user, pw = ftp_writable fs = FTPFileSystem(host, port, user, pw) with fs.open("/out_block", "wb") as f: f.write(b"test" * 4000) fs = fsspec.filesystem( "blockcache", target_protocol="ftp", target_options={"host": host, "port": port, "username": user, "password": pw}, ) with fs.open("/out_block", block_size=20) as f: assert f.read(1) == b"t" with pytest.raises(ValueError): fs.open("/out_block", block_size=30) def test_local_filecache_creates_dir_if_needed(): import tempfile original_location = tempfile.mkdtemp() cache_location = "foofoobarbar" assert not os.path.exists(cache_location) try: original_file = os.path.join(original_location, "afile") data = b"test data" with open(original_file, "wb") as f: f.write(data) # we can access the file and read it fs = fsspec.filesystem( "filecache", target_protocol="file", cache_storage=cache_location ) with fs.open(original_file, "rb") as f: data_in_cache = f.read() assert os.path.exists(cache_location) finally: 
shutil.rmtree(cache_location) assert data_in_cache == data def test_local_filecache_basic(local_filecache): data, original_file, cache_location, fs = local_filecache # reading from the file contains the right data with fs.open(original_file, "rb") as f: assert f.read() == data assert "cache" in os.listdir(cache_location) # the file in the location contains the right data fn = list(fs.cached_files[-1].values())[0]["fn"] # this is a hash value assert fn in os.listdir(cache_location) with open(os.path.join(cache_location, fn), "rb") as f: assert f.read() == data # still there when original file is removed (check=False) os.remove(original_file) with fs.open(original_file, "rb") as f: assert f.read() == data def test_local_filecache_does_not_change_when_original_data_changed(local_filecache): old_data, original_file, cache_location, fs = local_filecache new_data = b"abc" with fs.open(original_file, "rb") as f: assert f.read() == old_data with open(original_file, "wb") as f: f.write(new_data) with fs.open(original_file, "rb") as f: assert f.read() == old_data def test_local_filecache_gets_from_original_if_cache_deleted(local_filecache): old_data, original_file, cache_location, fs = local_filecache new_data = b"abc" with fs.open(original_file, "rb") as f: assert f.read() == old_data with open(original_file, "wb") as f: f.write(new_data) shutil.rmtree(cache_location) assert os.path.exists(original_file) with open(original_file, "rb") as f: assert f.read() == new_data with fs.open(original_file, "rb") as f: assert f.read() == new_data # the file in the location contains the right data fn = list(fs.cached_files[-1].values())[0]["fn"] # this is a hash value assert fn in os.listdir(cache_location) with open(os.path.join(cache_location, fn), "rb") as f: assert f.read() == new_data def test_local_filecache_with_new_cache_location_makes_a_new_copy(local_filecache): import tempfile data, original_file, old_cache_location, old_fs = local_filecache new_cache_location = 
tempfile.mkdtemp() with old_fs.open(original_file, "rb") as f: assert f.read() == data new_fs = fsspec.filesystem( "filecache", target_protocol="file", cache_storage=new_cache_location ) with new_fs.open(original_file, "rb") as f: assert f.read() == data # the file in the location contains the right data fn = list(new_fs.cached_files[-1].values())[0]["fn"] # this is a hash value assert fn in os.listdir(old_cache_location) assert fn in os.listdir(new_cache_location) with open(os.path.join(new_cache_location, fn), "rb") as f: assert f.read() == data def test_filecache_multicache(): import tempfile origin = tempfile.mkdtemp() cache1 = tempfile.mkdtemp() cache2 = tempfile.mkdtemp() data = b"test data" f1 = os.path.join(origin, "afile") f2 = os.path.join(origin, "bfile") with open(f1, "wb") as f: f.write(data) with open(f2, "wb") as f: f.write(data * 2) # populates first cache fs = fsspec.filesystem("filecache", target_protocol="file", cache_storage=cache1) assert fs.cat(f1) == data assert len(os.listdir(cache1)) == 2 # cache and hashed afile assert len(os.listdir(cache2)) == 0 # hasn't been intialized yet # populates last cache if file not found in first cache fs = fsspec.filesystem( "filecache", target_protocol="file", cache_storage=[cache1, cache2] ) assert fs.cat(f1) == data assert fs.cat(f2) == data * 2 assert "cache" in os.listdir(cache1) assert "cache" in os.listdir(cache2) cache1_contents = [f for f in os.listdir(cache1) if f != "cache"] assert len(cache1_contents) == 1 with open(os.path.join(cache1, cache1_contents[0]), "rb") as f: assert f.read() == data cache2_contents = [f for f in os.listdir(cache2) if f != "cache"] assert len(cache2_contents) == 1 with open(os.path.join(cache2, cache2_contents[0]), "rb") as f: assert f.read() == data * 2 def test_filecache_multicache_with_same_file_different_data_reads_from_first(): import tempfile origin = tempfile.mkdtemp() cache1 = tempfile.mkdtemp() cache2 = tempfile.mkdtemp() data = b"test data" f1 = 
os.path.join(origin, "afile") with open(f1, "wb") as f: f.write(data) # populate first cache fs = fsspec.filesystem("filecache", target_protocol="file", cache_storage=cache1) assert fs.cat(f1) == data with open(f1, "wb") as f: f.write(data * 2) # populate second cache fs = fsspec.filesystem("filecache", target_protocol="file", cache_storage=cache2) assert fs.cat(f1) == data * 2 # the filenames in each cache are the same, but the data is different assert os.listdir(cache1) == os.listdir(cache2) fs = fsspec.filesystem( "filecache", target_protocol="file", cache_storage=[cache1, cache2] ) assert fs.cat(f1) == data def test_filecache_with_checks(): import tempfile import time origin = tempfile.mkdtemp() cache1 = tempfile.mkdtemp() data = b"test data" f1 = os.path.join(origin, "afile") with open(f1, "wb") as f: f.write(data) # populate first cache fs = fsspec.filesystem( "filecache", target_protocol="file", cache_storage=cache1, expiry_time=0.1 ) fs2 = fsspec.filesystem( "filecache", target_protocol="file", cache_storage=cache1, check_files=True ) assert fs.cat(f1) == data assert fs2.cat(f1) == data with open(f1, "wb") as f: f.write(data * 2) assert fs.cat(f1) == data # does not change assert fs2.cat(f1) == data * 2 # changed, since origin changed time.sleep(0.11) # allow cache details to expire assert fs.cat(f1) == data * 2 # changed, since origin changed def test_takes_fs_instance(): import tempfile origin = tempfile.mkdtemp() data = b"test data" f1 = os.path.join(origin, "afile") with open(f1, "wb") as f: f.write(data) fs = fsspec.filesystem("file") fs2 = fsspec.filesystem("filecache", target_protocol=fs) assert fs2.cat(f1) == data filesystem_spec-0.6.1/fsspec/implementations/tests/test_dask.py000066400000000000000000000010521356753337100251170ustar00rootroot00000000000000import pytest import fsspec pytest.importorskip("distributed") @pytest.fixture() def cli(tmpdir): import dask.distributed client = dask.distributed.Client(n_workers=1) def setup(): m = 
fsspec.filesystem("memory") with m.open("afile", "wb") as f: f.write(b"data") client.run(setup) try: yield client finally: client.close() def test_basic(cli): fs = fsspec.filesystem("dask", remote_protocol="memory") assert fs.ls("") == ["afile"] assert fs.cat("afile") == b"data" filesystem_spec-0.6.1/fsspec/implementations/tests/test_ftp.py000066400000000000000000000057641356753337100250040ustar00rootroot00000000000000import os import pytest import subprocess import sys import time from fsspec.implementations.ftp import FTPFileSystem from fsspec import open_files import fsspec here = os.path.dirname(os.path.abspath(__file__)) @pytest.fixture() def ftp(): pytest.importorskip("pyftpdlib") P = subprocess.Popen( [sys.executable, "-m", "pyftpdlib", "-d", here], stderr=subprocess.STDOUT, stdout=subprocess.PIPE, ) try: time.sleep(1) yield "localhost", 2121 finally: P.terminate() P.wait() def test_basic(ftp): host, port = ftp fs = FTPFileSystem(host, port) assert fs.ls("/", detail=False) == sorted(os.listdir(here)) out = fs.cat("/" + os.path.basename(__file__)) assert out == open(__file__, "rb").read() def test_not_cached(ftp): host, port = ftp fs = FTPFileSystem(host, port) fs2 = FTPFileSystem(host, port) assert fs is not fs2 @pytest.mark.parametrize("cache_type", ["bytes", "mmap"]) def test_complex(ftp_writable, cache_type): from fsspec.core import BytesCache host, port, user, pw = ftp_writable files = open_files( "ftp:///ou*", host=host, port=port, username=user, password=pw, block_size=10000, cache_type=cache_type, ) assert len(files) == 1 with files[0] as fo: assert fo.read(10) == b"hellohello" if isinstance(fo.cache, BytesCache): assert len(fo.cache.cache) == 10010 assert fo.read(2) == b"he" assert fo.tell() == 12 def test_write_small(ftp_writable): host, port, user, pw = ftp_writable fs = FTPFileSystem(host, port, user, pw) with fs.open("/out2", "wb") as f: f.write(b"oi") assert fs.cat("/out2") == b"oi" def test_with_url(ftp_writable): host, port, user, pw = 
ftp_writable fo = fsspec.open("ftp://{}:{}@{}:{}/out".format(user, pw, host, port), "wb") with fo as f: f.write(b"hello") fo = fsspec.open("ftp://{}:{}@{}:{}/out".format(user, pw, host, port), "rb") with fo as f: assert f.read() == b"hello" @pytest.mark.parametrize("cache_type", ["bytes", "mmap"]) def test_write_big(ftp_writable, cache_type): host, port, user, pw = ftp_writable fs = FTPFileSystem(host, port, user, pw, block_size=1000, cache_type=cache_type) fn = "/bigger" with fs.open(fn, "wb") as f: f.write(b"o" * 500) assert not fs.exists(fn) f.write(b"o" * 1000) fs.invalidate_cache() assert fs.exists(fn) f.write(b"o" * 200) f.flush() assert fs.info(fn)["size"] == 1700 assert fs.cat(fn) == b"o" * 1700 def test_transaction(ftp_writable): host, port, user, pw = ftp_writable fs = FTPFileSystem(host, port, user, pw) fs.mkdir("/tmp") fn = "/tr" with fs.transaction: with fs.open(fn, "wb") as f: f.write(b"not") assert not fs.exists(fn) assert fs.exists(fn) assert fs.cat(fn) == b"not" fs.rm(fn) assert not fs.exists(fn) filesystem_spec-0.6.1/fsspec/implementations/tests/test_http.py000066400000000000000000000110111356753337100251500ustar00rootroot00000000000000import pytest from http.server import BaseHTTPRequestHandler, HTTPServer import threading import fsspec requests = pytest.importorskip("requests") port = 9898 data = b"\n".join([b"some test data"] * 1000) realfile = "http://localhost:%i/index/realfile" % port index = b'Link' % realfile.encode() class HTTPTestHandler(BaseHTTPRequestHandler): def _respond(self, code=200, headers=None, data=b""): headers = headers or {} headers.update({"User-Agent": "test"}) self.send_response(code) for k, v in headers.items(): self.send_header(k, str(v)) self.end_headers() if data: self.wfile.write(data) def do_GET(self): if self.path not in ["/index/realfile", "/index"]: self._respond(404) return d = data if self.path == "/index/realfile" else index if "Range" in self.headers: ran = self.headers["Range"] b, ran = ran.split("=") 
start, end = ran.split("-") print(start) print(end) d = d[int(start) : int(end) + 1] if "give_length" in self.headers: response_headers = {"Content-Length": len(d)} self._respond(200, response_headers, d) elif "give_range" in self.headers: self._respond(200, {"Content-Range": "0-%i/%i" % (len(d) - 1, len(d))}, d) else: self._respond(200, data=d) def do_HEAD(self): if "head_ok" not in self.headers: self._respond(405) return d = data if self.path == "/index/realfile" else index if self.path not in ["/index/realfile", "/index"]: self._respond(404) elif "give_length" in self.headers: response_headers = {"Content-Length": len(d)} if "zero_length" in self.headers: response_headers["Content-Length"] = 0 self._respond(200, response_headers) elif "give_range" in self.headers: self._respond(200, {"Content-Range": "0-%i/%i" % (len(d) - 1, len(d))}) else: self._respond(200) # OK response, but no useful info @pytest.fixture(scope="module") def server(): server_address = ("", port) httpd = HTTPServer(server_address, HTTPTestHandler) th = threading.Thread(target=httpd.serve_forever) th.daemon = True th.start() try: yield "http://localhost:%i" % port finally: httpd.socket.close() httpd.shutdown() th.join() def test_list(server): h = fsspec.filesystem("http") out = h.glob(server + "/index/*") assert out == [server + "/index/realfile"] def test_policy_arg(server): h = fsspec.filesystem("http", size_policy="get") out = h.glob(server + "/index/*") assert out == [server + "/index/realfile"] def test_exists(server): h = fsspec.filesystem("http") assert not h.exists(server + "/notafile") def test_read(server): h = fsspec.filesystem("http") out = server + "/index/realfile" with h.open(out, "rb") as f: assert f.read() == data with h.open(out, "rb", block_size=0) as f: assert f.read() == data with h.open(out, "rb") as f: assert f.read(100) + f.read() == data def test_methods(server): h = fsspec.filesystem("http") url = server + "/index/realfile" assert h.exists(url) assert h.cat(url) == 
data @pytest.mark.parametrize( "headers", [ {}, {"give_length": "true"}, {"give_length": "true", "head_ok": "true"}, {"give_range": "true"}, ], ) def test_random_access(server, headers): h = fsspec.filesystem("http", headers=headers) url = server + "/index/realfile" with h.open(url, "rb") as f: if headers: assert f.size == len(data) assert f.read(5) == data[:5] # python server does not respect bytes range request # we actually get all the data f.seek(5, 1) assert f.read(5) == data[10:15] def test_mapper_url(server): h = fsspec.filesystem("http") mapper = h.get_mapper(server + "/index/") assert mapper.root.startswith("http:") assert list(mapper) mapper2 = fsspec.get_mapper(server + "/index/") assert mapper2.root.startswith("http:") assert list(mapper) == list(mapper2) def test_content_length_zero(server): h = fsspec.filesystem( "http", headers={"give_length": "true", "zero_length": "true"} ) url = server + "/index/realfile" with h.open(url, "rb") as f: assert f.read() == data filesystem_spec-0.6.1/fsspec/implementations/tests/test_local.py000066400000000000000000000313631356753337100252770ustar00rootroot00000000000000from __future__ import print_function, division, absolute_import import gzip import os import os.path import sys from contextlib import contextmanager import tempfile import pytest import fsspec from fsspec.core import open_files, get_fs_token_paths, OpenFile from fsspec.implementations.local import LocalFileSystem, make_path_posix from fsspec import compression files = { ".test.accounts.1.json": ( b'{"amount": 100, "name": "Alice"}\n' b'{"amount": 200, "name": "Bob"}\n' b'{"amount": 300, "name": "Charlie"}\n' b'{"amount": 400, "name": "Dennis"}\n' ), ".test.accounts.2.json": ( b'{"amount": 500, "name": "Alice"}\n' b'{"amount": 600, "name": "Bob"}\n' b'{"amount": 700, "name": "Charlie"}\n' b'{"amount": 800, "name": "Dennis"}\n' ), } csv_files = { ".test.fakedata.1.csv": (b"a,b\n" b"1,2\n"), ".test.fakedata.2.csv": (b"a,b\n" b"3,4\n"), } @contextmanager 
def filetexts(d, open=open, mode="t"): """ Dumps a number of textfiles to disk d - dict a mapping from filename to text like {'a.csv': '1,1\n2,2'} Since this is meant for use in tests, this context manager will automatically switch to a temporary current directory, to avoid race conditions when running tests in parallel. """ odir = os.getcwd() dirname = tempfile.mkdtemp() try: os.chdir(dirname) for filename, text in d.items(): f = open(filename, "w" + mode) try: f.write(text) finally: try: f.close() except AttributeError: pass yield list(d) for filename in d: if os.path.exists(filename): try: os.remove(filename) except (IOError, OSError): pass finally: os.chdir(odir) def test_urlpath_inference_strips_protocol(tmpdir): tmpdir = str(tmpdir) paths = [os.path.join(tmpdir, "test.%02d.csv" % i) for i in range(20)] for path in paths: with open(path, "wb") as f: f.write(b"1,2,3\n" * 10) # globstring protocol = "file:///" if sys.platform == "win32" else "file://" urlpath = protocol + os.path.join(tmpdir, "test.*.csv") _, _, paths2 = get_fs_token_paths(urlpath) assert paths2 == paths # list of paths _, _, paths2 = get_fs_token_paths([protocol + p for p in paths]) assert paths2 == paths def test_urlpath_inference_errors(): # Empty list with pytest.raises(ValueError) as err: get_fs_token_paths([]) assert "empty" in str(err.value) # Protocols differ with pytest.raises(ValueError) as err: get_fs_token_paths(["s3://test/path.csv", "/other/path.csv"]) assert "same protocol" in str(err.value) # Unknown type with pytest.raises(TypeError): get_fs_token_paths( {"sets/are.csv", "unordered/so/they.csv", "should/not/be.csvallowed.csv"} ) def test_urlpath_expand_read(): """Make sure * is expanded in file paths when reading.""" # when reading, globs should be expanded to read files by mask with filetexts(csv_files, mode="b"): _, _, paths = get_fs_token_paths("./.*.csv") assert len(paths) == 2 _, _, paths = get_fs_token_paths(["./.*.csv"]) assert len(paths) == 2 def 
test_urlpath_expand_write(): """Make sure * is expanded in file paths when writing.""" _, _, paths = get_fs_token_paths("prefix-*.csv", mode="wb", num=2) assert all( [p.endswith(pa) for p, pa in zip(paths, ["/prefix-0.csv", "/prefix-1.csv"])] ) _, _, paths = get_fs_token_paths(["prefix-*.csv"], mode="wb", num=2) assert all( [p.endswith(pa) for p, pa in zip(paths, ["/prefix-0.csv", "/prefix-1.csv"])] ) # we can read with multiple masks, but not write with pytest.raises(ValueError): _, _, paths = get_fs_token_paths( ["prefix1-*.csv", "prefix2-*.csv"], mode="wb", num=2 ) def test_open_files(): with filetexts(files, mode="b"): myfiles = open_files("./.test.accounts.*") assert len(myfiles) == len(files) for lazy_file, data_file in zip(myfiles, sorted(files)): with lazy_file as f: x = f.read() assert x == files[data_file] @pytest.mark.parametrize("encoding", ["utf-8", "ascii"]) def test_open_files_text_mode(encoding): with filetexts(files, mode="b"): myfiles = open_files("./.test.accounts.*", mode="rt", encoding=encoding) assert len(myfiles) == len(files) data = [] for file in myfiles: with file as f: data.append(f.read()) assert list(data) == [files[k].decode(encoding) for k in sorted(files)] @pytest.mark.parametrize("mode", ["rt", "rb"]) @pytest.mark.parametrize("fmt", list(compression.compr)) def test_compressions(fmt, mode, tmpdir): if fmt == "zip" and sys.version_info < (3, 6): pytest.xfail("zip compression requires python3.6 or higher") tmpdir = str(tmpdir) fn = os.path.join(tmpdir, ".tmp.getsize") fs = LocalFileSystem() f = OpenFile(fs, fn, compression=fmt, mode="wb") data = b"Long line of readily compressible text" with f as fo: fo.write(data) if fmt is None: assert fs.size(fn) == len(data) else: assert fs.size(fn) != len(data) f = OpenFile(fs, fn, compression=fmt, mode=mode) with f as fo: if mode == "rb": assert fo.read() == data else: assert fo.read() == data.decode() def test_bad_compression(): with filetexts(files, mode="b"): for func in [open_files]: with 
pytest.raises(ValueError): func("./.test.accounts.*", compression="not-found") def test_not_found(): fn = "not-a-file" fs = LocalFileSystem() with pytest.raises((FileNotFoundError, OSError)): with OpenFile(fs, fn, mode="rb"): pass def test_isfile(): fs = LocalFileSystem() with filetexts(files, mode="b"): for f in files.keys(): assert fs.isfile(f) assert not fs.isfile("not-a-file") def test_isdir(): fs = LocalFileSystem() with filetexts(files, mode="b"): for f in files.keys(): assert fs.isdir(os.path.dirname(os.path.abspath(f))) assert not fs.isdir(f) assert not fs.isdir("not-a-dir") @pytest.mark.parametrize("compression_opener", [(None, open), ("gzip", gzip.open)]) def test_open_files_write(tmpdir, compression_opener): tmpdir = str(tmpdir) compression, opener = compression_opener fn = str(tmpdir) + "/*.part" files = open_files(fn, num=2, mode="wb", compression=compression) assert len(files) == 2 assert {f.mode for f in files} == {"wb"} for fil in files: with fil as f: f.write(b"000") files = sorted(os.listdir(tmpdir)) assert files == ["0.part", "1.part"] with opener(os.path.join(tmpdir, files[0]), "rb") as f: d = f.read() assert d == b"000" def test_pickability_of_lazy_files(tmpdir): tmpdir = str(tmpdir) cloudpickle = pytest.importorskip("cloudpickle") with filetexts(files, mode="b"): myfiles = open_files("./.test.accounts.*") myfiles2 = cloudpickle.loads(cloudpickle.dumps(myfiles)) for f, f2 in zip(myfiles, myfiles2): assert f.path == f2.path assert isinstance(f.fs, type(f2.fs)) with f as f_open, f2 as f2_open: assert f_open.read() == f2_open.read() def test_abs_paths(tmpdir): tmpdir = str(tmpdir) here = os.getcwd() os.chdir(tmpdir) with open("tmp", "w") as f: f.write("hi") out = LocalFileSystem().glob("./*") assert len(out) == 1 assert os.sep in out[0] assert "tmp" in out[0] # I don't know what this was testing - but should avoid local paths anyway # fs = LocalFileSystem() os.chdir(here) # with fs.open('tmp', 'r') as f: # res = f.read() # assert res == 'hi' 
@pytest.mark.parametrize("sep", ["/", "\\"]) @pytest.mark.parametrize("chars", ["+", "++", "(", ")", "|", "\\"]) def test_glob_weird_characters(tmpdir, sep, chars): tmpdir = str(tmpdir) subdir = tmpdir + sep + "test" + chars + "x" os.mkdir(subdir) with open(subdir + sep + "tmp", "w") as f: f.write("hi") out = LocalFileSystem().glob(subdir + sep + "*") assert len(out) == 1 assert os.sep in out[0] assert "tmp" in out[0] def test_globfind_dirs(tmpdir): tmpdir = str(tmpdir) fs = fsspec.filesystem("file") fs.mkdir(tmpdir + "/dir") fs.touch(tmpdir + "/dir/afile") assert [tmpdir + "/dir"] == fs.glob(tmpdir + "/*") assert [tmpdir + "/dir/afile"] == fs.find(tmpdir) assert [tmpdir + "/dir", tmpdir + "/dir/afile"] == fs.find(tmpdir, withdirs=True) def test_get_pyarrow_filesystem(): pa = pytest.importorskip("pyarrow") fs = LocalFileSystem() assert isinstance(fs, pa.filesystem.FileSystem) assert fs._get_pyarrow_filesystem() is fs class UnknownFileSystem(object): pass assert not isinstance(UnknownFileSystem(), pa.filesystem.FileSystem) def test_directories(tmpdir): tmpdir = str(tmpdir) fs = LocalFileSystem() fs.mkdir(tmpdir + "/dir") assert tmpdir + "/dir" in fs.ls(tmpdir) assert fs.ls(tmpdir, True)[0]["type"] == "directory" fs.rmdir(tmpdir + "/dir") assert not fs.ls(tmpdir) def test_file_ops(tmpdir): tmpdir = str(tmpdir) fs = LocalFileSystem() with pytest.raises(FileNotFoundError): fs.info(tmpdir + "/nofile") fs.touch(tmpdir + "/afile") i1 = fs.ukey(tmpdir + "/afile") assert tmpdir + "/afile" in fs.ls(tmpdir) with fs.open(tmpdir + "/afile", "wb") as f: f.write(b"data") i2 = fs.ukey(tmpdir + "/afile") assert i1 != i2 # because file changed fs.copy(tmpdir + "/afile", tmpdir + "/afile2") assert tmpdir + "/afile2" in fs.ls(tmpdir) fs.move(tmpdir + "/afile", tmpdir + "/afile3") assert not fs.exists(tmpdir + "/afile") fs.rm(tmpdir + "/afile3", recursive=True) assert not fs.exists(tmpdir + "/afile3") fs.rm(tmpdir, recursive=True) assert not fs.exists(tmpdir) def 
test_recursive_get_put(tmpdir): tmpdir = str(tmpdir) fs = LocalFileSystem() fs.mkdir(tmpdir + "/a1/a2/a3") fs.touch(tmpdir + "/a1/a2/a3/afile") fs.touch(tmpdir + "/a1/afile") fs.get("file://{0}/a1".format(tmpdir), tmpdir + "/b1", recursive=True) assert fs.isfile(tmpdir + "/b1/afile") assert fs.isfile(tmpdir + "/b1/a2/a3/afile") fs.put(tmpdir + "/b1", "file://{0}/c1".format(tmpdir), recursive=True) assert fs.isfile(tmpdir + "/c1/afile") assert fs.isfile(tmpdir + "/c1/a2/a3/afile") def test_commit_discard(tmpdir): tmpdir = str(tmpdir) fs = LocalFileSystem() with fs.transaction: with fs.open(tmpdir + "/afile", "wb") as f: assert not fs.exists(tmpdir + "/afile") f.write(b"data") assert not fs.exists(tmpdir + "/afile") assert fs._transaction is None assert fs.cat(tmpdir + "/afile") == b"data" try: with fs.transaction: with fs.open(tmpdir + "/bfile", "wb") as f: f.write(b"data") raise KeyboardInterrupt except KeyboardInterrupt: assert not fs.exists(tmpdir + "/bfile") def test_make_path_posix(): cwd = os.getcwd() assert make_path_posix("/a/posix/path") == "/a/posix/path" assert make_path_posix("/posix") == "/posix" assert make_path_posix("relpath", sep="/") == os.path.join(cwd, "relpath") assert make_path_posix("rel/path", sep="/") == os.path.join(cwd, "rel/path") assert make_path_posix("C:\\path", sep="\\") == "C:/path" assert ( make_path_posix( "\\\\windows-server\\someshare\\path\\more\\path\\dir\\foo.parquet" ) == "//windows-server/someshare/path/more/path/dir/foo.parquet" ) assert "/" in make_path_posix("rel\\path", sep="\\") def test_links(tmpdir): tmpdir = str(tmpdir) fn0 = os.path.join(tmpdir, "target") fn1 = os.path.join(tmpdir, "link1") fn2 = os.path.join(tmpdir, "link2") data = b"my target data" with open(fn0, "wb") as f: f.write(data) os.symlink(fn0, fn1) os.symlink(fn0, fn2) fs = LocalFileSystem() assert fs.info(fn0)["type"] == "file" assert fs.info(fn1)["type"] == "link" assert fs.info(fn2)["type"] == "link" assert fs.info(fn0)["size"] == len(data) assert 
fs.info(fn1)["size"] == len(data) assert fs.info(fn2)["size"] == len(data) of = fsspec.open(fn1, "rb") with of as f: assert f.read() == data of = fsspec.open(fn2, "rb") with of as f: assert f.read() == data def test_isfilestore(): fs = LocalFileSystem(auto_mkdir=False) assert fs._isfilestore() filesystem_spec-0.6.1/fsspec/implementations/tests/test_memory.py000066400000000000000000000014041356753337100255060ustar00rootroot00000000000000import pytest import sys def test_1(m): m.touch("/somefile") # NB: is found with or without initial / m.touch("afiles/and/anothers") assert m.find("") == ["afiles/and/anothers", "somefile"] assert list(m.get_mapper("")) == ["afiles/and/anothers", "somefile"] @pytest.mark.xfail( sys.version_info < (3, 6), reason="py35 error, see https://github.com/intake/filesystem_spec/issues/148", ) def test_ls(m): m.touch("/dir/afile") m.touch("/dir/dir1/bfile") m.touch("/dir/dir1/cfile") assert m.ls("/", False) == ["/dir/"] assert m.ls("/dir", False) == ["/dir/afile", "/dir/dir1/"] assert m.ls("/dir", True)[0]["type"] == "file" assert m.ls("/dir", True)[1]["type"] == "directory" assert len(m.ls("/dir/dir1")) == 2 filesystem_spec-0.6.1/fsspec/implementations/tests/test_sftp.py000066400000000000000000000066151356753337100251630ustar00rootroot00000000000000import pytest import shlex import subprocess import time import fsspec pytest.importorskip("paramiko") def stop_docker(name): cmd = shlex.split('docker ps -a -q --filter "name=%s"' % name) cid = subprocess.check_output(cmd).strip().decode() if cid: subprocess.call(["docker", "rm", "-f", cid]) @pytest.fixture(scope="module") def ssh(): try: subprocess.check_call(["docker", "run", "hello-world"]) except subprocess.CalledProcessError: pytest.skip("docker run not available") return # requires docker cmds = [ r"apt-get update", r"apt-get install -y openssh-server", r"mkdir /var/run/sshd", "bash -c \"echo 'root:pass' | chpasswd\"", ( r"sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' " 
r"/etc/ssh/sshd_config" ), ( r"sed 's@session\s*required\s*pam_loginuid.so@session optional " r"pam_loginuid.so@g' -i /etc/pam.d/sshd" ), r'bash -c "echo \"export VISIBLE=now\" >> /etc/profile"', r"/usr/sbin/sshd", ] name = "fsspec_sftp" stop_docker(name) cmd = "docker run -d -p 9200:22 --name {} ubuntu:16.04 sleep 9000".format(name) cid = subprocess.check_output(shlex.split(cmd)).strip().decode() for cmd in cmds: subprocess.call(["docker", "exec", cid] + shlex.split(cmd)) try: time.sleep(1) yield dict(host="localhost", port=9200, username="root", password="pass") finally: stop_docker(name) def test_simple(ssh): f = fsspec.get_filesystem_class("sftp")(**ssh) f.mkdirs("/home/someuser/deeper") f.touch("/home/someuser/deeper/afile") assert f.find("/home/someuser") == ["/home/someuser/deeper/afile"] assert f.ls("/home/someuser/deeper/") == ["/home/someuser/deeper/afile"] assert f.info("/home/someuser/deeper/afile")["type"] == "file" assert f.info("/home/someuser/deeper/afile")["size"] == 0 assert f.exists("/home/someuser") f.rm("/home/someuser", recursive=True) assert not f.exists("/home/someuser") @pytest.mark.parametrize("protocol", ["sftp", "ssh"]) def test_with_url(protocol, ssh): fo = fsspec.open( protocol + "://{username}:{password}@{host}:{port}" "/home/someuserout".format(**ssh), "wb", ) with fo as f: f.write(b"hello") fo = fsspec.open( protocol + "://{username}:{password}@{host}:{port}" "/home/someuserout".format(**ssh), "rb", ) with fo as f: assert f.read() == b"hello" def test_transaction(ssh): f = fsspec.get_filesystem_class("sftp")(**ssh) f.mkdirs("/home/someuser/deeper") f.start_transaction() f.touch("/home/someuser/deeper/afile") assert f.find("/home/someuser") == [] f.end_transaction() f.find("/home/someuser") == ["/home/someuser/deeper/afile"] with f.transaction: assert f._intrans f.touch("/home/someuser/deeper/afile2") assert f.find("/home/someuser") == ["/home/someuser/deeper/afile"] assert f.find("/home/someuser") == [ "/home/someuser/deeper/afile", 
"/home/someuser/deeper/afile2", ] def test_makedirs_exist_ok(ssh): f = fsspec.get_filesystem_class("sftp")(**ssh) f.makedirs("/a/b/c") with pytest.raises(FileExistsError, match="/a/b/c"): f.makedirs("/a/b/c", exist_ok=False) f.makedirs("/a/b/c", exist_ok=True) filesystem_spec-0.6.1/fsspec/implementations/tests/test_webhdfs.py000066400000000000000000000064351356753337100256310ustar00rootroot00000000000000import pickle import pytest import subprocess import time import fsspec requests = pytest.importorskip("requests") from fsspec.implementations.webhdfs import WebHDFS # noqa: E402 @pytest.fixture(scope="module") def hdfs_cluster(): cmd0 = "htcluster shutdown".split() try: subprocess.check_output(cmd0, stderr=subprocess.STDOUT) except FileNotFoundError: pytest.skip("htcluster not found") except subprocess.CalledProcessError as ex: pytest.skip("htcluster failed: " + ex.output.decode()) cmd1 = "htcluster startup --image base".split() subprocess.check_output(cmd1) try: while True: t = 90 try: requests.get("http://localhost:50070/webhdfs/v1/?op=LISTSTATUS") except: # noqa: E722 t -= 1 assert t > 0, "Timeout waiting for HDFS" time.sleep(1) continue break time.sleep(7) yield "localhost" finally: subprocess.check_output(cmd0) def test_pickle(hdfs_cluster): w = WebHDFS(hdfs_cluster, user="testuser") w2 = pickle.loads(pickle.dumps(w)) assert w == w2 def test_simple(hdfs_cluster): w = WebHDFS(hdfs_cluster, user="testuser") home = w.home_directory() assert home == "/user/testuser" with pytest.raises(PermissionError): w.mkdir("/root") def test_url(hdfs_cluster): url = "webhdfs://testuser@localhost:50070/user/testuser/myfile" fo = fsspec.open(url, "wb", data_proxy={"worker.example.com": "localhost"}) with fo as f: f.write(b"hello") fo = fsspec.open(url, "rb", data_proxy={"worker.example.com": "localhost"}) with fo as f: assert f.read() == b"hello" def test_workflow(hdfs_cluster): w = WebHDFS( hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"} ) fn = 
"/user/testuser/testrun/afile" w.mkdir("/user/testuser/testrun") with w.open(fn, "wb") as f: f.write(b"hello") assert w.exists(fn) info = w.info(fn) assert info["size"] == 5 assert w.isfile(fn) assert w.cat(fn) == b"hello" w.rm("/user/testuser/testrun", recursive=True) assert not w.exists(fn) def test_with_gzip(hdfs_cluster): from gzip import GzipFile w = WebHDFS( hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"} ) fn = "/user/testuser/gzfile" with w.open(fn, "wb") as f: gf = GzipFile(fileobj=f, mode="w") gf.write(b"hello") gf.close() with w.open(fn, "rb") as f: gf = GzipFile(fileobj=f, mode="r") assert gf.read() == b"hello" def test_workflow_transaction(hdfs_cluster): w = WebHDFS( hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"} ) fn = "/user/testuser/testrun/afile" w.mkdirs("/user/testuser/testrun") with w.transaction: with w.open(fn, "wb") as f: f.write(b"hello") assert not w.exists(fn) assert w.exists(fn) assert w.ukey(fn) files = w.ls("/user/testuser/testrun", True) summ = w.content_summary("/user/testuser/testrun") assert summ["length"] == files[0]["size"] assert summ["fileCount"] == 1 w.rm("/user/testuser/testrun", recursive=True) assert not w.exists(fn) filesystem_spec-0.6.1/fsspec/implementations/tests/test_zip.py000066400000000000000000000023111356753337100247760ustar00rootroot00000000000000import zipfile from contextlib import contextmanager import os import pickle import pytest import sys import tempfile import fsspec @contextmanager def tempzip(data={}): f = tempfile.mkstemp(suffix="zip")[1] with zipfile.ZipFile(f, mode="w") as z: for k, v in data.items(): z.writestr(k, v) try: yield f finally: try: os.remove(f) except (IOError, OSError): pass data = {"a": b"", "b": b"hello", "deeply/nested/path": b"stuff"} def test_empty(): with tempzip() as z: fs = fsspec.get_filesystem_class("zip")(fo=z) assert fs.find("") == [] @pytest.mark.xfail(sys.version_info < (3, 6), reason="zip-info odd on py35") def 
test_mapping(): with tempzip(data) as z: fs = fsspec.get_filesystem_class("zip")(fo=z) m = fs.get_mapper("") assert list(m) == ["a", "b", "deeply/nested/path"] assert m["b"] == data["b"] @pytest.mark.xfail(sys.version_info < (3, 6), reason="zip not supported on py35") def test_pickle(): with tempzip(data) as z: fs = fsspec.get_filesystem_class("zip")(fo=z) fs2 = pickle.loads(pickle.dumps(fs)) assert fs2.cat("b") == b"hello" filesystem_spec-0.6.1/fsspec/implementations/webhdfs.py000066400000000000000000000313521356753337100234240ustar00rootroot00000000000000# https://hadoop.apache.org/docs/r1.0.4/webhdfs.html import requests from urllib.parse import quote import uuid from ..spec import AbstractFileSystem, AbstractBufferedFile from ..utils import infer_storage_options import logging logger = logging.getLogger("webhdfs") class WebHDFS(AbstractFileSystem): """ Interface to HDFS over HTTP Three auth mechanisms are supported: insecure: no auth is done, and the user is assumed to be whoever they say they are (parameter `user`), or a predefined value such as "dr.who" if not given spnego: when kerberos authentication is enabled, auth is negotiated by requests_kerberos https://github.com/requests/requests-kerberos . This establishes a session based on existing kinit login and/or specified principal/password; paraneters are passed with ``kerb_kwargs`` token: uses an existing Hadoop delegation token from another secured service. Indeed, this client can also generate such tokens when not insecure. Note that tokens expire, but can be renewed (by a previously specified user) and may allow for proxying. 
""" tempdir = "/tmp" protocol = "webhdfs", "webHDFS" def __init__( self, host, port=50070, kerberos=False, token=None, user=None, proxy_to=None, kerb_kwargs=None, data_proxy=None, **kwargs ): """ Parameters ---------- host: str Name-node address port: int Port for webHDFS kerberos: bool Whether to authenticate with kerberos for this connection token: str or None If given, use this token on every call to authenticate. A user and user-proxy may be encoded in the token and should not be also given user: str or None If given, assert the user name to connect with proxy_to: str or None If given, the user has the authority to proxy, and this value is the user in who's name actions are taken kerb_kwargs: dict Any extra arguments for HTTPKerberosAuth, see https://github.com/requests/requests-kerberos/blob/master/requests_kerberos/kerberos_.py data_proxy: dict, callable or None If given, map data-node addresses. This can be necessary if the HDFS cluster is behind a proxy, running on Docker or otherwise has a mismatch between the host-names given by the name-node and the address by which to refer to them from the client. If a dict, maps host names `host->data_proxy[host]`; if a callable, full URLs are passed, and function must conform to `url->data_proxy(url)`. kwargs """ if self._cached: return super().__init__(**kwargs) self.url = "http://{host}:{port}/webhdfs/v1".format(host=host, port=port) self.kerb = kerberos self.kerb_kwargs = kerb_kwargs or {} self.pars = {} self.proxy = data_proxy or {} if token is not None: if user is not None or proxy_to is not None: raise ValueError( "If passing a delegation token, must not set " "user or proxy_to, as these are encoded in the" " token" ) self.pars["delegation"] = token if user is not None: self.pars["user.name"] = user if proxy_to is not None: self.pars["doas"] = proxy_to if kerberos and user is not None: raise ValueError( "If using Kerberos auth, do not specify the " "user, this is handled by kinit." 
) self._connect() def _connect(self): self.session = requests.Session() if self.kerb: from requests_kerberos import HTTPKerberosAuth self.session.auth = HTTPKerberosAuth(**self.kerb_kwargs) def _call(self, op, method="get", path=None, data=None, redirect=True, **kwargs): url = self.url + quote(path or "") args = kwargs.copy() args.update(self.pars) args["op"] = op.upper() logger.debug(url, method, args) out = self.session.request( method=method.upper(), url=url, params=args, data=data, allow_redirects=redirect, ) if out.status_code == 404: raise FileNotFoundError(path) if out.status_code == 403: raise PermissionError(path or "") if out.status_code == 401: raise PermissionError # not specific to path out.raise_for_status() return out def _open( self, path, mode="rb", block_size=None, autocommit=True, replication=None, permissions=None, **kwargs ): """ Parameters ---------- path: str File location mode: str 'rb', 'wb', etc. block_size: int Client buffer size for read-ahead or write buffer autocommit: bool If False, writes to temporary file that only gets put in final location upon commit replication: int Number of copies of file on the cluster, write mode only permissions: str or int posix permissions, write mode only kwargs Returns ------- WebHDFile instance """ block_size = block_size or self.blocksize return WebHDFile( self, path, mode=mode, block_size=block_size, tempdir=self.tempdir, autocommit=autocommit, replication=replication, permissions=permissions, ) @staticmethod def _process_info(info): info["type"] = info["type"].lower() info["size"] = info["length"] return info @classmethod def _strip_protocol(cls, path): return infer_storage_options(path)["path"] @staticmethod def _get_kwargs_from_urls(urlpath): out = infer_storage_options(urlpath) out.pop("path", None) out.pop("protocol", None) if "username" in out: out["user"] = out.pop("username") return out def info(self, path): out = self._call("GETFILESTATUS", path=path) info = out.json()["FileStatus"] 
info["name"] = path return self._process_info(info) def ls(self, path, detail=False): out = self._call("LISTSTATUS", path=path) infos = out.json()["FileStatuses"]["FileStatus"] for info in infos: self._process_info(info) info["name"] = path.rstrip("/") + "/" + info["pathSuffix"] if detail: return sorted(infos, key=lambda i: i["name"]) else: return sorted(info["name"] for info in infos) def content_summary(self, path): """Total numbers of files, directories and bytes under path""" out = self._call("GETCONTENTSUMMARY", path=path) return out.json()["ContentSummary"] def ukey(self, path): """Checksum info of file, giving method and result""" out = self._call("GETFILECHECKSUM", path=path, redirect=False) location = self._apply_proxy(out.headers["Location"]) out2 = self.session.get(location) out2.raise_for_status() return out2.json()["FileChecksum"] def home_directory(self): """Get user's home directory""" out = self._call("GETHOMEDIRECTORY") return out.json()["Path"] def get_delegation_token(self, renewer=None): """Retrieve token which can give the same authority to other uses Parameters ---------- renewer: str or None User who may use this token; if None, will be current user """ if renewer: out = self._call("GETDELEGATIONTOKEN", renewer=renewer) else: out = self._call("GETDELEGATIONTOKEN") t = out.json()["Token"] if t is None: raise ValueError("No token available for this user/security context") return t["urlString"] def renew_delegation_token(self, token): """Make token live longer. 
Returns new expiry time""" out = self._call("RENEWDELEGATIONTOKEN", method="put", token=token) return out.json()["long"] def cancel_delegation_token(self, token): """Stop the token from being useful""" self._call("CANCELDELEGATIONTOKEN", method="put", token=token) def chmod(self, path, mod): """Set the permission at path Parameters ---------- path: str location to set (file or directory) mod: str or int posix epresentation or permission, give as oct string, e.g, '777' or 0o777 """ self._call("SETPERMISSION", method="put", path=path, permission=mod) def chown(self, path, owner=None, group=None): """Change owning user and/or group""" kwargs = {} if owner is not None: kwargs["owner"] = owner if group is not None: kwargs["group"] = group self._call("SETOWNER", method="put", path=path, **kwargs) def set_replication(self, path, replication): """ Set file replication factor Parameters ---------- path: str File location (not for directories) replication: int Number of copies of file on the cluster. Should be smaller than number of data nodes; normally 3 on most systems. 
""" self._call("SETREPLICATION", path=path, method="put", replication=replication) def mkdir(self, path, **kwargs): self._call("MKDIRS", method="put", path=path) def makedirs(self, path, exist_ok=False): if exist_ok is False and self.exists(path): raise FileExistsError(path) self.mkdir(path) def mv(self, path1, path2, **kwargs): self._call("RENAME", method="put", path=path1, destination=path2) def rm(self, path, recursive=False, **kwargs): self._call( "DELETE", method="delete", path=path, recursive="true" if recursive else "false", ) def _apply_proxy(self, location): if self.proxy and callable(self.proxy): location = self.proxy(location) elif self.proxy: # as a dict for k, v in self.proxy.items(): location = location.replace(k, v, 1) return location class WebHDFile(AbstractBufferedFile): """A file living in HDFS over webHDFS""" def __init__(self, fs, path, **kwargs): super().__init__(fs, path, **kwargs) kwargs = kwargs.copy() if kwargs.get("permissions", None) is None: kwargs.pop("permissions", None) if kwargs.get("replication", None) is None: kwargs.pop("replication", None) self.permissions = kwargs.pop("permissions", 511) tempdir = kwargs.pop("tempdir") if kwargs.pop("autocommit", False) is False: self.target = self.path self.path = "/".join([tempdir, str(uuid.uuid4())]) def _upload_chunk(self, final=False): """ Write one part of a multi-block file upload Parameters ========== final: bool This is the last block, so should complete file, if self.autocommit is True. 
""" out = self.fs.session.post(self.location, data=self.buffer.getvalue()) out.raise_for_status() return True def _initiate_upload(self): """ Create remote file/upload """ if "a" in self.mode: op, method = "APPEND", "POST" else: op, method = "CREATE", "PUT" if self.fs.exists(self.path): # no "truncate" or "create empty" self.fs.rm(self.path) out = self.fs._call(op, method, self.path, redirect=False, **self.kwargs) location = self.fs._apply_proxy(out.headers["Location"]) if "w" in self.mode: # create empty file to append to out2 = self.fs.session.put(location) out2.raise_for_status() self.location = location.replace("CREATE", "APPEND") def _fetch_range(self, start, end): out = self.fs._call( "OPEN", path=self.path, offset=start, length=end - start, redirect=False ) out.raise_for_status() location = out.headers["Location"] out2 = self.fs.session.get(self.fs._apply_proxy(location)) return out2.content def commit(self): self.fs.mv(self.path, self.target) def discard(self): self.fs.rm(self.path) filesystem_spec-0.6.1/fsspec/implementations/zip.py000066400000000000000000000101461356753337100226020ustar00rootroot00000000000000from __future__ import print_function, division, absolute_import import zipfile from fsspec import AbstractFileSystem, open_files from fsspec.utils import tokenize, DEFAULT_BLOCK_SIZE class ZipFileSystem(AbstractFileSystem): """Read contents of ZIP archive as a file-system Keeps file object open while instance lives. This class is pickleable, but not necessarily thread-safe """ root_marker = "" def __init__(self, fo="", mode="r", **storage_options): """ Parameters ---------- fo: str or file-like Contains ZIP, and must exist. If a str, will fetch file using `open_files()`, which must return one file exactly. 
mode: str Currently, only 'r' accepted storage_options: key-value May be credentials, e.g., `{'auth': ('username', 'pword')}` or any other parameters for requests """ if self._cached: return AbstractFileSystem.__init__(self) if mode != "r": raise ValueError("Only read from zip files accepted") self.in_fo = fo if isinstance(fo, str): files = open_files(fo) if len(files) != 1: raise ValueError( 'Path "{}" did not resolve to exactly' 'one file: "{}"'.format(fo, files) ) fo = files[0] self.fo = fo.__enter__() # the whole instance is a context self.zip = zipfile.ZipFile(self.fo) self.block_size = storage_options.get("block_size", DEFAULT_BLOCK_SIZE) self.dir_cache = None @classmethod def _strip_protocol(cls, path): # zip file paths are always relative to the archive root return super()._strip_protocol(path).lstrip("/") def _get_dirs(self): if self.dir_cache is None: files = self.zip.infolist() self.dir_cache = {} for z in files: f = {s: getattr(z, s) for s in zipfile.ZipInfo.__slots__} f.update( { "name": z.filename, "size": z.file_size, "type": ("directory" if z.is_dir() else "file"), } ) self.dir_cache[f["name"]] = f def ls(self, path, detail=False): self._get_dirs() paths = {} for p, f in self.dir_cache.items(): p = p.rstrip("/") if "/" in p: root = p.rsplit("/", 1)[0] else: root = "" if root == path.rstrip("/"): paths[p] = f elif path and all( (a == b) for a, b in zip(path.split("/"), p.strip("/").split("/")) ): # implicit directory ppath = "/".join(p.split("/")[: len(path.split("/")) + 1]) if ppath not in paths: out = {"name": ppath + "/", "size": 0, "type": "directory"} paths[ppath] = out elif all( (a == b) for a, b in zip(path.split("/"), [""] + p.strip("/").split("/")) ): # root directory entry ppath = p.rstrip("/").split("/", 1)[0] if ppath not in paths: out = {"name": ppath + "/", "size": 0, "type": "directory"} paths[ppath] = out out = list(paths.values()) if detail: return out else: return list(sorted(f["name"] for f in out)) def cat(self, path): return 
self.zip.read(path) def _open( self, path, mode="rb", block_size=None, autocommit=True, cache_options=None, **kwargs ): path = self._strip_protocol(path) if mode != "rb": raise NotImplementedError info = self.info(path) out = self.zip.open(path, "r") out.size = info["size"] out.name = info["name"] return out def ukey(self, path): return tokenize(path, self.in_fo, self.protocol) filesystem_spec-0.6.1/fsspec/mapping.py000066400000000000000000000106751356753337100202320ustar00rootroot00000000000000from collections.abc import MutableMapping from .registry import get_filesystem_class from .core import split_protocol class FSMap(MutableMapping): """Wrap a FileSystem instance as a mutable wrapping. The keys of the mapping become files under the given root, and the values (which must be bytes) the contents of those files. Parameters ---------- root: string prefix for all the files fs: FileSystem instance check: bool (=True) performs a touch at the location, to check for write access. Examples -------- >>> fs = FileSystem(**parameters) # doctest: +SKIP >>> d = FSMap('my-data/path/', fs) # doctest: +SKIP or, more likely >>> d = fs.get_mapper('my-data/path/') >>> d['loc1'] = b'Hello World' # doctest: +SKIP >>> list(d.keys()) # doctest: +SKIP ['loc1'] >>> d['loc1'] # doctest: +SKIP b'Hello World' """ def __init__(self, root, fs, check=False, create=False): self.fs = fs self.root = fs._strip_protocol(root).rstrip( "/" ) # we join on '/' in _key_to_str if create: if not self.fs.exists(root): self.fs.mkdir(root) if check: if not self.fs.exists(root): raise ValueError( "Path %s does not exist. 
Create " " with the ``create=True`` keyword" % root ) self.fs.touch(root + "/a") self.fs.rm(root + "/a") def clear(self): """Remove all keys below root - empties out mapping """ try: self.fs.rm(self.root, True) self.fs.mkdir(self.root) except: # noqa: E722 pass def _key_to_str(self, key): """Generate full path for the key""" if isinstance(key, (tuple, list)): key = str(tuple(key)) else: key = str(key) return "/".join([self.root, key]) if self.root else key def _str_to_key(self, s): """Strip path of to leave key name""" return s[len(self.root) :].lstrip("/") def __getitem__(self, key, default=None): """Retrieve data""" key = self._key_to_str(key) try: result = self.fs.cat(key) except: # noqa: E722 if default is not None: return default raise KeyError(key) return result def pop(self, key, default=None): result = self.__getitem__(key, default) try: del self[key] except KeyError: pass return result def __setitem__(self, key, value): """Store value in key""" key = self._key_to_str(key) self.fs.mkdirs(self.fs._parent(key), exist_ok=True) with self.fs.open(key, "wb") as f: f.write(value) def __iter__(self): return (self._str_to_key(x) for x in self.fs.find(self.root)) def __len__(self): return len(self.fs.find(self.root)) def __delitem__(self, key): """Remove key""" try: self.fs.rm(self._key_to_str(key)) except: # noqa: E722 raise KeyError def __contains__(self, key): """Does key exist in mapping?""" return self.fs.exists(self._key_to_str(key)) def __getstate__(self): """Mapping should be pickleable""" # TODO: replace with reduce to reinstantiate? return self.fs, self.root def __setstate__(self, state): fs, root = state self.fs = fs self.root = root def get_mapper(url, check=False, create=False, **kwargs): """Create key-value interface for given URL and options The URL will be of the form "protocol://location" and point to the root of the mapper required. All keys will be file-names below this location, and their values the contents of each key. 
Parameters ---------- url: str Root URL of mapping check: bool Whether to attempt to read from the location before instantiation, to check that the mapping does exist create: bool Whether to make the directory corresponding to the root before instantiating Returns ------- ``FSMap`` instance, the dict-like key-value store. """ protocol, path = split_protocol(url) cls = get_filesystem_class(protocol) fs = cls(**kwargs) # Removing protocol here - could defer to each open() on the backend return FSMap(url, fs, check, create) filesystem_spec-0.6.1/fsspec/registry.py000066400000000000000000000104701356753337100204400ustar00rootroot00000000000000import importlib from distutils.version import LooseVersion __all__ = ["registry", "get_filesystem_class", "default"] # mapping protocol: implementation class object registry = {} default = "file" # protocols mapped to the class which implements them. This dict can # be dynamically updated. known_implementations = { "file": {"class": "fsspec.implementations.local.LocalFileSystem"}, "memory": {"class": "fsspec.implementations.memory.MemoryFileSystem"}, "http": { "class": "fsspec.implementations.http.HTTPFileSystem", "err": 'HTTPFileSystem requires "requests" to be installed', }, "https": { "class": "fsspec.implementations.http.HTTPFileSystem", "err": 'HTTPFileSystem requires "requests" to be installed', }, "zip": {"class": "fsspec.implementations.zip.ZipFileSystem"}, "gcs": { "class": "gcsfs.GCSFileSystem", "err": "Please install gcsfs to access Google Storage", }, "gs": { "class": "gcsfs.GCSFileSystem", "err": "Please install gcsfs to access Google Storage", }, "sftp": { "class": "fsspec.implementations.sftp.SFTPFileSystem", "err": 'SFTPFileSystem requires "paramiko" to be installed', }, "ssh": { "class": "fsspec.implementations.sftp.SFTPFileSystem", "err": 'SFTPFileSystem requires "paramiko" to be installed', }, "ftp": {"class": "fsspec.implementations.ftp.FTPFileSystem"}, "hdfs": { "class": 
"fsspec.implementations.hdfs.PyArrowHDFS", "err": "pyarrow and local java libraries required for HDFS", }, "webhdfs": { "class": "fsspec.implementations.webhdfs.WebHDFS", "err": 'webHDFS access requires "requests" to be installed', }, "s3": {"class": "s3fs.S3FileSystem", "err": "Install s3fs to access S3"}, "cached": {"class": "fsspec.implementations.cached.CachingFileSystem"}, "blockcache": {"class": "fsspec.implementations.cached.CachingFileSystem"}, "filecache": {"class": "fsspec.implementations.cached.WholeFileCacheFileSystem"}, "dask": { "class": "fsspec.implementations.dask.DaskWorkerFileSystem", "err": "Install dask distributed to access worker file system", }, } minversions = {"s3fs": LooseVersion("0.3.0"), "gcsfs": LooseVersion("0.3.0")} def get_filesystem_class(protocol): """Fetch named protocol implementation from the registry The dict ``known_implementations`` maps protocol names to the locations of classes implementing the corresponding file-system. When used for the first time, appropriate imports will happen and the class will be placed in the registry. All subsequent calls will fetch directly from the registry. Some protocol implementations require additional dependencies, and so the import may fail. In this case, the string in the "err" field of the ``known_implementations`` will be given as the error message. 
""" if protocol is None: protocol = default if protocol not in registry: if protocol not in known_implementations: raise ValueError("Protocol not known: %s" % protocol) bit = known_implementations[protocol] mod, name = bit["class"].rsplit(".", 1) minversion = minversions.get(mod, None) err = None try: mod = importlib.import_module(mod) except ImportError: err = ImportError(bit["err"]) except Exception as e: err = e if err is not None: raise RuntimeError(str(err)) if minversion: version = getattr(mod, "__version__", None) if version and LooseVersion(version) < minversion: raise RuntimeError( "'{}={}' is installed, but version '{}' or " "higher is required".format(mod.__name__, version, minversion) ) registry[protocol] = getattr(mod, name) cls = registry[protocol] if getattr(cls, "protocol", None) in ("abstract", None): cls.protocol = protocol return cls def filesystem(protocol, **storage_options): """Instantiate filesystems for given protocol and arguments ``storage_options`` are specific to the protocol being chosen, and are passed directly to the class. """ cls = get_filesystem_class(protocol) return cls(**storage_options) filesystem_spec-0.6.1/fsspec/spec.py000066400000000000000000001177651356753337100175410ustar00rootroot00000000000000import warnings from hashlib import md5 import io import os import logging from .transaction import Transaction from .utils import read_block, tokenize, stringify_path logger = logging.getLogger("fsspec") def make_instance(cls, args, kwargs): return cls(*args, **kwargs) class _Cached(type): """ Metaclass for caching file system instances. Notes ----- Instances are cached according to * The values of the class attributes listed in `_extra_tokenize_attributes` * The arguments passed to ``__init__``. This creates an additional reference to the filesystem, which prevents the filesystem from being garbage collected when all *user* references go away. 
A call to the :meth:`AbstractFileSystem.clear_instance_cache` must *also* be made for a filesystem instance to be garbage collected. """ cachable = True _extra_tokenize_attributes = () def __init__(cls, *args, **kwargs): super().__init__(*args, **kwargs) # Note: we intentionally create a reference here, to avoid garbage # collecting instances when all other references are gone. To really # delete a FileSystem, the cache must be cleared. cls._cache = {} def __call__(cls, *args, **kwargs): extra_tokens = tuple( getattr(cls, attr, None) for attr in cls._extra_tokenize_attributes ) token = tokenize(cls, *args, *extra_tokens, **kwargs) if cls.cachable and token in cls._cache: return cls._cache[token] else: obj = super().__call__(*args, **kwargs) # Setting _fs_token here causes some static linters to complain. obj._fs_token_ = token obj.storage_args = args obj.storage_options = kwargs if cls.cachable: cls._cache[token] = obj return obj try: # optionally derive from pyarrow's FileSystem, if available import pyarrow as pa up = pa.filesystem.DaskFileSystem except ImportError: up = object class AbstractFileSystem(up, metaclass=_Cached): """ An abstract super-class for pythonic file-systems Implementations are expected to be compatible with or, better, subclass from here. """ cachable = True # this class can be cached, instances reused _cached = False blocksize = 2 ** 22 sep = "/" protocol = "abstract" root_marker = "" # For some FSs, may require leading '/' or other character #: Extra *class attributes* that should be considered when hashing. _extra_tokenize_attributes = () def __init__(self, *args, **storage_options): """Create and configure file-system instance Instances may be cachable, so if similar enough arguments are seen a new instance is not required. The token attribute exists to allow implementations to cache instances if they wish. A reasonable default should be provided if there are no arguments. Subclasses should call this method. 
Magic kwargs that affect functionality here: add_docs: if True, will append docstrings from this spec to the specific implementation """ if self._cached: # reusing instance, don't change return self._cached = True self._intrans = False self._transaction = None self.dircache = {} if storage_options.pop("add_docs", None): warnings.warn("add_docs is no longer supported.", FutureWarning) if storage_options.pop("add_aliases", None): warnings.warn("add_aliases has been removed.", FutureWarning) # This is set in _Cached self._fs_token_ = None @property def _fs_token(self): return self._fs_token_ def __dask_tokenize__(self): return self._fs_token def __hash__(self): return int(self._fs_token, 16) def __eq__(self, other): return isinstance(other, type(self)) and self._fs_token == other._fs_token @classmethod def _strip_protocol(cls, path): """ Turn path from fully-qualified to file-system-specific May require FS-specific handling, e.g., for relative paths or links. """ path = stringify_path(path) protos = (cls.protocol,) if isinstance(cls.protocol, str) else cls.protocol for protocol in protos: path = path.rstrip("/") if path.startswith(protocol + "://"): path = path[len(protocol) + 3 :] elif path.startswith(protocol + ":"): path = path[len(protocol) + 1 :] # use of root_marker to make minimum required path, e.g., "/" return path or cls.root_marker @staticmethod def _get_kwargs_from_urls(paths): """If kwargs can be encoded in the paths, extract them here This should happen before instantiation of the class; incoming paths then should be amended to strip the options in methods. Examples may look like an sftp path "sftp://user@host:/my/path", where the user and host should become kwargs and later get stripped. 
""" # by default, nothing happens return {} @classmethod def current(cls): """ Return the most recently created FileSystem If no instance has been created, then create one with defaults """ if not len(cls._cache): return cls() else: return list(cls._cache.values())[-1] @property def transaction(self): """A context within which files are committed together upon exit Requires the file class to implement `.commit()` and `.discard()` for the normal and exception cases. """ if self._transaction is None: self._transaction = Transaction(self) return self._transaction def start_transaction(self): """Begin write transaction for deferring files, non-context version""" self._intrans = True self._transaction = Transaction(self) return self.transaction def end_transaction(self): """Finish write transaction, non-context version""" self.transaction.complete() self._transaction = None def invalidate_cache(self, path=None): """ Discard any cached directory information Parameters ---------- path: string or None If None, clear all listings cached else listings at or under given path. """ pass # not necessary to implement, may have no cache def mkdir(self, path, create_parents=True, **kwargs): """ Create directory entry at path For systems that don't have true directories, may create an for this instance only and not touch the real filesystem Parameters ---------- path: str location create_parents: bool if True, this is equivalent to ``makedirs`` kwargs: may be permissions, etc. """ pass # not necessary to implement, may not have directories def makedirs(self, path, exist_ok=False): """Recursively make directories Creates directory at path and any intervening required directories. Raises exception if, for instance, the path already exists but is a file. 
Parameters ---------- path: str leaf directory name exist_ok: bool (False) If True, will error if the target already exists """ pass # not necessary to implement, may not have directories def rmdir(self, path): """Remove a directory, if empty""" pass # not necessary to implement, may not have directories def ls(self, path, detail=True, **kwargs): """List objects at path. This should include subdirectories and files at that location. The difference between a file and a directory must be clear when details are requested. The specific keys, or perhaps a FileInfo class, or similar, is TBD, but must be consistent across implementations. Must include: - full path to the entry (without protocol) - size of the entry, in bytes. If the value cannot be determined, will be ``None``. - type of entry, "file", "directory" or other Additional information may be present, aproriate to the file-system, e.g., generation, checksum, etc. May use refresh=True|False to allow use of self._ls_from_cache to check for a saved listing and avoid calling the backend. This would be common where listing may be expensive. Parameters ---------- path: str detail: bool if True, gives a list of dictionaries, where each is the same as the result of ``info(path)``. If False, gives a list of paths (str). kwargs: may have additional backend-specific options, such as version information Returns ------- List of strings if detail is False, or list of directory information dicts if detail is True. """ raise NotImplementedError def _ls_from_cache(self, path): """Check cache for listing Returns listing, if found (may me empty list for a directly that exists but contains nothing), None if not in cache. 
def find(self, path, maxdepth=None, withdirs=False, **kwargs):
    """List all files below path.

    Like posix ``find`` command without conditions

    Parameters
    ----------
    path : str
    maxdepth: int or None
        If not None, the maximum number of levels to descend
    withdirs: bool
        Whether to include directory paths in the output. This is True
        when used by glob, but users usually only want files.
    kwargs are passed to ``walk`` (and from there to ``ls``).

    Returns
    -------
    Sorted list of full paths (str), each appearing once.
    """
    # TODO: allow equivalent of -name parameter
    root = self._strip_protocol(path)
    out = set()
    # The loop variables are deliberately NOT called ``path``: the requested
    # location must survive the loop for the isfile() check below.
    for dirpath, dirs, files in self.walk(path, maxdepth, **kwargs):
        names = files + dirs if withdirs else files
        for name in names:
            if not name:
                # walk() reports "" for a file listed under its own path
                continue
            out.add("/".join([dirpath.rstrip("/"), name]) if dirpath else name)
    if self.isfile(root):
        # walk works on directories, but find should also return [path]
        # when path happens to be a file
        out.add(root)
    return sorted(out)
def exists(self, path):
    """Is there a file at the given path

    Returns True when ``info(path)`` succeeds and False when it raises
    any ordinary exception (not only FileNotFoundError, since backends
    differ in what they raise for a missing entry).

    ``KeyboardInterrupt`` and ``SystemExit`` are not treated as "does not
    exist"; they propagate to the caller.
    """
    try:
        self.info(path)
    except Exception:
        # any exception allowed bar FileNotFoundError?
        return False
    return True
def cat(self, path):
    """Get the content of a file

    Opens ``path`` in binary read mode, reads it to the end and closes
    the file again before returning.

    Parameters
    ----------
    path: str
        location of the file

    Returns
    -------
    Whatever the opened file's ``read()`` returns (bytes for binary
    files).
    """
    # context manager so the handle is released even if read() raises,
    # matching head() and tail()
    with self.open(path, "rb") as f:
        return f.read()
def tail(self, path, size=1024):
    """Return the trailing ``size`` bytes of the file at ``path``.

    If the file holds fewer than ``size`` bytes, the whole file is
    returned. The file is opened in binary read mode and closed before
    returning.
    """
    with self.open(path, "rb") as handle:
        # never step further back than the start of the file
        backwards = min(size, handle.size)
        handle.seek(-backwards, 2)  # whence=2: relative to end of file
        return handle.read()
def open(self, path, mode="rb", block_size=None, cache_options=None, **kwargs):
    """
    Return a file-like object from the filesystem

    The resultant instance must function correctly in a context ``with``
    block.

    Parameters
    ----------
    path: str
        Target file
    mode: str like 'rb', 'w'
        See builtin ``open()``
    block_size: int
        Some indication of buffering - this is a value in bytes
    cache_options : dict, optional
        Extra arguments to pass through to the cache. Forwarded in both
        binary and text mode.
    encoding, errors, newline: passed on to TextIOWrapper for text mode
    autocommit: bool, optional (via kwargs)
        Defaults to True outside a transaction. When False, the file is
        registered with the current transaction.

    Returns
    -------
    The object produced by ``_open`` in binary mode, or an
    ``io.TextIOWrapper`` around it in text mode.
    """
    import io

    path = self._strip_protocol(path)
    if "b" not in mode:
        # text mode: open the underlying file in binary and wrap it
        mode = mode.replace("t", "") + "b"

        text_kwargs = {
            k: kwargs.pop(k)
            for k in ["encoding", "errors", "newline"]
            if k in kwargs
        }
        return io.TextIOWrapper(
            self.open(
                path, mode, block_size, cache_options=cache_options, **kwargs
            ),
            **text_kwargs
        )
    else:
        ac = kwargs.pop("autocommit", not self._intrans)
        f = self._open(
            path,
            mode=mode,
            block_size=block_size,
            autocommit=ac,
            cache_options=cache_options,
            **kwargs
        )
        if not ac:
            # deferred file: committed or discarded when the transaction ends
            self.transaction.files.append(f)
        return f
def get_mapper(self, root, check=False, create=False):
    """Create key/value store based on this file-system

    Makes a MutableMapping interface to the FS at the given root path.
    See ``fsspec.mapping.FSMap`` for further details.

    ``root``, ``check`` and ``create`` are handed to ``FSMap`` unchanged,
    together with this filesystem instance; their exact meaning is
    defined there.
    """
    # function-scope import — presumably to avoid a circular import with
    # the mapping module; TODO confirm
    from .mapping import FSMap

    return FSMap(root, self, check, create)
""" cls._cache.clear() # ------------------------------------------------------------------------ # Aliases def makedir(self, path, create_parents=True, **kwargs): """Alias of :ref:`FilesystemSpec.mkdir`.""" return self.mkdir(path, create_parents=create_parents, **kwargs) def mkdirs(self, path, exist_ok=False): """Alias of :ref:`FilesystemSpec.makedirs`.""" return self.makedirs(path, exist_ok=exist_ok) def listdir(self, path, detail=True, **kwargs): """Alias of :ref:`FilesystemSpec.ls`.""" return self.ls(path, detail=detail, **kwargs) def cp(self, path1, path2, **kwargs): """Alias of :ref:`FilesystemSpec.copy`.""" return self.copy(path1, path2, **kwargs) def move(self, path1, path2, **kwargs): """Alias of :ref:`FilesystemSpec.mv`.""" return self.mv(path1, path2, **kwargs) def stat(self, path, **kwargs): """Alias of :ref:`FilesystemSpec.info`.""" return self.info(path, **kwargs) def disk_usage(self, path, total=True, maxdepth=None, **kwargs): """Alias of :ref:`FilesystemSpec.du`.""" return self.du(path, total=total, maxdepth=maxdepth, **kwargs) def rename(self, path1, path2, **kwargs): """Alias of :ref:`FilesystemSpec.mv`.""" return self.mv(path1, path2, **kwargs) def delete(self, path, recursive=False, maxdepth=None): """Alias of :ref:`FilesystemSpec.rm`.""" return self.rm(path, recursive=recursive, maxdepth=maxdepth) def upload(self, lpath, rpath, recursive=False, **kwargs): """Alias of :ref:`FilesystemSpec.put`.""" return self.put(lpath, rpath, recursive=recursive, **kwargs) def download(self, rpath, lpath, recursive=False, **kwargs): """Alias of :ref:`FilesystemSpec.get`.""" return self.get(rpath, lpath, recursive=recursive, **kwargs) class AbstractBufferedFile(io.IOBase): """Convenient class to derive from to provide buffering In the case that the backend does not provide a pythonic file-like object already, this class contains much of the logic to build one. 
The only methods that need to be overridden are ``_upload_chunk``, ``_initate_upload`` and ``_fetch_range``. """ DEFAULT_BLOCK_SIZE = 5 * 2 ** 20 def __init__( self, fs, path, mode="rb", block_size="default", autocommit=True, cache_type="readahead", cache_options=None, **kwargs ): """ Template for files with buffered reading and writing Parameters ---------- fs: instance of FileSystem path: str location in file-system mode: str Normal file modes. Currently only 'wb', 'ab' or 'rb'. Some file systems may be read-only, and some may not support append. block_size: int Buffer size for reading or writing, 'default' for class default autocommit: bool Whether to write to final destination; may only impact what happens when file is being closed. cache_type: {"readahead", "none", "mmap", "bytes"}, default "readahead" Caching policy in read mode. See the definitions in ``core``. cache_options : dict Additional options passed to the constructor for the cache specified by `cache_type`. kwargs: Gets stored as self.kwargs """ from .core import caches self.path = path self.fs = fs self.mode = mode self.blocksize = ( self.DEFAULT_BLOCK_SIZE if block_size in ["default", None] else block_size ) self.loc = 0 self.autocommit = autocommit self.end = None self.start = None self.closed = False if cache_options is None: cache_options = {} if "trim" in kwargs: warnings.warn( "Passing 'trim' to control the cache behavior has been deprecated. 
def __eq__(self, other):
    """Files are equal if they have the same checksum, only in read mode

    Both objects must be in mode ``"rb"`` and hash to the same value.
    Any pairing that involves a file in another mode compares unequal.

    When ``other`` has no ``mode`` attribute (``None``, a string, ...),
    ``NotImplemented`` is returned so that Python falls back to its
    default comparison instead of raising ``AttributeError``.
    """
    if not hasattr(other, "mode"):
        return NotImplemented
    return self.mode == "rb" and other.mode == "rb" and hash(self) == hash(other)
def flush(self, force=False):
    """
    Push the write buffer to the backend store.

    Nothing is uploaded unless the buffer has reached ``blocksize`` or
    ``force`` is set. In read mode this is a no-op.

    Parameters
    ----------
    force: bool
        Upload the buffer even when it is smaller than ``blocksize``, as
        needed for the final block when closing. May be used only once
        per file; afterwards ``write`` refuses further data.

    Raises
    ------
    ValueError
        If the file is closed, or on a second forced flush.
    """
    if self.closed:
        raise ValueError("Flush on closed file")
    if force:
        if self.forced:
            raise ValueError("Force flush cannot be called more than once")
        self.forced = True

    if self.mode not in {"wb", "ab"}:
        # read mode: nothing to send
        return

    if not (force or self.buffer.tell() >= self.blocksize):
        # block still too small, keep accumulating
        return

    if self.offset is None:
        # first chunk: the upload has to be started before sending data
        self.offset = 0
        self._initiate_upload()

    outcome = self._upload_chunk(final=force)
    if outcome is False:
        # backend declined the chunk; keep the buffer for a later attempt
        return
    # seek(0, 2) returns the end position, i.e. the number of bytes sent
    self.offset += self.buffer.seek(0, 2)
    self.buffer = io.BytesIO()
def readlines(self):
    """Return all data, split by the newline character

    Reads everything from the current position to the end of the file.
    Each returned line keeps its trailing ``b"\\n"``, except possibly the
    last one when the data does not end with a newline. If nothing is
    left to read, an empty list is returned.
    """
    data = self.read()
    lines = data.split(b"\n")
    out = [line + b"\n" for line in lines[:-1]]
    # The last piece is what follows the final newline: empty when the
    # data ended with a newline (or was empty), so only keep real content.
    if lines[-1]:
        out.append(lines[-1])
    return out
def __str__(self):
    """Short description naming the filesystem class and the path"""
    # The template needs one placeholder per tuple element; an empty
    # template makes the % operator raise TypeError.
    return "<File-like object %s, %s>" % (type(self.fs).__name__, self.path)
def test_get_put(tmpdir):
    """Round-trip single files and a directory tree through put()/get()."""

    def write(fname, payload):
        # close the handle right away so os.remove() below never meets an
        # open file
        with open(fname, "wb") as fobj:
            fobj.write(payload)

    def read(fname):
        with open(fname, "rb") as fobj:
            return fobj.read()

    tmpdir = str(tmpdir)
    fn = os.path.join(tmpdir, "one")
    write(fn, b"one")
    os.mkdir(os.path.join(tmpdir, "dir"))
    fn2 = os.path.join(tmpdir, "dir", "two")
    write(fn2, b"two")

    fs = MemoryFileSystem()
    fs.put(fn, "/afile")
    assert fs.cat("/afile") == b"one"

    fs.store["/bfile"] = MemoryFile(fs, "/bfile", b"data")
    fn3 = os.path.join(tmpdir, "three")
    fs.get("/bfile", fn3)
    assert read(fn3) == b"data"

    fs.put(tmpdir, "/more", recursive=True)
    assert fs.find("/more") == ["/more/dir/two", "/more/one", "/more/three"]

    # wipe the local copies, then restore them from the memory filesystem
    for f in [fn, fn2, fn3]:
        os.remove(f)
    os.rmdir(os.path.join(tmpdir, "dir"))

    fs.get("/more/", tmpdir + "/", recursive=True)
    assert read(fn3) == b"data"
    assert read(fn) == b"one"
def test_open_text():
    """Text-mode open() hands the requested encoding to the wrapper."""
    fs = MemoryFileSystem()
    with fs.open("/myfile", "wb") as f:
        f.write(b"some\n" b"lines\n" b"of\n" b"text")
    # context manager so the text wrapper is closed when the test ends
    with fs.open("/myfile", "r", encoding="latin1") as f:
        assert f.encoding == "latin1"
def test_lz4_compression(tmpdir):
    """Infer lz4 compression for .lz4 files if lz4 is available."""
    tmp_path = pathlib.Path(str(tmpdir))

    lz4 = pytest.importorskip("lz4")

    tmp_path.mkdir(exist_ok=True)

    tdat = "foobar" * 100

    with fsspec.core.open(
        str(tmp_path / "out.lz4"), mode="wt", compression="infer"
    ) as outfile:
        outfile.write(tdat)

    # read_bytes() opens, reads and closes in one call
    compressed = (tmp_path / "out.lz4").read_bytes()
    assert lz4.frame.decompress(compressed).decode() == tdat

    with fsspec.core.open(
        str(tmp_path / "out.lz4"), mode="rt", compression="infer"
    ) as infile:
        assert infile.read() == tdat

    with fsspec.core.open(
        str(tmp_path / "out.lz4"), mode="rt", compression="lz4"
    ) as infile:
        assert infile.read() == tdat
def test_openfile_api(m):
    """OpenFile can be opened explicitly or used as a context manager."""
    with m.open("somepath", "wb") as f:
        f.write(b"data")
    of = OpenFile(m, "somepath")
    # NOTE(review): the expected literal is empty here; it looks as if an
    # angle-bracketed repr was lost from this copy — confirm against
    # OpenFile.__str__ before relying on this check.
    assert str(of) == ""
    f = of.open()
    assert f.read() == b"data"
    f.close()
    with OpenFile(m, "somepath", mode="rt") as f:
        # without ``assert`` this comparison was evaluated and thrown away
        assert f.read() == "data"
Functions are only picklable if they are defined # at the top-level of a module def _fetcher(start, end): return b"0" * (end - start) def letters_fetcher(start, end): return string.ascii_letters[start:end].encode() @pytest.fixture(params=caches.values(), ids=list(caches.keys())) def Cache_imp(request): return request.param def test_cache_empty_file(Cache_imp): blocksize = 5 size = 0 cache = Cache_imp(blocksize, _fetcher, size) assert cache._fetch(0, 0) == b"" def test_cache_pickleable(Cache_imp): blocksize = 5 size = 100 cache = Cache_imp(blocksize, _fetcher, size) cache._fetch(0, 5) # fill in cache unpickled = pickle.loads(pickle.dumps(cache)) assert isinstance(unpickled, Cache_imp) assert unpickled.blocksize == blocksize assert unpickled.size == size assert unpickled._fetch(0, 10) == b"0" * 10 @pytest.mark.parametrize( "size_requests", [[(0, 30), (0, 35), (51, 52)], [(0, 1), (1, 11), (1, 52)], [(0, 52), (11, 15)]], ) @pytest.mark.parametrize("blocksize", [1, 10, 52, 100]) def test_cache_basic(Cache_imp, blocksize, size_requests): cache = Cache_imp(blocksize, letters_fetcher, len(string.ascii_letters)) for start, end in size_requests: result = cache[start:end] expected = string.ascii_letters[start:end].encode() assert result == expected def test_xz_lzma_compressions(): pytest.importorskip("lzma") # Ensure that both 'xz' and 'lzma' compression names can be parsed assert get_compression("some_file.xz", "infer") == "xz" assert get_compression("some_file.xz", "xz") == "xz" assert get_compression("some_file.xz", "lzma") == "lzma" def test_cache_getitem(Cache_imp): cacher = Cache_imp(4, letters_fetcher, len(string.ascii_letters)) assert cacher[0:4] == b"abcd" assert cacher[:4] == b"abcd" assert cacher[-3:] == b"XYZ" assert cacher[-3:-1] == b"XY" assert cacher[2:4] == b"cd" def test_cache_getitem_raises(): cacher = BaseCache(4, letters_fetcher, len(string.ascii_letters)) with pytest.raises(TypeError, match="int"): cacher[5] with pytest.raises(ValueError, 
def test_file_read_attributes(ftp_writable):
    """Check the read-side attributes and methods of a buffered FTP file."""
    host, port, user, pw = ftp_writable
    ftp = FTPFileSystem(host=host, port=port, username=user, password=pw)

    f = ftp.open("/out", "rb")
    assert f.info()["size"] == len(data)
    assert f.tell() == 0
    assert f.seekable()
    assert f.readable()
    assert not f.writable()
    out = bytearray(len(data))

    # A full read returns everything; a second read at EOF returns nothing.
    assert f.read() == data
    assert f.read() == b""
    f.seek(0)
    assert f.readuntil(b"l") == b"hel"
    assert f.tell() == 3

    # Only len(data) - 3 bytes remain, so the tail of ``out`` stays untouched.
    f.readinto1(out)
    assert out[:-3] == data[3:]
    with pytest.raises(ValueError):
        f.write(b"")

    f.close()
    with pytest.raises(ValueError):
        # The original wrote ``f.read()(b"")``, i.e. it tried to *call* the
        # result of ``read()``.  The intent is simply that reading a closed
        # file raises.
        f.read()
FTPFileSystem(host=host, port=port, username=user, password=pw) f = ftp.open("/out", "rb") assert f.seek(-10, 2) == len(data) - 10 assert f.tell() == len(data) - 10 assert f.seek(-1, 1) == len(data) - 11 with pytest.raises(ValueError): f.seek(-1) with pytest.raises(ValueError): f.seek(0, 7) def test_file_idempotent(ftp_writable): host, port, user, pw = ftp_writable ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) f = ftp.open("/out", "rb") f2 = ftp.open("/out", "rb") assert hash(f) == hash(f2) assert f == f2 ftp.touch("/out2") f2 = ftp.open("/out2", "rb") assert hash(f2) != hash(f) assert f != f2 f2 = ftp.open("/out", "wb") assert hash(f2) != hash(f) def test_file_text_attributes(ftp_writable): host, port, user, pw = ftp_writable ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) data = b"hello\n" * 1000 with ftp.open("/out2", "wb") as f: f.write(data) f = ftp.open("/out2", "rb") assert f.readline() == b"hello\n" f.seek(0) assert list(f) == [d + b"\n" for d in data.split()] f.seek(0) assert f.readlines() == [d + b"\n" for d in data.split()] f = ftp.open("/out2", "rt") assert f.readline() == "hello\n" assert f.encoding def test_file_write_attributes(ftp_writable): host, port, user, pw = ftp_writable ftp = FTPFileSystem(host=host, port=port, username=user, password=pw) f = ftp.open("/out2", "wb") with pytest.raises(ValueError): f.info() with pytest.raises(ValueError): f.seek(0) with pytest.raises(ValueError): f.read(0) assert not f.readable() assert f.writable() f.flush() # no-op assert f.write(b"hello") == 5 assert f.write(b"hello") == 5 assert not f.closed f.close() assert f.closed with pytest.raises(ValueError): f.write(b"") with pytest.raises(ValueError): f.flush() def test_midread_cache(ftp_writable): host, port, user, pw = ftp_writable fs = FTPFileSystem(host=host, port=port, username=user, password=pw) fn = "/myfile" with fs.open(fn, "wb") as f: f.write(b"a" * 175627146) with fs.open(fn, "rb") as f: f.seek(175561610) d1 
= f.read(65536) assert len(d1) == 65536 f.seek(4) size = 17562198 d2 = f.read(size) assert len(d2) == size f.seek(17562288) size = 17562187 d3 = f.read(size) assert len(d3) == size def test_read_block(ftp_writable): # not the same as test_read_block in test_utils, this depends on the # behaviour of the bytest caching from fsspec.utils import read_block host, port, user, pw = ftp_writable fs = FTPFileSystem(host=host, port=port, username=user, password=pw) fn = "/myfile" with fs.open(fn, "wb") as f: f.write(b"a,b\n1,2") f = fs.open(fn, "rb", cache_type="bytes") assert read_block(f, 0, 6400, b"\n") == b"a,b\n1,2" def test_with_gzip(ftp_writable): import gzip data = b"some compressable stuff" host, port, user, pw = ftp_writable fs = FTPFileSystem(host=host, port=port, username=user, password=pw) fn = "/myfile" with fs.open(fn, "wb") as f: gf = gzip.GzipFile(fileobj=f, mode="w") gf.write(data) gf.close() with fs.open(fn, "rb") as f: gf = gzip.GzipFile(fileobj=f, mode="r") assert gf.read() == data filesystem_spec-0.6.1/fsspec/tests/test_fuse.py000066400000000000000000000025261356753337100217360ustar00rootroot00000000000000import os import signal import time from multiprocessing import Process import pytest pytest.importorskip("fuse") # noqa: E402 from fsspec.fuse import run from fsspec.implementations.memory import MemoryFileSystem def host_fuse(mountdir): fs = MemoryFileSystem() fs.touch("/mounted/testfile") run(fs, "/mounted/", mountdir) def test_basic(tmpdir): mountdir = str(tmpdir.mkdir("mount")) fuse_process = Process(target=host_fuse, args=(str(mountdir),)) fuse_process.start() try: timeout = 10 while True: try: # can fail with device not ready while waiting for fuse if "testfile" in os.listdir(mountdir): break except Exception: pass timeout -= 1 time.sleep(1) assert timeout > 0, "Timeout" fn = os.path.join(mountdir, "test") with open(fn, "wb") as f: f.write(b"data") with open(fn) as f: assert f.read() == "data" os.remove(fn) os.mkdir(fn) assert os.listdir(fn) == 
@pytest.mark.parametrize(
    "protocol,module,minversion,oldversion",
    [("s3", "s3fs", "0.3.0", "0.1.0"), ("gs", "gcsfs", "0.3.0", "0.1.0")],
)
def test_minversion_s3fs(protocol, module, minversion, oldversion, monkeypatch):
    """A too-old backend module must be rejected with a ``RuntimeError``."""
    registry.clear()
    mod = pytest.importorskip(module, minversion)

    # Look up the protocol under test.  The original hard-coded "s3" here, so
    # the "gs"/gcsfs case depended on s3fs being installed as well.
    assert get_filesystem_class(protocol) is not None
    registry.clear()

    # Pretend the installed module is older than the supported minimum.
    monkeypatch.setattr(mod, "__version__", oldversion)
    with pytest.raises(RuntimeError, match=minversion):
        get_filesystem_class(protocol)
"mock://top_level/second_level/date=2019-10-0[1-4]", [ "top_level/second_level/date=2019-10-01", "top_level/second_level/date=2019-10-02", "top_level/second_level/date=2019-10-04", ], ), ( "mock://top_level/second_level/date=2019-10-0[1-4]/*", [ "top_level/second_level/date=2019-10-01/a.parquet", "top_level/second_level/date=2019-10-01/b.parquet", "top_level/second_level/date=2019-10-02/a.parquet", "top_level/second_level/date=2019-10-04/a.parquet", ], ), ( "mock://top_level/second_level/date=2019-10-0[1-4]/[a].*", [ "top_level/second_level/date=2019-10-01/a.parquet", "top_level/second_level/date=2019-10-02/a.parquet", "top_level/second_level/date=2019-10-04/a.parquet", ], ), ], ) def test_glob(test_path, expected): test_fs = DummyTestFS() assert test_fs.glob(test_path) == expected def test_cache(): fs = DummyTestFS() fs2 = DummyTestFS() assert fs is fs2 assert len(fs._cache) == 1 del fs2 assert len(fs._cache) == 1 del fs assert len(DummyTestFS._cache) == 1 DummyTestFS.clear_instance_cache() assert len(DummyTestFS._cache) == 0 def test_alias(): with pytest.warns(FutureWarning, match="add_aliases"): DummyTestFS(add_aliases=True) def test_add_docs_warns(): with pytest.warns(FutureWarning, match="add_docs"): AbstractFileSystem(add_docs=True) def test_cache_options(): fs = DummyTestFS() f = AbstractBufferedFile(fs, "misc/foo.txt", cache_type="bytes") assert f.cache.trim # TODO: dummy buffered file f = AbstractBufferedFile( fs, "misc/foo.txt", cache_type="bytes", cache_options=dict(trim=False) ) assert f.cache.trim is False f = fs.open("misc/foo.txt", cache_type="bytes", cache_options=dict(trim=False)) assert f.cache.trim is False def test_trim_kwarg_warns(): fs = DummyTestFS() with pytest.warns(FutureWarning, match="cache_options"): AbstractBufferedFile(fs, "misc/foo.txt", cache_type="bytes", trim=False) def test_eq(): fs = DummyTestFS() result = fs == 1 assert result is False def test_pickle_multiple(): a = DummyTestFS(1) b = DummyTestFS(2, bar=1) x = pickle.dumps(a) 
y = pickle.dumps(b) del a, b DummyTestFS.clear_instance_cache() result = pickle.loads(x) assert result.storage_args == (1,) assert result.storage_options == {} result = pickle.loads(y) assert result.storage_args == (2,) assert result.storage_options == dict(bar=1) filesystem_spec-0.6.1/fsspec/tests/test_utils.py000066400000000000000000000166511356753337100221400ustar00rootroot00000000000000import io import pytest from fsspec.utils import infer_storage_options, seek_delimiter, read_block def test_read_block(): delimiter = b"\n" data = delimiter.join([b"123", b"456", b"789"]) f = io.BytesIO(data) assert read_block(f, 1, 2) == b"23" assert read_block(f, 0, 1, delimiter=b"\n") == b"123\n" assert read_block(f, 0, 2, delimiter=b"\n") == b"123\n" assert read_block(f, 0, 3, delimiter=b"\n") == b"123\n" assert read_block(f, 0, 5, delimiter=b"\n") == b"123\n456\n" assert read_block(f, 0, 8, delimiter=b"\n") == b"123\n456\n789" assert read_block(f, 0, 100, delimiter=b"\n") == b"123\n456\n789" assert read_block(f, 1, 1, delimiter=b"\n") == b"" assert read_block(f, 1, 5, delimiter=b"\n") == b"456\n" assert read_block(f, 1, 8, delimiter=b"\n") == b"456\n789" for ols in [[(0, 3), (3, 3), (6, 3), (9, 2)], [(0, 4), (4, 4), (8, 4)]]: out = [read_block(f, o, l, b"\n") for o, l in ols] assert b"".join(filter(None, out)) == data def test_read_block_split_before(): """Test start/middle/end cases of split_before.""" # noqa: I d = ( "#header" + "".join(">foo{i}\nFOOBAR{i}\n".format(i=i) for i in range(100000)) ).encode() # Read single record at beginning. # All reads include beginning of file and read through termination of # delimited record. 
assert read_block(io.BytesIO(d), 0, 10, delimiter=b"\n") == b"#header>foo0\n" assert ( read_block(io.BytesIO(d), 0, 10, delimiter=b"\n", split_before=True) == b"#header>foo0" ) assert ( read_block(io.BytesIO(d), 0, 10, delimiter=b">") == b"#header>foo0\nFOOBAR0\n>" ) assert ( read_block(io.BytesIO(d), 0, 10, delimiter=b">", split_before=True) == b"#header>foo0\nFOOBAR0\n" ) # Read multiple records at beginning. # All reads include beginning of file and read through termination of # delimited record. assert ( read_block(io.BytesIO(d), 0, 27, delimiter=b"\n") == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n" ) assert ( read_block(io.BytesIO(d), 0, 27, delimiter=b"\n", split_before=True) == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1" ) assert ( read_block(io.BytesIO(d), 0, 27, delimiter=b">") == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n>" ) assert ( read_block(io.BytesIO(d), 0, 27, delimiter=b">", split_before=True) == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n" ) # Read with offset spanning into next record, splits on either side of delimiter. # Read not spanning the full record returns nothing. 
assert read_block(io.BytesIO(d), 10, 3, delimiter=b"\n") == b"FOOBAR0\n" assert ( read_block(io.BytesIO(d), 10, 3, delimiter=b"\n", split_before=True) == b"\nFOOBAR0" ) assert read_block(io.BytesIO(d), 10, 3, delimiter=b">") == b"" assert read_block(io.BytesIO(d), 10, 3, delimiter=b">", split_before=True) == b"" # Read with offset spanning multiple records, splits on either side of delimiter assert ( read_block(io.BytesIO(d), 10, 20, delimiter=b"\n") == b"FOOBAR0\n>foo1\nFOOBAR1\n" ) assert ( read_block(io.BytesIO(d), 10, 20, delimiter=b"\n", split_before=True) == b"\nFOOBAR0\n>foo1\nFOOBAR1" ) assert read_block(io.BytesIO(d), 10, 20, delimiter=b">") == b"foo1\nFOOBAR1\n>" assert ( read_block(io.BytesIO(d), 10, 20, delimiter=b">", split_before=True) == b">foo1\nFOOBAR1\n" ) # Read record at end, all records read to end tlen = len(d) assert ( read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b"\n") == b">foo99999\nFOOBAR99999\n" ) assert ( read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b"\n", split_before=True) == b"\n>foo99999\nFOOBAR99999\n" ) assert ( read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b">") == b"foo99999\nFOOBAR99999\n" ) assert ( read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b">", split_before=True) == b">foo99999\nFOOBAR99999\n" ) def test_seek_delimiter_endline(): f = io.BytesIO(b"123\n456\n789") # if at zero, stay at zero seek_delimiter(f, b"\n", 5) assert f.tell() == 0 # choose the first block for bs in [1, 5, 100]: f.seek(1) seek_delimiter(f, b"\n", blocksize=bs) assert f.tell() == 4 # handle long delimiters well, even with short blocksizes f = io.BytesIO(b"123abc456abc789") for bs in [1, 2, 3, 4, 5, 6, 10]: f.seek(1) seek_delimiter(f, b"abc", blocksize=bs) assert f.tell() == 6 # End at the end f = io.BytesIO(b"123\n456") f.seek(5) seek_delimiter(f, b"\n", 5) assert f.tell() == 7 def test_infer_options(): so = infer_storage_options("/mnt/datasets/test.csv") assert so.pop("protocol") == "file" assert so.pop("path") == 
"/mnt/datasets/test.csv" assert not so assert infer_storage_options("./test.csv")["path"] == "./test.csv" assert infer_storage_options("../test.csv")["path"] == "../test.csv" so = infer_storage_options("C:\\test.csv") assert so.pop("protocol") == "file" assert so.pop("path") == "C:\\test.csv" assert not so assert infer_storage_options("d:\\test.csv")["path"] == "d:\\test.csv" assert infer_storage_options("\\test.csv")["path"] == "\\test.csv" assert infer_storage_options(".\\test.csv")["path"] == ".\\test.csv" assert infer_storage_options("test.csv")["path"] == "test.csv" so = infer_storage_options( "hdfs://username:pwd@Node:123/mnt/datasets/test.csv?q=1#fragm", inherit_storage_options={"extra": "value"}, ) assert so.pop("protocol") == "hdfs" assert so.pop("username") == "username" assert so.pop("password") == "pwd" assert so.pop("host") == "Node" assert so.pop("port") == 123 assert so.pop("path") == "/mnt/datasets/test.csv#fragm" assert so.pop("url_query") == "q=1" assert so.pop("url_fragment") == "fragm" assert so.pop("extra") == "value" assert not so so = infer_storage_options("hdfs://User-name@Node-name.com/mnt/datasets/test.csv") assert so.pop("username") == "User-name" assert so.pop("host") == "Node-name.com" u = "http://127.0.0.1:8080/test.csv" assert infer_storage_options(u) == {"protocol": "http", "path": u} # For s3 and gcs the netloc is actually the bucket name, so we want to # include it in the path. 
class Transaction(object):
    """Filesystem transaction write context

    Gathers files for deferred commit or discard, so that several write
    operations can be finalized semi-atomically. This works by having this
    instance as the ``.transaction`` attribute of the given filesystem
    """

    def __init__(self, fs):
        """
        Parameters
        ----------
        fs: FileSystem instance
        """
        self.fs = fs
        self.files = []

    def __enter__(self):
        """Start the transaction and return this instance.

        Returning ``self`` lets callers write ``with fs.transaction as t:``;
        previously the ``as`` target was bound to ``None``.
        """
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """End transaction and commit, if exit is not due to exception"""
        try:
            # only commit if there was no exception
            self.complete(commit=exc_type is None)
        finally:
            # Reset the filesystem state even when a commit/discard raises,
            # so the filesystem is not left flagged as being in a transaction.
            self.fs._intrans = False
            self.fs._transaction = None

    def start(self):
        """Start a transaction on this FileSystem"""
        self.fs._intrans = True

    def complete(self, commit=True):
        """Finish transaction: commit or discard all deferred files

        Parameters
        ----------
        commit: bool
            If True, call ``commit()`` on every gathered file; otherwise call
            ``discard()``.
        """
        for f in self.files:
            if commit:
                f.commit()
            else:
                f.discard()
        self.files = []
        self.fs._intrans = False


class FileActor(object):
    """Container of deferred files that commits or discards them in bulk."""

    def __init__(self):
        self.files = []

    def commit(self):
        """Commit every held file, then forget them."""
        for f in self.files:
            f.commit()
        self.files.clear()

    def discard(self):
        """Discard every held file, then forget them."""
        for f in self.files:
            f.discard()
        self.files.clear()

    def append(self, f):
        """Add a file to be finalized later."""
        self.files.append(f)


class DaskTransaction(Transaction):
    """Transaction whose file list is a ``FileActor`` run via ``distributed``."""

    def __init__(self, fs):
        """
        Parameters
        ----------
        fs: FileSystem instance
        """
        # Imported lazily so that ``distributed`` is only required when this
        # class is actually used.
        import distributed

        super().__init__(fs)
        client = distributed.default_client()
        # Replace the plain list with the FileActor submitted to the client.
        self.files = client.submit(FileActor, actor=True).result()

    def complete(self, commit=True):
        """Finish transaction: commit or discard all deferred files"""
        if commit:
            self.files.commit().result()
        else:
            self.files.discard().result()
        self.fs._intrans = False
def infer_storage_options(urlpath, inherit_storage_options=None):
    """ Infer storage options from URL path and merge it with existing storage
    options.

    Parameters
    ----------
    urlpath: str or unicode
        Either local absolute file path or URL (hdfs://namenode:8020/file.csv)
    inherit_storage_options: dict (optional)
        Its contents will get merged with the inferred information from the
        given path

    Returns
    -------
    Storage options dict.

    Raises
    ------
    KeyError
        If a key of ``inherit_storage_options`` collides with an inferred key.

    Notes
    -----
    For ``http``/``https`` the URL is not decomposed: the whole ``urlpath`` is
    returned as ``path``. For ``s3``/``gcs``/``gs`` the network location (the
    bucket) is prepended to ``path``. A URL fragment is re-attached to ``path``
    with ``#`` and also reported separately as ``url_fragment``.

    Examples
    --------
    >>> infer_storage_options('/mnt/datasets/test.csv')  # doctest: +SKIP
    {"protocol": "file", "path": "/mnt/datasets/test.csv"}
    >>> infer_storage_options(
    ...     'hdfs://username:pwd@node:123/mnt/datasets/test.csv?q=1',
    ...     inherit_storage_options={'extra': 'value'})  # doctest: +SKIP
    {"protocol": "hdfs", "username": "username", "password": "pwd",
    "host": "node", "port": 123, "path": "/mnt/datasets/test.csv",
    "url_query": "q=1", "extra": "value"}
    """
    # Handle Windows paths including disk name in this special case
    if re.match(r"^[a-zA-Z]:[\\/]", urlpath):
        return {"protocol": "file", "path": urlpath}

    parsed_path = urlsplit(urlpath)
    protocol = parsed_path.scheme or "file"
    if parsed_path.fragment:
        path = "#".join([parsed_path.path, parsed_path.fragment])
    else:
        path = parsed_path.path
    if protocol == "file":
        # Special case parsing file protocol URL on Windows according to:
        # https://msdn.microsoft.com/en-us/library/jj710207.aspx
        windows_path = re.match(r"^/([a-zA-Z])[:|]([\\/].*)$", path)
        if windows_path:
            path = "%s:%s" % windows_path.groups()

    if protocol in ["http", "https"]:
        # for HTTP, we don't want to parse, as requests will anyway
        return {"protocol": protocol, "path": urlpath}

    options = {"protocol": protocol, "path": path}

    if parsed_path.netloc:
        # Parse `hostname` from netloc manually because `parsed_path.hostname`
        # lowercases the hostname which is not always desirable (e.g. in S3):
        # https://github.com/dask/dask/issues/1417
        options["host"] = parsed_path.netloc.rsplit("@", 1)[-1].rsplit(":", 1)[0]

        if protocol in ("s3", "gcs", "gs"):
            # The netloc is the bucket name, which belongs in the path.
            options["path"] = options["host"] + options["path"]

        if parsed_path.port:
            options["port"] = parsed_path.port
        if parsed_path.username:
            options["username"] = parsed_path.username
        if parsed_path.password:
            options["password"] = parsed_path.password

    if parsed_path.query:
        options["url_query"] = parsed_path.query
    if parsed_path.fragment:
        options["url_fragment"] = parsed_path.fragment

    if inherit_storage_options:
        update_storage_options(options, inherit_storage_options)

    return options


def update_storage_options(options, inherited=None):
    """Merge ``inherited`` into ``options`` in place, refusing to overwrite.

    Parameters
    ----------
    options: dict
        Inferred storage options; updated in place.
    inherited: dict (optional)
        User-specified options to add.

    Raises
    ------
    KeyError
        If any key is present in both dicts.
    """
    if not inherited:
        inherited = {}
    collisions = set(options) & set(inherited)
    if collisions:
        # Sorted so the error message does not depend on set iteration order.
        collisions = "\n".join("- %r" % k for k in sorted(collisions))
        raise KeyError(
            "Collision between inferred and specified storage "
            "options:\n%s" % collisions
        )
    options.update(inherited)


# Compression extensions registered via fsspec.compression.register_compression
compressions = {}


def infer_compression(filename):
    """Infer compression, if available, from filename.

    Infer a named compression type, if registered and available, from filename
    extension. This includes builtin (gz, bz2, zip) compressions, as well as
    optional compressions. See fsspec.compression.register_compression.

    Returns
    -------
    The registered compression name for the extension, or None if the
    extension is not present in ``compressions``.
    """
    extension = os.path.splitext(filename)[-1].strip(".")
    return compressions.get(extension)


def build_name_function(max_int):
    """ Returns a function that receives a single integer
    and returns it as a string padded by enough zero characters
    to align with maximum possible integer

    >>> name_f = build_name_function(57)

    >>> name_f(7)
    '07'
    >>> name_f(31)
    '31'
    >>> build_name_function(1000)(42)
    '0042'
    >>> build_name_function(999)(42)
    '042'
    >>> build_name_function(0)(0)
    '0'
    """
    if max_int < 0:
        # The previous log10-based computation also raised ValueError here.
        raise ValueError("max_int must be non-negative, got %r" % (max_int,))
    # Count decimal digits directly.  The former ``log10(max_int + 1e-8)``
    # trick loses the epsilon to float rounding for large values, which made
    # large powers of ten come out one digit short.
    pad_length = len(str(int(max_int)))

    def name_function(i):
        return str(i).zfill(pad_length)

    return name_function
def seek_delimiter(file, delimiter, blocksize):
    r"""Seek current file to file start, file end, or byte after delimiter seq.

    Seeks file to next chunk delimiter, where chunks are defined on file start,
    a delimiting sequence, and file end. Use file.tell() to see location
    afterwards. Note that file start is a valid split, so must be at offset > 0
    to seek for delimiter.

    Parameters
    ----------
    file: a file
    delimiter: bytes
        a delimiter like ``b'\n'`` or message sentinel, matching file .read()
        type
    blocksize: int
        Number of bytes to read from the file at once.

    Returns
    -------
    Returns True if a delimiter was found, False if at file start or end.
    """
    if file.tell() == 0:
        # beginning-of-file, return without seek
        return False

    # Interface is for binary IO, with delimiter as bytes, but initialize last
    # with result of file.read to preserve compatibility with text IO.
    last = None
    while True:
        current = file.read(blocksize)
        if not current:
            # end-of-file without delimiter
            return False
        # Prepend the tail of the previous chunk so that a delimiter which
        # straddles two reads is still found.
        full = last + current if last else current
        try:
            if delimiter in full:
                i = full.index(delimiter)
                # Rewind from the current position to just past the delimiter.
                file.seek(file.tell() - (len(full) - i) + len(delimiter))
                return True
            elif len(current) < blocksize:
                # end-of-file without delimiter
                return False
        except (OSError, ValueError):
            # NOTE(review): a failed seek is deliberately ignored here and the
            # scan continues; kept as-is to preserve existing behaviour.
            pass
        last = full[-len(delimiter) :]


def read_block(f, offset, length, delimiter=None, split_before=False):
    """ Read a block of bytes from a file

    Parameters
    ----------
    f: File
        Open file
    offset: int
        Byte offset to start read
    length: int
        Number of bytes to read, read through end of file if None
    delimiter: bytes (optional)
        Ensure reading starts and stops at delimiter bytestring
    split_before: bool (optional)
        Start/stop read *before* delimiter bytestring.


    If using the ``delimiter=`` keyword argument we ensure that the read
    starts and stops at delimiter boundaries that follow the locations
    ``offset`` and ``offset + length``.  If ``offset`` is zero then we
    start at zero, regardless of delimiter.  The bytestring returned WILL
    include the terminating delimiter string.

    Examples
    --------

    >>> from io import BytesIO  # doctest: +SKIP
    >>> f = BytesIO(b'Alice, 100\\nBob, 200\\nCharlie, 300')  # doctest: +SKIP
    >>> read_block(f, 0, 13)  # doctest: +SKIP
    b'Alice, 100\\nBo'

    >>> read_block(f, 0, 13, delimiter=b'\\n')  # doctest: +SKIP
    b'Alice, 100\\nBob, 200\\n'

    >>> read_block(f, 10, 10, delimiter=b'\\n')  # doctest: +SKIP
    b'Bob, 200\\nCharlie, 300'
    """
    if delimiter:
        f.seek(offset)
        found_start_delim = seek_delimiter(f, delimiter, 2 ** 16)
        if length is None:
            # Read through end of file.  With ``split_before`` the block owns
            # the delimiter that precedes it, exactly as in the bounded case
            # below; otherwise that delimiter would belong to no block.
            if found_start_delim and split_before:
                f.seek(f.tell() - len(delimiter))
            return f.read()
        start = f.tell()
        # Keep the nominal end at ``offset + length`` after moving the start.
        length -= start - offset

        f.seek(start + length)
        found_end_delim = seek_delimiter(f, delimiter, 2 ** 16)
        end = f.tell()

        # Adjust split location to before delimiter iff seek found the
        # delimiter sequence, not start or end of file.
        if found_start_delim and split_before:
            start -= len(delimiter)

        if found_end_delim and split_before:
            end -= len(delimiter)

        offset = start
        length = end - start

    f.seek(offset)
    b = f.read(length)
    return b
def tokenize(*args, **kwargs):
    """ Deterministic token

    (modified from dask.base)

    The token is the MD5 hex digest of the ``str()`` of the arguments. Keyword
    arguments are ordered by name first, so the token does not depend on the
    order in which they were written at the call site.

    >>> tokenize([1, 2, '3'])
    '9d71491b50023b06fc76928e6eddb952'

    >>> tokenize('Hello') == tokenize('Hello')
    True
    """
    if kwargs:
        # Keyword names are always strings, hence sortable.
        args += (dict(sorted(kwargs.items())),)
    return md5(str(args).encode()).hexdigest()


def stringify_path(filepath):
    """ Attempt to convert a path-like object to a string.

    Parameters
    ----------
    filepath: object to be converted

    Returns
    -------
    filepath_str: maybe a string version of the object

    Notes
    -----
    Objects supporting the fspath protocol (Python 3.6+) are coerced
    according to its __fspath__ method.

    For backwards compatibility with older Python version, pathlib.Path
    objects are specially coerced.

    Any other object is passed through unchanged, which includes bytes,
    strings, buffers, or anything else that's not even path-like.
    """
    if hasattr(filepath, "__fspath__"):
        return filepath.__fspath__()
    elif isinstance(filepath, pathlib.Path):
        return str(filepath)
    return filepath
# https://github.com/psf/black/pull/763 target_version = ['py34'] filesystem_spec-0.6.1/readthedocs.yml000066400000000000000000000000461356753337100177410ustar00rootroot00000000000000conda: file: docs/environment.yml filesystem_spec-0.6.1/requirements.txt000066400000000000000000000000001356753337100202030ustar00rootroot00000000000000filesystem_spec-0.6.1/setup.cfg000066400000000000000000000006721356753337100165570ustar00rootroot00000000000000[metadata] long_description: file: README.rst [versioneer] VCS = git style = pep440 versionfile_source = fsspec/_version.py versionfile_build = fsspec/_version.py tag_prefix = "" [flake8] exclude = .tox,build,docs/source/conf.py,versioneer.py max-line-length = 88 ignore = # Assigning lambda expression E731 # Ambiguous variable names E741 # line break before binary operator W503 # whitespace before : E203 filesystem_spec-0.6.1/setup.py000077500000000000000000000022201356753337100164420ustar00rootroot00000000000000#!/usr/bin/env python import os from setuptools import setup import versioneer here = os.path.abspath(os.path.dirname(__file__)) with open(os.path.join(here, "README.md"), encoding="utf-8") as f: long_description = f.read() setup( name="fsspec", version=versioneer.get_version(), cmdclass=versioneer.get_cmdclass(), classifiers=[ "Development Status :: 4 - Beta", "Intended Audience :: Developers", "License :: OSI Approved :: BSD License", "Operating System :: OS Independent", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", ], description="File-system specification", long_description=long_description, long_description_content_type="text/markdown", url="http://github.com/intake/filesystem_spec", maintainer="Martin Durant", maintainer_email="mdurant@anaconda.com", license="BSD", keywords="file", packages=["fsspec", "fsspec.implementations"], python_requires=">=3.5", install_requires=open("requirements.txt").read().strip().split("\n"), zip_safe=False, 
) filesystem_spec-0.6.1/tox.ini000066400000000000000000000045051356753337100162500ustar00rootroot00000000000000# content of: tox.ini , put in same dir as setup.py [tox] envlist = {py35,py36,py37} [core] conda_channels= defaults conda-forge conda_deps= pip paramiko requests zstandard python-snappy lz4 distributed dask pyarrow pyftpdlib cloudpickle pytest pytest-cov fusepy==3.0.1 deps= hadoop-test-cluster==0.1.0 [dev] conda_deps= conda-forge::pre-commit=1.18 black=19.3b0 flake8 deps= [testenv] description=Run test suite against target versions. conda_channels= {[core]conda_channels} conda_deps= {[core]conda_deps} deps= {[core]deps} commands = py.test -v -r s [testenv:coverage] description=Run test suite with coverage enabled. basepython=python3.7 conda_channels= {[core]conda_channels} conda_deps= {[core]conda_deps} deps= {[core]deps} commands = py.test --cov=fsspec -v -r s [testenv:dev] description=Setup conda dev env under '.tox/dev'. basepython=python3.7 usedevelop=True conda_channels= {[core]conda_channels} conda_deps= {[core]conda_deps} {[dev]conda_deps} deps= {[core]deps} {[dev]deps} commands = [testenv:lint] description=Run pre-commit checks. basepython=python3.7 skip_install=True conda_deps= {[dev]conda_deps} deps= {[dev]deps} commands_pre= pre-commit install --install-hooks commands= pre-commit run --all-files --show-diff-on-failure [testenv:s3fs] description=Run s3fs (@master) test suite against fsspec. conda_channels= defaults conda-forge conda_deps= {[core]conda_deps} boto3 botocore httpretty moto six mock deps= {[core]deps} changedir=.tox/s3fs/tmp whitelist_externals= rm git setenv= BOTO_CONFIG=/dev/null AWS_ACCESS_KEY_ID=foobar_key AWS_SECRET_ACCESS_KEY=foobar_secret commands= rm -rf s3fs git clone https://github.com/dask/s3fs py.test -vv s3fs/s3fs [testenv:gcsfs] description=Run gcsfs (@master) test suite against fsspec. 
conda_channels= defaults conda-forge conda_deps= {[core]conda_deps} requests decorator google-auth deps= {[core]deps} vcrpy google-auth-oauthlib changedir=.tox/gcsfs/tmp whitelist_externals= rm git setenv= GCSFS_RECORD_MODE=none commands= rm -rf gcsfs git clone https://github.com/dask/gcsfs py.test -vv gcsfs/gcsfs -k 'not fuse' filesystem_spec-0.6.1/versioneer.py000066400000000000000000002062171356753337100174740ustar00rootroot00000000000000# Version: 0.18 """The Versioneer - like a rocketeer, but for versions. The Versioneer ============== * like a rocketeer, but for versions! * https://github.com/warner/python-versioneer * Brian Warner * License: Public Domain * Compatible With: python2.6, 2.7, 3.2, 3.3, 3.4, 3.5, 3.6, and pypy * [![Latest Version] (https://pypip.in/version/versioneer/badge.svg?style=flat) ](https://pypi.python.org/pypi/versioneer/) * [![Build Status] (https://travis-ci.org/warner/python-versioneer.png?branch=master) ](https://travis-ci.org/warner/python-versioneer) This is a tool for managing a recorded version number in distutils-based python projects. The goal is to remove the tedious and error-prone "update the embedded version string" step from your release process. Making a new release should be as easy as recording a new tag in your version-control system, and maybe making new tarballs. 
## Quick Install * `pip install versioneer` to somewhere to your $PATH * add a `[versioneer]` section to your setup.cfg (see below) * run `versioneer install` in your source tree, commit the results ## Version Identifiers Source trees come from a variety of places: * a version-control system checkout (mostly used by developers) * a nightly tarball, produced by build automation * a snapshot tarball, produced by a web-based VCS browser, like github's "tarball from tag" feature * a release tarball, produced by "setup.py sdist", distributed through PyPI Within each source tree, the version identifier (either a string or a number, this tool is format-agnostic) can come from a variety of places: * ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows about recent "tags" and an absolute revision-id * the name of the directory into which the tarball was unpacked * an expanded VCS keyword ($Id$, etc) * a `_version.py` created by some earlier build step For released software, the version identifier is closely related to a VCS tag. Some projects use tag names that include more than just the version string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool needs to strip the tag prefix to extract the version identifier. For unreleased software (between tags), the version identifier should provide enough information to help developers recreate the same tree, while also giving them an idea of roughly how old the tree is (after version 1.2, before version 1.3). Many VCS systems can report a description that captures this, for example `git describe --tags --dirty --always` reports things like "0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the 0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has uncommitted changes. 
The version identifier is used for multiple purposes: * to allow the module to self-identify its version: `myproject.__version__` * to choose a name and prefix for a 'setup.py sdist' tarball ## Theory of Operation Versioneer works by adding a special `_version.py` file into your source tree, where your `__init__.py` can import it. This `_version.py` knows how to dynamically ask the VCS tool for version information at import time. `_version.py` also contains `$Revision$` markers, and the installation process marks `_version.py` to have this marker rewritten with a tag name during the `git archive` command. As a result, generated tarballs will contain enough information to get the proper version. To allow `setup.py` to compute a version too, a `versioneer.py` is added to the top level of your source tree, next to `setup.py` and the `setup.cfg` that configures it. This overrides several distutils/setuptools commands to compute the version when invoked, and changes `setup.py build` and `setup.py sdist` to replace `_version.py` with a small static file that contains just the generated version data. ## Installation See [INSTALL.md](./INSTALL.md) for detailed installation instructions. ## Version-String Flavors Code which uses Versioneer can learn about its version string at runtime by importing `_version` from your main `__init__.py` file and running the `get_versions()` function. From the "outside" (e.g. in `setup.py`), you can import the top-level `versioneer.py` and run `get_versions()`. Both functions return a dictionary with different flavors of version information: * `['version']`: A condensed version string, rendered using the selected style. This is the most commonly used value for the project's version string. The default "pep440" style yields strings like `0.11`, `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section below for alternative styles. * `['full-revisionid']`: detailed revision identifier. 
For Git, this is the full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". * `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the commit date in ISO 8601 format. This will be None if the date is not available. * `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that this is only accurate if run in a VCS checkout, otherwise it is likely to be False or None * `['error']`: if the version string could not be computed, this will be set to a string describing the problem, otherwise it will be None. It may be useful to throw an exception in setup.py if this is set, to avoid e.g. creating tarballs with a version string of "unknown". Some variants are more useful than others. Including `full-revisionid` in a bug report should allow developers to reconstruct the exact code being tested (or indicate the presence of local changes that should be shared with the developers). `version` is suitable for display in an "about" box or a CLI `--version` output: it can be easily compared against release notes and lists of bugs fixed in various releases. The installer adds the following text to your `__init__.py` to place a basic version in `YOURPROJECT.__version__`: from ._version import get_versions __version__ = get_versions()['version'] del get_versions ## Styles The setup.cfg `style=` configuration controls how the VCS information is rendered into a version string. The default style, "pep440", produces a PEP440-compliant string, equal to the un-prefixed tag name for actual releases, and containing an additional "local version" section with more detail for in-between builds. For Git, this is TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags --dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and that this commit is two revisions ("+2") beyond the "0.11" tag. 
For released software (exactly equal to a known tag), the identifier will only contain the stripped tag, e.g. "0.11". Other styles are available. See [details.md](details.md) in the Versioneer source tree for descriptions. ## Debugging Versioneer tries to avoid fatal errors: if something goes wrong, it will tend to return a version of "0+unknown". To investigate the problem, run `setup.py version`, which will run the version-lookup code in a verbose mode, and will display the full contents of `get_versions()` (including the `error` string, which may help identify what went wrong). ## Known Limitations Some situations are known to cause problems for Versioneer. This details the most significant ones. More can be found on Github [issues page](https://github.com/warner/python-versioneer/issues). ### Subprojects Versioneer has limited support for source trees in which `setup.py` is not in the root directory (e.g. `setup.py` and `.git/` are *not* siblings). The are two common reasons why `setup.py` might not be in the root: * Source trees which contain multiple subprojects, such as [Buildbot](https://github.com/buildbot/buildbot), which contains both "master" and "slave" subprojects, each with their own `setup.py`, `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI distributions (and upload multiple independently-installable tarballs). * Source trees whose main purpose is to contain a C library, but which also provide bindings to Python (and perhaps other langauges) in subdirectories. Versioneer will look for `.git` in parent directories, and most operations should get the right version string. However `pip` and `setuptools` have bugs and implementation details which frequently cause `pip install .` from a subproject directory to fail to find a correct version string (so it usually defaults to `0+unknown`). `pip install --editable .` should work correctly. `setup.py install` might work too. 
Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in some later version. [Bug #38](https://github.com/warner/python-versioneer/issues/38) is tracking this issue. The discussion in [PR #61](https://github.com/warner/python-versioneer/pull/61) describes the issue from the Versioneer side in more detail. [pip PR#3176](https://github.com/pypa/pip/pull/3176) and [pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve pip to let Versioneer work correctly. Versioneer-0.16 and earlier only looked for a `.git` directory next to the `setup.cfg`, so subprojects were completely unsupported with those releases. ### Editable installs with setuptools <= 18.5 `setup.py develop` and `pip install --editable .` allow you to install a project into a virtualenv once, then continue editing the source code (and test) without re-installing after every change. "Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a convenient way to specify executable scripts that should be installed along with the python package. These both work as expected when using modern setuptools. When using setuptools-18.5 or earlier, however, certain operations will cause `pkg_resources.DistributionNotFound` errors when running the entrypoint script, which must be resolved by re-installing the package. This happens when the install happens with one version, then the egg_info data is regenerated while a different version is checked out. Many setup.py commands cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into a different virtualenv), so this can be surprising. [Bug #83](https://github.com/warner/python-versioneer/issues/83) describes this one, but upgrading to a newer version of setuptools should probably resolve it. ### Unicode version strings While Versioneer works (and is continually tested) with both Python 2 and Python 3, it is not entirely consistent with bytes-vs-unicode distinctions. 
Newer releases probably generate unicode version strings on py2. It's not clear that this is wrong, but it may be surprising for applications when then write these strings to a network connection or include them in bytes-oriented APIs like cryptographic checksums. [Bug #71](https://github.com/warner/python-versioneer/issues/71) investigates this question. ## Updating Versioneer To upgrade your project to a new release of Versioneer, do the following: * install the new Versioneer (`pip install -U versioneer` or equivalent) * edit `setup.cfg`, if necessary, to include any new configuration settings indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details. * re-run `versioneer install` in your source tree, to replace `SRC/_version.py` * commit any changed files ## Future Directions This tool is designed to make it easily extended to other version-control systems: all VCS-specific components are in separate directories like src/git/ . The top-level `versioneer.py` script is assembled from these components by running make-versioneer.py . In the future, make-versioneer.py will take a VCS name as an argument, and will construct a version of `versioneer.py` that is specific to the given VCS. It might also take the configuration arguments that are currently provided manually during installation by editing setup.py . Alternatively, it might go the other direction and include code from all supported VCS systems, reducing the number of intermediate scripts. ## License To make Versioneer easier to embed, all its code is dedicated to the public domain. The `_version.py` that it creates is also in the public domain. Specifically, both are released under the Creative Commons "Public Domain Dedication" license (CC0-1.0), as described in https://creativecommons.org/publicdomain/zero/1.0/ . 
""" from __future__ import print_function try: import configparser except ImportError: import ConfigParser as configparser import errno import json import os import re import subprocess import sys class VersioneerConfig: """Container for Versioneer configuration parameters.""" def get_root(): """Get the project root directory. We require that all commands are run from the project root, i.e. the directory that contains setup.py, setup.cfg, and versioneer.py . """ root = os.path.realpath(os.path.abspath(os.getcwd())) setup_py = os.path.join(root, "setup.py") versioneer_py = os.path.join(root, "versioneer.py") if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): # allow 'python path/to/setup.py COMMAND' root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0]))) setup_py = os.path.join(root, "setup.py") versioneer_py = os.path.join(root, "versioneer.py") if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): err = ( "Versioneer was unable to run the project root directory. " "Versioneer requires setup.py to be executed from " "its immediate directory (like 'python setup.py COMMAND'), " "or in a way that lets it use sys.argv[0] to find the root " "(like 'python path/to/setup.py COMMAND')." ) raise VersioneerBadRootError(err) try: # Certain runtime workflows (setup.py install/develop in a setuptools # tree) execute all dependencies in a single python process, so # "versioneer" may be imported multiple times, and python's shared # module-import table will cache the first one. So we can't use # os.path.dirname(__file__), as that will find whichever # versioneer.py was first imported, even in later projects. 
me = os.path.realpath(os.path.abspath(__file__)) me_dir = os.path.normcase(os.path.splitext(me)[0]) vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) if me_dir != vsr_dir: print( "Warning: build in %s is using versioneer.py from %s" % (os.path.dirname(me), versioneer_py) ) except NameError: pass return root def get_config_from_root(root): """Read the project setup.cfg file to determine Versioneer config.""" # This might raise EnvironmentError (if setup.cfg is missing), or # configparser.NoSectionError (if it lacks a [versioneer] section), or # configparser.NoOptionError (if it lacks "VCS="). See the docstring at # the top of versioneer.py for instructions on writing your setup.cfg . setup_cfg = os.path.join(root, "setup.cfg") parser = configparser.SafeConfigParser() with open(setup_cfg, "r") as f: parser.readfp(f) VCS = parser.get("versioneer", "VCS") # mandatory def get(parser, name): if parser.has_option("versioneer", name): return parser.get("versioneer", name) return None cfg = VersioneerConfig() cfg.VCS = VCS cfg.style = get(parser, "style") or "" cfg.versionfile_source = get(parser, "versionfile_source") cfg.versionfile_build = get(parser, "versionfile_build") cfg.tag_prefix = get(parser, "tag_prefix") if cfg.tag_prefix in ("''", '""'): cfg.tag_prefix = "" cfg.parentdir_prefix = get(parser, "parentdir_prefix") cfg.verbose = get(parser, "verbose") return cfg class NotThisMethod(Exception): """Exception raised if a method is not valid for the current scenario.""" # these dictionaries contain VCS-specific tools LONG_VERSION_PY = {} HANDLERS = {} def register_vcs_handler(vcs, method): # decorator """Decorator to mark a method as the handler for a particular VCS.""" def decorate(f): """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f return decorate def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): """Call the given command(s).""" assert 
isinstance(commands, list) p = None for c in commands: try: dispcmd = str([c] + args) # remember shell=False, so use git.cmd on windows, not just git p = subprocess.Popen( [c] + args, cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None), ) break except EnvironmentError: e = sys.exc_info()[1] if e.errno == errno.ENOENT: continue if verbose: print("unable to run %s" % dispcmd) print(e) return None, None else: if verbose: print("unable to find command, tried %s" % (commands,)) return None, None stdout = p.communicate()[0].strip() if sys.version_info[0] >= 3: stdout = stdout.decode() if p.returncode != 0: if verbose: print("unable to run %s (error)" % dispcmd) print("stdout was %s" % stdout) return None, p.returncode return stdout, p.returncode LONG_VERSION_PY[ "git" ] = ''' # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build # directories (produced by setup.py build) will contain a much shorter file # that just contains the computed version number. # This file is released into the public domain. Generated by # versioneer-0.18 (https://github.com/warner/python-versioneer) """Git implementation of _version.py.""" import errno import os import re import subprocess import sys def get_keywords(): """Get the keywords needed to look up the version information.""" # these strings will be replaced by git during git-archive. # setup.py/versioneer.py will grep for the variable names, so they must # each be defined on a line of their own. _version.py will just call # get_keywords(). 
git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s" keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} return keywords class VersioneerConfig: """Container for Versioneer configuration parameters.""" def get_config(): """Create, populate and return the VersioneerConfig() object.""" # these strings are filled in when 'setup.py versioneer' creates # _version.py cfg = VersioneerConfig() cfg.VCS = "git" cfg.style = "%(STYLE)s" cfg.tag_prefix = "%(TAG_PREFIX)s" cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" cfg.verbose = False return cfg class NotThisMethod(Exception): """Exception raised if a method is not valid for the current scenario.""" LONG_VERSION_PY = {} HANDLERS = {} def register_vcs_handler(vcs, method): # decorator """Decorator to mark a method as the handler for a particular VCS.""" def decorate(f): """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f return decorate def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): """Call the given command(s).""" assert isinstance(commands, list) p = None for c in commands: try: dispcmd = str([c] + args) # remember shell=False, so use git.cmd on windows, not just git p = subprocess.Popen([c] + args, cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None)) break except EnvironmentError: e = sys.exc_info()[1] if e.errno == errno.ENOENT: continue if verbose: print("unable to run %%s" %% dispcmd) print(e) return None, None else: if verbose: print("unable to find command, tried %%s" %% (commands,)) return None, None stdout = p.communicate()[0].strip() if sys.version_info[0] >= 3: stdout = stdout.decode() if p.returncode != 0: if verbose: print("unable to run %%s (error)" %% dispcmd) print("stdout was %%s" %% stdout) return None, 
p.returncode return stdout, p.returncode def versions_from_parentdir(parentdir_prefix, root, verbose): """Try to determine the version from the parent directory name. Source tarballs conventionally unpack into a directory that includes both the project name and a version string. We will also support searching up two directory levels for an appropriately named parent directory """ rootdirs = [] for i in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): return {"version": dirname[len(parentdir_prefix):], "full-revisionid": None, "dirty": False, "error": None, "date": None} else: rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: print("Tried directories %%s but none started with prefix %%s" %% (str(rootdirs), parentdir_prefix)) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @register_vcs_handler("git", "get_keywords") def git_get_keywords(versionfile_abs): """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these # keywords. When used from setup.py, we don't want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. 
keywords = {} try: f = open(versionfile_abs, "r") for line in f.readlines(): if line.strip().startswith("git_refnames ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["refnames"] = mo.group(1) if line.strip().startswith("git_full ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["full"] = mo.group(1) if line.strip().startswith("git_date ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["date"] = mo.group(1) f.close() except EnvironmentError: pass return keywords @register_vcs_handler("git", "keywords") def git_versions_from_keywords(keywords, tag_prefix, verbose): """Get version information from git keywords.""" if not keywords: raise NotThisMethod("no keywords at all, weird") date = keywords.get("date") if date is not None: # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601 # -like" string, which we must then edit to make compliant), because # it's been around since git-1.5.3, and it's too difficult to # discover which version we're using, or to work around using an # older one. date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) refnames = keywords["refnames"].strip() if refnames.startswith("$Format"): if verbose: print("keywords are unexpanded, not using") raise NotThisMethod("unexpanded keywords, not a git-archive tarball") refs = set([r.strip() for r in refnames.strip("()").split(",")]) # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %%d # expansion behaves like git log --decorate=short and strips out the # refs/heads/ and refs/tags/ prefixes that would let us distinguish # between branches and tags. 
By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". tags = set([r for r in refs if re.search(r'\d', r)]) if verbose: print("discarding '%%s', no digits" %% ",".join(refs - tags)) if verbose: print("likely tags: %%s" %% ",".join(sorted(tags))) for ref in sorted(tags): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): r = ref[len(tag_prefix):] if verbose: print("picking %%s" %% r) return {"version": r, "full-revisionid": keywords["full"].strip(), "dirty": False, "error": None, "date": date} # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") return {"version": "0+unknown", "full-revisionid": keywords["full"].strip(), "dirty": False, "error": "no suitable tags", "date": None} @register_vcs_handler("git", "pieces_from_vcs") def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): """Get version from 'git describe' in the root of the source tree. This only gets called if the git-archive 'subst' keywords were *not* expanded, and _version.py hasn't already been rewritten with a short version string, meaning we're inside a checked out source tree. 
""" GITS = ["git"] if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) if rc != 0: if verbose: print("Directory %%s not under git control" %% root) raise NotThisMethod("'git rev-parse --git-dir' returned error") # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", "--always", "--long", "--match", "%%s*" %% tag_prefix], cwd=root) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") describe_out = describe_out.strip() full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) if full_out is None: raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() pieces = {} pieces["long"] = full_out pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] # TAG might have hyphens. git_describe = describe_out # look for -dirty suffix dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: git_describe = git_describe[:git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? 
pieces["error"] = ("unable to parse git-describe output: '%%s'" %% describe_out) return pieces # tag full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): if verbose: fmt = "tag '%%s' doesn't start with prefix '%%s'" print(fmt %% (full_tag, tag_prefix)) pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" %% (full_tag, tag_prefix)) return pieces pieces["closest-tag"] = full_tag[len(tag_prefix):] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) # commit: short hex revision ID pieces["short"] = mo.group(3) else: # HEX: no tags pieces["closest-tag"] = None count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) pieces["distance"] = int(count_out) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() date = run_command(GITS, ["show", "-s", "--format=%%ci", "HEAD"], cwd=root)[0].strip() pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces def plus_or_dot(pieces): """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." return "+" def render_pep440(pieces): """Build up version string, with post-release "local version identifier". Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty Exceptions: 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += plus_or_dot(pieces) rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" else: # exception #1 rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered def render_pep440_pre(pieces): """TAG[.post.devDISTANCE] -- No -dirty. Exceptions: 1: no tags. 
0.post.devDISTANCE """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: rendered += ".post.dev%%d" %% pieces["distance"] else: # exception #1 rendered = "0.post.dev%%d" %% pieces["distance"] return rendered def render_pep440_post(pieces): """TAG[.postDISTANCE[.dev0]+gHEX] . The ".dev0" means dirty. Note that .dev0 sorts backwards (a dirty tree will appear "older" than the corresponding clean one), but you shouldn't be releasing software with -dirty anyways. Exceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += plus_or_dot(pieces) rendered += "g%%s" %% pieces["short"] else: # exception #1 rendered = "0.post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += "+g%%s" %% pieces["short"] return rendered def render_pep440_old(pieces): """TAG[.postDISTANCE[.dev0]] . The ".dev0" means dirty. Eexceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" else: # exception #1 rendered = "0.post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" return rendered def render_git_describe(pieces): """TAG[-DISTANCE-gHEX][-dirty]. Like 'git describe --tags --dirty --always'. Exceptions: 1: no tags. HEX[-dirty] (note: no 'g' prefix) """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] if pieces["dirty"]: rendered += "-dirty" return rendered def render_git_describe_long(pieces): """TAG-DISTANCE-gHEX[-dirty]. Like 'git describe --tags --dirty --always -long'. The distance/hash is unconditional. 
Exceptions: 1: no tags. HEX[-dirty] (note: no 'g' prefix) """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] if pieces["dirty"]: rendered += "-dirty" return rendered def render(pieces, style): """Render the given version pieces into the requested style.""" if pieces["error"]: return {"version": "unknown", "full-revisionid": pieces.get("long"), "dirty": None, "error": pieces["error"], "date": None} if not style or style == "default": style = "pep440" # the default if style == "pep440": rendered = render_pep440(pieces) elif style == "pep440-pre": rendered = render_pep440_pre(pieces) elif style == "pep440-post": rendered = render_pep440_post(pieces) elif style == "pep440-old": rendered = render_pep440_old(pieces) elif style == "git-describe": rendered = render_git_describe(pieces) elif style == "git-describe-long": rendered = render_git_describe_long(pieces) else: raise ValueError("unknown style '%%s'" %% style) return {"version": rendered, "full-revisionid": pieces["long"], "dirty": pieces["dirty"], "error": None, "date": pieces.get("date")} def get_versions(): """Get version information or return default if unable to do so.""" # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have # __file__, we can work backwards from there to the root. Some # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which # case we can only use expanded keywords. cfg = get_config() verbose = cfg.verbose try: return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) except NotThisMethod: pass try: root = os.path.realpath(__file__) # versionfile_source is the relative path from the top of the source # tree (where the .git directory might live) to this file. Invert # this to find the root from __file__. 
@register_vcs_handler("git", "get_keywords")
def git_get_keywords(versionfile_abs):
    """Extract the git-archive keywords from the given _version.py file.

    The code embedded in _version.py can just fetch the value of these
    keywords. When used from setup.py, we don't want to import _version.py,
    so the file is scanned with a regexp instead. This function is not used
    from _version.py.

    Returns a dict that may contain the keys "refnames", "full" and "date".
    The dict is empty when the file cannot be read or holds none of the
    expected assignments.
    """
    # assignment prefix looked for in the file -> key in the returned dict
    wanted = (
        ("git_refnames =", "refnames"),
        ("git_full =", "full"),
        ("git_date =", "date"),
    )
    keywords = {}
    try:
        # 'with' closes the handle even when reading raises part-way through
        with open(versionfile_abs, "r") as f:
            for line in f:
                stripped = line.strip()
                for prefix, key in wanted:
                    if stripped.startswith(prefix):
                        mo = re.search(r'=\s*"(.*)"', line)
                        if mo:
                            keywords[key] = mo.group(1)
    except EnvironmentError:
        # best effort: an unreadable file simply yields no keywords
        pass
    return keywords


@register_vcs_handler("git", "keywords")
def git_versions_from_keywords(keywords, tag_prefix, verbose):
    """Get version information from git keywords.

    ``keywords`` is a dict shaped like the result of git_get_keywords().
    Returns a dict with the keys "version", "full-revisionid", "dirty",
    "error" and "date".

    Raises NotThisMethod when there are no keywords, when the "refnames"
    keyword is absent, or when the keywords were never expanded.
    """
    if not keywords:
        raise NotThisMethod("no keywords at all, weird")
    if "refnames" not in keywords:
        # a partially filled dict used to fail with a bare KeyError below
        raise NotThisMethod("no refnames keyword, cannot determine version")
    date = keywords.get("date")
    if date is not None:
        # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant
        # datestamp. However we prefer "%ci" (which expands to an "ISO-8601
        # -like" string, which we must then edit to make compliant), because
        # it's been around since git-1.5.3, and it's too difficult to
        # discover which version we're using, or to work around using an
        # older one.
        date = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
    refnames = keywords["refnames"].strip()
    if refnames.startswith("$Format"):
        if verbose:
            print("keywords are unexpanded, not using")
        raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
    # tolerate a missing "full" keyword instead of raising KeyError
    full_revision = keywords.get("full")
    if full_revision is not None:
        full_revision = full_revision.strip()
    refs = {r.strip() for r in refnames.strip("()").split(",")}
    # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
    # just "foo-1.0". If we see a "tag: " prefix, prefer those.
    TAG = "tag: "
    tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)}
    if not tags:
        # Either we're using git < 1.8.3, or there really are no tags. We use
        # a heuristic: assume all version tags have a digit. The old git %d
        # expansion behaves like git log --decorate=short and strips out the
        # refs/heads/ and refs/tags/ prefixes that would let us distinguish
        # between branches and tags. By ignoring refnames without digits, we
        # filter out many common branch names like "release" and
        # "stabilization", as well as "HEAD" and "master".
        tags = {r for r in refs if re.search(r"\d", r)}
        if verbose:
            print("discarding '%s', no digits" % ",".join(refs - tags))
    if verbose:
        print("likely tags: %s" % ",".join(sorted(tags)))
    for ref in sorted(tags):
        # sorting will prefer e.g. "2.0" over "2.0rc1"
        if ref.startswith(tag_prefix):
            r = ref[len(tag_prefix) :]
            if verbose:
                print("picking %s" % r)
            return {
                "version": r,
                "full-revisionid": full_revision,
                "dirty": False,
                "error": None,
                "date": date,
            }
    # no suitable tags, so version is "0+unknown", but full hex is still there
    if verbose:
        print("no suitable tags, using unknown + full revision id")
    return {
        "version": "0+unknown",
        "full-revisionid": full_revision,
        "dirty": False,
        "error": "no suitable tags",
        "date": None,
    }


@register_vcs_handler("git", "pieces_from_vcs")
def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
    """Get version from 'git describe' in the root of the source tree.

    This only gets called if the git-archive 'subst' keywords were *not*
    expanded, and _version.py hasn't already been rewritten with a short
    version string, meaning we're inside a checked out source tree.

    Returns a "pieces" dict with the keys "long", "short", "error" and
    "dirty"; unless "error" is set it also holds "closest-tag", "distance"
    and "date".

    Raises NotThisMethod when ``root`` is not under git control or when one
    of the git commands yields no output.
    """
    GITS = ["git"]
    if sys.platform == "win32":
        GITS = ["git.cmd", "git.exe"]

    _, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True)
    if rc != 0:
        if verbose:
            print("Directory %s not under git control" % root)
        raise NotThisMethod("'git rev-parse --git-dir' returned error")

    # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
    # if there isn't one, this yields HEX[-dirty] (no NUM)
    describe_out, rc = run_command(
        GITS,
        [
            "describe",
            "--tags",
            "--dirty",
            "--always",
            "--long",
            "--match",
            "%s*" % tag_prefix,
        ],
        cwd=root,
    )
    # --long was added in git-1.5.5
    if describe_out is None:
        raise NotThisMethod("'git describe' failed")
    describe_out = describe_out.strip()
    full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root)
    if full_out is None:
        raise NotThisMethod("'git rev-parse' failed")
    full_out = full_out.strip()

    pieces = {}
    pieces["long"] = full_out
    pieces["short"] = full_out[:7]  # maybe improved later
    pieces["error"] = None

    # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
    # TAG might have hyphens.
    git_describe = describe_out

    # look for -dirty suffix
    dirty = git_describe.endswith("-dirty")
    pieces["dirty"] = dirty
    if dirty:
        git_describe = git_describe[: git_describe.rindex("-dirty")]

    # now we have TAG-NUM-gHEX or HEX

    if "-" in git_describe:
        # TAG-NUM-gHEX
        mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe)
        if not mo:
            # unparseable. Maybe git-describe is misbehaving?
            pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out
            return pieces

        # tag
        full_tag = mo.group(1)
        if not full_tag.startswith(tag_prefix):
            # build the message once; it is both printed and stored
            message = "tag '%s' doesn't start with prefix '%s'" % (
                full_tag,
                tag_prefix,
            )
            if verbose:
                print(message)
            pieces["error"] = message
            return pieces
        pieces["closest-tag"] = full_tag[len(tag_prefix) :]

        # distance: number of commits since tag
        pieces["distance"] = int(mo.group(2))

        # commit: short hex revision ID
        pieces["short"] = mo.group(3)

    else:
        # HEX: no tags
        pieces["closest-tag"] = None
        count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root)
        if count_out is None:
            # previously int(None) raised an unrelated TypeError here
            raise NotThisMethod("'git rev-list' failed")
        pieces["distance"] = int(count_out)  # total number of commits

    # commit date: see ISO-8601 comment in git_versions_from_keywords()
    date_out, rc = run_command(
        GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root
    )
    if date_out is None:
        # previously None.strip() raised an unrelated AttributeError here
        raise NotThisMethod("'git show' failed")
    pieces["date"] = date_out.strip().replace(" ", "T", 1).replace(" ", "", 1)

    return pieces
def do_vcs_install(manifest_in, versionfile_source, ipy):
    """Git-specific installation logic for Versioneer.

    For Git, this means creating/changing .gitattributes to mark _version.py
    for export-subst keyword substitution.

    ``manifest_in``, ``versionfile_source`` and (when truthy) ``ipy`` are
    paths that get staged with 'git add', together with this script and,
    if it had to be modified, .gitattributes.
    """
    GITS = ["git"]
    if sys.platform == "win32":
        GITS = ["git.cmd", "git.exe"]
    files = [manifest_in, versionfile_source]
    if ipy:
        files.append(ipy)
    try:
        me = __file__
        # stage the source file, not a compiled artefact
        if me.endswith((".pyc", ".pyo")):
            me = os.path.splitext(me)[0] + ".py"
        versioneer_file = os.path.relpath(me)
    except NameError:
        versioneer_file = "versioneer.py"
    files.append(versioneer_file)
    present = False
    try:
        # 'with' closes the handle even if reading fails part-way through
        with open(".gitattributes", "r") as f:
            for line in f:
                if line.strip().startswith(versionfile_source):
                    if "export-subst" in line.strip().split()[1:]:
                        present = True
    except EnvironmentError:
        # no readable .gitattributes yet: fall through and create the entry
        pass
    if not present:
        with open(".gitattributes", "a+") as f:
            f.write("%s export-subst\n" % versionfile_source)
        files.append(".gitattributes")
    run_command(GITS, ["add", "--"] + files)


def versions_from_parentdir(parentdir_prefix, root, verbose):
    """Try to determine the version from the parent directory name.

    Source tarballs conventionally unpack into a directory that includes both
    the project name and a version string. We will also support searching up
    two directory levels for an appropriately named parent directory.

    Returns a version dict whose "version" is the directory name with
    ``parentdir_prefix`` removed. Raises NotThisMethod when none of the
    three candidate directory names starts with the prefix.
    """
    rootdirs = []

    for _ in range(3):
        dirname = os.path.basename(root)
        if dirname.startswith(parentdir_prefix):
            return {
                "version": dirname[len(parentdir_prefix) :],
                "full-revisionid": None,
                "dirty": False,
                "error": None,
                "date": None,
            }
        rootdirs.append(root)
        root = os.path.dirname(root)  # up a level

    if verbose:
        print(
            "Tried directories %s but none started with prefix %s"
            % (str(rootdirs), parentdir_prefix)
        )
    raise NotThisMethod("rootdir doesn't start with parentdir_prefix")


# Template for the short, pre-computed _version.py; the single %s receives
# the JSON-encoded version dict.
SHORT_VERSION_PY = """
# This file was generated by 'versioneer.py' (0.18) from
# revision-control system data, or from the parent directory name of an
# unpacked source archive. Distribution tarballs contain a pre-generated copy
# of this file.

import json

version_json = '''
%s
'''  # END VERSION_JSON


def get_versions():
    return json.loads(version_json)
"""
def versions_from_file(filename):
    """Try to determine the version from _version.py if present.

    Reads ``filename`` and decodes the JSON text stored between the
    "version_json = '''" line and the "# END VERSION_JSON" marker.

    Raises NotThisMethod when the file cannot be read or holds no such
    section.
    """
    try:
        with open(filename) as f:
            contents = f.read()
    except EnvironmentError:
        raise NotThisMethod("unable to read _version.py")
    # One pattern covers LF and CRLF line endings; the run of blanks before
    # the marker comment is matched loosely so its exact width is irrelevant.
    mo = re.search(
        r"version_json = '''\r?\n(.*)'''[ \t]*# END VERSION_JSON",
        contents,
        re.M | re.S,
    )
    if not mo:
        raise NotThisMethod("no version_json in _version.py")
    return json.loads(mo.group(1))


def write_to_version_file(filename, versions):
    """Write the given version number to the given _version.py file.

    ``versions`` is serialised as JSON into the SHORT_VERSION_PY template.
    Any existing file is removed first rather than rewritten in place
    (presumably so that a hard-linked original is not modified -- confirm
    against the sdist caller).
    """
    # a target that does not exist yet used to abort with OSError
    if os.path.lexists(filename):
        os.unlink(filename)
    contents = json.dumps(versions, sort_keys=True, indent=1, separators=(",", ": "))
    with open(filename, "w") as f:
        f.write(SHORT_VERSION_PY % contents)

    print("set %s to '%s'" % (filename, versions["version"]))


def plus_or_dot(pieces):
    """Return a + if we don't already have one, else return a .

    A missing or None "closest-tag" counts as containing no "+".
    """
    # "closest-tag" may be present but None (no-tag case); `"+" in None`
    # would raise TypeError, so normalise to an empty string first.
    if "+" in (pieces.get("closest-tag") or ""):
        return "."
    return "+"


def render_pep440(pieces):
    """Build up version string, with post-release "local version identifier".

    Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
    get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty

    Exceptions:
    1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty]
    """
    tag = pieces["closest-tag"]
    dirty_suffix = ".dirty" if pieces["dirty"] else ""
    if not tag:
        # exception #1
        untagged = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"])
        return untagged + dirty_suffix
    if not (pieces["distance"] or pieces["dirty"]):
        # exactly on a tag with a clean tree: the tag is the version
        return tag
    local = "%d.g%s" % (pieces["distance"], pieces["short"])
    return tag + plus_or_dot(pieces) + local + dirty_suffix


def render_pep440_pre(pieces):
    """TAG[.post.devDISTANCE] -- No -dirty.

    Exceptions:
    1: no tags. 0.post.devDISTANCE
    """
    tag = pieces["closest-tag"]
    if not tag:
        # exception #1
        return "0.post.dev%d" % pieces["distance"]
    if pieces["distance"]:
        return tag + ".post.dev%d" % pieces["distance"]
    return tag
def render_pep440_post(pieces):
    """TAG[.postDISTANCE[.dev0]+gHEX] .

    The ".dev0" means dirty. Note that .dev0 sorts backwards
    (a dirty tree will appear "older" than the corresponding clean one),
    but you shouldn't be releasing software with -dirty anyways.

    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]
    """
    tag = pieces["closest-tag"]
    if not tag:
        # exception #1
        version = "0.post%d" % pieces["distance"]
        if pieces["dirty"]:
            version += ".dev0"
        return version + "+g%s" % pieces["short"]
    if not (pieces["distance"] or pieces["dirty"]):
        return tag
    parts = [tag, ".post%d" % pieces["distance"]]
    if pieces["dirty"]:
        parts.append(".dev0")
    parts.append(plus_or_dot(pieces))
    parts.append("g%s" % pieces["short"])
    return "".join(parts)


def render_pep440_old(pieces):
    """TAG[.postDISTANCE[.dev0]] .

    The ".dev0" means dirty.

    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]
    """
    tag = pieces["closest-tag"]
    dev_suffix = ".dev0" if pieces["dirty"] else ""
    if not tag:
        # exception #1
        return "0.post%d" % pieces["distance"] + dev_suffix
    if not (pieces["distance"] or pieces["dirty"]):
        return tag
    return tag + ".post%d" % pieces["distance"] + dev_suffix


def render_git_describe(pieces):
    """TAG[-DISTANCE-gHEX][-dirty].

    Like 'git describe --tags --dirty --always'.

    Exceptions:
    1: no tags. HEX[-dirty] (note: no 'g' prefix)
    """
    tag = pieces["closest-tag"]
    if tag:
        described = tag
        if pieces["distance"]:
            described += "-%d-g%s" % (pieces["distance"], pieces["short"])
    else:
        # exception #1
        described = pieces["short"]
    return described + ("-dirty" if pieces["dirty"] else "")


def render_git_describe_long(pieces):
    """TAG-DISTANCE-gHEX[-dirty].

    Like 'git describe --tags --dirty --always --long'.
    The distance/hash is unconditional.

    Exceptions:
    1: no tags. HEX[-dirty] (note: no 'g' prefix)
    """
    tag = pieces["closest-tag"]
    if tag:
        described = tag + "-%d-g%s" % (pieces["distance"], pieces["short"])
    else:
        # exception #1
        described = pieces["short"]
    return described + ("-dirty" if pieces["dirty"] else "")
def render(pieces, style):
    """Render the given version pieces into the requested style.

    Returns a dict with the keys "version", "full-revisionid", "dirty",
    "error" and "date". When ``pieces`` carries an error the version is
    reported as "unknown". An empty style or "default" means "pep440".

    Raises ValueError for an unrecognised style name.
    """
    if pieces["error"]:
        return {
            "version": "unknown",
            "full-revisionid": pieces.get("long"),
            "dirty": None,
            "error": pieces["error"],
            "date": None,
        }

    if not style or style == "default":
        style = "pep440"  # the default

    if style == "pep440":
        rendered = render_pep440(pieces)
    elif style == "pep440-pre":
        rendered = render_pep440_pre(pieces)
    elif style == "pep440-post":
        rendered = render_pep440_post(pieces)
    elif style == "pep440-old":
        rendered = render_pep440_old(pieces)
    elif style == "git-describe":
        rendered = render_git_describe(pieces)
    elif style == "git-describe-long":
        rendered = render_git_describe_long(pieces)
    else:
        raise ValueError("unknown style '%s'" % style)

    return {
        "version": rendered,
        "full-revisionid": pieces["long"],
        "dirty": pieces["dirty"],
        "error": None,
        "date": pieces.get("date"),
    }


class VersioneerBadRootError(Exception):
    """The project root directory is unknown or missing key files."""


def get_versions(verbose=False):
    """Get the project version from whatever source is available.

    Sources are tried in this order: expanded VCS keywords in the version
    file, a pre-computed version file, the VCS itself, and finally the name
    of the parent directory.

    Returns a version dict; the fallback and rendered results carry the keys
    "version", "full-revisionid", "dirty", "error" and "date".

    Raises AssertionError when the [versioneer] configuration lacks VCS,
    versionfile_source or tag_prefix, or names an unrecognised VCS.
    """
    if "versioneer" in sys.modules:
        # see the discussion in cmdclass.py:get_cmdclass()
        del sys.modules["versioneer"]

    root = get_root()
    cfg = get_config_from_root(root)

    # Explicit raises instead of 'assert': the checks must survive 'python -O'.
    # AssertionError is kept so existing callers see the same exception type.
    if cfg.VCS is None:
        raise AssertionError("please set [versioneer]VCS= in setup.cfg")
    handlers = HANDLERS.get(cfg.VCS)
    if not handlers:
        raise AssertionError("unrecognized VCS '%s'" % cfg.VCS)
    verbose = verbose or cfg.verbose
    if cfg.versionfile_source is None:
        raise AssertionError("please set versioneer.versionfile_source")
    if cfg.tag_prefix is None:
        raise AssertionError("please set versioneer.tag_prefix")

    versionfile_abs = os.path.join(root, cfg.versionfile_source)

    # extract version from first of: _version.py, VCS command (e.g. 'git
    # describe'), parentdir. This is meant to work for developers using a
    # source checkout, for users of a tarball created by 'setup.py sdist',
    # and for users of a tarball/zipball created by 'git archive' or github's
    # download-from-tag feature or the equivalent in other VCSes.

    get_keywords_f = handlers.get("get_keywords")
    from_keywords_f = handlers.get("keywords")
    if get_keywords_f and from_keywords_f:
        try:
            keywords = get_keywords_f(versionfile_abs)
            ver = from_keywords_f(keywords, cfg.tag_prefix, verbose)
            if verbose:
                print("got version from expanded keyword %s" % ver)
            return ver
        except NotThisMethod:
            pass

    try:
        ver = versions_from_file(versionfile_abs)
        if verbose:
            print("got version from file %s %s" % (versionfile_abs, ver))
        return ver
    except NotThisMethod:
        pass

    from_vcs_f = handlers.get("pieces_from_vcs")
    if from_vcs_f:
        try:
            pieces = from_vcs_f(cfg.tag_prefix, root, verbose)
            ver = render(pieces, cfg.style)
            if verbose:
                print("got version from VCS %s" % ver)
            return ver
        except NotThisMethod:
            pass

    try:
        if cfg.parentdir_prefix:
            ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose)
            if verbose:
                print("got version from parentdir %s" % ver)
            return ver
    except NotThisMethod:
        pass

    if verbose:
        print("unable to compute version")

    return {
        "version": "0+unknown",
        "full-revisionid": None,
        "dirty": None,
        "error": "unable to compute version",
        "date": None,
    }
def get_version():
    """Get the short version string for this project."""
    return get_versions()["version"]


def _write_long_version_file(cfg):
    """Regenerate the full VCS-aware _version.py at cfg.versionfile_source."""
    with open(cfg.versionfile_source, "w") as f:
        LONG = LONG_VERSION_PY[cfg.VCS]
        f.write(
            LONG
            % {
                "DOLLAR": "$",
                "STYLE": cfg.style,
                "TAG_PREFIX": cfg.tag_prefix,
                "PARENTDIR_PREFIX": cfg.parentdir_prefix,
                "VERSIONFILE_SOURCE": cfg.versionfile_source,
            }
        )


def _run_with_short_version_file(run, command):
    """Call run(command) while the source _version.py holds a fixed version."""
    root = get_root()
    cfg = get_config_from_root(root)
    versions = get_versions()
    target_versionfile = cfg.versionfile_source
    print("UPDATING %s" % target_versionfile)
    write_to_version_file(target_versionfile, versions)
    try:
        run(command)
    finally:
        # restore the long file even when the wrapped command fails, so the
        # source tree is never left holding the frozen short version file
        os.unlink(target_versionfile)
        _write_long_version_file(cfg)


def get_cmdclass():
    """Get the custom setuptools/distutils subclasses used by Versioneer.

    Returns a dict mapping command names to command classes: "version" and
    "sdist" always, "build_py" unless cx_Freeze is loaded, plus "build_exe"
    and "py2exe" when cx_Freeze and py2exe respectively are already imported.
    """
    if "versioneer" in sys.modules:
        del sys.modules["versioneer"]
        # this fixes the "python setup.py develop" case (also 'install' and
        # 'easy_install .'), in which subdependencies of the main project are
        # built (using setup.py bdist_egg) in the same python process. Assume
        # a main project A and a dependency B, which use different versions
        # of Versioneer. A's setup.py imports A's Versioneer, leaving it in
        # sys.modules by the time B's setup.py is executed, causing B to run
        # with the wrong versioneer. Setuptools wraps the sub-dep builds in a
        # sandbox that restores sys.modules to its pre-build state, so the
        # parent is protected against the child's "import versioneer". By
        # removing ourselves from sys.modules here, before the child build
        # happens, we protect the child from the parent's versioneer too.
        # Also see https://github.com/warner/python-versioneer/issues/52

    cmds = {}

    # we add "version" to both distutils and setuptools
    try:
        from distutils.core import Command
    except ImportError:
        # distutils is no longer in the standard library on Python 3.12+
        from setuptools import Command

    class cmd_version(Command):
        description = "report generated version string"
        user_options = []
        boolean_options = []

        def initialize_options(self):
            pass

        def finalize_options(self):
            pass

        def run(self):
            vers = get_versions(verbose=True)
            print("Version: %s" % vers["version"])
            print(" full-revisionid: %s" % vers.get("full-revisionid"))
            print(" dirty: %s" % vers.get("dirty"))
            print(" date: %s" % vers.get("date"))
            if vers["error"]:
                print(" error: %s" % vers["error"])

    cmds["version"] = cmd_version

    # we override "build_py" in both distutils and setuptools
    #
    # most invocation pathways end up running build_py:
    #  distutils/build -> build_py
    #  distutils/install -> distutils/build ->..
    #  setuptools/bdist_wheel -> distutils/install ->..
    #  setuptools/bdist_egg -> distutils/install_lib -> build_py
    #  setuptools/install -> bdist_egg ->..
    #  setuptools/develop -> ?
    #  pip install:
    #   copies source tree to a tempdir before running egg_info/etc
    #   if .git isn't copied too, 'git describe' will fail
    #   then does setup.py bdist_wheel, or sometimes setup.py install
    #  setup.py egg_info -> ?

    # we override different "build_py" commands for both environments
    if "setuptools" in sys.modules:
        from setuptools.command.build_py import build_py as _build_py
    else:
        try:
            from distutils.command.build_py import build_py as _build_py
        except ImportError:
            # no distutils available: fall back to the setuptools command
            from setuptools.command.build_py import build_py as _build_py

    class cmd_build_py(_build_py):
        def run(self):
            root = get_root()
            cfg = get_config_from_root(root)
            versions = get_versions()
            _build_py.run(self)
            # now locate _version.py in the new build/ directory and replace
            # it with an updated value
            if cfg.versionfile_build:
                target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build)
                print("UPDATING %s" % target_versionfile)
                write_to_version_file(target_versionfile, versions)

    cmds["build_py"] = cmd_build_py

    if "cx_Freeze" in sys.modules:  # cx_freeze enabled?
        from cx_Freeze.dist import build_exe as _build_exe

        # nczeczulin reports that py2exe won't like the pep440-style string
        # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g.
        # setup(console=[{
        #   "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION
        #   "product_version": versioneer.get_version(),
        #   ...

        class cmd_build_exe(_build_exe):
            def run(self):
                _run_with_short_version_file(_build_exe.run, self)

        cmds["build_exe"] = cmd_build_exe
        del cmds["build_py"]

    if "py2exe" in sys.modules:  # py2exe enabled?
        try:
            from py2exe.distutils_buildexe import py2exe as _py2exe  # py3
        except ImportError:
            from py2exe.build_exe import py2exe as _py2exe  # py2

        class cmd_py2exe(_py2exe):
            def run(self):
                _run_with_short_version_file(_py2exe.run, self)

        cmds["py2exe"] = cmd_py2exe

    # we override different "sdist" commands for both environments
    if "setuptools" in sys.modules:
        from setuptools.command.sdist import sdist as _sdist
    else:
        try:
            from distutils.command.sdist import sdist as _sdist
        except ImportError:
            # no distutils available: fall back to the setuptools command
            from setuptools.command.sdist import sdist as _sdist

    class cmd_sdist(_sdist):
        def run(self):
            versions = get_versions()
            self._versioneer_generated_versions = versions
            # unless we update this, the command will keep using the old
            # version
            self.distribution.metadata.version = versions["version"]
            return _sdist.run(self)

        def make_release_tree(self, base_dir, files):
            root = get_root()
            cfg = get_config_from_root(root)
            _sdist.make_release_tree(self, base_dir, files)
            # now locate _version.py in the new base_dir directory
            # (remembering that it may be a hardlink) and replace it with an
            # updated value
            target_versionfile = os.path.join(base_dir, cfg.versionfile_source)
            print("UPDATING %s" % target_versionfile)
            write_to_version_file(
                target_versionfile, self._versioneer_generated_versions
            )

    cmds["sdist"] = cmd_sdist

    return cmds
# Help text shown by do_setup() when setup.cfg has no usable [versioneer]
# configuration.
CONFIG_ERROR = """
setup.cfg is missing the necessary Versioneer configuration. You need
a section like:

 [versioneer]
 VCS = git
 style = pep440
 versionfile_source = src/myproject/_version.py
 versionfile_build = myproject/_version.py
 tag_prefix =
 parentdir_prefix = myproject-

You will also need to edit your setup.py to use the results:

 import versioneer
 setup(version=versioneer.get_version(),
       cmdclass=versioneer.get_cmdclass(), ...)

Please read the docstring in ./versioneer.py for configuration instructions,
edit setup.cfg, and re-run the installer or 'python versioneer.py setup'.
"""

# Commented-out section that do_setup() appends to setup.cfg when the file or
# its [versioneer] section is missing.
SAMPLE_CONFIG = """
# See the docstring in versioneer.py for instructions. Note that you must
# re-run 'versioneer.py setup' after changing this section, and commit the
# resulting files.

[versioneer]
#VCS = git
#style = pep440
#versionfile_source =
#versionfile_build =
#tag_prefix =
#parentdir_prefix =

"""

# Lines that do_setup() appends to the package's __init__.py so that it
# exposes __version__.
INIT_PY_SNIPPET = """
from ._version import get_versions
__version__ = get_versions()['version']
del get_versions
"""


def do_setup():
    """Main VCS-independent setup function for installing Versioneer.

    Writes the long _version.py, hooks it into the package __init__.py when
    one exists, makes sure MANIFEST.in includes the needed files, and then
    runs the VCS-specific installation step.

    Returns 0 on success and 1 when setup.cfg lacks a usable configuration.
    """
    root = get_root()
    config_problems = (
        EnvironmentError,
        configparser.NoSectionError,
        configparser.NoOptionError,
    )
    try:
        cfg = get_config_from_root(root)
    except config_problems as exc:
        # only a missing file or section gets the sample appended; a missing
        # option means the section is already there
        if isinstance(exc, (EnvironmentError, configparser.NoSectionError)):
            print("Adding sample versioneer config to setup.cfg", file=sys.stderr)
            with open(os.path.join(root, "setup.cfg"), "a") as cfg_file:
                cfg_file.write(SAMPLE_CONFIG)
        print(CONFIG_ERROR, file=sys.stderr)
        return 1

    print(" creating %s" % cfg.versionfile_source)
    substitutions = {
        "DOLLAR": "$",
        "STYLE": cfg.style,
        "TAG_PREFIX": cfg.tag_prefix,
        "PARENTDIR_PREFIX": cfg.parentdir_prefix,
        "VERSIONFILE_SOURCE": cfg.versionfile_source,
    }
    with open(cfg.versionfile_source, "w") as version_file:
        template = LONG_VERSION_PY[cfg.VCS]
        version_file.write(template % substitutions)

    ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py")
    if not os.path.exists(ipy):
        print(" %s doesn't exist, ok" % ipy)
        ipy = None
    else:
        try:
            with open(ipy, "r") as init_file:
                old = init_file.read()
        except EnvironmentError:
            old = ""
        if INIT_PY_SNIPPET in old:
            print(" %s unmodified" % ipy)
        else:
            print(" appending to %s" % ipy)
            with open(ipy, "a") as init_file:
                init_file.write(INIT_PY_SNIPPET)

    # Make sure both the top-level "versioneer.py" and versionfile_source
    # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so
    # they'll be copied into source distributions. Pip won't be able to
    # install the package without this.
    manifest_in = os.path.join(root, "MANIFEST.in")
    simple_includes = set()
    try:
        with open(manifest_in, "r") as manifest:
            for entry in manifest:
                if entry.startswith("include "):
                    simple_includes.update(entry.split()[1:])
    except EnvironmentError:
        pass
    # That doesn't cover everything MANIFEST.in can do
    # (http://docs.python.org/2/distutils/sourcedist.html#commands), so
    # it might give some false negatives. Appending redundant 'include'
    # lines is safe, though.
    if "versioneer.py" in simple_includes:
        print(" 'versioneer.py' already in MANIFEST.in")
    else:
        print(" appending 'versioneer.py' to MANIFEST.in")
        with open(manifest_in, "a") as manifest:
            manifest.write("include versioneer.py\n")
    if cfg.versionfile_source in simple_includes:
        print(" versionfile_source already in MANIFEST.in")
    else:
        print(
            " appending versionfile_source ('%s') to MANIFEST.in"
            % cfg.versionfile_source
        )
        with open(manifest_in, "a") as manifest:
            manifest.write("include %s\n" % cfg.versionfile_source)

    # Make VCS-specific changes. For git, this means creating/changing
    # .gitattributes to mark _version.py for export-subst keyword
    # substitution.
    do_vcs_install(manifest_in, cfg.versionfile_source, ipy)
    return 0
def scan_setup_py():
    """Validate the contents of setup.py against Versioneer's expectations.

    Reads "setup.py" from the current directory and returns the number of
    problems found (0, 1 or 2), printing advice for each one.
    """
    # substring looked for in a line -> marker recorded once it is seen
    required = (
        ("import versioneer", "import"),
        ("versioneer.get_cmdclass()", "cmdclass"),
        ("versioneer.get_version()", "get_version"),
    )
    obsolete = ("versioneer.VCS", "versioneer.versionfile_source")

    seen = set()
    has_setters = False
    with open("setup.py", "r") as setup_file:
        for text in setup_file:
            seen.update(marker for needle, marker in required if needle in text)
            if any(needle in text for needle in obsolete):
                has_setters = True

    problems = 0
    if len(seen) != 3:
        missing_advice = (
            "",
            "Your setup.py appears to be missing some important items",
            "(but I might be wrong). Please make sure it has something",
            "roughly like the following:",
            "",
            " import versioneer",
            " setup( version=versioneer.get_version(),",
            "        cmdclass=versioneer.get_cmdclass(),  ...)",
            "",
        )
        for message in missing_advice:
            print(message)
        problems += 1
    if has_setters:
        setter_advice = (
            "You should remove lines like 'versioneer.VCS = ' and",
            "'versioneer.versionfile_source = ' . This configuration",
            "now lives in setup.cfg, and should be removed from setup.py",
            "",
        )
        for message in setter_advice:
            print(message)
        problems += 1
    return problems
if __name__ == "__main__":
    # Running the script without a sub-command used to die with IndexError;
    # treat it like any other unrecognised command and do nothing.
    cmd = sys.argv[1] if len(sys.argv) > 1 else None
    if cmd == "setup":
        errors = do_setup()
        errors += scan_setup_py()
        if errors:
            sys.exit(1)