smart_open-5.2.1/.github/FUNDING.yml

# These are supported funding model platforms

github: [piskvorky] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
patreon: # Replace with a single Patreon username
open_collective: # Replace with a single Open Collective username
ko_fi: # Replace with a single Ko-fi username
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
liberapay: # Replace with a single Liberapay username
issuehunt: # Replace with a single IssueHunt username
otechie: # Replace with a single Otechie username
custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']

smart_open-5.2.1/.github/ISSUE_TEMPLATE.md

#### Problem description

Be sure your description clearly answers the following questions:

- What are you trying to achieve?
- What is the expected result?
- What are you seeing instead?

#### Steps/code to reproduce the problem

In order for us to be able to solve your problem, we have to be able to reproduce it on our end.
Without reproducing the problem, it is unlikely that we'll be able to help you.

Include full tracebacks, logs and datasets if necessary.
Please keep the examples minimal ([minimal reproducible example](https://stackoverflow.com/help/minimal-reproducible-example)).

#### Versions

Please provide the output of:

```python
import platform, sys, smart_open
print(platform.platform())
print("Python", sys.version)
print("smart_open", smart_open.__version__)
```

#### Checklist

Before you create the issue, please make sure you have:

- [ ] Described the problem clearly
- [ ] Provided a minimal reproducible example, including any required data
- [ ] Provided the version numbers of the relevant software

smart_open-5.2.1/.github/PULL_REQUEST_TEMPLATE.md

#### Title

Please **pick a concise, informative and complete title** for your PR.
The title is important because it will appear in [our change log](https://github.com/RaRe-Technologies/smart_open/blob/master/CHANGELOG.md).

#### Motivation

Please explain the motivation behind this PR in the description.

If you're fixing a bug, link to the issue number like so:

```
- Fixes #{issue_number}
```

If you're adding a new feature, then consider opening a ticket and discussing it with the maintainers before you actually do the hard work.

#### Tests

If you're fixing a bug, consider [test-driven development](https://en.wikipedia.org/wiki/Test-driven_development):

1. Create a unit test that demonstrates the bug. The test should **fail**.
2. Implement your bug fix.
3. The test you created should now **pass**.

If you're implementing a new feature, include unit tests for it.

Make sure all existing unit tests pass.
You can run them locally using:

    pytest smart_open

If there are any failures, please fix them before creating the PR (or mark it as WIP, see below).
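For example, a regression test for a bug fix might look something like the sketch below (the test name, file name and contents are purely illustrative, not taken from the existing test suite):

```python
import smart_open


def test_roundtrip_local_file(tmp_path):
    # tmp_path is pytest's built-in temporary-directory fixture
    path = str(tmp_path / "hello.txt")

    # write through smart_open, then read it back
    with smart_open.open(path, "w") as fout:
        fout.write("hello world")

    with smart_open.open(path) as fin:
        assert fin.read() == "hello world"
```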
#### Work in progress

If you're still working on your PR, include "WIP" in the title.
We'll skip reviewing it for the time being.
Once you're ready for review, remove the "WIP" from the title, and ping one of the maintainers (e.g. mpenkov).

#### Checklist

Before you create the PR, please make sure you have:

- [ ] Picked a concise, informative and complete title
- [ ] Clearly explained the motivation behind the PR
- [ ] Linked to any existing issues that your PR will be solving
- [ ] Included tests for any new functionality
- [ ] Checked that all unit tests pass

#### Workflow

Please avoid rebasing and force-pushing to the branch of the PR once a review is in progress.
Rebasing can make your commits look a bit cleaner, but it also makes life more difficult for the reviewer, because they are no longer able to distinguish between code that has already been reviewed and unreviewed code.

smart_open-5.2.1/.github/workflows/python-package-win.yml

name: Test under Windows
on: [push, pull_request]
jobs:
  build:
    runs-on: windows-2019
    strategy:
      matrix:
        include:
          - python-version: '3.6'
            toxenv: "py36-doctest"
          - python-version: '3.6'
            toxenv: "py36-test"
          - python-version: '3.7'
            toxenv: "py37-doctest"
          - python-version: '3.7'
            toxenv: "py37-test"
          - python-version: '3.8'
            toxenv: "py38-doctest"
          - python-version: '3.8'
            toxenv: "py38-test"
          - python-version: '3.9'
            toxenv: "py39-doctest"
          - python-version: '3.9'
            toxenv: "py39-test"
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Update pip
        run: python -m pip install -U pip
      - name: Install tox
        run: python -m pip install tox
      - name: Test using Tox
        run: tox

smart_open-5.2.1/.github/workflows/python-package.yml

name: Test
on: [push, pull_request]
jobs:
  build:
    env:
      BOTO_CONFIG: "/dev/null"
      SO_BUCKET: smart-open
    runs-on: ubuntu-latest
    strategy:
      matrix:
        include:
          - python-version: '3.6'
            toxenv: "check_keys,py36-doctest,py36-test,py36-benchmark,py36-integration"
            result_key: benchmark-results-py36
          - python-version: '3.7'
            toxenv: "check_keys,py37-doctest,enable_moto_server,py37-test,py37-benchmark,py37-integration,disable_moto_server"
            enable_moto_server: "1"
          - python-version: '3.8'
            toxenv: "check_keys,py38-doctest,test_coverage,py38-integration"
            coveralls: true
          - python-version: '3.9'
            toxenv: "check_keys,py39-doctest,test_coverage,py39-integration"
            coveralls: true
          - python-version: '3.8'
            toxenv: "flake8"
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Update pip
        run: python -m pip install -U pip
      - name: Install tox
        run: python -m pip install tox
      - name: Test using Tox
        env:
          SO_RESULT_KEY: ${{ matrix.result_key }}
          SO_ENABLE_MOTO_SERVER: ${{ matrix.enable_moto_server }}
          TOXENV: ${{ matrix.toxenv }}
        run: tox

      # The test_coverage environment in tox.ini generates coverage data and
      # saves it to disk. This step uploads that data. We do it
      # separately from the tox env because the upload can fail for various
      # reasons (e.g. https://github.com/lemurheavy/coveralls-public/issues/1392)
      # and we don't want it to break the build.
# # Looks like there's a github action for this # (https://github.com/coverallsapp/github-action/issues/30) but it does # not work with pytest output. # - name: Upload code coverage to coveralls.io if: ${{ matrix.coveralls }} continue-on-error: true env: GITHUB_TOKEN: ${{ github.token }} run: | pip install coveralls coveralls smart_open-5.2.1/.github/workflows/release.yml000066400000000000000000000025131411241424400214240ustar00rootroot00000000000000name: Release to PyPI on: push: tags: - 'v*.*.*' jobs: tarball: if: github.event_name == 'push' timeout-minutes: 1 runs-on: ubuntu-20.04 env: PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} steps: - uses: actions/checkout@v1 - uses: actions/setup-python@v1 with: python-version: "3.8.x" # https://github.community/t/how-to-get-just-the-tag-name/16241/4 - name: Extract the version number id: get_version run: | echo ::set-output name=V::$(python smart_open/version.py) - name: Install dependencies run: | python -m pip install --upgrade pip python -m venv venv . venv/bin/activate pip install twine wheel - name: Build and upload tarball to PyPI run: | . venv/bin/activate python setup.py sdist twine upload dist/smart_open-${{ steps.get_version.outputs.V }}.tar.gz -u ${{ env.PYPI_USERNAME }} -p ${{ env.PYPI_PASSWORD }} - name: Build and upload wheel to PyPI run: | . venv/bin/activate python setup.py bdist_wheel twine upload dist/smart_open-${{ steps.get_version.outputs.V }}-py3-none-any.whl -u ${{ env.PYPI_USERNAME }} -p ${{ env.PYPI_PASSWORD }} smart_open-5.2.1/.gitignore000066400000000000000000000013551411241424400156570ustar00rootroot00000000000000# Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] # C extensions *.so # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ lib/ lib64/ parts/ sdist/ var/ *.egg-info/ .installed.cfg *.egg # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .cache nosetests.xml coverage.xml # Translations *.mo *.pot # Django stuff: *.log # Sphinx documentation docs/_build/ # PyBuilder target/ # vim *.swp *.swo # PyCharm .idea/ # VSCode .vscode/ # env files .env smart_open-5.2.1/CHANGELOG.md000066400000000000000000000711521411241424400155020ustar00rootroot00000000000000# Unreleased # 5.2.1, 28 August 2021 - make HTTP/S seeking less strict (PR [#646](https://github.com/RaRe-Technologies/smart_open/pull/646), [@mpenkov](https://github.com/mpenkov)) # 5.2.0, 18 August 2021 - Work around changes to `urllib.parse.urlsplit` (PR [#633](https://github.com/RaRe-Technologies/smart_open/pull/633), [@judahrand](https://github.com/judahrand)) - New blob_properties transport parameter for GCS (PR [#632](https://github.com/RaRe-Technologies/smart_open/pull/632), [@FHTheron](https://github.com/FHTheron)) - Don't leak compressed stream (PR [#636](https://github.com/RaRe-Technologies/smart_open/pull/636), [@ampanasiuk](https://github.com/ampanasiuk)) - Change python_requires version to fix PEP 440 issue (PR [#639](https://github.com/RaRe-Technologies/smart_open/pull/639), [@lucasvieirasilva](https://github.com/lucasvieirasilva)) - New max_concurrency transport parameter for azure (PR [#642](https://github.com/RaRe-Technologies/smart_open/pull/642), [@omBratteng](https://github.com/omBratteng)) # 5.1.0, 25 May 2021 This release introduces a new top-level parameter: `compression`. It controls compression behavior and partially overlaps with the old `ignore_ext` parameter. For details, see the README.rst file. You may continue to use `ignore_ext` parameter for now, but it will be deprecated in the next major release. - Add warning for recently deprecated s3 parameters (PR [#618](https://github.com/RaRe-Technologies/smart_open/pull/618), [@mpenkov](https://github.com/mpenkov)) - Add new top-level compression parameter (PR [#609](https://github.com/RaRe-Technologies/smart_open/pull/609), [@dmcguire81](https://github.com/dmcguire81)) - Drop mock dependency; standardize on unittest.mock (PR [#621](https://github.com/RaRe-Technologies/smart_open/pull/621), [@musicinmybrain](https://github.com/musicinmybrain)) - Fix to_boto3 method (PR [#619](https://github.com/RaRe-Technologies/smart_open/pull/619), [@mpenkov](https://github.com/mpenkov)) # 5.0.0, 30 Mar 2021 This release modifies the handling of transport parameters for the S3 back-end in a backwards-incompatible way. See [the migration docs](MIGRATING_FROM_OLDER_VERSIONS.rst) for details. 
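To give the gist here (the full recipes live in the migration guide), code that used to hand `smart_open` a boto3 session now hands it a client built from that session. The URL and the default `boto3.Session()` below are placeholders:

    import boto3
    import smart_open

    session = boto3.Session()  # or Session(profile_name=...), etc.

    # smart_open < 5.0.0: pass the session itself
    smart_open.open('s3://bucket/key', transport_params={'session': session})

    # smart_open >= 5.0.0: pass a low-level client created from the session
    smart_open.open('s3://bucket/key', transport_params={'client': session.client('s3')})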
- Refactor S3, replace high-level resource/session API with low-level client API (PR [#583](https://github.com/RaRe-Technologies/smart_open/pull/583), [@mpenkov](https://github.com/mpenkov)) - Fix potential infinite loop when reading from webhdfs (PR [#597](https://github.com/RaRe-Technologies/smart_open/pull/597), [@traboukos](https://github.com/traboukos)) - Add timeout parameter for http/https (PR [#594](https://github.com/RaRe-Technologies/smart_open/pull/594), [@dustymugs](https://github.com/dustymugs)) - Remove `tests` directory from package (PR [#589](https://github.com/RaRe-Technologies/smart_open/pull/589), [@e-nalepa](https://github.com/e-nalepa)) # 4.2.0, 15 Feb 2021 - Support tell() for text mode write on s3/gcs/azure (PR [#582](https://github.com/RaRe-Technologies/smart_open/pull/582), [@markopy](https://github.com/markopy)) - Implement option to use a custom buffer during S3 writes (PR [#547](https://github.com/RaRe-Technologies/smart_open/pull/547), [@mpenkov](https://github.com/mpenkov)) # 4.1.2, 18 Jan 2021 - Correctly pass boto3 resource to writers (PR [#576](https://github.com/RaRe-Technologies/smart_open/pull/576), [@jackluo923](https://github.com/jackluo923)) - Improve robustness of S3 reading (PR [#552](https://github.com/RaRe-Technologies/smart_open/pull/552), [@mpenkov](https://github.com/mpenkov)) - Replace codecs with TextIOWrapper to fix newline issues when reading text files (PR [#578](https://github.com/RaRe-Technologies/smart_open/pull/578), [@markopy](https://github.com/markopy)) # 4.1.0, 30 Dec 2020 - Refactor `s3` submodule to minimize resource usage (PR [#569](https://github.com/RaRe-Technologies/smart_open/pull/569), [@mpenkov](https://github.com/mpenkov)) - Change `download_as_string` to `download_as_bytes` in `gcs` submodule (PR [#571](https://github.com/RaRe-Technologies/smart_open/pull/571), [@alexandreyc](https://github.com/alexandreyc)) # 4.0.1, 27 Nov 2020 - Exclude `requests` from `install_requires` dependency list. If you need it, use `pip install smart_open[http]` or `pip install smart_open[webhdfs]`. # 4.0.0, 24 Nov 2020 - Fix reading empty file or seeking past end of file for s3 backend (PR [#549](https://github.com/RaRe-Technologies/smart_open/pull/549), [@jcushman](https://github.com/jcushman)) - Fix handling of rt/wt mode when working with gzip compression (PR [#559](https://github.com/RaRe-Technologies/smart_open/pull/559), [@mpenkov](https://github.com/mpenkov)) - Bump minimum Python version to 3.6 (PR [#562](https://github.com/RaRe-Technologies/smart_open/pull/562), [@mpenkov](https://github.com/mpenkov)) # 3.0.0, 8 Oct 2020 This release modifies the behavior of setup.py with respect to dependencies. Previously, `boto3` and other AWS-related packages were installed by default. Now, in order to install them, you need to run either: pip install smart_open[s3] to install the AWS dependencies only, or pip install smart_open[all] to install all dependencies, including AWS, GCS, etc. # 2.2.1, 1 Oct 2020 - Include S3 dependencies by default, because removing them in the 2.2.0 minor release was a mistake. # 2.2.0, 25 Sep 2020 This release modifies the behavior of setup.py with respect to dependencies. Previously, `boto3` and other AWS-related packages were installed by default. Now, in order to install them, you need to run either: pip install smart_open[s3] to install the AWS dependencies only, or pip install smart_open[all] to install all dependencies, including AWS, GCS, etc. 
Summary of changes: - Correctly pass `newline` parameter to built-in `open` function (PR [#478](https://github.com/RaRe-Technologies/smart_open/pull/478), [@burkovae](https://github.com/burkovae)) - Remove boto as a dependency (PR [#523](https://github.com/RaRe-Technologies/smart_open/pull/523), [@isobit](https://github.com/isobit)) - Performance improvement: avoid redundant GetObject API queries in s3.Reader (PR [#495](https://github.com/RaRe-Technologies/smart_open/pull/495), [@jcushman](https://github.com/jcushman)) - Support installing smart_open without AWS dependencies (PR [#534](https://github.com/RaRe-Technologies/smart_open/pull/534), [@justindujardin](https://github.com/justindujardin)) - Take object version into account in `to_boto3` method (PR [#539](https://github.com/RaRe-Technologies/smart_open/pull/539), [@interpolatio](https://github.com/interpolatio)) ## Deprecations Functionality on the left hand side will be removed in future releases. Use the functions on the right hand side instead. - `smart_open.s3_iter_bucket` → `smart_open.s3.iter_bucket` # 2.1.1, 27 Aug 2020 - Bypass unnecessary GCS storage.buckets.get permission (PR [#516](https://github.com/RaRe-Technologies/smart_open/pull/516), [@gelioz](https://github.com/gelioz)) - Allow SFTP connection with SSH key (PR [#522](https://github.com/RaRe-Technologies/smart_open/pull/522), [@rostskadat](https://github.com/rostskadat)) # 2.1.0, 1 July 2020 - Azure storage blob support ([@nclsmitchell](https://github.com/nclsmitchell) and [@petedannemann](https://github.com/petedannemann)) - Correctly pass `newline` parameter to built-in `open` function (PR [#478](https://github.com/RaRe-Technologies/smart_open/pull/478), [@burkovae](https://github.com/burkovae)) - Ensure GCS objects always have a .name attribute (PR [#506](https://github.com/RaRe-Technologies/smart_open/pull/506), [@todor-markov](https://github.com/todor-markov)) - Use exception chaining to convey the original cause of the exception (PR [#508](https://github.com/RaRe-Technologies/smart_open/pull/508), [@cool-RR](https://github.com/cool-RR)) # 2.0.0, 27 April 2020, "Python 3" - **This version supports Python 3 only** (3.5+). - If you still need Python 2, install the smart_open==1.10.1 legacy release instead. - Prevent smart_open from writing to logs on import (PR [#476](https://github.com/RaRe-Technologies/smart_open/pull/476), [@mpenkov](https://github.com/mpenkov)) - Modify setup.py to explicitly support only Py3.5 and above (PR [#471](https://github.com/RaRe-Technologies/smart_open/pull/471), [@Amertz08](https://github.com/Amertz08)) - Include all the test_data in setup.py (PR [#473](https://github.com/RaRe-Technologies/smart_open/pull/473), [@sikuan](https://github.com/sikuan)) # 1.10.1, 26 April 2020 - This is the last version to support Python 2.7. Versions 1.11 and above will support Python 3 only. - Use only if you need Python 2. 
# 1.11.1, 8 Apr 2020 - Add missing boto dependency (Issue [#468](https://github.com/RaRe-Technologies/smart_open/issues/468)) # 1.11.0, 8 Apr 2020 - Fix GCS multiple writes (PR [#421](https://github.com/RaRe-Technologies/smart_open/pull/421), [@petedannemann](https://github.com/petedannemann)) - Implemented efficient readline for ByteBuffer (PR [#426](https://github.com/RaRe-Technologies/smart_open/pull/426), [@mpenkov](https://github.com/mpenkov)) - Fix WebHDFS read method (PR [#433](https://github.com/RaRe-Technologies/smart_open/pull/433), [@mpenkov](https://github.com/mpenkov)) - Make S3 uploads more robust (PR [#434](https://github.com/RaRe-Technologies/smart_open/pull/434), [@mpenkov](https://github.com/mpenkov)) - Add pathlib monkeypatch with replacement of `pathlib.Path.open` (PR [#436](https://github.com/RaRe-Technologies/smart_open/pull/436), [@menshikh-iv](https://github.com/menshikh-iv)) - Fix error when calling str() or repr() on GCS SeekableBufferedInputBase (PR [#442](https://github.com/RaRe-Technologies/smart_open/pull/442), [@robcowie](https://github.com/robcowie)) - Move optional dependencies to extras (PR [#454](https://github.com/RaRe-Technologies/smart_open/pull/454), [@Amertz08](https://github.com/Amertz08)) - Correctly handle GCS paths that contain '?' char (PR [#460](https://github.com/RaRe-Technologies/smart_open/pull/460), [@chakruperitus](https://github.com/chakruperitus)) - Make our doctools submodule more robust (PR [#467](https://github.com/RaRe-Technologies/smart_open/pull/467), [@mpenkov](https://github.com/mpenkov)) Starting with this release, you will have to run: pip install smart_open[gcs] to use the GCS transport. In the future, all extra dependencies will be optional. If you want to continue installing all of them, use: pip install smart_open[all] See the README.rst for details. # 1.10.0, 16 Mar 2020 - Various webhdfs improvements (PR [#383](https://github.com/RaRe-Technologies/smart_open/pull/383), [@mrk-its](https://github.com/mrk-its)) - Fixes "the connection was closed by the remote peer" error (PR [#389](https://github.com/RaRe-Technologies/smart_open/pull/389), [@Gapex](https://github.com/Gapex)) - allow use of S3 single part uploads (PR [#400](https://github.com/RaRe-Technologies/smart_open/pull/400), [@adrpar](https://github.com/adrpar)) - Add test data in package via MANIFEST.in (PR [#401](https://github.com/RaRe-Technologies/smart_open/pull/401), [@jayvdb](https://github.com/jayvdb)) - Google Cloud Storage (GCS) (PR [#404](https://github.com/RaRe-Technologies/smart_open/pull/404), [@petedannemann](https://github.com/petedannemann)) - Implement to_boto3 function for S3 I/O. 
(PR [#405](https://github.com/RaRe-Technologies/smart_open/pull/405), [@mpenkov](https://github.com/mpenkov)) - enable smart_open to operate without docstrings (PR [#406](https://github.com/RaRe-Technologies/smart_open/pull/406), [@mpenkov](https://github.com/mpenkov)) - Implement object_kwargs parameter (PR [#411](https://github.com/RaRe-Technologies/smart_open/pull/411), [@mpenkov](https://github.com/mpenkov)) - Remove dependency on old boto library (PR [#413](https://github.com/RaRe-Technologies/smart_open/pull/413), [@mpenkov](https://github.com/mpenkov)) - implemented efficient readline for ByteBuffer (PR [#426](https://github.com/RaRe-Technologies/smart_open/pull/426), [@mpenkov](https://github.com/mpenkov)) - improve buffering efficiency (PR [#427](https://github.com/RaRe-Technologies/smart_open/pull/427), [@mpenkov](https://github.com/mpenkov)) - fix WebHDFS read method (PR [#433](https://github.com/RaRe-Technologies/smart_open/pull/433), [@mpenkov](https://github.com/mpenkov)) - Make S3 uploads more robust (PR [#434](https://github.com/RaRe-Technologies/smart_open/pull/434), [@mpenkov](https://github.com/mpenkov)) # 1.9.0, 3 Nov 2019 - Add version_id transport parameter for fetching a specific S3 object version (PR [#325](https://github.com/RaRe-Technologies/smart_open/pull/325), [@interpolatio](https://github.com/interpolatio)) - Document passthrough use case (PR [#333](https://github.com/RaRe-Technologies/smart_open/pull/333), [@mpenkov](https://github.com/mpenkov)) - Support seeking over HTTP and HTTPS (PR [#339](https://github.com/RaRe-Technologies/smart_open/pull/339), [@interpolatio](https://github.com/interpolatio)) - Add support for rt, rt+, wt, wt+, at, at+ methods (PR [#342](https://github.com/RaRe-Technologies/smart_open/pull/342), [@interpolatio](https://github.com/interpolatio)) - Change VERSION to version.py (PR [#349](https://github.com/RaRe-Technologies/smart_open/pull/349), [@mpenkov](https://github.com/mpenkov)) - Adding howto guides (PR [#355](https://github.com/RaRe-Technologies/smart_open/pull/355), [@mpenkov](https://github.com/mpenkov)) - smart_open/s3: Initial implementations of str and repr (PR [#359](https://github.com/RaRe-Technologies/smart_open/pull/359), [@ZlatSic](https://github.com/ZlatSic)) - Support writing any bytes-like object to S3. 
(PR [#361](https://github.com/RaRe-Technologies/smart_open/pull/361), [@gilbsgilbs](https://github.com/gilbsgilbs)) # 1.8.4, 2 Jun 2019 - Don't use s3 bucket_head to check for bucket existence (PR [#315](https://github.com/RaRe-Technologies/smart_open/pull/315), [@caboteria](https://github.com/caboteria)) - Dont list buckets in s3 tests (PR [#318](https://github.com/RaRe-Technologies/smart_open/pull/318), [@caboteria](https://github.com/caboteria)) - Use warnings.warn instead of logger.warning (PR [#321](https://github.com/RaRe-Technologies/smart_open/pull/321), [@mpenkov](https://github.com/mpenkov)) - Optimize reading from S3 (PR [#322](https://github.com/RaRe-Technologies/smart_open/pull/322), [@mpenkov](https://github.com/mpenkov)) # 1.8.3, 26 April 2019 - Improve S3 read performance by not copying buffer (PR [#284](https://github.com/RaRe-Technologies/smart_open/pull/284), [@aperiodic](https://github.com/aperiodic)) - accept bytearray and memoryview as input to write in s3 submodule (PR [#293](https://github.com/RaRe-Technologies/smart_open/pull/293), [@bmizhen-exos](https://github.com/bmizhen-exos)) - Fix two S3 bugs (PR [#307](https://github.com/RaRe-Technologies/smart_open/pull/307), [@mpenkov](https://github.com/mpenkov)) - Minor fixes: bz2file dependency, paramiko warning handling (PR [#309](https://github.com/RaRe-Technologies/smart_open/pull/309), [@mpenkov](https://github.com/mpenkov)) - improve unit tests (PR [#310](https://github.com/RaRe-Technologies/smart_open/pull/310), [@mpenkov](https://github.com/mpenkov)) # 1.8.2, 17 April 2019 - Removed dependency on lzma (PR [#262](https://github.com/RaRe-Technologies/smart_open/pull/282), [@tdhopper](https://github.com/tdhopper)) - backward compatibility fixes (PR [#294](https://github.com/RaRe-Technologies/smart_open/pull/294), [@mpenkov](https://github.com/mpenkov)) - Minor fixes (PR [#291](https://github.com/RaRe-Technologies/smart_open/pull/291), [@mpenkov](https://github.com/mpenkov)) - Fix #289: the smart_open package now correctly exposes a `__version__` attribute - Fix #285: handle edge case with question marks in an S3 URL This release rolls back support for transparently decompressing .xz files, previously introduced in 1.8.1. This is a useful feature, but it requires a tricky dependency. It's still possible to handle .xz files with relatively little effort. Please see the [README.rst](https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#supported-compression-formats) file for details. 
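For reference, the recipe described there boils down to registering your own compressor for the `.xz` extension (a sketch; it relies only on the standard-library `lzma` module and smart_open's `register_compressor` hook):

    import lzma

    from smart_open import open, register_compressor

    def _handle_xz(file_obj, mode):
        return lzma.LZMAFile(filename=file_obj, mode=mode, format=lzma.FORMAT_XZ)

    register_compressor('.xz', _handle_xz)

After that, `open()` on a file ending in `.xz` decompresses transparently, just like `.gz` and `.bz2`.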
# 1.8.1, 6 April 2019 - Added support for .xz / lzma (PR [#262](https://github.com/RaRe-Technologies/smart_open/pull/262), [@vmarkovtsev](https://github.com/vmarkovtsev)) - Added streaming HTTP support (PR [#236](https://github.com/RaRe-Technologies/smart_open/pull/236), [@handsomezebra](https://github.com/handsomezebra)) - Fix handling of "+" mode, refactor tests (PR [#263](https://github.com/RaRe-Technologies/smart_open/pull/263), [@vmarkovtsev](https://github.com/vmarkovtsev)) - Added support for SSH/SCP/SFTP (PR [#58](https://github.com/RaRe-Technologies/smart_open/pull/58), [@val314159](https://github.com/val314159) & [@mpenkov](https://github.com/mpenkov)) - Added new feature: compressor registry (PR [#266](https://github.com/RaRe-Technologies/smart_open/pull/266), [@mpenkov](https://github.com/mpenkov)) - Implemented new `smart_open.open` function (PR [#268](https://github.com/RaRe-Technologies/smart_open/pull/268), [@mpenkov](https://github.com/mpenkov)) ## smart_open.open This new function replaces `smart_open.smart_open`, which is now deprecated. Main differences: - ignore_extension → ignore_ext - new `transport_params` dict parameter to contain keyword parameters for the transport layer (S3, HTTPS, HDFS, etc). Main advantages of the new function: - Simpler interface for the user, less parameters - Greater API flexibility: adding additional keyword arguments will no longer require updating the top-level interface - Better documentation for keyword parameters (previously, they were documented via examples only) The old `smart_open.smart_open` function is deprecated, but continues to work as previously. # 1.8.0, 17th January 2019 - Add `python3.7` support (PR [#240](https://github.com/RaRe-Technologies/smart_open/pull/240), [@menshikh-iv](https://github.com/menshikh-iv)) - Add `http/https` schema correctly (PR [#242](https://github.com/RaRe-Technologies/smart_open/pull/242), [@gliv](https://github.com/gliv)) - Fix url parsing for `S3` (PR [#235](https://github.com/RaRe-Technologies/smart_open/pull/235), [@rileypeterson](https://github.com/rileypeterson)) - Clean up `_parse_uri_s3x`, resolve edge cases (PR [#237](https://github.com/RaRe-Technologies/smart_open/pull/237), [@mpenkov](https://github.com/mpenkov)) - Handle leading slash in local path edge case (PR [#238](https://github.com/RaRe-Technologies/smart_open/pull/238), [@mpenkov](https://github.com/mpenkov)) - Roll back README changes (PR [#239](https://github.com/RaRe-Technologies/smart_open/pull/239), [@mpenkov](https://github.com/mpenkov)) - Add example how to work with Digital Ocean spaces and boto profile (PR [#248](https://github.com/RaRe-Technologies/smart_open/pull/248), [@navado](https://github.com/@navado) & [@mpenkov](https://github.com/mpenkov)) - Fix boto fail to load gce plugin (PR [#255](https://github.com/RaRe-Technologies/smart_open/pull/255), [@menshikh-iv](https://github.com/menshikh-iv)) - Drop deprecated `sudo` from travis config (PR [#256](https://github.com/RaRe-Technologies/smart_open/pull/256), [@cclauss](https://github.com/cclauss)) - Raise `ValueError` if s3 key does not exist (PR [#245](https://github.com/RaRe-Technologies/smart_open/pull/245), [@adrpar](https://github.com/adrpar)) - Ensure `_list_bucket` uses continuation token for subsequent pages (PR [#246](https://github.com/RaRe-Technologies/smart_open/pull/246), [@tcsavage](https://github.com/tcsavage)) # 1.7.1, 18th September 2018 - Unpin boto/botocore for regular installation. 
Fix #227 (PR [#232](https://github.com/RaRe-Technologies/smart_open/pull/232), [@menshikh-iv](https://github.com/menshikh-iv)) # 1.7.0, 18th September 2018 - Drop support for `python3.3` and `python3.4` & workaround for broken `moto` (PR [#225](https://github.com/RaRe-Technologies/smart_open/pull/225), [@menshikh-iv](https://github.com/menshikh-iv)) - Add `s3a://` support for `S3`. Fix #210 (PR [#229](https://github.com/RaRe-Technologies/smart_open/pull/229), [@mpenkov](https://github.com/mpenkov)) - Allow use `@` in object (key) names for `S3`. Fix #94 (PRs [#204](https://github.com/RaRe-Technologies/smart_open/pull/204) & [#224](https://github.com/RaRe-Technologies/smart_open/pull/224), [@dkasyanov](https://github.com/dkasyanov) & [@mpenkov](https://github.com/mpenkov)) - Make `close` idempotent & add dummy `flush` for `S3` (PR [#212](https://github.com/RaRe-Technologies/smart_open/pull/212), [@mpenkov](https://github.com/mpenkov)) - Use built-in `open` whenever possible. Fix #207 (PR [#208](https://github.com/RaRe-Technologies/smart_open/pull/208), [@mpenkov](https://github.com/mpenkov)) - Fix undefined name `uri` in `smart_open_lib.py`. Fix #213 (PR [#214](https://github.com/RaRe-Technologies/smart_open/pull/214), [@cclauss](https://github.com/cclauss)) - Fix new unittests from [#212](https://github.com/RaRe-Technologies/smart_open/pull/212) (PR [#219](https://github.com/RaRe-Technologies/smart_open/pull/219), [@mpenkov](https://github.com/mpenkov)) - Reorganize README & make examples py2/py3 compatible (PR [#211](https://github.com/RaRe-Technologies/smart_open/pull/211), [@piskvorky](https://github.com/piskvorky)) # 1.6.0, 29th June 2018 - Migrate to `boto3`. Fix #43 (PR [#164](https://github.com/RaRe-Technologies/smart_open/pull/164), [@mpenkov](https://github.com/mpenkov)) - Refactoring smart_open to share compression and encoding functionality (PR [#185](https://github.com/RaRe-Technologies/smart_open/pull/185), [@mpenkov](https://github.com/mpenkov)) - Drop `python2.6` compatibility. Fix #156 (PR [#192](https://github.com/RaRe-Technologies/smart_open/pull/192), [@mpenkov](https://github.com/mpenkov)) - Accept a custom `boto3.Session` instance (support STS AssumeRole). Fix #130, #149, #199 (PR [#201](https://github.com/RaRe-Technologies/smart_open/pull/201), [@eschwartz](https://github.com/eschwartz)) - Accept `multipart_upload` parameters (supports ServerSideEncryption) for `S3`. Fix (PR [#202](https://github.com/RaRe-Technologies/smart_open/pull/202), [@eschwartz](https://github.com/eschwartz)) - Add support for `pathlib.Path`. Fix #170 (PR [#175](https://github.com/RaRe-Technologies/smart_open/pull/175), [@clintval](https://github.com/clintval)) - Fix performance regression using local file-system. Fix #184 (PR [#190](https://github.com/RaRe-Technologies/smart_open/pull/190), [@mpenkov](https://github.com/mpenkov)) - Replace `ParsedUri` class with functions, cleanup internal argument parsing (PR [#191](https://github.com/RaRe-Technologies/smart_open/pull/191), [@mpenkov](https://github.com/mpenkov)) - Handle edge case (read 0 bytes) in read function. Fix #171 (PR [#193](https://github.com/RaRe-Technologies/smart_open/pull/193), [@mpenkov](https://github.com/mpenkov)) - Fix bug with changing `f._current_pos` when call `f.readline()` (PR [#182](https://github.com/RaRe-Technologies/smart_open/pull/182), [@inksink](https://github.com/inksink)) - Сlose the old body explicitly after `seek` for `S3`. 
Fix #187 (PR [#188](https://github.com/RaRe-Technologies/smart_open/pull/188), [@inksink](https://github.com/inksink)) # 1.5.7, 18th March 2018 - Fix author/maintainer fields in `setup.py`, avoid bug from `setuptools==39.0.0` and add workaround for `botocore` and `python==3.3`. Fix #176 (PR [#178](https://github.com/RaRe-Technologies/smart_open/pull/178) & [#177](https://github.com/RaRe-Technologies/smart_open/pull/177), [@menshikh-iv](https://github.com/menshikh-iv) & [@baldwindc](https://github.com/baldwindc)) # 1.5.6, 28th December 2017 - Improve S3 read performance. Fix #152 (PR [#157](https://github.com/RaRe-Technologies/smart_open/pull/157), [@mpenkov](https://github.com/mpenkov)) - Add integration testing + benchmark with real S3. Partial fix #151, #156 (PR [#158](https://github.com/RaRe-Technologies/smart_open/pull/158), [@menshikh-iv](https://github.com/menshikh-iv) & [@mpenkov](https://github.com/mpenkov)) - Disable integration testing if secure vars isn't defined (PR [#157](https://github.com/RaRe-Technologies/smart_open/pull/158), [@menshikh-iv](https://github.com/menshikh-iv)) # 1.5.5, 6th December 2017 - Fix problems from 1.5.4 release. Fix #153, #154 , partial fix #152 (PR [#155](https://github.com/RaRe-Technologies/smart_open/pull/155), [@mpenkov](https://github.com/mpenkov)) # 1.5.4, 30th November 2017 - Add naitive .gz support for HDFS (PR [#128](https://github.com/RaRe-Technologies/smart_open/pull/128), [@yupbank](https://github.com/yupbank)) - Drop python2.6 support + fix style (PR [#137](https://github.com/RaRe-Technologies/smart_open/pull/137), [@menshikh-iv](https://github.com/menshikh-iv)) - Create separate compression-specific layer. Fix [#91](https://github.com/RaRe-Technologies/smart_open/issues/91) (PR [#131](https://github.com/RaRe-Technologies/smart_open/pull/131), [@mpenkov](https://github.com/mpenkov)) - Fix ResourceWarnings + replace deprecated assertEquals (PR [#140](https://github.com/RaRe-Technologies/smart_open/pull/140), [@horpto](https://github.com/horpto)) - Add encoding parameter to smart_open. Fix [#142](https://github.com/RaRe-Technologies/smart_open/issues/142) (PR [#143](https://github.com/RaRe-Technologies/smart_open/pull/143), [@mpenkov](https://github.com/mpenkov)) - Add encoding tests for readers. Fix [#145](https://github.com/RaRe-Technologies/smart_open/issues/145), partial fix [#146](https://github.com/RaRe-Technologies/smart_open/issues/146) (PR [#147](https://github.com/RaRe-Technologies/smart_open/pull/147), [@mpenkov](https://github.com/mpenkov)) - Fix file mode for updating case (PR [#150](https://github.com/RaRe-Technologies/smart_open/pull/150), [@menshikh-iv](https://github.com/menshikh-iv)) # 1.5.3, 18th May 2017 - Remove GET parameters from url. Fix #120 (PR #121, @mcrowson) # 1.5.2, 12th Apr 2017 - Enable compressed formats over http. Avoid filehandle leak. Fix #109 and #110. (PR #112, @robottwo ) - Make possible to change number of retries (PR #102, @shaform) # 1.5.1, 16th Mar 2017 - Bugfix for compressed formats (PR #110, @tmylk) # 1.5.0, 14th Mar 2017 - HTTP/HTTPS read support w/ Kerberos (PR #107, @robottwo) # 1.4.0, 13th Feb 2017 - HdfsOpenWrite implementation similar to read (PR #106, @skibaa) - Support custom S3 server host, port, ssl. (PR #101, @robottwo) - Add retry around `s3_iter_bucket_process_key` to address S3 Read Timeout errors. (PR #96, @bbbco) - Include tests data in sdist + install them. 
(PR #105, @cournape) # 1.3.5, 5th October 2016 # - Add MANIFEST.in required for conda-forge recip (PR #90, @tmylk) - Fix #92. Allow hash in filename (PR #93, @tmylk) # 1.3.4, 26th August 2016 - Relative path support (PR #73, @yupbank) - Move gzipstream module to smart_open package (PR #81, @mpenkov) - Ensure reader objects never return None (PR #81, @mpenkov) - Ensure read functions never return more bytes than asked for (PR #84, @mpenkov) - Add support for reading gzipped objects until EOF, e.g. read() (PR #81, @mpenkov) - Add missing parameter to read_from_buffer call (PR #84, @mpenkov) - Add unit tests for gzipstream (PR #84, @mpenkov) - Bundle gzipstream to enable streaming of gzipped content from S3 (PR #73, @mpenkov) - Update gzipstream to avoid deep recursion (PR #73, @mpenkov) - Implemented readline for S3 (PR #73, @mpenkov) - Added pip requirements.txt (PR #73, @mpenkov) - Invert NO_MULTIPROCESSING flag (PR #79, @Janrain-Colin) - Add ability to add query to webhdfs uri. (PR #78, @ellimilial) # 1.3.3, 16th May 2016 - Accept an instance of boto.s3.key.Key to smart_open (PR #38, @asieira) - Allow passing `encrypt_key` and other parameters to `initiate_multipart_upload` (PR #63, @asieira) - Allow passing boto `host` and `profile_name` to smart_open (PR #71 #68, @robcowie) - Write an empty key to S3 even if nothing is written to S3OpenWrite (PR #61, @petedmarsh) - Support `LC_ALL=C` environment variable setup (PR #40, @nikicc) - Python 3.5 support # 1.3.2, 3rd January 2016 - Bug fix release to enable 'wb+' file mode (PR #50) # 1.3.1, 18th December 2015 - Disable multiprocessing if unavailable. Allows to run on Google Compute Engine. (PR #41, @nikicc) - Httpretty updated to allow LC_ALL=C locale config. (PR #39, @jsphpl) - Accept an instance of boto.s3.key.Key (PR #38, @asieira) # 1.3.0, 19th September 2015 - WebHDFS read/write (PR #29, @ziky90) - re-upload last S3 chunk in failed upload (PR #20, @andreycizov) - return the entire key in s3_iter_bucket instead of only the key name (PR #22, @salilb) - pass optional keywords on S3 write (PR #30, @val314159) - smart_open a no-op if passed a file-like object with a read attribute (PR #32, @gojomo) - various improvements to testing (PR #30, @val314159) # 1.1.0, 1st February 2015 - support for multistream bzip files (PR #9, @pombredanne) - introduce this CHANGELOG smart_open-5.2.1/LICENSE000066400000000000000000000020741411241424400146730ustar00rootroot00000000000000The MIT License (MIT) Copyright (c) 2015 Radim Řehůřek Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
smart_open-5.2.1/MANIFEST.in000066400000000000000000000001421411241424400154160ustar00rootroot00000000000000include LICENSE include README.rst include MIGRATING_FROM_OLDER_VERSIONS.rst include CHANGELOG.md smart_open-5.2.1/MIGRATING_FROM_OLDER_VERSIONS.rst000066400000000000000000000225601411241424400210430ustar00rootroot00000000000000Migrating to the new client-based S3 API ======================================== Version of smart_open prior to 5.0.0 used the boto3 `resource API`_ for communicating with S3. This API was easy to integrate for smart_open developers, but this came at a cost: it was not thread- or multiprocess-safe. Furthermore, as smart_open supported more and more options, the transport parameter list grew, making it less maintainable. Starting with version 5.0.0, smart_open uses the `client API`_ instead of the resource API. Functionally, very little changes for the smart_open user. The only difference is in passing transport parameters to the S3 backend. More specifically, the following S3 transport parameters are no longer supported: - `multipart_upload_kwargs` - `object_kwargs` - `resource` - `resource_kwargs` - `session` - `singlepart_upload_kwargs` **If you weren't using the above parameters, nothing changes for you.** However, if you were using any of the above, then you need to adjust your code. Here are some quick recipes below. If you were previously passing `session`, then construct an S3 client from the session and pass that instead. For example, before: .. code-block:: python smart_open.open('s3://bucket/key', transport_params={'session': session}) After: .. code-block:: python smart_open.open('s3://bucket/key', transport_params={'client': session.client('s3')}) If you were passing `resource`, then replace the resource with a client, and pass that instead. For example, before: .. code-block:: python resource = session.resource('s3', **resource_kwargs) smart_open.open('s3://bucket/key', transport_params={'resource': resource}) After: .. code-block:: python client = session.client('s3') smart_open.open('s3://bucket/key', transport_params={'client': client}) If you were passing any of the `*_kwargs` parameters, you will need to include them in `client_kwargs`, keeping in mind the following transformations. ========================== ====================================== ========================== Parameter name Resource API method Client API function ========================== ====================================== ========================== `multipart_upload_kwargs` `S3.Object.initiate_multipart_upload`_ `S3.Client.create_multipart_upload`_ `object_kwargs` `S3.Object.get`_ `S3.Client.get_object`_ `resource_kwargs` S3.resource `S3.client`_ `singlepart_upload_kwargs` `S3.Object.put`_ `S3.Client.put_object`_ ========================== ====================================== ========================== Most of the above is self-explanatory, with the exception of `resource_kwargs`. These were previously used mostly for passing a custom endpoint URL. The `client_kwargs` dict can thus contain the following members: - `S3.Client`: initializer parameters, e.g. those to pass directly to the `boto3.client` function, such as `endpoint_url`. - `S3.Client.create_multipart_upload` - `S3.Client.get_object` - `S3.Client.put_object` Here's a before-and-after example for connecting to a custom endpoint. Before: .. 
code-block:: python session = boto3.Session(profile_name='digitalocean') resource_kwargs = {'endpoint_url': 'https://ams3.digitaloceanspaces.com'} with open('s3://bucket/key.txt', 'wb', transport_params={'resource_kwarg': resource_kwargs}) as fout: fout.write(b'here we stand') After: .. code-block:: python session = boto3.Session(profile_name='digitalocean') client = session.client('s3', endpoint_url='https://ams3.digitaloceanspaces.com') with open('s3://bucket/key.txt', 'wb', transport_params={'client': client}) as fout: fout.write(b'here we stand') See `README `_ and `HOWTO `_ for more examples. .. _resource API: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#service-resource .. _S3.Object.initiate_multipart_upload: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Object.initiate_multipart_upload .. _S3.Object.get: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.ObjectSummary.get .. _S3.Object.put: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.ObjectSummary.put .. _client API: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#client .. _S3.Client: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#client .. _S3.Client.create_multipart_upload: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.create_multipart_upload .. _S3.Client.get_object: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.get_object .. _S3.Client.put_object: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.put_object Migrating to the new dependency management subsystem ==================================================== Smart_open has grown over the years to cover a lot of different storages, each with a different set of library dependencies. Not everybody needs *all* of them, so to make each smart_open installation leaner and faster, version 3.0.0 introduced a new, backward-incompatible installation method: * smart_open < 3.0.0: All dependencies were installed by default. No way to select just a subset during installation. * smart_open >= 3.0.0: No dependencies installed by default. Install the ones you need with e.g. ``pip install smart_open[s3]`` (only AWS), or ``smart_open[all]`` (install everything = same behaviour as < 3.0.0; use this for backward compatibility). You can read more about the motivation and internal discussions for this change `here `_. Migrating to the new ``open`` function ====================================== Since 1.8.1, there is a ``smart_open.open`` function that replaces ``smart_open.smart_open``. The new function offers several advantages over the old one: - 100% compatible with the built-in ``open`` function (aka ``io.open``): it accepts all the parameters that the built-in ``open`` accepts. - The default open mode is now "r", the same as for the built-in ``open``. The default for the old ``smart_open.smart_open`` function used to be "rb". - Fully documented keyword parameters (try ``help("smart_open.open")``) The instructions below will help you migrate to the new function painlessly. First, update your imports: .. 
code-block:: python >>> from smart_open import smart_open # before >>> from smart_open import open # after In general, ``smart_open`` uses ``io.open`` directly, where possible, so if your code already uses ``open`` for local file I/O, then it will continue to work. If you want to continue using the built-in ``open`` function for e.g. debugging, then you can ``import smart_open`` and use ``smart_open.open``. **The default read mode is now "r" (read text).** If your code was implicitly relying on the default mode being "rb" (read binary), you'll need to update it and pass "rb" explicitly. Before: .. code-block:: python >>> import smart_open >>> smart_open.smart_open('s3://commoncrawl/robots.txt').read(32) # 'rb' used to be the default b'User-Agent: *\nDisallow: /' After: .. code-block:: python >>> import smart_open >>> smart_open.open('s3://commoncrawl/robots.txt', 'rb').read(32) b'User-Agent: *\nDisallow: /' The ``ignore_extension`` keyword parameter is now called ``ignore_ext``. It behaves identically otherwise. The most significant change is in the handling on keyword parameters for the transport layer, e.g. HTTP, S3, etc. The old function accepted these directly: .. code-block:: python >>> url = 's3://smart-open-py37-benchmark-results/test.txt' >>> session = boto3.Session(profile_name='smart_open') >>> smart_open.smart_open(url, 'r', session=session).read(32) 'first line\nsecond line\nthird lin' The new function accepts a ``transport_params`` keyword argument. It's a dict. Put your transport parameters in that dictionary. .. code-block:: python >>> url = 's3://smart-open-py37-benchmark-results/test.txt' >>> params = {'session': boto3.Session(profile_name='smart_open')} >>> open(url, 'r', transport_params=params).read(32) 'first line\nsecond line\nthird lin' Renamed parameters: - ``s3_upload`` -> ``multipart_upload_kwargs`` - ``s3_session`` -> ``session`` Removed parameters: - ``profile_name`` **The profile_name parameter has been removed.** Pass an entire ``boto3.Session`` object instead. Before: .. code-block:: python >>> url = 's3://smart-open-py37-benchmark-results/test.txt' >>> smart_open.smart_open(url, 'r', profile_name='smart_open').read(32) 'first line\nsecond line\nthird lin' After: .. code-block:: python >>> url = 's3://smart-open-py37-benchmark-results/test.txt' >>> params = {'session': boto3.Session(profile_name='smart_open')} >>> open(url, 'r', transport_params=params).read(32) 'first line\nsecond line\nthird lin' See ``help("smart_open.open")`` for the full list of acceptable parameter names, or view the help online `here `__. If you pass an invalid parameter name, the ``smart_open.open`` function will warn you about it. Keep an eye on your logs for WARNING messages from ``smart_open``. smart_open-5.2.1/README.rst000066400000000000000000000475711411241424400153700ustar00rootroot00000000000000====================================================== smart_open — utils for streaming large files in Python ====================================================== |License|_ |GHA|_ |Coveralls|_ |Downloads|_ .. |License| image:: https://img.shields.io/pypi/l/smart_open.svg .. |GHA| image:: https://github.com/RaRe-Technologies/smart_open/workflows/Test/badge.svg .. |Coveralls| image:: https://coveralls.io/repos/github/RaRe-Technologies/smart_open/badge.svg?branch=develop .. |Downloads| image:: https://pepy.tech/badge/smart-open/month .. _License: https://github.com/RaRe-Technologies/smart_open/blob/master/LICENSE .. 
_GHA: https://github.com/RaRe-Technologies/smart_open/actions?query=workflow%3ATest .. _Coveralls: https://coveralls.io/github/RaRe-Technologies/smart_open?branch=HEAD .. _Downloads: https://pypi.org/project/smart-open/ What? ===== ``smart_open`` is a Python 3 library for **efficient streaming of very large files** from/to storages such as S3, GCS, Azure Blob Storage, HDFS, WebHDFS, HTTP, HTTPS, SFTP, or local filesystem. It supports transparent, on-the-fly (de-)compression for a variety of different formats. ``smart_open`` is a drop-in replacement for Python's built-in ``open()``: it can do anything ``open`` can (100% compatible, falls back to native ``open`` wherever possible), plus lots of nifty extra stuff on top. **Python 2.7 is no longer supported. If you need Python 2.7, please use** `smart_open 1.10.1 `_, **the last version to support Python 2.** Why? ==== Working with large remote files, for example using Amazon's `boto3 `_ Python library, is a pain. ``boto3``'s ``Object.upload_fileobj()`` and ``Object.download_fileobj()`` methods require gotcha-prone boilerplate to use successfully, such as constructing file-like object wrappers. ``smart_open`` shields you from that. It builds on boto3 and other remote storage libraries, but offers a **clean unified Pythonic API**. The result is less code for you to write and fewer bugs to make. How? ===== ``smart_open`` is well-tested, well-documented, and has a simple Pythonic API: .. _doctools_before_examples: .. code-block:: python >>> from smart_open import open >>> >>> # stream lines from an S3 object >>> for line in open('s3://commoncrawl/robots.txt'): ... print(repr(line)) ... break 'User-Agent: *\n' >>> # stream from/to compressed files, with transparent (de)compression: >>> for line in open('smart_open/tests/test_data/1984.txt.gz', encoding='utf-8'): ... print(repr(line)) 'It was a bright cold day in April, and the clocks were striking thirteen.\n' 'Winston Smith, his chin nuzzled into his breast in an effort to escape the vile\n' 'wind, slipped quickly through the glass doors of Victory Mansions, though not\n' 'quickly enough to prevent a swirl of gritty dust from entering along with him.\n' >>> # can use context managers too: >>> with open('smart_open/tests/test_data/1984.txt.gz') as fin: ... with open('smart_open/tests/test_data/1984.txt.bz2', 'w') as fout: ... for line in fin: ... fout.write(line) 74 80 78 79 >>> # can use any IOBase operations, like seek >>> with open('s3://commoncrawl/robots.txt', 'rb') as fin: ... for line in fin: ... print(repr(line.decode('utf-8'))) ... break ... offset = fin.seek(0) # seek to the beginning ... print(fin.read(4)) 'User-Agent: *\n' b'User' >>> # stream from HTTP >>> for line in open('http://example.com/index.html'): ... print(repr(line)) ... break '\n' .. _doctools_after_examples: Other examples of URLs that ``smart_open`` accepts:: s3://my_bucket/my_key s3://my_key:my_secret@my_bucket/my_key s3://my_key:my_secret@my_server:my_port@my_bucket/my_key gs://my_bucket/my_blob azure://my_bucket/my_blob hdfs:///path/file hdfs://path/file webhdfs://host:port/path/file ./local/path/file ~/local/path/file local/path/file ./local/path/file.gz file:///home/user/file file:///home/user/file.bz2 [ssh|scp|sftp]://username@host//path/file [ssh|scp|sftp]://username@host/path/file [ssh|scp|sftp]://username:password@host/path/file Documentation ============= Installation ------------ ``smart_open`` supports a wide range of storage solutions, including AWS S3, Google Cloud and Azure. 
Each individual solution has its own dependencies. By default, ``smart_open`` does not install any dependencies, in order to keep the installation size small. You can install these dependencies explicitly using:: pip install smart_open[azure] # Install Azure deps pip install smart_open[gcs] # Install GCS deps pip install smart_open[s3] # Install S3 deps Or, if you don't mind installing a large number of third party libraries, you can install all dependencies using:: pip install smart_open[all] Be warned that this option increases the installation size significantly, e.g. over 100MB. If you're upgrading from ``smart_open`` versions 2.x and below, please check out the `Migration Guide `_. Built-in help ------------- For detailed API info, see the online help: .. code-block:: python help('smart_open') or click `here `__ to view the help in your browser. More examples ------------- For the sake of simplicity, the examples below assume you have all the dependencies installed, i.e. you have done:: pip install smart_open[all] .. code-block:: python >>> import os, boto3 >>> >>> # stream content *into* S3 (write mode) using a custom session >>> session = boto3.Session( ... aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'], ... aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'], ... ) >>> url = 's3://smart-open-py37-benchmark-results/test.txt' >>> with open(url, 'wb', transport_params={'client': session.client('s3')}) as fout: ... bytes_written = fout.write(b'hello world!') ... print(bytes_written) 12 .. code-block:: python # stream from HDFS for line in open('hdfs://user/hadoop/my_file.txt', encoding='utf8'): print(line) # stream from WebHDFS for line in open('webhdfs://host:port/user/hadoop/my_file.txt'): print(line) # stream content *into* HDFS (write mode): with open('hdfs://host:port/user/hadoop/my_file.txt', 'wb') as fout: fout.write(b'hello world') # stream content *into* WebHDFS (write mode): with open('webhdfs://host:port/user/hadoop/my_file.txt', 'wb') as fout: fout.write(b'hello world') # stream from a completely custom s3 server, like s3proxy: for line in open('s3u://user:secret@host:port@mybucket/mykey.txt'): print(line) # Stream to Digital Ocean Spaces bucket providing credentials from boto3 profile session = boto3.Session(profile_name='digitalocean') client = session.client('s3', endpoint_url='https://ams3.digitaloceanspaces.com') transport_params = {'client': client} with open('s3://bucket/key.txt', 'wb', transport_params=transport_params) as fout: fout.write(b'here we stand') # stream from GCS for line in open('gs://my_bucket/my_file.txt'): print(line) # stream content *into* GCS (write mode): with open('gs://my_bucket/my_file.txt', 'wb') as fout: fout.write(b'hello world') # stream from Azure Blob Storage connect_str = os.environ['AZURE_STORAGE_CONNECTION_STRING'] transport_params = { 'client': azure.storage.blob.BlobServiceClient.from_connection_string(connect_str), } for line in open('azure://mycontainer/myfile.txt', transport_params=transport_params): print(line) # stream content *into* Azure Blob Storage (write mode): connect_str = os.environ['AZURE_STORAGE_CONNECTION_STRING'] transport_params = { 'client': azure.storage.blob.BlobServiceClient.from_connection_string(connect_str), } with open('azure://mycontainer/my_file.txt', 'wb', transport_params=transport_params) as fout: fout.write(b'hello world') Compression Handling -------------------- The top-level `compression` parameter controls compression/decompression behavior when reading and writing. 
The supported values for this parameter are: - ``infer_from_extension`` (default behavior) - ``disable`` - ``.gz`` - ``.bz2`` By default, ``smart_open`` determines the compression algorithm to use based on the file extension. .. code-block:: python >>> from smart_open import open, register_compressor >>> with open('smart_open/tests/test_data/1984.txt.gz') as fin: ... print(fin.read(32)) It was a bright cold day in Apri You can override this behavior to either disable compression, or explicitly specify the algorithm to use. To disable compression: .. code-block:: python >>> from smart_open import open, register_compressor >>> with open('smart_open/tests/test_data/1984.txt.gz', 'rb', compression='disable') as fin: ... print(fin.read(32)) b'\x1f\x8b\x08\x08\x85F\x94\\\x00\x031984.txt\x005\x8f=r\xc3@\x08\x85{\x9d\xe2\x1d@' To specify the algorithm explicitly (e.g. for non-standard file extensions): .. code-block:: python >>> from smart_open import open, register_compressor >>> with open('smart_open/tests/test_data/1984.txt.gzip', compression='.gz') as fin: ... print(fin.read(32)) It was a bright cold day in Apri You can also easily add support for other file extensions and compression formats. For example, to open xz-compressed files: .. code-block:: python >>> import lzma, os >>> from smart_open import open, register_compressor >>> def _handle_xz(file_obj, mode): ... return lzma.LZMAFile(filename=file_obj, mode=mode, format=lzma.FORMAT_XZ) >>> register_compressor('.xz', _handle_xz) >>> with open('smart_open/tests/test_data/1984.txt.xz') as fin: ... print(fin.read(32)) It was a bright cold day in Apri ``lzma`` is in the standard library in Python 3.3 and greater. For 2.7, use `backports.lzma`_. .. _backports.lzma: https://pypi.org/project/backports.lzma/ Transport-specific Options -------------------------- ``smart_open`` supports a wide range of transport options out of the box, including: - S3 - HTTP, HTTPS (read-only) - SSH, SCP and SFTP - WebHDFS - GCS - Azure Blob Storage Each option involves setting up its own set of parameters. For example, for accessing S3, you often need to set up authentication, like API keys or a profile name. ``smart_open``'s ``open`` function accepts a keyword argument ``transport_params`` which accepts additional parameters for the transport layer. Here are some examples of using this parameter: .. code-block:: python >>> import boto3 >>> fin = open('s3://commoncrawl/robots.txt', transport_params=dict(client=boto3.client('s3'))) >>> fin = open('s3://commoncrawl/robots.txt', transport_params=dict(buffer_size=1024)) For the full list of keyword arguments supported by each transport option, see the documentation: .. code-block:: python help('smart_open.open') S3 Credentials -------------- ``smart_open`` uses the ``boto3`` library to talk to S3. ``boto3`` has several `mechanisms `__ for determining the credentials to use. By default, ``smart_open`` will defer to ``boto3`` and let the latter take care of the credentials. There are several ways to override this behavior. The first is to pass a ``boto3.Client`` object as a transport parameter to the ``open`` function. You can customize the credentials when constructing the session for the client. ``smart_open`` will then use the session when talking to S3. .. code-block:: python session = boto3.Session( aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY, aws_session_token=SESSION_TOKEN, ) client = session.client('s3', endpoint_url=..., config=...) 
    fin = open('s3://bucket/key', transport_params=dict(client=client))

Your second option is to specify the credentials within the S3 URL itself:

.. code-block:: python

    fin = open('s3://aws_access_key_id:aws_secret_access_key@bucket/key', ...)

*Important*: The two methods above are **mutually exclusive**. If you pass an AWS client *and* the URL contains credentials, ``smart_open`` will ignore the latter.

*Important*: ``smart_open`` ignores configuration files from the older ``boto`` library. Port your old ``boto`` settings to ``boto3`` in order to use them with ``smart_open``.

Iterating Over an S3 Bucket's Contents
--------------------------------------

Since going over all (or select) keys in an S3 bucket is a very common operation, there's also an extra function ``smart_open.s3.iter_bucket()`` that does this efficiently, **processing the bucket keys in parallel** (using multiprocessing):

.. code-block:: python

    >>> from smart_open import s3
    >>> # get data corresponding to 2010 and later under "silo-open-data/annual/monthly_rain"
    >>> # we use workers=1 for reproducibility; you should use as many workers as you have cores
    >>> bucket = 'silo-open-data'
    >>> prefix = 'annual/monthly_rain/'
    >>> for key, content in s3.iter_bucket(bucket, prefix=prefix, accept_key=lambda key: '/201' in key, workers=1, key_limit=3):
    ...     print(key, round(len(content) / 2**20))
    annual/monthly_rain/2010.monthly_rain.nc 13
    annual/monthly_rain/2011.monthly_rain.nc 13
    annual/monthly_rain/2012.monthly_rain.nc 13

GCS Credentials
---------------

``smart_open`` uses the ``google-cloud-storage`` library to talk to GCS. ``google-cloud-storage`` uses the ``google-cloud`` package under the hood to handle authentication. There are several `options `__ to provide credentials. By default, ``smart_open`` will defer to ``google-cloud-storage`` and let it take care of the credentials.

To override this behavior, pass a ``google.cloud.storage.Client`` object as a transport parameter to the ``open`` function. You can `customize the credentials `__ when constructing the client. ``smart_open`` will then use the client when talking to GCS. To follow along with the example below, `refer to Google's guide `__ to setting up GCS authentication with a service account.

.. code-block:: python

    import os
    from google.cloud.storage import Client
    service_account_path = os.environ['GOOGLE_APPLICATION_CREDENTIALS']
    client = Client.from_service_account_json(service_account_path)
    fin = open('gs://gcp-public-data-landsat/index.csv.gz', transport_params=dict(client=client))

If you need more credential options, you can create an explicit ``google.auth.credentials.Credentials`` object and pass it to the Client. To create an API token for use in the example below, refer to the `GCS authentication guide `__.

.. code-block:: python

    import os
    from google.auth.credentials import Credentials
    from google.cloud.storage import Client
    token = os.environ['GOOGLE_API_TOKEN']
    credentials = Credentials(token=token)
    client = Client(credentials=credentials)
    fin = open('gs://gcp-public-data-landsat/index.csv.gz', transport_params=dict(client=client))

Azure Credentials
-----------------

``smart_open`` uses the ``azure-storage-blob`` library to talk to Azure Blob Storage. By default, ``smart_open`` will defer to ``azure-storage-blob`` and let it take care of the credentials.

Azure Blob Storage does not have any way of inferring credentials; therefore, you must pass an ``azure.storage.blob.BlobServiceClient`` object as a transport parameter to the ``open`` function.
You can `customize the credentials `__ when constructing the client. ``smart_open`` will then use the client when talking to Azure Blob Storage. To follow along with the example below, `refer to Azure's guide `__ to setting up authentication.

.. code-block:: python

    import os
    from azure.storage.blob import BlobServiceClient
    azure_storage_connection_string = os.environ['AZURE_STORAGE_CONNECTION_STRING']
    client = BlobServiceClient.from_connection_string(azure_storage_connection_string)
    fin = open('azure://my_container/my_blob.txt', transport_params=dict(client=client))

If you need more credential options, refer to the `Azure Storage authentication guide `__.

File-like Binary Streams
------------------------

The ``open`` function also accepts file-like objects. This is useful when you already have a `binary file `_ open, and would like to wrap it with transparent decompression:

.. code-block:: python

    >>> import io, gzip
    >>>
    >>> # Prepare some gzipped binary data in memory, as an example.
    >>> # Any binary file will do; we're using BytesIO here for simplicity.
    >>> buf = io.BytesIO()
    >>> with gzip.GzipFile(fileobj=buf, mode='w') as fout:
    ...     _ = fout.write(b'this is a bytestring')
    >>> _ = buf.seek(0)
    >>>
    >>> # Use case starts here.
    >>> buf.name = 'file.gz'  # add a .name attribute so smart_open knows what compressor to use
    >>> import smart_open
    >>> smart_open.open(buf, 'rb').read()  # will gzip-decompress transparently!
    b'this is a bytestring'

In this case, ``smart_open`` relied on the ``.name`` attribute of our `binary I/O stream `_ ``buf`` object to determine which decompressor to use. If your file object doesn't have one, set the ``.name`` attribute to an appropriate value. Furthermore, that value has to end with a **known** file extension (see the ``register_compressor`` function). Otherwise, the transparent decompression will not occur.

Drop-in replacement of ``pathlib.Path.open``
--------------------------------------------

``smart_open.open`` can also be used with ``Path`` objects. The built-in `Path.open()` is not able to read text from compressed files, so use ``patch_pathlib`` to replace it with `smart_open.open()` instead. This can be helpful when e.g. working with compressed files.

.. code-block:: python

    >>> from pathlib import Path
    >>> from smart_open.smart_open_lib import patch_pathlib
    >>>
    >>> _ = patch_pathlib()  # replace `Path.open` with `smart_open.open`
    >>>
    >>> path = Path("smart_open/tests/test_data/crime-and-punishment.txt.gz")
    >>>
    >>> with path.open("r") as infile:
    ...     print(infile.readline()[:41])
    В начале июля, в чрезвычайно жаркое время

How do I ...?
=============

See `this document `__.

Extending ``smart_open``
========================

See `this document `__.

Testing ``smart_open``
======================

``smart_open`` comes with a comprehensive suite of unit tests. Before you can run the test suite, install the test dependencies::

    pip install -e .[test]

Now, you can run the unit tests::

    pytest smart_open

The tests are also run automatically with `Travis CI `_ on every commit push & pull request.

Comments, bug reports
=====================

``smart_open`` lives on `Github `_. You can file issues or pull requests there. Suggestions, pull requests and improvements welcome!

----------------

``smart_open`` is open source software released under the `MIT license `_.
Copyright (c) 2015-now `Radim Řehůřek `_.
smart_open-5.2.1/benchmark/000077500000000000000000000000001411241424400156155ustar00rootroot00000000000000smart_open-5.2.1/benchmark/read_s3.py000066400000000000000000000007011411241424400175050ustar00rootroot00000000000000import sys import boto3 import smart_open urls = [line.strip() for line in sys.stdin] tp = {} if 'create_session_and_resource' in sys.argv: tp['session'] = boto3.Session() tp['resource'] = tp['session'].resource('s3') elif 'create_resource' in sys.argv: tp['resource'] = boto3.resource('s3') elif 'create_session' in sys.argv: tp['session'] = boto3.Session() for url in urls: smart_open.open(url, transport_params=tp).read() smart_open-5.2.1/extending.md000066400000000000000000000122071411241424400161740ustar00rootroot00000000000000# Extending `smart_open` This document targets potential contributors to `smart_open`. Currently, there are two main directions for extending existing `smart_open` functionality: 1. Add a new transport mechanism 2. Add a new compression format The first is by far the more challenging, and also the more welcome. ## New transport mechanisms Each transport mechanism lives in its own submodule. For example, currently we have: - [smart_open.local_file](smart_open/local_file.py) - [smart_open.s3](smart_open/s3.py) - [smart_open.ssh](smart_open/ssh.py) - ... and others So, to implement a new transport mechanism, you need to create a new module. Your module must expose the following (see [smart_open.http](smart_open/http.py) for the full implementation): ```python SCHEMA = ... """The name of the mechanism, e.g. s3, ssh, etc. This is the part that goes before the `://` in a URL, e.g. `s3://`.""" URI_EXAMPLES = ('xxx://foo/bar', 'zzz://baz/boz') """This will appear in the documentation of the the `parse_uri` function.""" MISSING_DEPS = False """Wrap transport-specific imports in a try/catch and set this to True if any imports are not found. Seting MISSING_DEPS to True will cause the library to suggest installing its dependencies with an example pip command. If your transport has no external dependencies, you can omit this variable. """ def parse_uri(uri_as_str): """Parse the specified URI into a dict. At a bare minimum, the dict must have `schema` member. """ return dict(schema=XXX_SCHEMA, ...) def open_uri(uri_as_str, mode, transport_params): """Return a file-like object pointing to the URI. Parameters: uri_as_str: str The URI to open mode: str Either "rb" or "wb". You don't need to implement text modes, `smart_open` does that for you, outside of the transport layer. transport_params: dict Any additional parameters to pass to the `open` function (see below). """ # # Parse the URI using parse_uri # Consolidate the parsed URI with transport_params, if needed # Pass everything to the open function (see below). # ... def open(..., mode, param1=None, param2=None, paramN=None): """This function does the hard work. The keyword parameters are the transport_params from the `open_uri` function. """ ... ``` Have a look at the existing mechanisms to see how they work. You may define other functions and classes as necessary for your implementation. Once your module is working, register it in the [smart_open.transport](smart_open/transport.py) submodule. The `register_transport()` function updates a mapping from schemes to the modules that implement functionality for them. Once you've registered your new transport module, the following will happen automagically: 1. `smart_open` will be able to open any URI supported by your module 2. 
The docstring for the `smart_open.open` function will contain a section detailing the parameters for your transport module. 3. The docstring for the `parse_uri` function will include the schemas and examples supported by your module. You can confirm the documentation changes by running: python -c 'help("smart_open")' and verify that documentation for your new submodule shows up. ### What's the difference between the `open_uri` and `open` functions? There are several key differences between the two. First, the parameters to `open_uri` are the same for _all transports_. On the other hand, the parameters to the `open` function can differ from transport to transport. Second, the responsibilities of the two functions are also different. The `open` function opens the remote object. The `open_uri` function deals with parsing transport-specific details out of the URI, and then delegates to `open`. The `open` function contains documentation for transport parameters. This documentation gets parsed by the `doctools` module and appears in various docstrings. Some of these differences are by design; others as a consequence of evolution. ## New compression mechanisms The compression layer is self-contained in the `smart_open.compression` submodule. To add support for a new compressor: - Create a new function to handle your compression format (given an extension) - Add your compressor to the registry For example: ```python def _handle_xz(file_obj, mode): import lzma return lzma.LZMAFile(filename=file_obj, mode=mode, format=lzma.FORMAT_XZ) register_compressor('.xz', _handle_xz) ``` There are many compression formats out there, and supporting all of them is beyond the scope of `smart_open`. We want our code's functionality to cover the bare minimum required to satisfy 80% of our users. We leave the remaining 20% of users with the ability to deal with compression in their own code, using the trivial mechanism described above. Documentation ------------- Once you've contributed your extension, please add it to the documentation so that it is discoverable for other users. Some notable files: - setup.py: See the `description` keyword. Not all contributions will affect this. - README.rst - howto.md (if your extension solves a specific problem that doesn't get covered by other documentation) smart_open-5.2.1/help.txt000066400000000000000000000306521411241424400153620ustar00rootroot00000000000000Help on package smart_open: NAME smart_open DESCRIPTION Utilities for streaming to/from several file-like data storages: S3 / HDFS / local filesystem / compressed files, and many more, using a simple, Pythonic API. The streaming makes heavy use of generators and pipes, to avoid loading full file contents into memory, allowing work with arbitrarily large files. The main functions are: * `open()`, which opens the given file for reading/writing * `parse_uri()` * `s3_iter_bucket()`, which goes over all keys in an S3 bucket in parallel * `register_compressor()`, which registers callbacks for transparent compressor handling PACKAGE CONTENTS azure bytebuffer compression concurrency constants doctools gcs hdfs http local_file s3 smart_open_lib ssh tests (package) transport utils version webhdfs FUNCTIONS open(uri, mode='r', buffering=-1, encoding=None, errors=None, newline=None, closefd=True, opener=None, ignore_ext=False, transport_params=None) Open the URI object, returning a file-like object. The URI is usually a string in a variety of formats. For a full list of examples, see the :func:`parse_uri` function. 
The URI may also be one of: - an instance of the pathlib.Path class - a stream (anything that implements io.IOBase-like functionality) Parameters ---------- uri: str or object The object to open. mode: str, optional Mimicks built-in open parameter of the same name. buffering: int, optional Mimicks built-in open parameter of the same name. encoding: str, optional Mimicks built-in open parameter of the same name. errors: str, optional Mimicks built-in open parameter of the same name. newline: str, optional Mimicks built-in open parameter of the same name. closefd: boolean, optional Mimicks built-in open parameter of the same name. Ignored. opener: object, optional Mimicks built-in open parameter of the same name. Ignored. ignore_ext: boolean, optional Disable transparent compression/decompression based on the file extension. transport_params: dict, optional Additional parameters for the transport layer (see notes below). Returns ------- A file-like object. Notes ----- smart_open has several implementations for its transport layer (e.g. S3, HTTP). Each transport layer has a different set of keyword arguments for overriding default behavior. If you specify a keyword argument that is *not* supported by the transport layer being used, smart_open will ignore that argument and log a warning message. smart_open supports the following transport mechanisms: azure (smart_open/azure.py) ~~~~~~~~~~~~~~~~~~~~~~~~~~~ Implements file-like objects for reading and writing to/from Azure Blob Storage. buffer_size: int, optional The buffer size to use when performing I/O. For reading only. min_part_size: int, optional The minimum part size for multipart uploads. For writing only. file (smart_open/local_file.py) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Implements the transport for the file:// schema. gs (smart_open/gcs.py) ~~~~~~~~~~~~~~~~~~~~~~ Implements file-like objects for reading and writing to/from GCS. buffer_size: int, optional The buffer size to use when performing I/O. For reading only. min_part_size: int, optional The minimum part size for multipart uploads. For writing only. client: google.cloud.storage.Client, optional The GCS client to use when working with google-cloud-storage. hdfs (smart_open/hdfs.py) ~~~~~~~~~~~~~~~~~~~~~~~~~ Implements reading and writing to/from HDFS. http (smart_open/http.py) ~~~~~~~~~~~~~~~~~~~~~~~~~ Implements file-like objects for reading from http. kerberos: boolean, optional If True, will attempt to use the local Kerberos credentials user: str, optional The username for authenticating over HTTP password: str, optional The password for authenticating over HTTP headers: dict, optional Any headers to send in the request. If ``None``, the default headers are sent: ``{'Accept-Encoding': 'identity'}``. To use no headers at all, set this variable to an empty dict, ``{}``. s3 (smart_open/s3.py) ~~~~~~~~~~~~~~~~~~~~~ Implements file-like objects for reading and writing from/to AWS S3. buffer_size: int, optional The buffer size to use when performing I/O. min_part_size: int, optional The minimum part size for multipart uploads. For writing only. multipart_upload: bool, optional Default: `True` If set to `True`, will use multipart upload for writing to S3. If set to `False`, S3 upload will use the S3 Single-Part Upload API, which is more ideal for small file sizes. For writing only. version_id: str, optional Version of the object, used when reading object. If None, will fetch the most recent version. 
defer_seek: boolean, optional Default: `False` If set to `True` on a file opened for reading, GetObject will not be called until the first seek() or read(). Avoids redundant API queries when seeking before reading. client: object, optional The S3 client to use when working with boto3. If you don't specify this, then smart_open will create a new client for you. client_kwargs: dict, optional Additional parameters to pass to the relevant functions of the client. The keys are fully qualified method names, e.g. `S3.Client.create_multipart_upload`. The values are kwargs to pass to that method each time it is called. writebuffer: IO[bytes], optional By default, this module will buffer data in memory using io.BytesIO when writing. Pass another binary IO instance here to use it instead. For example, you may pass a file object to buffer to local disk instead of in RAM. Use this to keep RAM usage low at the expense of additional disk IO. If you pass in an open file, then you are responsible for cleaning it up after writing completes. scp (smart_open/ssh.py) ~~~~~~~~~~~~~~~~~~~~~~~ Implements I/O streams over SSH. mode: str, optional The mode to use for opening the file. host: str, optional The hostname of the remote machine. May not be None. user: str, optional The username to use to login to the remote machine. If None, defaults to the name of the current user. password: str, optional The password to use to login to the remote machine. port: int, optional The port to connect to. transport_params: dict, optional Any additional settings to be passed to paramiko.SSHClient.connect webhdfs (smart_open/webhdfs.py) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Implements reading and writing to/from WebHDFS. min_part_size: int, optional For writing only. Examples -------- >>> from smart_open import open >>> >>> # stream lines from an S3 object >>> for line in open('s3://commoncrawl/robots.txt'): ... print(repr(line)) ... break 'User-Agent: *\n' >>> # stream from/to compressed files, with transparent (de)compression: >>> for line in open('smart_open/tests/test_data/1984.txt.gz', encoding='utf-8'): ... print(repr(line)) 'It was a bright cold day in April, and the clocks were striking thirteen.\n' 'Winston Smith, his chin nuzzled into his breast in an effort to escape the vile\n' 'wind, slipped quickly through the glass doors of Victory Mansions, though not\n' 'quickly enough to prevent a swirl of gritty dust from entering along with him.\n' >>> # can use context managers too: >>> with open('smart_open/tests/test_data/1984.txt.gz') as fin: ... with open('smart_open/tests/test_data/1984.txt.bz2', 'w') as fout: ... for line in fin: ... fout.write(line) >>> # can use any IOBase operations, like seek >>> with open('s3://commoncrawl/robots.txt', 'rb') as fin: ... for line in fin: ... print(repr(line.decode('utf-8'))) ... break ... offset = fin.seek(0) # seek to the beginning ... print(fin.read(4)) 'User-Agent: *\n' b'User' >>> # stream from HTTP >>> for line in open('http://example.com/index.html'): ... print(repr(line)) ... break This function also supports transparent compression and decompression using the following codecs: * .bz2 * .gz The function depends on the file extension to determine the appropriate codec. See Also -------- - `Standard library reference `__ - `smart_open README.rst `__ parse_uri(uri_as_string) Parse the given URI from a string. Parameters ---------- uri_as_string: str The URI to parse. Returns ------- collections.namedtuple The parsed URI. 
Notes ----- Supported URI schemes are: * azure * file * gs * hdfs * http * s3 * scp * webhdfs Valid URI examples:: * ./local/path/file * ~/local/path/file * local/path/file * ./local/path/file.gz * file:///home/user/file * file:///home/user/file.bz2 * hdfs:///path/file * hdfs://path/file * s3://my_bucket/my_key * s3://my_key:my_secret@my_bucket/my_key * s3://my_key:my_secret@my_server:my_port@my_bucket/my_key * ssh://username@host/path/file * ssh://username@host//path/file * scp://username@host/path/file * sftp://username@host/path/file * webhdfs://host:port/path/file register_compressor(ext, callback) Register a callback for transparently decompressing files with a specific extension. Parameters ---------- ext: str The extension. Must include the leading period, e.g. ``.gz``. callback: callable The callback. It must accept two position arguments, file_obj and mode. This function will be called when ``smart_open`` is opening a file with the specified extension. Examples -------- Instruct smart_open to use the `lzma` module whenever opening a file with a .xz extension (see README.rst for the complete example showing I/O): >>> def _handle_xz(file_obj, mode): ... import lzma ... return lzma.LZMAFile(filename=file_obj, mode=mode, format=lzma.FORMAT_XZ) >>> >>> register_compressor('.xz', _handle_xz) s3_iter_bucket(bucket_name, prefix='', accept_key=None, key_limit=None, workers=16, retries=3, **session_kwargs) Deprecated. Use smart_open.s3.iter_bucket instead. smart_open(uri, mode='rb', buffering=-1, encoding=None, errors=None, newline=None, closefd=True, opener=None, ignore_extension=False, **kwargs) DATA __all__ = ['open', 'parse_uri', 'register_compressor', 's3_iter_bucket... VERSION 4.1.2.dev0 FILE /Users/misha/git/smart_open/smart_open/__init__.py smart_open-5.2.1/howto.md000066400000000000000000000377671411241424400153710ustar00rootroot00000000000000# How-to Guides The howtos are **goal-oriented guides** that demonstrate **how to solve a specific problem** using `smart_open`. ## How to Add a New Guide The guides are code snippets compatible with Python's [doctest](https://docs.python.org/2/library/doctest.html) module. Lines that start with `>>>` and `...` are Python commands to run via the interpreter. Lines without the above prefixes are expected standard output from the commands. The `doctest` module runs the commands and ensures that their output matches the expected values. ```python >>> foo = 'bar' >>> print(foo) bar ``` Some tips: - Enclose the snippets with markdowns triple backticks to get free syntax highlighting - End your example with a blank line to let `doctest` know the triple backticks aren't part of the example Finally, ensure all the guides still work by running: python -m doctest howto.md The above command shouldn't print anything to standard output/error and return zero. ## How to Read/Write Zip Files `smart_open` does not support reading/writing zip files out of the box. However, you can easily integrate `smart_open` with the standard library's [zipfile](https://docs.python.org/3.5/library/zipfile.html) module: - `smart_open` handles the I/O - `zipfile` handles the compression, decompression, and file member lookup Reading example: ```python >>> from smart_open import open >>> import zipfile >>> with open('sampledata/hello.zip', 'rb') as fin: ... with zipfile.ZipFile(fin) as zip: ... for info in zip.infolist(): ... file_bytes = zip.read(info.filename) ... 
print('%r: %r' % (info.filename, file_bytes.decode('utf-8'))) 'hello/': '' 'hello/en.txt': 'hello world!\n' 'hello/ru.txt': 'здравствуй, мир!\n' ``` Writing example: ```python >>> from smart_open import open >>> import os >>> import tempfile >>> import zipfile >>> tmp = tempfile.NamedTemporaryFile(prefix='smart_open-howto-', suffix='.zip', delete=False) >>> with open(tmp.name, 'wb') as fout: ... with zipfile.ZipFile(fout, 'w') as zip: ... zip.writestr('hello/en.txt', 'hello world!\n') ... zip.writestr('hello/ru.txt', 'здравствуй, мир!\n') >>> os.unlink(tmp.name) # comment this line to keep the file for later ``` ## How to access S3 anonymously The `boto3` library that `smart_open` uses for accessing S3 signs each request using your `boto3` credentials. If you'd like to access S3 without using an S3 account, then you need disable this signing mechanism. ```python >>> import boto3 >>> import botocore >>> import botocore.client >>> from smart_open import open >>> config = botocore.client.Config(signature_version=botocore.UNSIGNED) >>> params = {'client': boto3.client('s3', config=config)} >>> with open('s3://commoncrawl/robots.txt', transport_params=params) as fin: ... fin.readline() 'User-Agent: *\n' ``` ## How to Access S3 Object Properties When working with AWS S3, you may want to look beyond the abstraction provided by `smart_open` and communicate with `boto3` directly in order to satisfy your use case. For example: - Access the object's properties, such as the content type, timestamp of the last change, etc. - Access version information for the object (versioned buckets only) - Copy the object to another location - Apply an ACL to the object - and anything else specified in the [boto3 S3 Object API](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#object). To enable such use cases, the file-like objects returned by `smart_open` have a special `to_boto3` method. This returns a `boto3.s3.Object` that you can work with directly. For example, let's get the content type of a publicly available file: ```python >>> import boto3 >>> from smart_open import open >>> resource = boto3.resource('s3') # Pass additional resource parameters here >>> with open('s3://commoncrawl/robots.txt') as fin: ... print(fin.readline().rstrip()) ... boto3_s3_object = fin.to_boto3(resource) ... print(repr(boto3_s3_object)) ... print(boto3_s3_object.content_type) # Using the boto3 API here User-Agent: * s3.Object(bucket_name='commoncrawl', key='robots.txt') text/plain ``` This works only when reading and writing via S3. ## How to Access a Specific Version of an S3 Object The ``version_id`` transport parameter enables you to get the desired version of the object from an S3 bucket. .. Important:: S3 disables version control by default. Before using the ``version_id`` parameter, you must explicitly enable version control for your S3 bucket. Read https://docs.aws.amazon.com/AmazonS3/latest/dev/Versioning.html for details. ```python >>> import boto3 >>> from smart_open import open >>> versions = ['KiQpZPsKI5Dm2oJZy_RzskTOtl2snjBg', 'N0GJcE3TQCKtkaS.gF.MUBZS85Gs3hzn'] >>> for v in versions: ... with open('s3://smart-open-versioned/demo.txt', transport_params={'version_id': v}) as fin: ... print(v, repr(fin.read())) KiQpZPsKI5Dm2oJZy_RzskTOtl2snjBg 'second version\n' N0GJcE3TQCKtkaS.gF.MUBZS85Gs3hzn 'first version\n' >>> # If you don't specify a version, smart_open will read the most recent one >>> with open('s3://smart-open-versioned/demo.txt') as fin: ... 
print(repr(fin.read())) 'second version\n' ``` This works only when reading via S3. ## How to Access the Underlying boto3 Object At some stage in your workflow, you may opt to work with `boto3` directly. You can do this by calling to the `to_boto3()` method. You can then interact with the object using the `boto3` API: ```python >>> import boto3 >>> resource = boto3.resource('s3') # Pass additional resource parameters here >>> with open('s3://commoncrawl/robots.txt') as fin: ... boto3_object = fin.to_boto3(resource) ... print(boto3_object) ... print(boto3_object.get()['LastModified']) s3.Object(bucket_name='commoncrawl', key='robots.txt') 2016-05-21 18:17:43+00:00 ``` This works only when reading and writing via S3. For versioned objects, the returned object will be slightly different: ```python >>> params = {'version_id': 'KiQpZPsKI5Dm2oJZy_RzskTOtl2snjBg'} >>> with open('s3://smart-open-versioned/demo.txt', transport_params=params) as fin: ... print(fin.to_boto3()) s3.ObjectVersion(bucket_name='smart-open-versioned', object_key='demo.txt', id='KiQpZPsKI5Dm2oJZy_RzskTOtl2snjBg') ``` ## How to Read from S3 Efficiently Under the covers, `smart_open` uses the [boto3 client API](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#client) to read from S3. By default, calling `smart_open.open` with an S3 URL will create its own boto3 client. These are expensive operations: they require both CPU time to construct the objects from a low-level API definition, and memory to store the objects once they have been created. It is possible to save both CPU time and memory by sharing the same resource across multiple `smart_open.open` calls, for example: ```python >>> import boto3 >>> from smart_open import open >>> tp = {'client': boto3.client('s3')} >>> for month in (1, 2, 3): ... url = 's3://nyc-tlc/trip data/yellow_tripdata_2020-%02d.csv' % month ... with open(url, transport_params=tp) as fin: ... _ = fin.readline() # skip CSV header ... print(fin.readline().strip()) 1,2020-01-01 00:28:15,2020-01-01 00:33:03,1,1.20,1,N,238,239,1,6,3,0.5,1.47,0,0.3,11.27,2.5 1,2020-02-01 00:17:35,2020-02-01 00:30:32,1,2.60,1,N,145,7,1,11,0.5,0.5,2.45,0,0.3,14.75,0 1,2020-03-01 00:31:13,2020-03-01 01:01:42,1,4.70,1,N,88,255,1,22,3,0.5,2,0,0.3,27.8,2.5 ``` Clients are thread-safe and multiprocess-safe, so you may share them between other threads and subprocesses. ## How to Write to S3 Efficiently By default, `smart_open` buffers the most recent part of a multipart upload in memory. The default part size is 50MB. If you're concerned about memory usage, then you have two options. The first option is to use smaller part sizes (e.g. 5MB, the lowest value permitted by AWS): ```python import boto3 from smart_open import open tp = {'min_part_size': 5 * 1024**2} with open('s3://bucket/key', 'w', transport_params=tp) as fout: fout.write(lots_of_data) ``` This will split your upload into smaller parts. Be warned that AWS enforces a [limit](https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html) of a maximum of 10,000 parts per upload. The second option is to use a temporary file as a buffer instead. ```python import boto3 from smart_open import open with tempfile.NamedTemporaryFile() as tmp: tp = {'writebuffer': tmp} with open('s3://bucket/key', 'w', transport_params=tp) as fout: fout.write(lots_of_data) ``` This option reduces memory usage at the expense of additional disk I/O (writing to and reading from a hard disk is slower). 
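If what you are writing is small, you can also skip multipart uploads altogether via the `multipart_upload` transport parameter, which the built-in help (`help('smart_open.open')`) documents as switching to the S3 single-part upload API for small files. A minimal sketch, assuming a hypothetical `s3://bucket/small-key` destination:

```python
import smart_open

# Buffer the whole object and upload it with a single put_object call
# instead of a multipart upload.
tp = {'multipart_upload': False}
with smart_open.open('s3://bucket/small-key', 'wb', transport_params=tp) as fout:
    fout.write(b'a small payload')
```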
## How to Specify the Request Payer (S3 only)

Some public buckets require you to [pay for S3 requests for the data in the bucket](https://docs.aws.amazon.com/AmazonS3/latest/dev/RequesterPaysBuckets.html). This relieves the bucket owner of the data transfer costs, and spreads them among the consumers of the data.

To access such buckets, you need to pass some special transport parameters:

```python
>>> from smart_open import open
>>> params = {'client_kwargs': {'S3.Client.get_object': {'RequestPayer': 'requester'}}}
>>> with open('s3://arxiv/pdf/arXiv_pdf_manifest.xml', transport_params=params) as fin:
...    print(fin.readline())
```

This works only when reading and writing via S3.

## How to Make S3 I/O Robust to Network Errors

Boto3 has a [built-in mechanism](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/retries.html) for retrying after a recoverable error. You can fine-tune it in several ways:

### Pre-configuring a boto3 client and then passing the client to smart_open

```python
>>> import boto3
>>> import botocore.config
>>> import smart_open
>>> config = botocore.config.Config(retries={'mode': 'standard'})
>>> client = boto3.client('s3', config=config)
>>> tp = {'client': client}
>>> with smart_open.open('s3://commoncrawl/robots.txt', transport_params=tp) as fin:
...     print(fin.readline())
User-Agent: *
```

To verify that your settings have taken effect:

```python
import logging
logging.getLogger('smart_open.s3').setLevel(logging.DEBUG)
```

and check the log output of your code.

## How to Pass Additional Parameters to boto3

`boto3` is a highly configurable library, and each function call accepts many optional parameters. `smart_open` does not attempt to replicate this behavior, since most of these parameters do not influence the behavior of `smart_open` itself. Instead, `smart_open` lets the caller pass additional parameters as necessary:

```python
>>> import boto3
>>> from smart_open import open
>>> client_kwargs = {'S3.Client.get_object': {'RequestPayer': 'requester'}}
>>> with open('s3://arxiv/pdf/arXiv_pdf_manifest.xml', transport_params={'client_kwargs': client_kwargs}) as fin:
...    pass
```

The above example influences how the [S3.Client.get_object function](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.get_object) gets called by `smart_open` when reading the specified URL. More specifically, the `RequestPayer` parameter will be set to `requester` **for each call**.

Influential functions include:

- S3.Client (the initializer function)
- S3.Client.abort_multipart_upload
- S3.Client.complete_multipart_upload
- S3.Client.create_multipart_upload
- S3.Client.get_object
- S3.Client.head_bucket
- S3.Client.put_object
- S3.Client.upload_part

If you choose to pass additional parameters, keep the following in mind:

1. Study the [boto3 client API](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client) and ensure the function and parameters are valid.
2. Study the [code for the smart_open.s3 submodule](smart_open/s3.py) and ensure `smart_open` is actually calling the function you're passing additional parameters for.

Finally, in some cases, it's possible to work directly with `boto3` without going through `smart_open`. For example, setting the ACL for an object is possible after the object is created (with `boto3`), as opposed to at creation time (with `smart_open`).
More specifically, here's the direct method:

```python
import boto3
import smart_open

with smart_open.open('s3://bucket/key', 'wb') as fout:
    fout.write(b'hello world!')

client = boto3.client('s3')
client.put_object_acl(ACL=acl_as_string)
```

Here's the same code that passes the above parameter via `smart_open`:

```python
import smart_open
tp = {'client_kwargs': {'S3.Client.create_multipart_upload': {'ACL': acl_as_string}}}
with smart_open.open('s3://bucket/key', 'wb', transport_params=tp) as fout:
    fout.write(b'hello world!')
```

If passing everything via `smart_open` feels awkward, try passing part of the parameters directly to `boto3`.

## How to Read from Github API

The Github API allows you to, among many other things, read files from repositories that you have access to. Below is an example of how to read a file with smart_open. For more info, see the [Github API documentation](https://docs.github.com/en/rest/reference/repos#contents).

```python
>>> from smart_open import open
>>> import base64
>>> import json
>>> owner = "RaRe-Technologies"
>>> repo = "smart_open"
>>> path = "howto.md"
>>> git_token = "..."
>>> url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
>>> transport_params = {
...     "headers" : {
...         "Authorization" : "Bearer " + git_token
...     }
... }
>>> with open(url, transport_params=transport_params) as obj:
...     response_contents = json.loads(obj.read())["content"]
...     file_text = base64.b64decode(response_contents).decode()
```

Note: If you are accessing a file in a Github Enterprise org, you will likely have a different base URL than the `https://api.github.com/` in the example.

## How to Read/Write from localstack

[localstack](https://github.com/localstack/localstack) is a convenient test framework for developing cloud apps. It runs locally on your machine and behaves almost identically to the real AWS. This makes it useful for testing your code offline, without requiring you to set up mocks or test harnesses.

First, install localstack and start it:

    $ pip install localstack
    $ localstack start

The start command is blocking, so you'll need to run it in a separate terminal session or run it in the background.

Before we can read/write, we'll need to create a bucket:

    $ aws --endpoint-url http://localhost:4566 s3api create-bucket --bucket mybucket

where `http://localhost:4566` is the default host/port that localstack uses to listen for requests.

You can now read/write to the bucket the same way you would to a real S3 bucket:

```python
>>> import boto3
>>> from smart_open import open
>>> client = boto3.client('s3', endpoint_url='http://localhost:4566')
>>> tparams = {'client': client}
>>> with open('s3://mybucket/hello.txt', 'wt', transport_params=tparams) as fout:
...     fout.write('hello world!')
>>> with open('s3://mybucket/hello.txt', 'rt', transport_params=tparams) as fin:
...     fin.read()
'hello world!'
```

You can also access it using the CLI:

    $ aws --endpoint-url http://localhost:4566 s3 ls s3://mybucket/
    2020-12-09 15:56:22         12 hello.txt

## How to Download a Whole Directory From Google Cloud

Object storage providers generally don't provide real directories, and instead emulate them using object name patterns (see [here](https://stackoverflow.com/questions/38416598/how-to-create-an-empty-folder-on-google-storage-with-google-api/38417397#38417397) for an explanation).
To download all files in a directory you can do this: ```python >>> from google.cloud import storage >>> from smart_open import open >>> client = storage.Client() >>> bucket_name = "gcp-public-data-landsat" >>> prefix = "LC08/01/044/034/LC08_L1GT_044034_20130330_20170310_01_T2/" >>> for blob in client.list_blobs(client.get_bucket(bucket_name), prefix=prefix): ... with open(f"gs://{bucket_name}/{blob.name}") as f: ... print(f.name) ... break # just show the first iteration for the test LC08/01/044/034/LC08_L1GT_044034_20130330_20170310_01_T2/LC08_L1GT_044034_20130330_20170310_01_T2_ANG.txt ``` smart_open-5.2.1/integration-tests/000077500000000000000000000000001411241424400173465ustar00rootroot00000000000000smart_open-5.2.1/integration-tests/README.md000066400000000000000000000071411411241424400206300ustar00rootroot00000000000000This directory contains integration tests for smart_open. To run the tests, you need read/write access to an S3 bucket. Also, you need to install py.test and its benchmarks addon: pip install -r requirements.txt Then, to run the tests, run: SO_BUCKET=bucket SO_KEY=key py.test integration-tests/test_s3.py You may use any key name instead of "smart_open_test". It does not have to be an existing key. The tests will create temporary keys under `s3://SO_BUCKET/SO_KEY` and remove them at completion. The tests will take several minutes to complete. Each test will run several times to obtain summary statistics such as min, max, mean and median. This allows us to detect regressions in performance. Here is some example output (you need a wide screen to get the best of it): ``` $ SMART_OPEN_S3_URL=s3://bucket/smart_open_test py.test integration-tests/test_s3.py =============================================== test session starts ================================================ platform darwin -- Python 3.6.3, pytest-3.3.0, py-1.5.2, pluggy-0.6.0 benchmark: 3.1.1 (defaults: timer=time.perf_counter disable_gc=False min_rounds=5 min_time=0.000005 max_time=1.0 calibration_precision=10 warmup=False warmup_iterations=100000) rootdir: /Users/misha/git/smart_open, inifile: plugins: benchmark-3.1.1 collected 6 items integration-tests/test_s3.py ...... 
[100%] --------------------------------------------------------------------------------------- benchmark: 6 tests -------------------------------------------------------------------------------------- Name (time in s) Min Max Mean StdDev Median IQR Outliers OPS Rounds Iterations ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- test_s3_readwrite_text 2.7593 (1.0) 3.4935 (1.0) 3.2203 (1.0) 0.3064 (1.0) 3.3202 (1.04) 0.4730 (1.0) 1;0 0.3105 (1.0) 5 1 test_s3_readwrite_text_gzip 3.0242 (1.10) 4.6782 (1.34) 3.7079 (1.15) 0.8531 (2.78) 3.2001 (1.0) 1.5850 (3.35) 2;0 0.2697 (0.87) 5 1 test_s3_readwrite_binary 3.0549 (1.11) 3.9062 (1.12) 3.5399 (1.10) 0.3516 (1.15) 3.4721 (1.09) 0.5532 (1.17) 2;0 0.2825 (0.91) 5 1 test_s3_performance_gz 3.1885 (1.16) 5.2845 (1.51) 3.9298 (1.22) 0.8197 (2.68) 3.6974 (1.16) 0.9693 (2.05) 1;0 0.2545 (0.82) 5 1 test_s3_readwrite_binary_gzip 3.3756 (1.22) 5.0423 (1.44) 4.1763 (1.30) 0.6381 (2.08) 4.0722 (1.27) 0.9209 (1.95) 2;0 0.2394 (0.77) 5 1 test_s3_performance 7.6758 (2.78) 29.5266 (8.45) 18.8346 (5.85) 10.3003 (33.62) 21.1854 (6.62) 19.6234 (41.49) 3;0 0.0531 (0.17) 5 1 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Legend: Outliers: 1 Standard Deviation from Mean; 1.5 IQR (InterQuartile Range) from 1st Quartile and 3rd Quartile. OPS: Operations Per Second, computed as 1 / Mean ============================================ 6 passed in 285.14 seconds ============================================ ``` smart_open-5.2.1/integration-tests/initialize_s3_bucket.py000066400000000000000000000026341411241424400240300ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2020 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). # """Prepare an S3 bucket for our integration tests. Once the bucket is initialized, the tests in test_s3_ported.py should pass. """ import gzip import io import sys import boto3 def gzip_compress(data): # # gzip.compress does not exist under Py2 # buf = io.BytesIO() with gzip.GzipFile(fileobj=buf, mode='wb') as fout: fout.write(data) return buf.getvalue() def _build_contents(): hello_bytes = u"hello wořld\nhow are you?".encode('utf8') yield 'hello.txt', hello_bytes yield 'multiline.txt', b'englishman\nin\nnew\nyork\n' yield 'hello.txt.gz', gzip_compress(hello_bytes) for i in range(100): key = 'iter_bucket/%02d.txt' % i body = '\n'.join("line%i%i" % (i, line_no) for line_no in range(10)).encode('utf8') yield key, body CONTENTS = dict(_build_contents()) def main(): bucket_name = sys.argv[1] bucket = boto3.resource('s3').Bucket(bucket_name) # # Assume the bucket exists. Creating it ourselves and dealing with # timing issues is too much of a PITA. 
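    # The loop below empties the bucket, then re-uploads the fixture objects
    # defined in CONTENTS, so every test run starts from a known state.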
# for key in bucket.objects.all(): key.delete() for (key, body) in CONTENTS.items(): bucket.put_object(Key=key, Body=body) if __name__ == '__main__': main() smart_open-5.2.1/integration-tests/requirements.txt000066400000000000000000000000371411241424400226320ustar00rootroot00000000000000pytest pytest_benchmark awscli smart_open-5.2.1/integration-tests/test_184.py000066400000000000000000000016311411241424400212740ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2019 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). # import sys import time import smart_open open_fn = smart_open.smart_open # open_fn = open def report_time_iterate_rows(file_name, report_every=100000): start = time.time() last = start with open_fn(file_name, 'r') as f: for i, line in enumerate(f, start=1): if not (i % report_every): current = time.time() time_taken = current - last print('Time taken for %d rows: %.2f seconds, %.2f rows/s' % ( report_every, time_taken, report_every / time_taken)) last = current total = time.time() - start print('Total: %d rows, %.2f seconds, %.2f rows/s' % ( i, total, i / total)) report_time_iterate_rows(sys.argv[1]) smart_open-5.2.1/integration-tests/test_207.py000066400000000000000000000016171411241424400212740ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2019 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). # import os import sys import tempfile try: import numpy as np except ImportError: print("You really need numpy to proceed with this test") sys.exit(1) import smart_open def tofile(): dt = np.dtype([('time', [('min', int), ('sec', int)]), ('temp', float)]) x = np.zeros((1,), dtype=dt) with tempfile.NamedTemporaryFile(prefix='test_207', suffix='.dat', delete=False) as fout: x.tofile(fout.name) return fout.name def test(): try: path = tofile() with smart_open.smart_open(path, 'rb') as fin: loaded = np.fromfile(fin) del loaded return 0 finally: os.unlink(path) return 1 if __name__ == '__main__': sys.exit(test()) smart_open-5.2.1/integration-tests/test_azure.py000066400000000000000000000105121411241424400221040ustar00rootroot00000000000000# -*- coding: utf-8 -*- import io import os import azure.storage.blob from pytest import fixture import smart_open _AZURE_CONTAINER = os.environ.get('SO_AZURE_CONTAINER') _AZURE_STORAGE_CONNECTION_STRING = os.environ.get('AZURE_STORAGE_CONNECTION_STRING') _FILE_PREFIX = '%s://%s' % (smart_open.azure.SCHEME, _AZURE_CONTAINER) assert _AZURE_CONTAINER is not None, 'please set the SO_AZURE_CONTAINER environment variable' assert _AZURE_STORAGE_CONNECTION_STRING is not None, \ 'please set the AZURE_STORAGE_CONNECTION_STRING environment variable' @fixture def client(): # type: () -> azure.storage.blob.BlobServiceClient return azure.storage.blob.BlobServiceClient.from_connection_string(_AZURE_STORAGE_CONNECTION_STRING) def initialize_bucket(client): container_client = client.get_container_client(_AZURE_CONTAINER) blobs = container_client.list_blobs() for blob in blobs: container_client.delete_blob(blob=blob) def write_read(key, content, write_mode, read_mode, **kwargs): with smart_open.open(key, write_mode, **kwargs) as fout: fout.write(content) with smart_open.open(key, read_mode, **kwargs) as fin: return fin.read() def read_length_prefixed_messages(key, read_mode, **kwargs): result = io.BytesIO() with smart_open.open(key, read_mode, **kwargs) as fin: length_byte = fin.read(1) while len(length_byte): 
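            # each record is one length byte followed by that many payload bytes;
            # keep copying records until read() returns nothing more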
result.write(length_byte) msg = fin.read(ord(length_byte)) result.write(msg) length_byte = fin.read(1) return result.getvalue() def test_azure_readwrite_text(benchmark, client): initialize_bucket(client) key = _FILE_PREFIX + '/sanity.txt' text = 'с гранатою в кармане, с чекою в руке' actual = benchmark( write_read, key, text, 'w', 'r', encoding='utf-8', transport_params=dict(client=client) ) assert actual == text def test_azure_readwrite_text_gzip(benchmark, client): initialize_bucket(client) key = _FILE_PREFIX + '/sanity.txt.gz' text = 'не чайки здесь запели на знакомом языке' actual = benchmark( write_read, key, text, 'w', 'r', encoding='utf-8', transport_params=dict(client=client) ) assert actual == text def test_azure_readwrite_binary(benchmark, client): initialize_bucket(client) key = _FILE_PREFIX + '/sanity.txt' binary = b'this is a test' actual = benchmark(write_read, key, binary, 'wb', 'rb', transport_params=dict(client=client)) assert actual == binary def test_azure_readwrite_binary_gzip(benchmark, client): initialize_bucket(client) key = _FILE_PREFIX + '/sanity.txt.gz' binary = b'this is a test' actual = benchmark(write_read, key, binary, 'wb', 'rb', transport_params=dict(client=client)) assert actual == binary def test_azure_performance(benchmark, client): initialize_bucket(client) one_megabyte = io.BytesIO() for _ in range(1024*128): one_megabyte.write(b'01234567') one_megabyte = one_megabyte.getvalue() key = _FILE_PREFIX + '/performance.txt' actual = benchmark(write_read, key, one_megabyte, 'wb', 'rb', transport_params=dict(client=client)) assert actual == one_megabyte def test_azure_performance_gz(benchmark, client): initialize_bucket(client) one_megabyte = io.BytesIO() for _ in range(1024*128): one_megabyte.write(b'01234567') one_megabyte = one_megabyte.getvalue() key = _FILE_PREFIX + '/performance.txt.gz' actual = benchmark(write_read, key, one_megabyte, 'wb', 'rb', transport_params=dict(client=client)) assert actual == one_megabyte def test_azure_performance_small_reads(benchmark, client): initialize_bucket(client) ONE_MIB = 1024**2 one_megabyte_of_msgs = io.BytesIO() msg = b'\x0f' + b'0123456789abcde' # a length-prefixed "message" for _ in range(0, ONE_MIB, len(msg)): one_megabyte_of_msgs.write(msg) one_megabyte_of_msgs = one_megabyte_of_msgs.getvalue() key = _FILE_PREFIX + '/many_reads_performance.bin' with smart_open.open(key, 'wb', transport_params=dict(client=client)) as fout: fout.write(one_megabyte_of_msgs) actual = benchmark( read_length_prefixed_messages, key, 'rb', buffering=ONE_MIB, transport_params=dict(client=client) ) assert actual == one_megabyte_of_msgs smart_open-5.2.1/integration-tests/test_gcs.py000066400000000000000000000067021411241424400215400ustar00rootroot00000000000000# -*- coding: utf-8 -*- import io import os import urllib.parse import google.cloud.storage import smart_open _GCS_URL = os.environ.get('SO_GCS_URL') assert _GCS_URL is not None, 'please set the SO_GCS_URL environment variable' def initialize_bucket(): client = google.cloud.storage.Client() parsed = urllib.parse.urlparse(_GCS_URL) bucket_name = parsed.netloc prefix = parsed.path bucket = client.get_bucket(bucket_name) blobs = bucket.list_blobs(prefix=prefix) for blob in blobs: blob.delete() def write_read(key, content, write_mode, read_mode, **kwargs): with smart_open.open(key, write_mode, **kwargs) as fout: fout.write(content) with smart_open.open(key, read_mode, **kwargs) as fin: return fin.read() def read_length_prefixed_messages(key, read_mode, **kwargs): result = io.BytesIO() 
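    # same helper as in test_azure.py: reassemble a stream of length-prefixed
    # messages one small read at a time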
with smart_open.open(key, read_mode, **kwargs) as fin: length_byte = fin.read(1) while len(length_byte): result.write(length_byte) msg = fin.read(ord(length_byte)) result.write(msg) length_byte = fin.read(1) return result.getvalue() def test_gcs_readwrite_text(benchmark): initialize_bucket() key = _GCS_URL + '/sanity.txt' text = 'с гранатою в кармане, с чекою в руке' actual = benchmark(write_read, key, text, 'w', 'r', encoding='utf-8') assert actual == text def test_gcs_readwrite_text_gzip(benchmark): initialize_bucket() key = _GCS_URL + '/sanity.txt.gz' text = 'не чайки здесь запели на знакомом языке' actual = benchmark(write_read, key, text, 'w', 'r', encoding='utf-8') assert actual == text def test_gcs_readwrite_binary(benchmark): initialize_bucket() key = _GCS_URL + '/sanity.txt' binary = b'this is a test' actual = benchmark(write_read, key, binary, 'wb', 'rb') assert actual == binary def test_gcs_readwrite_binary_gzip(benchmark): initialize_bucket() key = _GCS_URL + '/sanity.txt.gz' binary = b'this is a test' actual = benchmark(write_read, key, binary, 'wb', 'rb') assert actual == binary def test_gcs_performance(benchmark): initialize_bucket() one_megabyte = io.BytesIO() for _ in range(1024*128): one_megabyte.write(b'01234567') one_megabyte = one_megabyte.getvalue() key = _GCS_URL + '/performance.txt' actual = benchmark(write_read, key, one_megabyte, 'wb', 'rb') assert actual == one_megabyte def test_gcs_performance_gz(benchmark): initialize_bucket() one_megabyte = io.BytesIO() for _ in range(1024*128): one_megabyte.write(b'01234567') one_megabyte = one_megabyte.getvalue() key = _GCS_URL + '/performance.txt.gz' actual = benchmark(write_read, key, one_megabyte, 'wb', 'rb') assert actual == one_megabyte def test_gcs_performance_small_reads(benchmark): initialize_bucket() ONE_MIB = 1024**2 one_megabyte_of_msgs = io.BytesIO() msg = b'\x0f' + b'0123456789abcde' # a length-prefixed "message" for _ in range(0, ONE_MIB, len(msg)): one_megabyte_of_msgs.write(msg) one_megabyte_of_msgs = one_megabyte_of_msgs.getvalue() key = _GCS_URL + '/many_reads_performance.bin' with smart_open.open(key, 'wb') as fout: fout.write(one_megabyte_of_msgs) actual = benchmark(read_length_prefixed_messages, key, 'rb', buffering=ONE_MIB) assert actual == one_megabyte_of_msgs smart_open-5.2.1/integration-tests/test_hdfs.py000066400000000000000000000011471411241424400217060ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2019 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). # """ Sample code for HDFS integration tests. Requires hadoop to be running on localhost, at the moment. """ import smart_open with smart_open.smart_open("hdfs://user/root/input/core-site.xml") as fin: print(fin.read()) with smart_open.smart_open("hdfs://user/root/input/test.txt") as fin: print(fin.read()) with smart_open.smart_open("hdfs://user/root/input/test.txt?user.name=root", 'wb') as fout: fout.write(b'hello world') smart_open-5.2.1/integration-tests/test_http.py000066400000000000000000000040231411241424400217350ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2019 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). 
# from __future__ import unicode_literals import logging import unittest import smart_open GZIP_MAGIC = b'\x1f\x8b' BASE_URL = ('https://raw.githubusercontent.com/RaRe-Technologies/smart_open/' 'master/smart_open/tests/test_data/') class ReadTest(unittest.TestCase): def test_read_text(self): url = BASE_URL + 'crime-and-punishment.txt' with smart_open.smart_open(url, encoding='utf-8') as fin: text = fin.read() self.assertTrue(text.startswith('В начале июля, в чрезвычайно жаркое время,')) self.assertTrue(text.endswith('улизнуть, чтобы никто не видал.\n')) def test_read_binary(self): url = BASE_URL + 'crime-and-punishment.txt' with smart_open.smart_open(url, 'rb') as fin: text = fin.read() self.assertTrue(text.startswith('В начале июля, в чрезвычайно'.encode('utf-8'))) self.assertTrue(text.endswith('улизнуть, чтобы никто не видал.\n'.encode('utf-8'))) def test_read_gzip_text(self): url = BASE_URL + 'crime-and-punishment.txt.gz' with smart_open.smart_open(url, encoding='utf-8') as fin: text = fin.read() self.assertTrue(text.startswith('В начале июля, в чрезвычайно жаркое время,')) self.assertTrue(text.endswith('улизнуть, чтобы никто не видал.\n')) def test_read_gzip_binary(self): url = BASE_URL + 'crime-and-punishment.txt.gz' with smart_open.smart_open(url, 'rb', ignore_extension=True) as fin: binary = fin.read() self.assertTrue(binary.startswith(GZIP_MAGIC)) if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() smart_open-5.2.1/integration-tests/test_minio.py000066400000000000000000000035301411241424400220730ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2019 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). # import logging import boto3 from smart_open import open # # These are publicly available via play.min.io # KEY_ID = 'Q3AM3UQ867SPQQA43P2F' SECRET_KEY = 'zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG' ENDPOINT_URL = 'https://play.min.io:9000' def read_boto3(): """Read directly using boto3.""" session = get_minio_session() s3 = session.resource('s3', endpoint_url=ENDPOINT_URL) obj = s3.Object('smart-open-test', 'README.rst') data = obj.get()['Body'].read() logging.info('read %d bytes via boto3', len(data)) return data def read_smart_open(): url = 's3://Q3AM3UQ867SPQQA43P2F:zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG@play.min.io:9000@smart-open-test/README.rst' # noqa # # If the default region is not us-east-1, we need to construct our own # session. This is because smart_open will create a session in the default # region, which _must_ be us-east-1 for minio to work. 
# tp = {} if get_default_region() != 'us-east-1': logging.info('injecting custom session') tp['session'] = get_minio_session() with open(url, transport_params=tp) as fin: text = fin.read() logging.info('read %d characters via smart_open', len(text)) return text def get_minio_session(): return boto3.Session( region_name='us-east-1', aws_access_key_id=KEY_ID, aws_secret_access_key=SECRET_KEY, ) def get_default_region(): return boto3.Session().region_name def main(): logging.basicConfig(level=logging.INFO) from_boto3 = read_boto3() from_smart_open = read_smart_open() assert from_boto3.decode('utf-8') == from_smart_open if __name__ == '__main__': main() smart_open-5.2.1/integration-tests/test_s3.py000066400000000000000000000105041411241424400213040ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2019 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). # from __future__ import unicode_literals import contextlib import io import os import random import subprocess import string import boto3 import smart_open _BUCKET = os.environ.get('SO_BUCKET') assert _BUCKET is not None, 'please set the SO_BUCKET environment variable' _KEY = os.environ.get('SO_KEY') assert _KEY is not None, 'please set the SO_KEY environment variable' # # https://stackoverflow.com/questions/13484726/safe-enough-8-character-short-unique-random-string # def _random_string(length=8): alphabet = string.ascii_lowercase + string.digits return ''.join(random.choices(alphabet, k=length)) @contextlib.contextmanager def temporary(): """Yields a URL than can be used for temporary writing. Removes all content under the URL when exiting. """ key = '%s/%s' % (_KEY, _random_string()) yield 's3://%s/%s' % (_BUCKET, key) boto3.resource('s3').Bucket(_BUCKET).objects.filter(Prefix=key).delete() def _test_case(function): def inner(benchmark): with temporary() as uri: return function(benchmark, uri) return inner def write_read(uri, content, write_mode, read_mode, encoding=None, s3_upload=None, **kwargs): write_params = dict(kwargs) write_params.update(s3_upload=s3_upload) with smart_open.open(uri, write_mode, encoding=encoding, transport_params=write_params) as fout: fout.write(content) with smart_open.open(uri, read_mode, encoding=encoding, transport_params=kwargs) as fin: actual = fin.read() return actual def read_length_prefixed_messages(uri, read_mode, encoding=None, **kwargs): with smart_open.open(uri, read_mode, encoding=encoding, transport_params=kwargs) as fin: actual = b'' length_byte = fin.read(1) while len(length_byte): actual += length_byte msg = fin.read(ord(length_byte)) actual += msg length_byte = fin.read(1) return actual @_test_case def test_s3_readwrite_text(benchmark, uri): text = 'с гранатою в кармане, с чекою в руке' actual = benchmark(write_read, uri, text, 'w', 'r', 'utf-8') assert actual == text @_test_case def test_s3_readwrite_text_gzip(benchmark, uri): text = 'не чайки здесь запели на знакомом языке' actual = benchmark(write_read, uri, text, 'w', 'r', 'utf-8') assert actual == text @_test_case def test_s3_readwrite_binary(benchmark, uri): binary = b'this is a test' actual = benchmark(write_read, uri, binary, 'wb', 'rb') assert actual == binary @_test_case def test_s3_readwrite_binary_gzip(benchmark, uri): binary = b'this is a test' actual = benchmark(write_read, uri, binary, 'wb', 'rb') assert actual == binary @_test_case def test_s3_performance(benchmark, uri): one_megabyte = io.BytesIO() for _ in range(1024*128): one_megabyte.write(b'01234567') 
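    # 1024 * 128 writes of 8 bytes each adds up to exactly 1 MiB of payload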
one_megabyte = one_megabyte.getvalue() actual = benchmark(write_read, uri, one_megabyte, 'wb', 'rb') assert actual == one_megabyte @_test_case def test_s3_performance_gz(benchmark, uri): one_megabyte = io.BytesIO() for _ in range(1024*128): one_megabyte.write(b'01234567') one_megabyte = one_megabyte.getvalue() actual = benchmark(write_read, uri, one_megabyte, 'wb', 'rb') assert actual == one_megabyte @_test_case def test_s3_performance_small_reads(benchmark, uri): one_mib = 1024**2 one_megabyte_of_msgs = io.BytesIO() msg = b'\x0f' + b'0123456789abcde' # a length-prefixed "message" for _ in range(0, one_mib, len(msg)): one_megabyte_of_msgs.write(msg) one_megabyte_of_msgs = one_megabyte_of_msgs.getvalue() with smart_open.open(uri, 'wb') as fout: fout.write(one_megabyte_of_msgs) actual = benchmark(read_length_prefixed_messages, uri, 'rb', buffer_size=one_mib) assert actual == one_megabyte_of_msgs @_test_case def test_s3_encrypted_file(benchmark, uri): text = 'с гранатою в кармане, с чекою в руке' s3_upload = {'ServerSideEncryption': 'AES256'} actual = benchmark(write_read, uri, text, 'w', 'r', 'utf-8', s3_upload=s3_upload) assert actual == text smart_open-5.2.1/integration-tests/test_s3_buffering.py000066400000000000000000000010211411241424400233250ustar00rootroot00000000000000from smart_open import open def read_bytes(url, limit): bytes_ = [] with open(url, 'rb') as fin: for i in range(limit): bytes_.append(fin.read(1)) return bytes_ def test(benchmark): # # This file is around 850MB. # url = ( 's3://commoncrawl/crawl-data/CC-MAIN-2019-51/segments/1575541319511.97' '/warc/CC-MAIN-20191216093448-20191216121448-00559.warc.gz' ) limit = 1000000 bytes_ = benchmark(read_bytes, url, limit) assert len(bytes_) == limit smart_open-5.2.1/integration-tests/test_s3_ported.py000066400000000000000000000265731411241424400226760ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2020 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). # """Integration tests ported from our old unit tests. Before running these tests against a real bucket, make sure its initialized with initialize_s3_bucket.py. 
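
The tests below assume that bucket already contains the objects described by
the CONTENTS dict in that script (for example 'hello.txt' and 'multiline.txt').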
""" import contextlib import gzip import io import unittest import uuid import warnings import boto3 from parameterizedtestcase import ParameterizedTestCase as PTestCase import smart_open import smart_open.concurrency import smart_open.constants from initialize_s3_bucket import CONTENTS BUCKET_NAME = 'smartopen-integration-tests' def setUpModule(): assert boto3.resource('s3').Bucket(BUCKET_NAME).creation_date, 'see initialize_s3_bucket.py' def ignore_resource_warnings(): warnings.filterwarnings("ignore", category=ResourceWarning, message="unclosed.*") # noqa class ReaderTest(unittest.TestCase): def setUp(self): ignore_resource_warnings() def test_iter(self): """Are S3 files iterated over correctly?""" key_name = 'hello.txt' expected = CONTENTS[key_name].split(b'\n') fin = smart_open.s3.Reader(BUCKET_NAME, key_name) actual = [line.rstrip(b'\n') for line in fin] self.assertEqual(expected, actual) def test_iter_context_manager(self): # same thing but using a context manager key_name = 'hello.txt' expected = CONTENTS[key_name].split(b'\n') with smart_open.s3.Reader(BUCKET_NAME, key_name) as fin: actual = [line.rstrip(b'\n') for line in fin] self.assertEqual(expected, actual) def test_read(self): """Are S3 files read correctly?""" key_name = 'hello.txt' expected = CONTENTS[key_name] fin = smart_open.s3.Reader(BUCKET_NAME, key_name) self.assertEqual(expected[:6], fin.read(6)) self.assertEqual(expected[6:14], fin.read(8)) # ř is 2 bytes self.assertEqual(expected[14:], fin.read()) # read the rest def test_seek_beginning(self): """Does seeking to the beginning of S3 files work correctly?""" key_name = 'hello.txt' expected = CONTENTS[key_name] fin = smart_open.s3.Reader(BUCKET_NAME, key_name) self.assertEqual(expected[:6], fin.read(6)) self.assertEqual(expected[6:14], fin.read(8)) # ř is 2 bytes fin.seek(0) self.assertEqual(expected, fin.read()) # no size given => read whole file fin.seek(0) self.assertEqual(expected, fin.read(-1)) # same thing def test_seek_start(self): """Does seeking from the start of S3 files work correctly?""" fin = smart_open.s3.Reader(BUCKET_NAME, 'hello.txt') seek = fin.seek(6) self.assertEqual(seek, 6) self.assertEqual(fin.tell(), 6) self.assertEqual(fin.read(6), u'wořld'.encode('utf-8')) def test_seek_current(self): """Does seeking from the middle of S3 files work correctly?""" fin = smart_open.s3.Reader(BUCKET_NAME, 'hello.txt') self.assertEqual(fin.read(5), b'hello') seek = fin.seek(1, whence=smart_open.constants.WHENCE_CURRENT) self.assertEqual(seek, 6) self.assertEqual(fin.read(6), u'wořld'.encode('utf-8')) def test_seek_end(self): """Does seeking from the end of S3 files work correctly?""" key_name = 'hello.txt' expected = CONTENTS[key_name] fin = smart_open.s3.Reader(BUCKET_NAME, key_name) seek = fin.seek(-4, whence=smart_open.constants.WHENCE_END) self.assertEqual(seek, len(expected) - 4) self.assertEqual(fin.read(), b'you?') def test_detect_eof(self): key_name = 'hello.txt' expected = CONTENTS[key_name] fin = smart_open.s3.Reader(BUCKET_NAME, key_name) fin.read() eof = fin.tell() self.assertEqual(eof, len(expected)) fin.seek(0, whence=smart_open.constants.WHENCE_END) self.assertEqual(eof, fin.tell()) def test_read_gzip(self): key_name = 'hello.txt.gz' with gzip.GzipFile(fileobj=io.BytesIO(CONTENTS[key_name])) as fin: expected = fin.read() with smart_open.s3.Reader(BUCKET_NAME, key_name) as fin: with gzip.GzipFile(fileobj=fin) as zipfile: actual = zipfile.read() self.assertEqual(expected, actual) def test_readline(self): key_name = 'multiline.txt' expected = 
CONTENTS[key_name] with smart_open.s3.Reader(BUCKET_NAME, key_name) as fin: fin.readline() self.assertEqual(fin.tell(), expected.index(b'\n')+1) fin.seek(0) actual = list(fin) self.assertEqual(fin.tell(), len(expected)) expected = [b'englishman\n', b'in\n', b'new\n', b'york\n'] self.assertEqual(expected, actual) def test_readline_tiny_buffer(self): key_name = 'multiline.txt' expected = CONTENTS[key_name] with smart_open.s3.Reader(BUCKET_NAME, key_name, buffer_size=8) as fin: actual = list(fin) expected = [b'englishman\n', b'in\n', b'new\n', b'york\n'] self.assertEqual(expected, actual) def test_read0_does_not_return_data(self): with smart_open.s3.Reader(BUCKET_NAME, 'hello.txt') as fin: data = fin.read(0) self.assertEqual(data, b'') def test_to_boto3(self): key_name = 'multiline.txt' expected = CONTENTS[key_name] with smart_open.s3.Reader(BUCKET_NAME, key_name) as fin: returned_obj = fin.to_boto3() boto3_body = returned_obj.get()['Body'].read() self.assertEqual(expected, boto3_body) def read_key(key): return boto3.resource('s3').Object(BUCKET_NAME, key).get()['Body'].read() class WriterTest(unittest.TestCase): def setUp(self): # # Write to a unique key each time to avoid cross-talk between # simultaneous test runs. # self.key = 'writer-test/' + uuid.uuid4().hex def tearDown(self): boto3.resource('s3').Object(BUCKET_NAME, self.key).delete() def test_write(self): """Does writing into s3 work correctly?""" test_string = u"žluťoučký koníček".encode('utf8') with smart_open.s3.MultipartWriter(BUCKET_NAME, self.key) as fout: fout.write(test_string) data = read_key(self.key) self.assertEqual(data, test_string) def test_multipart(self): """Does s3 multipart chunking work correctly?""" with smart_open.s3.MultipartWriter(BUCKET_NAME, self.key, min_part_size=10) as fout: fout.write(b"test") self.assertEqual(fout._buf.tell(), 4) fout.write(b"test\n") self.assertEqual(fout._buf.tell(), 9) self.assertEqual(fout._total_parts, 0) fout.write(b"test") self.assertEqual(fout._buf.tell(), 0) self.assertEqual(fout._total_parts, 1) data = read_key(self.key) self.assertEqual(data, b"testtest\ntest") def test_empty_key(self): """Does writing no data cause key with an empty value to be created?""" smart_open_write = smart_open.s3.MultipartWriter(BUCKET_NAME, self.key) with smart_open_write as fout: # noqa pass # read back the same key and check its content data = read_key(self.key) self.assertEqual(data, b'') def test_buffered_writer_wrapper_works(self): """ Ensure that we can wrap a smart_open s3 stream in a BufferedWriter, which passes a memoryview object to the underlying stream in python >= 2.7 """ expected = u'не думай о секундах свысока' with smart_open.s3.MultipartWriter(BUCKET_NAME, self.key) as fout: with io.BufferedWriter(fout) as sub_out: sub_out.write(expected.encode('utf-8')) text = read_key(self.key).decode('utf-8') self.assertEqual(expected, text) def test_double_close(self): text = u'там за туманами, вечными, пьяными'.encode('utf-8') fout = smart_open.s3.open(BUCKET_NAME, self.key, 'wb') fout.write(text) fout.close() fout.close() result = read_key(self.key) self.assertEqual(result, text) def test_flush_close(self): text = u'там за туманами, вечными, пьяными'.encode('utf-8') fout = smart_open.s3.open(BUCKET_NAME, self.key, 'wb') fout.write(text) fout.flush() fout.close() result = read_key(self.key) self.assertEqual(result, text) @contextlib.contextmanager def force(multiprocessing=False, concurrent_futures=False): assert not (multiprocessing and concurrent_futures) old_multiprocessing = 
smart_open.concurrency._MULTIPROCESSING old_concurrent_futures = smart_open.concurrency._CONCURRENT_FUTURES smart_open.concurrency._MULTIPROCESSING = multiprocessing smart_open.concurrency._CONCURRENT_FUTURES = concurrent_futures yield smart_open.concurrency._MULTIPROCESSING = old_multiprocessing smart_open.concurrency._CONCURRENT_FUTURES = old_concurrent_futures class IterBucketTest(PTestCase): def setUp(self): self.expected = [ (key, value) for (key, value) in CONTENTS.items() if key.startswith('iter_bucket/') ] self.expected.sort() def test_singleprocess(self): with force(): actual = list(smart_open.s3.iter_bucket(BUCKET_NAME, prefix='iter_bucket')) self.assertEqual(len(self.expected), len(actual)) self.assertEqual(self.expected, sorted(actual)) @unittest.skipIf(not smart_open.concurrency._MULTIPROCESSING, 'multiprocessing unavailable') def test_multiprocess(self): with force(multiprocessing=True): actual = list(smart_open.s3.iter_bucket(BUCKET_NAME, prefix='iter_bucket')) self.assertEqual(len(self.expected), len(actual)) self.assertEqual(self.expected, sorted(actual)) @unittest.skipIf(not smart_open.concurrency._CONCURRENT_FUTURES, 'concurrent.futures unavailable') def test_concurrent_futures(self): with force(concurrent_futures=True): actual = list(smart_open.s3.iter_bucket(BUCKET_NAME, prefix='iter_bucket')) self.assertEqual(len(self.expected), len(actual)) self.assertEqual(self.expected, sorted(actual)) def test_accept_key(self): expected = [(key, value) for (key, value) in self.expected if '4' in key] actual = list( smart_open.s3.iter_bucket( BUCKET_NAME, prefix='iter_bucket', accept_key=lambda key: '4' in key ) ) self.assertEqual(len(expected), len(actual)) self.assertEqual(expected, sorted(actual)) @PTestCase.parameterize(('workers',), [(x,) for x in (1, 4, 8, 16, 64)]) def test_workers(self, workers): actual = list(smart_open.s3.iter_bucket(BUCKET_NAME, prefix='iter_bucket', workers=workers)) self.assertEqual(len(self.expected), len(actual)) self.assertEqual(self.expected, sorted(actual)) class DownloadKeyTest(unittest.TestCase): def test(self): key_name = 'hello.txt' expected = (key_name, CONTENTS[key_name]) actual = smart_open.s3._download_key(key_name, bucket_name=BUCKET_NAME) self.assertEqual(expected, actual) smart_open-5.2.1/integration-tests/test_s3_readline.py000066400000000000000000000011101411241424400231400ustar00rootroot00000000000000from smart_open import open def read_lines(url, limit): lines = [] with open(url, 'r', errors='ignore') as fin: for i, l in enumerate(fin): if i == limit: break lines.append(l) return lines def test(benchmark): # # This file is around 850MB. 
# url = ( 's3://commoncrawl/crawl-data/CC-MAIN-2019-51/segments/1575541319511.97' '/warc/CC-MAIN-20191216093448-20191216121448-00559.warc.gz' ) limit = 1000000 lines = benchmark(read_lines, url, limit) assert len(lines) == limit smart_open-5.2.1/integration-tests/test_version_id.py000066400000000000000000000020121411241424400231130ustar00rootroot00000000000000"""Tests the version_id transport parameter for S3 against real S3.""" import boto3 from smart_open import open BUCKET, KEY = 'smart-open-versioned', 'demo.txt' """Our have a public-readable bucket with a versioned object.""" URL = 's3://%s/%s' % (BUCKET, KEY) def assert_equal(a, b): assert a == b, '%r != %r' % (a, b) def main(): versions = [ v.id for v in boto3.resource('s3').Bucket(BUCKET).object_versions.filter(Prefix=KEY) ] expected_versions = [ 'KiQpZPsKI5Dm2oJZy_RzskTOtl2snjBg', 'N0GJcE3TQCKtkaS.gF.MUBZS85Gs3hzn', ] assert_equal(versions, expected_versions) contents = [ open(URL, transport_params={'version_id': v}).read() for v in versions ] expected_contents = ['second version\n', 'first version\n'] assert_equal(contents, expected_contents) with open(URL) as fin: most_recent_contents = fin.read() assert_equal(most_recent_contents, expected_contents[0]) print('OK') if __name__ == '__main__': main() smart_open-5.2.1/integration-tests/test_webhdfs.py000066400000000000000000000037711411241424400224110ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2019 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). # """ Sample code for WebHDFS integration tests. To run it working WebHDFS in your network is needed - simply set _SO_WEBHDFS_BASE_URL env variable to webhdfs url you have write access to. For example on Amazon EMR WebHDFS is accessible on driver port 14000, so it may look like: $ export SO_WEBHDFS_BASE_URL=webhdfs://hadoop@your-emr-driver:14000/tmp/ $ py.test integration-tests/test_webhdfs.py """ import json import os import smart_open from smart_open.webhdfs import WebHdfsException import pytest _SO_WEBHDFS_BASE_URL = os.environ.get("SO_WEBHDFS_BASE_URL") assert ( _SO_WEBHDFS_BASE_URL is not None ), "please set the SO_WEBHDFS_BASE_URL environment variable" def make_url(path): return "{base_url}/{path}".format( base_url=_SO_WEBHDFS_BASE_URL.rstrip("/"), path=path.lstrip("/") ) def test_write_and_read(): with smart_open.open(make_url("test2.txt"), "w") as f: f.write("write_test\n") with smart_open.open(make_url("test2.txt"), "r") as f: assert f.read() == "write_test\n" def test_binary_write_and_read(): with smart_open.open(make_url("test3.txt"), "wb") as f: f.write(b"binary_write_test\n") with smart_open.open(make_url("test3.txt"), "rb") as f: assert f.read() == b"binary_write_test\n" def test_not_found(): with pytest.raises(WebHdfsException) as exc_info: with smart_open.open(make_url("not_existing"), "r") as f: assert f.read() assert exc_info.value.status_code == 404 def test_quoted_path(): with smart_open.open(make_url("test_%40_4.txt"), "w") as f: f.write("write_test\n") with smart_open.open(make_url("?op=LISTSTATUS"), "r") as f: data = json.load(f) filenames = [ entry["pathSuffix"] for entry in data["FileStatuses"]["FileStatus"] ] assert "test_@_4.txt" in filenames smart_open-5.2.1/release/000077500000000000000000000000001411241424400153035ustar00rootroot00000000000000smart_open-5.2.1/release/README.md000066400000000000000000000036751411241424400165750ustar00rootroot00000000000000# Release Scripts This subdirectory contains various scripts for making a 
smart_open release. ## Prerequisites You need a GNU-like environment to run these scripts. I perform the releases using Ubuntu 18.04, but other O/S like MacOS should also work. The prerequisites are minimal: - bash - git with authentication set up (e.g. via ssh-agent) - virtualenv - pip All of the above are generally freely available, e.g. installable via apt in Ubuntu. ## Release Procedure First, check that the [latest commit](https://github.com/RaRe-Technologies/smart_open/commits/master) passed all CI. For the subsequent steps to work, you will need to be in the top-level subdirectory for the repo (e.g. /home/misha/git/smart_open). Prepare the release, replacing 2.3.4 with the actual version of the new release: bash release/prepare.sh 2.3.4 This will create a local release branch. Look around the branch and make sure everything is in order. Checklist: - [ ] Does smart_open/version.py contain the correct version number for the release? - [ ] Does the CHANGELOG.md contain a section detailing the new release? - [ ] Are there any PRs that should be in CHANGELOG.md, but currently aren't? If anything is out of order, make the appropriate changes and commit them to the release branch before proceeding. **This is the point of no return**. **Once you're happy with the release branch**, run: bash release/merge.sh Congratulations, at this stage, you are done! ## Troubleshooting Ideally, our CI should save you from major boo-boos along the way. If the build is broken, fix it before even thinking about doing a release. If anything is wrong with the local release branch (before you call merge.sh), for example: - Typo in CHANGELOG.md - Missing entries in CHANGELOG.md - Wrong version.py number then just fix it in the release branch before moving on. Otherwise, it's too late to fix anything for the current release. Make a bugfix release to fix the problem. smart_open-5.2.1/release/annotate_pr.py000077500000000000000000000024751411241424400202020ustar00rootroot00000000000000#!/usr/bin/env python """Helper script for including change log entries in an open PR. Automatically constructs the change log entry from the PR title. Copies the entry to the window manager clipboard. Opens the change log belonging to the specific PR in a browser window. All you have to do is paste and click "commit changes". """ import json import sys import webbrowser import smart_open def copy_to_clipboard(text): try: import pyperclip except ImportError: print('pyperclip is missing.', file=sys.stderr) print('copy-paste the following text manually:', file=sys.stderr) print('\t', text, file=sys.stderr) else: pyperclip.copy(text) prid = int(sys.argv[1]) url = "https://api.github.com/repos/RaRe-Technologies/smart_open/pulls/%d" % prid with smart_open.open(url) as fin: prinfo = json.load(fin) prinfo['user_login'] = prinfo['user']['login'] prinfo['user_html_url'] = prinfo['user']['html_url'] text = '- %(title)s (PR [#%(number)s](%(html_url)s), [@%(user_login)s](%(user_html_url)s))' % prinfo copy_to_clipboard(text) prinfo['head_repo_html_url'] = prinfo['head']['repo']['html_url'] prinfo['head_ref'] = prinfo['head']['ref'] edit_url = '%(head_repo_html_url)s/edit/%(head_ref)s/CHANGELOG.md' % prinfo webbrowser.open(edit_url) smart_open-5.2.1/release/check_preamble.py000066400000000000000000000042221411241424400206010ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2019 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). # """Checks preambles of Python script files. 
We want to ensure they all contain the appropriate license and copyright. For the purposes of this script, the *preamble* is defined as the first lines of the file starting with a hash (#). Any line that does not start with a hash ends the preamble. Usage:: python check_preamble.py --replace /path/to/template.py script.py The above command reads the preamble from ``template.py``, and then copies that preamble into ``script.py``. If ``script.py`` already contains a preamble, then the existing preamble will be replaced **entirely**. Processing entire subdirectories with one command:: find subdir1 subdir2 -iname "*.py" | xargs -n 1 python check_preamble.py --replace template.py """ import argparse import logging import os import sys def extract_preamble(fin): end_preamble = False preamble, body = [], [] for line in fin: if end_preamble: body.append(line) elif line.startswith('#'): preamble.append(line) else: end_preamble = True body.append(line) return preamble, body def main(): parser = argparse.ArgumentParser() parser.add_argument('path', help='the path of the file to check') parser.add_argument('--replace', help='replace the preamble with the one from this file') parser.add_argument('--loglevel', default=logging.INFO) args = parser.parse_args() logging.basicConfig(level=args.loglevel) with open(args.path) as fin: preamble, body = extract_preamble(fin) for line in preamble: logging.info('%s: %s', args.path, line.rstrip()) if not args.replace: sys.exit(0) with open(args.replace) as fin: preamble, _ = extract_preamble(fin) if os.access(args.path, os.X_OK): preamble.insert(0, '#!/usr/bin/env python\n') with open(args.path, 'w') as fout: for line in preamble + body: fout.write(line) if __name__ == '__main__': main() smart_open-5.2.1/release/doctest.sh000077500000000000000000000005261411241424400173120ustar00rootroot00000000000000script_dir="$(dirname "${BASH_SOURCE[0]}")" export AWS_ACCESS_KEY_ID=$(aws --profile smart_open configure get aws_access_key_id) export AWS_SECRET_ACCESS_KEY=$(aws --profile smart_open configure get aws_secret_access_key) # # Using the current environment, which has smart_open installed. # cd "$script_dir/.." python -m doctest README.rst smart_open-5.2.1/release/hijack_pr.py000077500000000000000000000017511411241424400176160ustar00rootroot00000000000000#!/usr/bin/env python """Hijack a PR to add commits as a maintainer. This is a two-step process: 1. Add a git remote that points to the contributor's repo 2. Check out the actual contribution by reference As a maintainer, you can add changes by making new commits and pushing them back to the remote. 
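
Roughly, the script boils down to the following git commands, where the user
and branch names are placeholders rather than real values:

    git remote add <user> <ssh_url>
    git fetch <user>
    git checkout <user>/<branch>
    git switch -c <branch>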
""" import json import subprocess import sys import smart_open prid = int(sys.argv[1]) url = f"https://api.github.com/repos/RaRe-Technologies/smart_open/pulls/{prid}" with smart_open.open(url) as fin: prinfo = json.load(fin) user = prinfo['head']['user']['login'] ssh_url = prinfo['head']['repo']['ssh_url'] remotes = subprocess.check_output(['git', 'remote']).strip().decode('utf-8').split('\n') if user not in remotes: subprocess.check_call(['git', 'remote', 'add', user, ssh_url]) subprocess.check_call(['git', 'fetch', user]) ref = prinfo['head']['ref'] subprocess.check_call(['git', 'checkout', f'{user}/{ref}']) subprocess.check_call(['git', 'switch', '-c', f'{ref}']) smart_open-5.2.1/release/merge.sh000077500000000000000000000041111411241424400167360ustar00rootroot00000000000000# # This script performs the following tasks: # # - Merges the current release branch into master # - Applies a tag to master # - Merges # - Pushes the updated master branch and its tag to upstream # # - develop: Our development branch. We merge all PRs into this branch. # - release-$version: A local branch containing commits specific to this release. # This is a local-only branch, we never push this anywhere. # - master: Our "clean" release branch. Contains tags. # # The relationships between the three branches are illustrated below: # # github.com PRs # \ # develop --+--+----------------------------------+--- # \ / # (new branch) \ commits (CHANGELOG.md, etc) / # \ v / # release ---*-----X (delete branch) / (merge 2) # \ / # (merge 1) \ TAG / # \ v / # master -------------------+------*-----+----------- # # Use it like this: # # bash release/merge.sh # # Expects smart_open/version.py to be correctly incremented for the new release. # set -euo pipefail cd "$(dirname "${BASH_SOURCE[0]}")/.." version="$(python smart_open/version.py)" read -p "Push version $version to github.com and PyPI? yes or no: " reply if [ "$reply" != "yes" ] then echo "aborted by user" exit 1 fi # # Delete the local develop branch in case one is left lying around. # set +e git branch -D develop git branch -D master set -e git checkout upstream/master -b master git merge --no-ff release-${version} git tag -a "v${version}" -m "v${version}" git checkout upstream/develop -b develop git merge --no-ff master # # N.B. these push steps are non-reversible. # git checkout master git push --tags upstream master git checkout develop dev_version="$version.dev0" sed --in-place="" -e s/$(python smart_open/version.py)/$dev_version/ smart_open/version.py git commit smart_open/version.py -m "bump version to $dev_version" git push upstream develop python release/update_release_notes.py "$version" smart_open-5.2.1/release/prepare.sh000077500000000000000000000023301411241424400172760ustar00rootroot00000000000000# # Prepare a new release of smart_open. Use it like this: # # bash release/prepare.sh 1.2.3 # # where 1.2.3 is the new version to release. # # Does the following: # # - Creates a clean virtual environment # - Creates a local release git branch # - Bumps VERSION accordingly # - Opens CHANGELOG.md for editing, commits updates # # Once you're happy, run merge.sh to continue with the release. # set -euxo pipefail version="$1" echo "version: $version" script_dir="$(dirname "${BASH_SOURCE[0]}")" cd "$script_dir/.." git fetch upstream # # Delete the release branch in case one is left lying around. 
# git checkout upstream/develop set +e git branch -D release-"$version" set -e git checkout upstream/develop -b release-"$version" sed --in-place="" -e "s/$(python smart_open/version.py)/$version/" smart_open/version.py git commit smart_open/version.py -m "bump version to $version" echo "Next, update CHANGELOG.md." echo "Consider running summarize_pr.sh for each PR merged since the last release." read -p "Press Enter to continue..." ${EDITOR:-vim} CHANGELOG.md set +e git commit CHANGELOG.md -m "updated CHANGELOG.md for version $version" set -e echo "Have a look at the current branch, and if all looks good, run merge.sh" smart_open-5.2.1/release/update_help_txt.sh000077500000000000000000000003321411241424400210310ustar00rootroot00000000000000script_dir="$(dirname "${BASH_SOURCE[0]}")" # # Using the current environment, which has smart_open installed. # cd "$script_dir/.." python -c 'help("smart_open")' > help.txt git commit help.txt -m "updated help.txt" smart_open-5.2.1/release/update_release_notes.py000066400000000000000000000014571411241424400220560ustar00rootroot00000000000000"""Helper script for updating the release notes. Copies the change log to the window manager clipboard. Opens the release notes using the browser. All you have to do is paste and click "commit changes". """ import os import sys import webbrowser version = sys.argv[1] curr_dir = os.path.dirname(__file__) def copy_to_clipboard(text): try: import pyperclip except ImportError: print('pyperclip is missing.', file=sys.stderr) print('copy-paste the contents of CHANGELOG.md manually', file=sys.stderr) else: pyperclip.copy(text) with open(os.path.join(curr_dir, '../CHANGELOG.md')) as fin: copy_to_clipboard(fin.read()) url = "https://github.com/RaRe-Technologies/smart_open/releases/tag/v%s" % version webbrowser.open(url) smart_open-5.2.1/sampledata/000077500000000000000000000000001411241424400157765ustar00rootroot00000000000000smart_open-5.2.1/sampledata/hello.zip000066400000000000000000000007751411241424400176360ustar00rootroot00000000000000PK K]AOhello/UT ]-]ux PK 3]AO hello/en.txtUT ⼒]⼒]ux hello world! PK K]AOl3 hello/ru.txtUT ]]ux здравствуй, мир! PK K]AOAhello/UT]ux PK 3]AO @hello/en.txtUT⼒]ux PK K]AOl3 hello/ru.txtUT]ux PKsmart_open-5.2.1/setup.py000066400000000000000000000054561411241424400154070ustar00rootroot00000000000000#!/usr/bin/env python # -*- coding: utf-8 -*- # # Copyright (C) 2015 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). import io import os from setuptools import setup, find_packages def _get_version(): curr_dir = os.path.dirname(os.path.abspath(__file__)) with open(os.path.join(curr_dir, 'smart_open', 'version.py')) as fin: line = fin.readline().strip() parts = line.split(' ') assert len(parts) == 3 assert parts[0] == '__version__' assert parts[1] == '=' return parts[2].strip('\'"') # # We cannot do "from smart_open.version import __version__" because that will # require the dependencies for smart_open to already be in place, and that is # not necessarily the case when running setup.py for the first time. 
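#
# For reference, smart_open/version.py is expected to contain a single
# assignment of the form shown below (the number is this release's version,
# included purely for illustration), which is what the parsing in
# _get_version() above relies on:
#
#     __version__ = '5.2.1'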
# __version__ = _get_version() def read(fname): return io.open(os.path.join(os.path.dirname(__file__), fname), encoding='utf-8').read() aws_deps = ['boto3'] gcs_deps = ['google-cloud-storage'] azure_deps = ['azure-storage-blob', 'azure-common', 'azure-core'] http_deps = ['requests'] all_deps = aws_deps + gcs_deps + azure_deps + http_deps tests_require = all_deps + [ 'moto[server]==1.3.14', # Older versions of moto appear broken 'pathlib2', 'responses', 'boto3', 'paramiko', 'parameterizedtestcase', 'pytest', 'pytest-rerunfailures' ] setup( name='smart_open', version=__version__, description='Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)', long_description=read('README.rst'), packages=find_packages(exclude=["smart_open.tests*"]), author='Radim Rehurek', author_email='me@radimrehurek.com', maintainer='Radim Rehurek', maintainer_email='me@radimrehurek.com', url='https://github.com/piskvorky/smart_open', download_url='http://pypi.python.org/pypi/smart_open', keywords='file streaming, s3, hdfs, gcs, azure blob storage', license='MIT', platforms='any', tests_require=tests_require, extras_require={ 'test': tests_require, 's3': aws_deps, 'gcs': gcs_deps, 'azure': azure_deps, 'all': all_deps, 'http': http_deps, 'webhdfs': http_deps, }, python_requires=">=3.6,<4.0", test_suite="smart_open.tests", classifiers=[ 'Development Status :: 4 - Beta', 'Environment :: Console', 'Intended Audience :: Developers', 'License :: OSI Approved :: MIT License', 'Operating System :: OS Independent', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Topic :: System :: Distributed Computing', 'Topic :: Database :: Front-Ends', ], ) smart_open-5.2.1/smart_open/000077500000000000000000000000001411241424400160325ustar00rootroot00000000000000smart_open-5.2.1/smart_open/__init__.py000066400000000000000000000037551411241424400201550ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2019 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). # """ Utilities for streaming to/from several file-like data storages: S3 / HDFS / local filesystem / compressed files, and many more, using a simple, Pythonic API. The streaming makes heavy use of generators and pipes, to avoid loading full file contents into memory, allowing work with arbitrarily large files. The main functions are: * `open()`, which opens the given file for reading/writing * `parse_uri()` * `s3_iter_bucket()`, which goes over all keys in an S3 bucket in parallel * `register_compressor()`, which registers callbacks for transparent compressor handling """ import logging # # Prevent regression of #474 and #475 # logger = logging.getLogger(__name__) logger.addHandler(logging.NullHandler()) from smart_open import version # noqa: E402 from .smart_open_lib import open, parse_uri, smart_open, register_compressor # noqa: E402 _WARNING = """smart_open.s3_iter_bucket is deprecated and will stop functioning in a future version. Please import iter_bucket from the smart_open.s3 module instead: from smart_open.s3 import iter_bucket as s3_iter_bucket """ _WARNED = False def s3_iter_bucket( bucket_name, prefix='', accept_key=None, key_limit=None, workers=16, retries=3, **session_kwargs ): """Deprecated. 
Use smart_open.s3.iter_bucket instead.""" global _WARNED from .s3 import iter_bucket if not _WARNED: logger.warning(_WARNING) _WARNED = True return iter_bucket( bucket_name=bucket_name, prefix=prefix, accept_key=accept_key, key_limit=key_limit, workers=workers, retries=retries, session_kwargs=session_kwargs ) __all__ = [ 'open', 'parse_uri', 'register_compressor', 's3_iter_bucket', 'smart_open', ] __version__ = version.__version__ smart_open-5.2.1/smart_open/azure.py000066400000000000000000000403231411241424400175340ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2020 Radim Rehurek # Copyright (C) 2020 Nicolas Mitchell # # This code is distributed under the terms and conditions # from the MIT License (MIT). # """Implements file-like objects for reading and writing to/from Azure Blob Storage.""" import base64 import io import logging import smart_open.bytebuffer import smart_open.constants try: import azure.storage.blob import azure.core.exceptions except ImportError: MISSING_DEPS = True logger = logging.getLogger(__name__) _BINARY_TYPES = (bytes, bytearray, memoryview) """Allowed binary buffer types for writing to the underlying Azure Blob Storage stream""" SCHEME = "azure" """Supported scheme for Azure Blob Storage in smart_open endpoint URL""" _DEFAULT_MIN_PART_SIZE = 64 * 1024**2 """Default minimum part size for Azure Cloud Storage multipart uploads is 64MB""" DEFAULT_BUFFER_SIZE = 4 * 1024**2 """Default buffer size for working with Azure Blob Storage is 256MB https://docs.microsoft.com/en-us/rest/api/storageservices/understanding-block-blobs--append-blobs--and-page-blobs """ DEFAULT_MAX_CONCURRENCY = 1 """Default number of parallel connections with which to download.""" def parse_uri(uri_as_string): sr = smart_open.utils.safe_urlsplit(uri_as_string) assert sr.scheme == SCHEME first = sr.netloc second = sr.path.lstrip('/') # https://docs.microsoft.com/en-us/rest/api/storageservices/working-with-the-root-container if not second: container_id = '$root' blob_id = first else: container_id = first blob_id = second return dict(scheme=SCHEME, container_id=container_id, blob_id=blob_id) def open_uri(uri, mode, transport_params): parsed_uri = parse_uri(uri) kwargs = smart_open.utils.check_kwargs(open, transport_params) return open(parsed_uri['container_id'], parsed_uri['blob_id'], mode, **kwargs) def open( container_id, blob_id, mode, client=None, # type: azure.storage.blob.BlobServiceClient buffer_size=DEFAULT_BUFFER_SIZE, min_part_size=_DEFAULT_MIN_PART_SIZE, max_concurrency=DEFAULT_MAX_CONCURRENCY, ): """Open an Azure Blob Storage blob for reading or writing. Parameters ---------- container_id: str The name of the container this object resides in. blob_id: str The name of the blob within the bucket. mode: str The mode for opening the object. Must be either "rb" or "wb". client: azure.storage.blob.BlobServiceClient The Azure Blob Storage client to use when working with azure-storage-blob. buffer_size: int, optional The buffer size to use when performing I/O. For reading only. min_part_size: int, optional The minimum part size for multipart uploads. For writing only. max_concurrency: int, optional The number of parallel connections with which to download. For reading only. 
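
    Example
    -------
    A minimal usage sketch (the connection string, container and blob names
    below are placeholders, not values used elsewhere in this project)::

        from azure.storage.blob import BlobServiceClient

        client = BlobServiceClient.from_connection_string('<connection string>')
        with open('my-container', 'my-blob.txt', 'rb', client=client) as fin:
            data = fin.read()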
""" if not client: raise ValueError('you must specify the client to connect to Azure') if mode == smart_open.constants.READ_BINARY: return Reader( container_id, blob_id, client, buffer_size=buffer_size, line_terminator=smart_open.constants.BINARY_NEWLINE, max_concurrency=max_concurrency, ) elif mode == smart_open.constants.WRITE_BINARY: return Writer( container_id, blob_id, client, min_part_size=min_part_size ) else: raise NotImplementedError('Azure Blob Storage support for mode %r not implemented' % mode) class _RawReader(object): """Read an Azure Blob Storage file.""" def __init__(self, blob, size, concurrency): # type: (azure.storage.blob.BlobClient, int, int) -> None self._blob = blob self._size = size self._position = 0 self._concurrency = concurrency def seek(self, position): """Seek to the specified position (byte offset) in the Azure Blob Storage blob. :param int position: The byte offset from the beginning of the blob. Returns the position after seeking. """ self._position = position return self._position def read(self, size=-1): if self._position >= self._size: return b'' binary = self._download_blob_chunk(size) self._position += len(binary) return binary def _download_blob_chunk(self, size): if self._size == self._position: # # When reading, we can't seek to the first byte of an empty file. # Similarly, we can't seek past the last byte. Do nothing here. # return b'' elif size == -1: stream = self._blob.download_blob(offset=self._position, max_concurrency=self._concurrency) else: stream = self._blob.download_blob( offset=self._position, max_concurrency=self._concurrency, length=size) logging.debug('reading with a max concurrency of %d', self._concurrency) if isinstance(stream, azure.storage.blob.StorageStreamDownloader): binary = stream.readall() else: binary = stream.read() return binary class Reader(io.BufferedIOBase): """Reads bytes from Azure Blob Storage. Implements the io.BufferedIOBase interface of the standard library. :raises azure.core.exceptions.ResourceNotFoundError: Raised when the blob to read from does not exist. """ def __init__( self, container, blob, client, # type: azure.storage.blob.BlobServiceClient buffer_size=DEFAULT_BUFFER_SIZE, line_terminator=smart_open.constants.BINARY_NEWLINE, max_concurrency=DEFAULT_MAX_CONCURRENCY, ): self._container_client = client.get_container_client(container) # type: azure.storage.blob.ContainerClient self._blob = self._container_client.get_blob_client(blob) if self._blob is None: raise azure.core.exceptions.ResourceNotFoundError( 'blob %s not found in %s' % (blob, container) ) try: self._size = self._blob.get_blob_properties()['size'] except KeyError: self._size = 0 self._raw_reader = _RawReader(self._blob, self._size, max_concurrency) self._position = 0 self._current_part = smart_open.bytebuffer.ByteBuffer(buffer_size) self._line_terminator = line_terminator # # This member is part of the io.BufferedIOBase interface. # self.raw = None # # Override some methods from io.IOBase. # def close(self): """Flush and close this stream.""" logger.debug("close: called") self._blob = None self._raw_reader = None def readable(self): """Return True if the stream can be read from.""" return True def seekable(self): """If False, seek(), tell() and truncate() will raise IOError. We offer only seek support, and no truncate support.""" return True # # io.BufferedIOBase methods. # def detach(self): """Unsupported.""" raise io.UnsupportedOperation def seek(self, offset, whence=smart_open.constants.WHENCE_START): """Seek to the specified position. 
:param int offset: The offset in bytes. :param int whence: Where the offset is from. Returns the position after seeking.""" logger.debug('seeking to offset: %r whence: %r', offset, whence) if whence not in smart_open.constants.WHENCE_CHOICES: raise ValueError('invalid whence %i, expected one of %r' % (whence, smart_open.constants.WHENCE_CHOICES)) if whence == smart_open.constants.WHENCE_START: new_position = offset elif whence == smart_open.constants.WHENCE_CURRENT: new_position = self._position + offset else: new_position = self._size + offset self._position = new_position self._raw_reader.seek(new_position) logger.debug('current_pos: %r', self._position) self._current_part.empty() return self._position def tell(self): """Return the current position within the file.""" return self._position def truncate(self, size=None): """Unsupported.""" raise io.UnsupportedOperation def read(self, size=-1): """Read up to size bytes from the object and return them.""" if size == 0: return b'' elif size < 0: self._position = self._size return self._read_from_buffer() + self._raw_reader.read() # # Return unused data first # if len(self._current_part) >= size: return self._read_from_buffer(size) if self._position == self._size: return self._read_from_buffer() self._fill_buffer() return self._read_from_buffer(size) def read1(self, size=-1): """This is the same as read().""" return self.read(size=size) def readinto(self, b): """Read up to len(b) bytes into b, and return the number of bytes read.""" data = self.read(len(b)) if not data: return 0 b[:len(data)] = data return len(data) def readline(self, limit=-1): """Read up to and including the next newline. Returns the bytes read.""" if limit != -1: raise NotImplementedError('limits other than -1 not implemented yet') the_line = io.BytesIO() while not (self._position == self._size and len(self._current_part) == 0): # # In the worst case, we're reading the unread part of self._current_part # twice here, once in the if condition and once when calling index. # # This is sub-optimal, but better than the alternative: wrapping # .index in a try..except, because that is slower. # remaining_buffer = self._current_part.peek() if self._line_terminator in remaining_buffer: next_newline = remaining_buffer.index(self._line_terminator) the_line.write(self._read_from_buffer(next_newline + 1)) break else: the_line.write(self._read_from_buffer()) self._fill_buffer() return the_line.getvalue() # # Internal methods. 
# def _read_from_buffer(self, size=-1): """Remove at most size bytes from our buffer and return them.""" # logger.debug('reading %r bytes from %r byte-long buffer', size, len(self._current_part)) size = size if size >= 0 else len(self._current_part) part = self._current_part.read(size) self._position += len(part) # logger.debug('part: %r', part) return part def _fill_buffer(self, size=-1): size = max(size, self._current_part._chunk_size) while len(self._current_part) < size and not self._position == self._size: bytes_read = self._current_part.fill(self._raw_reader) if bytes_read == 0: logger.debug('reached EOF while filling buffer') return True def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): self.close() def __str__(self): return "(%s, %r, %r)" % (self.__class__.__name__, self._container.container_name, self._blob.blob_name) def __repr__(self): return "%s(container=%r, blob=%r)" % ( self.__class__.__name__, self._container_client.container_name, self._blob.blob_name, ) class Writer(io.BufferedIOBase): """Writes bytes to Azure Blob Storage. Implements the io.BufferedIOBase interface of the standard library.""" def __init__( self, container, blob, client, # type: azure.storage.blob.BlobServiceClient min_part_size=_DEFAULT_MIN_PART_SIZE, ): self._client = client self._container_client = self._client.get_container_client(container) # type: azure.storage.blob.ContainerClient self._blob = self._container_client.get_blob_client(blob) # type: azure.storage.blob.BlobClient self._min_part_size = min_part_size self._total_size = 0 self._total_parts = 0 self._bytes_uploaded = 0 self._current_part = io.BytesIO() self._block_list = [] # # This member is part of the io.BufferedIOBase interface. # self.raw = None def flush(self): pass # # Override some methods from io.IOBase. # def close(self): logger.debug("closing") if not self.closed: if self._current_part.tell() > 0: self._upload_part() self._blob.commit_block_list(self._block_list) self._block_list = [] self._client = None logger.debug("successfully closed") @property def closed(self): return self._client is None def writable(self): """Return True if the stream supports writing.""" return True def seekable(self): """If False, seek(), tell() and truncate() will raise IOError. We offer only tell support, and no seek or truncate support.""" return True def seek(self, offset, whence=smart_open.constants.WHENCE_START): """Unsupported.""" raise io.UnsupportedOperation def truncate(self, size=None): """Unsupported.""" raise io.UnsupportedOperation def tell(self): """Return the current stream position.""" return self._total_size # # io.BufferedIOBase methods. # def detach(self): raise io.UnsupportedOperation("detach() not supported") def write(self, b): """Write the given bytes (binary string) to the Azure Blob Storage file. There's buffering happening under the covers, so this may not actually do any HTTP transfer right away.""" if not isinstance(b, _BINARY_TYPES): raise TypeError("input must be one of %r, got: %r" % (_BINARY_TYPES, type(b))) self._current_part.write(b) self._total_size += len(b) if self._current_part.tell() >= self._min_part_size: self._upload_part() return len(b) def _upload_part(self): part_num = self._total_parts + 1 content_length = self._current_part.tell() range_stop = self._bytes_uploaded + content_length - 1 """ # noqa: E501 block_id's must be base64 encoded, all the same length, and less than or equal to 64 bytes in size prior to encoding. 
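        For example, for part number 7 the code below zero-pads it to the
        32-character string '00000000000000000000000000000007' and stages the
        block under the base64 encoding of that string, comfortably within the
        64-byte limit (the part number here is illustrative).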
https://docs.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobclient?view=azure-python#stage-block-block-id--data--length-none----kwargs- """ zero_padded_part_num = str(part_num).zfill(64 // 2) block_id = base64.b64encode(zero_padded_part_num.encode()) self._current_part.seek(0) self._blob.stage_block(block_id, self._current_part.read(content_length)) self._block_list.append(azure.storage.blob.BlobBlock(block_id=block_id)) logger.info( "uploading part #%i, %i bytes (total %.3fGB)", part_num, content_length, range_stop / 1024.0 ** 3, ) self._total_parts += 1 self._bytes_uploaded += content_length self._current_part = io.BytesIO(self._current_part.read()) self._current_part.seek(0, io.SEEK_END) def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): self.close() def __str__(self): return "(%s, %r, %r)" % ( self.__class__.__name__, self._container_client.container_name, self._blob.blob_name ) def __repr__(self): return "%s(container=%r, blob=%r, min_part_size=%r)" % ( self.__class__.__name__, self._container_client.container_name, self._blob.blob_name, self._min_part_size ) smart_open-5.2.1/smart_open/bytebuffer.py000066400000000000000000000135231411241424400205450ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2019 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). # """Implements ByteBuffer class for amortizing network transfer overhead.""" import io class ByteBuffer(object): """Implements a byte buffer that allows callers to read data with minimal copying, and has a fast __len__ method. The buffer is parametrized by its chunk_size, which is the number of bytes that it will read in from the supplied reader or iterable when the buffer is being filled. As primary use case for this buffer is to amortize the overhead costs of transferring data over the network (rather than capping memory consumption), it leads to more predictable performance to always read the same amount of bytes each time the buffer is filled, hence the chunk_size parameter instead of some fixed capacity. The bytes are stored in a bytestring, and previously-read bytes are freed when the buffer is next filled (by slicing the bytestring into a smaller copy). Example ------- Note that while this example works in both Python 2 and 3, the doctest only passes in Python 3 due to the bytestring literals in the expected values. >>> buf = ByteBuffer(chunk_size = 8) >>> message_bytes = iter([b'Hello, W', b'orld!']) >>> buf.fill(message_bytes) 8 >>> len(buf) # only chunk_size bytes are filled 8 >>> buf.peek() b'Hello, W' >>> len(buf) # peek() does not change read position 8 >>> buf.read(6) b'Hello,' >>> len(buf) # read() does change read position 2 >>> buf.fill(message_bytes) 5 >>> buf.read() b' World!' >>> len(buf) 0 """ def __init__(self, chunk_size=io.DEFAULT_BUFFER_SIZE): """Create a ByteBuffer instance that reads chunk_size bytes when filled. Note that the buffer has no maximum size. Parameters ----------- chunk_size: int, optional The the number of bytes that will be read from the supplied reader or iterable when filling the buffer. """ self._chunk_size = chunk_size self.empty() def __len__(self): """Return the number of unread bytes in the buffer as an int""" return len(self._bytes) - self._pos def read(self, size=-1): """Read bytes from the buffer and advance the read position. Returns the bytes in a bytestring. Parameters ---------- size: int, optional Maximum number of bytes to read. 
If negative or not supplied, read all unread bytes in the buffer. Returns ------- bytes """ part = self.peek(size) self._pos += len(part) return part def peek(self, size=-1): """Get bytes from the buffer without advancing the read position. Returns the bytes in a bytestring. Parameters ---------- size: int, optional Maximum number of bytes to return. If negative or not supplied, return all unread bytes in the buffer. Returns ------- bytes """ if size < 0 or size > len(self): size = len(self) part = self._bytes[self._pos:self._pos+size] return part def empty(self): """Remove all bytes from the buffer""" self._bytes = b'' self._pos = 0 def fill(self, source, size=-1): """Fill the buffer with bytes from source until one of these conditions is met: * size bytes have been read from source (if size >= 0); * chunk_size bytes have been read from source; * no more bytes can be read from source; Returns the number of new bytes added to the buffer. Note: all previously-read bytes in the buffer are removed. Parameters ---------- source: a file-like object, or iterable/list that contains bytes The source of bytes to fill the buffer with. If this argument has the `read` attribute, it's assumed to be a file-like object and `read` is called to get the bytes; otherwise it's assumed to be an iterable or list that contains bytes, and a for loop is used to get the bytes. size: int, optional The number of bytes to try to read from source. If not supplied, negative, or larger than the buffer's chunk_size, then chunk_size bytes are read. Note that if source is an iterable or list, then it's possible that more than size bytes will be read if iterating over source produces more than one byte at a time. Returns ------- int, the number of new bytes added to the buffer. """ size = size if size >= 0 else self._chunk_size size = min(size, self._chunk_size) if self._pos != 0: self._bytes = self._bytes[self._pos:] self._pos = 0 if hasattr(source, 'read'): new_bytes = source.read(size) else: new_bytes = b'' for more_bytes in source: new_bytes += more_bytes if len(new_bytes) >= size: break self._bytes += new_bytes return len(new_bytes) def readline(self, terminator): """Read a line from this buffer efficiently. A line is a contiguous sequence of bytes that ends with either: 1. The ``terminator`` character 2. The end of the buffer itself :param byte terminator: The line terminator character. :rtype: bytes """ index = self._bytes.find(terminator, self._pos) if index == -1: size = len(self) else: size = index - self._pos + 1 return self.read(size) smart_open-5.2.1/smart_open/compression.py000066400000000000000000000104541411241424400207510ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2020 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). # """Implements the compression layer of the ``smart_open`` library.""" import logging import os.path logger = logging.getLogger(__name__) _COMPRESSOR_REGISTRY = {} NO_COMPRESSION = 'disable' """Use no compression. Read/write the data as-is.""" INFER_FROM_EXTENSION = 'infer_from_extension' """Determine the compression to use from the file extension. See get_supported_extensions(). """ def get_supported_compression_types(): """Return the list of supported compression types available to open. See compression paratemeter to smart_open.open(). 
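
    With only the codecs this module registers by default, the returned list
    is expected to look like::

        ['disable', 'infer_from_extension', '.bz2', '.gz']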
""" return [NO_COMPRESSION, INFER_FROM_EXTENSION] + get_supported_extensions() def get_supported_extensions(): """Return the list of file extensions for which we have registered compressors.""" return sorted(_COMPRESSOR_REGISTRY.keys()) def register_compressor(ext, callback): """Register a callback for transparently decompressing files with a specific extension. Parameters ---------- ext: str The extension. Must include the leading period, e.g. ``.gz``. callback: callable The callback. It must accept two position arguments, file_obj and mode. This function will be called when ``smart_open`` is opening a file with the specified extension. Examples -------- Instruct smart_open to use the `lzma` module whenever opening a file with a .xz extension (see README.rst for the complete example showing I/O): >>> def _handle_xz(file_obj, mode): ... import lzma ... return lzma.LZMAFile(filename=file_obj, mode=mode, format=lzma.FORMAT_XZ) >>> >>> register_compressor('.xz', _handle_xz) """ if not (ext and ext[0] == '.'): raise ValueError('ext must be a string starting with ., not %r' % ext) if ext in _COMPRESSOR_REGISTRY: logger.warning('overriding existing compression handler for %r', ext) _COMPRESSOR_REGISTRY[ext] = callback def tweak_close(outer, inner): """Ensure that closing the `outer` stream closes the `inner` stream as well. Use this when your compression library's `close` method does not automatically close the underlying filestream. See https://github.com/RaRe-Technologies/smart_open/issues/630 for an explanation why that is a problem for smart_open. """ outer_close = outer.close def close_both(*args): nonlocal inner try: outer_close() finally: if inner: inner, fp = None, inner fp.close() outer.close = close_both def _handle_bz2(file_obj, mode): from bz2 import BZ2File result = BZ2File(file_obj, mode) tweak_close(result, file_obj) return result def _handle_gzip(file_obj, mode): import gzip result = gzip.GzipFile(fileobj=file_obj, mode=mode) tweak_close(result, file_obj) return result def compression_wrapper(file_obj, mode, compression): """ This function will wrap the file_obj with an appropriate [de]compression mechanism based on the specified extension. file_obj must either be a filehandle object, or a class which behaves like one. It must have a .name attribute. If the filename extension isn't recognized, will simply return the original file_obj. """ if compression == NO_COMPRESSION: return file_obj elif compression == INFER_FROM_EXTENSION: try: filename = file_obj.name except (AttributeError, TypeError): logger.warning( 'unable to transparently decompress %r because it ' 'seems to lack a string-like .name', file_obj ) return file_obj _, compression = os.path.splitext(filename) if compression in _COMPRESSOR_REGISTRY and mode.endswith('+'): raise ValueError('transparent (de)compression unsupported for mode %r' % mode) try: callback = _COMPRESSOR_REGISTRY[compression] except KeyError: return file_obj else: return callback(file_obj, mode) # # NB. avoid using lambda here to make stack traces more readable. # register_compressor('.bz2', _handle_bz2) register_compressor('.gz', _handle_gzip) smart_open-5.2.1/smart_open/concurrency.py000066400000000000000000000047601411241424400207450ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2020 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). # """Common functionality for concurrent processing. The main entry point is :func:`create_pool`. 
""" import contextlib import logging import warnings logger = logging.getLogger(__name__) # AWS Lambda environments do not support multiprocessing.Queue or multiprocessing.Pool. # However they do support Threads and therefore concurrent.futures's ThreadPoolExecutor. # We use this flag to allow python 2 backward compatibility, where concurrent.futures doesn't exist. _CONCURRENT_FUTURES = False try: import concurrent.futures _CONCURRENT_FUTURES = True except ImportError: warnings.warn("concurrent.futures could not be imported and won't be used") # Multiprocessing is unavailable in App Engine (and possibly other sandboxes). # The only method currently relying on it is iter_bucket, which is instructed # whether to use it by the MULTIPROCESSING flag. _MULTIPROCESSING = False try: import multiprocessing.pool _MULTIPROCESSING = True except ImportError: warnings.warn("multiprocessing could not be imported and won't be used") class DummyPool(object): """A class that mimics multiprocessing.pool.Pool for our purposes.""" def imap_unordered(self, function, items): return map(function, items) def terminate(self): pass class ConcurrentFuturesPool(object): """A class that mimics multiprocessing.pool.Pool but uses concurrent futures instead of processes.""" def __init__(self, max_workers): self.executor = concurrent.futures.ThreadPoolExecutor(max_workers) def imap_unordered(self, function, items): futures = [self.executor.submit(function, item) for item in items] for future in concurrent.futures.as_completed(futures): yield future.result() def terminate(self): self.executor.shutdown(wait=True) @contextlib.contextmanager def create_pool(processes=1): if _MULTIPROCESSING and processes: logger.info("creating multiprocessing pool with %i workers", processes) pool = multiprocessing.pool.Pool(processes=processes) elif _CONCURRENT_FUTURES and processes: logger.info("creating concurrent futures pool with %i workers", processes) pool = ConcurrentFuturesPool(max_workers=processes) else: logger.info("creating dummy pool") pool = DummyPool() yield pool pool.terminate() smart_open-5.2.1/smart_open/constants.py000066400000000000000000000007231411241424400204220ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2020 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). # """Some universal constants that are common to I/O operations.""" READ_BINARY = 'rb' WRITE_BINARY = 'wb' BINARY_MODES = (READ_BINARY, WRITE_BINARY) BINARY_NEWLINE = b'\n' WHENCE_START = 0 WHENCE_CURRENT = 1 WHENCE_END = 2 WHENCE_CHOICES = (WHENCE_START, WHENCE_CURRENT, WHENCE_END) smart_open-5.2.1/smart_open/doctools.py000066400000000000000000000147571411241424400202500ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2019 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). # """Common functions for working with docstrings. For internal use only. """ import contextlib import inspect import io import os.path import re from . import compression from . import transport PLACEHOLDER = ' smart_open/doctools.py magic goes here' def extract_kwargs(docstring): """Extract keyword argument documentation from a function's docstring. Parameters ---------- docstring: str The docstring to extract keyword arguments from. Returns ------- list of (str, str, list str) str The name of the keyword argument. str Its type. str Its documentation as a list of lines. Notes ----- The implementation is rather fragile. 
It expects the following: 1. The parameters are under an underlined Parameters section 2. Keyword parameters have the literal ", optional" after the type 3. Names and types are not indented 4. Descriptions are indented with 4 spaces 5. The Parameters section ends with an empty line. Examples -------- >>> docstring = '''The foo function. ... Parameters ... ---------- ... bar: str, optional ... This parameter is the bar. ... baz: int, optional ... This parameter is the baz. ... ... ''' >>> kwargs = extract_kwargs(docstring) >>> kwargs[0] ('bar', 'str, optional', ['This parameter is the bar.']) """ if not docstring: return [] lines = inspect.cleandoc(docstring).split('\n') retval = [] # # 1. Find the underlined 'Parameters' section # 2. Once there, continue parsing parameters until we hit an empty line # while lines and lines[0] != 'Parameters': lines.pop(0) if not lines: return [] lines.pop(0) lines.pop(0) while lines and lines[0]: name, type_ = lines.pop(0).split(':', 1) description = [] while lines and lines[0].startswith(' '): description.append(lines.pop(0).strip()) if 'optional' in type_: retval.append((name.strip(), type_.strip(), description)) return retval def to_docstring(kwargs, lpad=''): """Reconstruct a docstring from keyword argument info. Basically reverses :func:`extract_kwargs`. Parameters ---------- kwargs: list Output from the extract_kwargs function lpad: str, optional Padding string (from the left). Returns ------- str The docstring snippet documenting the keyword arguments. Examples -------- >>> kwargs = [ ... ('bar', 'str, optional', ['This parameter is the bar.']), ... ('baz', 'int, optional', ['This parameter is the baz.']), ... ] >>> print(to_docstring(kwargs), end='') bar: str, optional This parameter is the bar. baz: int, optional This parameter is the baz. """ buf = io.StringIO() for name, type_, description in kwargs: buf.write('%s%s: %s\n' % (lpad, name, type_)) for line in description: buf.write('%s %s\n' % (lpad, line)) return buf.getvalue() def extract_examples_from_readme_rst(indent=' '): """Extract examples from this project's README.rst file. Parameters ---------- indent: str Prepend each line with this string. Should contain some number of spaces. Returns ------- str The examples. Notes ----- Quite fragile, depends on named labels inside the README.rst file. """ curr_dir = os.path.dirname(os.path.abspath(__file__)) readme_path = os.path.join(curr_dir, '..', 'README.rst') try: with open(readme_path) as fin: lines = list(fin) start = lines.index('.. _doctools_before_examples:\n') end = lines.index(".. 
_doctools_after_examples:\n") lines = lines[start+4:end-2] return ''.join([indent + re.sub('^ ', '', line) for line in lines]) except Exception: return indent + 'See README.rst' def tweak_open_docstring(f): buf = io.StringIO() seen = set() root_path = os.path.dirname(os.path.dirname(__file__)) with contextlib.redirect_stdout(buf): print(' smart_open supports the following transport mechanisms:') print() for scheme, submodule in sorted(transport._REGISTRY.items()): if scheme == transport.NO_SCHEME or submodule in seen: continue seen.add(submodule) relpath = os.path.relpath(submodule.__file__, start=root_path) heading = '%s (%s)' % (scheme, relpath) print(' %s' % heading) print(' %s' % ('~' * len(heading))) print(' %s' % submodule.__doc__.split('\n')[0]) print() kwargs = extract_kwargs(submodule.open.__doc__) if kwargs: print(to_docstring(kwargs, lpad=u' ')) print(' Examples') print(' --------') print() print(extract_examples_from_readme_rst()) print(' This function also supports transparent compression and decompression ') print(' using the following codecs:') print() for extension in compression.get_supported_extensions(): print(' * %s' % extension) print() print(' The function depends on the file extension to determine the appropriate codec.') # # The docstring can be None if -OO was passed to the interpreter. # if f.__doc__: f.__doc__ = f.__doc__.replace(PLACEHOLDER, buf.getvalue()) def tweak_parse_uri_docstring(f): buf = io.StringIO() seen = set() schemes = [] examples = [] for scheme, submodule in sorted(transport._REGISTRY.items()): if scheme == transport.NO_SCHEME or submodule in seen: continue schemes.append(scheme) seen.add(submodule) try: examples.extend(submodule.URI_EXAMPLES) except AttributeError: pass with contextlib.redirect_stdout(buf): print(' Supported URI schemes are:') print() for scheme in schemes: print(' * %s' % scheme) print() print(' Valid URI examples::') print() for example in examples: print(' * %s' % example) if f.__doc__: f.__doc__ = f.__doc__.replace(PLACEHOLDER, buf.getvalue()) smart_open-5.2.1/smart_open/gcs.py000066400000000000000000000463061411241424400171710ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2019 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). # """Implements file-like objects for reading and writing to/from GCS.""" import io import logging try: import google.cloud.exceptions import google.cloud.storage import google.auth.transport.requests except ImportError: MISSING_DEPS = True import smart_open.bytebuffer import smart_open.utils from smart_open import constants logger = logging.getLogger(__name__) _BINARY_TYPES = (bytes, bytearray, memoryview) """Allowed binary buffer types for writing to the underlying GCS stream""" _UNKNOWN = '*' SCHEME = "gs" """Supported scheme for GCS""" _MIN_MIN_PART_SIZE = _REQUIRED_CHUNK_MULTIPLE = 256 * 1024 """Google requires you to upload in multiples of 256 KB, except for the last part.""" _DEFAULT_MIN_PART_SIZE = 50 * 1024**2 """Default minimum part size for GCS multipart uploads""" DEFAULT_BUFFER_SIZE = 256 * 1024 """Default buffer size for working with GCS""" _UPLOAD_INCOMPLETE_STATUS_CODES = (308, ) _UPLOAD_COMPLETE_STATUS_CODES = (200, 201) def _make_range_string(start, stop=None, end=None): # # GCS seems to violate RFC-2616 (see utils.make_range_string), so we # need a separate implementation. 
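    #
    # A few illustrative values (sketch, not executed):
    #
    #   _make_range_string(0, 262143)              -> 'bytes 0-262143/*'
    #   _make_range_string(0, 262143, end=524288)  -> 'bytes 0-262143/524288'
    #   _make_range_string(524288)                 -> 'bytes 524288-/*'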
# # https://cloud.google.com/storage/docs/xml-api/resumable-upload#step_3upload_the_file_blocks # if end is None: end = _UNKNOWN if stop is None: return 'bytes %d-/%s' % (start, end) return 'bytes %d-%d/%s' % (start, stop, end) class UploadFailedError(Exception): def __init__(self, message, status_code, text): """Raise when a multi-part upload to GCS returns a failed response status code. Parameters ---------- message: str The error message to display. status_code: int The status code returned from the upload response. text: str The text returned from the upload response. """ super(UploadFailedError, self).__init__(message) self.status_code = status_code self.text = text def _fail(response, part_num, content_length, total_size, headers): status_code = response.status_code response_text = response.text total_size_gb = total_size / 1024.0 ** 3 msg = ( "upload failed (status code: %(status_code)d, response text: %(response_text)s), " "part #%(part_num)d, %(total_size)d bytes (total %(total_size_gb).3fGB), headers: %(headers)r" ) % locals() raise UploadFailedError(msg, response.status_code, response.text) def parse_uri(uri_as_string): sr = smart_open.utils.safe_urlsplit(uri_as_string) assert sr.scheme == SCHEME bucket_id = sr.netloc blob_id = sr.path.lstrip('/') return dict(scheme=SCHEME, bucket_id=bucket_id, blob_id=blob_id) def open_uri(uri, mode, transport_params): parsed_uri = parse_uri(uri) kwargs = smart_open.utils.check_kwargs(open, transport_params) return open(parsed_uri['bucket_id'], parsed_uri['blob_id'], mode, **kwargs) def open( bucket_id, blob_id, mode, buffer_size=DEFAULT_BUFFER_SIZE, min_part_size=_MIN_MIN_PART_SIZE, client=None, # type: google.cloud.storage.Client blob_properties=None ): """Open an GCS blob for reading or writing. Parameters ---------- bucket_id: str The name of the bucket this object resides in. blob_id: str The name of the blob within the bucket. mode: str The mode for opening the object. Must be either "rb" or "wb". buffer_size: int, optional The buffer size to use when performing I/O. For reading only. min_part_size: int, optional The minimum part size for multipart uploads. For writing only. client: google.cloud.storage.Client, optional The GCS client to use when working with google-cloud-storage. blob_properties: dict, optional Set properties on blob before writing. For writing only. """ if mode == constants.READ_BINARY: fileobj = Reader( bucket_id, blob_id, buffer_size=buffer_size, line_terminator=constants.BINARY_NEWLINE, client=client, ) elif mode == constants.WRITE_BINARY: fileobj = Writer( bucket_id, blob_id, min_part_size=min_part_size, client=client, blob_properties=blob_properties, ) else: raise NotImplementedError('GCS support for mode %r not implemented' % mode) fileobj.name = blob_id return fileobj class _RawReader(object): """Read an GCS object.""" def __init__(self, gcs_blob, size): # type: (google.cloud.storage.Blob, int) -> None self._blob = gcs_blob self._size = size self._position = 0 def seek(self, position): """Seek to the specified position (byte offset) in the GCS key. :param int position: The byte offset from the beginning of the key. Returns the position after seeking. 
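        Note that seeking here is lazy: no data is downloaded, and the new
        position only takes effect on the next :meth:`read` call.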
""" self._position = position return self._position def read(self, size=-1): if self._position >= self._size: return b'' binary = self._download_blob_chunk(size) self._position += len(binary) return binary def _download_blob_chunk(self, size): start = position = self._position if position == self._size: # # When reading, we can't seek to the first byte of an empty file. # Similarly, we can't seek past the last byte. Do nothing here. # binary = b'' elif size == -1: binary = self._blob.download_as_bytes(start=start) else: end = position + size binary = self._blob.download_as_bytes(start=start, end=end) return binary class Reader(io.BufferedIOBase): """Reads bytes from GCS. Implements the io.BufferedIOBase interface of the standard library. :raises google.cloud.exceptions.NotFound: Raised when the blob to read from does not exist. """ def __init__( self, bucket, key, buffer_size=DEFAULT_BUFFER_SIZE, line_terminator=constants.BINARY_NEWLINE, client=None, # type: google.cloud.storage.Client ): if client is None: client = google.cloud.storage.Client() self._blob = client.bucket(bucket).get_blob(key) # type: google.cloud.storage.Blob if self._blob is None: raise google.cloud.exceptions.NotFound('blob %s not found in %s' % (key, bucket)) self._size = self._blob.size if self._blob.size is not None else 0 self._raw_reader = _RawReader(self._blob, self._size) self._current_pos = 0 self._current_part_size = buffer_size self._current_part = smart_open.bytebuffer.ByteBuffer(buffer_size) self._eof = False self._line_terminator = line_terminator # # This member is part of the io.BufferedIOBase interface. # self.raw = None # # Override some methods from io.IOBase. # def close(self): """Flush and close this stream.""" logger.debug("close: called") self._blob = None self._current_part = None self._raw_reader = None def readable(self): """Return True if the stream can be read from.""" return True def seekable(self): """If False, seek(), tell() and truncate() will raise IOError. We offer only seek support, and no truncate support.""" return True # # io.BufferedIOBase methods. # def detach(self): """Unsupported.""" raise io.UnsupportedOperation def seek(self, offset, whence=constants.WHENCE_START): """Seek to the specified position. :param int offset: The offset in bytes. :param int whence: Where the offset is from. 
Returns the position after seeking.""" logger.debug('seeking to offset: %r whence: %r', offset, whence) if whence not in constants.WHENCE_CHOICES: raise ValueError('invalid whence, expected one of %r' % constants.WHENCE_CHOICES) if whence == constants.WHENCE_START: new_position = offset elif whence == constants.WHENCE_CURRENT: new_position = self._current_pos + offset else: new_position = self._size + offset new_position = smart_open.utils.clamp(new_position, 0, self._size) self._current_pos = new_position self._raw_reader.seek(new_position) logger.debug('current_pos: %r', self._current_pos) self._current_part.empty() self._eof = self._current_pos == self._size return self._current_pos def tell(self): """Return the current position within the file.""" return self._current_pos def truncate(self, size=None): """Unsupported.""" raise io.UnsupportedOperation def read(self, size=-1): """Read up to size bytes from the object and return them.""" if size == 0: return b'' elif size < 0: self._current_pos = self._size return self._read_from_buffer() + self._raw_reader.read() # # Return unused data first # if len(self._current_part) >= size: return self._read_from_buffer(size) # # If the stream is finished, return what we have. # if self._eof: return self._read_from_buffer() # # Fill our buffer to the required size. # self._fill_buffer(size) return self._read_from_buffer(size) def read1(self, size=-1): """This is the same as read().""" return self.read(size=size) def readinto(self, b): """Read up to len(b) bytes into b, and return the number of bytes read.""" data = self.read(len(b)) if not data: return 0 b[:len(data)] = data return len(data) def readline(self, limit=-1): """Read up to and including the next newline. Returns the bytes read.""" if limit != -1: raise NotImplementedError('limits other than -1 not implemented yet') the_line = io.BytesIO() while not (self._eof and len(self._current_part) == 0): # # In the worst case, we're reading the unread part of self._current_part # twice here, once in the if condition and once when calling index. # # This is sub-optimal, but better than the alternative: wrapping # .index in a try..except, because that is slower. # remaining_buffer = self._current_part.peek() if self._line_terminator in remaining_buffer: next_newline = remaining_buffer.index(self._line_terminator) the_line.write(self._read_from_buffer(next_newline + 1)) break else: the_line.write(self._read_from_buffer()) self._fill_buffer() return the_line.getvalue() # # Internal methods. # def _read_from_buffer(self, size=-1): """Remove at most size bytes from our buffer and return them.""" # logger.debug('reading %r bytes from %r byte-long buffer', size, len(self._current_part)) size = size if size >= 0 else len(self._current_part) part = self._current_part.read(size) self._current_pos += len(part) # logger.debug('part: %r', part) return part def _fill_buffer(self, size=-1): size = size if size >= 0 else self._current_part._chunk_size while len(self._current_part) < size and not self._eof: bytes_read = self._current_part.fill(self._raw_reader) if bytes_read == 0: logger.debug('reached EOF while filling buffer') self._eof = True def __str__(self): return "(%s, %r, %r)" % (self.__class__.__name__, self._blob.bucket.name, self._blob.name) def __repr__(self): return "%s(bucket=%r, blob=%r, buffer_size=%r)" % ( self.__class__.__name__, self._blob.bucket.name, self._blob.name, self._current_part_size, ) class Writer(io.BufferedIOBase): """Writes bytes to GCS. 
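    Data is sent with the GCS resumable upload API; every part except the
    last is a multiple of 256 kB and at least ``min_part_size`` bytes long.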
Implements the io.BufferedIOBase interface of the standard library.""" def __init__( self, bucket, blob, min_part_size=_DEFAULT_MIN_PART_SIZE, client=None, # type: google.cloud.storage.Client blob_properties=None, ): if client is None: client = google.cloud.storage.Client() self._client = client self._blob = self._client.bucket(bucket).blob(blob) # type: google.cloud.storage.Blob assert min_part_size % _REQUIRED_CHUNK_MULTIPLE == 0, 'min part size must be a multiple of 256KB' assert min_part_size >= _MIN_MIN_PART_SIZE, 'min part size must be greater than 256KB' self._min_part_size = min_part_size self._total_size = 0 self._total_parts = 0 self._bytes_uploaded = 0 self._current_part = io.BytesIO() self._session = google.auth.transport.requests.AuthorizedSession(client._credentials) if blob_properties: for k, v in blob_properties.items(): setattr(self._blob, k, v) # # https://cloud.google.com/storage/docs/json_api/v1/how-tos/resumable-upload#start-resumable # self._resumable_upload_url = self._blob.create_resumable_upload_session() # # This member is part of the io.BufferedIOBase interface. # self.raw = None def flush(self): pass # # Override some methods from io.IOBase. # def close(self): logger.debug("closing") if not self.closed: if self._total_size == 0: # empty files self._upload_empty_part() else: self._upload_part(is_last=True) self._client = None logger.debug("successfully closed") @property def closed(self): return self._client is None def writable(self): """Return True if the stream supports writing.""" return True def seekable(self): """If False, seek(), tell() and truncate() will raise IOError. We offer only tell support, and no seek or truncate support.""" return True def seek(self, offset, whence=constants.WHENCE_START): """Unsupported.""" raise io.UnsupportedOperation def truncate(self, size=None): """Unsupported.""" raise io.UnsupportedOperation def tell(self): """Return the current stream position.""" return self._total_size # # io.BufferedIOBase methods. # def detach(self): raise io.UnsupportedOperation("detach() not supported") def write(self, b): """Write the given bytes (binary string) to the GCS file. There's buffering happening under the covers, so this may not actually do any HTTP transfer right away.""" if not isinstance(b, _BINARY_TYPES): raise TypeError("input must be one of %r, got: %r" % (_BINARY_TYPES, type(b))) self._current_part.write(b) self._total_size += len(b) # # If the size of this part is precisely equal to the minimum part size, # we don't perform the actual write now, and wait until we see more data. # We do this because the very last part of the upload must be handled slightly # differently (see comments in the _upload_part method). # if self._current_part.tell() > self._min_part_size: self._upload_part() return len(b) def terminate(self): """Cancel the underlying resumable upload.""" # # https://cloud.google.com/storage/docs/xml-api/resumable-upload#example_cancelling_an_upload # self._session.delete(self._resumable_upload_url) # # Internal methods. # def _upload_part(self, is_last=False): part_num = self._total_parts + 1 # # Here we upload the largest amount possible given GCS's restriction # of parts being multiples of 256kB, except for the last one. # # A final upload of 0 bytes does not work, so we need to guard against # this edge case. This results in occasionally keeping an additional # 256kB in the buffer after uploading a part, but until this is fixed # on Google's end there is no other option. 
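        #
        # Worked example (sketch): if exactly 100 MB is buffered and
        # min_part_size is 50 MB, the remainder is 0, so 100 MB - 256 kB is
        # uploaded now and 256 kB is held back to guarantee a non-empty
        # final part.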
# # https://stackoverflow.com/questions/60230631/upload-zero-size-final-part-to-google-cloud-storage-resumable-upload # content_length = self._current_part.tell() remainder = content_length % self._min_part_size if is_last: end = self._bytes_uploaded + content_length elif remainder == 0: content_length -= _REQUIRED_CHUNK_MULTIPLE end = None else: content_length -= remainder end = None range_stop = self._bytes_uploaded + content_length - 1 content_range = _make_range_string(self._bytes_uploaded, range_stop, end=end) headers = { 'Content-Length': str(content_length), 'Content-Range': content_range, } logger.info( "uploading part #%i, %i bytes (total %.3fGB) headers %r", part_num, content_length, range_stop / 1024.0 ** 3, headers, ) self._current_part.seek(0) response = self._session.put( self._resumable_upload_url, data=self._current_part.read(content_length), headers=headers, ) if is_last: expected = _UPLOAD_COMPLETE_STATUS_CODES else: expected = _UPLOAD_INCOMPLETE_STATUS_CODES if response.status_code not in expected: _fail(response, part_num, content_length, self._total_size, headers) logger.debug("upload of part #%i finished" % part_num) self._total_parts += 1 self._bytes_uploaded += content_length # # For the last part, the below _current_part handling is a NOOP. # self._current_part = io.BytesIO(self._current_part.read()) self._current_part.seek(0, io.SEEK_END) def _upload_empty_part(self): logger.debug("creating empty file") headers = {'Content-Length': '0'} response = self._session.put(self._resumable_upload_url, headers=headers) if response.status_code not in _UPLOAD_COMPLETE_STATUS_CODES: _fail(response, self._total_parts + 1, 0, self._total_size, headers) self._total_parts += 1 def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): if exc_type is not None: self.terminate() else: self.close() def __str__(self): return "(%s, %r, %r)" % (self.__class__.__name__, self._blob.bucket.name, self._blob.name) def __repr__(self): return "%s(bucket=%r, blob=%r, min_part_size=%r)" % ( self.__class__.__name__, self._blob.bucket.name, self._blob.name, self._min_part_size, ) smart_open-5.2.1/smart_open/hdfs.py000066400000000000000000000076321411241424400173400ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2019 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). # """Implements reading and writing to/from HDFS. The main entry point is the :func:`~smart_open.hdfs.open` function. Uses the command-line hdfs utility under the covers. 
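A minimal sketch of typical usage (the path below is hypothetical; the
``hdfs`` binary must be on the PATH and able to reach your cluster)::

    from smart_open import open

    with open('hdfs:///user/hadoop/example.txt', 'rb') as fin:
        data = fin.read()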
""" import io import logging import subprocess import urllib.parse from smart_open import utils logger = logging.getLogger(__name__) SCHEME = 'hdfs' URI_EXAMPLES = ( 'hdfs:///path/file', 'hdfs://path/file', ) def parse_uri(uri_as_string): split_uri = urllib.parse.urlsplit(uri_as_string) assert split_uri.scheme == SCHEME uri_path = split_uri.netloc + split_uri.path uri_path = "/" + uri_path.lstrip("/") if not uri_path: raise RuntimeError("invalid HDFS URI: %r" % uri_as_string) return dict(scheme=SCHEME, uri_path=uri_path) def open_uri(uri, mode, transport_params): utils.check_kwargs(open, transport_params) parsed_uri = parse_uri(uri) fobj = open(parsed_uri['uri_path'], mode) fobj.name = parsed_uri['uri_path'].split('/')[-1] return fobj def open(uri, mode): if mode == 'rb': return CliRawInputBase(uri) elif mode == 'wb': return CliRawOutputBase(uri) else: raise NotImplementedError('hdfs support for mode %r not implemented' % mode) class CliRawInputBase(io.RawIOBase): """Reads bytes from HDFS via the "hdfs dfs" command-line interface. Implements the io.RawIOBase interface of the standard library. """ def __init__(self, uri): self._uri = uri self._sub = subprocess.Popen(["hdfs", "dfs", '-cat', self._uri], stdout=subprocess.PIPE) # # This member is part of the io.BufferedIOBase interface. # self.raw = None # # Override some methods from io.IOBase. # def close(self): """Flush and close this stream.""" logger.debug("close: called") self._sub.terminate() self._sub = None def readable(self): """Return True if the stream can be read from.""" return self._sub is not None def seekable(self): """If False, seek(), tell() and truncate() will raise IOError.""" return False # # io.RawIOBase methods. # def detach(self): """Unsupported.""" raise io.UnsupportedOperation def read(self, size=-1): """Read up to size bytes from the object and return them.""" return self._sub.stdout.read(size) def read1(self, size=-1): """This is the same as read().""" return self.read(size=size) def readinto(self, b): """Read up to len(b) bytes into b, and return the number of bytes read.""" data = self.read(len(b)) if not data: return 0 b[:len(data)] = data return len(data) class CliRawOutputBase(io.RawIOBase): """Writes bytes to HDFS via the "hdfs dfs" command-line interface. Implements the io.RawIOBase interface of the standard library. """ def __init__(self, uri): self._uri = uri self._sub = subprocess.Popen(["hdfs", "dfs", '-put', '-f', '-', self._uri], stdin=subprocess.PIPE) # # This member is part of the io.RawIOBase interface. # self.raw = None def close(self): self.flush() self._sub.stdin.close() self._sub.wait() def flush(self): self._sub.stdin.flush() def writeable(self): """Return True if this object is writeable.""" return self._sub is not None def seekable(self): """If False, seek(), tell() and truncate() will raise IOError.""" return False def write(self, b): self._sub.stdin.write(b) # # io.IOBase methods. # def detach(self): raise io.UnsupportedOperation("detach() not supported") smart_open-5.2.1/smart_open/http.py000066400000000000000000000241021411241424400173620ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2019 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). 
# """Implements file-like objects for reading from http.""" import io import logging import os.path import urllib.parse try: import requests except ImportError: MISSING_DEPS = True from smart_open import bytebuffer, constants import smart_open.utils DEFAULT_BUFFER_SIZE = 128 * 1024 SCHEMES = ('http', 'https') logger = logging.getLogger(__name__) _HEADERS = {'Accept-Encoding': 'identity'} """The headers we send to the server with every HTTP request. For now, we ask the server to send us the files as they are. Sometimes, servers compress the file for more efficient transfer, in which case the client (us) has to decompress them with the appropriate algorithm. """ def parse_uri(uri_as_string): split_uri = urllib.parse.urlsplit(uri_as_string) assert split_uri.scheme in SCHEMES uri_path = split_uri.netloc + split_uri.path uri_path = "/" + uri_path.lstrip("/") return dict(scheme=split_uri.scheme, uri_path=uri_path) def open_uri(uri, mode, transport_params): kwargs = smart_open.utils.check_kwargs(open, transport_params) return open(uri, mode, **kwargs) def open(uri, mode, kerberos=False, user=None, password=None, headers=None, timeout=None): """Implement streamed reader from a web site. Supports Kerberos and Basic HTTP authentication. Parameters ---------- url: str The URL to open. mode: str The mode to open using. kerberos: boolean, optional If True, will attempt to use the local Kerberos credentials user: str, optional The username for authenticating over HTTP password: str, optional The password for authenticating over HTTP headers: dict, optional Any headers to send in the request. If ``None``, the default headers are sent: ``{'Accept-Encoding': 'identity'}``. To use no headers at all, set this variable to an empty dict, ``{}``. Note ---- If neither kerberos or (user, password) are set, will connect unauthenticated, unless set separately in headers. """ if mode == constants.READ_BINARY: fobj = SeekableBufferedInputBase( uri, mode, kerberos=kerberos, user=user, password=password, headers=headers, timeout=timeout, ) fobj.name = os.path.basename(urllib.parse.urlparse(uri).path) return fobj else: raise NotImplementedError('http support for mode %r not implemented' % mode) class BufferedInputBase(io.BufferedIOBase): def __init__(self, url, mode='r', buffer_size=DEFAULT_BUFFER_SIZE, kerberos=False, user=None, password=None, headers=None, timeout=None): if kerberos: import requests_kerberos auth = requests_kerberos.HTTPKerberosAuth() elif user is not None and password is not None: auth = (user, password) else: auth = None self.buffer_size = buffer_size self.mode = mode if headers is None: self.headers = _HEADERS.copy() else: self.headers = headers self.timeout = timeout self.response = requests.get( url, auth=auth, stream=True, headers=self.headers, timeout=self.timeout, ) if not self.response.ok: self.response.raise_for_status() self._read_iter = self.response.iter_content(self.buffer_size) self._read_buffer = bytebuffer.ByteBuffer(buffer_size) self._current_pos = 0 # # This member is part of the io.BufferedIOBase interface. # self.raw = None # # Override some methods from io.IOBase. # def close(self): """Flush and close this stream.""" logger.debug("close: called") self.response = None self._read_iter = None def readable(self): """Return True if the stream can be read from.""" return True def seekable(self): return False # # io.BufferedIOBase methods. # def detach(self): """Unsupported.""" raise io.UnsupportedOperation def read(self, size=-1): """ Mimics the read call to a filehandle object. 
""" logger.debug("reading with size: %d", size) if self.response is None: return b'' if size == 0: return b'' elif size < 0 and len(self._read_buffer) == 0: retval = self.response.raw.read() elif size < 0: retval = self._read_buffer.read() + self.response.raw.read() else: while len(self._read_buffer) < size: logger.debug( "http reading more content at current_pos: %d with size: %d", self._current_pos, size, ) bytes_read = self._read_buffer.fill(self._read_iter) if bytes_read == 0: # Oops, ran out of data early. retval = self._read_buffer.read() self._current_pos += len(retval) return retval # If we got here, it means we have enough data in the buffer # to return to the caller. retval = self._read_buffer.read(size) self._current_pos += len(retval) return retval def read1(self, size=-1): """This is the same as read().""" return self.read(size=size) def readinto(self, b): """Read up to len(b) bytes into b, and return the number of bytes read.""" data = self.read(len(b)) if not data: return 0 b[:len(data)] = data return len(data) class SeekableBufferedInputBase(BufferedInputBase): """ Implement seekable streamed reader from a web site. Supports Kerberos and Basic HTTP authentication. """ def __init__(self, url, mode='r', buffer_size=DEFAULT_BUFFER_SIZE, kerberos=False, user=None, password=None, headers=None, timeout=None): """ If Kerberos is True, will attempt to use the local Kerberos credentials. Otherwise, will try to use "basic" HTTP authentication via username/password. If none of those are set, will connect unauthenticated. """ self.url = url if kerberos: import requests_kerberos self.auth = requests_kerberos.HTTPKerberosAuth() elif user is not None and password is not None: self.auth = (user, password) else: self.auth = None if headers is None: self.headers = _HEADERS.copy() else: self.headers = headers self.timeout = timeout self.buffer_size = buffer_size self.mode = mode self.response = self._partial_request() if not self.response.ok: self.response.raise_for_status() logger.debug('self.response: %r, raw: %r', self.response, self.response.raw) self.content_length = int(self.response.headers.get("Content-Length", -1)) # # We assume the HTTP stream is seekable unless the server explicitly # tells us it isn't. It's better to err on the side of "seekable" # because we don't want to prevent users from seeking a stream that # does not appear to be seekable but really is. # self._seekable = self.response.headers.get("Accept-Ranges", "").lower() != "none" self._read_iter = self.response.iter_content(self.buffer_size) self._read_buffer = bytebuffer.ByteBuffer(buffer_size) self._current_pos = 0 # # This member is part of the io.BufferedIOBase interface. # self.raw = None def seek(self, offset, whence=0): """Seek to the specified position. :param int offset: The offset in bytes. :param int whence: Where the offset is from. 
Returns the position after seeking.""" logger.debug('seeking to offset: %r whence: %r', offset, whence) if whence not in constants.WHENCE_CHOICES: raise ValueError('invalid whence, expected one of %r' % constants.WHENCE_CHOICES) if not self.seekable(): raise OSError('stream is not seekable') if whence == constants.WHENCE_START: new_pos = offset elif whence == constants.WHENCE_CURRENT: new_pos = self._current_pos + offset elif whence == constants.WHENCE_END: new_pos = self.content_length + offset if self.content_length == -1: new_pos = smart_open.utils.clamp(new_pos, maxval=None) else: new_pos = smart_open.utils.clamp(new_pos, maxval=self.content_length) if self._current_pos == new_pos: return self._current_pos logger.debug("http seeking from current_pos: %d to new_pos: %d", self._current_pos, new_pos) self._current_pos = new_pos if new_pos == self.content_length: self.response = None self._read_iter = None self._read_buffer.empty() else: response = self._partial_request(new_pos) if response.ok: self.response = response self._read_iter = self.response.iter_content(self.buffer_size) self._read_buffer.empty() else: self.response = None return self._current_pos def tell(self): return self._current_pos def seekable(self, *args, **kwargs): return self._seekable def truncate(self, size=None): """Unsupported.""" raise io.UnsupportedOperation def _partial_request(self, start_pos=None): if start_pos is not None: self.headers.update({"range": smart_open.utils.make_range_string(start_pos)}) response = requests.get( self.url, auth=self.auth, stream=True, headers=self.headers, timeout=self.timeout, ) return response smart_open-5.2.1/smart_open/local_file.py000066400000000000000000000017541411241424400205040ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2020 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). # """Implements the transport for the file:// schema.""" import io import os.path SCHEME = 'file' URI_EXAMPLES = ( './local/path/file', '~/local/path/file', 'local/path/file', './local/path/file.gz', 'file:///home/user/file', 'file:///home/user/file.bz2', ) open = io.open def parse_uri(uri_as_string): local_path = extract_local_path(uri_as_string) return dict(scheme=SCHEME, uri_path=local_path) def open_uri(uri_as_string, mode, transport_params): parsed_uri = parse_uri(uri_as_string) fobj = io.open(parsed_uri['uri_path'], mode) return fobj def extract_local_path(uri_as_string): if uri_as_string.startswith('file://'): local_path = uri_as_string.replace('file://', '', 1) else: local_path = uri_as_string return os.path.expanduser(local_path) smart_open-5.2.1/smart_open/s3.py000066400000000000000000001174011411241424400167350ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2019 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). 
# """Implements file-like objects for reading and writing from/to AWS S3.""" import io import functools import logging import time import warnings try: import boto3 import botocore.client import botocore.exceptions import urllib3.exceptions except ImportError: MISSING_DEPS = True import smart_open.bytebuffer import smart_open.concurrency import smart_open.utils from smart_open import constants logger = logging.getLogger(__name__) DEFAULT_MIN_PART_SIZE = 50 * 1024**2 """Default minimum part size for S3 multipart uploads""" MIN_MIN_PART_SIZE = 5 * 1024 ** 2 """The absolute minimum permitted by Amazon.""" SCHEMES = ("s3", "s3n", 's3u', "s3a") DEFAULT_PORT = 443 DEFAULT_HOST = 's3.amazonaws.com' DEFAULT_BUFFER_SIZE = 128 * 1024 URI_EXAMPLES = ( 's3://my_bucket/my_key', 's3://my_key:my_secret@my_bucket/my_key', 's3://my_key:my_secret@my_server:my_port@my_bucket/my_key', ) _UPLOAD_ATTEMPTS = 6 _SLEEP_SECONDS = 10 # Returned by AWS when we try to seek beyond EOF. _OUT_OF_RANGE = 'InvalidRange' class _ClientWrapper: """Wraps a client to inject the appropriate keyword args into each method call. The keyword args are a dictionary keyed by the fully qualified method name. For example, S3.Client.create_multipart_upload. See https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#client This wrapper behaves identically to the client otherwise. """ def __init__(self, client, kwargs): self.client = client self.kwargs = kwargs def __getattr__(self, method_name): method = getattr(self.client, method_name) kwargs = self.kwargs.get('S3.Client.%s' % method_name, {}) return functools.partial(method, **kwargs) def parse_uri(uri_as_string): # # Restrictions on bucket names and labels: # # - Bucket names must be at least 3 and no more than 63 characters long. # - Bucket names must be a series of one or more labels. # - Adjacent labels are separated by a single period (.). # - Bucket names can contain lowercase letters, numbers, and hyphens. # - Each label must start and end with a lowercase letter or a number. # # We use the above as a guide only, and do not perform any validation. We # let boto3 take care of that for us. # split_uri = smart_open.utils.safe_urlsplit(uri_as_string) assert split_uri.scheme in SCHEMES port = DEFAULT_PORT host = DEFAULT_HOST ordinary_calling_format = False # # These defaults tell boto3 to look for credentials elsewhere # access_id, access_secret = None, None # # Common URI template [secret:key@][host[:port]@]bucket/object # # The urlparse function doesn't handle the above schema, so we have to do # it ourselves. # uri = split_uri.netloc + split_uri.path if '@' in uri and ':' in uri.split('@')[0]: auth, uri = uri.split('@', 1) access_id, access_secret = auth.split(':') head, key_id = uri.split('/', 1) if '@' in head and ':' in head: ordinary_calling_format = True host_port, bucket_id = head.split('@') host, port = host_port.split(':', 1) port = int(port) elif '@' in head: ordinary_calling_format = True host, bucket_id = head.split('@') else: bucket_id = head return dict( scheme=split_uri.scheme, bucket_id=bucket_id, key_id=key_id, port=port, host=host, ordinary_calling_format=ordinary_calling_format, access_id=access_id, access_secret=access_secret, ) def _consolidate_params(uri, transport_params): """Consolidates the parsed Uri with the additional parameters. 
This is necessary because the user can pass some of the parameters can in two different ways: 1) Via the URI itself 2) Via the transport parameters These are not mutually exclusive, but we have to pick one over the other in a sensible way in order to proceed. """ transport_params = dict(transport_params) def inject(**kwargs): try: client_kwargs = transport_params['client_kwargs'] except KeyError: client_kwargs = transport_params['client_kwargs'] = {} try: init_kwargs = client_kwargs['S3.Client'] except KeyError: init_kwargs = client_kwargs['S3.Client'] = {} init_kwargs.update(**kwargs) client = transport_params.get('client') if client is not None and (uri['access_id'] or uri['access_secret']): logger.warning( 'ignoring credentials parsed from URL because they conflict with ' 'transport_params["client"]. Set transport_params["client"] to None ' 'to suppress this warning.' ) uri.update(access_id=None, access_secret=None) elif (uri['access_id'] and uri['access_secret']): inject( aws_access_key_id=uri['access_id'], aws_secret_access_key=uri['access_secret'], ) uri.update(access_id=None, access_secret=None) if client is not None and uri['host'] != DEFAULT_HOST: logger.warning( 'ignoring endpoint_url parsed from URL because they conflict with ' 'transport_params["client"]. Set transport_params["client"] to None ' 'to suppress this warning.' ) uri.update(host=None) elif uri['host'] != DEFAULT_HOST: inject(endpoint_url='https://%(host)s:%(port)d' % uri) uri.update(host=None) return uri, transport_params def open_uri(uri, mode, transport_params): deprecated = ( 'multipart_upload_kwargs', 'object_kwargs', 'resource', 'resource_kwargs', 'session', 'singlepart_upload_kwargs', ) detected = [k for k in deprecated if k in transport_params] if detected: doc_url = ( 'https://github.com/RaRe-Technologies/smart_open/blob/develop/' 'MIGRATING_FROM_OLDER_VERSIONS.rst' ) # # We use warnings.warn /w UserWarning instead of logger.warn here because # # 1) Not everyone has logging enabled; and # 2) check_kwargs (below) already uses logger.warn with a similar message # # https://github.com/RaRe-Technologies/smart_open/issues/614 # message = ( 'ignoring the following deprecated transport parameters: %r. ' 'See <%s> for details' % (detected, doc_url) ) warnings.warn(message, UserWarning) parsed_uri = parse_uri(uri) parsed_uri, transport_params = _consolidate_params(parsed_uri, transport_params) kwargs = smart_open.utils.check_kwargs(open, transport_params) return open(parsed_uri['bucket_id'], parsed_uri['key_id'], mode, **kwargs) def open( bucket_id, key_id, mode, version_id=None, buffer_size=DEFAULT_BUFFER_SIZE, min_part_size=DEFAULT_MIN_PART_SIZE, multipart_upload=True, defer_seek=False, client=None, client_kwargs=None, writebuffer=None, ): """Open an S3 object for reading or writing. Parameters ---------- bucket_id: str The name of the bucket this object resides in. key_id: str The name of the key within the bucket. mode: str The mode for opening the object. Must be either "rb" or "wb". buffer_size: int, optional The buffer size to use when performing I/O. min_part_size: int, optional The minimum part size for multipart uploads. For writing only. multipart_upload: bool, optional Default: `True` If set to `True`, will use multipart upload for writing to S3. If set to `False`, S3 upload will use the S3 Single-Part Upload API, which is more ideal for small file sizes. For writing only. version_id: str, optional Version of the object, used when reading object. If None, will fetch the most recent version. 
defer_seek: boolean, optional Default: `False` If set to `True` on a file opened for reading, GetObject will not be called until the first seek() or read(). Avoids redundant API queries when seeking before reading. client: object, optional The S3 client to use when working with boto3. If you don't specify this, then smart_open will create a new client for you. client_kwargs: dict, optional Additional parameters to pass to the relevant functions of the client. The keys are fully qualified method names, e.g. `S3.Client.create_multipart_upload`. The values are kwargs to pass to that method each time it is called. writebuffer: IO[bytes], optional By default, this module will buffer data in memory using io.BytesIO when writing. Pass another binary IO instance here to use it instead. For example, you may pass a file object to buffer to local disk instead of in RAM. Use this to keep RAM usage low at the expense of additional disk IO. If you pass in an open file, then you are responsible for cleaning it up after writing completes. """ logger.debug('%r', locals()) if mode not in constants.BINARY_MODES: raise NotImplementedError('bad mode: %r expected one of %r' % (mode, constants.BINARY_MODES)) if (mode == constants.WRITE_BINARY) and (version_id is not None): raise ValueError("version_id must be None when writing") if mode == constants.READ_BINARY: fileobj = Reader( bucket_id, key_id, version_id=version_id, buffer_size=buffer_size, defer_seek=defer_seek, client=client, client_kwargs=client_kwargs, ) elif mode == constants.WRITE_BINARY: if multipart_upload: fileobj = MultipartWriter( bucket_id, key_id, min_part_size=min_part_size, client=client, client_kwargs=client_kwargs, writebuffer=writebuffer, ) else: fileobj = SinglepartWriter( bucket_id, key_id, client=client, client_kwargs=client_kwargs, writebuffer=writebuffer, ) else: assert False, 'unexpected mode: %r' % mode fileobj.name = key_id return fileobj def _get(client, bucket, key, version, range_string): try: if version: return client.get_object(Bucket=bucket, Key=key, VersionId=version, Range=range_string) else: return client.get_object(Bucket=bucket, Key=key, Range=range_string) except botocore.client.ClientError as error: wrapped_error = IOError( 'unable to access bucket: %r key: %r version: %r error: %s' % ( bucket, key, version, error ) ) wrapped_error.backend_error = error raise wrapped_error from error def _unwrap_ioerror(ioe): """Given an IOError from _get, return the 'Error' dictionary from boto.""" try: return ioe.backend_error.response['Error'] except (AttributeError, KeyError): return None class _SeekableRawReader(object): """Read an S3 object. This class is internal to the S3 submodule. """ def __init__( self, client, bucket, key, version_id=None, ): self._client = client self._bucket = bucket self._key = key self._version_id = version_id self._content_length = None self._position = 0 self._body = None def seek(self, offset, whence=constants.WHENCE_START): """Seek to the specified position. :param int offset: The offset in bytes. :param int whence: Where the offset is from. :returns: the position after seeking. :rtype: int """ if whence not in constants.WHENCE_CHOICES: raise ValueError('invalid whence, expected one of %r' % constants.WHENCE_CHOICES) # # Close old body explicitly. # When first seek() after __init__(), self._body is not exist. 
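        # (In other words: on the very first seek() after __init__() there is
        # no body yet, so there is nothing to close.)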
# if self._body is not None: self._body.close() self._body = None start = None stop = None if whence == constants.WHENCE_START: start = max(0, offset) elif whence == constants.WHENCE_CURRENT: start = max(0, offset + self._position) else: stop = max(0, -offset) # # If we can figure out that we've read past the EOF, then we can save # an extra API call. # if self._content_length is None: reached_eof = False elif start is not None and start >= self._content_length: reached_eof = True elif stop == 0: reached_eof = True else: reached_eof = False if reached_eof: self._body = io.BytesIO() self._position = self._content_length else: self._open_body(start, stop) return self._position def _open_body(self, start=None, stop=None): """Open a connection to download the specified range of bytes. Store the open file handle in self._body. If no range is specified, start defaults to self._position. start and stop follow the semantics of the http range header, so a stop without a start will read bytes beginning at stop. As a side effect, set self._content_length. Set self._position to self._content_length if start is past end of file. """ if start is None and stop is None: start = self._position range_string = smart_open.utils.make_range_string(start, stop) try: # Optimistically try to fetch the requested content range. response = _get( self._client, self._bucket, self._key, self._version_id, range_string, ) except IOError as ioe: # Handle requested content range exceeding content size. error_response = _unwrap_ioerror(ioe) if error_response is None or error_response.get('Code') != _OUT_OF_RANGE: raise self._position = self._content_length = int(error_response['ActualObjectSize']) self._body = io.BytesIO() else: # # Keep track of how many times boto3's built-in retry mechanism # activated. # # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/retries.html#checking-retry-attempts-in-an-aws-service-response # logger.debug( '%s: RetryAttempts: %d', self, response['ResponseMetadata']['RetryAttempts'], ) units, start, stop, length = smart_open.utils.parse_content_range(response['ContentRange']) self._content_length = length self._position = start self._body = response['Body'] def read(self, size=-1): """Read from the continuous connection with the remote peer.""" if self._body is None: # This is necessary for the very first read() after __init__(). self._open_body() if self._position >= self._content_length: return b'' # # Boto3 has built-in error handling and retry mechanisms: # # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/retries.html # # Unfortunately, it isn't always enough. There is still a non-zero # possibility that an exception will slip past these mechanisms and # terminate the read prematurely. Luckily, at this stage, it's very # simple to recover from the problem: wait a little bit, reopen the # HTTP connection and try again. Usually, a single retry attempt is # enough to recover, but we try multiple times "just in case". 
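        #
        # The schedule below is a simple exponential backoff: 1, 2, 4, 8 and
        # 16 seconds, i.e. at most five read attempts and roughly half a
        # minute of sleeping in total before we give up.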
# for attempt, seconds in enumerate([1, 2, 4, 8, 16], 1): try: if size == -1: binary = self._body.read() else: binary = self._body.read(size) except ( ConnectionResetError, botocore.exceptions.BotoCoreError, urllib3.exceptions.HTTPError, ) as err: logger.warning( '%s: caught %r while reading %d bytes, sleeping %ds before retry', self, err, size, seconds, ) time.sleep(seconds) self._open_body() else: self._position += len(binary) return binary raise IOError('%s: failed to read %d bytes after %d attempts' % (self, size, attempt)) def __str__(self): return 'smart_open.s3._SeekableReader(%r, %r)' % (self._bucket, self._key) def _initialize_boto3(rw, client, client_kwargs, bucket, key): """Created the required objects for accessing S3. Ideally, they have been already created for us and we can just reuse them.""" if client_kwargs is None: client_kwargs = {} if client is None: init_kwargs = client_kwargs.get('S3.Client', {}) client = boto3.client('s3', **init_kwargs) assert client rw._client = _ClientWrapper(client, client_kwargs) rw._bucket = bucket rw._key = key class Reader(io.BufferedIOBase): """Reads bytes from S3. Implements the io.BufferedIOBase interface of the standard library.""" def __init__( self, bucket, key, version_id=None, buffer_size=DEFAULT_BUFFER_SIZE, line_terminator=constants.BINARY_NEWLINE, defer_seek=False, client=None, client_kwargs=None, ): self._version_id = version_id self._buffer_size = buffer_size _initialize_boto3(self, client, client_kwargs, bucket, key) self._raw_reader = _SeekableRawReader( self._client, bucket, key, self._version_id, ) self._current_pos = 0 self._buffer = smart_open.bytebuffer.ByteBuffer(buffer_size) self._eof = False self._line_terminator = line_terminator # # This member is part of the io.BufferedIOBase interface. # self.raw = None if not defer_seek: self.seek(0) # # io.BufferedIOBase methods. # def close(self): """Flush and close this stream.""" pass def readable(self): """Return True if the stream can be read from.""" return True def read(self, size=-1): """Read up to size bytes from the object and return them.""" if size == 0: return b'' elif size < 0: # call read() before setting _current_pos to make sure _content_length is set out = self._read_from_buffer() + self._raw_reader.read() self._current_pos = self._raw_reader._content_length return out # # Return unused data first # if len(self._buffer) >= size: return self._read_from_buffer(size) # # If the stream is finished, return what we have. # if self._eof: return self._read_from_buffer() self._fill_buffer(size) return self._read_from_buffer(size) def read1(self, size=-1): """This is the same as read().""" return self.read(size=size) def readinto(self, b): """Read up to len(b) bytes into b, and return the number of bytes read.""" data = self.read(len(b)) if not data: return 0 b[:len(data)] = data return len(data) def readline(self, limit=-1): """Read up to and including the next newline. Returns the bytes read.""" if limit != -1: raise NotImplementedError('limits other than -1 not implemented yet') # # A single line may span multiple buffers. # line = io.BytesIO() while not (self._eof and len(self._buffer) == 0): line_part = self._buffer.readline(self._line_terminator) line.write(line_part) self._current_pos += len(line_part) if line_part.endswith(self._line_terminator): break else: self._fill_buffer() return line.getvalue() def seekable(self): """If False, seek(), tell() and truncate() will raise IOError. 
We offer only seek support, and no truncate support.""" return True def seek(self, offset, whence=constants.WHENCE_START): """Seek to the specified position. :param int offset: The offset in bytes. :param int whence: Where the offset is from. Returns the position after seeking.""" # Convert relative offset to absolute, since self._raw_reader # doesn't know our current position. if whence == constants.WHENCE_CURRENT: whence = constants.WHENCE_START offset += self._current_pos self._current_pos = self._raw_reader.seek(offset, whence) self._buffer.empty() self._eof = self._current_pos == self._raw_reader._content_length return self._current_pos def tell(self): """Return the current position within the file.""" return self._current_pos def truncate(self, size=None): """Unsupported.""" raise io.UnsupportedOperation def detach(self): """Unsupported.""" raise io.UnsupportedOperation def terminate(self): """Do nothing.""" pass def to_boto3(self, resource): """Create an **independent** `boto3.s3.Object` instance that points to the same S3 object as this instance. Changes to the returned object will not affect the current instance. """ assert resource, 'resource must be a boto3.resource instance' obj = resource.Object(self._bucket, self._key) if self._version_id is not None: return obj.Version(self._version_id) else: return obj # # Internal methods. # def _read_from_buffer(self, size=-1): """Remove at most size bytes from our buffer and return them.""" size = size if size >= 0 else len(self._buffer) part = self._buffer.read(size) self._current_pos += len(part) return part def _fill_buffer(self, size=-1): size = max(size, self._buffer._chunk_size) while len(self._buffer) < size and not self._eof: bytes_read = self._buffer.fill(self._raw_reader) if bytes_read == 0: logger.debug('%s: reached EOF while filling buffer', self) self._eof = True def __str__(self): return "smart_open.s3.Reader(%r, %r)" % (self._bucket, self._key) def __repr__(self): return ( "smart_open.s3.Reader(" "bucket=%r, " "key=%r, " "version_id=%r, " "buffer_size=%r, " "line_terminator=%r)" ) % ( self._bucket, self._key, self._version_id, self._buffer_size, self._line_terminator, ) class MultipartWriter(io.BufferedIOBase): """Writes bytes to S3 using the multi part API. Implements the io.BufferedIOBase interface of the standard library.""" def __init__( self, bucket, key, min_part_size=DEFAULT_MIN_PART_SIZE, client=None, client_kwargs=None, writebuffer=None, ): if min_part_size < MIN_MIN_PART_SIZE: logger.warning("S3 requires minimum part size >= 5MB; \ multipart upload may fail") self._min_part_size = min_part_size _initialize_boto3(self, client, client_kwargs, bucket, key) try: partial = functools.partial( self._client.create_multipart_upload, Bucket=bucket, Key=key, ) self._upload_id = _retry_if_failed(partial)['UploadId'] except botocore.client.ClientError as error: raise ValueError( 'the bucket %r does not exist, or is forbidden for access (%r)' % ( bucket, error ) ) from error if writebuffer is None: self._buf = io.BytesIO() else: self._buf = writebuffer self._total_bytes = 0 self._total_parts = 0 self._parts = [] # # This member is part of the io.BufferedIOBase interface. # self.raw = None def flush(self): pass # # Override some methods from io.IOBase. 
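    # close() below either completes the multipart upload or, if nothing was
    # written at all, aborts it and puts an empty object in its place.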
# def close(self): if self._buf.tell(): self._upload_next_part() if self._total_bytes and self._upload_id: partial = functools.partial( self._client.complete_multipart_upload, Bucket=self._bucket, Key=self._key, UploadId=self._upload_id, MultipartUpload={'Parts': self._parts}, ) _retry_if_failed(partial) logger.debug('%s: completed multipart upload', self) elif self._upload_id: # # AWS complains with "The XML you provided was not well-formed or # did not validate against our published schema" when the input is # completely empty => abort the upload, no file created. # # We work around this by creating an empty file explicitly. # assert self._upload_id, "no multipart upload in progress" self._client.abort_multipart_upload( Bucket=self._bucket, Key=self._key, UploadId=self._upload_id, ) self._client.put_object( Bucket=self._bucket, Key=self._key, Body=b'', ) logger.debug('%s: wrote 0 bytes to imitate multipart upload', self) self._upload_id = None @property def closed(self): return self._upload_id is None def writable(self): """Return True if the stream supports writing.""" return True def seekable(self): """If False, seek(), tell() and truncate() will raise IOError. We offer only tell support, and no seek or truncate support.""" return True def seek(self, offset, whence=constants.WHENCE_START): """Unsupported.""" raise io.UnsupportedOperation def truncate(self, size=None): """Unsupported.""" raise io.UnsupportedOperation def tell(self): """Return the current stream position.""" return self._total_bytes # # io.BufferedIOBase methods. # def detach(self): raise io.UnsupportedOperation("detach() not supported") def write(self, b): """Write the given buffer (bytes, bytearray, memoryview or any buffer interface implementation) to the S3 file. For more information about buffers, see https://docs.python.org/3/c-api/buffer.html There's buffering happening under the covers, so this may not actually do any HTTP transfer right away.""" length = self._buf.write(b) self._total_bytes += length if self._buf.tell() >= self._min_part_size: self._upload_next_part() return length def terminate(self): """Cancel the underlying multipart upload.""" assert self._upload_id, "no multipart upload in progress" self._client.abort_multipart_upload( Bucket=self._bucket, Key=self._key, UploadId=self._upload_id, ) self._upload_id = None def to_boto3(self, resource): """Create an **independent** `boto3.s3.Object` instance that points to the same S3 object as this instance. Changes to the returned object will not affect the current instance. """ assert resource, 'resource must be a boto3.resource instance' return resource.Object(self._bucket, self._key) # # Internal methods. # def _upload_next_part(self): part_num = self._total_parts + 1 logger.info( "%s: uploading part_num: %i, %i bytes (total %.3fGB)", self, part_num, self._buf.tell(), self._total_bytes / 1024.0 ** 3, ) self._buf.seek(0) # # Network problems in the middle of an upload are particularly # troublesome. We don't want to abort the entire upload just because # of a temporary connection problem, so this part needs to be # especially robust. 
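        #
        # _retry_if_failed (defined further down in this module) retries the
        # upload_part call on endpoint connection errors, up to
        # _UPLOAD_ATTEMPTS times with _SLEEP_SECONDS between attempts.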
# upload = _retry_if_failed( functools.partial( self._client.upload_part, Bucket=self._bucket, Key=self._key, UploadId=self._upload_id, PartNumber=part_num, Body=self._buf, ) ) self._parts.append({'ETag': upload['ETag'], 'PartNumber': part_num}) logger.debug("%s: upload of part_num #%i finished", self, part_num) self._total_parts += 1 self._buf.seek(0) self._buf.truncate(0) def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): if exc_type is not None: self.terminate() else: self.close() def __str__(self): return "smart_open.s3.MultipartWriter(%r, %r)" % (self._bucket, self._key) def __repr__(self): return "smart_open.s3.MultipartWriter(bucket=%r, key=%r, min_part_size=%r)" % ( self._bucket, self._key, self._min_part_size, ) class SinglepartWriter(io.BufferedIOBase): """Writes bytes to S3 using the single part API. Implements the io.BufferedIOBase interface of the standard library. This class buffers all of its input in memory until its `close` method is called. Only then will the data be written to S3 and the buffer is released.""" def __init__( self, bucket, key, client=None, client_kwargs=None, writebuffer=None, ): _initialize_boto3(self, client, client_kwargs, bucket, key) try: self._client.head_bucket(Bucket=bucket) except botocore.client.ClientError as e: raise ValueError('the bucket %r does not exist, or is forbidden for access' % bucket) from e if writebuffer is None: self._buf = io.BytesIO() else: self._buf = writebuffer self._total_bytes = 0 # # This member is part of the io.BufferedIOBase interface. # self.raw = None def flush(self): pass # # Override some methods from io.IOBase. # def close(self): if self._buf is None: return self._buf.seek(0) try: self._client.put_object( Bucket=self._bucket, Key=self._key, Body=self._buf, ) except botocore.client.ClientError as e: raise ValueError( 'the bucket %r does not exist, or is forbidden for access' % self._bucket) from e logger.debug("%s: direct upload finished", self) self._buf = None @property def closed(self): return self._buf is None def writable(self): """Return True if the stream supports writing.""" return True def seekable(self): """If False, seek(), tell() and truncate() will raise IOError. We offer only tell support, and no seek or truncate support.""" return True def seek(self, offset, whence=constants.WHENCE_START): """Unsupported.""" raise io.UnsupportedOperation def truncate(self, size=None): """Unsupported.""" raise io.UnsupportedOperation def tell(self): """Return the current stream position.""" return self._total_bytes # # io.BufferedIOBase methods. # def detach(self): raise io.UnsupportedOperation("detach() not supported") def write(self, b): """Write the given buffer (bytes, bytearray, memoryview or any buffer interface implementation) into the buffer. Content of the buffer will be written to S3 on close as a single-part upload. For more information about buffers, see https://docs.python.org/3/c-api/buffer.html""" length = self._buf.write(b) self._total_bytes += length return length def terminate(self): """Nothing to cancel in single-part uploads.""" return # # Internal methods. 
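    #
    # Hedged usage sketch (commented out; the bucket and key are hypothetical,
    # and a default boto3 client with valid AWS credentials is assumed):
    #
    #     with SinglepartWriter('my-bucket', 'my-key.txt') as fout:
    #         fout.write(b'hello world')
    #     # the buffered bytes are sent in a single put_object call on close()
    #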
# def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): if exc_type is not None: self.terminate() else: self.close() def __str__(self): return "smart_open.s3.SinglepartWriter(%r, %r)" % (self._object.bucket_name, self._object.key) def __repr__(self): return "smart_open.s3.SinglepartWriter(bucket=%r, key=%r)" % (self._bucket, self._key) def _retry_if_failed( partial, attempts=_UPLOAD_ATTEMPTS, sleep_seconds=_SLEEP_SECONDS, exceptions=None): if exceptions is None: exceptions = (botocore.exceptions.EndpointConnectionError, ) for attempt in range(attempts): try: return partial() except exceptions: logger.critical( 'Unable to connect to the endpoint. Check your network connection. ' 'Sleeping and retrying %d more times ' 'before giving up.' % (attempts - attempt - 1) ) time.sleep(sleep_seconds) else: logger.critical('Unable to connect to the endpoint. Giving up.') raise IOError('Unable to connect to the endpoint after %d attempts' % attempts) def _accept_all(key): return True def iter_bucket( bucket_name, prefix='', accept_key=None, key_limit=None, workers=16, retries=3, **session_kwargs): """ Iterate and download all S3 objects under `s3://bucket_name/prefix`. Parameters ---------- bucket_name: str The name of the bucket. prefix: str, optional Limits the iteration to keys starting with the prefix. accept_key: callable, optional This is a function that accepts a key name (unicode string) and returns True/False, signalling whether the given key should be downloaded. The default behavior is to accept all keys. key_limit: int, optional If specified, the iterator will stop after yielding this many results. workers: int, optional The number of subprocesses to use. retries: int, optional The number of time to retry a failed download. session_kwargs: dict, optional Keyword arguments to pass when creating a new session. For a list of available names and values, see: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html#boto3.session.Session Yields ------ str The full key name (does not include the bucket name). bytes The full contents of the key. Notes ----- The keys are processed in parallel, using `workers` processes (default: 16), to speed up downloads greatly. If multiprocessing is not available, thus _MULTIPROCESSING is False, this parameter will be ignored. Examples -------- >>> # get all JSON files under "mybucket/foo/" >>> for key, content in iter_bucket( ... bucket_name, prefix='foo/', ... accept_key=lambda key: key.endswith('.json')): ... print key, len(content) >>> # limit to 10k files, using 32 parallel workers (default is 16) >>> for key, content in iter_bucket(bucket_name, key_limit=10000, workers=32): ... print key, len(content) """ if accept_key is None: accept_key = _accept_all # # If people insist on giving us bucket instances, silently extract the name # before moving on. Works for boto3 as well as boto. 
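    #
    # The docstring examples above still use Python 2 print statements; an
    # equivalent, hedged Python 3 sketch (the bucket name and prefix are made
    # up for illustration):
    #
    #     for key, content in iter_bucket('my-bucket', prefix='foo/',
    #                                     accept_key=lambda k: k.endswith('.json')):
    #         print(key, len(content))
    #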
# try: bucket_name = bucket_name.name except AttributeError: pass total_size, key_no = 0, -1 key_iterator = _list_bucket( bucket_name, prefix=prefix, accept_key=accept_key, **session_kwargs) download_key = functools.partial( _download_key, bucket_name=bucket_name, retries=retries, **session_kwargs) with smart_open.concurrency.create_pool(processes=workers) as pool: result_iterator = pool.imap_unordered(download_key, key_iterator) for key_no, (key, content) in enumerate(result_iterator): if True or key_no % 1000 == 0: logger.info( "yielding key #%i: %s, size %i (total %.1fMB)", key_no, key, len(content), total_size / 1024.0 ** 2 ) yield key, content total_size += len(content) if key_limit is not None and key_no + 1 >= key_limit: # we were asked to output only a limited number of keys => we're done break logger.info("processed %i keys, total size %i" % (key_no + 1, total_size)) def _list_bucket( bucket_name, prefix='', accept_key=lambda k: True, **session_kwargs): session = boto3.session.Session(**session_kwargs) client = session.client('s3') ctoken = None while True: # list_objects_v2 doesn't like a None value for ContinuationToken # so we don't set it if we don't have one. if ctoken: kwargs = dict(Bucket=bucket_name, Prefix=prefix, ContinuationToken=ctoken) else: kwargs = dict(Bucket=bucket_name, Prefix=prefix) response = client.list_objects_v2(**kwargs) try: content = response['Contents'] except KeyError: pass else: for c in content: key = c['Key'] if accept_key(key): yield key ctoken = response.get('NextContinuationToken', None) if not ctoken: break def _download_key(key_name, bucket_name=None, retries=3, **session_kwargs): if bucket_name is None: raise ValueError('bucket_name may not be None') # # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/resources.html#multithreading-and-multiprocessing # session = boto3.session.Session(**session_kwargs) s3 = session.resource('s3') bucket = s3.Bucket(bucket_name) # Sometimes, https://github.com/boto/boto/issues/2409 can happen # because of network issues on either side. # Retry up to 3 times to ensure its not a transient issue. for x in range(retries + 1): try: content_bytes = _download_fileobj(bucket, key_name) except botocore.client.ClientError: # Actually fail on last pass through the loop if x == retries: raise # Otherwise, try again, as this might be a transient timeout pass else: return key_name, content_bytes def _download_fileobj(bucket, key_name): # # This is a separate function only because it makes it easier to inject # exceptions during tests. # buf = io.BytesIO() bucket.download_fileobj(key_name, buf) return buf.getvalue() smart_open-5.2.1/smart_open/smart_open_lib.py000066400000000000000000000374141411241424400214120ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2019 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). # """Implements the majority of smart_open's top-level API. The main functions are: * ``parse_uri()`` * ``open()`` """ import collections import io import locale import logging import os import os.path as P import pathlib import urllib.parse import warnings # # This module defines a function called smart_open so we cannot use # smart_open.submodule to reference to the submodules. # import smart_open.local_file as so_file import smart_open.compression as so_compression from smart_open import doctools from smart_open import transport # # For backwards compatibility and keeping old unit tests happy. 
# from smart_open.compression import register_compressor # noqa: F401 from smart_open.utils import check_kwargs as _check_kwargs # noqa: F401 from smart_open.utils import inspect_kwargs as _inspect_kwargs # noqa: F401 logger = logging.getLogger(__name__) DEFAULT_ENCODING = locale.getpreferredencoding(do_setlocale=False) def _sniff_scheme(uri_as_string): """Returns the scheme of the URL only, as a string.""" # # urlsplit doesn't work on Windows -- it parses the drive as the scheme... # no protocol given => assume a local file # if os.name == 'nt' and '://' not in uri_as_string: uri_as_string = 'file://' + uri_as_string return urllib.parse.urlsplit(uri_as_string).scheme def parse_uri(uri_as_string): """ Parse the given URI from a string. Parameters ---------- uri_as_string: str The URI to parse. Returns ------- collections.namedtuple The parsed URI. Notes ----- smart_open/doctools.py magic goes here """ scheme = _sniff_scheme(uri_as_string) submodule = transport.get_transport(scheme) as_dict = submodule.parse_uri(uri_as_string) # # The conversion to a namedtuple is just to keep the old tests happy while # I'm still refactoring. # Uri = collections.namedtuple('Uri', sorted(as_dict.keys())) return Uri(**as_dict) # # To keep old unit tests happy while I'm refactoring. # _parse_uri = parse_uri _builtin_open = open def open( uri, mode='r', buffering=-1, encoding=None, errors=None, newline=None, closefd=True, opener=None, ignore_ext=False, compression=None, transport_params=None, ): r"""Open the URI object, returning a file-like object. The URI is usually a string in a variety of formats. For a full list of examples, see the :func:`parse_uri` function. The URI may also be one of: - an instance of the pathlib.Path class - a stream (anything that implements io.IOBase-like functionality) Parameters ---------- uri: str or object The object to open. mode: str, optional Mimicks built-in open parameter of the same name. buffering: int, optional Mimicks built-in open parameter of the same name. encoding: str, optional Mimicks built-in open parameter of the same name. errors: str, optional Mimicks built-in open parameter of the same name. newline: str, optional Mimicks built-in open parameter of the same name. closefd: boolean, optional Mimicks built-in open parameter of the same name. Ignored. opener: object, optional Mimicks built-in open parameter of the same name. Ignored. ignore_ext: boolean, optional Disable transparent compression/decompression based on the file extension. compression: str, optional (see smart_open.compression.get_supported_compression_types) Explicitly specify the compression/decompression behavior. If you specify this parameter, then ignore_ext must not be specified. transport_params: dict, optional Additional parameters for the transport layer (see notes below). Returns ------- A file-like object. Notes ----- smart_open has several implementations for its transport layer (e.g. S3, HTTP). Each transport layer has a different set of keyword arguments for overriding default behavior. If you specify a keyword argument that is *not* supported by the transport layer being used, smart_open will ignore that argument and log a warning message. 
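    For example, a hypothetical S3 read that reuses an existing boto3 client
    could be written as follows (an illustrative sketch only; the bucket and
    key are made up)::

        import boto3
        from smart_open import open

        tp = {'client': boto3.client('s3')}
        with open('s3://my-bucket/my-key.txt', 'rb', transport_params=tp) as fin:
            data = fin.read()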
smart_open/doctools.py magic goes here See Also -------- - `Standard library reference `__ - `smart_open README.rst `__ """ logger.debug('%r', locals()) if not isinstance(mode, str): raise TypeError('mode should be a string') if compression and ignore_ext: raise ValueError('ignore_ext and compression parameters are mutually exclusive') elif compression and compression not in so_compression.get_supported_compression_types(): raise ValueError(f'invalid compression type: {compression}') elif ignore_ext: compression = so_compression.NO_COMPRESSION warnings.warn("'ignore_ext' will be deprecated in a future release", PendingDeprecationWarning) elif compression is None: compression = so_compression.INFER_FROM_EXTENSION if transport_params is None: transport_params = {} fobj = _shortcut_open( uri, mode, compression=compression, buffering=buffering, encoding=encoding, errors=errors, newline=newline, ) if fobj is not None: return fobj # # This is a work-around for the problem described in Issue #144. # If the user has explicitly specified an encoding, then assume they want # us to open the destination in text mode, instead of the default binary. # # If we change the default mode to be text, and match the normal behavior # of Py2 and 3, then the above assumption will be unnecessary. # if encoding is not None and 'b' in mode: mode = mode.replace('b', '') if isinstance(uri, pathlib.Path): uri = str(uri) explicit_encoding = encoding encoding = explicit_encoding if explicit_encoding else DEFAULT_ENCODING # # This is how we get from the filename to the end result. Decompression is # optional, but it always accepts bytes and returns bytes. # # Decoding is also optional, accepts bytes and returns text. The diagram # below is for reading, for writing, the flow is from right to left, but # the code is identical. # # open as binary decompress? decode? # filename ---------------> bytes -------------> bytes ---------> text # binary decompressed decode # try: binary_mode = _get_binary_mode(mode) except ValueError as ve: raise NotImplementedError(ve.args[0]) binary = _open_binary_stream(uri, binary_mode, transport_params) decompressed = so_compression.compression_wrapper(binary, binary_mode, compression) if 'b' not in mode or explicit_encoding is not None: decoded = _encoding_wrapper( decompressed, mode, encoding=encoding, errors=errors, newline=newline, ) else: decoded = decompressed return decoded def _get_binary_mode(mode_str): # # https://docs.python.org/3/library/functions.html#open # # The order of characters in the mode parameter appears to be unspecified. # The implementation follows the examples, just to be safe. # mode = list(mode_str) binmode = [] if 't' in mode and 'b' in mode: raise ValueError("can't have text and binary mode at once") counts = [mode.count(x) for x in 'rwa'] if sum(counts) > 1: raise ValueError("must have exactly one of create/read/write/append mode") def transfer(char): binmode.append(mode.pop(mode.index(char))) if 'a' in mode: transfer('a') elif 'w' in mode: transfer('w') elif 'r' in mode: transfer('r') else: raise ValueError( "Must have exactly one of create/read/write/append " "mode and at most one plus" ) if 'b' in mode: transfer('b') elif 't' in mode: mode.pop(mode.index('t')) binmode.append('b') else: binmode.append('b') if '+' in mode: transfer('+') # # There shouldn't be anything left in the mode list at this stage. # If there is, then either we've missed something and the implementation # of this function is broken, or the original input mode is invalid. 
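    #
    # A few illustrative mappings implied by the rules above (worked out by
    # hand, not an exhaustive table):
    #
    #     _get_binary_mode('r')   -> 'rb'
    #     _get_binary_mode('wt')  -> 'wb'
    #     _get_binary_mode('a')   -> 'ab'
    #     _get_binary_mode('rb+') -> 'rb+'
    #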
# if mode: raise ValueError('invalid mode: %r' % mode_str) return ''.join(binmode) def _shortcut_open( uri, mode, compression, buffering=-1, encoding=None, errors=None, newline=None, ): """Try to open the URI using the standard library io.open function. This can be much faster than the alternative of opening in binary mode and then decoding. This is only possible under the following conditions: 1. Opening a local file; and 2. Compression is disabled If it is not possible to use the built-in open for the specified URI, returns None. :param str uri: A string indicating what to open. :param str mode: The mode to pass to the open function. :param str compression: The compression type selected. :returns: The opened file :rtype: file """ if not isinstance(uri, str): return None scheme = _sniff_scheme(uri) if scheme not in (transport.NO_SCHEME, so_file.SCHEME): return None local_path = so_file.extract_local_path(uri) if compression == so_compression.INFER_FROM_EXTENSION: _, extension = P.splitext(local_path) if extension in so_compression.get_supported_extensions(): return None elif compression != so_compression.NO_COMPRESSION: return None open_kwargs = {} if encoding is not None: open_kwargs['encoding'] = encoding mode = mode.replace('b', '') if newline is not None: open_kwargs['newline'] = newline # # binary mode of the builtin/stdlib open function doesn't take an errors argument # if errors and 'b' not in mode: open_kwargs['errors'] = errors return _builtin_open(local_path, mode, buffering=buffering, **open_kwargs) def _open_binary_stream(uri, mode, transport_params): """Open an arbitrary URI in the specified binary mode. Not all modes are supported for all protocols. :arg uri: The URI to open. May be a string, or something else. :arg str mode: The mode to open with. Must be rb, wb or ab. :arg transport_params: Keyword argumens for the transport layer. :returns: A named file object :rtype: file-like object with a .name attribute """ if mode not in ('rb', 'rb+', 'wb', 'wb+', 'ab', 'ab+'): # # This should really be a ValueError, but for the sake of compatibility # with older versions, which raise NotImplementedError, we do the same. # raise NotImplementedError('unsupported mode: %r' % mode) if hasattr(uri, 'read'): # simply pass-through if already a file-like # we need to return something as the file name, but we don't know what # so we probe for uri.name (e.g., this works with open() or tempfile.NamedTemporaryFile) # if the value ends with COMPRESSED_EXT, we will note it in compression_wrapper() # if there is no such an attribute, we return "unknown" - this # effectively disables any compression if not hasattr(uri, 'name'): uri.name = getattr(uri, 'name', 'unknown') return uri if not isinstance(uri, str): raise TypeError("don't know how to handle uri %s" % repr(uri)) scheme = _sniff_scheme(uri) submodule = transport.get_transport(scheme) fobj = submodule.open_uri(uri, mode, transport_params) if not hasattr(fobj, 'name'): fobj.name = uri return fobj def _encoding_wrapper(fileobj, mode, encoding=None, errors=None, newline=None): """Decode bytes into text, if necessary. If mode specifies binary access, does nothing, unless the encoding is specified. A non-null encoding implies text mode. :arg fileobj: must quack like a filehandle object. :arg str mode: is the mode which was originally requested by the user. :arg str encoding: The text encoding to use. If mode is binary, overrides mode. :arg str errors: The method to use when handling encoding/decoding errors. 
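    :arg str newline: Passed through to the underlying io.TextIOWrapper and
        controls how universal newlines are handled, as in the built-in open().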
:returns: a file object """ logger.debug('encoding_wrapper: %r', locals()) # # If the mode is binary, but the user specified an encoding, assume they # want text. If we don't make this assumption, ignore the encoding and # return bytes, smart_open behavior will diverge from the built-in open: # # open(filename, encoding='utf-8') returns a text stream in Py3 # smart_open(filename, encoding='utf-8') would return a byte stream # without our assumption, because the default mode is rb. # if 'b' in mode and encoding is None: return fileobj if encoding is None: encoding = DEFAULT_ENCODING fileobj = io.TextIOWrapper( fileobj, encoding=encoding, errors=errors, newline=newline, write_through=True, ) return fileobj class patch_pathlib(object): """Replace `Path.open` with `smart_open.open`""" def __init__(self): self.old_impl = _patch_pathlib(open) def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): _patch_pathlib(self.old_impl) def _patch_pathlib(func): """Replace `Path.open` with `func`""" old_impl = pathlib.Path.open pathlib.Path.open = func return old_impl def smart_open( uri, mode='rb', buffering=-1, encoding=None, errors=None, newline=None, closefd=True, opener=None, ignore_extension=False, **kwargs ): # # This is a thin wrapper of smart_open.open. It's here for backward # compatibility. It works exactly like smart_open.open when the passed # parameters are identical. Otherwise, it raises a DeprecationWarning. # # For completeness, the main differences of the old smart_open function: # # 1. Default mode was read binary (mode='rb') # 2. ignore_ext parameter was called ignore_extension # 3. Transport parameters were passed directly as kwargs # url = 'https://github.com/RaRe-Technologies/smart_open/blob/develop/MIGRATING_FROM_OLDER_VERSIONS.rst' if kwargs: raise DeprecationWarning( 'The following keyword parameters are not supported: %r. ' 'See %s for more information.' % (sorted(kwargs), url) ) message = 'This function is deprecated. See %s for more information' % url warnings.warn(message, category=DeprecationWarning) ignore_ext = ignore_extension del kwargs, url, message, ignore_extension return open(**locals()) # # Prevent failures with doctools from messing up the entire library. We don't # expect such failures, but contributed modules (e.g. new transport mechanisms) # may not be as polished. # try: doctools.tweak_open_docstring(open) doctools.tweak_parse_uri_docstring(parse_uri) except Exception as ex: logger.error( 'Encountered a non-fatal error while building docstrings (see below). ' 'help(smart_open) will provide incomplete information as a result. ' 'For full help text, see ' '.' ) logger.exception(ex) smart_open-5.2.1/smart_open/ssh.py000066400000000000000000000103301411241424400171760ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2019 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). # """Implements I/O streams over SSH. Examples -------- >>> with open('/proc/version_signature', host='1.2.3.4') as conn: ... print(conn.read()) b'Ubuntu 4.4.0-1061.70-aws 4.4.131' Similarly, from a command line:: $ python -c "from smart_open import ssh;print(ssh.open('/proc/version_signature', host='1.2.3.4').read())" b'Ubuntu 4.4.0-1061.70-aws 4.4.131' """ import getpass import logging import urllib.parse import warnings import smart_open.utils logger = logging.getLogger(__name__) # # Global storage for SSH connections. 
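#
# Connections are cached below per (hostname, username) pair, so repeated
# opens against the same host reuse a single paramiko.SSHClient.  A hedged
# usage sketch (the host and path are hypothetical; paramiko and working SSH
# credentials are assumed):
#
#     from smart_open import open
#
#     with open('ssh://user@example.com/etc/hostname', 'rb') as fin:
#         print(fin.read())
#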
# _SSH = {} SCHEMES = ("ssh", "scp", "sftp") """Supported URL schemes.""" DEFAULT_PORT = 22 URI_EXAMPLES = ( 'ssh://username@host/path/file', 'ssh://username@host//path/file', 'scp://username@host/path/file', 'sftp://username@host/path/file', ) def _unquote(text): return text and urllib.parse.unquote(text) def parse_uri(uri_as_string): split_uri = urllib.parse.urlsplit(uri_as_string) assert split_uri.scheme in SCHEMES return dict( scheme=split_uri.scheme, uri_path=_unquote(split_uri.path), user=_unquote(split_uri.username), host=split_uri.hostname, port=int(split_uri.port or DEFAULT_PORT), password=_unquote(split_uri.password), ) def open_uri(uri, mode, transport_params): smart_open.utils.check_kwargs(open, transport_params) parsed_uri = parse_uri(uri) uri_path = parsed_uri.pop('uri_path') parsed_uri.pop('scheme') return open(uri_path, mode, transport_params=transport_params, **parsed_uri) def _connect(hostname, username, port, password, transport_params): try: import paramiko except ImportError: warnings.warn( 'paramiko missing, opening SSH/SCP/SFTP paths will be disabled. ' '`pip install paramiko` to suppress' ) raise key = (hostname, username) ssh = _SSH.get(key) if ssh is None: ssh = _SSH[key] = paramiko.client.SSHClient() ssh.load_system_host_keys() ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) kwargs = transport_params.get('connect_kwargs', {}).copy() # if 'key_filename' is present in transport_params, then I do not # overwrite the credentials. if 'key_filename' not in kwargs: kwargs.setdefault('password', password) kwargs.setdefault('username', username) ssh.connect(hostname, port, **kwargs) return ssh def open(path, mode='r', host=None, user=None, password=None, port=DEFAULT_PORT, transport_params=None): """Open a file on a remote machine over SSH. Expects authentication to be already set up via existing keys on the local machine. Parameters ---------- path: str The path to the file to open on the remote machine. mode: str, optional The mode to use for opening the file. host: str, optional The hostname of the remote machine. May not be None. user: str, optional The username to use to login to the remote machine. If None, defaults to the name of the current user. password: str, optional The password to use to login to the remote machine. port: int, optional The port to connect to. transport_params: dict, optional Any additional settings to be passed to paramiko.SSHClient.connect Returns ------- A file-like object. Important --------- If you specify a previously unseen host, then its host key will be added to the local ~/.ssh/known_hosts *automatically*. If ``username`` or ``password`` are specified in *both* the uri and ``transport_params``, ``transport_params`` will take precedence """ if not host: raise ValueError('you must specify the host to connect to') if not user: user = getpass.getuser() if not transport_params: transport_params = {} conn = _connect(host, user, port, password, transport_params) sftp_client = conn.get_transport().open_sftp_client() fobj = sftp_client.open(path, mode) fobj.name = path return fobj smart_open-5.2.1/smart_open/tests/000077500000000000000000000000001411241424400171745ustar00rootroot00000000000000smart_open-5.2.1/smart_open/tests/__init__.py000066400000000000000000000002571411241424400213110ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2019 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). 
# smart_open-5.2.1/smart_open/tests/fixtures/000077500000000000000000000000001411241424400210455ustar00rootroot00000000000000smart_open-5.2.1/smart_open/tests/fixtures/__init__.py000066400000000000000000000000001411241424400231440ustar00rootroot00000000000000smart_open-5.2.1/smart_open/tests/fixtures/good_transport.py000066400000000000000000000004061411241424400244630ustar00rootroot00000000000000# -*- coding: utf-8 -*- """A no-op transport that registers scheme 'foo'""" import io SCHEME = "foo" open = io.open def parse_uri(uri_as_string): # pragma: no cover ... def open_uri(uri_as_string, mode, transport_params): # pragma: no cover ... smart_open-5.2.1/smart_open/tests/fixtures/missing_deps_transport.py000066400000000000000000000005531411241424400262220ustar00rootroot00000000000000# -*- coding: utf-8 -*- """Transport that has missing deps""" import io try: import this_module_does_not_exist_but_we_need_it # noqa except ImportError: MISSING_DEPS = True SCHEME = "missing" open = io.open def parse_uri(uri_as_string): # pragma: no cover ... def open_uri(uri_as_string, mode, transport_params): # pragma: no cover ... smart_open-5.2.1/smart_open/tests/fixtures/no_schemes_transport.py000066400000000000000000000004141411241424400256550ustar00rootroot00000000000000# -*- coding: utf-8 -*- """A transport that is missing the required SCHEME/SCHEMAS attributes""" import io open = io.open def parse_uri(uri_as_string): # pragma: no cover ... def open_uri(uri_as_string, mode, transport_params): # pragma: no cover ... smart_open-5.2.1/smart_open/tests/test_azure.py000066400000000000000000000702521411241424400217410ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2020 Radim Rehurek # Copyright (C) 2020 Nicolas Mitchell # # This code is distributed under the terms and conditions # from the MIT License (MIT). 
# import gzip import io import logging import os import time import uuid import unittest from collections import OrderedDict import smart_open import smart_open.constants import azure.storage.blob import azure.common import azure.core.exceptions CONTAINER_NAME = 'test-smartopen-{}'.format(uuid.uuid4().hex) BLOB_NAME = 'test-blob' DISABLE_MOCKS = os.environ.get('SO_DISABLE_AZURE_MOCKS') == "1" """If mocks are disabled, allow to use the Azurite local Azure Storage API https://github.com/Azure/Azurite To use locally: docker run -p 10000:10000 -p 10001:10001 mcr.microsoft.com/azure-storage/azurite """ _AZURITE_DEFAULT_CONNECT_STR = 'DefaultEndpointsProtocol=http;' \ 'AccountName=devstoreaccount1;' \ 'AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/' \ 'K1SZFPTOtr/KBHBeksoGMGw==;' \ 'BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;' CONNECT_STR = os.environ.get('SO_AZURE_CONNECTION_STRING', _AZURITE_DEFAULT_CONNECT_STR) logger = logging.getLogger(__name__) class FakeBlobClient(object): # From Azure's BlobClient API # https://azuresdkdocs.blob.core.windows.net/$web/python/azure-storage-blob/12.0.0/azure.storage.blob.html#azure.storage.blob.BlobClient def __init__(self, container_client, name): self._container_client = container_client # type: FakeContainerClient self.blob_name = name self.metadata = dict(size=0) self.__contents = io.BytesIO() self._staged_contents = {} def commit_block_list(self, block_list): data = b''.join([self._staged_contents[block_blob['id']] for block_blob in block_list]) self.__contents = io.BytesIO(data) self.set_blob_metadata(dict(size=len(data))) self._container_client.register_blob_client(self) def delete_blob(self): self._container_client.delete_blob(self) def download_blob(self, offset=None, length=None, max_concurrency=1): if offset is None: return self.__contents self.__contents.seek(offset) return io.BytesIO(self.__contents.read(length)) def get_blob_properties(self): return self.metadata def set_blob_metadata(self, metadata): self.metadata = metadata def stage_block(self, block_id, data): self._staged_contents[block_id] = data def upload_blob(self, data, length=None, metadata=None): if metadata is not None: self.set_blob_metadata(metadata) self.__contents = io.BytesIO(data[:length]) self.set_blob_metadata(dict(size=len(data[:length]))) self._container_client.register_blob_client(self) class FakeBlobClientTest(unittest.TestCase): def setUp(self): self.blob_service_client = FakeBlobServiceClient.from_connection_string(CONNECT_STR) self.container_client = FakeContainerClient(self.blob_service_client, 'test-container') self.blob_client = FakeBlobClient(self.container_client, 'test-blob.txt') def test_delete_blob(self): data = b'Lorem ipsum' self.blob_client.upload_blob(data) self.assertEqual(self.container_client.list_blobs(), [self.blob_client.blob_name]) self.blob_client.delete_blob() self.assertEqual(self.container_client.list_blobs(), []) def test_upload_blob(self): data = b'Lorem ipsum' self.blob_client.upload_blob(data) actual = self.blob_client.download_blob().read() self.assertEqual(actual, data) class FakeContainerClient(object): # From Azure's ContainerClient API # https://docs.microsoft.com/fr-fr/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python def __init__(self, blob_service_client, name): self.blob_service_client = blob_service_client # type: FakeBlobServiceClient self.container_name = name self.metadata = {} self.__blob_clients = OrderedDict() def create_container(self, metadata): 
self.metadata = metadata def delete_blob(self, blob): del self.__blob_clients[blob.blob_name] def delete_blobs(self): self.__blob_clients = OrderedDict() def delete_container(self): self.blob_service_client.delete_container(self.container_name) def download_blob(self, blob): if blob.blob_name not in list(self.__blob_clients.keys()): raise azure.core.exceptions.ResourceNotFoundError('The specified blob does not exist.') blob_client = self.__blob_clients[blob.blob_name] blob_content = blob_client.download_blob() return blob_content def get_blob_client(self, blob_name): return self.__blob_clients.get(blob_name, FakeBlobClient(self, blob_name)) def get_container_properties(self): return self.metadata def list_blobs(self): return list(self.__blob_clients.keys()) def upload_blob(self, blob_name, data): blob_client = FakeBlobClient(self, blob_name) blob_client.upload_blob(data) self.__blob_clients[blob_name] = blob_client def register_blob_client(self, blob_client): self.__blob_clients[blob_client.blob_name] = blob_client class FakeContainerClientTest(unittest.TestCase): def setUp(self): self.blob_service_client = FakeBlobServiceClient.from_connection_string(CONNECT_STR) self.container_client = FakeContainerClient(self.blob_service_client, 'test-container') def test_nonexistent_blob(self): blob_client = self.container_client.get_blob_client('test-blob.txt') with self.assertRaises(azure.core.exceptions.ResourceNotFoundError): self.container_client.download_blob(blob_client) def test_delete_blob(self): blob_name = 'test-blob.txt' data = b'Lorem ipsum' self.container_client.upload_blob(blob_name, data) self.assertEqual(self.container_client.list_blobs(), [blob_name]) blob_client = FakeBlobClient(self.container_client, 'test-blob.txt') self.container_client.delete_blob(blob_client) self.assertEqual(self.container_client.list_blobs(), []) def test_delete_blobs(self): blob_name_1 = 'test-blob-1.txt' blob_name_2 = 'test-blob-2.txt' data = b'Lorem ipsum' self.container_client.upload_blob(blob_name_1, data) self.container_client.upload_blob(blob_name_2, data) self.assertEqual(self.container_client.list_blobs(), [blob_name_1, blob_name_2]) def test_delete_container(self): container_name = 'test-container' container_client = self.blob_service_client.create_container(container_name) self.assertEqual(self.blob_service_client.get_container_client(container_name).container_name, container_name) container_client.delete_container() with self.assertRaises(azure.core.exceptions.ResourceNotFoundError): self.blob_service_client.get_container_client(container_name) def test_list_blobs(self): blob_name_1 = 'test-blob-1.txt' blob_name_2 = 'test-blob-2.txt' data = b'Lorem ipsum' self.container_client.upload_blob(blob_name_1, data) self.container_client.upload_blob(blob_name_2, data) self.assertEqual(self.container_client.list_blobs(), [blob_name_1, blob_name_2]) self.container_client.delete_blobs() self.assertEqual(self.container_client.list_blobs(), []) def test_upload_blob(self): blob_name = 'test-blob.txt' data = b'Lorem ipsum' self.container_client.upload_blob(blob_name, data) blob_client = self.container_client.get_blob_client(blob_name) actual = self.container_client.download_blob(blob_client).read() self.assertEqual(actual, data) class FakeBlobServiceClient(object): # From Azure's BlobServiceClient API # https://docs.microsoft.com/fr-fr/python/api/azure-storage-blob/azure.storage.blob.blobserviceclient?view=azure-python def __init__(self, account_url, credential=None, **kwargs): self._account_url = account_url 
self._credential = credential self.__container_clients = OrderedDict() @classmethod def from_connection_string(cls, conn_str, credential=None, **kwargs): account_url, secondary, credential = \ azure.storage.blob._shared.base_client.parse_connection_str(conn_str, credential, 'blob') if 'secondary_hostname' not in kwargs: kwargs['secondary_hostname'] = secondary return cls(account_url, credential=credential, **kwargs) def create_container(self, container_name, metadata=None): if container_name in self.__container_clients: raise azure.core.exceptions.ResourceExistsError('The specified container already exists.') container_client = FakeContainerClient(self, container_name) if metadata is not None: container_client.create_container(metadata) self.__container_clients[container_name] = container_client return container_client def delete_container(self, container_name): del self.__container_clients[container_name] def get_blob_client(self, container, blob): container = self.__container_clients[container] blob_client = container.get_blob_client(blob) return blob_client def get_container_client(self, container): if container not in self.__container_clients: raise azure.core.exceptions.ResourceNotFoundError('The specified container does not exist.') return self.__container_clients[container] class FakeBlobServiceClientTest(unittest.TestCase): def setUp(self): self.blob_service_client = FakeBlobServiceClient.from_connection_string(CONNECT_STR) def test_nonexistent_container(self): with self.assertRaises(azure.core.exceptions.ResourceNotFoundError): self.blob_service_client.get_container_client('test-container') def test_create_container(self): container_name = 'test_container' expected = self.blob_service_client.create_container(container_name) actual = self.blob_service_client.get_container_client(container_name) self.assertEqual(actual, expected) def test_duplicate_container(self): container_name = 'test-container' self.blob_service_client.create_container(container_name) with self.assertRaises(azure.core.exceptions.ResourceExistsError): self.blob_service_client.create_container(container_name) def test_delete_container(self): container_name = 'test_container' self.blob_service_client.create_container(container_name) self.blob_service_client.delete_container(container_name) with self.assertRaises(azure.core.exceptions.ResourceNotFoundError): self.blob_service_client.get_container_client(container_name) def test_get_blob_client(self): container_name = 'test_container' blob_name = 'test-blob.txt' self.blob_service_client.create_container(container_name) blob_client = self.blob_service_client.get_blob_client(container_name, blob_name) self.assertEqual(blob_client.blob_name, blob_name) if DISABLE_MOCKS: CLIENT = azure.storage.blob.BlobServiceClient.from_connection_string(CONNECT_STR) else: CLIENT = FakeBlobServiceClient.from_connection_string(CONNECT_STR) def get_container_client(): return CLIENT.get_container_client(container=CONTAINER_NAME) def cleanup_container(): container_client = get_container_client() container_client.delete_blobs() def put_to_container(blob_name, contents, num_attempts=12, sleep_time=5): logger.debug('%r', locals()) # # In real life, it can take a few seconds for the container to become ready. # If we try to write to the key while the container while it isn't ready, we # will get a StorageError: NotFound. 
# for attempt in range(num_attempts): try: container_client = get_container_client() container_client.upload_blob(blob_name, contents) return except azure.common.AzureHttpError as err: logger.error('caught %r, retrying', err) time.sleep(sleep_time) assert False, 'failed to create container %s after %d attempts' % (CONTAINER_NAME, num_attempts) def setUpModule(): # noqa """Called once by unittest when initializing this module. Set up the test Azure container. """ CLIENT.create_container(CONTAINER_NAME) def tearDownModule(): # noqa """Called once by unittest when tearing down this module. Empty and removes the test Azure container. """ try: container_client = get_container_client() container_client.delete_container() except azure.common.AzureHttpError: pass class ReaderTest(unittest.TestCase): def tearDown(self): cleanup_container() def test_iter(self): """Are Azure Blob Storage files iterated over correctly?""" expected = u"hello wořld\nhow are you?".encode('utf8') blob_name = "test_iter_%s" % BLOB_NAME put_to_container(blob_name, contents=expected) # connect to fake Azure Blob Storage and read from the fake key we filled above fin = smart_open.azure.Reader(CONTAINER_NAME, blob_name, CLIENT) output = [line.rstrip(b'\n') for line in fin] self.assertEqual(output, expected.split(b'\n')) def test_iter_context_manager(self): # same thing but using a context manager expected = u"hello wořld\nhow are you?".encode('utf8') blob_name = "test_iter_context_manager_%s" % BLOB_NAME put_to_container(blob_name, contents=expected) with smart_open.azure.Reader(CONTAINER_NAME, blob_name, CLIENT) as fin: output = [line.rstrip(b'\n') for line in fin] self.assertEqual(output, expected.split(b'\n')) def test_read(self): """Are Azure Blob Storage files read correctly?""" content = u"hello wořld\nhow are you?".encode('utf8') blob_name = "test_read_%s" % BLOB_NAME put_to_container(blob_name, contents=content) logger.debug('content: %r len: %r', content, len(content)) fin = smart_open.azure.Reader(CONTAINER_NAME, blob_name, CLIENT) self.assertEqual(content[:6], fin.read(6)) self.assertEqual(content[6:14], fin.read(8)) # ř is 2 bytes self.assertEqual(content[14:], fin.read()) # read the rest def test_read_max_concurrency(self): """Are Azure Blob Storage files read correctly?""" content = u"hello wořld\nhow are you?".encode('utf8') blob_name = "test_read_%s" % BLOB_NAME put_to_container(blob_name, contents=content) logger.debug('content: %r len: %r', content, len(content)) fin = smart_open.azure.Reader(CONTAINER_NAME, blob_name, CLIENT, max_concurrency=4) self.assertEqual(content[:6], fin.read(6)) self.assertEqual(content[6:14], fin.read(8)) # ř is 2 bytes self.assertEqual(content[14:], fin.read()) # read the rest def test_seek_beginning(self): """Does seeking to the beginning of Azure Blob Storage files work correctly?""" content = u"hello wořld\nhow are you?".encode('utf8') blob_name = "test_seek_beginning_%s" % BLOB_NAME put_to_container(blob_name, contents=content) fin = smart_open.azure.Reader(CONTAINER_NAME, blob_name, CLIENT) self.assertEqual(content[:6], fin.read(6)) self.assertEqual(content[6:14], fin.read(8)) # ř is 2 bytes fin.seek(0) self.assertEqual(content, fin.read()) # no size given => read whole file fin.seek(0) self.assertEqual(content, fin.read(-1)) # same thing def test_seek_start(self): """Does seeking from the start of Azure Blob Storage files work correctly?""" content = u"hello wořld\nhow are you?".encode('utf8') blob_name = "test_seek_start_%s" % BLOB_NAME put_to_container(blob_name, 
contents=content) fin = smart_open.azure.Reader(CONTAINER_NAME, blob_name, CLIENT) seek = fin.seek(6) self.assertEqual(seek, 6) self.assertEqual(fin.tell(), 6) self.assertEqual(fin.read(6), u'wořld'.encode('utf-8')) def test_seek_current(self): """Does seeking from the middle of Azure Blob Storage files work correctly?""" content = u"hello wořld\nhow are you?".encode('utf8') blob_name = "test_seek_current_%s" % BLOB_NAME put_to_container(blob_name, contents=content) fin = smart_open.azure.Reader(CONTAINER_NAME, blob_name, CLIENT) self.assertEqual(fin.read(5), b'hello') seek = fin.seek(1, whence=smart_open.constants.WHENCE_CURRENT) self.assertEqual(seek, 6) self.assertEqual(fin.read(6), u'wořld'.encode('utf-8')) def test_seek_end(self): """Does seeking from the end of Azure Blob Storage files work correctly?""" content = u"hello wořld\nhow are you?".encode('utf8') blob_name = "test_seek_end_%s" % BLOB_NAME put_to_container(blob_name, contents=content) fin = smart_open.azure.Reader(CONTAINER_NAME, blob_name, CLIENT) seek = fin.seek(-4, whence=smart_open.constants.WHENCE_END) self.assertEqual(seek, len(content) - 4) self.assertEqual(fin.read(), b'you?') def test_detect_eof(self): content = u"hello wořld\nhow are you?".encode('utf8') blob_name = "test_detect_eof_%s" % BLOB_NAME put_to_container(blob_name, contents=content) fin = smart_open.azure.Reader(CONTAINER_NAME, blob_name, CLIENT) fin.read() eof = fin.tell() self.assertEqual(eof, len(content)) fin.seek(0, whence=smart_open.constants.WHENCE_END) self.assertEqual(eof, fin.tell()) def test_read_gzip(self): expected = u'раcцветали яблони и груши, поплыли туманы над рекой...'.encode('utf-8') buf = io.BytesIO() buf.close = lambda: None # keep buffer open so that we can .getvalue() with gzip.GzipFile(fileobj=buf, mode='w') as zipfile: zipfile.write(expected) blob_name = "test_read_gzip_%s" % BLOB_NAME put_to_container(blob_name, contents=buf.getvalue()) # # Make sure we're reading things correctly. # with smart_open.azure.Reader(CONTAINER_NAME, blob_name, CLIENT) as fin: self.assertEqual(fin.read(), buf.getvalue()) # # Make sure the buffer we wrote is legitimate gzip. 
# sanity_buf = io.BytesIO(buf.getvalue()) with gzip.GzipFile(fileobj=sanity_buf) as zipfile: self.assertEqual(zipfile.read(), expected) logger.debug('starting actual test') with smart_open.azure.Reader(CONTAINER_NAME, blob_name, CLIENT) as fin: with gzip.GzipFile(fileobj=fin) as zipfile: actual = zipfile.read() self.assertEqual(expected, actual) def test_readline(self): content = b'englishman\nin\nnew\nyork\n' blob_name = "test_readline_%s" % BLOB_NAME put_to_container(blob_name, contents=content) with smart_open.azure.Reader(CONTAINER_NAME, blob_name, CLIENT) as fin: fin.readline() self.assertEqual(fin.tell(), content.index(b'\n')+1) fin.seek(0) actual = list(fin) self.assertEqual(fin.tell(), len(content)) expected = [b'englishman\n', b'in\n', b'new\n', b'york\n'] self.assertEqual(expected, actual) def test_readline_tiny_buffer(self): content = b'englishman\nin\nnew\nyork\n' blob_name = "test_readline_tiny_buffer_%s" % BLOB_NAME put_to_container(blob_name, contents=content) with smart_open.azure.Reader( CONTAINER_NAME, blob_name, CLIENT, buffer_size=8 ) as fin: actual = list(fin) expected = [b'englishman\n', b'in\n', b'new\n', b'york\n'] self.assertEqual(expected, actual) def test_read0_does_not_return_data(self): content = b'englishman\nin\nnew\nyork\n' blob_name = "test_read0_does_not_return_data_%s" % BLOB_NAME put_to_container(blob_name, contents=content) with smart_open.azure.Reader(CONTAINER_NAME, blob_name, CLIENT) as fin: data = fin.read(0) self.assertEqual(data, b'') def test_read_past_end(self): content = b'englishman\nin\nnew\nyork\n' blob_name = "test_read_past_end_%s" % BLOB_NAME put_to_container(blob_name, contents=content) with smart_open.azure.Reader(CONTAINER_NAME, blob_name, CLIENT) as fin: data = fin.read(100) self.assertEqual(data, content) class WriterTest(unittest.TestCase): """Test writing into Azure Blob files.""" def tearDown(self): cleanup_container() def test_write_01(self): """Does writing into Azure Blob Storage work correctly?""" test_string = u"žluťoučký koníček".encode('utf8') blob_name = "test_write_01_%s" % BLOB_NAME with smart_open.azure.Writer(CONTAINER_NAME, blob_name, CLIENT) as fout: fout.write(test_string) output = list(smart_open.open( "azure://%s/%s" % (CONTAINER_NAME, blob_name), "rb", transport_params=dict(client=CLIENT), )) self.assertEqual(output, [test_string]) def test_incorrect_input(self): """Does azure write fail on incorrect input?""" blob_name = "test_incorrect_input_%s" % BLOB_NAME try: with smart_open.azure.Writer(CONTAINER_NAME, blob_name, CLIENT) as fin: fin.write(None) except TypeError: pass else: self.fail() def test_write_02(self): """Does Azure Blob Storage write unicode-utf8 conversion work?""" blob_name = "test_write_02_%s" % BLOB_NAME smart_open_write = smart_open.azure.Writer(CONTAINER_NAME, blob_name, CLIENT) smart_open_write.tell() logger.info("smart_open_write: %r", smart_open_write) with smart_open_write as fout: fout.write(u"testžížáč".encode("utf-8")) self.assertEqual(fout.tell(), 14) def test_write_03(self): """Do multiple writes less than the min_part_size work correctly?""" # write blob_name = "test_write_03_%s" % BLOB_NAME min_part_size = 256 * 1024 smart_open_write = smart_open.azure.Writer( CONTAINER_NAME, blob_name, CLIENT, min_part_size=min_part_size ) local_write = io.BytesIO() with smart_open_write as fout: first_part = b"t" * 262141 fout.write(first_part) local_write.write(first_part) self.assertEqual(fout._current_part.tell(), 262141) second_part = b"t\n" fout.write(second_part) 
local_write.write(second_part) self.assertEqual(fout._current_part.tell(), 262143) self.assertEqual(fout._total_parts, 0) third_part = b"t" fout.write(third_part) local_write.write(third_part) self.assertEqual(fout._current_part.tell(), 0) self.assertEqual(fout._total_parts, 1) fourth_part = b"t" * 1 fout.write(fourth_part) local_write.write(fourth_part) self.assertEqual(fout._current_part.tell(), 1) self.assertEqual(fout._total_parts, 1) # read back the same key and check its content uri = "azure://%s/%s" % (CONTAINER_NAME, blob_name) output = list(smart_open.open(uri, transport_params=dict(client=CLIENT))) local_write.seek(0) actual = [line.decode("utf-8") for line in list(local_write)] self.assertEqual(output, actual) def test_write_03a(self): """Do multiple writes greater than or equal to the min_part_size work correctly?""" min_part_size = 256 * 1024 blob_name = "test_write_03_%s" % BLOB_NAME smart_open_write = smart_open.azure.Writer( CONTAINER_NAME, blob_name, CLIENT, min_part_size=min_part_size ) local_write = io.BytesIO() with smart_open_write as fout: for i in range(1, 4): part = b"t" * min_part_size fout.write(part) local_write.write(part) self.assertEqual(fout._current_part.tell(), 0) self.assertEqual(fout._total_parts, i) # read back the same key and check its content uri = "azure://%s/%s" % (CONTAINER_NAME, blob_name) output = list(smart_open.open(uri, transport_params=dict(client=CLIENT))) local_write.seek(0) actual = [line.decode("utf-8") for line in list(local_write)] self.assertEqual(output, actual) def test_write_04(self): """Does writing no data cause key with an empty value to be created?""" blob_name = "test_write_04_%s" % BLOB_NAME smart_open_write = smart_open.azure.Writer(CONTAINER_NAME, blob_name, CLIENT) with smart_open_write as fout: # noqa pass # read back the same key and check its content output = list(smart_open.open( "azure://%s/%s" % (CONTAINER_NAME, blob_name), transport_params=dict(client=CLIENT)) ) self.assertEqual(output, []) def test_gzip(self): expected = u'а не спеть ли мне песню... 
о любви'.encode('utf-8') blob_name = "test_gzip_%s" % BLOB_NAME with smart_open.azure.Writer(CONTAINER_NAME, blob_name, CLIENT) as fout: with gzip.GzipFile(fileobj=fout, mode='w') as zipfile: zipfile.write(expected) with smart_open.azure.Reader(CONTAINER_NAME, blob_name, CLIENT) as fin: with gzip.GzipFile(fileobj=fin) as zipfile: actual = zipfile.read() self.assertEqual(expected, actual) def test_buffered_writer_wrapper_works(self): """ Ensure that we can wrap a smart_open azure stream in a BufferedWriter, which passes a memoryview object to the underlying stream in python >= 2.7 """ expected = u'не думай о секундах свысока' blob_name = "test_buffered_writer_wrapper_works_%s" % BLOB_NAME with smart_open.azure.Writer(CONTAINER_NAME, blob_name, CLIENT) as fout: with io.BufferedWriter(fout) as sub_out: sub_out.write(expected.encode('utf-8')) with smart_open.open( "azure://%s/%s" % (CONTAINER_NAME, blob_name), 'rb', transport_params=dict(client=CLIENT) ) as fin: with io.TextIOWrapper(fin, encoding='utf-8') as text: actual = text.read() self.assertEqual(expected, actual) def test_binary_iterator(self): expected = u"выйду ночью в поле с конём".encode('utf-8').split(b' ') blob_name = "test_binary_iterator_%s" % BLOB_NAME put_to_container(blob_name=blob_name, contents=b"\n".join(expected)) with smart_open.azure.open(CONTAINER_NAME, blob_name, 'rb', CLIENT) as fin: actual = [line.rstrip() for line in fin] self.assertEqual(expected, actual) def test_nonexisting_container(self): expected = u"выйду ночью в поле с конём".encode('utf-8') with self.assertRaises(azure.core.exceptions.ResourceNotFoundError): with smart_open.azure.open( 'thiscontainerdoesntexist', 'mykey', 'wb', CLIENT ) as fout: fout.write(expected) def test_double_close(self): text = u'там за туманами, вечными, пьяными'.encode('utf-8') fout = smart_open.azure.open(CONTAINER_NAME, 'key', 'wb', CLIENT) fout.write(text) fout.close() fout.close() def test_flush_close(self): text = u'там за туманами, вечными, пьяными'.encode('utf-8') fout = smart_open.azure.open(CONTAINER_NAME, 'key', 'wb', CLIENT) fout.write(text) fout.flush() fout.close() smart_open-5.2.1/smart_open/tests/test_bytebuffer.py000066400000000000000000000151401411241424400227430ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2019 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). 
# import io import random import unittest import smart_open.bytebuffer CHUNK_SIZE = 1024 def int2byte(i): return bytes((i, )) def random_byte_string(length=CHUNK_SIZE): rand_bytes = [int2byte(random.randint(0, 255)) for _ in range(length)] return b''.join(rand_bytes) def bytebuffer_and_random_contents(): buf = smart_open.bytebuffer.ByteBuffer(CHUNK_SIZE) contents = random_byte_string(CHUNK_SIZE) content_reader = io.BytesIO(contents) buf.fill(content_reader) return [buf, contents] class ByteBufferTest(unittest.TestCase): def test_len(self): buf = smart_open.bytebuffer.ByteBuffer(CHUNK_SIZE) self.assertEqual(len(buf), 0) contents = b'foo bar baz' buf._bytes = contents self.assertEqual(len(buf), len(contents)) pos = 4 buf._pos = pos self.assertEqual(len(buf), len(contents) - pos) def test_fill_from_reader(self): buf = smart_open.bytebuffer.ByteBuffer(CHUNK_SIZE) contents = random_byte_string(CHUNK_SIZE) content_reader = io.BytesIO(contents) bytes_filled = buf.fill(content_reader) self.assertEqual(bytes_filled, CHUNK_SIZE) self.assertEqual(len(buf), CHUNK_SIZE) self.assertEqual(buf._bytes, contents) def test_fill_from_iterable(self): buf = smart_open.bytebuffer.ByteBuffer(CHUNK_SIZE) contents = random_byte_string(CHUNK_SIZE) contents_iter = (contents[i:i+8] for i in range(0, CHUNK_SIZE, 8)) bytes_filled = buf.fill(contents_iter) self.assertEqual(bytes_filled, CHUNK_SIZE) self.assertEqual(len(buf), CHUNK_SIZE) self.assertEqual(buf._bytes, contents) def test_fill_from_list(self): buf = smart_open.bytebuffer.ByteBuffer(CHUNK_SIZE) contents = random_byte_string(CHUNK_SIZE) contents_list = [contents[i:i+7] for i in range(0, CHUNK_SIZE, 7)] bytes_filled = buf.fill(contents_list) self.assertEqual(bytes_filled, CHUNK_SIZE) self.assertEqual(len(buf), CHUNK_SIZE) self.assertEqual(buf._bytes, contents) def test_fill_multiple(self): buf = smart_open.bytebuffer.ByteBuffer(CHUNK_SIZE) long_contents = random_byte_string(CHUNK_SIZE * 4) long_content_reader = io.BytesIO(long_contents) first_bytes_filled = buf.fill(long_content_reader) self.assertEqual(first_bytes_filled, CHUNK_SIZE) second_bytes_filled = buf.fill(long_content_reader) self.assertEqual(second_bytes_filled, CHUNK_SIZE) self.assertEqual(len(buf), 2 * CHUNK_SIZE) def test_fill_size(self): buf = smart_open.bytebuffer.ByteBuffer(CHUNK_SIZE) contents = random_byte_string(CHUNK_SIZE * 2) content_reader = io.BytesIO(contents) fill_size = int(CHUNK_SIZE / 2) bytes_filled = buf.fill(content_reader, size=fill_size) self.assertEqual(bytes_filled, fill_size) self.assertEqual(len(buf), fill_size) second_bytes_filled = buf.fill(content_reader, size=CHUNK_SIZE+1) self.assertEqual(second_bytes_filled, CHUNK_SIZE) self.assertEqual(len(buf), fill_size + CHUNK_SIZE) def test_fill_reader_exhaustion(self): buf = smart_open.bytebuffer.ByteBuffer(CHUNK_SIZE) short_content_size = int(CHUNK_SIZE / 4) short_contents = random_byte_string(short_content_size) short_content_reader = io.BytesIO(short_contents) bytes_filled = buf.fill(short_content_reader) self.assertEqual(bytes_filled, short_content_size) self.assertEqual(len(buf), short_content_size) def test_fill_iterable_exhaustion(self): buf = smart_open.bytebuffer.ByteBuffer(CHUNK_SIZE) short_content_size = int(CHUNK_SIZE / 4) short_contents = random_byte_string(short_content_size) short_contents_iter = (short_contents[i:i+8] for i in range(0, short_content_size, 8)) bytes_filled = buf.fill(short_contents_iter) self.assertEqual(bytes_filled, short_content_size) self.assertEqual(len(buf), short_content_size) def 
test_empty(self): buf, _ = bytebuffer_and_random_contents() self.assertEqual(len(buf), CHUNK_SIZE) buf.empty() self.assertEqual(len(buf), 0) def test_peek(self): buf, contents = bytebuffer_and_random_contents() self.assertEqual(buf.peek(), contents) self.assertEqual(len(buf), CHUNK_SIZE) self.assertEqual(buf.peek(64), contents[0:64]) self.assertEqual(buf.peek(CHUNK_SIZE * 10), contents) def test_read(self): buf, contents = bytebuffer_and_random_contents() self.assertEqual(buf.read(), contents) self.assertEqual(len(buf), 0) self.assertEqual(buf.read(), b'') def test_read_size(self): buf, contents = bytebuffer_and_random_contents() read_size = 128 self.assertEqual(buf.read(read_size), contents[:read_size]) self.assertEqual(len(buf), CHUNK_SIZE - read_size) self.assertEqual(buf.read(CHUNK_SIZE*2), contents[read_size:]) self.assertEqual(len(buf), 0) def test_readline(self): """Does the readline function work as expected in the simple case?""" expected = (b'this is the very first line\n', b'and this the second') buf = smart_open.bytebuffer.ByteBuffer() buf.fill(io.BytesIO(b''.join(expected))) first_line = buf.readline(b'\n') self.assertEqual(expected[0], first_line) second_line = buf.readline(b'\n') self.assertEqual(expected[1], second_line) def test_readline_middle(self): """Does the readline function work when we're in the middle of the buffer?""" expected = (b'this is the very first line\n', b'and this the second') buf = smart_open.bytebuffer.ByteBuffer() buf.fill(io.BytesIO(b''.join(expected))) buf.read(5) first_line = buf.readline(b'\n') self.assertEqual(expected[0][5:], first_line) buf.read(5) second_line = buf.readline(b'\n') self.assertEqual(expected[1][5:], second_line) def test_readline_terminator(self): """Does the readline function respect the terminator parameter?""" buf = smart_open.bytebuffer.ByteBuffer() buf.fill(io.BytesIO(b'one!two.three,')) expected = [b'one!', b'two.', b'three,'] actual = [buf.readline(b'!'), buf.readline(b'.'), buf.readline(b',')] self.assertEqual(expected, actual) smart_open-5.2.1/smart_open/tests/test_data/000077500000000000000000000000001411241424400211445ustar00rootroot00000000000000smart_open-5.2.1/smart_open/tests/test_data/1984.txt.bz2000066400000000000000000000003671411241424400230140ustar00rootroot00000000000000BZh91AY&SY]ZW@ " ?04Ec6ܒt(l V(6E.^iL''lVdd33UJ\!S$')+ٸ'Ů4;[߳ w}vYy#}6kPv-%ֳg?NHVW>d-_:E pfeRXx~A&V^>K2 pJx/t:,!XBFgTx6pڽ%lh"[Zz[;HbPӬ€_[ʇ"gYZsmart_open-5.2.1/smart_open/tests/test_data/cp852.tsv.txt000066400000000000000000000000261411241424400233570ustar00rootroot00000000000000tmto bude budem bylismart_open-5.2.1/smart_open/tests/test_data/crime-and-punishment.txt000066400000000000000000000057651411241424400257510ustar00rootroot00000000000000В начале июля, в чрезвычайно жаркое время, под вечер, один молодой человек вышел из своей каморки, которую нанимал от жильцов в С -- м переулке, на улицу и медленно, как бы в нерешимости, отправился к К -- ну мосту. Он благополучно избегнул встречи с своею хозяйкой на лестнице. Каморка его приходилась под самою кровлей высокого пятиэтажного дома и походила более на шкаф, чем на квартиру. Квартирная же хозяйка его, у которой он нанимал эту каморку с обедом и прислугой, помещалась одною лестницей ниже, в отдельной квартире, и каждый раз, при выходе на улицу, ему непременно надо было проходить мимо хозяйкиной кухни, почти всегда настежь отворенной на лестницу. И каждый раз молодой человек, проходя мимо, чувствовал какое-то болезненное и трусливое ощущение, которого стыдился и от которого морщился. 
Он был должен кругом хозяйке и боялся с нею встретиться. Не то чтоб он был так труслив и забит, совсем даже напротив; но с некоторого времени он был в раздражительном и напряженном состоянии, похожем на ипохондрию. Он до того углубился в себя и уединился от всех, что боялся даже всякой встречи, не только встречи с хозяйкой. Он был задавлен бедностью; но даже стесненное положение перестало в последнее время тяготить его. Насущными делами своими он совсем перестал и не хотел заниматься. Никакой хозяйки, в сущности, он не боялся, что бы та ни замышляла против него. Но останавливаться на лестнице, слушать всякий вздор про всю эту обыденную дребедень, до которой ему нет никакого дела, все эти приставания о платеже, угрозы, жалобы, и при этом самому изворачиваться, извиняться, лгать, -- нет уж, лучше проскользнуть как-нибудь кошкой по лестнице и улизнуть, чтобы никто не видал. smart_open-5.2.1/smart_open/tests/test_data/crime-and-punishment.txt.gz000066400000000000000000000022561411241424400263600ustar00rootroot00000000000000Zcrime-and-punishment.txt}V[nH).8!{K#|zH\c?l ?\B_BP{%st%CPƓVܱvt ІJCBuc?;}8O SWAh\1q nVD{~c|zy,)s9~cfqOO@5#tӧNIdƓ) <>0S>!<>-\\-؋1 .0Ycj DĽ{A djERc70Pf(x"n3ayalou1 4A{F:v:N\UL0·bD^)Mw\XhGo+g *w'r&JD2HbRyV ]jgw$uBO( Ϝ*(k}-f:2Q#g@̙6,-,̜ 4a%CE=sMg) ,~l=2@ KTZ#Qųh?f/djɼaM ZZŠ:>MԼ+8,MMXQuZB 2BmA}cT&¨=f!1Ij*vÀ"5̩9=N9p_NKKBp_E%@ֽNrT 삩W.ƫ8yq{:P>&Frx 7όW[pXK,ئKqY*^a$3`83+Uwhaez!H,U"..YY, VB)VO^rU_Avl- ^M[>o*'v0WFĉz$.Xب8`S'*MTqk%$np smart_open-5.2.1/smart_open/tests/test_data/crime-and-punishment.txt.xz000066400000000000000000000022601411241424400263740ustar00rootroot000000000000007zXZִF!t/ p]h$ o+Szڀq.Ǔ6WߤOSuZ t[1#n%P50PK~Zd!;7G`m1[D9hJ5ɠ74^O eLkX,xٗykC6;i"˼eՆmlt?B )Ә4p@U!mCm&u ʸI"n#}le,kZM<ЧTG 4iE55|3 ~4Q&T)3;۟?&z=(Qm ~JK߂ !Edmj]i4l6 vL薰W@|,h."/]wy# Ѡb*?s0jΘ&(ml[_˄l 'r4G%-h50%Y6=.l1u]f6VVF?x/x01n/ŌmIZ m }l>90Pt=QSW Ksj'nR/פS'!ěc>P@wm^8iP Xlk ɚY?5aJZZD)yL|<~2(¾ʆe>ɪ鞗 =^,BkUxc~Cgwa}pN֑t+7<986ןRw(:*݅5){Iv[KeO9 62OXViXo%oVѨ8fjņa ra@_$nθ(ИDLߏe!e<~F4䧨9)x. So>GoY`GC*\烋(BO ;]K ?^g*ȎRI K{ t3أVHf&_|VW;K|L$$Wj t)DL(HxlM>B:Cu4|/WESfTlӶww}xJWֻ]LM{* o1gYZsmart_open-5.2.1/smart_open/tests/test_data/crlf_at_1k_boundary.warc.gz000066400000000000000000000050341411241424400263530ustar00rootroot00000000000000XkoFH[$jml|wTnwRU 1$iӪ_^ݙyffgN./lmNM&2XSy_Ԁ-6JRҤ3,hb5&!qYvtj"&sxŲ+ /X2jZY~UluF\:JZ~E!ҾRf ֮#S ohŚ(_2!e @HӉH%!S\~u$PM Zd ~[u݇XNl>nё3J^2H d!K )Uzy]̴Ej8% 5'wYte $S!o *kwYk6-@с2y 5t ]p狩aMZ dYש.LR,RX mbV&7N~&]&!}ÙT1E;,&KR΁ADi (P9*a<{T8~Cxd.Jn/JcӰDdb\$V!R H!ki'^%/p? >@qu3vBˆDpɫ_Y5Z|d.6H@]jB&vsV-wa6)+6M6kV8b_?*0Գ؇vcֆY-OO|*R#p6Ljx1^OjyPgmKl4{=4\tY6@{B{YN'd\ RE# jN5m90ek\s#92)Ҵue_fs ̨ր7uS4w N{wW Xx8PVJgFn?4 E-qGOo2?1_$ha aOj='ȯwRRx_4 Aпzvk_u?7!aXu+ u.ۏOc쳜e!9'Oa@,Y*&R[`ă=}MA=`HF`yI֜Q}㐼kXb>1U%6$j }㛴…\Dmu UH?#x:*k.]? 63ى~Zqaz1 ~ިso4 :>nnlo N9[5=8l/g)#/:6 &eT*@Ŋg4q|t\L1 8R # # This code is distributed under the terms and conditions # from the MIT License (MIT). 
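#
# A rough orientation for the fakes defined below (this summarises the test
# code in this module, not the real google-cloud-storage API): FakeClient,
# FakeBucket and FakeBlob emulate just enough of the client surface that
# smart_open.gcs touches, while FakeAuthorizedSession stands in for the
# resumable-upload session.  In these fakes, a PUT whose Content-Range ends
# in '*' (total size still unknown) leaves the upload open, whereas a
# concrete total finalises the blob, for example:
#
#   session.put(upload_url, io.BytesIO(b'test'),
#               headers={'Content-Range': 'bytes 0-3/*'})  # upload stays open
#   session.put(upload_url, io.BytesIO(b''),
#               headers={'Content-Range': 'bytes 0-3/4'})  # blob is finalised
#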
# import gzip import inspect import io import logging import os import time import uuid import unittest from unittest import mock import warnings from collections import OrderedDict import google.cloud import google.api_core.exceptions import smart_open import smart_open.constants BUCKET_NAME = 'test-smartopen-{}'.format(uuid.uuid4().hex) BLOB_NAME = 'test-blob' WRITE_BLOB_NAME = 'test-write-blob' DISABLE_MOCKS = os.environ.get('SO_DISABLE_GCS_MOCKS') == "1" RESUMABLE_SESSION_URI_TEMPLATE = ( 'https://www.googleapis.com/upload/storage/v1/b/' '%(bucket)s' '/o?uploadType=resumable&upload_id=' '%(upload_id)s' ) logger = logging.getLogger(__name__) def ignore_resource_warnings(): warnings.filterwarnings("ignore", category=ResourceWarning, message="unclosed.*") # noqa class FakeBucket(object): def __init__(self, client, name=None): self.client = client # type: FakeClient self.name = name self.blobs = OrderedDict() self._exists = True # # This is simpler than creating a backend and metaclass to store the state of every bucket created # self.client.register_bucket(self) def blob(self, blob_id): return self.blobs.get(blob_id, FakeBlob(blob_id, self)) def delete(self): self.client.delete_bucket(self) self._exists = False for blob in list(self.blobs.values()): blob.delete() def exists(self): return self._exists def get_blob(self, blob_id): try: return self.blobs[blob_id] except KeyError as e: raise google.cloud.exceptions.NotFound('Blob {} not found'.format(blob_id)) from e def list_blobs(self): return list(self.blobs.values()) def delete_blob(self, blob): del self.blobs[blob.name] def register_blob(self, blob): if blob.name not in self.blobs.keys(): self.blobs[blob.name] = blob def register_upload(self, upload): self.client.register_upload(upload) class FakeBucketTest(unittest.TestCase): def setUp(self): self.client = FakeClient() self.bucket = FakeBucket(self.client, 'test-bucket') def test_blob_registers_with_bucket(self): blob_id = 'blob.txt' expected = FakeBlob(blob_id, self.bucket) actual = self.bucket.blob(blob_id) self.assertEqual(actual, expected) def test_blob_alternate_constuctor(self): blob_id = 'blob.txt' expected = self.bucket.blob(blob_id) actual = self.bucket.list_blobs()[0] self.assertEqual(actual, expected) def test_delete(self): blob_id = 'blob.txt' blob = FakeBlob(blob_id, self.bucket) self.bucket.delete() self.assertFalse(self.bucket.exists()) self.assertFalse(blob.exists()) def test_get_multiple_blobs(self): blob_one_id = 'blob_one.avro' blob_two_id = 'blob_two.parquet' blob_one = self.bucket.blob(blob_one_id) blob_two = self.bucket.blob(blob_two_id) actual_first_blob = self.bucket.get_blob(blob_one_id) actual_second_blob = self.bucket.get_blob(blob_two_id) self.assertEqual(actual_first_blob, blob_one) self.assertEqual(actual_second_blob, blob_two) def test_get_nonexistent_blob(self): with self.assertRaises(google.cloud.exceptions.NotFound): self.bucket.get_blob('test-blob') def test_list_blobs(self): blob_one = self.bucket.blob('blob_one.avro') blob_two = self.bucket.blob('blob_two.parquet') actual = self.bucket.list_blobs() expected = [blob_one, blob_two] self.assertEqual(actual, expected) class FakeBlob(object): def __init__(self, name, bucket): self.name = name self._bucket = bucket # type: FakeBucket self._exists = False self.__contents = io.BytesIO() self._create_if_not_exists() def create_resumable_upload_session(self): resumeable_upload_url = RESUMABLE_SESSION_URI_TEMPLATE % dict( bucket=self._bucket.name, upload_id=str(uuid.uuid4()), ) upload = 
FakeBlobUpload(resumeable_upload_url, self) self._bucket.register_upload(upload) return resumeable_upload_url def delete(self): self._bucket.delete_blob(self) self._exists = False def download_as_bytes(self, start=0, end=None): # mimics Google's API by returning bytes # https://googleapis.dev/python/storage/latest/blobs.html#google.cloud.storage.blob.Blob.download_as_bytes if end is None: end = self.__contents.tell() self.__contents.seek(start) return self.__contents.read(end - start) def exists(self, client=None): return self._exists def upload_from_string(self, data): # mimics Google's API by accepting bytes or str, despite the method name # https://googleapis.dev/python/storage/latest/blobs.html#google.cloud.storage.blob.Blob.upload_from_string if isinstance(data, str): data = bytes(data, 'utf8') self.__contents = io.BytesIO(data) self.__contents.seek(0, io.SEEK_END) def write(self, data): self.upload_from_string(data) @property def bucket(self): return self._bucket @property def size(self): if self.__contents.tell() == 0: return None return self.__contents.tell() def _create_if_not_exists(self): self._bucket.register_blob(self) self._exists = True class FakeBlobTest(unittest.TestCase): def setUp(self): self.client = FakeClient() self.bucket = FakeBucket(self.client, 'test-bucket') def test_create_resumable_upload_session(self): blob = FakeBlob('fake-blob', self.bucket) resumable_upload_url = blob.create_resumable_upload_session() self.assertTrue(resumable_upload_url in self.client.uploads) def test_delete(self): blob = FakeBlob('fake-blob', self.bucket) blob.delete() self.assertFalse(blob.exists()) self.assertEqual(self.bucket.list_blobs(), []) def test_upload_download(self): blob = FakeBlob('fake-blob', self.bucket) contents = b'test' blob.upload_from_string(contents) self.assertEqual(blob.download_as_bytes(), b'test') self.assertEqual(blob.download_as_bytes(start=2), b'st') self.assertEqual(blob.download_as_bytes(end=2), b'te') self.assertEqual(blob.download_as_bytes(start=2, end=3), b's') def test_size(self): blob = FakeBlob('fake-blob', self.bucket) self.assertEqual(blob.size, None) blob.upload_from_string(b'test') self.assertEqual(blob.size, 4) class FakeCredentials(object): def __init__(self, client): self.client = client # type: FakeClient def before_request(self, *args, **kwargs): pass class FakeClient(object): def __init__(self, credentials=None): if credentials is None: credentials = FakeCredentials(self) self._credentials = credentials # type: FakeCredentials self.uploads = OrderedDict() self.__buckets = OrderedDict() def bucket(self, bucket_id): try: return self.__buckets[bucket_id] except KeyError as e: raise google.cloud.exceptions.NotFound('Bucket %s not found' % bucket_id) from e def create_bucket(self, bucket_id): bucket = FakeBucket(self, bucket_id) return bucket def get_bucket(self, bucket_id): return self.bucket(bucket_id) def register_bucket(self, bucket): if bucket.name in self.__buckets: raise google.cloud.exceptions.Conflict('Bucket %s already exists' % bucket.name) self.__buckets[bucket.name] = bucket def delete_bucket(self, bucket): del self.__buckets[bucket.name] def register_upload(self, upload): self.uploads[upload.url] = upload class FakeClientTest(unittest.TestCase): def setUp(self): self.client = FakeClient() def test_nonexistent_bucket(self): with self.assertRaises(google.cloud.exceptions.NotFound): self.client.bucket('test-bucket') def test_bucket(self): bucket_id = 'test-bucket' bucket = FakeBucket(self.client, bucket_id) actual = 
self.client.bucket(bucket_id) self.assertEqual(actual, bucket) def test_duplicate_bucket(self): bucket_id = 'test-bucket' FakeBucket(self.client, bucket_id) with self.assertRaises(google.cloud.exceptions.Conflict): FakeBucket(self.client, bucket_id) def test_create_bucket(self): bucket_id = 'test-bucket' bucket = self.client.create_bucket(bucket_id) actual = self.client.get_bucket(bucket_id) self.assertEqual(actual, bucket) class FakeBlobUpload(object): def __init__(self, url, blob): self.url = url self.blob = blob # type: FakeBlob self._finished = False self.__contents = io.BytesIO() def write(self, data): self.__contents.write(data) def finish(self): if not self._finished: self.__contents.seek(0) data = self.__contents.read() self.blob.upload_from_string(data) self._finished = True def terminate(self): self.blob.delete() self.__contents = None class FakeResponse(object): def __init__(self, status_code=200, text=None): self.status_code = status_code self.text = text class FakeAuthorizedSession(object): def __init__(self, credentials): self._credentials = credentials # type: FakeCredentials def delete(self, upload_url): upload = self._credentials.client.uploads.pop(upload_url) upload.terminate() def put(self, url, data=None, headers=None): upload = self._credentials.client.uploads[url] if data is not None: if hasattr(data, 'read'): upload.write(data.read()) else: upload.write(data) if not headers.get('Content-Range', '').endswith(smart_open.gcs._UNKNOWN): upload.finish() return FakeResponse(200) return FakeResponse(smart_open.gcs._UPLOAD_INCOMPLETE_STATUS_CODES[0]) @staticmethod def _blob_with_url(url, client): # type: (str, FakeClient) -> FakeBlobUpload return client.uploads.get(url) class FakeAuthorizedSessionTest(unittest.TestCase): def setUp(self): self.client = FakeClient() self.credentials = FakeCredentials(self.client) self.session = FakeAuthorizedSession(self.credentials) self.bucket = FakeBucket(self.client, 'test-bucket') self.blob = FakeBlob('test-blob', self.bucket) self.upload_url = self.blob.create_resumable_upload_session() def test_delete(self): self.session.delete(self.upload_url) self.assertFalse(self.blob.exists()) self.assertDictEqual(self.client.uploads, {}) def test_unfinished_put_does_not_write_to_blob(self): data = io.BytesIO(b'test') headers = { 'Content-Range': 'bytes 0-3/*', 'Content-Length': str(4), } response = self.session.put(self.upload_url, data, headers=headers) self.assertIn(response.status_code, smart_open.gcs._UPLOAD_INCOMPLETE_STATUS_CODES) self.session._blob_with_url(self.upload_url, self.client) blob_contents = self.blob.download_as_bytes() self.assertEqual(blob_contents, b'') def test_finished_put_writes_to_blob(self): data = io.BytesIO(b'test') headers = { 'Content-Range': 'bytes 0-3/4', 'Content-Length': str(4), } response = self.session.put(self.upload_url, data, headers=headers) self.assertEqual(response.status_code, 200) self.session._blob_with_url(self.upload_url, self.client) blob_contents = self.blob.download_as_bytes() data.seek(0) self.assertEqual(blob_contents, data.read()) if DISABLE_MOCKS: storage_client = google.cloud.storage.Client() else: storage_client = FakeClient() def get_bucket(): return storage_client.bucket(BUCKET_NAME) def get_blob(): bucket = get_bucket() return bucket.blob(BLOB_NAME) def cleanup_bucket(): bucket = get_bucket() blobs = bucket.list_blobs() for blob in blobs: blob.delete() def put_to_bucket(contents, num_attempts=12, sleep_time=5): logger.debug('%r', locals()) # # In real life, it can take a few seconds for 
the bucket to become ready. # If we try to write to the key while the bucket while it isn't ready, we # will get a StorageError: NotFound. # for attempt in range(num_attempts): try: blob = get_blob() blob.upload_from_string(contents) return except google.cloud.exceptions.NotFound as err: logger.error('caught %r, retrying', err) time.sleep(sleep_time) assert False, 'failed to create bucket %s after %d attempts' % (BUCKET_NAME, num_attempts) def mock_gcs(class_or_func): """Mock all methods of a class or a function.""" if inspect.isclass(class_or_func): for attr in class_or_func.__dict__: if callable(getattr(class_or_func, attr)): setattr(class_or_func, attr, mock_gcs_func(getattr(class_or_func, attr))) return class_or_func else: return mock_gcs_func(class_or_func) def mock_gcs_func(func): """Mock the function and provide additional required arguments.""" assert callable(func), '%r is not a callable function' % func def inner(*args, **kwargs): # # Is it a function or a method? The latter requires a self parameter. # signature = inspect.signature(func) fake_session = FakeAuthorizedSession(storage_client._credentials) patched_client = mock.patch( 'google.cloud.storage.Client', return_value=storage_client, ) patched_session = mock.patch( 'google.auth.transport.requests.AuthorizedSession', return_value=fake_session, ) with patched_client, patched_session: if not hasattr(signature, 'self'): return func(*args, **kwargs) else: return func(signature.self, *args, **kwargs) return inner def maybe_mock_gcs(func): if DISABLE_MOCKS: return func else: return mock_gcs(func) @maybe_mock_gcs def setUpModule(): # noqa """Called once by unittest when initializing this module. Set up the test GCS bucket. """ storage_client.create_bucket(BUCKET_NAME) @maybe_mock_gcs def tearDownModule(): # noqa """Called once by unittest when tearing down this module. Empty and removes the test GCS bucket. 
""" try: bucket = get_bucket() bucket.delete() except google.cloud.exceptions.NotFound: pass @maybe_mock_gcs class ReaderTest(unittest.TestCase): def setUp(self): # lower the multipart upload size, to speed up these tests self.old_min_buffer_size = smart_open.gcs.DEFAULT_BUFFER_SIZE smart_open.gcs.DEFAULT_BUFFER_SIZE = 5 * 1024**2 ignore_resource_warnings() def tearDown(self): cleanup_bucket() def test_iter(self): """Are GCS files iterated over correctly?""" expected = u"hello wořld\nhow are you?".encode('utf8') put_to_bucket(contents=expected) # connect to fake GCS and read from the fake key we filled above fin = smart_open.gcs.Reader(BUCKET_NAME, BLOB_NAME) output = [line.rstrip(b'\n') for line in fin] self.assertEqual(output, expected.split(b'\n')) def test_iter_context_manager(self): # same thing but using a context manager expected = u"hello wořld\nhow are you?".encode('utf8') put_to_bucket(contents=expected) with smart_open.gcs.Reader(BUCKET_NAME, BLOB_NAME) as fin: output = [line.rstrip(b'\n') for line in fin] self.assertEqual(output, expected.split(b'\n')) def test_read(self): """Are GCS files read correctly?""" content = u"hello wořld\nhow are you?".encode('utf8') put_to_bucket(contents=content) logger.debug('content: %r len: %r', content, len(content)) fin = smart_open.gcs.Reader(BUCKET_NAME, BLOB_NAME) self.assertEqual(content[:6], fin.read(6)) self.assertEqual(content[6:14], fin.read(8)) # ř is 2 bytes self.assertEqual(content[14:], fin.read()) # read the rest def test_seek_beginning(self): """Does seeking to the beginning of GCS files work correctly?""" content = u"hello wořld\nhow are you?".encode('utf8') put_to_bucket(contents=content) fin = smart_open.gcs.Reader(BUCKET_NAME, BLOB_NAME) self.assertEqual(content[:6], fin.read(6)) self.assertEqual(content[6:14], fin.read(8)) # ř is 2 bytes fin.seek(0) self.assertEqual(content, fin.read()) # no size given => read whole file fin.seek(0) self.assertEqual(content, fin.read(-1)) # same thing def test_seek_start(self): """Does seeking from the start of GCS files work correctly?""" content = u"hello wořld\nhow are you?".encode('utf8') put_to_bucket(contents=content) fin = smart_open.gcs.Reader(BUCKET_NAME, BLOB_NAME) seek = fin.seek(6) self.assertEqual(seek, 6) self.assertEqual(fin.tell(), 6) self.assertEqual(fin.read(6), u'wořld'.encode('utf-8')) def test_seek_current(self): """Does seeking from the middle of GCS files work correctly?""" content = u"hello wořld\nhow are you?".encode('utf8') put_to_bucket(contents=content) fin = smart_open.gcs.Reader(BUCKET_NAME, BLOB_NAME) self.assertEqual(fin.read(5), b'hello') seek = fin.seek(1, whence=smart_open.constants.WHENCE_CURRENT) self.assertEqual(seek, 6) self.assertEqual(fin.read(6), u'wořld'.encode('utf-8')) def test_seek_end(self): """Does seeking from the end of GCS files work correctly?""" content = u"hello wořld\nhow are you?".encode('utf8') put_to_bucket(contents=content) fin = smart_open.gcs.Reader(BUCKET_NAME, BLOB_NAME) seek = fin.seek(-4, whence=smart_open.constants.WHENCE_END) self.assertEqual(seek, len(content) - 4) self.assertEqual(fin.read(), b'you?') def test_detect_eof(self): content = u"hello wořld\nhow are you?".encode('utf8') put_to_bucket(contents=content) fin = smart_open.gcs.Reader(BUCKET_NAME, BLOB_NAME) fin.read() eof = fin.tell() self.assertEqual(eof, len(content)) fin.seek(0, whence=smart_open.constants.WHENCE_END) self.assertEqual(eof, fin.tell()) def test_read_gzip(self): expected = u'раcцветали яблони и груши, поплыли туманы над рекой...'.encode('utf-8') buf 
= io.BytesIO() buf.close = lambda: None # keep buffer open so that we can .getvalue() with gzip.GzipFile(fileobj=buf, mode='w') as zipfile: zipfile.write(expected) put_to_bucket(contents=buf.getvalue()) # # Make sure we're reading things correctly. # with smart_open.gcs.Reader(BUCKET_NAME, BLOB_NAME) as fin: self.assertEqual(fin.read(), buf.getvalue()) # # Make sure the buffer we wrote is legitimate gzip. # sanity_buf = io.BytesIO(buf.getvalue()) with gzip.GzipFile(fileobj=sanity_buf) as zipfile: self.assertEqual(zipfile.read(), expected) logger.debug('starting actual test') with smart_open.gcs.Reader(BUCKET_NAME, BLOB_NAME) as fin: with gzip.GzipFile(fileobj=fin) as zipfile: actual = zipfile.read() self.assertEqual(expected, actual) def test_readline(self): content = b'englishman\nin\nnew\nyork\n' put_to_bucket(contents=content) with smart_open.gcs.Reader(BUCKET_NAME, BLOB_NAME) as fin: fin.readline() self.assertEqual(fin.tell(), content.index(b'\n')+1) fin.seek(0) actual = list(fin) self.assertEqual(fin.tell(), len(content)) expected = [b'englishman\n', b'in\n', b'new\n', b'york\n'] self.assertEqual(expected, actual) def test_readline_tiny_buffer(self): content = b'englishman\nin\nnew\nyork\n' put_to_bucket(contents=content) with smart_open.gcs.Reader(BUCKET_NAME, BLOB_NAME, buffer_size=8) as fin: actual = list(fin) expected = [b'englishman\n', b'in\n', b'new\n', b'york\n'] self.assertEqual(expected, actual) def test_read0_does_not_return_data(self): content = b'englishman\nin\nnew\nyork\n' put_to_bucket(contents=content) with smart_open.gcs.Reader(BUCKET_NAME, BLOB_NAME) as fin: data = fin.read(0) self.assertEqual(data, b'') def test_read_past_end(self): content = b'englishman\nin\nnew\nyork\n' put_to_bucket(contents=content) with smart_open.gcs.Reader(BUCKET_NAME, BLOB_NAME) as fin: data = fin.read(100) self.assertEqual(data, content) @maybe_mock_gcs class WriterTest(unittest.TestCase): """ Test writing into GCS files. 
""" def setUp(self): ignore_resource_warnings() def tearDown(self): cleanup_bucket() def test_write_01(self): """Does writing into GCS work correctly?""" test_string = u"žluťoučký koníček".encode('utf8') with smart_open.gcs.Writer(BUCKET_NAME, WRITE_BLOB_NAME) as fout: fout.write(test_string) with smart_open.open("gs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME), "rb") as fin: output = list(fin) self.assertEqual(output, [test_string]) def test_incorrect_input(self): """Does gcs write fail on incorrect input?""" try: with smart_open.gcs.Writer(BUCKET_NAME, WRITE_BLOB_NAME) as fin: fin.write(None) except TypeError: pass else: self.fail() def test_write_02(self): """Does gcs write unicode-utf8 conversion work?""" smart_open_write = smart_open.gcs.Writer(BUCKET_NAME, WRITE_BLOB_NAME) smart_open_write.tell() logger.info("smart_open_write: %r", smart_open_write) with smart_open_write as fout: fout.write(u"testžížáč".encode("utf-8")) self.assertEqual(fout.tell(), 14) def test_write_03(self): """Do multiple writes less than the min_part_size work correctly?""" # write min_part_size = 256 * 1024 smart_open_write = smart_open.gcs.Writer( BUCKET_NAME, WRITE_BLOB_NAME, min_part_size=min_part_size ) local_write = io.BytesIO() with smart_open_write as fout: first_part = b"t" * 262141 fout.write(first_part) local_write.write(first_part) self.assertEqual(fout._current_part.tell(), 262141) second_part = b"t\n" fout.write(second_part) local_write.write(second_part) self.assertEqual(fout._current_part.tell(), 262143) self.assertEqual(fout._total_parts, 0) third_part = b"t" fout.write(third_part) local_write.write(third_part) self.assertEqual(fout._current_part.tell(), 262144) self.assertEqual(fout._total_parts, 0) fourth_part = b"t" * 1 fout.write(fourth_part) local_write.write(fourth_part) self.assertEqual(fout._current_part.tell(), 1) self.assertEqual(fout._total_parts, 1) # read back the same key and check its content output = list(smart_open.open("gs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME))) local_write.seek(0) actual = [line.decode("utf-8") for line in list(local_write)] self.assertEqual(output, actual) def test_write_03a(self): """Do multiple writes greater than the min_part_size work correctly?""" min_part_size = 256 * 1024 smart_open_write = smart_open.gcs.Writer( BUCKET_NAME, WRITE_BLOB_NAME, min_part_size=min_part_size ) local_write = io.BytesIO() with smart_open_write as fout: for i in range(1, 4): part = b"t" * (min_part_size + 1) fout.write(part) local_write.write(part) self.assertEqual(fout._current_part.tell(), i) self.assertEqual(fout._total_parts, i) # read back the same key and check its content output = list(smart_open.open("gs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME))) local_write.seek(0) actual = [line.decode("utf-8") for line in list(local_write)] self.assertEqual(output, actual) def test_write_03b(self): """Does writing a last chunk size equal to a multiple of the min_part_size work?""" min_part_size = 256 * 1024 smart_open_write = smart_open.gcs.Writer( BUCKET_NAME, WRITE_BLOB_NAME, min_part_size=min_part_size ) expected = b"t" * min_part_size * 2 with smart_open_write as fout: fout.write(expected) self.assertEqual(fout._current_part.tell(), 262144) self.assertEqual(fout._total_parts, 1) # read back the same key and check its content with smart_open.open("gs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME)) as fin: output = fin.read().encode('utf-8') self.assertEqual(output, expected) def test_write_04(self): """Does writing no data cause key with an empty value to be 
created?""" smart_open_write = smart_open.gcs.Writer(BUCKET_NAME, WRITE_BLOB_NAME) with smart_open_write as fout: # noqa pass # read back the same key and check its content output = list(smart_open.open("gs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME))) self.assertEqual(output, []) def test_write_05(self): """Do blob_properties get applied?""" smart_open_write = smart_open.gcs.Writer(BUCKET_NAME, WRITE_BLOB_NAME, blob_properties={ "content_type": "random/x-test", "content_encoding": "coded" } ) with smart_open_write as fout: # noqa assert fout._blob.content_type == "random/x-test" assert fout._blob.content_encoding == "coded" def test_gzip(self): expected = u'а не спеть ли мне песню... о любви'.encode('utf-8') with smart_open.gcs.Writer(BUCKET_NAME, WRITE_BLOB_NAME) as fout: with gzip.GzipFile(fileobj=fout, mode='w') as zipfile: zipfile.write(expected) with smart_open.gcs.Reader(BUCKET_NAME, WRITE_BLOB_NAME) as fin: with gzip.GzipFile(fileobj=fin) as zipfile: actual = zipfile.read() self.assertEqual(expected, actual) def test_buffered_writer_wrapper_works(self): """ Ensure that we can wrap a smart_open gcs stream in a BufferedWriter, which passes a memoryview object to the underlying stream in python >= 2.7 """ expected = u'не думай о секундах свысока' with smart_open.gcs.Writer(BUCKET_NAME, WRITE_BLOB_NAME) as fout: with io.BufferedWriter(fout) as sub_out: sub_out.write(expected.encode('utf-8')) with smart_open.open("gs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME), 'rb') as fin: with io.TextIOWrapper(fin, encoding='utf-8') as text: actual = text.read() self.assertEqual(expected, actual) def test_binary_iterator(self): expected = u"выйду ночью в поле с конём".encode('utf-8').split(b' ') put_to_bucket(contents=b"\n".join(expected)) with smart_open.gcs.open(BUCKET_NAME, BLOB_NAME, 'rb') as fin: actual = [line.rstrip() for line in fin] self.assertEqual(expected, actual) def test_nonexisting_bucket(self): expected = u"выйду ночью в поле с конём".encode('utf-8') with self.assertRaises(google.api_core.exceptions.NotFound): with smart_open.gcs.open('thisbucketdoesntexist', 'mykey', 'wb') as fout: fout.write(expected) def test_read_nonexisting_key(self): with self.assertRaises(google.api_core.exceptions.NotFound): with smart_open.gcs.open(BUCKET_NAME, 'my_nonexisting_key', 'rb') as fin: fin.read() def test_double_close(self): text = u'там за туманами, вечными, пьяными'.encode('utf-8') fout = smart_open.gcs.open(BUCKET_NAME, 'key', 'wb') fout.write(text) fout.close() fout.close() def test_flush_close(self): text = u'там за туманами, вечными, пьяными'.encode('utf-8') fout = smart_open.gcs.open(BUCKET_NAME, 'key', 'wb') fout.write(text) fout.flush() fout.close() def test_terminate(self): text = u'там за туманами, вечными, пьяными'.encode('utf-8') fout = smart_open.gcs.open(BUCKET_NAME, 'key', 'wb') fout.write(text) fout.terminate() with self.assertRaises(google.api_core.exceptions.NotFound): with smart_open.gcs.open(BUCKET_NAME, 'key', 'rb') as fin: fin.read() @maybe_mock_gcs class OpenTest(unittest.TestCase): def setUp(self): ignore_resource_warnings() def tearDown(self): cleanup_bucket() def test_read_never_returns_none(self): """read should never return None.""" test_string = u"ветер по морю гуляет..." 
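        # (The contract under test: once the blob is exhausted, read() must
        # keep returning the empty byte string b'' and never None, hence the
        # repeated assertions further down.)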
with smart_open.gcs.open(BUCKET_NAME, BLOB_NAME, "wb") as fout: self.assertEqual(fout.name, BLOB_NAME) fout.write(test_string.encode('utf8')) r = smart_open.gcs.open(BUCKET_NAME, BLOB_NAME, "rb") self.assertEqual(r.name, BLOB_NAME) self.assertEqual(r.read(), test_string.encode("utf-8")) self.assertEqual(r.read(), b"") self.assertEqual(r.read(), b"") def test_round_trip(self): test_string = u"ветер по морю гуляет..." url = 'gs://%s/%s' % (BUCKET_NAME, BLOB_NAME) with smart_open.open(url, "w", encoding='utf-8') as fout: fout.write(test_string) with smart_open.open(url, encoding='utf-8') as fin: actual = fin.read() self.assertEqual(test_string, actual) class MakeRangeStringTest(unittest.TestCase): def test_no_stop(self): start, stop = 1, None self.assertEqual(smart_open.gcs._make_range_string(start, stop), 'bytes 1-/*') def test_stop(self): start, stop = 1, 2 self.assertEqual(smart_open.gcs._make_range_string(start, stop), 'bytes 1-2/*') if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) unittest.main() smart_open-5.2.1/smart_open/tests/test_hdfs.py000066400000000000000000000121671411241424400215400ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2019 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). # import gzip import os import os.path as P import subprocess import unittest from unittest import mock import sys import smart_open.hdfs # # Workaround for https://bugs.python.org/issue37380 # if sys.version_info[:2] == (3, 6): subprocess._cleanup = lambda: None CURR_DIR = P.dirname(P.abspath(__file__)) # # We want our mocks to emulate the real implementation as close as possible, # so we use a Popen call during each test. If we mocked using io.BytesIO, then # it is possible the mocks would behave differently to what we expect in real # use. # # Since these tests use cat, they will not work in an environment without cat, # such as Windows. The main line of this test submodule contains a simple # cat implementation. We need this because Windows' analog, type, does # weird stuff with line endings (inserts CRLF). Also, I don't know of a way # to get type to echo standard input. # def cat(path=None): command = [sys.executable, P.abspath(__file__)] if path: command.append(path) return subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) class CliRawInputBaseTest(unittest.TestCase): def setUp(self): self.path = P.join(CURR_DIR, 'test_data', 'crime-and-punishment.txt') # # We have to specify the encoding explicitly, because different # platforms like Windows may be using something other than unicode # by default. # with open(self.path, encoding='utf-8') as fin: self.expected = fin.read() self.cat = cat(self.path) def test_read(self): with mock.patch('subprocess.Popen', return_value=self.cat): reader = smart_open.hdfs.CliRawInputBase('hdfs://dummy/url') as_bytes = reader.read() # # Not 100% sure why this is necessary on Windows platforms, but the # tests fail without it. It may be a bug, but I don't have time to # investigate right now. 
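            # (os.linesep is '\r\n' on Windows, so the replace below simply
            # normalises the subprocess output back to '\n'-only line endings
            # before comparing it against the fixture text.)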
# as_text = as_bytes.decode('utf-8').replace(os.linesep, '\n') assert as_text == self.expected def test_read_75(self): with mock.patch('subprocess.Popen', return_value=self.cat): reader = smart_open.hdfs.CliRawInputBase('hdfs://dummy/url') as_bytes = reader.read(75) as_text = as_bytes.decode('utf-8').replace(os.linesep, '\n') assert as_text == self.expected[:len(as_text)] def test_unzip(self): path = P.join(CURR_DIR, 'test_data', 'crime-and-punishment.txt.gz') with mock.patch('subprocess.Popen', return_value=cat(path)): with gzip.GzipFile(fileobj=smart_open.hdfs.CliRawInputBase('hdfs://dummy/url')) as fin: as_bytes = fin.read() as_text = as_bytes.decode('utf-8') assert as_text == self.expected def test_context_manager(self): with mock.patch('subprocess.Popen', return_value=self.cat): with smart_open.hdfs.CliRawInputBase('hdfs://dummy/url') as fin: as_bytes = fin.read() as_text = as_bytes.decode('utf-8').replace('\r\n', '\n') assert as_text == self.expected class SanityTest(unittest.TestCase): def test_read_bytes(self): path = P.join(CURR_DIR, 'test_data', 'crime-and-punishment.txt') with open(path, 'rb') as fin: lines = [line for line in fin] assert len(lines) == 3 def test_read_text(self): path = P.join(CURR_DIR, 'test_data', 'crime-and-punishment.txt') with open(path, 'r', encoding='utf-8') as fin: text = fin.read() expected = 'В начале июля, в чрезвычайно жаркое время' assert text[:len(expected)] == expected class CliRawOutputBaseTest(unittest.TestCase): def test_write(self): expected = 'мы в ответе за тех, кого приручили' mocked_cat = cat() with mock.patch('subprocess.Popen', return_value=mocked_cat): with smart_open.hdfs.CliRawOutputBase('hdfs://dummy/url') as fout: fout.write(expected.encode('utf-8')) actual = mocked_cat.stdout.read().decode('utf-8') assert actual == expected def test_zip(self): expected = 'мы в ответе за тех, кого приручили' mocked_cat = cat() with mock.patch('subprocess.Popen', return_value=mocked_cat): with smart_open.hdfs.CliRawOutputBase('hdfs://dummy/url') as fout: with gzip.GzipFile(fileobj=fout, mode='wb') as gz_fout: gz_fout.write(expected.encode('utf-8')) with gzip.GzipFile(fileobj=mocked_cat.stdout) as fin: actual = fin.read().decode('utf-8') assert actual == expected def main(): try: path = sys.argv[1] except IndexError: bytez = sys.stdin.buffer.read() else: with open(path, 'rb') as fin: bytez = fin.read() sys.stdout.buffer.write(bytez) sys.stdout.flush() if __name__ == '__main__': main() smart_open-5.2.1/smart_open/tests/test_http.py000066400000000000000000000150521411241424400215670ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2019 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). 
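#
# A note on the approach in this module (a summary of the code below, not a
# statement about any external API): the `responses` library intercepts the
# HTTP calls, and `request_callback` plays the part of a server that honours
# Range headers, so seeking can be exercised without touching the network.
# The callback maps a header such as 'Range: bytes=10-20' onto the slice
# BYTES[10:20] of the canned payload.
#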
# import functools import os import unittest import pytest import responses import smart_open.http import smart_open.s3 import smart_open.constants BYTES = b'i tried so hard and got so far but in the end it doesn\'t even matter' URL = 'http://localhost' HTTPS_URL = 'https://localhost' HEADERS = { 'Content-Length': str(len(BYTES)), 'Accept-Ranges': 'bytes', } def request_callback(request, headers=HEADERS, data=BYTES): try: range_string = request.headers['range'] except KeyError: return (200, headers, data) start, end = range_string.replace('bytes=', '').split('-', 1) start = int(start) if end: end = int(end) else: end = len(data) return (200, headers, data[start:end]) @unittest.skipIf(os.environ.get('TRAVIS'), 'This test does not work on TravisCI for some reason') class HttpTest(unittest.TestCase): @responses.activate def test_read_all(self): responses.add(responses.GET, URL, body=BYTES, stream=True) reader = smart_open.http.SeekableBufferedInputBase(URL) read_bytes = reader.read() self.assertEqual(BYTES, read_bytes) @responses.activate def test_seek_from_start(self): responses.add_callback(responses.GET, URL, callback=request_callback) reader = smart_open.http.SeekableBufferedInputBase(URL) reader.seek(10) self.assertEqual(reader.tell(), 10) read_bytes = reader.read(size=10) self.assertEqual(reader.tell(), 20) self.assertEqual(BYTES[10:20], read_bytes) reader.seek(20) read_bytes = reader.read(size=10) self.assertEqual(BYTES[20:30], read_bytes) reader.seek(0) read_bytes = reader.read(size=10) self.assertEqual(BYTES[:10], read_bytes) @responses.activate def test_seek_from_current(self): responses.add_callback(responses.GET, URL, callback=request_callback) reader = smart_open.http.SeekableBufferedInputBase(URL) reader.seek(10) read_bytes = reader.read(size=10) self.assertEqual(BYTES[10:20], read_bytes) self.assertEqual(reader.tell(), 20) reader.seek(10, whence=smart_open.constants.WHENCE_CURRENT) self.assertEqual(reader.tell(), 30) read_bytes = reader.read(size=10) self.assertEqual(reader.tell(), 40) self.assertEqual(BYTES[30:40], read_bytes) @responses.activate def test_seek_from_end(self): responses.add_callback(responses.GET, URL, callback=request_callback) reader = smart_open.http.SeekableBufferedInputBase(URL) reader.seek(-10, whence=smart_open.constants.WHENCE_END) self.assertEqual(reader.tell(), len(BYTES) - 10) read_bytes = reader.read(size=10) self.assertEqual(reader.tell(), len(BYTES)) self.assertEqual(BYTES[-10:], read_bytes) @responses.activate def test_headers_are_as_assigned(self): responses.add_callback(responses.GET, URL, callback=request_callback) # use default _HEADERS x = smart_open.http.BufferedInputBase(URL) # set different ones x.headers['Accept-Encoding'] = 'compress, gzip' x.headers['Other-Header'] = 'value' # use default again, global shoudn't overwritten from x y = smart_open.http.BufferedInputBase(URL) # should be default headers self.assertEqual(y.headers, {'Accept-Encoding': 'identity'}) # should be assigned headers self.assertEqual(x.headers, {'Accept-Encoding': 'compress, gzip', 'Other-Header': 'value'}) @responses.activate def test_headers(self): """Does the top-level http.open function handle headers correctly?""" responses.add_callback(responses.GET, URL, callback=request_callback) reader = smart_open.http.open(URL, 'rb', headers={'Foo': 'bar'}) self.assertEqual(reader.headers['Foo'], 'bar') @responses.activate def test_https_seek_start(self): """Did the seek start over HTTPS work?""" responses.add_callback(responses.GET, HTTPS_URL, callback=request_callback) 
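        # (smart_open.open hands https:// URLs to this module's seekable
        # reader, so the seek() calls below exercise the same Range-request
        # handling as the tests above.)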
with smart_open.open(HTTPS_URL, "rb") as fin: read_bytes_1 = fin.read(size=10) fin.seek(0) read_bytes_2 = fin.read(size=10) self.assertEqual(read_bytes_1, read_bytes_2) @responses.activate def test_https_seek_forward(self): """Did the seek forward over HTTPS work?""" responses.add_callback(responses.GET, HTTPS_URL, callback=request_callback) with smart_open.open(HTTPS_URL, "rb") as fin: fin.seek(10) read_bytes = fin.read(size=10) self.assertEqual(BYTES[10:20], read_bytes) @responses.activate def test_https_seek_reverse(self): """Did the seek in reverse over HTTPS work?""" responses.add_callback(responses.GET, HTTPS_URL, callback=request_callback) with smart_open.open(HTTPS_URL, "rb") as fin: read_bytes_1 = fin.read(size=10) fin.seek(-10, whence=smart_open.constants.WHENCE_CURRENT) read_bytes_2 = fin.read(size=10) self.assertEqual(read_bytes_1, read_bytes_2) @responses.activate def test_timeout_attribute(self): timeout = 1 responses.add_callback(responses.GET, URL, callback=request_callback) reader = smart_open.open(URL, "rb", transport_params={'timeout': timeout}) assert hasattr(reader, 'timeout') assert reader.timeout == timeout @responses.activate def test_seek_implicitly_enabled(numbytes=10): """Can we seek even if the server hasn't explicitly allowed it?""" callback = functools.partial(request_callback, headers={}) responses.add_callback(responses.GET, HTTPS_URL, callback=callback) with smart_open.open(HTTPS_URL, 'rb') as fin: assert fin.seekable() first = fin.read(size=numbytes) fin.seek(-numbytes, whence=smart_open.constants.WHENCE_CURRENT) second = fin.read(size=numbytes) assert first == second @responses.activate def test_seek_implicitly_disabled(): """Does seeking fail when the server has explicitly disabled it?""" callback = functools.partial(request_callback, headers={'Accept-Ranges': 'none'}) responses.add_callback(responses.GET, HTTPS_URL, callback=callback) with smart_open.open(HTTPS_URL, 'rb') as fin: assert not fin.seekable() fin.read() with pytest.raises(OSError): fin.seek(0) smart_open-5.2.1/smart_open/tests/test_package.py000066400000000000000000000017331411241424400222040ustar00rootroot00000000000000# -*- coding: utf-8 -*- import os import unittest import pytest from smart_open import open skip_tests = "SMART_OPEN_TEST_MISSING_DEPS" not in os.environ class PackageTests(unittest.TestCase): @pytest.mark.skipif(skip_tests, reason="requires missing dependencies") def test_azure_raises_helpful_error_with_missing_deps(self): with pytest.raises(ImportError, match=r"pip install smart_open\[azure\]"): open("azure://foo/bar") @pytest.mark.skipif(skip_tests, reason="requires missing dependencies") def test_aws_raises_helpful_error_with_missing_deps(self): match = r"pip install smart_open\[s3\]" with pytest.raises(ImportError, match=match): open("s3://foo/bar") @pytest.mark.skipif(skip_tests, reason="requires missing dependencies") def test_gcs_raises_helpful_error_with_missing_deps(self): with pytest.raises(ImportError, match=r"pip install smart_open\[gcs\]"): open("gs://foo/bar") smart_open-5.2.1/smart_open/tests/test_s3.py000066400000000000000000001131631411241424400211370ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2019 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). 
# from collections import defaultdict import gzip import io import logging import os import tempfile import time import unittest import warnings from contextlib import contextmanager from unittest import mock import sys import boto3 import botocore.client import botocore.endpoint import moto import pytest import smart_open import smart_open.s3 # To reduce spurious errors due to S3's eventually-consistent behavior # we create this bucket once before running these tests and then # remove it when we're done. The bucket has a random name so that we # can run multiple instances of this suite in parallel and not have # them conflict with one another. Travis, for example, runs the Python # 2.7, 3.6, and 3.7 suites concurrently. BUCKET_NAME = 'test-smartopen' KEY_NAME = 'test-key' WRITE_KEY_NAME = 'test-write-key' ENABLE_MOTO_SERVER = os.environ.get("SO_ENABLE_MOTO_SERVER") == "1" # # This is a hack to keep moto happy # See https://github.com/spulec/moto/issues/1941 # os.environ["AWS_ACCESS_KEY_ID"] = "test" os.environ["AWS_SECRET_ACCESS_KEY"] = "test" logger = logging.getLogger(__name__) @moto.mock_s3 def setUpModule(): '''Called once by unittest when initializing this module. Sets up the test S3 bucket. ''' bucket = boto3.resource('s3').create_bucket(Bucket=BUCKET_NAME) bucket.wait_until_exists() def cleanup_bucket(): for key in boto3.resource('s3').Bucket(BUCKET_NAME).objects.all(): key.delete() def put_to_bucket(contents, num_attempts=12, sleep_time=5): logger.debug('%r', locals()) # # In real life, it can take a few seconds for the bucket to become ready. # If we try to write to the key while the bucket while it isn't ready, we # will get a ClientError: NoSuchBucket. # for attempt in range(num_attempts): try: boto3.resource('s3').Object(BUCKET_NAME, KEY_NAME).put(Body=contents) return except botocore.exceptions.ClientError as err: logger.error('caught %r, retrying', err) time.sleep(sleep_time) assert False, 'failed to write to bucket %s after %d attempts' % (BUCKET_NAME, num_attempts) def ignore_resource_warnings(): # # https://github.com/boto/boto3/issues/454 # Py2 doesn't have ResourceWarning, so do nothing. # warnings.filterwarnings("ignore", category=ResourceWarning, message="unclosed.*") # noqa @contextmanager def patch_invalid_range_response(actual_size): """ Work around a bug in moto (https://github.com/spulec/moto/issues/2981) where the API response doesn't match when requesting an invalid range of bytes from an S3 GetObject. """ _real_get = smart_open.s3._get def mock_get(*args, **kwargs): try: return _real_get(*args, **kwargs) except IOError as ioe: error_response = smart_open.s3._unwrap_ioerror(ioe) if error_response and error_response.get('Message') == 'Requested Range Not Satisfiable': error_response['ActualObjectSize'] = actual_size error_response['Code'] = 'InvalidRange' error_response['Message'] = 'The requested range is not satisfiable' raise with mock.patch('smart_open.s3._get', new=mock_get): yield class BaseTest(unittest.TestCase): @contextmanager def assertApiCalls(self, **expected_api_calls): """ Track calls to S3 in self.api_calls by patching botocore.endpoint.Endpoint.make_request. 
""" _real_make_request = botocore.endpoint.Endpoint.make_request api_calls = defaultdict(int) def mock_make_request(self, operation_model, *args, **kwargs): api_calls[operation_model.name] += 1 return _real_make_request(self, operation_model, *args, **kwargs) patcher = mock.patch('botocore.endpoint.Endpoint.make_request', new=mock_make_request) patcher.start() try: yield api_calls self.assertDictEqual(expected_api_calls, api_calls) finally: patcher.stop() @unittest.skipUnless( ENABLE_MOTO_SERVER, 'The test case needs a Moto server running on the local 5000 port.' ) class SeekableRawReaderTest(unittest.TestCase): def setUp(self): self._body = b'123456' self._local_resource = boto3.resource('s3', endpoint_url='http://localhost:5000') self._local_resource.Bucket(BUCKET_NAME).create() self._local_resource.Object(BUCKET_NAME, KEY_NAME).put(Body=self._body) self._local_client = boto3.client('s3', endpoint_url='http://localhost:5000') def tearDown(self): self._local_resource.Object(BUCKET_NAME, KEY_NAME).delete() self._local_resource.Bucket(BUCKET_NAME).delete() def test_read_from_a_closed_body(self): reader = smart_open.s3._SeekableRawReader(self._local_client, BUCKET_NAME, KEY_NAME) self.assertEqual(reader.read(1), b'1') reader._body.close() self.assertEqual(reader.read(2), b'23') class CrapStream(io.BytesIO): """Raises an exception on every second read call.""" def __init__(self, *args, modulus=2, **kwargs): super().__init__(*args, **kwargs) self._count = 0 self._modulus = modulus def read(self, size=-1): self._count += 1 if self._count % self._modulus == 0: raise botocore.exceptions.BotoCoreError() the_bytes = super().read(size) return the_bytes class CrapClient: def __init__(self, data, modulus=2): self._datasize = len(data) self._body = CrapStream(data, modulus=modulus) def get_object(self, *args, **kwargs): return { 'ActualObjectSize': self._datasize, 'ContentLength': self._datasize, 'ContentRange': 'bytes 0-%d/%d' % (self._datasize, self._datasize), 'Body': self._body, 'ResponseMetadata': {'RetryAttempts': 1}, } class IncrementalBackoffTest(unittest.TestCase): def test_every_read_fails(self): reader = smart_open.s3._SeekableRawReader(CrapClient(b'hello', 1), 'bucket', 'key') with mock.patch('time.sleep') as mock_sleep: with self.assertRaises(IOError): reader.read() # # Make sure our incremental backoff is actually happening here. 
# mock_sleep.assert_has_calls([mock.call(s) for s in (1, 2, 4, 8, 16)]) def test_every_second_read_fails(self): """Can we read from a stream that raises exceptions from time to time?""" reader = smart_open.s3._SeekableRawReader(CrapClient(b'hello'), 'bucket', 'key') with mock.patch('time.sleep') as mock_sleep: assert reader.read(1) == b'h' mock_sleep.assert_not_called() assert reader.read(1) == b'e' mock_sleep.assert_called_with(1) mock_sleep.reset_mock() assert reader.read(1) == b'l' mock_sleep.reset_mock() assert reader.read(1) == b'l' mock_sleep.assert_called_with(1) mock_sleep.reset_mock() assert reader.read(1) == b'o' mock_sleep.assert_called_with(1) mock_sleep.reset_mock() @moto.mock_s3 class ReaderTest(BaseTest): def setUp(self): # lower the multipart upload size, to speed up these tests self.old_min_part_size = smart_open.s3.DEFAULT_MIN_PART_SIZE smart_open.s3.DEFAULT_MIN_PART_SIZE = 5 * 1024**2 ignore_resource_warnings() super().setUp() def tearDown(self): smart_open.s3.DEFAULT_MIN_PART_SIZE = self.old_min_part_size cleanup_bucket() def test_iter(self): """Are S3 files iterated over correctly?""" # a list of strings to test with expected = u"hello wořld\nhow are you?".encode('utf8') put_to_bucket(contents=expected) # connect to fake s3 and read from the fake key we filled above with self.assertApiCalls(GetObject=1): fin = smart_open.s3.Reader(BUCKET_NAME, KEY_NAME) output = [line.rstrip(b'\n') for line in fin] self.assertEqual(output, expected.split(b'\n')) def test_iter_context_manager(self): # same thing but using a context manager expected = u"hello wořld\nhow are you?".encode('utf8') put_to_bucket(contents=expected) with self.assertApiCalls(GetObject=1): with smart_open.s3.Reader(BUCKET_NAME, KEY_NAME) as fin: output = [line.rstrip(b'\n') for line in fin] self.assertEqual(output, expected.split(b'\n')) def test_read(self): """Are S3 files read correctly?""" content = u"hello wořld\nhow are you?".encode('utf8') put_to_bucket(contents=content) logger.debug('content: %r len: %r', content, len(content)) with self.assertApiCalls(GetObject=1): fin = smart_open.s3.Reader(BUCKET_NAME, KEY_NAME) self.assertEqual(content[:6], fin.read(6)) self.assertEqual(content[6:14], fin.read(8)) # ř is 2 bytes self.assertEqual(content[14:], fin.read()) # read the rest def test_seek_beginning(self): """Does seeking to the beginning of S3 files work correctly?""" content = u"hello wořld\nhow are you?".encode('utf8') put_to_bucket(contents=content) with self.assertApiCalls(GetObject=1): fin = smart_open.s3.Reader(BUCKET_NAME, KEY_NAME) self.assertEqual(content[:6], fin.read(6)) self.assertEqual(content[6:14], fin.read(8)) # ř is 2 bytes with self.assertApiCalls(GetObject=1): fin.seek(0) self.assertEqual(content, fin.read()) # no size given => read whole file with self.assertApiCalls(GetObject=1): fin.seek(0) self.assertEqual(content, fin.read(-1)) # same thing def test_seek_start(self): """Does seeking from the start of S3 files work correctly?""" content = u"hello wořld\nhow are you?".encode('utf8') put_to_bucket(contents=content) with self.assertApiCalls(GetObject=1): fin = smart_open.s3.Reader(BUCKET_NAME, KEY_NAME, defer_seek=True) seek = fin.seek(6) self.assertEqual(seek, 6) self.assertEqual(fin.tell(), 6) self.assertEqual(fin.read(6), u'wořld'.encode('utf-8')) def test_seek_current(self): """Does seeking from the middle of S3 files work correctly?""" content = u"hello wořld\nhow are you?".encode('utf8') put_to_bucket(contents=content) with self.assertApiCalls(GetObject=1): fin = 
smart_open.s3.Reader(BUCKET_NAME, KEY_NAME) self.assertEqual(fin.read(5), b'hello') with self.assertApiCalls(GetObject=1): seek = fin.seek(1, whence=smart_open.constants.WHENCE_CURRENT) self.assertEqual(seek, 6) self.assertEqual(fin.read(6), u'wořld'.encode('utf-8')) def test_seek_end(self): """Does seeking from the end of S3 files work correctly?""" content = u"hello wořld\nhow are you?".encode('utf8') put_to_bucket(contents=content) with self.assertApiCalls(GetObject=1): fin = smart_open.s3.Reader(BUCKET_NAME, KEY_NAME, defer_seek=True) seek = fin.seek(-4, whence=smart_open.constants.WHENCE_END) self.assertEqual(seek, len(content) - 4) self.assertEqual(fin.read(), b'you?') def test_seek_past_end(self): content = u"hello wořld\nhow are you?".encode('utf8') put_to_bucket(contents=content) with self.assertApiCalls(GetObject=1), patch_invalid_range_response(str(len(content))): fin = smart_open.s3.Reader(BUCKET_NAME, KEY_NAME, defer_seek=True) seek = fin.seek(60) self.assertEqual(seek, len(content)) def test_detect_eof(self): content = u"hello wořld\nhow are you?".encode('utf8') put_to_bucket(contents=content) with self.assertApiCalls(GetObject=1): fin = smart_open.s3.Reader(BUCKET_NAME, KEY_NAME) fin.read() eof = fin.tell() self.assertEqual(eof, len(content)) fin.seek(0, whence=smart_open.constants.WHENCE_END) self.assertEqual(eof, fin.tell()) fin.seek(eof) self.assertEqual(eof, fin.tell()) def test_read_gzip(self): expected = u'раcцветали яблони и груши, поплыли туманы над рекой...'.encode('utf-8') buf = io.BytesIO() buf.close = lambda: None # keep buffer open so that we can .getvalue() with gzip.GzipFile(fileobj=buf, mode='w') as zipfile: zipfile.write(expected) put_to_bucket(contents=buf.getvalue()) # # Make sure we're reading things correctly. # with smart_open.s3.Reader(BUCKET_NAME, KEY_NAME) as fin: self.assertEqual(fin.read(), buf.getvalue()) # # Make sure the buffer we wrote is legitimate gzip. 
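        # (Decompressing the locally held bytes first means that if the main
        # assertion further down fails, the culprit is the S3 streaming path
        # rather than a broken gzip fixture.)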
# sanity_buf = io.BytesIO(buf.getvalue()) with gzip.GzipFile(fileobj=sanity_buf) as zipfile: self.assertEqual(zipfile.read(), expected) logger.debug('starting actual test') with self.assertApiCalls(GetObject=1): with smart_open.s3.Reader(BUCKET_NAME, KEY_NAME) as fin: with gzip.GzipFile(fileobj=fin) as zipfile: actual = zipfile.read() self.assertEqual(expected, actual) def test_readline(self): content = b'englishman\nin\nnew\nyork\n' put_to_bucket(contents=content) with self.assertApiCalls(GetObject=2): with smart_open.s3.Reader(BUCKET_NAME, KEY_NAME) as fin: fin.readline() self.assertEqual(fin.tell(), content.index(b'\n')+1) fin.seek(0) actual = list(fin) self.assertEqual(fin.tell(), len(content)) expected = [b'englishman\n', b'in\n', b'new\n', b'york\n'] self.assertEqual(expected, actual) def test_readline_tiny_buffer(self): content = b'englishman\nin\nnew\nyork\n' put_to_bucket(contents=content) with self.assertApiCalls(GetObject=1): with smart_open.s3.Reader(BUCKET_NAME, KEY_NAME, buffer_size=8) as fin: actual = list(fin) expected = [b'englishman\n', b'in\n', b'new\n', b'york\n'] self.assertEqual(expected, actual) def test_read0_does_not_return_data(self): content = b'englishman\nin\nnew\nyork\n' put_to_bucket(contents=content) with self.assertApiCalls(): # set defer_seek to verify that read(0) doesn't trigger an unnecessary API call with smart_open.s3.Reader(BUCKET_NAME, KEY_NAME, defer_seek=True) as fin: data = fin.read(0) self.assertEqual(data, b'') def test_to_boto3(self): contents = b'the spice melange\n' put_to_bucket(contents=contents) with self.assertApiCalls(): # set defer_seek to verify that to_boto3() doesn't trigger an unnecessary API call with smart_open.s3.Reader(BUCKET_NAME, KEY_NAME, defer_seek=True) as fin: returned_obj = fin.to_boto3(boto3.resource('s3')) boto3_body = returned_obj.get()['Body'].read() self.assertEqual(contents, boto3_body) def test_binary_iterator(self): expected = u"выйду ночью в поле с конём".encode('utf-8').split(b' ') put_to_bucket(contents=b"\n".join(expected)) with self.assertApiCalls(GetObject=1): with smart_open.s3.open(BUCKET_NAME, KEY_NAME, 'rb') as fin: actual = [line.rstrip() for line in fin] self.assertEqual(expected, actual) def test_defer_seek(self): content = b'englishman\nin\nnew\nyork\n' put_to_bucket(contents=content) with self.assertApiCalls(): fin = smart_open.s3.Reader(BUCKET_NAME, KEY_NAME, defer_seek=True) with self.assertApiCalls(GetObject=1): self.assertEqual(fin.read(), content) with self.assertApiCalls(): fin = smart_open.s3.Reader(BUCKET_NAME, KEY_NAME, defer_seek=True) with self.assertApiCalls(GetObject=1): fin.seek(10) self.assertEqual(fin.read(), content[10:]) def test_read_empty_file(self): put_to_bucket(contents=b'') with self.assertApiCalls(GetObject=1), patch_invalid_range_response('0'): with smart_open.s3.Reader(BUCKET_NAME, KEY_NAME) as fin: data = fin.read() self.assertEqual(data, b'') @moto.mock_s3 class MultipartWriterTest(unittest.TestCase): """ Test writing into s3 files. 
""" def setUp(self): ignore_resource_warnings() def tearDown(self): cleanup_bucket() def test_write_01(self): """Does writing into s3 work correctly?""" test_string = u"žluťoučký koníček".encode('utf8') # write into key with smart_open.s3.MultipartWriter(BUCKET_NAME, WRITE_KEY_NAME) as fout: fout.write(test_string) # read key and test content output = list(smart_open.s3.open(BUCKET_NAME, WRITE_KEY_NAME, "rb")) self.assertEqual(output, [test_string]) def test_write_01a(self): """Does s3 write fail on incorrect input?""" try: with smart_open.s3.MultipartWriter(BUCKET_NAME, WRITE_KEY_NAME) as fin: fin.write(None) except TypeError: pass else: self.fail() def test_write_02(self): """Does s3 write unicode-utf8 conversion work?""" smart_open_write = smart_open.s3.MultipartWriter(BUCKET_NAME, WRITE_KEY_NAME) smart_open_write.tell() logger.info("smart_open_write: %r", smart_open_write) with smart_open_write as fout: fout.write(u"testžížáč".encode("utf-8")) self.assertEqual(fout.tell(), 14) def test_write_03(self): """Does s3 multipart chunking work correctly?""" # write smart_open_write = smart_open.s3.MultipartWriter( BUCKET_NAME, WRITE_KEY_NAME, min_part_size=10 ) with smart_open_write as fout: fout.write(b"test") self.assertEqual(fout._buf.tell(), 4) fout.write(b"test\n") self.assertEqual(fout._buf.tell(), 9) self.assertEqual(fout._total_parts, 0) fout.write(b"test") self.assertEqual(fout._buf.tell(), 0) self.assertEqual(fout._total_parts, 1) # read back the same key and check its content output = list(smart_open.s3.open(BUCKET_NAME, WRITE_KEY_NAME, 'rb')) self.assertEqual(output, [b"testtest\n", b"test"]) def test_write_04(self): """Does writing no data cause key with an empty value to be created?""" smart_open_write = smart_open.s3.MultipartWriter(BUCKET_NAME, WRITE_KEY_NAME) with smart_open_write as fout: # noqa pass # read back the same key and check its content with patch_invalid_range_response('0'): output = list(smart_open.s3.open(BUCKET_NAME, WRITE_KEY_NAME, 'rb')) self.assertEqual(output, []) def test_gzip(self): expected = u'а не спеть ли мне песню... 
о любви'.encode('utf-8') with smart_open.s3.MultipartWriter(BUCKET_NAME, WRITE_KEY_NAME) as fout: with gzip.GzipFile(fileobj=fout, mode='w') as zipfile: zipfile.write(expected) with smart_open.s3.Reader(BUCKET_NAME, WRITE_KEY_NAME) as fin: with gzip.GzipFile(fileobj=fin) as zipfile: actual = zipfile.read() self.assertEqual(expected, actual) def test_buffered_writer_wrapper_works(self): """ Ensure that we can wrap a smart_open s3 stream in a BufferedWriter, which passes a memoryview object to the underlying stream in python >= 2.7 """ expected = u'не думай о секундах свысока' with smart_open.s3.MultipartWriter(BUCKET_NAME, WRITE_KEY_NAME) as fout: with io.BufferedWriter(fout) as sub_out: sub_out.write(expected.encode('utf-8')) with smart_open.s3.open(BUCKET_NAME, WRITE_KEY_NAME, 'rb') as fin: with io.TextIOWrapper(fin, encoding='utf-8') as text: actual = text.read() self.assertEqual(expected, actual) def test_nonexisting_bucket(self): expected = u"выйду ночью в поле с конём".encode('utf-8') with self.assertRaises(ValueError): with smart_open.s3.open('thisbucketdoesntexist', 'mykey', 'wb') as fout: fout.write(expected) def test_read_nonexisting_key(self): with self.assertRaises(IOError): with smart_open.s3.open(BUCKET_NAME, 'my_nonexisting_key', 'rb') as fin: fin.read() def test_double_close(self): text = u'там за туманами, вечными, пьяными'.encode('utf-8') fout = smart_open.s3.open(BUCKET_NAME, 'key', 'wb') fout.write(text) fout.close() fout.close() def test_flush_close(self): text = u'там за туманами, вечными, пьяными'.encode('utf-8') fout = smart_open.s3.open(BUCKET_NAME, 'key', 'wb') fout.write(text) fout.flush() fout.close() def test_to_boto3(self): contents = b'the spice melange\n' with smart_open.s3.open(BUCKET_NAME, KEY_NAME, 'wb') as fout: fout.write(contents) returned_obj = fout.to_boto3(boto3.resource('s3')) boto3_body = returned_obj.get()['Body'].read() self.assertEqual(contents, boto3_body) def test_writebuffer(self): """Does the MultipartWriter support writing to a custom buffer?""" contents = b'get ready for a surprise' with tempfile.NamedTemporaryFile(mode='rb+') as f: with smart_open.s3.MultipartWriter(BUCKET_NAME, WRITE_KEY_NAME, writebuffer=f) as fout: fout.write(contents) with smart_open.s3.open(BUCKET_NAME, WRITE_KEY_NAME, 'rb') as fin: actual = fin.read() assert actual == contents @moto.mock_s3 class SinglepartWriterTest(unittest.TestCase): """ Test writing into s3 files using single part upload. 
""" def setUp(self): ignore_resource_warnings() def tearDown(self): cleanup_bucket() def test_write_01(self): """Does writing into s3 work correctly?""" test_string = u"žluťoučký koníček".encode('utf8') # write into key with smart_open.s3.SinglepartWriter(BUCKET_NAME, WRITE_KEY_NAME) as fout: fout.write(test_string) # read key and test content output = list(smart_open.s3.open(BUCKET_NAME, WRITE_KEY_NAME, "rb")) self.assertEqual(output, [test_string]) def test_write_01a(self): """Does s3 write fail on incorrect input?""" try: with smart_open.s3.SinglepartWriter(BUCKET_NAME, WRITE_KEY_NAME) as fin: fin.write(None) except TypeError: pass else: self.fail() def test_write_02(self): """Does s3 write unicode-utf8 conversion work?""" test_string = u"testžížáč".encode("utf-8") smart_open_write = smart_open.s3.SinglepartWriter(BUCKET_NAME, WRITE_KEY_NAME) smart_open_write.tell() logger.info("smart_open_write: %r", smart_open_write) with smart_open_write as fout: fout.write(test_string) self.assertEqual(fout.tell(), 14) def test_write_04(self): """Does writing no data cause key with an empty value to be created?""" smart_open_write = smart_open.s3.SinglepartWriter(BUCKET_NAME, WRITE_KEY_NAME) with smart_open_write as fout: # noqa pass # read back the same key and check its content with patch_invalid_range_response('0'): output = list(smart_open.s3.open(BUCKET_NAME, WRITE_KEY_NAME, 'rb')) self.assertEqual(output, []) def test_buffered_writer_wrapper_works(self): """ Ensure that we can wrap a smart_open s3 stream in a BufferedWriter, which passes a memoryview object to the underlying stream in python >= 2.7 """ expected = u'не думай о секундах свысока' with smart_open.s3.SinglepartWriter(BUCKET_NAME, WRITE_KEY_NAME) as fout: with io.BufferedWriter(fout) as sub_out: sub_out.write(expected.encode('utf-8')) with smart_open.s3.open(BUCKET_NAME, WRITE_KEY_NAME, 'rb') as fin: with io.TextIOWrapper(fin, encoding='utf-8') as text: actual = text.read() self.assertEqual(expected, actual) def test_nonexisting_bucket(self): expected = u"выйду ночью в поле с конём".encode('utf-8') with self.assertRaises(ValueError): with smart_open.s3.open('thisbucketdoesntexist', 'mykey', 'wb', multipart_upload=False) as fout: fout.write(expected) def test_double_close(self): text = u'там за туманами, вечными, пьяными'.encode('utf-8') fout = smart_open.s3.open(BUCKET_NAME, 'key', 'wb', multipart_upload=False) fout.write(text) fout.close() fout.close() def test_flush_close(self): text = u'там за туманами, вечными, пьяными'.encode('utf-8') fout = smart_open.s3.open(BUCKET_NAME, 'key', 'wb', multipart_upload=False) fout.write(text) fout.flush() fout.close() def test_writebuffer(self): """Does the SinglepartWriter support writing to a custom buffer?""" contents = b'get ready for a surprise' with tempfile.NamedTemporaryFile(mode='rb+') as f: with smart_open.s3.SinglepartWriter(BUCKET_NAME, WRITE_KEY_NAME, writebuffer=f) as fout: fout.write(contents) with smart_open.s3.open(BUCKET_NAME, WRITE_KEY_NAME, 'rb') as fin: actual = fin.read() assert actual == contents ARBITRARY_CLIENT_ERROR = botocore.client.ClientError(error_response={}, operation_name='bar') @moto.mock_s3 class IterBucketTest(unittest.TestCase): def setUp(self): ignore_resource_warnings() def tearDown(self): cleanup_bucket() @pytest.mark.skipif(condition=sys.platform == 'win32', reason="does not run on windows") @pytest.mark.xfail( condition=sys.platform == 'darwin', reason="MacOS uses spawn rather than fork for multiprocessing", ) def test_iter_bucket(self): 
populate_bucket() results = list(smart_open.s3.iter_bucket(BUCKET_NAME)) self.assertEqual(len(results), 10) def test_deprecated_top_level_s3_iter_bucket(self): populate_bucket() with self.assertLogs(smart_open.logger.name, level='WARN') as cm: # invoking once will generate a warning smart_open.s3_iter_bucket(BUCKET_NAME) # invoking again will not (to reduce spam) smart_open.s3_iter_bucket(BUCKET_NAME) # verify only one output assert len(cm.output) == 1 # verify the suggested new import is in the warning assert "from smart_open.s3 import iter_bucket as s3_iter_bucket" in cm.output[0] @pytest.mark.skipif(condition=sys.platform == 'win32', reason="does not run on windows") @pytest.mark.xfail( condition=sys.platform == 'darwin', reason="MacOS uses spawn rather than fork for multiprocessing", ) def test_accepts_boto3_bucket(self): populate_bucket() bucket = boto3.resource('s3').Bucket(BUCKET_NAME) results = list(smart_open.s3.iter_bucket(bucket)) self.assertEqual(len(results), 10) def test_list_bucket(self): num_keys = 10 populate_bucket() keys = list(smart_open.s3._list_bucket(BUCKET_NAME)) self.assertEqual(len(keys), num_keys) expected = ['key_%d' % x for x in range(num_keys)] self.assertEqual(sorted(keys), sorted(expected)) def test_list_bucket_long(self): num_keys = 1010 populate_bucket(num_keys=num_keys) keys = list(smart_open.s3._list_bucket(BUCKET_NAME)) self.assertEqual(len(keys), num_keys) expected = ['key_%d' % x for x in range(num_keys)] self.assertEqual(sorted(keys), sorted(expected)) @moto.mock_s3 @pytest.mark.skipif( condition=not smart_open.concurrency._CONCURRENT_FUTURES, reason='concurrent.futures unavailable', ) @pytest.mark.skipif(condition=sys.platform == 'win32', reason="does not run on windows") @pytest.mark.xfail( condition=sys.platform == 'darwin', reason="MacOS uses spawn rather than fork for multiprocessing", ) class IterBucketConcurrentFuturesTest(unittest.TestCase): def setUp(self): self.old_flag_multi = smart_open.concurrency._MULTIPROCESSING smart_open.concurrency._MULTIPROCESSING = False ignore_resource_warnings() def tearDown(self): smart_open.concurrency._MULTIPROCESSING = self.old_flag_multi cleanup_bucket() def test(self): num_keys = 101 populate_bucket(num_keys=num_keys) keys = list(smart_open.s3.iter_bucket(BUCKET_NAME)) self.assertEqual(len(keys), num_keys) expected = [('key_%d' % x, b'%d' % x) for x in range(num_keys)] self.assertEqual(sorted(keys), sorted(expected)) @moto.mock_s3 @pytest.mark.skipif( condition=not smart_open.concurrency._MULTIPROCESSING, reason='multiprocessing unavailable', ) @pytest.mark.skipif(condition=sys.platform == 'win32', reason="does not run on windows") @pytest.mark.xfail( condition=sys.platform == 'darwin', reason="MacOS uses spawn rather than fork for multiprocessing", ) class IterBucketMultiprocessingTest(unittest.TestCase): def setUp(self): self.old_flag_concurrent = smart_open.concurrency._CONCURRENT_FUTURES smart_open.concurrency._CONCURRENT_FUTURES = False ignore_resource_warnings() def tearDown(self): smart_open.concurrency._CONCURRENT_FUTURES = self.old_flag_concurrent cleanup_bucket() def test(self): num_keys = 101 populate_bucket(num_keys=num_keys) keys = list(smart_open.s3.iter_bucket(BUCKET_NAME)) self.assertEqual(len(keys), num_keys) expected = [('key_%d' % x, b'%d' % x) for x in range(num_keys)] self.assertEqual(sorted(keys), sorted(expected)) @moto.mock_s3 class IterBucketSingleProcessTest(unittest.TestCase): def setUp(self): self.old_flag_multi = smart_open.concurrency._MULTIPROCESSING self.old_flag_concurrent 
= smart_open.concurrency._CONCURRENT_FUTURES smart_open.concurrency._MULTIPROCESSING = False smart_open.concurrency._CONCURRENT_FUTURES = False ignore_resource_warnings() def tearDown(self): smart_open.concurrency._MULTIPROCESSING = self.old_flag_multi smart_open.concurrency._CONCURRENT_FUTURES = self.old_flag_concurrent cleanup_bucket() def test(self): num_keys = 101 populate_bucket(num_keys=num_keys) keys = list(smart_open.s3.iter_bucket(BUCKET_NAME)) self.assertEqual(len(keys), num_keys) expected = [('key_%d' % x, b'%d' % x) for x in range(num_keys)] self.assertEqual(sorted(keys), sorted(expected)) # # This has to be a separate test because we cannot run it against real S3 # (we don't want to expose our real S3 credentials). # @moto.mock_s3 class IterBucketCredentialsTest(unittest.TestCase): def test(self): num_keys = 10 populate_bucket(num_keys=num_keys) result = list( smart_open.s3.iter_bucket( BUCKET_NAME, workers=None, aws_access_key_id='access_id', aws_secret_access_key='access_secret' ) ) self.assertEqual(len(result), num_keys) @moto.mock_s3 class DownloadKeyTest(unittest.TestCase): def setUp(self): ignore_resource_warnings() def tearDown(self): cleanup_bucket() def test_happy(self): contents = b'hello' put_to_bucket(contents=contents) expected = (KEY_NAME, contents) actual = smart_open.s3._download_key(KEY_NAME, bucket_name=BUCKET_NAME) self.assertEqual(expected, actual) def test_intermittent_error(self): contents = b'hello' put_to_bucket(contents=contents) expected = (KEY_NAME, contents) side_effect = [ARBITRARY_CLIENT_ERROR, ARBITRARY_CLIENT_ERROR, contents] with mock.patch('smart_open.s3._download_fileobj', side_effect=side_effect): actual = smart_open.s3._download_key(KEY_NAME, bucket_name=BUCKET_NAME) self.assertEqual(expected, actual) def test_persistent_error(self): contents = b'hello' put_to_bucket(contents=contents) side_effect = [ARBITRARY_CLIENT_ERROR, ARBITRARY_CLIENT_ERROR, ARBITRARY_CLIENT_ERROR, ARBITRARY_CLIENT_ERROR] with mock.patch('smart_open.s3._download_fileobj', side_effect=side_effect): self.assertRaises(botocore.client.ClientError, smart_open.s3._download_key, KEY_NAME, bucket_name=BUCKET_NAME) def test_intermittent_error_retries(self): contents = b'hello' put_to_bucket(contents=contents) expected = (KEY_NAME, contents) side_effect = [ARBITRARY_CLIENT_ERROR, ARBITRARY_CLIENT_ERROR, ARBITRARY_CLIENT_ERROR, ARBITRARY_CLIENT_ERROR, contents] with mock.patch('smart_open.s3._download_fileobj', side_effect=side_effect): actual = smart_open.s3._download_key(KEY_NAME, bucket_name=BUCKET_NAME, retries=4) self.assertEqual(expected, actual) def test_propagates_other_exception(self): contents = b'hello' put_to_bucket(contents=contents) with mock.patch('smart_open.s3._download_fileobj', side_effect=ValueError): self.assertRaises(ValueError, smart_open.s3._download_key, KEY_NAME, bucket_name=BUCKET_NAME) @moto.mock_s3 class OpenTest(unittest.TestCase): def setUp(self): ignore_resource_warnings() def tearDown(self): cleanup_bucket() def test_read_never_returns_none(self): """read should never return None.""" test_string = u"ветер по морю гуляет..." 
with smart_open.s3.open(BUCKET_NAME, KEY_NAME, "wb") as fout: fout.write(test_string.encode('utf8')) r = smart_open.s3.open(BUCKET_NAME, KEY_NAME, "rb") self.assertEqual(r.read(), test_string.encode("utf-8")) self.assertEqual(r.read(), b"") self.assertEqual(r.read(), b"") def populate_bucket(num_keys=10): s3 = boto3.resource('s3') for key_number in range(num_keys): key_name = 'key_%d' % key_number s3.Object(BUCKET_NAME, key_name).put(Body=str(key_number)) class RetryIfFailedTest(unittest.TestCase): def test_success(self): partial = mock.Mock(return_value=1) result = smart_open.s3._retry_if_failed(partial, attempts=3, sleep_seconds=0) self.assertEqual(result, 1) self.assertEqual(partial.call_count, 1) def test_failure(self): partial = mock.Mock(side_effect=ValueError) exceptions = (ValueError, ) with self.assertRaises(IOError): smart_open.s3._retry_if_failed(partial, attempts=3, sleep_seconds=0, exceptions=exceptions) self.assertEqual(partial.call_count, 3) @moto.mock_s3() def test_client_propagation_singlepart(): """Does the client parameter make it from the caller to Boto3?""" # # Not sure why we need to create the bucket here, as setUpModule should # have done that for us by now. # session = boto3.Session() resource = session.resource('s3') bucket = resource.create_bucket(Bucket=BUCKET_NAME) bucket.wait_until_exists() client = session.client('s3') with smart_open.s3.open( BUCKET_NAME, WRITE_KEY_NAME, mode='wb', client=client, multipart_upload=False, ) as writer: assert writer._client.client == client assert id(writer._client.client) == id(client) @moto.mock_s3() def test_client_propagation_multipart(): """Does the resource parameter make it from the caller to Boto3?""" session = boto3.Session() resource = session.resource('s3') bucket = resource.create_bucket(Bucket=BUCKET_NAME) bucket.wait_until_exists() client = session.client('s3') with smart_open.s3.open( BUCKET_NAME, WRITE_KEY_NAME, mode='wb', client=client, multipart_upload=True, ) as writer: assert writer._client.client == client assert id(writer._client.client) == id(client) @moto.mock_s3() def test_resource_propagation_reader(): """Does the resource parameter make it from the caller to Boto3?""" session = boto3.Session() resource = session.resource('s3') bucket = resource.create_bucket(Bucket=BUCKET_NAME) bucket.wait_until_exists() client = session.client('s3') with smart_open.s3.open(BUCKET_NAME, WRITE_KEY_NAME, mode='wb') as writer: writer.write(b'hello world') with smart_open.s3.open(BUCKET_NAME, WRITE_KEY_NAME, mode='rb', client=client) as reader: assert reader._client.client == client assert id(reader._client.client) == id(client) if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) unittest.main() smart_open-5.2.1/smart_open/tests/test_s3_version.py000066400000000000000000000104731411241424400227040ustar00rootroot00000000000000# -*- coding: utf-8 -*- import logging import unittest import uuid import boto3 import moto from smart_open import open BUCKET_NAME = 'test-smartopen' KEY_NAME = 'test-key' logger = logging.getLogger(__name__) @moto.mock_s3 def setUpModule(): '''Called once by unittest when initializing this module. Sets up the test S3 bucket. ''' bucket = boto3.resource('s3').create_bucket(Bucket=BUCKET_NAME) bucket.wait_until_exists() boto3.resource('s3').BucketVersioning(BUCKET_NAME).enable() @moto.mock_s3 def tearDownModule(): '''Called once by unittest when tearing down this module. Empties and removes the test S3 bucket. 
''' s3 = boto3.resource('s3') bucket = s3.Bucket(BUCKET_NAME) try: bucket.object_versions.delete() bucket.delete() except s3.meta.client.exceptions.NoSuchBucket: pass bucket.wait_until_not_exists() def get_versions(bucket, key): """Return object versions in chronological order.""" return [ v.id for v in sorted( boto3.resource('s3').Bucket(bucket).object_versions.filter(Prefix=key), key=lambda version: version.last_modified, ) ] @moto.mock_s3 class TestVersionId(unittest.TestCase): def setUp(self): # # Each run of this test reuses the BUCKET_NAME, but works with a # different key for isolation. # self.key = 'test-write-key-{}'.format(uuid.uuid4().hex) self.url = "s3://%s/%s" % (BUCKET_NAME, self.key) self.test_ver1 = u"String version 1.0".encode('utf8') self.test_ver2 = u"String version 2.0".encode('utf8') bucket = boto3.resource('s3').Bucket(BUCKET_NAME) bucket.put_object(Key=self.key, Body=self.test_ver1) logging.critical('versions after first write: %r', get_versions(BUCKET_NAME, self.key)) bucket.put_object(Key=self.key, Body=self.test_ver2) self.versions = get_versions(BUCKET_NAME, self.key) logging.critical('versions after second write: %r', get_versions(BUCKET_NAME, self.key)) assert len(self.versions) == 2 def test_good_id(self): """Does passing the version_id parameter into the s3 submodule work correctly when reading?""" params = {'version_id': self.versions[0]} with open(self.url, mode='rb', transport_params=params) as fin: actual = fin.read() self.assertEqual(actual, self.test_ver1) def test_bad_id(self): """Does passing an invalid version_id into the s3 submodule get handled correctly?""" params = {'version_id': 'bad-version-does-not-exist'} with self.assertRaises(IOError): open(self.url, 'rb', transport_params=params) def test_bad_mode(self): """Do we correctly handle a non-None version when writing?""" params = {'version_id': self.versions[0]} with self.assertRaises(ValueError): open(self.url, 'wb', transport_params=params) def test_no_version(self): """Passing in no version at all gives the newest version of the file?""" with open(self.url, 'rb') as fin: actual = fin.read() self.assertEqual(actual, self.test_ver2) def test_newest_version(self): """Passing in the newest version explicitly gives the most recent content?""" params = {'version_id': self.versions[1]} with open(self.url, mode='rb', transport_params=params) as fin: actual = fin.read() self.assertEqual(actual, self.test_ver2) def test_oldest_version(self): """Passing in the oldest version gives the oldest content?""" params = {'version_id': self.versions[0]} with open(self.url, mode='rb', transport_params=params) as fin: actual = fin.read() self.assertEqual(actual, self.test_ver1) def test_version_to_boto3(self): """Does to_boto3() return the requested (oldest) version of the object?""" self.versions = get_versions(BUCKET_NAME, self.key) params = {'version_id': self.versions[0]} with open(self.url, mode='rb', transport_params=params) as fin: returned_obj = fin.to_boto3(boto3.resource('s3')) boto3_body = returned_obj.get()['Body'].read() self.assertEqual(boto3_body, self.test_ver1) if __name__ == '__main__': unittest.main() smart_open-5.2.1/smart_open/tests/test_sanity.py000066400000000000000000000016561411241424400221240ustar00rootroot00000000000000import unittest import boto3 import moto @moto.mock_s3() def setUpModule(): bucket = boto3.resource('s3').create_bucket(Bucket='mybucket') bucket.wait_until_exists() @moto.mock_s3() def tearDownModule(): resource = boto3.resource('s3') bucket =
resource.Bucket('mybucket') try: bucket.delete() except resource.meta.client.exceptions.NoSuchBucket: pass bucket.wait_until_not_exists() @moto.mock_s3() class Test(unittest.TestCase): def test(self): resource = boto3.resource('s3') bucket = resource.Bucket('mybucket') self.assertEqual(bucket.name, 'mybucket') expected = b'hello' resource.Object('mybucket', 'mykey').put(Body=expected) actual = resource.Object('mybucket', 'mykey').get()['Body'].read() self.assertEqual(expected, actual) def tearDown(self): boto3.resource('s3').Object('mybucket', 'mykey').delete() smart_open-5.2.1/smart_open/tests/test_smart_open.py000066400000000000000000002422031411241424400227570ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2019 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). # import bz2 import csv import contextlib import io import gzip import hashlib import logging import os from smart_open.compression import INFER_FROM_EXTENSION, NO_COMPRESSION import tempfile import unittest from unittest import mock import warnings import boto3 from moto import mock_s3 import parameterizedtestcase import pytest import responses import smart_open from smart_open import smart_open_lib from smart_open import webhdfs from smart_open.smart_open_lib import patch_pathlib, _patch_pathlib from smart_open.tests.test_s3 import patch_invalid_range_response logger = logging.getLogger(__name__) CURR_DIR = os.path.abspath(os.path.dirname(__file__)) SAMPLE_TEXT = 'Hello, world!' SAMPLE_BYTES = SAMPLE_TEXT.encode('utf-8') # # For Windows platforms, under which tempfile.NamedTemporaryFile has some # unwanted quirks. # # https://docs.python.org/3.8/library/tempfile.html#tempfile.NamedTemporaryFile # https://stackoverflow.com/a/58955530 # @contextlib.contextmanager def named_temporary_file(mode='w+b', prefix=None, suffix=None, delete=True): filename = io.StringIO() if prefix: filename.write(prefix) filename.write(os.urandom(8).hex()) if suffix: filename.write(suffix) pathname = os.path.join(tempfile.gettempdir(), filename.getvalue()) with open(pathname, mode) as f: yield f if delete: try: os.unlink(pathname) except PermissionError as e: # # This can happen on Windows for unknown reasons. # logger.error(e) class ParseUriTest(unittest.TestCase): """ Test ParseUri class. 
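These tests exercise smart_open_lib._parse_uri for the supported URI schemes (file, s3, hdfs, webhdfs, gs, azure, ssh/scp/sftp, http) and require no network access.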
""" def test_scheme(self): """Do URIs schemes parse correctly?""" # supported schemes for scheme in ("s3", "s3a", "s3n", "hdfs", "file", "http", "https", "gs", "azure"): parsed_uri = smart_open_lib._parse_uri(scheme + "://mybucket/mykey") self.assertEqual(parsed_uri.scheme, scheme) # unsupported scheme => NotImplementedError self.assertRaises(NotImplementedError, smart_open_lib._parse_uri, "foobar://mybucket/mykey") # unknown scheme => default_scheme parsed_uri = smart_open_lib._parse_uri("blah blah") self.assertEqual(parsed_uri.scheme, "file") def test_s3_uri(self): """Do S3 URIs parse correctly?""" # correct uri without credentials parsed_uri = smart_open_lib._parse_uri("s3://mybucket/mykey") self.assertEqual(parsed_uri.scheme, "s3") self.assertEqual(parsed_uri.bucket_id, "mybucket") self.assertEqual(parsed_uri.key_id, "mykey") self.assertEqual(parsed_uri.access_id, None) self.assertEqual(parsed_uri.access_secret, None) def test_s3_uri_contains_slash(self): parsed_uri = smart_open_lib._parse_uri("s3://mybucket/mydir/mykey") self.assertEqual(parsed_uri.scheme, "s3") self.assertEqual(parsed_uri.bucket_id, "mybucket") self.assertEqual(parsed_uri.key_id, "mydir/mykey") self.assertEqual(parsed_uri.access_id, None) self.assertEqual(parsed_uri.access_secret, None) def test_s3_uri_with_credentials(self): parsed_uri = smart_open_lib._parse_uri("s3://ACCESSID456:acces/sse_cr-et@mybucket/mykey") self.assertEqual(parsed_uri.scheme, "s3") self.assertEqual(parsed_uri.bucket_id, "mybucket") self.assertEqual(parsed_uri.key_id, "mykey") self.assertEqual(parsed_uri.access_id, "ACCESSID456") self.assertEqual(parsed_uri.access_secret, "acces/sse_cr-et") def test_s3_uri_with_credentials2(self): parsed_uri = smart_open_lib._parse_uri("s3://accessid:access/secret@mybucket/mykey") self.assertEqual(parsed_uri.scheme, "s3") self.assertEqual(parsed_uri.bucket_id, "mybucket") self.assertEqual(parsed_uri.key_id, "mykey") self.assertEqual(parsed_uri.access_id, "accessid") self.assertEqual(parsed_uri.access_secret, "access/secret") def test_s3_uri_has_atmark_in_key_name(self): parsed_uri = smart_open_lib._parse_uri("s3://accessid:access/secret@mybucket/my@ke@y") self.assertEqual(parsed_uri.scheme, "s3") self.assertEqual(parsed_uri.bucket_id, "mybucket") self.assertEqual(parsed_uri.key_id, "my@ke@y") self.assertEqual(parsed_uri.access_id, "accessid") self.assertEqual(parsed_uri.access_secret, "access/secret") def test_s3_uri_has_atmark_in_key_name2(self): parsed_uri = smart_open_lib._parse_uri( "s3://accessid:access/secret@hostname:1234@mybucket/dir/my@ke@y" ) self.assertEqual(parsed_uri.scheme, "s3") self.assertEqual(parsed_uri.bucket_id, "mybucket") self.assertEqual(parsed_uri.key_id, "dir/my@ke@y") self.assertEqual(parsed_uri.access_id, "accessid") self.assertEqual(parsed_uri.access_secret, "access/secret") self.assertEqual(parsed_uri.host, "hostname") self.assertEqual(parsed_uri.port, 1234) def test_s3_uri_has_atmark_in_key_name3(self): parsed_uri = smart_open_lib._parse_uri("s3://accessid:access/secret@hostname@mybucket/dir/my@ke@y") self.assertEqual(parsed_uri.scheme, "s3") self.assertEqual(parsed_uri.bucket_id, "mybucket") self.assertEqual(parsed_uri.key_id, "dir/my@ke@y") self.assertEqual(parsed_uri.access_id, "accessid") self.assertEqual(parsed_uri.access_secret, "access/secret") self.assertEqual(parsed_uri.host, "hostname") self.assertEqual(parsed_uri.port, 443) def test_s3_handles_fragments(self): uri_str = 's3://bucket-name/folder/picture #1.jpg' parsed_uri = smart_open_lib._parse_uri(uri_str) 
self.assertEqual(parsed_uri.key_id, "folder/picture #1.jpg") def test_s3_handles_querystring(self): uri_str = 's3://bucket-name/folder/picture1.jpg?bar' parsed_uri = smart_open_lib._parse_uri(uri_str) self.assertEqual(parsed_uri.key_id, "folder/picture1.jpg?bar") def test_s3_invalid_url_atmark_in_bucket_name(self): self.assertRaises( ValueError, smart_open_lib._parse_uri, "s3://access_id:access_secret@my@bucket@port/mykey", ) def test_s3_invalid_uri_missing_colon(self): self.assertRaises( ValueError, smart_open_lib._parse_uri, "s3://access_id@access_secret@mybucket@port/mykey", ) def test_webhdfs_uri_to_http(self): parsed_uri = smart_open_lib._parse_uri("webhdfs://host:14000/path/file") actual = webhdfs.convert_to_http_uri(parsed_uri) expected = "http://host:14000/webhdfs/v1/path/file" self.assertEqual(actual, expected) def test_webhdfs_uri_to_http_with_query(self): parsed_uri = smart_open_lib._parse_uri("webhdfs://host:14000/path/file?a=1") actual = webhdfs.convert_to_http_uri(parsed_uri) expected = "http://host:14000/webhdfs/v1/path/file?a=1" self.assertEqual(actual, expected) def test_webhdfs_uri_to_http_with_user(self): parsed_uri = smart_open_lib._parse_uri("webhdfs://user@host:14000/path") actual = webhdfs.convert_to_http_uri(parsed_uri) expected = "http://host:14000/webhdfs/v1/path?user.name=user" self.assertEqual(actual, expected) def test_webhdfs_uri_to_http_with_user_and_query(self): parsed_uri = smart_open_lib._parse_uri("webhdfs://user@host:14000/path?a=1") actual = webhdfs.convert_to_http_uri(parsed_uri) expected = "http://host:14000/webhdfs/v1/path?a=1&user.name=user" self.assertEqual(actual, expected) def test_uri_from_issue_223_works(self): uri = "s3://:@omax-mis/twilio-messages-media/final/MEcd7c36e75f87dc6dd9e33702cdcd8fb6" parsed_uri = smart_open_lib._parse_uri(uri) self.assertEqual(parsed_uri.scheme, "s3") self.assertEqual(parsed_uri.bucket_id, "omax-mis") self.assertEqual(parsed_uri.key_id, "twilio-messages-media/final/MEcd7c36e75f87dc6dd9e33702cdcd8fb6") self.assertEqual(parsed_uri.access_id, "") self.assertEqual(parsed_uri.access_secret, "") def test_s3_uri_with_colon_in_key_name(self): """ Correctly parse the s3 url if there is a colon in the key or dir """ parsed_uri = smart_open_lib._parse_uri("s3://mybucket/mydir/my:key") self.assertEqual(parsed_uri.scheme, "s3") self.assertEqual(parsed_uri.bucket_id, "mybucket") self.assertEqual(parsed_uri.key_id, "mydir/my:key") self.assertEqual(parsed_uri.access_id, None) self.assertEqual(parsed_uri.access_secret, None) def test_s3_uri_contains_question_mark(self): parsed_uri = smart_open_lib._parse_uri("s3://mybucket/mydir/mykey?param") self.assertEqual(parsed_uri.scheme, "s3") self.assertEqual(parsed_uri.bucket_id, "mybucket") self.assertEqual(parsed_uri.key_id, "mydir/mykey?param") def test_host_and_port(self): as_string = 's3u://user:secret@host:1234@mybucket/mykey.txt' uri = smart_open_lib._parse_uri(as_string) self.assertEqual(uri.scheme, "s3u") self.assertEqual(uri.bucket_id, "mybucket") self.assertEqual(uri.key_id, "mykey.txt") self.assertEqual(uri.access_id, "user") self.assertEqual(uri.access_secret, "secret") self.assertEqual(uri.host, "host") self.assertEqual(uri.port, 1234) def test_invalid_port(self): as_string = 's3u://user:secret@host:port@mybucket/mykey.txt' self.assertRaises(ValueError, smart_open_lib._parse_uri, as_string) def test_invalid_port2(self): as_string = 's3u://user:secret@host:port:foo@mybucket/mykey.txt' self.assertRaises(ValueError, smart_open_lib._parse_uri, as_string) def 
test_leading_slash_local_file(self): path = "/home/misha/hello.txt" uri = smart_open_lib._parse_uri(path) self.assertEqual(uri.scheme, "file") self.assertEqual(uri.uri_path, path) uri = smart_open_lib._parse_uri('//' + path) self.assertEqual(uri.scheme, "file") self.assertEqual(uri.uri_path, '//' + path) def test_ssh(self): as_string = 'ssh://user@host:1234/path/to/file' uri = smart_open_lib._parse_uri(as_string) self.assertEqual(uri.scheme, 'ssh') self.assertEqual(uri.uri_path, '/path/to/file') self.assertEqual(uri.user, 'user') self.assertEqual(uri.host, 'host') self.assertEqual(uri.port, 1234) self.assertEqual(uri.password, None) def test_ssh_with_pass(self): as_string = 'ssh://user:pass@host:1234/path/to/file' uri = smart_open_lib._parse_uri(as_string) self.assertEqual(uri.scheme, 'ssh') self.assertEqual(uri.uri_path, '/path/to/file') self.assertEqual(uri.user, 'user') self.assertEqual(uri.host, 'host') self.assertEqual(uri.port, 1234) self.assertEqual(uri.password, 'pass') def test_scp(self): as_string = 'scp://user@host:/path/to/file' uri = smart_open_lib._parse_uri(as_string) self.assertEqual(uri.scheme, 'scp') self.assertEqual(uri.uri_path, '/path/to/file') self.assertEqual(uri.user, 'user') self.assertEqual(uri.host, 'host') self.assertEqual(uri.port, 22) self.assertEqual(uri.password, None) def test_scp_with_pass(self): as_string = 'scp://user:pass@host:/path/to/file' uri = smart_open_lib._parse_uri(as_string) self.assertEqual(uri.scheme, 'scp') self.assertEqual(uri.uri_path, '/path/to/file') self.assertEqual(uri.user, 'user') self.assertEqual(uri.host, 'host') self.assertEqual(uri.port, 22) self.assertEqual(uri.password, 'pass') def test_sftp(self): as_string = 'sftp://host/path/to/file' uri = smart_open_lib._parse_uri(as_string) self.assertEqual(uri.scheme, 'sftp') self.assertEqual(uri.uri_path, '/path/to/file') self.assertEqual(uri.user, None) self.assertEqual(uri.host, 'host') self.assertEqual(uri.port, 22) self.assertEqual(uri.password, None) def test_sftp_with_user_and_pass(self): as_string = 'sftp://user:pass@host:2222/path/to/file' uri = smart_open_lib._parse_uri(as_string) self.assertEqual(uri.scheme, 'sftp') self.assertEqual(uri.uri_path, '/path/to/file') self.assertEqual(uri.user, 'user') self.assertEqual(uri.host, 'host') self.assertEqual(uri.port, 2222) self.assertEqual(uri.password, 'pass') def test_ssh_complex_password_with_colon(self): as_string = 'sftp://user:some:complex@password$$@host:2222/path/to/file' uri = smart_open_lib._parse_uri(as_string) self.assertEqual(uri.password, 'some:complex@password$$') def test_gs_uri(self): """Do GCS URIs parse correctly?""" # correct uri without credentials parsed_uri = smart_open_lib._parse_uri("gs://mybucket/myblob") self.assertEqual(parsed_uri.scheme, "gs") self.assertEqual(parsed_uri.bucket_id, "mybucket") self.assertEqual(parsed_uri.blob_id, "myblob") def test_gs_uri_contains_slash(self): parsed_uri = smart_open_lib._parse_uri("gs://mybucket/mydir/myblob") self.assertEqual(parsed_uri.scheme, "gs") self.assertEqual(parsed_uri.bucket_id, "mybucket") self.assertEqual(parsed_uri.blob_id, "mydir/myblob") def test_gs_uri_contains_question_mark(self): parsed_uri = smart_open_lib._parse_uri("gs://mybucket/mydir/myblob?param") self.assertEqual(parsed_uri.scheme, "gs") self.assertEqual(parsed_uri.bucket_id, "mybucket") self.assertEqual(parsed_uri.blob_id, "mydir/myblob?param") def test_azure_blob_uri(self): """Do Azure Blob URIs parse correctly?""" # correct uri without credentials parsed_uri = 
smart_open_lib._parse_uri("azure://mycontainer/myblob") self.assertEqual(parsed_uri.scheme, "azure") self.assertEqual(parsed_uri.container_id, "mycontainer") self.assertEqual(parsed_uri.blob_id, "myblob") def test_azure_blob_uri_root_container(self): parsed_uri = smart_open_lib._parse_uri("azure://myblob") self.assertEqual(parsed_uri.scheme, "azure") self.assertEqual(parsed_uri.container_id, "$root") self.assertEqual(parsed_uri.blob_id, "myblob") def test_azure_blob_uri_contains_slash(self): parsed_uri = smart_open_lib._parse_uri("azure://mycontainer/mydir/myblob") self.assertEqual(parsed_uri.scheme, "azure") self.assertEqual(parsed_uri.container_id, "mycontainer") self.assertEqual(parsed_uri.blob_id, "mydir/myblob") def test_pathlib_monkeypatch(self): from smart_open.smart_open_lib import pathlib assert pathlib.Path.open != smart_open.open with patch_pathlib(): assert pathlib.Path.open == smart_open.open assert pathlib.Path.open != smart_open.open obj = patch_pathlib() assert pathlib.Path.open == smart_open.open _patch_pathlib(obj.old_impl) assert pathlib.Path.open != smart_open.open def test_pathlib_monkeypatch_read_gz(self): from smart_open.smart_open_lib import pathlib path = pathlib.Path(CURR_DIR) / 'test_data' / 'crime-and-punishment.txt.gz' # Check that standard implementation can't work with gzip with path.open("r") as infile: with self.assertRaises(Exception): lines = infile.readlines() # Check that our implementation works with gzip obj = patch_pathlib() try: with path.open("r", encoding='utf-8') as infile: lines = infile.readlines() self.assertEqual(len(lines), 3) finally: _patch_pathlib(obj.old_impl) class SmartOpenHttpTest(unittest.TestCase): """ Test reading from HTTP connections in various ways. """ @mock.patch('smart_open.ssh.open', return_value=open(__file__)) def test_read_ssh(self, mock_open): """Is SSH line iterator called correctly?""" obj = smart_open.open( "ssh://ubuntu:pass@ip_address:1022/some/path/lines.txt", mode='rb', transport_params=dict(hello='world'), ) obj.__iter__() mock_open.assert_called_with( '/some/path/lines.txt', 'rb', host='ip_address', user='ubuntu', password='pass', port=1022, transport_params={'hello': 'world'}, ) @responses.activate def test_http_read(self): """Does http read method work correctly""" responses.add(responses.GET, "http://127.0.0.1/index.html", body='line1\nline2', stream=True) smart_open_object = smart_open.open("http://127.0.0.1/index.html", 'rb') self.assertEqual(smart_open_object.read().decode("utf-8"), "line1\nline2") @responses.activate def test_https_readline(self): """Does https readline method work correctly""" responses.add(responses.GET, "https://127.0.0.1/index.html", body=u'line1\u2028still line1\nline2', stream=True) smart_open_object = smart_open.open("https://127.0.0.1/index.html", 'rb') self.assertEqual(smart_open_object.readline().decode("utf-8"), u"line1\u2028still line1\n") smart_open_object = smart_open.open("https://127.0.0.1/index.html", 'r', encoding='utf-8') self.assertEqual(smart_open_object.readline(), u"line1\u2028still line1\n") @responses.activate def test_http_pass(self): """Does http authentication work correctly""" responses.add(responses.GET, "http://127.0.0.1/index.html", body='line1\nline2', stream=True) tp = dict(user='me', password='pass') smart_open.open("http://127.0.0.1/index.html", transport_params=tp) self.assertEqual(len(responses.calls), 1) actual_request = responses.calls[0].request self.assertTrue('Authorization' in actual_request.headers) 
self.assertTrue(actual_request.headers['Authorization'].startswith('Basic ')) @responses.activate def _test_compressed_http(self, suffix, query): """Can open via http?""" raw_data = b'Hello World Compressed.' * 10000 buf = make_buffer(name='data' + suffix) with smart_open.open(buf, 'wb') as outfile: outfile.write(raw_data) compressed_data = buf._value_when_closed # check that the string was actually compressed self.assertNotEqual(compressed_data, raw_data) responses.add(responses.GET, 'http://127.0.0.1/data' + suffix, body=compressed_data, stream=True) url = 'http://127.0.0.1/data%s%s' % (suffix, '?some_param=some_val' if query else '') smart_open_object = smart_open.open(url, 'rb') # decompress the file and get the same md5 hash self.assertEqual(smart_open_object.read(), raw_data) def test_http_gz(self): """Can open gzip via http?""" self._test_compressed_http(".gz", False) def test_http_bz2(self): """Can open bzip2 via http?""" self._test_compressed_http(".bz2", False) def test_http_gz_query(self): """Can open gzip via http with a query appended to URI?""" self._test_compressed_http(".gz", True) def test_http_bz2_query(self): """Can open bzip2 via http with a query appended to URI?""" self._test_compressed_http(".bz2", True) def make_buffer(cls=io.BytesIO, initial_value=None, name=None, noclose=False): """ Construct a new in-memory file object aka "buf". :param cls: Class of the file object. Meaningful values are BytesIO and StringIO. :param initial_value: Passed directly to the constructor, this is the content of the returned buffer. :param name: Associated file path. Not assigned if is None (default). :param noclose: If True, disables the .close function. :return: Instance of `cls`. """ buf = cls(initial_value) if initial_value else cls() if name is not None: buf.name = name buf._value_when_closed = None orig_close = buf.close def close(): if buf.close.call_count == 1: buf._value_when_closed = buf.getvalue() if not noclose: orig_close() buf.close = mock.Mock(side_effect=close) return buf class RealFileSystemTests(unittest.TestCase): """Tests that touch the file system via temporary files.""" def setUp(self): with named_temporary_file(prefix='test', delete=False) as fout: fout.write(SAMPLE_BYTES) self.temp_file = fout.name def tearDown(self): os.unlink(self.temp_file) def test_rt(self): with smart_open.open(self.temp_file, 'rt') as fin: data = fin.read() self.assertEqual(data, SAMPLE_TEXT) def test_wt(self): # # The file already contains SAMPLE_TEXT, so write something different. 
# text = 'nippon budokan' with smart_open.open(self.temp_file, 'wt') as fout: fout.write(text) with smart_open.open(self.temp_file, 'rt') as fin: data = fin.read() self.assertEqual(data, text) def test_ab(self): with smart_open.open(self.temp_file, 'ab') as fout: fout.write(SAMPLE_BYTES) with smart_open.open(self.temp_file, 'rb') as fin: data = fin.read() self.assertEqual(data, SAMPLE_BYTES * 2) def test_aplus(self): with smart_open.open(self.temp_file, 'a+') as fout: fout.write(SAMPLE_TEXT) with smart_open.open(self.temp_file, 'rt') as fin: text = fin.read() self.assertEqual(text, SAMPLE_TEXT * 2) def test_at(self): with smart_open.open(self.temp_file, 'at') as fout: fout.write(SAMPLE_TEXT) with smart_open.open(self.temp_file, 'rt') as fin: text = fin.read() self.assertEqual(text, SAMPLE_TEXT * 2) def test_atplus(self): with smart_open.open(self.temp_file, 'at+') as fout: fout.write(SAMPLE_TEXT) with smart_open.open(self.temp_file, 'rt') as fin: text = fin.read() self.assertEqual(text, SAMPLE_TEXT * 2) class SmartOpenFileObjTest(unittest.TestCase): """ Test passing raw file objects. """ def test_read_bytes(self): """Can we read bytes from a byte stream?""" buf = make_buffer(initial_value=SAMPLE_BYTES) with smart_open.open(buf, 'rb') as sf: data = sf.read() self.assertEqual(data, SAMPLE_BYTES) def test_write_bytes(self): """Can we write bytes to a byte stream?""" buf = make_buffer() with smart_open.open(buf, 'wb') as sf: sf.write(SAMPLE_BYTES) self.assertEqual(buf.getvalue(), SAMPLE_BYTES) def test_read_text_stream_fails(self): """Attempts to read directly from a text stream should fail. This is because smart_open.open expects a byte stream as input. If you have a text stream, there's no point passing it to smart_open: you can read from it directly. """ buf = make_buffer(io.StringIO, initial_value=SAMPLE_TEXT) with smart_open.open(buf, 'r') as sf: self.assertRaises(TypeError, sf.read) # we expect binary mode def test_write_text_stream_fails(self): """Attempts to write directly to a text stream should fail.""" buf = make_buffer(io.StringIO) with smart_open.open(buf, 'w') as sf: with self.assertRaises(TypeError): sf.write(SAMPLE_TEXT) # we expect binary mode # Need to flush because TextIOWrapper may buffer and we need # to write to the underlying StringIO to get the TypeError. 
sf.flush() def test_read_text_from_bytestream(self): buf = make_buffer(initial_value=SAMPLE_BYTES) with smart_open.open(buf, 'r') as sf: data = sf.read() self.assertEqual(data, SAMPLE_TEXT) def test_read_text_from_bytestream_rt(self): buf = make_buffer(initial_value=SAMPLE_BYTES) with smart_open.open(buf, 'rt') as sf: data = sf.read() self.assertEqual(data, SAMPLE_TEXT) def test_read_text_from_bytestream_rtplus(self): buf = make_buffer(initial_value=SAMPLE_BYTES) with smart_open.open(buf, 'rt+') as sf: data = sf.read() self.assertEqual(data, SAMPLE_TEXT) def test_write_text_to_bytestream(self): """Can we write strings to a byte stream?""" buf = make_buffer(noclose=True) with smart_open.open(buf, 'w') as sf: sf.write(SAMPLE_TEXT) self.assertEqual(buf.getvalue(), SAMPLE_BYTES) def test_write_text_to_bytestream_wt(self): """Can we write strings to a byte stream?""" buf = make_buffer(noclose=True) with smart_open.open(buf, 'wt') as sf: sf.write(SAMPLE_TEXT) self.assertEqual(buf.getvalue(), SAMPLE_BYTES) def test_write_text_to_bytestream_wtplus(self): """Can we write strings to a byte stream?""" buf = make_buffer(noclose=True) with smart_open.open(buf, 'wt+') as sf: sf.write(SAMPLE_TEXT) self.assertEqual(buf.getvalue(), SAMPLE_BYTES) def test_name_read(self): """Can we use the "name" attribute to decompress on the fly?""" expected = SAMPLE_BYTES * 1000 buf = make_buffer(initial_value=bz2.compress(expected), name='data.bz2') with smart_open.open(buf, 'rb') as sf: data = sf.read() self.assertEqual(data, expected) def test_name_write(self): """Can we use the "name" attribute to compress on the fly?""" data = SAMPLE_BYTES * 1000 buf = make_buffer(name='data.bz2') with smart_open.open(buf, 'wb') as sf: sf.write(data) self.assertEqual(bz2.decompress(buf._value_when_closed), data) def test_open_side_effect(self): """ Does our detection of the `name` attribute work with wrapped open()-ed streams? We `open()` a file with ".bz2" extension, pass the file object to `smart_open()` and check that we read decompressed data. This behavior is driven by detecting the `name` attribute in `_open_binary_stream()`. """ data = SAMPLE_BYTES * 1000 with named_temporary_file(prefix='smart_open_tests_', suffix=".bz2", delete=False) as tmpf: tmpf.write(bz2.compress(data)) try: with open(tmpf.name, 'rb') as openf: with smart_open.open(openf, 'rb') as smartf: smart_data = smartf.read() self.assertEqual(data, smart_data) finally: os.unlink(tmpf.name) # # What exactly to patch here depends on _how_ we're opening the file. # See the _shortcut_open function for details. # _IO_OPEN = 'io.open' _BUILTIN_OPEN = 'smart_open.smart_open_lib._builtin_open' class SmartOpenReadTest(unittest.TestCase): """ Test reading from files under various schemes.
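S3 access is mocked with moto, HTTP and webhdfs endpoints with the responses library, and local file and HDFS access with unittest.mock, so these tests run without external services.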
""" def test_shortcut(self): fpath = os.path.join(CURR_DIR, 'test_data/crime-and-punishment.txt') with mock.patch('smart_open.smart_open_lib._builtin_open') as mock_open: smart_open.open(fpath, 'r').read() mock_open.assert_called_with(fpath, 'r', buffering=-1) def test_open_binary(self): fpath = os.path.join(CURR_DIR, 'test_data/cp852.tsv.txt') with open(fpath, 'rb') as fin: expected = fin.read() with smart_open.open(fpath, 'rb') as fin: actual = fin.read() self.assertEqual(expected, actual) def test_open_with_keywords(self): """This test captures Issue #142.""" fpath = os.path.join(CURR_DIR, 'test_data/cp852.tsv.txt') with open(fpath, 'r', encoding='cp852') as fin: expected = fin.read() with smart_open.open(fpath, encoding='cp852') as fin: actual = fin.read() self.assertEqual(expected, actual) def test_open_with_keywords_explicit_r(self): fpath = os.path.join(CURR_DIR, 'test_data/cp852.tsv.txt') with open(fpath, 'r', encoding='cp852') as fin: expected = fin.read() with smart_open.open(fpath, mode='r', encoding='cp852') as fin: actual = fin.read() self.assertEqual(expected, actual) def test_open_and_read_pathlib_path(self): """If ``pathlib.Path`` is available we should be able to open and read.""" from smart_open.smart_open_lib import pathlib fpath = os.path.join(CURR_DIR, 'test_data/cp852.tsv.txt') with open(fpath, 'rb') as fin: expected = fin.read().decode('cp852') with smart_open.open(pathlib.Path(fpath), mode='r', encoding='cp852', newline='') as fin: actual = fin.read() self.assertEqual(expected, actual) @mock_s3 def test_read_never_returns_none(self): """read should never return None.""" s3 = boto3.resource('s3') s3.create_bucket(Bucket='mybucket') test_string = u"ветер по морю гуляет..." with smart_open.open("s3://mybucket/mykey", "wb") as fout: fout.write(test_string.encode('utf8')) r = smart_open.open("s3://mybucket/mykey", "rb") self.assertEqual(r.read(), test_string.encode("utf-8")) self.assertEqual(r.read(), b"") self.assertEqual(r.read(), b"") @mock_s3 def test_read_newline_none(self): """Does newline open() parameter for reading work according to https://docs.python.org/3/library/functions.html#open-newline-parameter """ boto3.resource('s3').create_bucket(Bucket='mybucket') # Unicode line separator and various others must never split lines test_file = u"line\u2028 LF\nline\x1c CR\rline\x85 CRLF\r\nlast line" with smart_open.open("s3://mybucket/mykey", "wb") as fout: fout.write(test_file.encode("utf-8")) # No newline parameter means newline=None i.e. 
universal newline mode with all # line endings translated to '\n' with smart_open.open("s3://mybucket/mykey", "r", encoding='utf-8') as fin: self.assertEqual(list(fin), [ u"line\u2028 LF\n", u"line\x1c CR\n", u"line\x85 CRLF\n", u"last line" ]) @mock_s3 def test_read_newline_empty(self): boto3.resource('s3').create_bucket(Bucket='mybucket') test_file = u"line\u2028 LF\nline\x1c CR\rline\x85 CRLF\r\nlast line" with smart_open.open("s3://mybucket/mykey", "wb") as fout: fout.write(test_file.encode("utf-8")) # If newline='' universal newline mode is enabled but line separators are not replaced with smart_open.open("s3://mybucket/mykey", "r", encoding='utf-8', newline='') as fin: self.assertEqual(list(fin), [ u"line\u2028 LF\n", u"line\x1c CR\r", u"line\x85 CRLF\r\n", u"last line" ]) @mock_s3 def test_read_newline_cr(self): boto3.resource('s3').create_bucket(Bucket='mybucket') test_file = u"line\u2028 LF\nline\x1c CR\rline\x85 CRLF\r\nlast line" with smart_open.open("s3://mybucket/mykey", "wb") as fout: fout.write(test_file.encode("utf-8")) # If newline='\r' only CR splits lines with smart_open.open("s3://mybucket/mykey", "r", encoding='utf-8', newline='\r') as fin: self.assertEqual(list(fin), [ u"line\u2028 LF\nline\x1c CR\r", u"line\x85 CRLF\r", u"\nlast line" ]) @mock_s3 def test_read_newline_lf(self): boto3.resource('s3').create_bucket(Bucket='mybucket') test_file = u"line\u2028 LF\nline\x1c CR\rline\x85 CRLF\r\nlast line" with smart_open.open("s3://mybucket/mykey", "wb") as fout: fout.write(test_file.encode("utf-8")) # If newline='\n' only LF splits lines with smart_open.open("s3://mybucket/mykey", "r", encoding='utf-8', newline='\n') as fin: self.assertEqual(list(fin), [ u"line\u2028 LF\n", u"line\x1c CR\rline\x85 CRLF\r\n", u"last line" ]) @mock_s3 def test_read_newline_crlf(self): boto3.resource('s3').create_bucket(Bucket='mybucket') test_file = u"line\u2028 LF\nline\x1c CR\rline\x85 CRLF\r\nlast line" with smart_open.open("s3://mybucket/mykey", "wb") as fout: fout.write(test_file.encode("utf-8")) # If newline='\r\n' only CRLF splits lines with smart_open.open("s3://mybucket/mykey", "r", encoding='utf-8', newline='\r\n') as fin: self.assertEqual(list(fin), [ u"line\u2028 LF\nline\x1c CR\rline\x85 CRLF\r\n", u"last line" ]) @mock_s3 def test_read_newline_slurp(self): boto3.resource('s3').create_bucket(Bucket='mybucket') test_file = u"line\u2028 LF\nline\x1c CR\rline\x85 CRLF\r\nlast line" with smart_open.open("s3://mybucket/mykey", "wb") as fout: fout.write(test_file.encode("utf-8")) # Even reading the whole file with read() must replace newlines with smart_open.open("s3://mybucket/mykey", "r", encoding='utf-8', newline=None) as fin: self.assertEqual( fin.read(), u"line\u2028 LF\nline\x1c CR\nline\x85 CRLF\nlast line" ) @mock_s3 def test_read_newline_binary(self): boto3.resource('s3').create_bucket(Bucket='mybucket') test_file = u"line\u2028 LF\nline\x1c CR\rline\x85 CRLF\r\nlast line" with smart_open.open("s3://mybucket/mykey", "wb") as fout: fout.write(test_file.encode("utf-8")) # If the file is opened in binary mode only LF splits lines with smart_open.open("s3://mybucket/mykey", "rb") as fin: self.assertEqual(list(fin), [ u"line\u2028 LF\n".encode('utf-8'), u"line\x1c CR\rline\x85 CRLF\r\n".encode('utf-8'), u"last line".encode('utf-8') ]) @mock_s3 def test_write_newline_none(self): """Does newline open() parameter for writing work according to https://docs.python.org/3/library/functions.html#open-newline-parameter """ boto3.resource('s3').create_bucket(Bucket='mybucket') # Unicode 
line separator and various others must never split lines test_file = u"line\u2028 LF\nline\x1c CR\rline\x85 CRLF\r\nlast line" # No newline parameter means newline=None, all LF are translated to os.linesep with smart_open.open("s3://mybucket/mykey", "w", encoding='utf-8') as fout: fout.write(test_file) with smart_open.open("s3://mybucket/mykey", "rb") as fin: self.assertEqual( fin.read().decode('utf-8'), u"line\u2028 LF" + os.linesep + u"line\x1c CR\rline\x85 CRLF\r" + os.linesep + u"last line" ) @mock_s3 def test_write_newline_empty(self): boto3.resource('s3').create_bucket(Bucket='mybucket') test_file = u"line\u2028 LF\nline\x1c CR\rline\x85 CRLF\r\nlast line" # If newline='' nothing is changed with smart_open.open("s3://mybucket/mykey", "w", encoding='utf-8', newline='') as fout: fout.write(test_file) with smart_open.open("s3://mybucket/mykey", "rb") as fin: self.assertEqual( fin.read().decode('utf-8'), u"line\u2028 LF\nline\x1c CR\rline\x85 CRLF\r\nlast line" ) @mock_s3 def test_write_newline_lf(self): boto3.resource('s3').create_bucket(Bucket='mybucket') test_file = u"line\u2028 LF\nline\x1c CR\rline\x85 CRLF\r\nlast line" # If newline='\n' nothing is changed with smart_open.open("s3://mybucket/mykey", "w", encoding='utf-8', newline='\n') as fout: fout.write(test_file) with smart_open.open("s3://mybucket/mykey", "rb") as fin: self.assertEqual( fin.read().decode('utf-8'), u"line\u2028 LF\nline\x1c CR\rline\x85 CRLF\r\nlast line" ) @mock_s3 def test_write_newline_cr(self): boto3.resource('s3').create_bucket(Bucket='mybucket') test_file = u"line\u2028 LF\nline\x1c CR\rline\x85 CRLF\r\nlast line" # If newline='\r' all LF are replaced by CR with smart_open.open("s3://mybucket/mykey", "w", encoding='utf-8', newline='\r') as fout: fout.write(test_file) with smart_open.open("s3://mybucket/mykey", "rb") as fin: self.assertEqual( fin.read().decode('utf-8'), u"line\u2028 LF\rline\x1c CR\rline\x85 CRLF\r\rlast line" ) @mock_s3 def test_write_newline_crlf(self): boto3.resource('s3').create_bucket(Bucket='mybucket') test_file = u"line\u2028 LF\nline\x1c CR\rline\x85 CRLF\r\nlast line" # If newline='\r\n' all LF are replaced by CRLF with smart_open.open("s3://mybucket/mykey", "w", encoding='utf-8', newline='\r\n') as fout: fout.write(test_file) with smart_open.open("s3://mybucket/mykey", "rb") as fin: self.assertEqual( fin.read().decode('utf-8'), u"line\u2028 LF\r\nline\x1c CR\rline\x85 CRLF\r\r\nlast line" ) @mock_s3 def test_readline(self): """Does readline() return the correct file content?""" s3 = boto3.resource('s3') s3.create_bucket(Bucket='mybucket') test_string = u"hello žluťoučký\u2028world!\nhow are you?".encode('utf8') with smart_open.open("s3://mybucket/mykey", "wb") as fout: fout.write(test_string) reader = smart_open.open("s3://mybucket/mykey", "rb") self.assertEqual(reader.readline(), u"hello žluťoučký\u2028world!\n".encode("utf-8")) @mock_s3 def test_readline_iter(self): """Does __iter__ return the correct file content?""" s3 = boto3.resource('s3') s3.create_bucket(Bucket='mybucket') lines = [u"всем\u2028привет!\n", u"что нового?"] with smart_open.open("s3://mybucket/mykey", "wb") as fout: fout.write("".join(lines).encode("utf-8")) reader = smart_open.open("s3://mybucket/mykey", "rb") actual_lines = [line.decode("utf-8") for line in reader] self.assertEqual(2, len(actual_lines)) self.assertEqual(lines[0], actual_lines[0]) self.assertEqual(lines[1], actual_lines[1]) @mock_s3 def test_readline_eof(self): """Does readline() return empty string on EOF?""" s3 = boto3.resource('s3')
s3.create_bucket(Bucket='mybucket') with smart_open.open("s3://mybucket/mykey", "wb"): pass with patch_invalid_range_response('0'): reader = smart_open.open("s3://mybucket/mykey", "rb") self.assertEqual(reader.readline(), b"") self.assertEqual(reader.readline(), b"") self.assertEqual(reader.readline(), b"") @mock_s3 def test_s3_iter_lines(self): """Does s3_iter_lines give correct content?""" # create fake bucket and fake key s3 = boto3.resource('s3') s3.create_bucket(Bucket='mybucket') test_string = u"hello žluťoučký\u2028world!\nhow are you?".encode('utf8') with smart_open.open("s3://mybucket/mykey", "wb") as fin: fin.write(test_string) # call s3_iter_lines and check output reader = smart_open.open("s3://mybucket/mykey", "rb") output = list(reader) self.assertEqual(len(output), 2) self.assertEqual(b''.join(output), test_string) # TODO: add more complex test for file:// @mock.patch('smart_open.smart_open_lib._builtin_open') def test_file(self, mock_smart_open): """Is file:// line iterator called correctly?""" prefix = "file://" full_path = '/tmp/test.txt' read_mode = "rb" smart_open_object = smart_open.open(prefix+full_path, read_mode) smart_open_object.__iter__() # called with the correct path? mock_smart_open.assert_called_with(full_path, read_mode, buffering=-1) full_path = '/tmp/test#hash##more.txt' read_mode = "rb" smart_open_object = smart_open.open(prefix+full_path, read_mode) smart_open_object.__iter__() # called with the correct path? mock_smart_open.assert_called_with(full_path, read_mode, buffering=-1) full_path = 'aa#aa' read_mode = "rb" smart_open_object = smart_open.open(full_path, read_mode) smart_open_object.__iter__() # called with the correct path? mock_smart_open.assert_called_with(full_path, read_mode, buffering=-1) short_path = "~/tmp/test.txt" full_path = os.path.expanduser(short_path) @mock.patch(_BUILTIN_OPEN) def test_file_errors(self, mock_smart_open): prefix = "file://" full_path = '/tmp/test.txt' read_mode = "r" short_path = "~/tmp/test.txt" full_path = os.path.expanduser(short_path) smart_open_object = smart_open.open(prefix+short_path, read_mode, errors='strict') smart_open_object.__iter__() # called with the correct expanded path? mock_smart_open.assert_called_with(full_path, read_mode, buffering=-1, errors='strict') @mock.patch(_BUILTIN_OPEN) def test_file_buffering(self, mock_smart_open): smart_open_object = smart_open.open('/tmp/somefile', 'rb', buffering=0) smart_open_object.__iter__() # called with the correct expanded path? mock_smart_open.assert_called_with('/tmp/somefile', 'rb', buffering=0) @mock.patch(_BUILTIN_OPEN) def test_file_buffering2(self, mock_smart_open): smart_open_object = smart_open.open('/tmp/somefile', 'rb', 0) smart_open_object.__iter__() # called with the correct expanded path? mock_smart_open.assert_called_with('/tmp/somefile', 'rb', buffering=0) # couldn't find any project for mocking up HDFS data # TODO: we want to test also a content of the files, not just fnc call params @mock.patch('smart_open.hdfs.subprocess') def test_hdfs(self, mock_subprocess): """Is HDFS line iterator called correctly?""" mock_subprocess.PIPE.return_value = "test" smart_open_object = smart_open.open("hdfs:///tmp/test.txt") smart_open_object.__iter__() # called with the correct params? 
mock_subprocess.Popen.assert_called_with( ["hdfs", "dfs", "-cat", "/tmp/test.txt"], stdout=mock_subprocess.PIPE, ) # second possibility of schema smart_open_object = smart_open.open("hdfs://tmp/test.txt") smart_open_object.__iter__() mock_subprocess.Popen.assert_called_with( ["hdfs", "dfs", "-cat", "/tmp/test.txt"], stdout=mock_subprocess.PIPE, ) @responses.activate def test_webhdfs(self): """Is webhdfs line iterator called correctly""" responses.add(responses.GET, "http://127.0.0.1:8440/webhdfs/v1/path/file", body='line1\nline2', stream=True) smart_open_object = smart_open.open("webhdfs://127.0.0.1:8440/path/file", 'rb') iterator = iter(smart_open_object) self.assertEqual(next(iterator).decode("utf-8"), "line1\n") self.assertEqual(next(iterator).decode("utf-8"), "line2") @responses.activate def test_webhdfs_encoding(self): """Is HDFS line iterator called correctly?""" input_url = "webhdfs://127.0.0.1:8440/path/file" actual_url = 'http://127.0.0.1:8440/webhdfs/v1/path/file' text = u'не для меня прийдёт весна, не для меня дон разольётся' body = text.encode('utf-8') responses.add(responses.GET, actual_url, body=body, stream=True) actual = smart_open.open(input_url, encoding='utf-8').read() self.assertEqual(text, actual) @responses.activate def test_webhdfs_read(self): """Does webhdfs read method work correctly""" responses.add(responses.GET, "http://127.0.0.1:8440/webhdfs/v1/path/file", body='line1\nline2', stream=True) smart_open_object = smart_open.open("webhdfs://127.0.0.1:8440/path/file", 'rb') self.assertEqual(smart_open_object.read().decode("utf-8"), "line1\nline2") @mock_s3 def test_s3_iter_moto(self): """Are S3 files iterated over correctly?""" # a list of strings to test with expected = [b"*" * 5 * 1024**2] + [b'0123456789'] * 1024 + [b"test"] # create fake bucket and fake key s3 = boto3.resource('s3') s3.create_bucket(Bucket='mybucket') tp = dict(s3_min_part_size=5 * 1024**2) with smart_open.open("s3://mybucket/mykey", "wb", transport_params=tp) as fout: # write a single huge line (=full multipart upload) fout.write(expected[0] + b'\n') # write lots of small lines for lineno, line in enumerate(expected[1:-1]): fout.write(line + b'\n') # ...and write the last line too, no newline at the end fout.write(expected[-1]) # connect to fake s3 and read from the fake key we filled above smart_open_object = smart_open.open("s3://mybucket/mykey", 'rb') output = [line.rstrip(b'\n') for line in smart_open_object] self.assertEqual(output, expected) # same thing but using a context manager with smart_open.open("s3://mybucket/mykey", 'rb') as smart_open_object: output = [line.rstrip(b'\n') for line in smart_open_object] self.assertEqual(output, expected) @mock_s3 def test_s3_read_moto(self): """Are S3 files read correctly?""" s3 = boto3.resource('s3') s3.create_bucket(Bucket='mybucket') # write some bogus key so we can check it below content = u"hello wořld\nhow are you?".encode('utf8') with smart_open.open("s3://mybucket/mykey", "wb") as fout: fout.write(content) smart_open_object = smart_open.open("s3://mybucket/mykey", 'rb') self.assertEqual(content[:6], smart_open_object.read(6)) self.assertEqual(content[6:14], smart_open_object.read(8)) # ř is 2 bytes self.assertEqual(content[14:], smart_open_object.read()) # read the rest @mock_s3 def test_s3_seek_moto(self): """Does seeking in S3 files work correctly?""" s3 = boto3.resource('s3') s3.create_bucket(Bucket='mybucket') # write some bogus key so we can check it below content = u"hello wořld\nhow are you?".encode('utf8') with 
smart_open.open("s3://mybucket/mykey", "wb") as fout: fout.write(content) smart_open_object = smart_open.open("s3://mybucket/mykey", 'rb') self.assertEqual(content[:6], smart_open_object.read(6)) self.assertEqual(content[6:14], smart_open_object.read(8)) # ř is 2 bytes smart_open_object.seek(0) self.assertEqual(content, smart_open_object.read()) # no size given => read whole file smart_open_object.seek(0) self.assertEqual(content, smart_open_object.read(-1)) # same thing @mock_s3 def test_s3_tell(self): """Does tell() work when S3 file is opened for text writing? """ s3 = boto3.resource('s3') s3.create_bucket(Bucket='mybucket') with smart_open.open("s3://mybucket/mykey", "w") as fout: fout.write(u"test") # Note that tell() in general returns an opaque number for text files. # See https://docs.python.org/3/library/io.html#io.TextIOBase.tell self.assertEqual(fout.tell(), 4) class SmartOpenS3KwargsTest(unittest.TestCase): @mock.patch('boto3.client') def test_no_kwargs(self, mock_client): smart_open.open('s3://mybucket/mykey', transport_params=dict(defer_seek=True)) mock_client.assert_called_with('s3') @mock.patch('boto3.client') def test_credentials(self, mock_client): smart_open.open('s3://access_id:access_secret@mybucket/mykey', transport_params=dict(defer_seek=True)) mock_client.assert_called_with( 's3', aws_access_key_id='access_id', aws_secret_access_key='access_secret', ) @mock.patch('boto3.client') def test_host(self, mock_client): tp = { 'client_kwargs': { 'S3.Client': {'endpoint_url': 'http://aa.domain.com'}, }, 'defer_seek': True, } smart_open.open("s3://access_id:access_secret@mybucket/mykey", transport_params=tp) mock_client.assert_called_with( 's3', aws_access_key_id='access_id', aws_secret_access_key='access_secret', endpoint_url='http://aa.domain.com', ) @mock.patch('boto3.client') def test_s3_upload(self, mock_client): tp = { 'client_kwargs': { 'S3.Client.create_multipart_upload': { 'ServerSideEncryption': 'AES256', 'ContentType': 'application/json', } } } smart_open.open("s3://bucket/key", 'wb', transport_params=tp) mock_client.return_value.create_multipart_upload.assert_called_with( Bucket='bucket', Key='key', ServerSideEncryption='AES256', ContentType='application/json', ) class SmartOpenTest(unittest.TestCase): """ Test reading and writing from/into files. """ def setUp(self): self.as_text = u'куда идём мы с пятачком - большой большой секрет' self.as_bytes = self.as_text.encode('utf-8') self.stringio = io.StringIO(self.as_text) self.bytesio = io.BytesIO(self.as_bytes) def test_file_mode_mock(self): """Are file:// open modes passed correctly?""" # correct read modes # # We always open files in binary mode first, but engage # encoders/decoders as necessary. Instead of checking how the file # _initially_ got opened, we now also check the end result: if the # contents got decoded correctly. 
# def test_text(self): with mock.patch(_BUILTIN_OPEN, mock.Mock(return_value=self.stringio)) as mock_open: with smart_open.open("blah", "r", encoding='utf-8') as fin: self.assertEqual(fin.read(), self.as_text) mock_open.assert_called_with("blah", "r", buffering=-1, encoding='utf-8') def test_binary(self): with mock.patch(_BUILTIN_OPEN, mock.Mock(return_value=self.bytesio)) as mock_open: with smart_open.open("blah", "rb") as fin: self.assertEqual(fin.read(), self.as_bytes) mock_open.assert_called_with("blah", "rb", buffering=-1) def test_expanded_path(self): short_path = "~/blah" full_path = os.path.expanduser(short_path) with mock.patch(_BUILTIN_OPEN, mock.Mock(return_value=self.stringio)) as mock_open: with smart_open.open(short_path, "rb"): mock_open.assert_called_with(full_path, "rb", buffering=-1) def test_incorrect(self): # incorrect file mode self.assertRaises(NotImplementedError, smart_open.smart_open, "s3://bucket/key", "x") # correct write modes, incorrect scheme self.assertRaises(NotImplementedError, smart_open.smart_open, "hdfs:///blah.txt", "wb+") self.assertRaises(NotImplementedError, smart_open.smart_open, "http:///blah.txt", "w") self.assertRaises(NotImplementedError, smart_open.smart_open, "s3://bucket/key", "wb+") def test_write_utf8(self): # correct write mode, correct file:// URI with mock.patch(_BUILTIN_OPEN, mock.Mock(return_value=self.stringio)) as mock_open: with smart_open.open("blah", "w", encoding='utf-8') as fout: mock_open.assert_called_with("blah", "w", buffering=-1, encoding='utf-8') fout.write(self.as_text) def test_write_utf8_absolute_path(self): with mock.patch(_BUILTIN_OPEN, mock.Mock(return_value=self.stringio)) as mock_open: with smart_open.open("/some/file.txt", "w", encoding='utf-8') as fout: mock_open.assert_called_with("/some/file.txt", "w", buffering=-1, encoding='utf-8') fout.write(self.as_text) def test_append_utf8(self): with mock.patch(_BUILTIN_OPEN, mock.Mock(return_value=self.stringio)) as mock_open: with smart_open.open("/some/file.txt", "w+", encoding='utf-8') as fout: mock_open.assert_called_with("/some/file.txt", "w+", buffering=-1, encoding='utf-8') fout.write(self.as_text) def test_append_binary_absolute_path(self): with mock.patch(_BUILTIN_OPEN, mock.Mock(return_value=self.bytesio)) as mock_open: with smart_open.open("/some/file.txt", "wb+") as fout: mock_open.assert_called_with("/some/file.txt", "wb+", buffering=-1) fout.write(self.as_bytes) def test_newline(self): with mock.patch(_BUILTIN_OPEN, mock.Mock(return_value=self.bytesio)) as mock_open: smart_open.open("/some/file.txt", "wb+", newline='\n') mock_open.assert_called_with("/some/file.txt", "wb+", buffering=-1, newline='\n') def test_newline_csv(self): # # See https://github.com/RaRe-Technologies/smart_open/issues/477 # rows = [{'name': 'alice\u2028beatrice', 'color': 'aqua'}, {'name': 'bob', 'color': 'blue'}] expected = 'name,color\r\nalice\u2028beatrice,aqua\r\nbob,blue\r\n' with named_temporary_file(mode='w') as tmp: # The csv module recommends using newline='' when opening files and letting # the csv writer handle line endings. By default it uses the 'excel' dialect which # emits \r\n as line terminator. 
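            # Passing newline='' through smart_open.open disables any further
            # newline translation, so the \r\n produced by the csv writer
            # reaches the file unchanged.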
with smart_open.open(tmp.name, 'w+', encoding='utf-8', newline='') as fout: out = csv.DictWriter(fout, fieldnames=['name', 'color']) out.writeheader() out.writerows(rows) with open(tmp.name, 'r', encoding='utf-8', newline='') as fin: content = fin.read() assert content == expected @mock.patch('boto3.client') def test_s3_mode_mock(self, mock_client): """Are s3:// open modes passed correctly?""" # correct write mode, correct s3 URI transport_params = { 'client_kwargs': { 'S3.Client': {'endpoint_url': 'http://s3.amazonaws.com'}, } } smart_open.open("s3://mybucket/mykey", "w", transport_params=transport_params) mock_client.assert_called_with('s3', endpoint_url='http://s3.amazonaws.com') @mock.patch('smart_open.hdfs.subprocess') def test_hdfs(self, mock_subprocess): """Is HDFS write called correctly""" smart_open_object = smart_open.open("hdfs:///tmp/test.txt", 'wb') smart_open_object.write("test") # called with the correct params? mock_subprocess.Popen.assert_called_with( ["hdfs", "dfs", "-put", "-f", "-", "/tmp/test.txt"], stdin=mock_subprocess.PIPE ) # second possibility of schema smart_open_object = smart_open.open("hdfs://tmp/test.txt", 'wb') smart_open_object.write("test") mock_subprocess.Popen.assert_called_with( ["hdfs", "dfs", "-put", "-f", "-", "/tmp/test.txt"], stdin=mock_subprocess.PIPE ) @mock_s3 def test_s3_modes_moto(self): """Do s3:// open modes work correctly?""" # fake bucket and key s3 = boto3.resource('s3') s3.create_bucket(Bucket='mybucket') raw_data = b"second test" # correct write mode, correct s3 URI with smart_open.open("s3://mybucket/newkey", "wb") as fout: logger.debug('fout: %r', fout) fout.write(raw_data) logger.debug("write successfully completed") output = list(smart_open.open("s3://mybucket/newkey", "rb")) self.assertEqual(output, [raw_data]) @mock_s3 def test_s3_metadata_write(self): # Read local file fixture path = os.path.join(CURR_DIR, 'test_data/crime-and-punishment.txt.gz') data = "" with smart_open.open(path, 'rb') as fd: data = fd.read() # Create a test bucket s3 = boto3.resource('s3') s3.create_bucket(Bucket='mybucket') tp = { 'client_kwargs': { 'S3.Client.create_multipart_upload': { 'ContentType': 'text/plain', 'ContentEncoding': 'gzip', } } } # Write data, with multipart_upload options write_stream = smart_open.open( 's3://mybucket/crime-and-punishment.txt.gz', 'wb', transport_params=tp, ) with write_stream as fout: fout.write(data) key = s3.Object('mybucket', 'crime-and-punishment.txt.gz') self.assertIn('text/plain', key.content_type) self.assertEqual(key.content_encoding, 'gzip') @mock_s3 def test_write_bad_encoding_strict(self): """Should abort on encoding error.""" text = u'欲しい気持ちが成長しすぎて' with self.assertRaises(UnicodeEncodeError): with named_temporary_file('wb', delete=True) as infile: with smart_open.open(infile.name, 'w', encoding='koi8-r', errors='strict') as fout: fout.write(text) @mock_s3 def test_write_bad_encoding_replace(self): """Should replace characters that failed to encode.""" text = u'欲しい気持ちが成長しすぎて' expected = u'?' * len(text) with named_temporary_file('wb', delete=True) as infile: with smart_open.open(infile.name, 'w', encoding='koi8-r', errors='replace') as fout: fout.write(text) with smart_open.open(infile.name, 'r', encoding='koi8-r') as fin: actual = fin.read() self.assertEqual(expected, actual) class WebHdfsWriteTest(unittest.TestCase): """ Test writing into webhdfs files. 
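
    The WebHDFS REST endpoints are faked with the ``responses`` library, so
    these tests do not require a running Hadoop cluster.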
""" @responses.activate def test_initialize_write(self): def request_callback(_): resp_body = "" headers = {'location': 'http://127.0.0.1:8440/file'} return 307, headers, resp_body responses.add_callback( responses.PUT, "http://127.0.0.1:8440/webhdfs/v1/path/file", callback=request_callback, ) responses.add( responses.PUT, "http://127.0.0.1:8440/file", status=201, ) smart_open.open("webhdfs://127.0.0.1:8440/path/file", 'wb') assert len(responses.calls) == 2 path, params = responses.calls[0].request.url.split("?") assert path == "http://127.0.0.1:8440/webhdfs/v1/path/file" assert params == "overwrite=True&op=CREATE" or params == "op=CREATE&overwrite=True" assert responses.calls[1].request.url == "http://127.0.0.1:8440/file" @responses.activate def test_write(self): def request_callback(_): resp_body = "" headers = {'location': 'http://127.0.0.1:8440/file'} return 307, headers, resp_body responses.add_callback( responses.PUT, "http://127.0.0.1:8440/webhdfs/v1/path/file", callback=request_callback, ) responses.add(responses.PUT, "http://127.0.0.1:8440/file", status=201) smart_open_object = smart_open.open("webhdfs://127.0.0.1:8440/path/file", 'wb') def write_callback(request): assert request.body == u"žluťoučký koníček".encode('utf8') headers = {} return 200, headers, "" test_string = u"žluťoučký koníček".encode('utf8') responses.add_callback( responses.POST, "http://127.0.0.1:8440/webhdfs/v1/path/file", callback=request_callback, ) responses.add_callback( responses.POST, "http://127.0.0.1:8440/file", callback=write_callback, ) smart_open_object.write(test_string) smart_open_object.close() assert len(responses.calls) == 4 assert responses.calls[2].request.url == "http://127.0.0.1:8440/webhdfs/v1/path/file?op=APPEND" # noqa assert responses.calls[3].request.url == "http://127.0.0.1:8440/file" _DECOMPRESSED_DATA = "не слышны в саду даже шорохи".encode("utf-8") _MOCK_TIME = mock.Mock(return_value=1620256567) def gzip_compress(data, filename=None): # # gzip.compress is sensitive to the current time and the destination filename. # This function fixes those variables for consistent compression results. 
# buf = io.BytesIO() buf.name = filename with mock.patch('time.time', _MOCK_TIME): gzip.GzipFile(fileobj=buf, mode='w').write(data) return buf.getvalue() class CompressionFormatTest(parameterizedtestcase.ParameterizedTestCase): """Test transparent (de)compression.""" def write_read_assertion(self, suffix): test_file = make_buffer(name='file' + suffix) with smart_open.open(test_file, 'wb') as fout: fout.write(SAMPLE_BYTES) self.assertNotEqual(SAMPLE_BYTES, test_file._value_when_closed) # we have to recreate the buffer because it is closed test_file = make_buffer(initial_value=test_file._value_when_closed, name=test_file.name) with smart_open.open(test_file, 'rb') as fin: self.assertEqual(fin.read(), SAMPLE_BYTES) def test_open_gz(self): """Can open gzip?""" fpath = os.path.join(CURR_DIR, 'test_data/crlf_at_1k_boundary.warc.gz') with smart_open.open(fpath, 'rb') as infile: data = infile.read() m = hashlib.md5(data) assert m.hexdigest() == '18473e60f8c7c98d29d65bf805736a0d', 'Failed to read gzip' @parameterizedtestcase.ParameterizedTestCase.parameterize( ("extension", "compressed"), [ (".gz", gzip_compress(_DECOMPRESSED_DATA, 'key')), (".bz2", bz2.compress(_DECOMPRESSED_DATA)), ], ) def test_closes_compressed_stream(self, extension, compressed): """Transparent compression closes the compressed stream?""" compressed_stream = make_buffer(initial_value=compressed, name=f"file{extension}") with smart_open.open(compressed_stream, encoding="utf-8"): pass assert compressed_stream.close.call_count == 1 def test_write_read_gz(self): """Can write and read gzip?""" self.write_read_assertion('.gz') def test_write_read_bz2(self): """Can write and read bz2?""" self.write_read_assertion('.bz2') def test_gzip_text(self): with named_temporary_file(suffix='.gz') as f: with smart_open.open(f.name, 'wt') as fout: fout.write('hello world') with smart_open.open(f.name, 'rt') as fin: assert fin.read() == 'hello world' class MultistreamsBZ2Test(unittest.TestCase): """ Test that multistream bzip2 compressed files can be read. """ # note: these tests are derived from the Python 3.x tip bz2 tests. 
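    # A "multistream" .bz2 file is simply several complete bz2 streams
    # concatenated back to back; create_temp_bz2() below builds one by
    # repeating DATA (a single compressed stream of TEXT) `streams` times.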
TEXT_LINES = [ b'root:x:0:0:root:/root:/bin/bash\n', b'bin:x:1:1:bin:/bin:\n', b'daemon:x:2:2:daemon:/sbin:\n', b'adm:x:3:4:adm:/var/adm:\n', b'lp:x:4:7:lp:/var/spool/lpd:\n', b'sync:x:5:0:sync:/sbin:/bin/sync\n', b'shutdown:x:6:0:shutdown:/sbin:/sbin/shutdown\n', b'halt:x:7:0:halt:/sbin:/sbin/halt\n', b'mail:x:8:12:mail:/var/spool/mail:\n', b'news:x:9:13:news:/var/spool/news:\n', b'uucp:x:10:14:uucp:/var/spool/uucp:\n', b'operator:x:11:0:operator:/root:\n', b'games:x:12:100:games:/usr/games:\n', b'gopher:x:13:30:gopher:/usr/lib/gopher-data:\n', b'ftp:x:14:50:FTP User:/var/ftp:/bin/bash\n', b'nobody:x:65534:65534:Nobody:/home:\n', b'postfix:x:100:101:postfix:/var/spool/postfix:\n', b'niemeyer:x:500:500::/home/niemeyer:/bin/bash\n', b'postgres:x:101:102:PostgreSQL Server:/var/lib/pgsql:/bin/bash\n', b'mysql:x:102:103:MySQL server:/var/lib/mysql:/bin/bash\n', b'www:x:103:104::/var/www:/bin/false\n', ] TEXT = b''.join(TEXT_LINES) DATA = ( b'BZh91AY&SY.\xc8N\x18\x00\x01>_\x80\x00\x10@\x02\xff\xf0\x01\x07n\x00?\xe7\xff\xe00\x01\x99\xaa\x00' b'\xc0\x03F\x86\x8c#&\x83F\x9a\x03\x06\xa6\xd0\xa6\x93M\x0fQ\xa7\xa8\x06\x804hh\x12$\x11\xa4i4\xf14S' b'\xd2\x88\xe5\xcd9gd6\x0b\n\xe9\x9b' b'\xd5\x8a\x99\xf7\x08.K\x8ev\xfb\xf7xw\xbb\xdf\xa1\x92\xf1\xdd|/";\xa2\xba\x9f\xd5\xb1#A\xb6\xf6' b'\xb3o\xc9\xc5y\\\xebO\xe7\x85\x9a\xbc\xb6f8\x952\xd5\xd7"%\x89>V,\xf7\xa6z\xe2\x9f\xa3\xdf\x11' b'\x11"\xd6E)I\xa9\x13^\xca\xf3r\xd0\x03U\x922\xf26\xec\xb6\xed\x8b\xc3U\x13\x9d\xc5\x170\xa4\xfa^' b'\x92\xacDF\x8a\x97\xd6\x19\xfe\xdd\xb8\xbd\x1a\x9a\x19\xa3\x80ankR\x8b\xe5\xd83]\xa9\xc6\x08' b'\x82f\xf6\xb9"6l$\xb8j@\xc0\x8a\xb0l1..\xbak\x83ls\x15\xbc\xf4\xc1\x13\xbe\xf8E\xb8\x9d\r\xa8\x9dk' b'\x84\xd3n\xfa\xacQ\x07\xb1%y\xaav\xb4\x08\xe0z\x1b\x16\xf5\x04\xe9\xcc\xb9\x08z\x1en7.G\xfc]\xc9' b'\x14\xe1B@\xbb!8`' ) def create_temp_bz2(self, streams=1): with named_temporary_file('wb', suffix='.bz2', delete=False) as f: f.write(self.DATA * streams) return f.name def cleanup_temp_bz2(self, test_file): if os.path.isfile(test_file): os.unlink(test_file) def test_can_read_multistream_bz2(self): from bz2 import BZ2File test_file = self.create_temp_bz2(streams=5) with BZ2File(test_file) as bz2f: self.assertEqual(bz2f.read(), self.TEXT * 5) self.cleanup_temp_bz2(test_file) def test_file_smart_open_can_read_multistream_bz2(self): test_file = self.create_temp_bz2(streams=5) with smart_open_lib.open(test_file, 'rb') as bz2f: self.assertEqual(bz2f.read(), self.TEXT * 5) self.cleanup_temp_bz2(test_file) class S3OpenTest(unittest.TestCase): @mock_s3 def test_r(self): """Reading a UTF string should work.""" text = u"физкульт-привет!" 
s3 = boto3.resource('s3') s3.create_bucket(Bucket='bucket') key = s3.Object('bucket', 'key') key.put(Body=text.encode('utf-8')) with smart_open.open('s3://bucket/key', "rb") as fin: self.assertEqual(fin.read(), text.encode('utf-8')) with smart_open.open('s3://bucket/key', "r", encoding='utf-8') as fin: self.assertEqual(fin.read(), text) def test_bad_mode(self): """Bad mode should raise and exception.""" uri = smart_open_lib._parse_uri("s3://bucket/key") self.assertRaises(NotImplementedError, smart_open.open, uri, "x") @mock_s3 def test_rw_encoding(self): """Should read and write text, respecting encodings, etc.""" s3 = boto3.resource('s3') s3.create_bucket(Bucket='bucket') key = "s3://bucket/key" text = u"расцветали яблони и груши" with smart_open.open(key, "w", encoding="koi8-r") as fout: fout.write(text) with smart_open.open(key, "r", encoding="koi8-r") as fin: self.assertEqual(text, fin.read()) with smart_open.open(key, "rb") as fin: self.assertEqual(text.encode("koi8-r"), fin.read()) with smart_open.open(key, "r", encoding="euc-jp") as fin: self.assertRaises(UnicodeDecodeError, fin.read) with smart_open.open(key, "r", encoding="euc-jp", errors="replace") as fin: fin.read() @mock_s3 def test_rw_gzip(self): """Should read/write gzip files, implicitly and explicitly.""" s3 = boto3.resource('s3') s3.create_bucket(Bucket='bucket') key = "s3://bucket/key.gz" text = u"не слышны в саду даже шорохи" with smart_open.open(key, "wb") as fout: fout.write(text.encode("utf-8")) # # Check that what we've created is a gzip. # with smart_open.open(key, "rb", ignore_ext=True) as fin: gz = gzip.GzipFile(fileobj=fin) self.assertEqual(gz.read().decode("utf-8"), text) # # We should be able to read it back as well. # with smart_open.open(key, "rb") as fin: self.assertEqual(fin.read().decode("utf-8"), text) @mock_s3 @mock.patch('smart_open.smart_open_lib._inspect_kwargs', mock.Mock(return_value={})) def test_gzip_write_mode(self): """Should always open in binary mode when writing through a codec.""" s3 = boto3.resource('s3') s3.create_bucket(Bucket='bucket') with mock.patch('smart_open.s3.open', return_value=open(__file__, 'rb')) as mock_open: smart_open.open("s3://bucket/key.gz", "wb") mock_open.assert_called_with('bucket', 'key.gz', 'wb') @mock_s3 @mock.patch('smart_open.smart_open_lib._inspect_kwargs', mock.Mock(return_value={})) def test_gzip_read_mode(self): """Should always open in binary mode when reading through a codec.""" s3 = boto3.resource('s3') s3.create_bucket(Bucket='bucket') key = "s3://bucket/key.gz" text = u"если-б я был султан и имел трёх жён, то тройной красотой был бы окружён" with smart_open.open(key, "wb") as fout: fout.write(text.encode("utf-8")) with mock.patch('smart_open.s3.open', return_value=open(__file__)) as mock_open: smart_open.open(key, "r") mock_open.assert_called_with('bucket', 'key.gz', 'rb') @mock_s3 def test_read_encoding(self): """Should open the file with the correct encoding, explicit text read.""" s3 = boto3.resource('s3') s3.create_bucket(Bucket='bucket') key = "s3://bucket/key.txt" text = u'это знала ева, это знал адам, колеса любви едут прямо по нам' with smart_open.open(key, 'wb') as fout: fout.write(text.encode('koi8-r')) with smart_open.open(key, 'r', encoding='koi8-r') as fin: actual = fin.read() self.assertEqual(text, actual) @mock_s3 def test_read_encoding_implicit_text(self): """Should open the file with the correct encoding, implicit text read.""" s3 = boto3.resource('s3') s3.create_bucket(Bucket='bucket') key = "s3://bucket/key.txt" text = u'это знала ева, 
это знал адам, колеса любви едут прямо по нам' with smart_open.open(key, 'wb') as fout: fout.write(text.encode('koi8-r')) with smart_open.open(key, encoding='koi8-r') as fin: actual = fin.read() self.assertEqual(text, actual) @mock_s3 def test_write_encoding(self): """Should open the file for writing with the correct encoding.""" s3 = boto3.resource('s3') s3.create_bucket(Bucket='bucket') key = "s3://bucket/key.txt" text = u'какая боль, какая боль, аргентина - ямайка, 5-0' with smart_open.open(key, 'w', encoding='koi8-r') as fout: fout.write(text) with smart_open.open(key, encoding='koi8-r') as fin: actual = fin.read() self.assertEqual(text, actual) @mock_s3 def test_write_bad_encoding_strict(self): """Should open the file for writing with the correct encoding.""" s3 = boto3.resource('s3') s3.create_bucket(Bucket='bucket') key = "s3://bucket/key.txt" text = u'欲しい気持ちが成長しすぎて' with self.assertRaises(UnicodeEncodeError): with smart_open.open(key, 'w', encoding='koi8-r', errors='strict') as fout: fout.write(text) @mock_s3 def test_write_bad_encoding_replace(self): """Should open the file for writing with the correct encoding.""" s3 = boto3.resource('s3') s3.create_bucket(Bucket='bucket') key = "s3://bucket/key.txt" text = u'欲しい気持ちが成長しすぎて' expected = u'?' * len(text) with smart_open.open(key, 'w', encoding='koi8-r', errors='replace') as fout: fout.write(text) with smart_open.open(key, encoding='koi8-r') as fin: actual = fin.read() self.assertEqual(expected, actual) @mock_s3 def test_write_text_gzip(self): """Should open the file for writing with the correct encoding.""" s3 = boto3.resource('s3') s3.create_bucket(Bucket='bucket') key = "s3://bucket/key.txt.gz" text = u'какая боль, какая боль, аргентина - ямайка, 5-0' with smart_open.open(key, 'w', encoding='utf-8') as fout: fout.write(text) with smart_open.open(key, 'r', encoding='utf-8') as fin: actual = fin.read() self.assertEqual(text, actual) @mock.patch('smart_open.s3.Reader') def test_transport_params_is_not_mutable(self, mock_open): smart_open.open('s3://access_key:secret_key@host@bucket/key') actual = mock_open.call_args_list[0][1]['client_kwargs'] expected = { 'S3.Client': { 'aws_access_key_id': 'access_key', 'aws_secret_access_key': 'secret_key', 'endpoint_url': 'https://host:443', } } assert actual == expected smart_open.open('s3://bucket/key') actual = mock_open.call_args_list[1][1].get('client_kwargs') assert actual is None @mock.patch('smart_open.s3.Reader') def test_respects_endpoint_url_read(self, mock_open): url = 's3://key_id:secret_key@play.min.io:9000@smart-open-test/README.rst' smart_open.open(url) expected = { 'aws_access_key_id': 'key_id', 'aws_secret_access_key': 'secret_key', 'endpoint_url': 'https://play.min.io:9000', } self.assertEqual(mock_open.call_args[1]['client_kwargs']['S3.Client'], expected) @mock.patch('smart_open.s3.MultipartWriter') def test_respects_endpoint_url_write(self, mock_open): url = 's3://key_id:secret_key@play.min.io:9000@smart-open-test/README.rst' smart_open.open(url, 'wb') expected = { 'aws_access_key_id': 'key_id', 'aws_secret_access_key': 'secret_key', 'endpoint_url': 'https://play.min.io:9000', } self.assertEqual(mock_open.call_args[1]['client_kwargs']['S3.Client'], expected) def function(a, b, c, foo='bar', baz='boz'): pass class CheckKwargsTest(unittest.TestCase): def test(self): kwargs = {'foo': 123, 'bad': False} expected = {'foo': 123} actual = smart_open.smart_open_lib._check_kwargs(function, kwargs) self.assertEqual(expected, actual) @mock_s3 @mock.patch('time.time', _MOCK_TIME) class 
S3CompressionTestCase(parameterizedtestcase.ParameterizedTestCase): def setUp(self): s3 = boto3.resource("s3") bucket = s3.create_bucket(Bucket="bucket") bucket.wait_until_exists() bucket.Object('gzipped').put(Body=gzip_compress(_DECOMPRESSED_DATA)) bucket.Object('bzipped').put(Body=bz2.compress(_DECOMPRESSED_DATA)) def test_gzip_compress_sanity(self): """Does our gzip_compress function actually produce gzipped data?""" assert gzip.decompress(gzip_compress(_DECOMPRESSED_DATA)) == _DECOMPRESSED_DATA @parameterizedtestcase.ParameterizedTestCase.parameterize( ("url", "_compression"), [ ("s3://bucket/gzipped", ".gz"), ("s3://bucket/bzipped", ".bz2"), ] ) def test_read_explicit(self, url, _compression): """Can we read using the explicitly specified compression?""" with smart_open.open(url, 'rb', compression=_compression) as fin: assert fin.read() == _DECOMPRESSED_DATA @parameterizedtestcase.ParameterizedTestCase.parameterize( ("_compression", "expected"), [ (".gz", gzip_compress(_DECOMPRESSED_DATA, 'key')), (".bz2", bz2.compress(_DECOMPRESSED_DATA)), ], ) def test_write_explicit(self, _compression, expected): """Can we write using the explicitly specified compression?""" with smart_open.open("s3://bucket/key", "wb", compression=_compression) as fout: fout.write(_DECOMPRESSED_DATA) with smart_open.open("s3://bucket/key", "rb", compression=NO_COMPRESSION) as fin: assert fin.read() == expected @parameterizedtestcase.ParameterizedTestCase.parameterize( ("url", "_compression", "expected"), [ ("s3://bucket/key.gz", ".gz", gzip_compress(_DECOMPRESSED_DATA, 'key.gz')), ("s3://bucket/key.bz2", ".bz2", bz2.compress(_DECOMPRESSED_DATA)), ], ) def test_write_implicit(self, url, _compression, expected): """Can we determine the compression from the file extension?""" with smart_open.open(url, "wb", compression=INFER_FROM_EXTENSION) as fout: fout.write(_DECOMPRESSED_DATA) with smart_open.open(url, "rb", compression=NO_COMPRESSION) as fin: assert fin.read() == expected @parameterizedtestcase.ParameterizedTestCase.parameterize( ("url", "_compression", "expected"), [ ("s3://bucket/key.gz", ".gz", gzip_compress(_DECOMPRESSED_DATA, 'key.gz')), ("s3://bucket/key.bz2", ".bz2", bz2.compress(_DECOMPRESSED_DATA)), ], ) def test_ignore_ext(self, url, _compression, expected): """Can we handle the deprecated ignore_ext parameter when reading/writing?""" with smart_open.open(url, "wb") as fout: fout.write(_DECOMPRESSED_DATA) with smart_open.open(url, "rb", ignore_ext=True) as fin: assert fin.read() == expected @parameterizedtestcase.ParameterizedTestCase.parameterize( ("extension", "kwargs", "error"), [ ("", dict(compression="foo"), ValueError), ("", dict(compression="foo", ignore_ext=True), ValueError), ("", dict(compression=NO_COMPRESSION, ignore_ext=True), ValueError), ( ".gz", dict(compression=INFER_FROM_EXTENSION, ignore_ext=True), ValueError, ), ( ".bz2", dict(compression=INFER_FROM_EXTENSION, ignore_ext=True), ValueError, ), ("", dict(compression=".gz", ignore_ext=True), ValueError), ("", dict(compression=".bz2", ignore_ext=True), ValueError), ], ) def test_compression_invalid(self, extension, kwargs, error): """Should detect and error on these invalid inputs""" with pytest.raises(error): smart_open.open(f"s3://bucket/key{extension}", "wb", **kwargs) with pytest.raises(error): smart_open.open(f"s3://bucket/key{extension}", "rb", **kwargs) class GetBinaryModeTest(parameterizedtestcase.ParameterizedTestCase): @parameterizedtestcase.ParameterizedTestCase.parameterize( ('mode', 'expected'), [ ('r', 'rb'), ('r+', 'rb+'), 
('rt', 'rb'), ('rt+', 'rb+'), ('r+t', 'rb+'), ('w', 'wb'), ('w+', 'wb+'), ('wt', 'wb'), ('wt+', 'wb+'), ('w+t', 'wb+'), ('a', 'ab'), ('a+', 'ab+'), ('at', 'ab'), ('at+', 'ab+'), ('a+t', 'ab+'), ] ) def test(self, mode, expected): actual = smart_open.smart_open_lib._get_binary_mode(mode) assert actual == expected @parameterizedtestcase.ParameterizedTestCase.parameterize( ('mode', ), [ ('rw', ), ('rwa', ), ('rbt', ), ('r++', ), ('+', ), ('x', ), ] ) def test_bad(self, mode): self.assertRaises(ValueError, smart_open.smart_open_lib._get_binary_mode, mode) def test_backwards_compatibility_wrapper(): fpath = os.path.join(CURR_DIR, 'test_data/crime-and-punishment.txt') expected = open(fpath, 'rb').readline() with warnings.catch_warnings(): warnings.simplefilter('ignore') actual = smart_open.smart_open(fpath).readline() assert expected == actual actual = smart_open.smart_open(fpath, ignore_extension=True).readline() assert expected == actual with pytest.raises(DeprecationWarning): smart_open.smart_open(fpath, unsupported_keyword_param=123) if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() smart_open-5.2.1/smart_open/tests/test_ssh.py000066400000000000000000000035721411241424400214110ustar00rootroot00000000000000# -*- coding: utf-8 -*- import logging import unittest from unittest import mock import smart_open.ssh def mock_ssh(func): def wrapper(*args, **kwargs): smart_open.ssh._SSH.clear() return func(*args, **kwargs) return mock.patch("paramiko.client.SSHClient.get_transport")( mock.patch("paramiko.client.SSHClient.connect")(wrapper) ) class SSHOpen(unittest.TestCase): @mock_ssh def test_open(self, mock_connect, get_transp_mock): smart_open.open("ssh://user:pass@some-host/") mock_connect.assert_called_with("some-host", 22, username="user", password="pass") @mock_ssh def test_percent_encoding(self, mock_connect, get_transp_mock): smart_open.open("ssh://user%3a:pass%40@some-host/") mock_connect.assert_called_with("some-host", 22, username="user:", password="pass@") @mock_ssh def test_open_without_password(self, mock_connect, get_transp_mock): smart_open.open("ssh://user@some-host/") mock_connect.assert_called_with("some-host", 22, username="user", password=None) @mock_ssh def test_open_with_transport_params(self, mock_connect, get_transp_mock): smart_open.open( "ssh://user:pass@some-host/", transport_params={"connect_kwargs": {"username": "ubuntu", "password": "pwd"}}, ) mock_connect.assert_called_with("some-host", 22, username="ubuntu", password="pwd") @mock_ssh def test_open_with_key_filename(self, mock_connect, get_transp_mock): smart_open.open( "ssh://user@some-host/", transport_params={"connect_kwargs": {"key_filename": "key"}}, ) mock_connect.assert_called_with("some-host", 22, username="user", key_filename="key") if __name__ == "__main__": logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.DEBUG) unittest.main() smart_open-5.2.1/smart_open/tests/test_transport.py000066400000000000000000000014771411241424400226520ustar00rootroot00000000000000# -*- coding: utf-8 -*- import pytest import unittest from smart_open.transport import register_transport, get_transport class TransportTest(unittest.TestCase): def test_registry_requires_declared_schemes(self): with pytest.raises(ValueError): register_transport('smart_open.tests.fixtures.no_schemes_transport') def test_registry_errors_on_double_register_scheme(self): register_transport('smart_open.tests.fixtures.good_transport') with 
pytest.raises(AssertionError): register_transport('smart_open.tests.fixtures.good_transport') def test_registry_errors_get_transport_for_module_with_missing_deps(self): register_transport('smart_open.tests.fixtures.missing_deps_transport') with pytest.raises(ImportError): get_transport("missing") smart_open-5.2.1/smart_open/tests/test_utils.py000066400000000000000000000031511411241424400217450ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2019 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). # import urllib.parse import pytest import smart_open.utils @pytest.mark.parametrize( 'value,minval,maxval,expected', [ (5, 0, 10, 5), (11, 0, 10, 10), (-1, 0, 10, 0), (10, 0, None, 10), (-10, 0, None, 0), ] ) def test_clamp(value, minval, maxval, expected): assert smart_open.utils.clamp(value, minval=minval, maxval=maxval) == expected @pytest.mark.parametrize( 'value,params,expected', [ (10, {}, 10), (-10, {}, 0), (-10, {'minval': -5}, -5), (10, {'maxval': 5}, 5), ] ) def test_clamp_defaults(value, params, expected): assert smart_open.utils.clamp(value, **params) == expected def test_check_kwargs(): import smart_open.s3 kallable = smart_open.s3.open kwargs = {'client': 'foo', 'unsupported': 'bar', 'client_kwargs': 'boaz'} supported = smart_open.utils.check_kwargs(kallable, kwargs) assert supported == {'client': 'foo', 'client_kwargs': 'boaz'} @pytest.mark.parametrize( 'url,expected', [ ('s3://bucket/key', ('s3', 'bucket', '/key', '', '')), ('s3://bucket/key?', ('s3', 'bucket', '/key?', '', '')), ('s3://bucket/???', ('s3', 'bucket', '/???', '', '')), ('https://host/path?foo=bar', ('https', 'host', '/path', 'foo=bar', '')), ] ) def test_safe_urlsplit(url, expected): actual = smart_open.utils.safe_urlsplit(url) assert actual == urllib.parse.SplitResult(*expected) smart_open-5.2.1/smart_open/transport.py000066400000000000000000000063611411241424400204460ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2020 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). # """Maintains a registry of transport mechanisms. The main entrypoint is :func:`get_transport`. See also :file:`extending.md`. """ import importlib import logging import smart_open.local_file logger = logging.getLogger(__name__) NO_SCHEME = '' _REGISTRY = {NO_SCHEME: smart_open.local_file} _ERRORS = {} _MISSING_DEPS_ERROR = """You are trying to use the %(module)s functionality of smart_open but you do not have the correct %(module)s dependencies installed. Try: pip install smart_open[%(module)s] """ def register_transport(submodule): """Register a submodule as a transport mechanism for ``smart_open``. This module **must** have: - `SCHEME` attribute (or `SCHEMES`, if the submodule supports multiple schemes) - `open` function - `open_uri` function - `parse_uri' function Once registered, you can get the submodule by calling :func:`get_transport`. 
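
    A minimal conforming submodule looks roughly like the following sketch
    (illustrative only; the signatures mirror the bundled transports such as
    :mod:`smart_open.webhdfs`)::

        SCHEME = 'myscheme'

        def parse_uri(uri_as_str):
            ...

        def open_uri(uri, mode, transport_params):
            ...

        def open(uri, mode, **kwargs):
            ...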
""" global _REGISTRY, _ERRORS module_name = submodule if isinstance(submodule, str): try: submodule = importlib.import_module(submodule) except ImportError: return else: module_name = submodule.__name__ # Save only the last module name piece module_name = module_name.rsplit(".")[-1] if hasattr(submodule, 'SCHEME'): schemes = [submodule.SCHEME] elif hasattr(submodule, 'SCHEMES'): schemes = submodule.SCHEMES else: raise ValueError('%r does not have a .SCHEME or .SCHEMES attribute' % submodule) for f in ('open', 'open_uri', 'parse_uri'): assert hasattr(submodule, f), '%r is missing %r' % (submodule, f) for scheme in schemes: assert scheme not in _REGISTRY if getattr(submodule, "MISSING_DEPS", False): _ERRORS[scheme] = module_name else: _REGISTRY[scheme] = submodule def get_transport(scheme): """Get the submodule that handles transport for the specified scheme. This submodule must have been previously registered via :func:`register_transport`. """ global _ERRORS, _MISSING_DEPS_ERROR, _REGISTRY, SUPPORTED_SCHEMES expected = SUPPORTED_SCHEMES readme_url = 'https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst' message = ( "Unable to handle scheme %(scheme)r, expected one of %(expected)r. " "Extra dependencies required by %(scheme)r may be missing. " "See <%(readme_url)s> for details." % locals() ) if scheme in _ERRORS: raise ImportError(_MISSING_DEPS_ERROR % dict(module=_ERRORS[scheme])) if scheme in _REGISTRY: return _REGISTRY[scheme] raise NotImplementedError(message) register_transport(smart_open.local_file) register_transport('smart_open.azure') register_transport('smart_open.gcs') register_transport('smart_open.hdfs') register_transport('smart_open.http') register_transport('smart_open.s3') register_transport('smart_open.ssh') register_transport('smart_open.webhdfs') SUPPORTED_SCHEMES = tuple(sorted(_REGISTRY.keys())) """The transport schemes that the local installation of ``smart_open`` supports.""" smart_open-5.2.1/smart_open/utils.py000066400000000000000000000135531411241424400175530ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2020 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). # """Helper functions for documentation, etc.""" import inspect import logging import urllib.parse logger = logging.getLogger(__name__) WORKAROUND_SCHEMES = ['s3', 's3n', 's3u', 's3a', 'gs'] QUESTION_MARK_PLACEHOLDER = '///smart_open.utils.QUESTION_MARK_PLACEHOLDER///' def inspect_kwargs(kallable): # # inspect.getargspec got deprecated in Py3.4, and calling it spews # deprecation warnings that we'd prefer to avoid. Unfortunately, older # versions of Python (<3.3) did not have inspect.signature, so we need to # handle them the old-fashioned getargspec way. # try: signature = inspect.signature(kallable) except AttributeError: try: args, varargs, keywords, defaults = inspect.getargspec(kallable) except TypeError: # # Happens under Py2.7 with mocking. # return {} if not defaults: return {} supported_keywords = args[-len(defaults):] return dict(zip(supported_keywords, defaults)) else: return { name: param.default for name, param in signature.parameters.items() if param.default != inspect.Parameter.empty } def check_kwargs(kallable, kwargs): """Check which keyword arguments the callable supports. Parameters ---------- kallable: callable A function or method to test kwargs: dict The keyword arguments to check. If the callable doesn't support any of these, a warning message will get printed. 
Returns ------- dict A dictionary of argument names and values supported by the callable. """ supported_keywords = sorted(inspect_kwargs(kallable)) unsupported_keywords = [k for k in sorted(kwargs) if k not in supported_keywords] supported_kwargs = {k: v for (k, v) in kwargs.items() if k in supported_keywords} if unsupported_keywords: logger.warning('ignoring unsupported keyword arguments: %r', unsupported_keywords) return supported_kwargs def clamp(value, minval=0, maxval=None): """Clamp a numeric value to a specific range. Parameters ---------- value: numeric The value to clamp. minval: numeric The lower bound. maxval: numeric The upper bound. Returns ------- numeric The clamped value. It will be in the range ``[minval, maxval]``. """ if maxval is not None: value = min(value, maxval) value = max(value, minval) return value def make_range_string(start=None, stop=None): """Create a byte range specifier in accordance with RFC-2616. Parameters ---------- start: int, optional The start of the byte range. If unspecified, stop indicated offset from EOF. stop: int, optional The end of the byte range. If unspecified, indicates EOF. Returns ------- str A byte range specifier. """ # # https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35 # if start is None and stop is None: raise ValueError("make_range_string requires either a stop or start value") start_str = '' if start is None else str(start) stop_str = '' if stop is None else str(stop) return 'bytes=%s-%s' % (start_str, stop_str) def parse_content_range(content_range): """Extract units, start, stop, and length from a content range header like "bytes 0-846981/846982". Assumes a properly formatted content-range header from S3. See werkzeug.http.parse_content_range_header for a more robust version. Parameters ---------- content_range: str The content-range header to parse. Returns ------- tuple (units: str, start: int, stop: int, length: int) The units and three integers from the content-range header. """ units, numbers = content_range.split(' ', 1) range, length = numbers.split('/', 1) start, stop = range.split('-', 1) return units, int(start), int(stop), int(length) def safe_urlsplit(url): """This is a hack to prevent the regular urlsplit from splitting around question marks. A question mark (?) in a URL typically indicates the start of a querystring, and the standard library's urlparse function handles the querystring separately. Unfortunately, question marks can also appear _inside_ the actual URL for some schemas like S3, GS. Replaces question marks with a special placeholder substring prior to splitting. This work-around behavior is disabled in the unlikely event the placeholder is already part of the URL. If this affects you, consider changing the value of QUESTION_MARK_PLACEHOLDER to something more suitable. See Also -------- https://bugs.python.org/issue43882 https://github.com/python/cpython/blob/3.7/Lib/urllib/parse.py https://github.com/RaRe-Technologies/smart_open/issues/285 https://github.com/RaRe-Technologies/smart_open/issues/458 smart_open/utils.py:QUESTION_MARK_PLACEHOLDER """ sr = urllib.parse.urlsplit(url, allow_fragments=False) placeholder = None if sr.scheme in WORKAROUND_SCHEMES and '?' in url and QUESTION_MARK_PLACEHOLDER not in url: # # This is safe because people will _almost never_ use the below # substring in a URL. If they do, then they're asking for trouble, # and this special handling will simply not happen for them. 
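        # For instance (this mirrors the cases in test_utils.py):
        #
        #   safe_urlsplit('s3://bucket/key?')
        #   -> SplitResult(scheme='s3', netloc='bucket', path='/key?', query='', fragment='')
        #
        # whereas a plain urllib.parse.urlsplit would treat the trailing '?'
        # as the start of an (empty) querystring and drop it from the path.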
# placeholder = QUESTION_MARK_PLACEHOLDER url = url.replace('?', placeholder) sr = urllib.parse.urlsplit(url, allow_fragments=False) if placeholder is None: return sr path = sr.path.replace(placeholder, '?') return urllib.parse.SplitResult(sr.scheme, sr.netloc, path, '', '') smart_open-5.2.1/smart_open/version.py000066400000000000000000000001121411241424400200630ustar00rootroot00000000000000__version__ = '5.2.1' if __name__ == '__main__': print(__version__) smart_open-5.2.1/smart_open/webhdfs.py000066400000000000000000000204411411241424400200270ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright (C) 2019 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). # """Implements reading and writing to/from WebHDFS. The main entry point is the :func:`~smart_open.webhdfs.open` function. """ import io import logging import urllib.parse try: import requests except ImportError: MISSING_DEPS = True from smart_open import utils, constants import http.client as httplib logger = logging.getLogger(__name__) SCHEME = 'webhdfs' URI_EXAMPLES = ( 'webhdfs://host:port/path/file', ) MIN_PART_SIZE = 50 * 1024**2 # minimum part size for HDFS multipart uploads def parse_uri(uri_as_str): return dict(scheme=SCHEME, uri=uri_as_str) def open_uri(uri, mode, transport_params): kwargs = utils.check_kwargs(open, transport_params) return open(uri, mode, **kwargs) def open(http_uri, mode, min_part_size=MIN_PART_SIZE): """ Parameters ---------- http_uri: str webhdfs url converted to http REST url min_part_size: int, optional For writing only. """ if http_uri.startswith(SCHEME): http_uri = _convert_to_http_uri(http_uri) if mode == constants.READ_BINARY: fobj = BufferedInputBase(http_uri) elif mode == constants.WRITE_BINARY: fobj = BufferedOutputBase(http_uri, min_part_size=min_part_size) else: raise NotImplementedError("webhdfs support for mode %r not implemented" % mode) fobj.name = http_uri.split('/')[-1] return fobj def _convert_to_http_uri(webhdfs_url): """ Convert webhdfs uri to http url and return it as text Parameters ---------- webhdfs_url: str A URL starting with webhdfs:// """ split_uri = urllib.parse.urlsplit(webhdfs_url) netloc = split_uri.hostname if split_uri.port: netloc += ":{}".format(split_uri.port) query = split_uri.query if split_uri.username: query += ( ("&" if query else "") + "user.name=" + urllib.parse.quote(split_uri.username) ) return urllib.parse.urlunsplit( ("http", netloc, "/webhdfs/v1" + split_uri.path, query, "") ) # # For old unit tests. # def convert_to_http_uri(parsed_uri): return _convert_to_http_uri(parsed_uri.uri) class BufferedInputBase(io.BufferedIOBase): def __init__(self, uri): self._uri = uri payload = {"op": "OPEN", "offset": 0} self._response = requests.get(self._uri, params=payload, stream=True) if self._response.status_code != httplib.OK: raise WebHdfsException.from_response(self._response) self._buf = b'' # # Override some methods from io.IOBase. # def close(self): """Flush and close this stream.""" logger.debug("close: called") def readable(self): """Return True if the stream can be read from.""" return True def seekable(self): """If False, seek(), tell() and truncate() will raise IOError. We offer only seek support, and no truncate support.""" return False # # io.BufferedIOBase methods. 
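    # The reads below pull the streamed HTTP response in
    # io.DEFAULT_BUFFER_SIZE chunks and keep any surplus bytes in self._buf,
    # so subsequent read() and readline() calls return buffered data first.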
# def detach(self): """Unsupported.""" raise io.UnsupportedOperation def read(self, size=None): if size is None: self._buf, retval = b'', self._buf + self._response.raw.read() return retval elif size < len(self._buf): self._buf, retval = self._buf[size:], self._buf[:size] return retval try: buffers = [self._buf] total_read = 0 while total_read < size: raw_data = self._response.raw.read(io.DEFAULT_BUFFER_SIZE) # some times read returns 0 length data without throwing a # StopIteration exception. We break here if this happens. if len(raw_data) == 0: break total_read += len(raw_data) buffers.append(raw_data) except StopIteration: pass self._buf = b"".join(buffers) self._buf, retval = self._buf[size:], self._buf[:size] return retval def read1(self, size=-1): """This is the same as read().""" return self.read(size=size) def readinto(self, b): """Read up to len(b) bytes into b, and return the number of bytes read.""" data = self.read(len(b)) if not data: return 0 b[:len(data)] = data return len(data) def readline(self): self._buf, retval = b'', self._buf + self._response.raw.readline() return retval class BufferedOutputBase(io.BufferedIOBase): def __init__(self, uri, min_part_size=MIN_PART_SIZE): """ Parameters ---------- min_part_size: int, optional For writing only. """ self._uri = uri self._closed = False self.min_part_size = min_part_size # creating empty file first payload = {"op": "CREATE", "overwrite": True} init_response = requests.put(self._uri, params=payload, allow_redirects=False) if not init_response.status_code == httplib.TEMPORARY_REDIRECT: raise WebHdfsException.from_response(init_response) uri = init_response.headers['location'] response = requests.put(uri, data="", headers={'content-type': 'application/octet-stream'}) if not response.status_code == httplib.CREATED: raise WebHdfsException.from_response(response) self.lines = [] self.parts = 0 self.chunk_bytes = 0 self.total_size = 0 # # This member is part of the io.BufferedIOBase interface. # self.raw = None # # Override some methods from io.IOBase. # def writable(self): """Return True if the stream supports writing.""" return True # # io.BufferedIOBase methods. # def detach(self): raise io.UnsupportedOperation("detach() not supported") def _upload(self, data): payload = {"op": "APPEND"} init_response = requests.post(self._uri, params=payload, allow_redirects=False) if not init_response.status_code == httplib.TEMPORARY_REDIRECT: raise WebHdfsException.from_response(init_response) uri = init_response.headers['location'] response = requests.post(uri, data=data, headers={'content-type': 'application/octet-stream'}) if not response.status_code == httplib.OK: raise WebHdfsException.from_response(response) def write(self, b): """ Write the given bytes (binary string) into the WebHDFS file from constructor. 
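
        The bytes are buffered in memory and flushed to WebHDFS with an
        APPEND request once at least ``min_part_size`` bytes have
        accumulated; close() uploads whatever is left in the buffer.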
""" if self._closed: raise ValueError("I/O operation on closed file") if not isinstance(b, bytes): raise TypeError("input must be a binary string") self.lines.append(b) self.chunk_bytes += len(b) self.total_size += len(b) if self.chunk_bytes >= self.min_part_size: buff = b"".join(self.lines) logger.info( "uploading part #%i, %i bytes (total %.3fGB)", self.parts, len(buff), self.total_size / 1024.0 ** 3 ) self._upload(buff) logger.debug("upload of part #%i finished", self.parts) self.parts += 1 self.lines, self.chunk_bytes = [], 0 def close(self): buff = b"".join(self.lines) if buff: logger.info( "uploading last part #%i, %i bytes (total %.3fGB)", self.parts, len(buff), self.total_size / 1024.0 ** 3 ) self._upload(buff) logger.debug("upload of last part #%i finished", self.parts) self._closed = True @property def closed(self): return self._closed class WebHdfsException(Exception): def __init__(self, msg="", status_code=None): self.msg = msg self.status_code = status_code super(WebHdfsException, self).__init__(repr(self)) def __repr__(self): return "{}(status_code={}, msg={!r})".format( self.__class__.__name__, self.status_code, self.msg ) @classmethod def from_response(cls, response): return cls(msg=response.text, status_code=response.status_code) smart_open-5.2.1/tox.ini000066400000000000000000000030151411241424400151750ustar00rootroot00000000000000[tox] minversion = 2.0 envlist = py{36,37,38,39}-{test,doctest,integration,benchmark}, sdist, flake8 [pytest] addopts = -rfxEXs --durations=20 --showlocals [flake8] ignore = E12, W503, E226 max-line-length = 110 show-source = True [testenv] passenv = SO_* AWS_* COVERALLS_* RUN_BENCHMARKS GITHUB_* recreate = True whitelist_externals = sh bash deps = .[all] .[test] integration: numpy benchmark: pytest_benchmark benchmark: awscli commands = test: pytest smart_open -v integration: python tox_helpers/run_integration_tests.py benchmark: python tox_helpers/run_benchmarks.py doctest: python tox_helpers/doctest.py [testenv:sdist] whitelist_externals = rm recreate = True commands = rm -rf dist/ python setup.py sdist [testenv:flake8] skip_install = True recreate = True deps = flake8 commands = flake8 smart_open/ {posargs} [testenv:check_keys] skip_install = True recreate = True deps = boto3 commands = python tox_helpers/check_keys.py [testenv:enable_moto_server] skip_install = True recreate = False deps = moto[server] commands = bash tox_helpers/helpers.sh enable_moto_server [testenv:disable_moto_server] skip_install = True recreate = False deps = commands = bash tox_helpers/helpers.sh disable_moto_server [testenv:test_coverage] skip_install = True recreate = True deps = . pytest-cov commands = python tox_helpers/test_missing_dependencies.py pip install .[all,test] pytest smart_open -v --cov smart_open --cov-report term-missing --cov-append smart_open-5.2.1/tox_helpers/000077500000000000000000000000001411241424400162175ustar00rootroot00000000000000smart_open-5.2.1/tox_helpers/README.txt000066400000000000000000000002231411241424400177120ustar00rootroot00000000000000This subdirectory contains helper scripts for our tox.ini file. They are designed to be platform-independent: they run on both Linux and Windows. 
smart_open-5.2.1/tox_helpers/check_keys.py000066400000000000000000000022361411241424400207040ustar00rootroot00000000000000"""Check that the environment variables contain valid boto3 credentials.""" import logging import os import boto3 import boto3.session def check(session): client = session.client('s3') try: response = client.list_buckets() except Exception as e: logging.exception(e) return None else: return [b['Name'] for b in response['Buckets']] def check_implicit(): session = boto3.session.Session() buckets = check(session) if buckets: print('implicit check OK: %r' % buckets) else: print('implicit check failed') def check_explicit(): key_id = os.environ.get('AWS_ACCESS_KEY_ID') secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY') if not (key_id and secret_key): print('no credentials found in os.environ, skipping explicit check') return session = boto3.session.Session(aws_access_key_id=key_id, aws_secret_access_key=secret_key) buckets = check(session) if buckets: print('explicit check OK: %r' % buckets) else: print('explicit check failed') def main(): check_implicit() check_explicit() if __name__ == '__main__': main() smart_open-5.2.1/tox_helpers/doctest.py000066400000000000000000000005221411241424400202350ustar00rootroot00000000000000"""Runs the doctests, if the AWS credentials are available. Without the credentials, skips the tests entirely, because otherwise they will fail. """ import os import subprocess if os.environ.get('AWS_ACCESS_KEY_ID') and os.environ.get('AWS_SECRET_ACCESS_KEY'): subprocess.check_call(['python', '-m', 'doctest', 'README.rst', '-v']) smart_open-5.2.1/tox_helpers/helpers.sh000077500000000000000000000002641411241424400202220ustar00rootroot00000000000000#!/bin/bash set -e set -x enable_moto_server(){ moto_server -p5000 2>/dev/null& } disable_moto_server(){ lsof -i tcp:5000 | tail -n1 | cut -f2 -d" " | xargs kill -9 } "$@" smart_open-5.2.1/tox_helpers/run_benchmarks.py000066400000000000000000000026331411241424400215760ustar00rootroot00000000000000"""Runs benchmarks. We only do this is AWS credentials are available, because without them, it is impossible to run the benchmarks at all. """ import os import platform import uuid import subprocess import smart_open if os.environ.get('AWS_ACCESS_KEY_ID') and os.environ.get('AWS_SECRET_ACCESS_KEY'): required = ('SO_BUCKET', ) for varname in required: assert varname in os.environ, 'the following env vars must be set: %s' % ', '.join(required) os.environ['PYTEST_ADDOPTS'] = "--reruns 3 --reruns-delay 1" commit_hash = subprocess.check_output( ['git', 'rev-parse', 'HEAD'] ).decode('utf-8').strip() # # This is a temporary key that test_s3 will use for I/O. 
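    # Using a fresh UUID for SO_KEY gives every benchmark run its own object
    # inside the shared SO_BUCKET, so successive runs do not overwrite each
    # other's data.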
# os.environ['SO_KEY'] = str(uuid.uuid4()) subprocess.check_call( [ 'pytest', '-v', 'integration-tests/test_s3.py', '--benchmark-save=%s' % commit_hash, ] ) url = 's3://%s/benchmark-results/%s' % ( os.environ['SO_BUCKET'], commit_hash, ) for root, subdirs, files in os.walk('.benchmarks'): for f in files: if f.endswith('%s.json' % commit_hash): out_url = '%s/%s.json' % (url, platform.python_version()) with open(os.path.join(root, f), 'rt') as fin: with smart_open.open(out_url, 'wt') as fout: fout.write(fin.read()) smart_open-5.2.1/tox_helpers/run_integration_tests.py000066400000000000000000000006531411241424400232260ustar00rootroot00000000000000"""Runs integration tests.""" import os import subprocess os.environ['PYTEST_ADDOPTS'] = "--reruns 3 --reruns-delay 1" subprocess.check_call( [ 'pytest', 'integration-tests/test_207.py', 'integration-tests/test_http.py', ] ) if os.environ.get('AWS_ACCESS_KEY_ID') and os.environ.get('AWS_SECRET_ACCESS_KEY'): subprocess.check_call(['pytest', '-v', 'integration-tests/test_s3_ported.py']) smart_open-5.2.1/tox_helpers/test_missing_dependencies.py000066400000000000000000000003721411241424400240110ustar00rootroot00000000000000import os import subprocess os.environ['SMART_OPEN_TEST_MISSING_DEPS'] = '1' command = [ 'pytest', 'smart_open/tests/test_package.py', '-v', '--cov', 'smart_open', '--cov-report', 'term-missing', ] subprocess.check_call(command)
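# This helper is invoked by the test_coverage environment in tox.ini before
# the optional dependencies are installed: with SMART_OPEN_TEST_MISSING_DEPS
# set, test_package.py gets exercised against a bare install, and the
# coverage it records is later combined with the main run via --cov-append.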