pax_global_header00006660000000000000000000000064144622146600014517gustar00rootroot0000000000000052 comment=7e16205faf5434c40c7c75568b6e775c78dccf87 conda-package-streaming-0.9.0/000077500000000000000000000000001446221466000161715ustar00rootroot00000000000000conda-package-streaming-0.9.0/.flake8000066400000000000000000000000371446221466000173440ustar00rootroot00000000000000[flake8] max-line-length = 100 conda-package-streaming-0.9.0/.github/000077500000000000000000000000001446221466000175315ustar00rootroot00000000000000conda-package-streaming-0.9.0/.github/workflows/000077500000000000000000000000001446221466000215665ustar00rootroot00000000000000conda-package-streaming-0.9.0/.github/workflows/sphinx.yml000066400000000000000000000020251446221466000236210ustar00rootroot00000000000000name: Sphinx on: push: branches: - main pull_request: branches: - main jobs: sphinx: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: python-version: "3.x" architecture: "x64" cache: "pip" - name: Build Documentation run: | pip install -e .[docs] make html - name: Upload artifact uses: actions/upload-pages-artifact@v1 with: # Upload entire repository path: 'build/html' pages: runs-on: ubuntu-latest if: github.ref == 'refs/heads/main' needs: [sphinx] # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages permissions: contents: read pages: write id-token: write environment: name: github-pages url: ${{ steps.deployment.outputs.page_url }} steps: - name: Deploy to GitHub Pages id: deployment uses: actions/deploy-pages@v1 conda-package-streaming-0.9.0/.github/workflows/tests.yml000066400000000000000000000057731446221466000234670ustar00rootroot00000000000000name: Tests on: # NOTE: github.event context is push payload: # https://docs.github.com/en/developers/webhooks-and-events/webhooks/webhook-events-and-payloads#push push: branches: - main - feature/** # NOTE: github.event context is pull_request payload: # https://docs.github.com/en/developers/webhooks-and-events/webhooks/webhook-events-and-payloads#pull_request pull_request: concurrency: # Concurrency group that uses the workflow name and PR number if available # or commit SHA as a fallback. If a new build is triggered under that # concurrency group while a previous build is running it will be canceled. # Repeated pushes to a PR will cancel all previous builds, while multiple # merges to main will not cancel. group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }} cancel-in-progress: true jobs: linux: runs-on: ubuntu-latest defaults: run: shell: bash -l {0} strategy: fail-fast: false matrix: python-version: ['3.10', '3.11'] steps: - name: Checkout repository uses: actions/checkout@v2 with: fetch-depth: 0 - uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} architecture: "x64" cache: "pip" - name: Setup Miniconda uses: conda-incubator/setup-miniconda@v2 with: python-version: ${{ matrix.python-version }} channels: defaults activate-environment: test_env auto-update-conda: false auto-activate-base: false show-channel-urls: true - name: Source Scripts run: | set -x # conda is our test dependency but can't be pip installed conda install --quiet conda pip pip install -e .[test] conda info --json echo "condarc" cat ~/.condarc echo "conda_pkgs_dir" ls /home/runner/conda_pkgs_dir echo "miniconda/pkgs" ls /usr/share/miniconda/pkgs echo "test_env" ls /usr/share/miniconda/envs/test_env pytest analyze: name: Analyze test results needs: [linux] if: always() runs-on: ubuntu-latest steps: - name: Download test results uses: actions/download-artifact@v3 - name: Upload combined test results # provides one downloadable archive of all .coverage/test-report.xml files # of all matrix runs for further analysis. uses: actions/upload-artifact@v3 with: name: test-results-${{ github.sha }}-all path: test-results-${{ github.sha }}-* retention-days: 90 # default: 90 - name: Test Summary uses: test-summary/action@v2 with: paths: ./test-results-${{ github.sha }}-**/test-report*.xml - name: Decide whether the needed jobs succeeded or failed uses: re-actors/alls-green@release/v1 with: jobs: ${{ toJSON(needs) }} conda-package-streaming-0.9.0/.gitignore000066400000000000000000000001151446221466000201560ustar00rootroot00000000000000.coverage* .vscode .nox __pycache__ PKG-INFO build dist deploy/metadata.json conda-package-streaming-0.9.0/.pre-commit-config.yaml000066400000000000000000000020511446221466000224500ustar00rootroot00000000000000# disable autofixing PRs, commenting "pre-commit.ci autofix" on a pull request triggers a autofix ci: autofix_prs: false repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.4.0 hooks: - id: check-added-large-files - id: check-ast - id: fix-byte-order-marker - id: check-case-conflict - id: check-executables-have-shebangs - id: check-merge-conflict - id: check-shebang-scripts-are-executable - id: debug-statements - id: detect-private-key - id: mixed-line-ending - id: end-of-file-fixer - id: trailing-whitespace - id: check-yaml exclude: conda.recipe/meta.yaml - repo: https://github.com/asottile/pyupgrade rev: v3.4.0 hooks: - id: pyupgrade args: ["--py37-plus"] - repo: https://github.com/PyCQA/isort rev: 5.12.0 hooks: - id: isort - repo: https://github.com/psf/black rev: 23.3.0 hooks: - id: black - repo: https://github.com/PyCQA/flake8 rev: 6.0.0 hooks: - id: flake8 conda-package-streaming-0.9.0/CHANGELOG.md000066400000000000000000000010161446221466000200000ustar00rootroot00000000000000[//]: # (current developments) ## 0.9.0 (2023-07) * Respect umask when extracting files. [#65](https://github.com/conda/conda-package-streaming/pulls/65); [conda issue #12829](https://github.com/conda/conda/issues/12829). ## 0.8.0 (2023-05) * Update transmute to use SpooledTemporaryFile instead of streaming directly to zip [(#57)](https://github.com/conda/conda-package-streaming/issues/57). This can reduce zstd memory usage during decompression. * `transmute` returns Path to transmuted package instead of `None`. conda-package-streaming-0.9.0/CODE_OF_CONDUCT.md000066400000000000000000000024331446221466000207720ustar00rootroot00000000000000# Conda Organization Code of Conduct > **Note** > Below is the short version of our CoC, see the long version [here](https://github.com/conda-incubator/governance/blob/main/CODE_OF_CONDUCT.md). # The Short Version Be kind to others. Do not insult or put down others. Behave professionally. Remember that harassment and sexist, racist, or exclusionary jokes are not appropriate for the conda Organization. All communication should be appropriate for a professional audience including people of many different backgrounds. Sexual language and imagery is not appropriate. The conda Organization is dedicated to providing a harassment-free community for everyone, regardless of gender, sexual orientation, gender identity and expression, disability, physical appearance, body size, race, or religion. We do not tolerate harassment of community members in any form. Thank you for helping make this a welcoming, friendly community for all. ## Report an Incident * Report a code of conduct incident [using a form](https://form.jotform.com/221527028480048). * Report a code of conduct incident via email: [conduct@conda.org](mailto:conduct@conda.org). * Contact [an individual committee member](#committee-membership) or [CoC event representative](#coc-representatives) to report an incident in confidence. conda-package-streaming-0.9.0/LICENSE000066400000000000000000000051561446221466000172050ustar00rootroot00000000000000BSD 3-Clause License Copyright (c) 2022, Anaconda, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lazy_wheel.py: Copyright (c) 2008-present The pip developers (see https://github.com/pypa/pip/blob/main/AUTHORS.txt file) Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. conda-package-streaming-0.9.0/Makefile000066400000000000000000000012461446221466000176340ustar00rootroot00000000000000# Minimal makefile for Sphinx documentation # # You can set these variables from the command line, and also # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build SOURCEDIR = docs BUILDDIR = build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) style: isort --profile=black . black . conda-package-streaming-0.9.0/README.md000066400000000000000000000061351446221466000174550ustar00rootroot00000000000000# conda-package-streaming [![pre-commit.ci status](https://results.pre-commit.ci/badge/github/conda/conda-package-streaming/main.svg)](https://results.pre-commit.ci/latest/github/conda/conda-package-streaming/main) An efficient library to read from new and old format .conda and .tar.bz2 conda packages. Download conda metadata from packages without transferring entire file. Get metadata from local `.tar.bz2` packages without reading entire files. Uses enhanced pip `lazy_wheel` to fetch a file out of `.conda` with no more than 3 range requests, but usually 2. Uses `tar = tarfile.open(fileobj=...)` to stream remote `.tar.bz2`. Closes the HTTP request once desired files have been seen. # Quickstart The basic API yields (tarfile, member) tuples from conda files as tarfile is needed to extract member. Note the `.tar.bz2` format yields all members, not just `info/`, from `stream_conda_info` / `stream_conda_component`, while the `.conda` format yields members from the requested inner archive — allowing the caller to decide when to stop reading. From a url, ```python from conda_package_streaming.url import stream_conda_info # url = (ends with .conda or .tar.bz2) for tar, member in stream_conda_info(url): if member.name == "info/index.json": index_json = json.load(tar.extractfile(member)) break ``` From s3, ```python client = boto3.client("s3") from conda_package_streaming.s3 import stream_conda_info # key = (ends with .conda or .tar.bz2) for tar, member in stream_conda_info(client, bucket, key): if member.name == "info/index.json": index_json = json.load(tar.extractfile(member)) break ``` From a filename, ```python from conda_package_streaming import package_streaming # filename = (ends with .conda or .tar.bz2) for tar, member in package_streaming.stream_conda_info(filename): if member.name == "info/index.json": index_json = json.load(tar.extractfile(member)) break ``` From a file-like object, ```python from contextlib import closing from conda_package_streaming.url import conda_reader_for_url from conda_package_streaming.package_streaming import stream_conda_component filename, conda = conda_reader_for_url(url) # file object must be seekable for `.conda` format, but merely readable for `.tar.bz2` with closing(conda): for tar, member in stream_conda_component(filename, conda, component="info"): if member.name == "info/index.json": index_json = json.load(tar.extractfile(member)) break ``` If you need the entire package, download it first and use the file-based APIs. The URL-based APIs are more efficient if you only need to access package metadata. # Package goals * Extract conda packages (both formats) * Easy to install from pypi or conda * Do the least amount of I/O possible (no temporary files, transfer partial packages) * Open files from the network / standard HTTP / s3 * Continue using conda-package-handling to create .conda packages # Generating documentation Uses markdown, furo theme. Requires newer mdit-py-plugins. `pip install conda-package-streaming[docs]` One time: `sphinx-apidoc -o docs .` conda-package-streaming-0.9.0/conda.recipe/000077500000000000000000000000001446221466000205235ustar00rootroot00000000000000conda-package-streaming-0.9.0/conda.recipe/meta.yaml000066400000000000000000000023301446221466000223330ustar00rootroot00000000000000{% set name = "conda-package-streaming" %} {% set version_match = load_file_regex( load_file="conda_package_streaming/__init__.py", regex_pattern='^__version__ = "(.+)"') %} {% set version = version_match[1] %} package: name: {{ name|lower }} version: {{ version }} source: git_url: ../ # url: https://github.com/conda/conda-package-streaming/archive/refs/tags/v{{ version }}.tar.gz # sha256: 60a064dcb6adf775362339ffc8728320d89115c0f3870f2bb43fd368240a3205 build: script: {{ PYTHON }} -m pip install --no-build-isolation . -vv number: 0 noarch: python requirements: host: - flit-core - python >=3.7 - pip run: - zstandard >=0.15 - python >=3.7 # allow optional 'requests' test: imports: - conda_package_streaming.url commands: - pip check requires: - pip - requests about: home: https://github.com/conda/conda-package-streaming summary: An efficient library to read from new and old format .conda and .tar.bz2 conda packages. license: BSD-3-Clause license_family: BSD license_file: LICENSE doc_url: https://conda.github.io/conda-package-streaming/ dev_url: https://github.com/conda/conda-package-streaming extra: recipe-maintainers: - dholth conda-package-streaming-0.9.0/conda_package_streaming/000077500000000000000000000000001446221466000230015ustar00rootroot00000000000000conda-package-streaming-0.9.0/conda_package_streaming/__init__.py000066400000000000000000000000261446221466000251100ustar00rootroot00000000000000__version__ = "0.9.0" conda-package-streaming-0.9.0/conda_package_streaming/exceptions.py000066400000000000000000000012451446221466000255360ustar00rootroot00000000000000import tarfile class SafetyError(tarfile.TarError): def __init__(self, msg, *args, **kw): msg = f"Error with archive. {msg}" super().__init__(msg) class CaseInsensitiveFileSystemError(OSError): def __init__(self): message = """\ Cannot extract package to a case-insensitive file system. Your install destination does not differentiate between upper and lowercase characters, and this breaks things. Try installing to a location that is case-sensitive. Windows drives are usually the culprit here - can you install to a native Unix drive, or turn on case sensitivity for this (Windows) location? """ super().__init__(message) conda-package-streaming-0.9.0/conda_package_streaming/extract.py000066400000000000000000000047521446221466000250350ustar00rootroot00000000000000""" Extract package to directory, with checks against tar members extracting outside the target directory. """ from __future__ import annotations import os import tarfile from errno import ELOOP from pathlib import Path from typing import Generator from . import exceptions, package_streaming __all__ = ["extract_stream", "extract"] def extract_stream( stream: Generator[tuple[tarfile.TarFile, tarfile.TarInfo], None, None], dest_dir: Path | str, ): """ Pipe ``stream_conda_component`` output here to extract every member into dest_dir. For ``.conda`` will need to be called twice (for info and pkg components); for ``.tar.bz2`` every member is extracted. """ dest_dir = os.path.realpath(dest_dir) def is_within_dest_dir(name): abs_target = os.path.realpath(os.path.join(dest_dir, name)) prefix = os.path.commonpath((dest_dir, abs_target)) return prefix == dest_dir for tar_file, _ in stream: # careful not to seek backwards def checked_members(): # from conda_package_handling for member in tar_file: if not is_within_dest_dir(member.name): raise exceptions.SafetyError(f"contains unsafe path: {member.name}") yield member try: tar_file.extractall(path=dest_dir, members=checked_members()) except OSError as e: if e.errno == ELOOP: raise exceptions.CaseInsensitiveFileSystemError() from e raise # next iteraton of for loop raises GeneratorExit in stream stream.close() def extract(filename, dest_dir=None, fileobj=None): """ Extract all components of conda package to dest_dir. fileobj: must be seekable if provided, if a ``.conda`` package. """ assert dest_dir, "dest_dir is required" if str(filename).endswith(".conda"): components = [ package_streaming.CondaComponent.pkg, package_streaming.CondaComponent.info, ] else: # .tar.bz2 doesn't filter by component components = [package_streaming.CondaComponent.pkg] closefd = False if not fileobj: fileobj = open(filename, "rb") closefd = True try: for component in components: stream = package_streaming.stream_conda_component( filename, fileobj, component=component ) extract_stream(stream, dest_dir) finally: if closefd: fileobj.close() conda-package-streaming-0.9.0/conda_package_streaming/lazy_wheel.py000066400000000000000000000223511446221466000255210ustar00rootroot00000000000000"""Lazy ZIP over HTTP""" from __future__ import annotations import logging import zipfile from bisect import bisect_left, bisect_right from contextlib import contextmanager from tempfile import NamedTemporaryFile from typing import Any, Iterator from zipfile import BadZipfile, ZipFile from requests import Session from requests.models import CONTENT_CHUNK_SIZE, Response # from pip 22.0.3 with fixes & remove imports from pip log = logging.getLogger(__name__) # If-Match (etag) to detect file changed during fetch would also be nice HEADERS = {"Accept-Encoding": "identity"} class HTTPRangeRequestUnsupported(Exception): pass class LazyZipOverHTTP: """File-like object mapped to a ZIP file over HTTP. This uses HTTP range requests to lazily fetch the file's content, which is supposed to be fed to ZipFile. If such requests are not supported by the server, raise HTTPRangeRequestUnsupported during initialization. """ def __init__( self, url: str, session: Session, chunk_size: int = CONTENT_CHUNK_SIZE ) -> None: # if CONTENT_CHUNK_SIZE is bigger than the file: # In [8]: response.headers["Content-Range"] # Out[8]: 'bytes 0-3133374/3133375' self._request_count = 0 self._session, self._url, self._chunk_size = session, url, chunk_size # initial range request for the end of the file tail = self._stream_response(start="", end=CONTENT_CHUNK_SIZE) # e.g. {'accept-ranges': 'bytes', 'content-length': '10240', # 'content-range': 'bytes 12824-23063/23064', 'last-modified': 'Sat, 16 # Apr 2022 13:03:02 GMT', 'date': 'Thu, 21 Apr 2022 11:34:04 GMT'} if tail.status_code != 206: raise HTTPRangeRequestUnsupported("range request is not supported") # lowercase content-range to support s3 self._length = int(tail.headers["content-range"].partition("/")[-1]) self._file = NamedTemporaryFile() self.truncate(self._length) # length is also in Content-Length and Content-Range header with self._stay(): content_length = int(tail.headers["content-length"]) if hasattr(tail, "content"): assert content_length == len(tail.content) self.seek(self._length - content_length) for chunk in tail.iter_content(self._chunk_size): self._file.write(chunk) self._left: list[int] = [self._length - content_length] self._right: list[int] = [self._length - 1] @property def mode(self) -> str: """Opening mode, which is always rb.""" return "rb" @property def name(self) -> str: """Path to the underlying file.""" return self._file.name def seekable(self) -> bool: """Return whether random access is supported, which is True.""" return True def close(self) -> None: """Close the file.""" self._file.close() @property def closed(self) -> bool: """Whether the file is closed.""" return self._file.closed def read(self, size: int = -1) -> bytes: """Read up to size bytes from the object and return them. As a convenience, if size is unspecified or -1, all bytes until EOF are returned. Fewer than size bytes may be returned if EOF is reached. """ # BUG does not download correctly if size is unspecified download_size = size start, length = self.tell(), self._length stop = length if size < 0 else min(start + download_size, length) start = max(0, stop - download_size) self._download(start, stop - 1) return self._file.read(size) def readable(self) -> bool: """Return whether the file is readable, which is True.""" return True def seek(self, offset: int, whence: int = 0) -> int: """Change stream position and return the new absolute position. Seek to offset relative position indicated by whence: * 0: Start of stream (the default). pos should be >= 0; * 1: Current position - pos may be negative; * 2: End of stream - pos usually negative. """ return self._file.seek(offset, whence) def tell(self) -> int: """Return the current position.""" return self._file.tell() def truncate(self, size: int | None = None) -> int: """Resize the stream to the given size in bytes. If size is unspecified resize to the current position. The current stream position isn't changed. Return the new file size. """ return self._file.truncate(size) def writable(self) -> bool: """Return False.""" return False def __enter__(self) -> LazyZipOverHTTP: self._file.__enter__() return self def __exit__(self, *exc: Any) -> bool | None: return self._file.__exit__(*exc) @contextmanager def _stay(self) -> Iterator[None]: """Return a context manager keeping the position. At the end of the block, seek back to original position. """ pos = self.tell() try: yield finally: self.seek(pos) def _check_zip(self) -> None: """Check and download until the file is a valid ZIP.""" end = self._length - 1 for start in reversed(range(0, end, self._chunk_size)): self._download(start, end) with self._stay(): try: # For read-only ZIP files, ZipFile only needs # methods read, seek, seekable and tell. ZipFile(self) # type: ignore except BadZipfile: pass else: break def _stream_response( self, start: int | str, end: int, base_headers: dict[str, str] = HEADERS ) -> Response: """Return HTTP response to a range request from start to end. :param start: if "", request ``end` bytes from end of file.""" headers = base_headers.copy() headers["Range"] = f"bytes={start}-{end}" log.debug("%s", headers["Range"]) # TODO: Get range requests to be correctly cached headers["Cache-Control"] = "no-cache" self._request_count += 1 response = self._session.get(self._url, headers=headers, stream=True) response.raise_for_status() return response def _merge( self, start: int, end: int, left: int, right: int ) -> Iterator[tuple[int, int]]: """Return an iterator of intervals to be fetched. Args: start (int): Start of needed interval end (int): End of needed interval left (int): Index of first overlapping downloaded data right (int): Index after last overlapping downloaded data """ lslice, rslice = self._left[left:right], self._right[left:right] i = start = min([start] + lslice[:1]) end = max([end] + rslice[-1:]) for j, k in zip(lslice, rslice): if j > i: yield i, j - 1 i = k + 1 if i <= end: yield i, end self._left[left:right], self._right[left:right] = [start], [end] def _download(self, start: int, end: int) -> None: """Download bytes from start to end inclusively.""" with self._stay(): left = bisect_left(self._right, start) right = bisect_right(self._left, end) for start, end in self._merge(start, end, left, right): response = self._stream_response(start, end) self.seek(start) for chunk in response.iter_content(self._chunk_size): self._file.write(chunk) class LazyConda(LazyZipOverHTTP): def prefetch(self, conda_file_id): """ Conda fork specific. Prefetch the `.info` range from the remote archive. Reduces number of Range requests to 2 or 3 (1 or 2 for the directory, 1 for the file). conda_file_id: name of .conda without path or `.conda` extension """ target_file = f"info-{conda_file_id}.tar.zst" with self._stay(): # not strictly necessary # try to read entire conda info in one request zf = zipfile.ZipFile(self) infolist = zf.infolist() for i, info in enumerate(infolist): if info.filename == target_file: # could be incorrect if zipfile was concatenated to another # file (not likely for .conda) start = info.header_offset try: end = infolist[i + 1].header_offset # or info.header_offset # + len(info.filename) # + len(info.extra) # + info.compress_size # (unless Zip64) except IndexError: end = zf.start_dir self.seek(start) self.read(end - start) log.debug( "prefetch %s-%s", info.header_offset, end, ) break else: log.debug("no zip prefetch") conda-package-streaming-0.9.0/conda_package_streaming/package_streaming.py000066400000000000000000000113231446221466000270170ustar00rootroot00000000000000""" Unpack conda packages without using a temporary file. """ from __future__ import annotations import bz2 import os import os.path import tarfile import zipfile from enum import Enum from typing import Generator # acquire umask taking advantage of import system lock, instead of possibly in # multiple threads at once. UMASK = os.umask(0) os.umask(UMASK) try: import zstandard except ImportError: import warnings warnings.warn("zstandard could not be imported. Running without .conda support.") zstandard = None class CondaComponent(Enum): pkg = "pkg" info = "info" def __str__(self): return self.value class TarfileNoSameOwner(tarfile.TarFile): def __init__(self, *args, umask=UMASK, **kwargs): """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to read from an existing archive, 'a' to append data to an existing file or 'w' to create a new file overwriting an existing one. `mode' defaults to 'r'. If `fileobj' is given, it is used for reading or writing data. If it can be determined, `mode' is overridden by `fileobj's mode. `fileobj' is not closed, when TarFile is closed. """ super().__init__(*args, **kwargs) self.umask = umask def chown(self, tarinfo, targetpath, numeric_owner): """ Override chown to be a no-op, since we don't want to preserve ownership here. (tarfile.TarFile only lets us toggle all of (chown, chmod, mtime)) """ return def chmod(self, tarinfo, targetpath): """ Set file permissions of targetpath according to tarinfo, respecting umask. """ try: os.chmod(targetpath, tarinfo.mode & (0o777 - self.umask)) except OSError as e: raise tarfile.ExtractError("could not change mode") from e def tar_generator( fileobj, tarfile_open=TarfileNoSameOwner.open, closefd=False ) -> Generator[tuple[tarfile.TarFile, tarfile.TarInfo], None, None]: """ Yield (tar, member) from fileobj. """ # tarfile will not close fileobj because _extfileobj is True # caller should take care to close files all the way back to the http request... try: with tarfile_open(fileobj=fileobj, mode="r|") as tar: for member in tar: yield tar, member finally: if closefd: fileobj.close() def stream_conda_info( filename, fileobj=None ) -> Generator[tuple[tarfile.TarFile, tarfile.TarInfo], None, None]: """ Yield members from conda's embedded info/ tarball. For .tar.bz2 packages, yield all members. Yields (tar, member) tuples. You must only use the current member to prevent tar seeks and scans. To extract to disk, it's possible to call ``tar.extractall(path)`` on the first result and then ignore the rest of this generator. ``extractall`` takes care of some directory permissions/mtime issues, compared to ``extract`` or writing out the file objects yourself. """ component = "info" return stream_conda_component(filename, fileobj, component) def stream_conda_component( filename, fileobj=None, component: CondaComponent | str = CondaComponent.pkg ) -> Generator[tuple[tarfile.TarFile, tarfile.TarInfo], None, None]: """ Yield members from .conda's embedded {component}- tarball. "info" or "pkg". For .tar.bz2 packages, yield all members. Yields (tar, member) tuples. You must only use the current member to prevent tar seeks and scans. To extract to disk, it's possible to call ``tar.extractall(path)`` on the first result and then ignore the rest of this generator. ``extractall`` takes care of some directory permissions/mtime issues, compared to ``extract`` or writing out the file objects yourself. """ if str(filename).endswith(".conda"): if zstandard is None: raise RuntimeError("Cannot unpack `.conda` without zstandard") zf = zipfile.ZipFile(fileobj or filename) file_id, _, _ = os.path.basename(filename).rpartition(".") component_name = f"{component}-{file_id}" component_filename = [ info for info in zf.infolist() if info.filename.startswith(component_name) ] if not component_filename: raise LookupError(f"didn't find {component_name} component in {filename}") assert len(component_filename) == 1 reader = zstandard.ZstdDecompressor().stream_reader( zf.open(component_filename[0]) ) elif str(filename).endswith(".tar.bz2"): reader = bz2.open(fileobj or filename, mode="rb") else: raise ValueError("unsupported file extension") return tar_generator(reader, closefd=fileobj is None) conda-package-streaming-0.9.0/conda_package_streaming/s3.py000066400000000000000000000043561446221466000237100ustar00rootroot00000000000000""" Adapt s3 to package_streaming """ from __future__ import annotations import typing from contextlib import closing from typing import Any from . import package_streaming if typing.TYPE_CHECKING: # pragma: no cover from mypy_boto3_s3 import Client from mypy_boto3_s3.type_defs import GetObjectOutputTypeDef else: Client = GetObjectOutputTypeDef = None from .url import conda_reader_for_url __all__ = ["stream_conda_info", "conda_reader_for_s3"] class ResponseFacade: def __init__(self, response: GetObjectOutputTypeDef): self.response = response self.raw: Any = response["Body"] def raise_for_status(self): # s3 get_object raises automatically? pass @property def status_code(self): return self.response["ResponseMetadata"]["HTTPStatusCode"] @property def headers(self): # a case-sensitive dict; keys may be lowercased always? return self.response["ResponseMetadata"]["HTTPHeaders"] def iter_content(self, n: int): return iter(lambda: self.raw.read(n), b"") class SessionFacade: """ Make s3 client look just enough like a requests.session for LazyZipOverHTTP """ def __init__(self, client: Client, bucket: str, key: str): self.client = client self.bucket = bucket self.key = key def get(self, url, *, headers: dict | None = None, stream=True): if headers and "Range" in headers: response = self.client.get_object( Bucket=self.bucket, Key=self.key, Range=headers["Range"] ) else: response = self.client.get_object(Bucket=self.bucket, Key=self.key) return ResponseFacade(response) def stream_conda_info(client, bucket, key): """ Yield (tar, member) for conda package. Just "info/" for .conda, all members for tar. """ filename, conda = conda_reader_for_s3(client, bucket, key) with closing(conda): yield from package_streaming.stream_conda_info(filename, conda) def conda_reader_for_s3(client: Client, bucket: str, key: str): """ Return (name, file_like) suitable for package_streaming APIs """ session: Any = SessionFacade(client, bucket, key) return conda_reader_for_url(key, session) conda-package-streaming-0.9.0/conda_package_streaming/transmute.py000066400000000000000000000131261446221466000254000ustar00rootroot00000000000000""" Convert .tar.bz2 to .conda Uses `tempfile.SpooledTemporaryFile` to buffer `pkg-*` `.tar` and `info-*` `.tar`, then compress directly into an open `ZipFile` at the end. `SpooledTemporaryFile` buffers the first 10MB of the package and its metadata in memory, but writes out to disk for larger packages. Conda packages created this way have `info-*` as the last element in the `ZipFile`, instead of the first for `.conda` packages created with pre-2.0 `conda-package-handling`. """ from __future__ import annotations import json import os import shutil import tarfile import tempfile import zipfile from pathlib import Path from typing import Callable import zstandard # streams everything in .tar.bz2 mode from .package_streaming import CondaComponent, stream_conda_component # increase to reduce speed and increase compression (levels above 19 use much # more memory) ZSTD_COMPRESS_LEVEL = 19 # increase to reduce compression and increase speed ZSTD_COMPRESS_THREADS = 1 CONDA_PACKAGE_FORMAT_VERSION = 2 def transmute( package, path, *, compressor: Callable[ [], zstandard.ZstdCompressor ] = lambda: zstandard.ZstdCompressor( level=ZSTD_COMPRESS_LEVEL, threads=ZSTD_COMPRESS_THREADS ), is_info: Callable[[str], bool] = lambda filename: filename.startswith("info/"), ) -> Path: """ Convert .tar.bz2 conda :package to .conda-format under path. :param package: path to .tar.bz2 conda package :param path: destination path for transmuted .conda package :param compressor: A function that creates instances of ``zstandard.ZstdCompressor()`` to override defaults. :param is_info: A function that returns True if a file belongs in the ``info`` component of a `.conda` package. ``conda-package-handling`` (not this package ``conda-package-streaming``) uses a set of regular expressions to keep expected items in the info- component, while other items starting with ``info/`` wind up in the pkg- component. :return: Path to transmuted package. """ assert package.endswith(".tar.bz2"), "can only convert .tar.bz2 to .conda" assert os.path.isdir(path) file_id = os.path.basename(package)[: -len(".tar.bz2")] output_path = Path(path, f"{file_id}.conda") with tempfile.SpooledTemporaryFile() as info_file, tempfile.SpooledTemporaryFile() as pkg_file: with tarfile.TarFile(fileobj=info_file, mode="w") as info_tar, tarfile.TarFile( fileobj=pkg_file, mode="w" ) as pkg_tar: # If we wanted to compress these at a low setting to save temporary # space, we could insert a file object that counts bytes written in # front of a zstd (level between 1..3) compressor. stream = iter(stream_conda_component(package)) for tar, member in stream: tar_get = info_tar if is_info(member.name) else pkg_tar if member.isfile(): tar_get.addfile(member, tar.extractfile(member)) else: tar_get.addfile(member) info_tar.close() pkg_tar.close() info_size = info_file.tell() pkg_size = pkg_file.tell() info_file.seek(0) pkg_file.seek(0) with zipfile.ZipFile( output_path, "x", # x to not append to existing compresslevel=zipfile.ZIP_STORED, ) as conda_file: # Use a maximum of one Zstd compressor, stream_writer at a time to save memory. data_compress = compressor() pkg_metadata = {"conda_pkg_format_version": CONDA_PACKAGE_FORMAT_VERSION} conda_file.writestr("metadata.json", json.dumps(pkg_metadata)) with conda_file.open( f"pkg-{file_id}.tar.zst", "w" ) as pkg_file_zip, data_compress.stream_writer( pkg_file_zip, size=pkg_size, closefd=False ) as pkg_stream: shutil.copyfileobj(pkg_file._file, pkg_stream) with conda_file.open( f"info-{file_id}.tar.zst", "w" ) as info_file_zip, data_compress.stream_writer( info_file_zip, size=info_size, closefd=False ) as info_stream: shutil.copyfileobj(info_file._file, info_stream) return output_path def transmute_tar_bz2( package: str, path, ) -> Path: """ Convert .conda :package to .tar.bz2 format under path. Can recompress .tar.bz2 packages. :param package: path to `.conda` or `.tar.bz2` package. :param path: destination path for transmuted package. :return: Path to transmuted package. """ assert package.endswith((".tar.bz2", ".conda")), "Unknown extension" assert os.path.isdir(path) incoming_format = ".conda" if package.endswith(".conda") else ".tar.bz2" file_id = os.path.basename(package)[: -len(incoming_format)] if incoming_format == ".conda": # .tar.bz2 MUST place info/ first. components = [CondaComponent.info, CondaComponent.pkg] else: # .tar.bz2 doesn't filter by component components = [CondaComponent.pkg] output_path = Path(path, f"{file_id}.tar.bz2") with open(package, "rb") as fileobj, tarfile.open(output_path, "x:bz2") as pkg_tar: for component in components: stream = iter(stream_conda_component(package, fileobj, component=component)) for tar, member in stream: if member.isfile(): pkg_tar.addfile(member, tar.extractfile(member)) else: pkg_tar.addfile(member) return output_path conda-package-streaming-0.9.0/conda_package_streaming/url.py000066400000000000000000000052231446221466000241570ustar00rootroot00000000000000""" Fetch metadata from remote .conda or .tar.bz2 package. Try to fetch less than the whole file if possible. This module should only be used to make *partial* reads against a remote package, typically just the ``info`` portion. If a full ``.conda`` format package is needed, it is more efficient to download locally first and then use the file-based API. """ import logging import sys import urllib.parse from pathlib import Path import requests from . import package_streaming # Excellent HTTP Range request file-like object from .lazy_wheel import LazyConda log = logging.getLogger(__name__) session = requests.Session() session.headers["User-Agent"] = "conda-package-streaming/0.1.0" METADATA_CHECKLIST = frozenset({"info/index.json", "info/recipe/meta.yaml"}) def extract_conda_info(url, destdir, checklist=METADATA_CHECKLIST, session=session): """ Extract info/index.json and info/recipe/meta.yaml from url to destdir; close url as soon as those files are found. """ checklist = set(checklist) stream = stream_conda_info(url, session=session) for tar, member in stream: if member.name in checklist: tar.extract(member, destdir) checklist.remove(member.name) if not checklist: stream.close() # next iteraton of for loop raises GeneratorExit in stream def stream_conda_info(url, session=session): """ Yield (tar, member) for conda package at url Just "info/" for .conda, all members for tar. """ filename, conda = conda_reader_for_url(url, session=session) try: yield from package_streaming.stream_conda_info(filename, conda) finally: if hasattr(conda, "release_conn"): # For .tar.bz2. Take extra care to drop connections after we are # done reading a partial response. conda.release_conn() conda.close() def conda_reader_for_url(url, session=session): """ Return (name, file_like) suitable for package_streaming APIs """ parsed_url = urllib.parse.urlparse(url) *_, filename = parsed_url.path.rsplit("/", 1) if filename.endswith(".conda"): file_id = filename[: -len(".conda")] conda = LazyConda(url, session) conda.prefetch(file_id) elif filename.endswith(".tar.bz2"): response = session.get(url, stream=True, headers={"Connection": "close"}) conda = response.raw else: raise ValueError("Unsupported extension %s", url) return filename, conda if __name__ == "__main__": # pragma nocover import logging logging.basicConfig(level=logging.DEBUG) extract_conda_info(sys.argv[1], Path(sys.argv[2]).absolute()) conda-package-streaming-0.9.0/docs/000077500000000000000000000000001446221466000171215ustar00rootroot00000000000000conda-package-streaming-0.9.0/docs/changelog.md000066400000000000000000000000551446221466000213720ustar00rootroot00000000000000# Changelog ```{include} ../CHANGELOG.md ``` conda-package-streaming-0.9.0/docs/conf.py000066400000000000000000000035271446221466000204270ustar00rootroot00000000000000# Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. For a full # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # import os import sys sys.path.insert(0, os.path.abspath("..")) # -- Project information ----------------------------------------------------- project = "conda-package-streaming" copyright = "2022, Anaconda, Inc." author = "Anaconda, Inc." # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ "sphinx.ext.autodoc", "myst_parser", ] # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. exclude_patterns = [] # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = "furo" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ["_static"] conda-package-streaming-0.9.0/docs/extract.rst000066400000000000000000000002601446221466000213230ustar00rootroot00000000000000extract module ============== Extract conda packages to the filesystem. .. automodule:: conda_package_streaming.extract :members: :undoc-members: :show-inheritance: conda-package-streaming-0.9.0/docs/index.md000066400000000000000000000025421446221466000205550ustar00rootroot00000000000000% conda-package-streaming documentation master file, created by % sphinx-quickstart on Fri Jun 17 14:43:38 2022. % You can adapt this file completely to your liking, but it should at least % contain the root `toctree` directive. # Welcome to conda-package-streaming's documentation! `conda-package-streaming` strives to be the most efficient way to read from new and old format `.conda` and `.tar.bz2` [conda packages](https://docs.conda.io/projects/conda/en/latest/user-guide/concepts/packages.html). `conda-package-streaming` can read from conda packages without ever writing to disk, unlike [conda-package-handling](https://github.com/conda/conda-package-handling) `< 2.0.0`'s temporary directories. [conda-package-handling](https://github.com/conda/conda-package-handling) `>= 2.0.0` uses `conda-package-streaming`. This library can also read a package from a URL or a stream without transferring the entire archive. `conda-package-streaming` uses the standard library [`zipfile`](https://docs.python.org/3/library/zipfile.html) and [`tarfile`](https://docs.python.org/3/library/tarfile.html), and [`zstandard`](https://github.com/indygreg/python-zstandard) to handle zstd-compressed streams. ```{include} ../README.md ``` ```{toctree} :caption: 'Contents:' :maxdepth: 2 modules changelog ``` # Indices and tables - {ref}`genindex` - {ref}`modindex` - {ref}`search` conda-package-streaming-0.9.0/docs/lazy_wheel.md000066400000000000000000000014361446221466000216120ustar00rootroot00000000000000# lazy_wheel module `lazy_wheel` is derived from pip's wheel download code. It is really a seekable file-like based on HTTP range requests, backed by a sparse temporary file. Each `read()` issues one or more HTTP range requests to the URL depending on how much of the file has already been downloaded, while read()\`s from already-fetched portions of the file are fulfilled by the backing file. ZIP archives have a directory at the end of the file giving the offset to each compressed member. We fetch the directory, and then the portion of the file containing the member or members of interest, for a maximum of 3 requests to retrieve any individual file in the archive. ```{eval-rst} .. automodule:: conda_package_streaming.lazy_wheel :members: :undoc-members: :show-inheritance: ``` conda-package-streaming-0.9.0/docs/modules.md000066400000000000000000000052441446221466000211200ustar00rootroot00000000000000# conda_package_streaming Fetch metadata from remote .conda or .tar.bz2 package. Try to fetch less than the whole file if possible. Zip (.conda) is made for this: ``` $ python -m conda_package_streaming.url https://repo.anaconda.com/pkgs/main/osx-64/sqlalchemy-1.4.32-py310hca72f7f_0.conda /tmp/ DEBUG:conda_package_streaming.lazy_wheel:bytes=-10240 DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): repo.anaconda.com:443 DEBUG:urllib3.connectionpool:https://repo.anaconda.com:443 "GET /pkgs/main/osx-64/sqlalchemy-1.4.32-py310hca72f7f_0.conda HTTP/1.1" 206 10240 DEBUG:conda_package_streaming.lazy_wheel:bytes=43-38176 DEBUG:urllib3.connectionpool:https://repo.anaconda.com:443 "GET /pkgs/main/osx-64/sqlalchemy-1.4.32-py310hca72f7f_0.conda HTTP/1.1" 206 38134 DEBUG:conda_package_streaming.lazy_wheel:prefetch 43-38177 $ curl -s -I https://repo.anaconda.com/pkgs/main/osx-64/sqlalchemy-1.4.32-py310hca72f7f_0.conda | grep content-length content-length: 1984926 ``` We fetch 10240 + 38134 = 48374 bytes in two requests of this 1984926-byte package. ## Older format bzip2 has a very large block size, and we don't know if the info/ directory is finished before reading the entire archive. However if we only want certain files from info/ we can stop after we've seen them all. Fetching repodata and calling response.raw.tell() after each tar member: ``` $ python -m metayaml.fetch_metadata \ https://repo.anaconda.com/pkgs/main/linux-64/absl-py-0.1.10-py27_0.tar.bz2 128948 info/hash_input.json 128948 info/index.json 128948 info/files 128948 info/about.json 128948 info/paths.json 128948 info/LICENSE.txt 128948 info/git 128948 lib/python2.7/site-packages/absl_py-0.1.10-py2.7.egg-info/dependency_links.txt 128948 lib/python2.7/site-packages/absl_py-0.1.10-py2.7.egg-info/requires.txt 128948 lib/python2.7/site-packages/absl_py-0.1.10-py2.7.egg-info/top_level.txt 128948 lib/python2.7/site-packages/absl/__init__.pyc 128948 lib/python2.7/site-packages/absl/testing/__init__.pyc 128948 info/test/run_test.py ... ``` A larger package: ``` # Fetch https://repo.anaconda.com/pkgs/main/linux-64/airflow-1.10.10-py36_0.tar.bz2 # Printing bytes transferred after each archive member, 286720 info/hash_input.json 286720 info/has_prefix 286720 info/index.json 286720 info/about.json 286720 info/git 286720 info/files 286720 info/paths.json 286720 lib/python3.6/site-packages/airflow/alembic.ini 286720 lib/python3.6/site-packages/airflow/www/templates/airflow/variables/README.md ... 286720 info/test/test_time_dependencies.json ... 634880 lib/python3.6/site-packages/airflow/www/static/ace.js 634880 bin/airflow ``` ```{toctree} :maxdepth: 4 url s3 lazy_wheel package_streaming extract transmute ``` conda-package-streaming-0.9.0/docs/package_streaming.rst000066400000000000000000000002451446221466000233200ustar00rootroot00000000000000package\_streaming module ========================= .. automodule:: conda_package_streaming.package_streaming :members: :undoc-members: :show-inheritance: conda-package-streaming-0.9.0/docs/s3.md000066400000000000000000000005261446221466000177730ustar00rootroot00000000000000s3 module ====================== conda_package_streaming.s3 adapts a s3 client, bucket name, and key to `LazyConda`, or, for `.tar.bz2`, a normal streaming `GET` request that can be closed before transferring the whole file. ```{eval-rst} .. automodule:: conda_package_streaming.s3 :members: :undoc-members: :show-inheritance: ``` conda-package-streaming-0.9.0/docs/transmute.rst000066400000000000000000000002131446221466000216710ustar00rootroot00000000000000transmute module ================ .. automodule:: conda_package_streaming.transmute :members: :undoc-members: :show-inheritance: conda-package-streaming-0.9.0/docs/url.rst000066400000000000000000000002051446221466000204520ustar00rootroot00000000000000url module ====================== .. automodule:: conda_package_streaming.url :members: :undoc-members: :show-inheritance: conda-package-streaming-0.9.0/make.bat000066400000000000000000000014441446221466000176010ustar00rootroot00000000000000@ECHO OFF pushd %~dp0 REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set SOURCEDIR=source set BUILDDIR=build %SPHINXBUILD% >NUL 2>NUL if errorlevel 9009 ( echo. echo.The 'sphinx-build' command was not found. Make sure you have Sphinx echo.installed, then set the SPHINXBUILD environment variable to point echo.to the full path of the 'sphinx-build' executable. Alternatively you echo.may add the Sphinx directory to PATH. echo. echo.If you don't have Sphinx installed, grab it from echo.https://www.sphinx-doc.org/ exit /b 1 ) if "%1" == "" goto help %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% goto end :help %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% :end popd conda-package-streaming-0.9.0/noxfile.py000066400000000000000000000003411446221466000202050ustar00rootroot00000000000000import nox @nox.session(venv_backend="conda") @nox.parametrize( "python", [(python) for python in ("3.7", "3.8", "3.9", "3.10")], ) def tests(session): session.install("-e", ".[test]") session.run("pytest") conda-package-streaming-0.9.0/pyproject.toml000066400000000000000000000024421446221466000211070ustar00rootroot00000000000000[tool.black] target-version = ["py38", "py39", "py310"] [tool.isort] profile = "black" [build-system] requires = ["flit_core >=3.2,<4"] build-backend = "flit_core.buildapi" [project] name = "conda_package_streaming" authors = [ { name = "Anaconda, Inc. & Contributors", email = "conda@continuum.io" }, ] description = "An efficient library to read from new and old format .conda and .tar.bz2 conda packages." license = { file = "LICENSE" } readme = "README.md" classifiers = [ "License :: OSI Approved :: BSD License", "Programming Language :: Python :: 3", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] dynamic = ["version"] requires-python = ">=3.7" dependencies = ["requests", "zstandard >=0.15"] [project.optional-dependencies] test = [ "pytest >=7", "pytest-cov", "pytest-mock", "boto3", "boto3-stubs[essential]", "bottle", "conda", "conda-package-handling >=2", ] docs = ["furo", "sphinx", "myst-parser", "mdit-py-plugins>=0.3.0"] [project.urls] Home = "https://github.com/conda/conda-package-streaming" Documentation = "https://conda.github.io/conda-package-streaming/" # pyproject.toml [tool.pytest.ini_options] minversion = "7.0" addopts = "--cov=conda_package_streaming" testpaths = ["tests"] conda-package-streaming-0.9.0/requirements.txt000066400000000000000000000002431446221466000214540ustar00rootroot00000000000000requests zstandard >=0.15 # test pytest >=7 pytest-cov pytest-mock boto3 boto3-stubs[essential] bottle conda # docs furo sphinx myst-parser mdit-py-plugins>=0.3.0 conda-package-streaming-0.9.0/tests/000077500000000000000000000000001446221466000173335ustar00rootroot00000000000000conda-package-streaming-0.9.0/tests/conftest.py000066400000000000000000000052101446221466000215300ustar00rootroot00000000000000import json import logging import os.path import shutil import subprocess from pathlib import Path import pytest import server from conda_package_streaming.transmute import transmute_tar_bz2 log = logging.getLogger(__name__) LIMIT_TEST_PACKAGES = 16 def find_packages_dirs() -> Path: """ Ask conda for package directories. """ conda_info = json.loads( subprocess.run( [os.environ["CONDA_EXE"], "info", "--json"], stdout=subprocess.PIPE, check=True, ).stdout ) # XXX can run individual environment's conda (base conda is more likely to # have useful cached packages) pkgs_dirs = conda_info["pkgs_dirs"] + [os.path.expanduser("~/miniconda3/pkgs")] log.debug("search %s", pkgs_dirs) first_pkg_dir = next(path for path in pkgs_dirs if os.path.exists(path)) return Path(first_pkg_dir) @pytest.fixture(scope="session") def pkgs_dir(tmp_path_factory): """ Dedicated test package directory. """ return tmp_path_factory.mktemp("pkgs") @pytest.fixture(scope="session") def package_server(pkgs_dir, conda_paths): thread = server.get_server_thread(pkgs_dir) thread.start() return thread @pytest.fixture(scope="session") def conda_paths(pkgs_dir: Path): found_packages = find_packages_dirs() conda_paths = [] for path in found_packages.iterdir(): if path.name.endswith((".tar.bz2", ".conda")): conda_paths.append(path) return add_tar_bz2s(conda_paths, pkgs_dir) def add_tar_bz2s(paths: list[Path], pkgs_dir: Path): """ If there aren't enough .tar.bz2's available, create some from available .conda's. Return paths. """ conda_paths: list[Path] = [] tarbz2_paths: list[Path] = [] output_paths: list[Path] = [] assert isinstance(pkgs_dir, Path) for path in paths: if path.name.endswith(".tar.bz2"): tarbz2_paths.append(path) elif path.name.endswith(".conda"): conda_paths.append(path) tarbz2_path: Path = pkgs_dir medium_conda_paths = [] for path in conda_paths: if 1 << 20 < path.stat().st_size < 1 << 22: medium_conda_paths.append(path) medium_conda_paths = medium_conda_paths[:LIMIT_TEST_PACKAGES] # this ignores existing .tar.bz2 for simplicity (.tar.bz2 is missing in CI) for conda in set(medium_conda_paths + conda_paths[:10]): shutil.copy(conda, tarbz2_path) transmute_tar_bz2(str(conda), tarbz2_path) output_paths.extend(tarbz2_path.glob("*.tar.bz2")) output_paths.extend(tarbz2_path.glob("*.conda")) return sorted(output_paths) # sort interleaves .tar.bz2 and .conda conda-package-streaming-0.9.0/tests/server.py000066400000000000000000000035101446221466000212120ustar00rootroot00000000000000""" Test web server. """ import logging import threading import wsgiref.simple_server from pathlib import Path from typing import Any import bottle import conftest log = logging.getLogger(__name__) def get_app(pkgs_dir): """ Bottle conveniently supports Range requests. Server may block if browser etc. keeps connection open. """ app = bottle.Bottle() app.pkgs_dir = pkgs_dir def serve_file(filename): mimetype = "auto" # from https://repo.anaconda.com/ behavior: if filename.endswith(".tar.bz2"): mimetype = "application/x-tar" elif filename.endswith(".conda"): mimetype = "binary/octet-stream" return bottle.static_file(filename, root=pkgs_dir, mimetype=mimetype) app.route("/pkgs/", "GET", serve_file) return app def selftest(): """ Run server in a thread that will die when the application exits. """ t = get_server_thread(conftest.find_packages_dirs()) t.start() import time time.sleep(300) class ServerThread(threading.Thread): server: wsgiref.simple_server.WSGIServer app: Any def get_server_thread(pkgs_dir: Path): """ Return test server thread with additional .server, .app properties. Call .start() to serve in the background. """ app = get_app(pkgs_dir) server = wsgiref.simple_server.make_server("127.0.0.1", 0, app) log.info(f"serving {app.pkgs_dir} on {server.server_address}/pkgs") t = ServerThread(daemon=True, target=server.serve_forever) t.app = app t.server = server # server.application == app return t if __name__ == "__main__": import logging logging.basicConfig( level=logging.INFO, format="%(asctime)s %(message)s", datefmt="%Y-%m-%dT%H:%M:%S", ) log.setLevel(logging.DEBUG) selftest() conda-package-streaming-0.9.0/tests/test_degraded.py000066400000000000000000000027731446221466000225140ustar00rootroot00000000000000""" Allow conda_package_streaming to work in .tar.bz2-only mode if zstandard is not available (please immediately install zstandard if this is the case). """ import importlib import sys import tarfile import zipfile from pathlib import Path import pytest def test_degraded(tmpdir): try: sys.modules["zstandard"] = None # type: ignore import conda_package_streaming.extract import conda_package_streaming.package_streaming importlib.reload(conda_package_streaming.package_streaming) testconda = Path(tmpdir, "testconda.conda") with zipfile.ZipFile(testconda, "w"): pass testtar = Path(tmpdir, "test.tar.bz2") with tarfile.open(testtar, "w:bz2") as tar: pass for ( tar, member, ) in conda_package_streaming.package_streaming.stream_conda_component(testtar): pass with pytest.raises(RuntimeError): for ( tar, member, ) in conda_package_streaming.package_streaming.stream_conda_component( testconda ): pass with pytest.raises(RuntimeError): conda_package_streaming.extract.extract(testconda, tmpdir) finally: sys.modules.pop("zstandard", None) import conda_package_streaming.package_streaming importlib.reload(conda_package_streaming.package_streaming) assert conda_package_streaming.package_streaming.zstandard conda-package-streaming-0.9.0/tests/test_extract.py000066400000000000000000000073621446221466000224260ustar00rootroot00000000000000import io import stat import tarfile from errno import ELOOP import pytest from conda_package_streaming import exceptions, extract, package_streaming MAX_CONDAS = 8 def test_extract_stream(conda_paths, tmp_path): for i, package in enumerate(conda_paths): print(package) with open(package, "rb") as fileobj: stream = package_streaming.stream_conda_component( package, fileobj, component=package_streaming.CondaComponent.pkg ) dest_dir = tmp_path / package.name extract.extract_stream(stream, dest_dir) if i >= MAX_CONDAS: break def test_extract_all(conda_paths, tmp_path): for i, package in enumerate(conda_paths): print(package) dest_dir = tmp_path / package.name extract.extract(package, dest_dir=dest_dir) if i >= MAX_CONDAS: break def empty_tarfile(name, mode=0o644): """ Return BytesIO containing a tarfile with one empty file named :name """ tar = io.BytesIO() t = tarfile.TarFile(mode="w", fileobj=tar) tarinfo = tarfile.TarInfo(name=name) tarinfo.mode = mode t.addfile(tarinfo, io.BytesIO()) t.close() tar.seek(0) return tar def test_oserror(tmp_path): """ Fail if tarfile raises OSError (formerly known as IOError) """ tar = empty_tarfile("empty-test") class TarELOOP(tarfile.TarFile): def extractall(self, path=None, members=None): raise OSError(ELOOP, "case sensitivity") class TarOSError(tarfile.TarFile): def extractall(self, path=None, members=None): raise OSError("not eloop") def stream(cls): yield (cls(fileobj=tar), tarfile.TarInfo()) with pytest.raises(exceptions.CaseInsensitiveFileSystemError): extract.extract_stream(stream(TarELOOP), tmp_path) with pytest.raises(OSError): extract.extract_stream(stream(TarOSError), tmp_path) def stream(fileobj): """ Like the tuples produced by part of conda-package-streaming. """ yield (package_streaming.TarfileNoSameOwner(fileobj=fileobj), tarfile.TarInfo()) def stream_stdlib(fileobj): """ Like the tuples produced by part of conda-package-streaming. """ yield (tarfile.TarFile(fileobj=fileobj), tarfile.TarInfo()) def test_slip(tmp_path): """ Fail if tarfile tries to put files outside its dest_dir (tmp_path) """ tar = empty_tarfile(name="../slip") with pytest.raises(exceptions.SafetyError): extract.extract_stream(stream(tar), tmp_path) tar2 = empty_tarfile(name="/absolute") with pytest.raises(exceptions.SafetyError): extract.extract_stream(stream(tar2), tmp_path) def test_chown(conda_paths, tmp_path, mocker): for package in conda_paths[:2]: print(package) with open(package, "rb") as fileobj: stream = package_streaming.stream_conda_component( package, fileobj, component=package_streaming.CondaComponent.pkg ) for tar, member in stream: assert isinstance(tar, package_streaming.TarfileNoSameOwner), tar break def test_umask(tmp_path, mocker): """ Demonstrate that umask-respecting tar implementation works. Mock umask in case it is different on your system. """ mocker.patch("conda_package_streaming.package_streaming.UMASK", new=0o22) tar3 = empty_tarfile(name="naughty_umask", mode=0o777) extract.extract_stream(stream_stdlib(tar3), tmp_path) mode = (tmp_path / "naughty_umask").stat().st_mode assert mode & stat.S_IWGRP, "%o" % mode tar3.seek(0) extract.extract_stream(stream(tar3), tmp_path) mode = (tmp_path / "naughty_umask").stat().st_mode assert not mode & stat.S_IWGRP, "%o" % mode conda-package-streaming-0.9.0/tests/test_s3.py000066400000000000000000000022001446221466000212630ustar00rootroot00000000000000import boto3 import pytest from conda_package_streaming import s3 LIMIT = 16 @pytest.fixture def s3_client(package_server): host, port = package_server.server.server_address client = boto3.client( "s3", aws_access_key_id="test_id", aws_secret_access_key="test_key", endpoint_url=f"http://{host}:{port}", use_ssl=False, verify=False, ) return client def test_head_objects(s3_client, conda_paths): bucket = "pkgs" # independent of filesystem path for path in conda_paths[:LIMIT]: s3_client.head_object(Bucket=bucket, Key=path.name) def test_stream_s3(s3_client, conda_paths): with pytest.raises(ValueError): next(s3.stream_conda_info(s3_client, "pkgs", "notaconda.rar")) for path in conda_paths[:LIMIT]: members = s3.stream_conda_info(s3_client, "pkgs", path.name) print("stream s3", path.name) for tar, member in members: if member.name == "info/index.json": members.close() # faster than waiting for gc? break else: pytest.fail("info/index.json not found") conda-package-streaming-0.9.0/tests/test_streaming.py000066400000000000000000000021411446221466000227330ustar00rootroot00000000000000import json import pytest from conda_package_streaming import package_streaming def test_package_streaming(conda_paths): for path in conda_paths: if str(path).endswith(".conda"): with pytest.raises(LookupError): package_streaming.stream_conda_component(path, component="notfound") with pytest.raises(ValueError): package_streaming.stream_conda_component("notapackage.rar") def test_early_exit(conda_paths): for package in conda_paths: print(package) stream = iter(package_streaming.stream_conda_info(package)) found = False for tar, member in stream: assert not found, "early exit did not work" if member.name == "info/index.json": reader = tar.extractfile(member) if reader: json.load(reader) found = True stream.close() # PEP 342 close() # stream_conda_info doesn't close a passed-in fileobj, but a # filename should be closed. assert found, f"index.json not found in {package}" conda-package-streaming-0.9.0/tests/test_transmute.py000066400000000000000000000102161446221466000227660ustar00rootroot00000000000000import contextlib import io import os import tarfile import time from pathlib import Path from zipfile import ZipFile import pytest from conda_package_handling.validate import validate_converted_files_match_streaming from conda_package_streaming.package_streaming import ( CondaComponent, stream_conda_component, ) from conda_package_streaming.transmute import transmute, transmute_tar_bz2 @pytest.fixture def testtar_bytes(): buffer = io.BytesIO() with tarfile.open("test.tar.bz2", "w:bz2", fileobj=buffer) as tar: symlink = tarfile.TarInfo(name="symlink") symlink.type = tarfile.LNKTYPE symlink.linkname = "target" tar.addfile(symlink) expected = tarfile.TarInfo(name="info/expected") tar.addfile(expected, io.BytesIO()) unexpected = tarfile.TarInfo(name="info/unexpected") tar.addfile(unexpected, io.BytesIO()) return buffer.getbuffer() @contextlib.contextmanager def timeme(message: str = ""): begin = time.time() yield end = time.time() print(f"{message}{end-begin:0.2f}s") def test_transmute(conda_paths: list[Path], tmpdir): tarbz_packages = [] for path in conda_paths: path = str(path) if path.endswith(".tar.bz2") and (1 << 20 < os.stat(path).st_size < 1 << 22): tarbz_packages = [path] conda_packages = [] # not supported assert tarbz_packages, "no medium-sized .tar.bz2 packages found" metadata_checks = 0 for packages in (conda_packages, tarbz_packages): for package in packages: with timeme(f"{package} took "): out = transmute(package, tmpdir) _, missing, mismatched = validate_converted_files_match_streaming( out, package, strict=True ) assert missing == mismatched == [] if out.name.endswith(".conda"): with ZipFile(out) as zf: metadata_checks += 1 assert "metadata.json" in zf.namelist() assert metadata_checks > 0 def test_transmute_symlink(tmpdir, testtar_bytes): testtar = Path(tmpdir, "test.tar.bz2") testtar.write_bytes(testtar_bytes) out = transmute(str(testtar), tmpdir) _, missing, mismatched = validate_converted_files_match_streaming( out, testtar, strict=True ) assert missing == mismatched == [] def test_transmute_info_filter(tmpdir, testtar_bytes): testtar = Path(tmpdir, "test.tar.bz2") testtar.write_bytes(testtar_bytes) transmute( str(testtar), tmpdir, is_info=lambda filename: filename == "info/expected" ) with open(Path(tmpdir, "test.conda"), "rb") as fileobj: for component, expected in (CondaComponent.info, {"info/expected"}), ( CondaComponent.pkg, { "info/unexpected", "symlink", }, ): items = stream_conda_component("test.conda", fileobj, component) assert {member.name for tar, member in items} == expected, items def test_transmute_backwards(tmpdir, conda_paths): tarbz_packages = [] for path in conda_paths: path = str(path) if path.endswith(".conda") and (1 << 20 < os.stat(path).st_size < 1 << 22): tarbz_packages = [path] conda_packages = [] # not supported assert tarbz_packages, "no medium-sized .conda packages found" for packages in (conda_packages, tarbz_packages): for package in packages: with timeme(f"{package} took "): out = transmute_tar_bz2(package, tmpdir) _, missing, mismatched = validate_converted_files_match_streaming( out, package, strict=True ) assert missing == mismatched == [] def test_transmute_tarbz2_to_tarbz2(tmpdir, testtar_bytes): testtar = Path(tmpdir, "test.tar.bz2") testtar.write_bytes(testtar_bytes) outdir = Path(tmpdir, "output") outdir.mkdir() out = transmute_tar_bz2(str(testtar), outdir) _, missing, mismatched = validate_converted_files_match_streaming( out, testtar, strict=True ) assert missing == mismatched == [] conda-package-streaming-0.9.0/tests/test_url.py000066400000000000000000000116341446221466000215530ustar00rootroot00000000000000import io import tempfile from contextlib import closing, contextmanager from pathlib import Path from zipfile import ZipFile import pytest from requests import HTTPError, Session from conda_package_streaming import lazy_wheel from conda_package_streaming.lazy_wheel import LazyConda from conda_package_streaming.url import ( conda_reader_for_url, extract_conda_info, stream_conda_info, ) LIMIT = 16 @pytest.fixture def package_url(package_server): """ Base url for all test packages. """ host, port = package_server.server.server_address return f"http://{host}:{port}/pkgs" @pytest.fixture def package_urls(package_server, package_url): pkgs_dir = Path(package_server.app.pkgs_dir) conda = [] tar_bz2 = [] for path in pkgs_dir.iterdir(): if len(conda) > LIMIT and len(tar_bz2) > LIMIT: break url = f"{package_url}/{path.name}" if path.name.endswith(".tar.bz2") and len(tar_bz2) < LIMIT: tar_bz2.append(url) elif path.name.endswith(".conda") and len(conda) < LIMIT: conda.append(url) # interleave urls = [] for pair in zip(conda, tar_bz2): urls.extend(pair) return urls def test_stream_url(package_urls): with pytest.raises(ValueError): next(stream_conda_info("https://localhost/notaconda.rar")) for url in package_urls: with closing(stream_conda_info(url)) as members: print("stream_url", url) for tar, member in members: if member.name == "info/index.json": break else: pytest.fail("info/index.json not found") def test_fetch_meta(package_urls): for url in package_urls: with tempfile.TemporaryDirectory() as destdir: extract_conda_info(url, destdir) def test_lazy_wheel(package_urls): lazy_tests = 7 for url in package_urls: if url.endswith(".conda"): # API works with `.tar.bz2` but only returns LazyConda for `.conda` filename, conda = conda_reader_for_url(url) assert filename == url.rsplit("/")[-1] with conda: assert isinstance(conda, LazyConda) assert conda.mode == "rb" assert conda.readable() assert not conda.writable() assert not conda.closed request_count = conda._request_count # did we really prefetch the info? zf = ZipFile(conda) # type: ignore filename = filename[: -len(".conda")] zf.open(f"info-{filename}.tar.zst").read() assert ( conda._request_count == request_count ), "info required extra GET request" assert conda._request_count <= 3 conda.prefetch("not-appearing-in-archive.txt") # zip will figure this out naturally; delete method? conda._check_zip() lazy_tests -= 1 if lazy_tests <= 0: break else: raise LookupError( "not enough .conda packages found %d %s" % (lazy_tests, package_urls) ) with pytest.raises(HTTPError): conda_reader_for_url(package_urls[0] + ".404.conda") class Session200(Session): def get(self, *args, **kwargs): response = super().get(*args, **kwargs) response.status_code = 200 return response with pytest.raises(lazy_wheel.HTTPRangeRequestUnsupported): LazyConda(package_urls[0], Session200()) for url in package_urls: if url.endswith(".tar.bz2"): LazyConda(url, Session())._check_zip() break else: raise LookupError("no .tar.bz2 packages found") def test_no_file_after_info(): """ If info is the last file, LazyConda must fetch (start of info file .. start of zip directory) instead of to the next file in the zip. """ class MockBytesIO(io.BytesIO): prefetch = LazyConda.prefetch @contextmanager def _stay(self): yield zip = MockBytesIO() zf = ZipFile(zip, "w") zf.writestr("info-test.tar.zst", b"00000000") # a short file zf.close() zip.prefetch("test") @pytest.mark.skip() def test_obsolete_lazy_wheel_selftest(): import logging import requests logging.basicConfig(level=logging.DEBUG) session = requests.Session() lzoh = lazy_wheel.LazyZipOverHTTP( "https://repodata.fly.dev/repo.anaconda.com/pkgs/main/win-32/current_repodata.jlap", session, ) lzoh.seek(1024) lzoh.read(768) lzoh.seek(0) # compare against regular fetch with open("outfile.txt", "wb+") as out: buf = b" " while buf: buf = lzoh.read(1024 * 10) print(list(zip(lzoh._left, lzoh._right)), lzoh._length) if not buf: break out.write(buf)