pax_global_header00006660000000000000000000000064147667770670014545gustar00rootroot0000000000000052 comment=66e9436ccea07262c80012ea1470f652d7c0c452 multiurl-0.3.5/000077500000000000000000000000001476677706700134275ustar00rootroot00000000000000multiurl-0.3.5/.github/000077500000000000000000000000001476677706700147675ustar00rootroot00000000000000multiurl-0.3.5/.github/ci-hpc-config.yml000066400000000000000000000000761476677706700201230ustar00rootroot00000000000000build: python: '3.10' modules: - ninja parallel: 64 multiurl-0.3.5/.github/workflows/000077500000000000000000000000001476677706700170245ustar00rootroot00000000000000multiurl-0.3.5/.github/workflows/ci.yml000066400000000000000000000024211476677706700201410ustar00rootroot00000000000000name: ci on: # Trigger the workflow on push to master or develop, except tag creation push: branches: - 'main' - 'develop' tags-ignore: - '**' # Trigger the workflow on pull request pull_request: ~ # Trigger the workflow manually workflow_dispatch: ~ # Trigger after public PR approved for CI pull_request_target: types: [labeled] jobs: # Run CI including downstream packages on self-hosted runners downstream-ci: name: downstream-ci if: ${{ !github.event.pull_request.head.repo.fork && github.event.action != 'labeled' || github.event.label.name == 'approved-for-ci' }} uses: ecmwf-actions/downstream-ci/.github/workflows/downstream-ci.yml@main with: multiurl: ecmwf/multiurl@${{ github.event.pull_request.head.sha || github.sha }} codecov_upload: true secrets: inherit # Build downstream packages on HPC downstream-ci-hpc: name: downstream-ci-hpc if: ${{ !github.event.pull_request.head.repo.fork && github.event.action != 'labeled' || github.event.label.name == 'approved-for-ci' }} uses: ecmwf-actions/downstream-ci/.github/workflows/downstream-ci-hpc.yml@main with: multiurl: ecmwf/multiurl@${{ github.event.pull_request.head.sha || github.sha }} secrets: inherit multiurl-0.3.5/.github/workflows/python-publish.yml000066400000000000000000000042551476677706700225420ustar00rootroot00000000000000# This workflow will upload a Python Package using Twine when a release is created # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries name: Upload Python Package on: push: branches: - main pull_request: branches: - main release: types: [created] jobs: quality: name: Code QA runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - run: pip install black flake8 isort - run: black --version - run: isort --version - run: flake8 --version - run: isort --check . - run: black --check . - run: flake8 . checks: strategy: fail-fast: false matrix: platform: ["ubuntu-latest", "macos-latest", "windows-latest"] python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] name: Python ${{ matrix.python-version }} on ${{ matrix.platform }} runs-on: ${{ matrix.platform }} needs: quality steps: - uses: actions/checkout@v2 - uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install run: | pip install pytest pip install pytest-localftpserver pip install -e . pip freeze - name: Tests run: pytest deploy: if: ${{ github.event_name == 'release' }} runs-on: ubuntu-latest needs: checks steps: - uses: actions/checkout@v2 - name: Set up Python uses: actions/setup-python@v2 with: python-version: '3.x' - name: Install dependencies run: | python -m pip install --upgrade pip pip install setuptools build twine - name: Check that tag version matches code version run: | tag=${GITHUB_REF#refs/tags/} version=$(python setup.py --version) echo 'tag='$tag echo "version file="$version test "$tag" == "$version" - name: Build and publish env: TWINE_USERNAME: __token__ TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} run: | python -m build twine upload dist/* multiurl-0.3.5/.gitignore000066400000000000000000000035331476677706700154230ustar00rootroot00000000000000# Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # Other *.swp test.py .vscode/ target data *.download *.data *.bufr *.grib *.grib2 multiurl-0.3.5/LICENSE000066400000000000000000000261351476677706700144430ustar00rootroot00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. multiurl-0.3.5/README.md000066400000000000000000000026261476677706700147140ustar00rootroot00000000000000# multiurl A package to download several URL as one, as well as supporting multi-part URLs ## Simple example ```python from multiurl import download download(url="http://example.com/test.data", target="data.file") ``` ## Download from two URLs into one file ```python from multiurl import download download(url=["http://example.com/test1.data", "http://example.com/test2.data"], target="data.file") ``` URLs types can be mixed: ```python from multiurl import download download(url=["http://example.com/test1.data", "ftp://example.com/test2.data"], target="data.file") ``` ## Download parts of URLs Provide parts of URLs as a list of `(offset, length)` tuples, expressed in bytes. ```python from multiurl import download download(url="http://example.com/test.data", parts = [(0, 10), (40, 10), (60, 10)], target="data.file") ``` ## Download parts of URLs form several URLs ```python from multiurl import download download(url=[("http://example.com/test1.data", [(0, 10), (40, 10), (60, 10)]), ("http://example.com/test2.data", [(0, 10), (40, 10), (60, 10)])], target="data.file") ``` ### License [Apache License 2.0](LICENSE) In applying this licence, ECMWF does not waive the privileges and immunities granted to it by virtue of its status as an intergovernmental organisation nor does it submit to any jurisdiction. multiurl-0.3.5/multiurl/000077500000000000000000000000001476677706700153045ustar00rootroot00000000000000multiurl-0.3.5/multiurl/__init__.py000066400000000000000000000010101476677706700174050ustar00rootroot00000000000000# (C) Copyright 2021 ECMWF. # # This software is licensed under the terms of the Apache Licence Version 2.0 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. # In applying this licence, ECMWF does not waive the privileges and immunities # granted to it by virtue of its status as an intergovernmental organisation # nor does it submit to any jurisdiction. # from .downloader import Downloader, download, robust __version__ = "0.3.5" __all__ = [ "download", "Downloader", "robust", ] multiurl-0.3.5/multiurl/base.py000066400000000000000000000075101476677706700165730ustar00rootroot00000000000000# (C) Copyright 2021 ECMWF. # # This software is licensed under the terms of the Apache Licence Version 2.0 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. # In applying this licence, ECMWF does not waive the privileges and immunities # granted to it by virtue of its status as an intergovernmental organisation # nor does it submit to any jurisdiction. # import logging import os LOG = logging.getLogger(__name__) def _ignore(*args, **kwargs): pass class NoBar: def __init__(self, *args, **kwargs): pass def __enter__(self): return self def __exit__(self, *args, **kwargs): pass def update(self, *args, **kwargs): pass def close(self, *args, **kwargs): pass def progress_bar(total, initial=0, desc=None): try: # There is a bug in tqdm that expects ipywidgets # to be installed if running in a notebook import ipywidgets # noqa F401 from tqdm.auto import tqdm # noqa F401 except ImportError: try: from tqdm import tqdm # noqa F401 except ImportError: tqdm = NoBar return tqdm( total=total, initial=initial, unit_scale=True, unit_divisor=1024, unit="B", disable=False, leave=False, desc=desc, ) class DownloaderBase: def __init__( self, url, chunk_size=1024 * 1024, timeout=None, parts=None, observer=_ignore, statistics_gatherer=_ignore, progress_bar=progress_bar, resume_transfers=False, override_target_file=True, download_file_extension=None, auth=None, **kwargs, ): self.url = url self.chunk_size = chunk_size self.timeout = timeout self.parts = parts self.observer = observer self.statistics_gatherer = statistics_gatherer self.progress_bar = progress_bar self.resume_transfers = resume_transfers self.override_target_file = override_target_file self.download_file_extension = download_file_extension self.auth = auth def mutate(self, *args, **kwargs): return self def local_path(self): return None def extension(self, url=None): if url is None: url = self.url url_no_args = url.split("?")[0] base = os.path.basename(url_no_args) extensions = [] while True: base, ext = os.path.splitext(base) if not ext: break extensions.append(ext) if not extensions: extensions.append(".unknown") return "".join(reversed(extensions)) def download(self, target): if os.path.exists(target) and not self.override_target_file: return if self.download_file_extension is not None: download = target + ".download" else: download = target LOG.info("Downloading %s", self.url) size, mode, skip, trust_size = self.estimate_size(download) with self.progress_bar( total=size, initial=skip, desc=self.title(), ) as pbar: with open(download, mode) as f: total = self.transfer(f, pbar) pbar.close() if trust_size and size is not None: assert ( os.path.getsize(download) == size ), f"File size mismatch {os.path.getsize(download)} bytes instead of {size}" if download != target: os.rename(download, target) self.finalise() return total def finalise(self): pass def title(self): return os.path.basename(self.url) def cache_data(self): return None def out_of_date(self, path, cache_data): return False multiurl-0.3.5/multiurl/downloader.py000066400000000000000000000061331476677706700200170ustar00rootroot00000000000000# (C) Copyright 2021 ECMWF. # # This software is licensed under the terms of the Apache Licence Version 2.0 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. # In applying this licence, ECMWF does not waive the privileges and immunities # granted to it by virtue of its status as an intergovernmental organisation # nor does it submit to any jurisdiction. # import logging import os from pathlib import Path from urllib.parse import urlparse from .file import FullFileDownloader, PartFileDownloader from .ftp import FullFTPDownloader, PartFTPDownloader from .heuristics import Part from .http import FullHTTPDownloader, PartHTTPDownloader, robust from .multipart import compress_parts LOG = logging.getLogger(__name__) __all__ = ["Downloader", "download", "robust"] DOWNLOADERS = { ("ftp", False): FullFTPDownloader, ("ftp", True): PartFTPDownloader, ("http", False): FullHTTPDownloader, ("http", True): PartHTTPDownloader, ("https", False): FullHTTPDownloader, ("https", True): PartHTTPDownloader, ("file", False): FullFileDownloader, ("file", True): PartFileDownloader, } def _ensure_scheme(url): o = urlparse(url) if not o.scheme or (len(o.scheme) == 1 and not o.netloc): path = Path(url) if not path.is_absolute(): path = Path(os.path.abspath(path)) url = path.as_uri() return url def _ensure_parts(parts): if parts is None: return None parts = [Part(offset, length) for offset, length in parts] if len(parts) == 0: return None return parts def _canonicalize(url, **kwargs): if not isinstance(url, (list, tuple)): url = [url] result = [] if isinstance(url[0], (list, tuple)): assert "parts" not in kwargs for u, p in url: result.append((_ensure_scheme(u), _ensure_parts(p))) else: p = _ensure_parts(kwargs.pop("parts", None)) for u in url: result.append((_ensure_scheme(u), p)) urls_and_parts = [] # Break into ascending order if needed for url, parts in result: if parts is None: urls_and_parts.append((url, None)) continue last = 0 newparts = [] for p in parts: if p.offset < last: if newparts: urls_and_parts.append((url, compress_parts(newparts))) newparts = [] newparts.append(p) last = p.offset urls_and_parts.append((url, compress_parts(newparts))) return urls_and_parts, kwargs def Downloader(url, **kwargs): urls, kwargs = _canonicalize(url, **kwargs) downloaders = [] for url, parts in urls: o = urlparse(url) klass = DOWNLOADERS[(o.scheme, parts is not None)] downloaders.append( klass(url, parts=parts, **kwargs).mutate(url, parts=parts, **kwargs) ) if len(downloaders) == 1: return downloaders[0] from .multiurl import MultiDownloader return MultiDownloader(downloaders) def download(url, target, **kwargs): return Downloader(url, **kwargs).download(target) multiurl-0.3.5/multiurl/file.py000066400000000000000000000050031476677706700165730ustar00rootroot00000000000000# (C) Copyright 2021 ECMWF. # # This software is licensed under the terms of the Apache Licence Version 2.0 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. # In applying this licence, ECMWF does not waive the privileges and immunities # granted to it by virtue of its status as an intergovernmental organisation # nor does it submit to any jurisdiction. # import logging import os import sys from urllib.parse import urlparse from .base import DownloaderBase LOG = logging.getLogger(__name__) class FileDownloaderBase(DownloaderBase): def __init__(self, url, **kwargs): super().__init__(url, **kwargs) o = urlparse(self.url) path = o.path if sys.platform == "win32" and self.url.startswith("file://"): # this is because urllib does not decode # 'file://C:\Users\name\climetlab\docs\examples\test.nc' # as expected. path = self.url[len("file://") :] if sys.platform == "win32" and path[0] == "/" and path[2] == ":": path = path[1:] self.path = path class FullFileDownloader(FileDownloaderBase): def local_path(self): return self.path def __repr__(self): return f"FullFileDownloader({self.path})" def estimate_size(self, target): # TODO: resume transfers size = os.path.getsize(self.path) return (size, "wb", 0, True) def transfer(self, f, pbar): total = 0 with open(self.path, "rb") as g: while True: chunk = g.read(self.chunk_size) if not chunk: break f.write(chunk) pbar.update(len(chunk)) total += len(chunk) return total class PartFileDownloader(FileDownloaderBase): def __repr__(self): return f"PartFileDownloader({self.path, self.parts})" def estimate_size(self, target): parts = self.parts size = sum(p.length for p in parts) return (size, "wb", 0, True) def transfer(self, f, pbar): with open(self.path, "rb") as g: total = 0 for offset, length in self.parts: g.seek(offset) self.observer() while length > 0: chunk = g.read(min(length, self.chunk_size)) assert chunk f.write(chunk) length -= len(chunk) total += len(chunk) pbar.update(len(chunk)) return total multiurl-0.3.5/multiurl/ftp.py000066400000000000000000000042231476677706700164500ustar00rootroot00000000000000# (C) Copyright 2021 ECMWF. # # This software is licensed under the terms of the Apache Licence Version 2.0 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. # In applying this licence, ECMWF does not waive the privileges and immunities # granted to it by virtue of its status as an intergovernmental organisation # nor does it submit to any jurisdiction. # import logging import os from ftplib import FTP, error_perm from urllib.parse import urlparse from .base import DownloaderBase LOG = logging.getLogger(__name__) class FTPDownloaderBase(DownloaderBase): def __init__(self, url, **kwargs): super().__init__(url, **kwargs) def estimate_size(self, target): url_object = urlparse(self.url) assert url_object.scheme == "ftp" user, password = url_object.username, url_object.password ftp = FTP(timeout=self.timeout) connect_kwargs = {} if url_object.port is not None: connect_kwargs["port"] = url_object.port ftp.connect(host=url_object.hostname, **connect_kwargs) ftp.login(user, password) ftp.cwd(os.path.dirname(url_object.path)) ftp.set_pasv(True) self.filename = os.path.basename(url_object.path) self.ftp = ftp try: return (ftp.size(self.filename), "wb", 0, True) except error_perm: return (-1, "wb", True, False) def transfer(self, f, pbar): total = 0 def callback(chunk): nonlocal total self.observer() f.write(chunk) total += len(chunk) pbar.update(len(chunk)) self.ftp.retrbinary(f"RETR {self.filename}", callback) def finalise(self): self.ftp.close() class FullFTPDownloader(FTPDownloaderBase): def __repr__(self): return f"FullFTPDownloader({self.url})" class PartFTPDownloader(FTPDownloaderBase): def __init__(self, url, **kwargs): # If needed, that can be implemented with the PartFilter raise NotImplementedError("Part FTPDownloader is not yet implemented") def __repr__(self): return f"PartFTPDownloader({self.url, self.parts})" multiurl-0.3.5/multiurl/heuristics.py000066400000000000000000000106001476677706700200350ustar00rootroot00000000000000# (C) Copyright 2021 ECMWF. # # This software is licensed under the terms of the Apache Licence Version 2.0 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. # In applying this licence, ECMWF does not waive the privileges and immunities # granted to it by virtue of its status as an intergovernmental organisation # nor does it submit to any jurisdiction. # import logging import re from collections import namedtuple LOG = logging.getLogger(__name__) Part = namedtuple("Part", ["offset", "length"]) def round_down(a, b): return (a // b) * b def round_up(a, b): return ((a + b - 1) // b) * b class HierarchicalClustering: def __init__(self, min_clusters=5): self.min_clusters = min_clusters def __call__(self, parts): clusters = [p for p in parts] while len(clusters) > self.min_clusters: min_dist = min( clusters[i].offset - clusters[i - 1].offset + clusters[i - 1].length for i in range(1, len(clusters)) ) i = 1 while i < len(clusters): d = clusters[i].offset - clusters[i - 1].offset + clusters[i - 1].length if d <= min_dist: clusters[i - 1] = Part( clusters[i - 1].offset, clusters[i].offset + clusters[i].length - clusters[i - 1].offset, ) clusters.pop(i) else: i += 1 return clusters def __repr__(self): return f"cluster({self.min_clusters})" class BlockGrouping: def __init__(self, block_size): self.block_size = block_size def __call__(self, parts): blocks = [] last_block_offset = -1 last_offset = 0 for offset, length in parts: assert offset >= last_offset block_offset = round_down(offset, self.block_size) block_length = round_up(offset + length, self.block_size) - block_offset if block_offset <= last_block_offset: prev_offset, prev_length = blocks.pop() end_offset = block_offset + block_length prev_end_offset = prev_offset + prev_length block_offset = min(block_offset, prev_offset) assert block_offset == prev_offset block_length = max(end_offset, prev_end_offset) - block_offset blocks.append(Part(block_offset, block_length)) last_block_offset = block_offset + block_length last_offset = offset + length return blocks def __repr__(self): return f"blocked({self.block_size})" class Automatic: def __call__(self, parts): smallest = min(x.length for x in parts) range_method = round_up(max(x.length for x in parts), 1024) while range_method >= smallest: blocks = BlockGrouping(range_method)(parts) range_method //= 2 # Max number of parts return blocks def __repr__(self): return "auto" class Pipe: def __init__(self, first, second): self.first = first self.second = second def __call__(self, parts): return self.first(self.second(parts)) def __repr__(self): return f"{self.second}|{self.first}" class Debug: def __call__(self, parts): print("DEBUG", parts) return parts def __repr__(self): return "debug" HEURISTICS = { "auto": Automatic, "cluster": HierarchicalClustering, "blocked": BlockGrouping, "debug": Debug, } def parts_heuristics(method, statistics_gatherer): if isinstance(method, int): return BlockGrouping(method) result = None for name in method.split("|"): if "(" in name: m = re.match(r"(.+)\((.+)\)", name) name = m.group(1) args = [] for a in m.group(2).split(","): try: args.append(int(a)) except ValueError: args.append(float(a)) else: args = [] obj = HEURISTICS[name](*args) if result is None: result = obj else: result = Pipe(obj, result) statistics_gatherer( "parts-heuristics", full_method=str(method), method_args=args, ) return result multiurl-0.3.5/multiurl/http.py000066400000000000000000000407341476677706700166450ustar00rootroot00000000000000# (C) Copyright 2021 ECMWF. # # This software is licensed under the terms of the Apache Licence Version 2.0 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. # In applying this licence, ECMWF does not waive the privileges and immunities # granted to it by virtue of its status as an intergovernmental organisation # nor does it submit to any jurisdiction. # import datetime import json import logging import os import random import time import pytz import requests from dateutil.parser import parse as parse_date from .base import DownloaderBase from .multipart import DecodeMultipart, PartFilter, compute_byte_ranges LOG = logging.getLogger(__name__) class ServerCapabilities: def __init__(self, accept_ranges, accept_multiple_ranges): self.accept_ranges = accept_ranges self.accept_multiple_ranges = accept_multiple_ranges def NoFilter(x): return x def parse_separated_header(value: str): """Adapted from https://peps.python.org/pep-0594/#cgi.""" from email.message import Message m = Message() m["content-type"] = value return dict(m.get_params()) class HTTPDownloaderBase(DownloaderBase): def __init__( self, url, verify=True, http_headers=None, fake_headers=None, range_method=None, maximum_retries=500, retry_after=120, mirrors=None, session=None, **kwargs, ): super().__init__(url, **kwargs) self._headers = None self._url = None self.http_headers = http_headers if http_headers else {} self.verify = verify self.fake_headers = fake_headers self.range_method = range_method self.retry_after = retry_after self.maximum_retries = maximum_retries self.mirrors = mirrors self.session = requests if session is None else session def headers(self): if self._headers is None or self.url != self._url: self._url = self.url self._headers = {} if self.fake_headers is not None: self._headers = dict(**self.fake_headers) else: try: r = self.robust(self.session.head)( self.url, headers=self.http_headers, verify=self.verify, timeout=self.timeout, allow_redirects=True, auth=self.auth, ) r.raise_for_status() for k, v in r.headers.items(): self._headers[k.lower()] = v LOG.debug( "HTTP headers %s", json.dumps(self._headers, sort_keys=True, indent=4), ) except Exception: self._url = None self._headers = {} if LOG.level == logging.DEBUG: LOG.exception("HEAD %s", self.url) LOG.error("Ignoring HEAD exception.") return self._headers def extension(self): ext = super().extension() if ext == ".unknown": # Only check for "content-disposition" if # the URL does not end with an extension # so we avoid fetching the headers unesseraly headers = self.headers() if "content-disposition" in headers: params = parse_separated_header(headers["content-disposition"]) assert "attachment" in params, params if "filename" in params: ext = super().extension(params["filename"]) return ext def title(self): headers = self.headers() if "content-disposition" in headers: params = parse_separated_header(headers["content-disposition"]) if "filename" in params: return params["filename"] return super().title() def transfer(self, f, pbar): total = 0 start = time.time() stream = self.make_stream() for chunk in stream(chunk_size=self.chunk_size): self.observer() if chunk: f.write(chunk) total += len(chunk) pbar.update(len(chunk)) self.statistics_gatherer( "transfer", url=self.url, total=total, elapsed=time.time() - start, ) return total def cache_data(self): return self.headers() def out_of_date(self, path, cache_data): if cache_data is not None: # TODO: check 'cache-control' to see if we should check the etag if "cache-control" in cache_data: pass if "expires" in cache_data: if cache_data["expires"] != "0": # HTTP1.0 legacy try: expires = parse_date(cache_data["expires"]) now = pytz.UTC.localize(datetime.datetime.utcnow()) if expires > now: LOG.debug( "URL %s not expired (%s > %s)", self.url, expires, now ) return False except Exception: LOG.exception( "Failed to check URL expiry date '%s'", cache_data["expires"], ) try: headers = self.headers() except requests.exceptions.ConnectionError: return False cached_etag = cache_data.get("etag") remote_etag = headers.get("etag") if cached_etag != remote_etag and remote_etag is not None: LOG.warning("Remote content of URL %s has changed", self.url) return True else: LOG.debug("Remote content of URL %s unchanged", self.url) return False def check_for_restarts(self, target): if not self.resume_transfers: return 0 if not os.path.exists(target): return 0 # Check if we can restarts the transfer # TODO: check etags... the file may have changed since bytes = os.path.getsize(target) if bytes > 0: headers = self.headers() if headers.get("accept-ranges") != "bytes": LOG.warning( "%s: %s bytes already download, but server does not support restarts", target, bytes, ) return 0 LOG.info( "%s: resuming download from byte %s", target, bytes, ) return bytes def issue_request(self, bytes_ranges=None): headers = {} headers.update(self.http_headers) if bytes_ranges is not None: headers["range"] = bytes_ranges LOG.debug("Issue request for %s", self.url) LOG.debug("Headers: %s", json.dumps(headers, indent=4, sort_keys=True)) r = self.robust(self.session.get)( self.url, stream=True, verify=self.verify, timeout=self.timeout, headers=headers, auth=self.auth, ) try: r.raise_for_status() except Exception as e: if ( isinstance(e, requests.HTTPError) and e.response is not None and e.response.status_code == requests.codes.not_found ): raise # Keep quiet on 404s LOG.error("URL %s: %s", self.url, r.text) raise return r def robust(self, call): return robust(call, self.maximum_retries, self.retry_after, self.mirrors) class FullHTTPDownloader(HTTPDownloaderBase): def __repr__(self): return f"FullHTTPDownloader({self.url})" def estimate_size(self, target): assert self.parts is None size = None mode = "wb" skip = 0 headers = self.headers() if "content-length" in headers: try: size = int(headers["content-length"]) except Exception: LOG.exception("content-length %s", self.url) # content-length is the size of the encoded body # so we cannot rely on it to check the file size trust_size = size is not None and headers.get("content-encoding") is None # Check if we can restarts the transfer self.range = None bytes = self.check_for_restarts(target) if bytes > 0: assert size is None or bytes < size, (bytes, size, self.url, target) skip = bytes mode = "ab" self.range = f"bytes={bytes}-" LOG.debug( "url estimate_size size=%s mode=%s skip=%s trust_size=%s", size, mode, skip, trust_size, ) return (size, mode, skip, trust_size) def make_stream(self): request = self.issue_request(self.range) return request.iter_content class ServerDoesNotSupportPartsHTTPDownloader(HTTPDownloaderBase): def __repr__(self): return f"ServerDoesNotSupportPartsHTTPDownloader({self.url, self.parts})" def estimate_size(self, target): size = sum(p.length for p in self.parts) return (size, "wb", 0, True) def make_stream(self): request = self.issue_request() return PartFilter(self.parts)(request.iter_content) class SinglePartHTTPDownloader(HTTPDownloaderBase): def __repr__(self): return f"SinglePartHTTPDownloader({self.url, self.parts})" def estimate_size(self, target): assert len(self.parts) == 1 offset, length = self.parts[0] start = offset end = offset + length - 1 bytes = self.check_for_restarts(target) if bytes > 0: start += bytes skip = bytes mode = "ab" else: skip = 0 mode = "wb" self.bytes_range = f"bytes={start}-{end}" size = sum(p.length for p in self.parts) return (size, mode, skip, True) def make_stream(self): request = self.issue_request(self.bytes_range) return request.iter_content class PartHTTPDownloader(HTTPDownloaderBase): _server_capabilities = None def __repr__(self): return f"PartHTTPDownloader({self.url, self.parts})" @property def server_capabilities(self): if self._server_capabilities is None: self._server_capabilities = ServerCapabilities( accept_ranges=False, accept_multiple_ranges=None, ) headers = self.headers() if headers.get("accept-ranges") == "bytes": self._server_capabilities.accept_ranges = True # Special case for Azure # The server does not announce byte-range support, but supports it # The server will ignore multiple ranges and return everything # https://docs.microsoft.com/en-us/rest/api/storageservices/specifying-the-range-header-for-blob-service-operations if headers.get("server", "unknown").startswith("Windows-Azure-Blob"): self._server_capabilities = ServerCapabilities( accept_ranges=True, accept_multiple_ranges=False, ) # Special case for AWS # The server will ignore multiple ranges and return everything if headers.get("server", "unknown").startswith("AmazonS3"): self._server_capabilities = ServerCapabilities( accept_ranges=True, accept_multiple_ranges=False, ) return self._server_capabilities def mutate(self, *args, **kwargs): if not self.server_capabilities.accept_ranges: LOG.warning( "Server for %s does not support byte ranges, downloading whole file", self.url, ) return ServerDoesNotSupportPartsHTTPDownloader(*args, **kwargs) if len(self.parts) == 1: # Special case, we let HTTP to its job, so we can resume transfers if needed return SinglePartHTTPDownloader(*args, **kwargs) return self def split_large_requests(self, parts): ranges = [] for offset, length in parts: ranges.append(f"{offset}-{offset+length-1}") # Nginx default is 4K # https://stackoverflow.com/questions/686217/maximum-on-http-header-values bytes_range = f"bytes={','.join(ranges)}" if len(bytes_range) <= 4000: return [(bytes_range, parts)] middle = len(parts) // 2 return self.split_large_requests(parts[:middle]) + self.split_large_requests( parts[middle:] ) def estimate_size(self, target): size = sum(p.length for p in self.parts) return (size, "wb", 0, True) def make_stream(self): # TODO: implement transfer restarts by trimming the list of parts filter = NoFilter parts = self.parts if self.range_method: rounded, positions = compute_byte_ranges( self.parts, self.range_method, self.url, self.statistics_gatherer, ) filter = PartFilter(self.parts, positions) parts = rounded splits = self.split_large_requests(parts) accept_multiple_ranges = self.server_capabilities.accept_multiple_ranges def iterate_requests(chunk_size): for bytes_ranges, parts in splits: if accept_multiple_ranges is False: request = self.issue_request(bytes_ranges.split(",")[0]) else: request = self.issue_request(bytes_ranges) stream = DecodeMultipart( self.url, request, parts, verify=self.verify, timeout=self.timeout, headers=self.http_headers, ) yield from stream(chunk_size) return filter(iterate_requests) RETRIABLE = ( requests.codes.internal_server_error, requests.codes.bad_gateway, requests.codes.service_unavailable, requests.codes.gateway_timeout, requests.codes.too_many_requests, requests.codes.request_timeout, ) def robust(call, maximum_tries=500, retry_after=120, mirrors=None): def retriable(code): return code in RETRIABLE def wrapped(url, *args, **kwargs): tries = 0 main_url = url while True: tries += 1 if tries >= maximum_tries: # Last attempt, don't do anything return call(main_url, *args, **kwargs) try: r = call(main_url, *args, **kwargs) except requests.exceptions.SSLError: raise except ( requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout, ) as e: r = None LOG.warning( "Recovering from connection error [%s], attempt %s of %s", e, tries, maximum_tries, ) if r is not None: if not retriable(r.status_code): return r LOG.warning( "Recovering from HTTP error [%s %s], attempt %s of %s", r.status_code, r.reason, tries, maximum_tries, ) alternate = None replace = 0 if mirrors is not None: for key, values in mirrors.items(): if url.startswith(key): alternate = values replace = len(key) if not isinstance(alternate, (list, tuple)): alternate = [alternate] if alternate is not None: mirror = random.choice(alternate) LOG.warning("Retrying using mirror %s", mirror) main_url = f"{mirror}{url[replace:]}" else: LOG.warning("Retrying in %s seconds", retry_after) time.sleep(retry_after) LOG.info("Retrying now...") return wrapped multiurl-0.3.5/multiurl/multipart.py000066400000000000000000000231441476677706700177030ustar00rootroot00000000000000# (C) Copyright 2021 ECMWF. # # This software is licensed under the terms of the Apache Licence Version 2.0 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. # In applying this licence, ECMWF does not waive the privileges and immunities # granted to it by virtue of its status as an intergovernmental organisation # nor does it submit to any jurisdiction. # import logging import re import requests from .heuristics import Part, parts_heuristics LOG = logging.getLogger(__name__) # S3 does not support multiple ranges class S3Streamer: def __init__(self, url, request, parts, headers, **kwargs): self.url = url self.parts = parts self.request = request self.headers = dict(**headers) self.kwargs = kwargs def __call__(self, chunk_size): # See https://docs.aws.amazon.com/AmazonS3/latest/API/API_GetObject.html headers = dict(**self.headers) # TODO: add assertions for i, part in enumerate(self.parts): if i == 0: request = self.request else: offset, length = part headers["range"] = f"bytes={offset}-{offset+length-1}" request = requests.get( self.url, stream=True, headers=headers, **self.kwargs, ) try: request.raise_for_status() except Exception: LOG.error("URL %s: %s", self.url, request.text) raise header = request.headers bytes = header["content-range"] LOG.debug("HEADERS %s", header) m = re.match(r"^bytes (\d+)d?-(\d+)d?/(\d+)d?$", bytes) assert m, header start, end, total = int(m.group(1)), int(m.group(2)), int(m.group(3)) assert end >= start assert start < total assert end < total assert start == part.offset, (bytes, part) # (end + 1 == total) means that we overshoot the end of the file, # this happens when we round transfer blocks assert (end == part.offset + part.length - 1) or (end + 1 == total), ( bytes, part, ) yield from request.iter_content(chunk_size) class MultiPartStreamer: def __init__(self, url, request, parts, boundary, **kwargs): self.request = request self.size = None if "content-length" in request.headers: self.size = int(request.headers["content-length"]) self.encoding = "utf-8" self.parts = parts self.boundary = boundary def __call__(self, chunk_size): from email.parser import HeaderParser from requests.structures import CaseInsensitiveDict header_parser = HeaderParser() marker = f"--{self.boundary}\r\n".encode(self.encoding) end_header = b"\r\n\r\n" end_data = b"\r\n" end_of_input = f"--{self.boundary}--\r\n".encode(self.encoding) if chunk_size < len(end_data): chunk_size = len(end_data) iter_content = self.request.iter_content(chunk_size) chunk = next(iter_content) # Some servers start with \r\n if chunk[:2] == end_data: chunk = chunk[2:] LOG.debug("MARKER %s", marker) part = 0 while True: while len(chunk) < max(len(marker), len(end_of_input)): more = next(iter_content) assert more is not None chunk += more if chunk.find(end_of_input) == 0: assert part == len(self.parts) break pos = chunk.find(marker) assert pos == 0, (pos, marker, chunk) chunk = chunk[pos + len(marker) :] while True: pos = chunk.find(end_header) LOG.debug("FIND %s %s", end_header, chunk[:80]) if pos != -1: break more = next(iter_content) assert more is not None chunk += more assert len(chunk) < 1024 * 1024 pos += len(end_header) header = chunk[:pos].decode(self.encoding) header = CaseInsensitiveDict(header_parser.parsestr(header)) chunk = chunk[pos:] # kind = header["content-type"] bytes = header["content-range"] LOG.debug("HEADERS %s", header) m = re.match(r"^bytes (\d+)d?-(\d+)d?/(\d+)d?$", bytes) assert m, header start, end, total = int(m.group(1)), int(m.group(2)), int(m.group(3)) assert end >= start assert start < total assert end < total size = end - start + 1 assert start == self.parts[part].offset # (end + 1 == total) means that we overshoot the end of the file, # this happens when we round transfer blocks assert (end == self.parts[part].offset + self.parts[part].length - 1) or ( end + 1 == total ), (bytes, self.parts[part]) while size > 0: if len(chunk) >= size: yield chunk[:size] chunk = chunk[size:] size = 0 else: yield chunk size -= len(chunk) chunk = next(iter_content) if len(chunk) == 0: chunk = next(iter_content) assert chunk assert chunk.find(end_data) == 0, chunk chunk = chunk[len(end_data) :] part += 1 class DecodeMultipart: def __init__(self, url, request, parts, **kwargs): LOG.debug("URL: %s", url) LOG.debug("RESPONSE Headers: %s", request.headers) self.request = request assert request.status_code == 206, request.status_code content_type = request.headers["content-type"] if content_type.startswith("multipart/byteranges; boundary="): _, boundary = content_type.split("=") LOG.debug("****** MULTI-PART supported by server %s", url) self.streamer = MultiPartStreamer(url, request, parts, boundary, **kwargs) else: LOG.debug("****** MULTI-PART *NOT* supported by server %s", url) self.streamer = S3Streamer(url, request, parts, **kwargs) def __call__(self, chunk_size): return self.streamer(chunk_size) class PartFilter: def __init__(self, parts, positions=None): self.parts = parts if positions is None: positions = [x.offset for x in parts] self.positions = positions assert len(self.parts) == len(self.positions) def __call__(self, streamer): def execute(chunk_size): stream = streamer(chunk_size) chunk = next(stream) pos = 0 for (_, length), offset in zip(self.parts, self.positions): offset -= pos while offset > len(chunk): pos += len(chunk) offset -= len(chunk) chunk = next(stream) assert chunk chunk = chunk[offset:] pos += offset size = length while size > 0: if len(chunk) >= size: yield chunk[:size] chunk = chunk[size:] pos += size size = 0 else: yield chunk size -= len(chunk) pos += len(chunk) chunk = next(stream) # Drain stream, so we don't created error messages in the server's logs while True: try: next(stream) except StopIteration: break return execute def compress_parts(parts): last = -1 result = [] # Compress and check for offset, length in parts: assert offset >= 0 and length > 0 assert offset >= last, ( f"Offsets and lengths must be in order, and not overlapping:" f" offset={offset}, end of previous part={last}" ) if offset == last: # Compress offset, prev_length = result.pop() length += prev_length result.append((offset, length)) last = offset + length return tuple(Part(offset, length) for offset, length in result) def compute_byte_ranges(parts, method, url, statistics_gatherer): if callable(method): blocks = method(parts) else: blocks = parts_heuristics(method, statistics_gatherer)(parts) blocks = compress_parts(blocks) assert len(blocks) > 0 assert len(blocks) <= len(parts) statistics_gatherer( "byte-ranges", method=str(method), url=url, parts=parts, blocks=blocks, ) i = 0 positions = [] block_offset, block_length = blocks[i] for offset, length in parts: while offset > block_offset + block_length: i += 1 block_offset, block_length = blocks[i] start = i while offset + length > block_offset + block_length: i += 1 block_offset, block_length = blocks[i] end = i # Sanity check: assert that each parts is contain in a rounded part assert start == end positions.append( offset - blocks[i].offset + sum(blocks[j].length for j in range(i)) ) return blocks, positions multiurl-0.3.5/multiurl/multiurl.py000066400000000000000000000024101476677706700175300ustar00rootroot00000000000000# (C) Copyright 2021 ECMWF. # # This software is licensed under the terms of the Apache Licence Version 2.0 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. # In applying this licence, ECMWF does not waive the privileges and immunities # granted to it by virtue of its status as an intergovernmental organisation # nor does it submit to any jurisdiction. # import logging from .base import DownloaderBase LOG = logging.getLogger(__name__) class MultiDownloader(DownloaderBase): def __init__(self, downloaders, **kwargs): super().__init__("", **kwargs) self.downloaders = downloaders def __repr__(self): return f"MultiDownloader({self.downloaders})" def estimate_size(self, download): total = 0 trust_size = True for downloader in self.downloaders: size, _, _, trust = downloader.estimate_size(download) if size is not None: total += size trust_size = trust_size and trust return total, "wb", 0, trust_size def finalise(self): for downloader in self.downloaders: downloader.finalise() def transfer(self, f, pbar): for downloader in self.downloaders: downloader.transfer(f, pbar) multiurl-0.3.5/pytest.ini000066400000000000000000000000601476677706700154540ustar00rootroot00000000000000[pytest] addopts=-s --verbose testpaths = tests multiurl-0.3.5/setup.py000066400000000000000000000040071476677706700151420ustar00rootroot00000000000000#!/usr/bin/env python # (C) Copyright 2021 ECMWF. # # This software is licensed under the terms of the Apache Licence Version 2.0 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. # In applying this licence, ECMWF does not waive the privileges and immunities # granted to it by virtue of its status as an intergovernmental organisation # nor does it submit to any jurisdiction. # import io import os import setuptools def read(fname): file_path = os.path.join(os.path.dirname(__file__), fname) return io.open(file_path, encoding="utf-8").read() version = None for line in read("multiurl/__init__.py").split("\n"): if line.startswith("__version__"): version = line.split("=")[-1].strip()[1:-1] assert version setuptools.setup( name="multiurl", version=version, description="A package to download several URL as one, as well as supporting multi-part URLs", long_description=read("README.md"), long_description_content_type="text/markdown", author="European Centre for Medium-Range Weather Forecasts (ECMWF)", author_email="software.support@ecmwf.int", license="Apache License Version 2.0", url="https://github.com/ecmwf/multiurl", packages=setuptools.find_packages(), include_package_data=True, install_requires=["requests", "tqdm", "pytz", "python-dateutil"], zip_safe=True, keywords="tool", classifiers=[ "Development Status :: 3 - Alpha", "Intended Audience :: Developers", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", "Operating System :: OS Independent", ], ) multiurl-0.3.5/tests/000077500000000000000000000000001476677706700145715ustar00rootroot00000000000000multiurl-0.3.5/tests/test_auth.py000066400000000000000000000062131476677706700171450ustar00rootroot00000000000000# (C) Copyright 2021 ECMWF. # # This software is licensed under the terms of the Apache Licence Version 2.0 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. # In applying this licence, ECMWF does not waive the privileges and immunities # granted to it by virtue of its status as an intergovernmental organisation # nor does it submit to any jurisdiction. # import logging import os from multiurl import download # NOTE: we just test if the auth object is properly called with the # requests when using download() class Auth: def __init__(self): from collections import defaultdict self.calls = defaultdict(set) def __call__(self, r): method = r.method.lower() self.calls[method].add(r.url) return r def test_auth_single(): auth = Auth() url = "http://get.ecmwf.int/test-data/metview/gallery/temp.bufr" download(url=url, target="out.data", auth=auth) assert auth.calls["head"] == set([url]) assert auth.calls["get"] == set([url]) def test_auth_single_fake_headers(): auth = Auth() url = "http://get.ecmwf.int/test-data/metview/gallery/temp.bufr" download(url=url, target="out.data", auth=auth, fake_headers={}) assert auth.calls["head"] == set() assert auth.calls["get"] == set([url]) def test_auth_single_parts(): auth = Auth() url = "http://get.ecmwf.int/test-data/metview/gallery/temp.bufr" download(url=url, target="out.data", parts=((0, 4),), auth=auth) assert auth.calls["head"] == set([url]) assert auth.calls["get"] == set([url]) assert os.path.getsize("out.data") == 4 with open("out.data", "rb") as f: assert f.read() == b"BUFR" def test_auth_single_multi_parts(): auth = Auth() url = "http://get.ecmwf.int/test-data/metview/gallery/temp.bufr" download(url=url, target="out.data", parts=((0, 4), (20, 4)), auth=auth) assert auth.calls["head"] == set([url]) assert auth.calls["get"] == set([url]) assert os.path.getsize("out.data") == 8 with open("out.data", "rb") as f: assert f.read(4) == b"BUFR" def test_auth_multi(): auth = Auth() urls = [ "http://get.ecmwf.int/test-data/earthkit-data/examples/test.grib", "http://get.ecmwf.int/test-data/earthkit-data/examples/test6.grib", ] download(url=urls, target="out.data", auth=auth) assert auth.calls["head"] == set(urls) assert auth.calls["get"] == set(urls) assert os.path.getsize("out.data") == 2492 with open("out.data", "rb") as f: assert f.read(4) == b"GRIB" def test_auth_multi_parts(): auth = Auth() urls = [ "http://get.ecmwf.int/test-data/earthkit-data/examples/test.grib", "http://get.ecmwf.int/test-data/earthkit-data/examples/test6.grib", ] download(url=urls, target="out.data", parts=((0, 4),), auth=auth) assert auth.calls["head"] == set(urls) assert auth.calls["get"] == set(urls) assert os.path.getsize("out.data") == 8 with open("out.data", "rb") as f: assert f.read(4) == b"GRIB" assert f.read(4) == b"GRIB" if __name__ == "__main__": logging.basicConfig(level=logging.DEBUG) # test_order() multiurl-0.3.5/tests/test_downloader.py000066400000000000000000000063051476677706700203440ustar00rootroot00000000000000# (C) Copyright 2021 ECMWF. # # This software is licensed under the terms of the Apache Licence Version 2.0 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. # In applying this licence, ECMWF does not waive the privileges and immunities # granted to it by virtue of its status as an intergovernmental organisation # nor does it submit to any jurisdiction. # import logging import os import pytest from multiurl import Downloader, download from multiurl.http import FullHTTPDownloader def test_http(): Downloader("http://localhost") def test_ftp(): Downloader("ftp://localhost") def test_parts(): download( url="http://get.ecmwf.int/test-data/metview/gallery/temp.bufr", target="out.data", ) download( url="http://get.ecmwf.int/test-data/metview/gallery/temp.bufr", parts=((0, 4),), target="out.data", ) assert os.path.getsize("out.data") == 4 with open("out.data", "rb") as f: assert f.read() == b"BUFR" download( url="http://get.ecmwf.int/test-data/metview/gallery/temp.bufr", parts=((0, 10), (50, 10), (60, 10)), target="out.data", ) assert os.path.getsize("out.data") == 30 with open("out.data", "rb") as f: assert f.read()[:4] == b"BUFR" def test_order(): d = Downloader( url="http://get.ecmwf.int/test-data/metview/gallery/temp.bufr", parts=((3, 1), (2, 1), (1, 1), (0, 1)), ) d.download( target="out.data", ) with open("out.data", "rb") as f: assert f.read()[:4] == b"RFUB" d = Downloader( url="http://get.ecmwf.int/test-data/metview/gallery/temp.bufr", parts=reversed([(3, 1), (2, 1), (1, 1), (0, 1)]), ) print(d) d.download( target="out.data", ) with open("out.data", "rb") as f: assert f.read()[:4] == b"BUFR" def test_content_disposition_handling(): class TestDownloader(FullHTTPDownloader): def headers(self): headers = super().headers() headers["content-disposition"] = 'attachment; filename="temp.bufr"' return headers TestDownloader( url="http://get.ecmwf.int/test-data/metview/gallery/temp.bufr", ).download(target="out") @pytest.mark.skip(reason="ftpserver not defined") def test_ftp_download(tmp_path, ftpserver): local_test_file = os.path.join(tmp_path, "testfile.txt") with open(local_test_file, "w") as f: f.write("This is a test file") ftp_url = ftpserver.put_files(local_test_file, style="url", anon=True) local_test_download = os.path.join(tmp_path, "testdownload.txt") download(ftp_url[0], local_test_download) with open(local_test_file) as original, open(local_test_download) as downloaded: assert original.read() == downloaded.read() ftp_url = ftpserver.put_files(local_test_file, style="url", anon=False) local_test_download = os.path.join(tmp_path, "testdownload.txt") download(ftp_url[0], local_test_download) with open(local_test_file) as original, open(local_test_download) as downloaded: assert original.read() == downloaded.read() if __name__ == "__main__": logging.basicConfig(level=logging.DEBUG) # test_order() multiurl-0.3.5/tests/test_file.py000066400000000000000000000017771476677706700171350ustar00rootroot00000000000000# (C) Copyright 2021 ECMWF. # # This software is licensed under the terms of the Apache Licence Version 2.0 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. # In applying this licence, ECMWF does not waive the privileges and immunities # granted to it by virtue of its status as an intergovernmental organisation # nor does it submit to any jurisdiction. # import logging import os from contextlib import contextmanager from multiurl import Downloader, download @contextmanager def chdir(path): save = os.getcwd() os.chdir(path) try: yield finally: os.chdir(save) def test_file(): Downloader("file://localhost") def test_absolute_path(): download( __file__, target="out.data", ) def test_relative_path(): with chdir(os.path.dirname(__file__)): base = os.path.basename(__file__) download(base, target="out.data") if __name__ == "__main__": logging.basicConfig(level=logging.DEBUG) test_absolute_path() multiurl-0.3.5/tests/test_robust.py000066400000000000000000000027571476677706700175330ustar00rootroot00000000000000# (C) Copyright 2021 ECMWF. # # This software is licensed under the terms of the Apache Licence Version 2.0 # which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. # In applying this licence, ECMWF does not waive the privileges and immunities # granted to it by virtue of its status as an intergovernmental organisation # nor does it submit to any jurisdiction. # import logging import os import random import threading from contextlib import contextmanager import pytest from multiurl import download from multiurl.http import RETRIABLE def handler(signum, frame): raise TimeoutError() @contextmanager def timeout(s): def killer(): os._exit(1) save = threading.Timer(s, killer) save.start() try: yield finally: save.cancel() def test_robust(): sleep = 5 with timeout(len(RETRIABLE * sleep * 10)): code = random.choice(RETRIABLE) download( f"http://httpbin.org/status/200,{code}", retry_after=sleep, target="test.data", ) @pytest.mark.skipif(True, reason="Mirror disabled") def test_mirror(): download( "http://datastore.copernicus-climate.eu/error/test-data/metview/gallery/temp.bufr", mirrors={ "http://datastore.copernicus-climate.eu/error/": [ "http://download.ecmwf.int/" ] }, target="data.bufr", ) if __name__ == "__main__": logging.basicConfig(level=logging.DEBUG) test_mirror() multiurl-0.3.5/tox.ini000066400000000000000000000002761476677706700147470ustar00rootroot00000000000000[flake8] ; ignore = E226,E302,E41 max-line-length = 120 ; exclude = tests/* ; See https://black.readthedocs.io/en/stable/the_black_code_style.html extend-ignore = E203 [isort] profile=black