pax_global_header00006660000000000000000000000064141344626000014512gustar00rootroot0000000000000052 comment=38a41ed135e50f9a73b97f49916b190cf56f707c ncbi-acc-download-0.2.8/000077500000000000000000000000001413446260000147655ustar00rootroot00000000000000ncbi-acc-download-0.2.8/.coveragerc000066400000000000000000000000511413446260000171020ustar00rootroot00000000000000[run] omit=ncbi_acc_download/__main__.py ncbi-acc-download-0.2.8/.github/000077500000000000000000000000001413446260000163255ustar00rootroot00000000000000ncbi-acc-download-0.2.8/.github/workflows/000077500000000000000000000000001413446260000203625ustar00rootroot00000000000000ncbi-acc-download-0.2.8/.github/workflows/publish.yml000066400000000000000000000011471413446260000225560ustar00rootroot00000000000000name: Upload release on: release: types: [created] jobs: deploy: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - name: Set up Python uses: actions/setup-python@v1 with: python-version: '3.x' - name: Install dependencies run: | python -m pip install --upgrade pip pip install setuptools wheel twine - name: Build and publish env: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | python setup.py sdist bdist_wheel twine upload dist/* ncbi-acc-download-0.2.8/.github/workflows/test.yml000066400000000000000000000016331413446260000220670ustar00rootroot00000000000000name: Run tests on: push: branches: [ master ] pull_request: jobs: build: runs-on: ubuntu-latest strategy: matrix: python-version: [3.6, 3.7, 3.8, 3.9] steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v1 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip pip install -e .[testing,recursive,validate] - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics # exit-zero treats all errors as warnings. flake8 . --count --exit-zero --max-complexity=20 --statistics - name: Test with pytest run: | make make coverage ncbi-acc-download-0.2.8/.gitignore000066400000000000000000000001501413446260000167510ustar00rootroot00000000000000*.swp *.pyc ncbi_acc_download.egg-info/* .coverage htmlcov/* .cache/* dist/* # IDEs .idea/* .vscode/* ncbi-acc-download-0.2.8/LICENSE000066400000000000000000000261351413446260000160010ustar00rootroot00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "{}" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright {yyyy} {name of copyright owner} Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ncbi-acc-download-0.2.8/Makefile000066400000000000000000000001521413446260000164230ustar00rootroot00000000000000unit: py.test -v coverage: py.test --cov=ncbi_acc_download --cov-report term-missing --cov-report html ncbi-acc-download-0.2.8/README.md000066400000000000000000000056261413446260000162550ustar00rootroot00000000000000# NCBI accession download script A partner script to the popular [ncbi-genome-download](https://github.com/kblin/ncbi-genome-download) script, `ncbi-acc-download` allows you to download sequences from GenBank/RefSeq by accession through the NCBI [ENTREZ API](https://www.ncbi.nlm.nih.gov/books/NBK184582/). ## Installation ``` pip install ncbi-acc-download ``` Alternatively, clone this repository from GitHub, then run (in a python virtual environment) ``` pip install . ``` If this fails on older versions of Python, try updating your `pip` tool first: ``` pip install --upgrade pip ``` and then rerun the `ncbi-acc-download` install. `ncbi-acc-download` is only developed and tested on Python releases still under active support by the Python project. At the moment, this means versions 3.6, 3.7, 3.8, and 3.9. Specifically, no attempt at testing under Python versions older than 3.6 is being made. `ncbi-acc-download` 0.2.6 was the last version to support Python 2.7. If your system is stuck on an older version of Python, consider using a tool like [Homebrew](http://brew.sh) or [Linuxbrew](http://linuxbrew.sh) to obtain a more up-to-date version. ## Usage To download a nucleotide record AB_12345 in GenBank format, run ``` ncbi-acc-download AB_12345 ``` To download a nucleotide record AB_12345 in FASTA format, run ``` ncbi-acc-download --format fasta AB_12345 ``` To download a protein record WP_12345 in FASTA format, run ``` ncbi-acc-download --molecule protein WP_12345 ``` To just generate a list of download URLs to run the actual download elsewhere, run ``` ncbi-acc-download --url AB_12345 ``` If you want to concatenate multiple sequences into a single file, run ``` ncbi-acc-download --out two_genomes.gbk AB_12345 AB_23456 ``` You can use this with `/dev/stdout` as the filename to print the downloaded data to standard output instead of writing to a file if you want to chain `ncbi-acc-download` with other command line tools, like so: ``` ncbi-genome-download --out /dev/stdout --format fasta AB_12345 AB_23456 | gzip > two_genomes.fa.gz ``` If you want to download all records covered by a WGS master record instead of the master record itself, run ``` ncbi-acc-download --recursive NZ_EXMP01000000 ``` You can supply a genomic range to the accession download using `--range` ``` ncbi-acc-download NC_007194 --range 1001:9000 ``` As cutting a record up with a range operator like that can leave partial features at both ends of the record, you can combine the range download with the new `correct` extended validator to remove the partial features. ``` ncbi-acc-download NC_007194 --range 1001:9000 --extended-validation correct ``` You can get more detailed information on the download progress by using the `--verbose` or `-v` flag. To get an overview of all options, run ``` ncbi-acc-download --help ``` ## License All code is available under the Apache License version 2, see the [`LICENSE`](LICENSE) file for details. ncbi-acc-download-0.2.8/ncbi_acc_download/000077500000000000000000000000001413446260000203755ustar00rootroot00000000000000ncbi-acc-download-0.2.8/ncbi_acc_download/__init__.py000066400000000000000000000000261413446260000225040ustar00rootroot00000000000000__version__ = '0.2.8' ncbi-acc-download-0.2.8/ncbi_acc_download/__main__.py000066400000000000000000000065651413446260000225030ustar00rootroot00000000000000#!/usr/bin/env python """Get sequences from NCBI by GenBank/RefSeq ID.""" from argparse import ArgumentParser, SUPPRESS import sys from .core import download_to_file, generate_url, Config, HAVE_BIOPYTHON from .errors import ( DownloadError, InvalidIdError, ) def main(): """Command line handling.""" parser = ArgumentParser() parser.add_argument('ids', nargs='+', metavar='NCBI-accession') parser.add_argument('-m', '--molecule', default="nucleotide", choices=["nucleotide", "protein"], help="Molecule type to download. Default: %(default)s") parser.add_argument('--api-key', default=SUPPRESS, help="Specify USER NCBI API key. More info at https://www.ncbi.nlm.nih.gov/books/NBK25497/") if HAVE_BIOPYTHON: parser.add_argument('-e', '--extended-validation', action="store", default='none', choices=('none', 'loads', 'all', 'correct'), help="Perform extended validation. Possible options are 'none' to skip validation, " "'loads' to check if the sequence file loads in Biopython, " "or 'all' to run all checks. Default: %(default)s") parser.add_argument('-F', '--format', action="store", default='genbank', choices=('fasta', 'genbank', 'featuretable', 'gff3'), help="File format to download nucleotide sequences in. Default: %(default)s") parser.add_argument('-o', '--out', default=SUPPRESS, help="Single filename to use for the combined output.") parser.add_argument('-p', '--prefix', default=SUPPRESS, help="Filename prefix to use for output files instead of using the NCBI ID.") parser.add_argument('-g', '--range', default=SUPPRESS, help="region to subset accession. only for single accession") if HAVE_BIOPYTHON: parser.add_argument('-r', '--recursive', action="store_true", default=False, help="Recursively get all entries of a WGS entry.") parser.add_argument('--url', action="store_true", default=False, help="Instead of downloading the sequences, just print the URLs to stdout.") parser.add_argument('-v', '--verbose', action="store_true", default=False, help="Print a progress indicator.") opts = parser.parse_args() config = Config.from_args(opts) if len(opts.ids) > 1 and config.range != "none": raise ValueError("Ambiguous range for multiple ids") # TODO: Change this to download multiple records at once? for i, dl_id in enumerate(opts.ids): filename = None append = False if 'prefix' in opts: filename = "{fn}_{i}".format(fn=opts.prefix, i=i) elif 'out' in opts: filename = opts.out append = (i > 0) try: if opts.url: print(generate_url(dl_id, config)) else: download_to_file(dl_id, config, filename, append) except InvalidIdError as err: print("NCBI Entrez returned error code {e.status_code}, are ID(s) {e.ids} valid?".format(e=err)) sys.exit(1) except DownloadError as err: print(err, file=sys.stderr) sys.exit(1) if __name__ == "__main__": main() ncbi-acc-download-0.2.8/ncbi_acc_download/core.py000066400000000000000000000133261413446260000217040ustar00rootroot00000000000000# Copyright 2017,2018 Kai Blin # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Core functions of the ncbi-by-accession downloader.""" from __future__ import print_function import functools from io import StringIO import sys import time from urllib.parse import urlencode from ncbi_acc_download.download import ( build_params, get_stream, get_url_by_format, write_stream, ) from ncbi_acc_download.errors import ( TooManyRequests, ValidationError, ) from ncbi_acc_download.validate import ( HAVE_BIOPYTHON, run_extended_validation, VALIDATION_LEVELS, ) from ncbi_acc_download.wgs import download_wgs_parts ENTREZ_URL = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi' SVIEWER_URL = 'https://eutils.ncbi.nlm.nih.gov/sviewer/viewer.cgi' class Config(object): """NCBI genome download configuration.""" __slots__ = ( 'range', 'api_key', 'emit', 'entrez_url', '_extended_validation', 'format', 'keep_filename', 'molecule', 'recursive', 'sviewer_url', 'verbose', ) def __init__(self, *, extended_validation="none", molecule="nucleotide", out=None, recursive=False, api_key="none", entrez_url=ENTREZ_URL, sviewer_url=SVIEWER_URL, format="genbank", verbose=False, **kwargs): """Initialise the config from scratch.""" self.extended_validation = extended_validation self.molecule = molecule self.keep_filename = out is not None self.recursive = recursive self.api_key = api_key self.range = kwargs.get('range', 'none') self.entrez_url = entrez_url self.sviewer_url = sviewer_url if self.molecule == 'nucleotide': self.format = format else: self.format = 'fasta' self.verbose = verbose def noop(arg): """Don't do anything.""" pass self.emit = noop if self.verbose: self.emit = functools.partial(print, file=sys.stderr, end='', flush=True) @property def extended_validation(self): """Get the extended validation setting.""" return self._extended_validation @extended_validation.setter def extended_validation(self, value): if value != 'none' and not HAVE_BIOPYTHON: raise ValueError("Asked for extended validation, but Biopython not available") if value not in VALIDATION_LEVELS: raise ValueError("Invalid validation level {}".format(value)) self._extended_validation = value @classmethod def from_args(cls, args): """Initialise from argpase.Namespace object.""" config = cls(**args.__dict__) return config def download_to_file(dl_id, config, filename=None, append=False): """Download a single ID from NCBI and store it to a file.""" # types: string, Config, string, bool -> None mode = 'a' if append else 'w' url = get_url_by_format(config) params = build_params(dl_id, config) try: r = get_stream(url, params) config.emit("Downloading {}\n".format(r.url)) except TooManyRequests as err: config.emit("Server requested us to slow down, waiting {} seconds.".format(err.retry_after)) time.sleep(int(err.retry_after)) r = get_stream(url, params) config.emit("Downloading {}\n".format(r.url)) if config.keep_filename: outfile_name = filename else: outfile_name = _generate_filename(params, filename) with open(outfile_name, mode) as fh: _validate_and_write(r, fh, dl_id, config) def generate_url(dl_id, config): """Generate the Entrez URL to download a file using a separate tool""" # types: string, Config -> string url = get_url_by_format(config) params = build_params(dl_id, config) # remove the tool field, some other tool will do the download del params['tool'] encoded_params = urlencode(params, doseq=True) return "?".join([url, encoded_params]) def _generate_filename(params, filename): safe_ids = params['id'][:20].replace(' ', '_') file_ending = '.fa' if params.get('rettype') == 'gbwithparts': file_ending = '.gbk' elif params.get('rettype') == 'ft': file_ending = '.ft' elif params.get('report') == 'gff3': file_ending = '.gff' if filename: outfile_name = "{filename}{ending}".format(filename=filename, ending=file_ending) else: outfile_name = "{ncbi_id}{ending}".format(ncbi_id=safe_ids, ending=file_ending) return outfile_name def _validate_and_write(request, orig_handle, dl_id, config): if config.extended_validation != 'none' or config.recursive: handle = StringIO() else: handle = orig_handle write_stream(request, handle, dl_id, config) if config.recursive: downloaded = download_wgs_parts(handle, config) handle = downloaded if config.extended_validation != 'none': if not run_extended_validation(handle, config.format, config.extended_validation): raise ValidationError("Sequence(s) downloaded for {} failed to load.".format(dl_id)) if config.extended_validation != 'none' or config.recursive: orig_handle.write(handle.getvalue()) ncbi-acc-download-0.2.8/ncbi_acc_download/download.py000066400000000000000000000103241413446260000225560ustar00rootroot00000000000000# Copyright 2017,2018 Kai Blin # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """The actual download functionality.""" from __future__ import print_function from collections import OrderedDict from http.client import IncompleteRead import requests import sys from ncbi_acc_download.errors import ( BadPatternError, DownloadError, InvalidIdError, TooManyRequests, ) ERROR_PATTERNS = ( u'Error reading from remote server', u'Bad gateway', u'Bad Gateway', u'Cannot process ID list', u'server is temporarily unable to service your request', u'Service unavailable', u'Server Error', u'ID list is empty', u'Resource temporarily unavailable', u'Failed to retrieve sequence', u'Failed to understand id', ) def get_url_by_format(config): """Get URL depending on the format.""" # types: Config -> string if config.format == 'gff3': return config.sviewer_url return config.entrez_url def build_params(dl_id, config): """Build the query parameters for the Entrez query.""" params = OrderedDict(tool='ncbi-acc-download', retmode='text') # delete / characters and as NCBI ignores IDs after #, do the same. params['id'] = dl_id params['db'] = config.molecule if config.api_key != 'none': params['api_key'] = config.api_key if config.range != 'none': rli = config.range.split(":") if len(rli)==1: rli = config.range.split("..") if len(rli)==1: rli = config.range.split(".") if rli[0] != "": params['from'] = int(rli[0]) if rli[1] != "": params['to'] = int(rli[1]) if config.molecule == 'nucleotide': if config.format == 'genbank': params['rettype'] = 'gbwithparts' elif config.format == 'featuretable': params['rettype'] = 'ft' elif config.format == 'gff3': params['report'] = 'gff3' else: params['rettype'] = 'fasta' else: params['rettype'] = 'fasta' return params def get_stream(url, params): """Get the actual streamed request from NCBI.""" try: r = requests.get(url, params=params, stream=True) except (requests.exceptions.RequestException, IncompleteRead) as e: print("Failed to download {!r} from NCBI".format(params['id']), file=sys.stderr) raise DownloadError(str(e)) if r.status_code != requests.codes.ok: if r.status_code == 429: retry_after = r.headers.get("retry-after") print("Too many requests, please consider using --api-key parameter" " (see https://www.ncbi.nlm.nih.gov/books/NBK25497/).") raise TooManyRequests("Blocked at NCBI Enterz API for too many requests", retry_after) print("Failed to download file with id {} from NCBI".format(params['id']), file=sys.stderr) raise InvalidIdError("Download failed with return code: {}".format(r.status_code), params["id"], r.status_code) return r def write_stream(request, handle, dl_id, config): """Write all chunks of the request to the handle.""" # use a chunk size of 4k, as that's what most filesystems use these days try: for chunk in request.iter_content(4096, decode_unicode=True): config.emit(u'.') for pattern in ERROR_PATTERNS: if pattern in chunk: raise BadPatternError("Failed to download record(s) with id(s) {} from NCBI: {}".format( dl_id, pattern)) handle.write(chunk) except requests.exceptions.ChunkedEncodingError as err: print("Download of {!r} aborted: {}".format(dl_id, str(err)), file=sys.stderr) raise DownloadError(str(err)) config.emit(u'\n') ncbi-acc-download-0.2.8/ncbi_acc_download/errors.py000066400000000000000000000027101413446260000222630ustar00rootroot00000000000000# Copyright 2017,2018 Kai Blin # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Custom error classes of the ncbi-by-accession downloader.""" class DownloadError(RuntimeError): """Base error for all problems when downloading from NCBI.""" pass class InvalidIdError(RuntimeError): """Error thrown when Entrez responds with a 4xx error (other than 429).""" def __init__(self, message, ids, status_code): super().__init__(message) self.ids = ids self.status_code = status_code class TooManyRequests(DownloadError): """Error thrown when Entrez responds with a 429 error.""" def __init__(self, message, retry_after): super().__init__(message) self.retry_after = retry_after class BadPatternError(DownloadError): """Error thrown when download file contains an error pattern.""" pass class ValidationError(DownloadError): """Error thrown when download file failes extended validation.""" pass ncbi-acc-download-0.2.8/ncbi_acc_download/validate.py000066400000000000000000000036231413446260000225440ustar00rootroot00000000000000"""Record validation logic.""" import logging # If Biopython is not available, all checks will return False try: from Bio import SeqIO from Bio.SeqFeature import BeforePosition, AfterPosition HAVE_BIOPYTHON = True except ImportError: # pragma: no cover HAVE_BIOPYTHON = False VALIDATION_LEVELS = {'none', 'loads', 'all', 'correct'} def run_extended_validation(handle, file_format, validation_level): """Check if the dowloaded sequence file can load.""" if not HAVE_BIOPYTHON: return False # we wrote to the handle, so rewind it handle.seek(0) try: processed_seq = False records = [] for rec in SeqIO.parse(handle, file_format): processed_seq = True if validation_level == 'loads': continue if validation_level == 'correct': ## Correct possible errors from downloaded a restricted-range genbank file ## Will write over the downloaded file newfeats = [] for f in rec.features: try: if not isinstance(f.location.start, BeforePosition) and not isinstance(f.location.end, AfterPosition): newfeats.append(f) except AttributeError: newfeats.append(f) rec.features = newfeats records.append(rec) if not processed_seq: logging.error('no seq') if validation_level == 'correct': ## rewrite StringIO data handle.truncate(0) handle.seek(0) SeqIO.write(records, handle, 'genbank') return processed_seq except (ValueError, AssertionError) as err: logging.error(err) return False except Exception as err: logging.error("Unhandled exception %s while parsing sequence file.", err) return False ncbi-acc-download-0.2.8/ncbi_acc_download/wgs.py000066400000000000000000000142501413446260000215510ustar00rootroot00000000000000# Copyright 2017,2018 Kai Blin # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Recursively download the actual entries for WGS records.""" from io import StringIO import time from ncbi_acc_download.download import ( build_params, get_stream, get_url_by_format, write_stream, ) from ncbi_acc_download.errors import TooManyRequests try: from Bio import SeqIO from Bio.Seq import UndefinedSequenceError HAVE_BIOPYTHON = True except ImportError: # pragma: no cover HAVE_BIOPYTHON = False STEP_SIZE = 10 class WgsRange: def __init__(self, identifier, width, start, end): self.identifier = identifier self.width = width self.start = start self.end = end def get_ids(self): """Get the list of identifier strings covered by the range.""" ids = [] for i in range(self.start, self.end+1): ids.append("{s.identifier}{i:0{s.width}}".format(i=i, s=self)) return ids @classmethod def from_string(cls, range_string): if '-' not in range_string: first = last = range_string else: first, last = range_string.split('-', 1) if '-' in last: raise ValueError("More than one hyphen in input.") if '.' in first: first, _ = first.split('.', 1) if '.' in last: last, _ = last.split('.', 1) identifier = "" for i, char in enumerate(first): if not char.isdigit(): identifier += char else: break width = len(first) - len(identifier) if width < 1: raise ValueError("String identifier is too large.") if not last.startswith(identifier): raise ValueError("Failed to find shared identifier.") first_int = int(first[-width:]) last_int = int(last[-width:]) if last_int < first_int: raise ValueError("Last identifier smaller than first.") return cls(identifier, width, first_int, last_int) def download_wgs_parts(handle, config): """Download all parts of all WGS records in a file handle.""" if not HAVE_BIOPYTHON: return handle updated_records = [] handle.seek(0) records = list(SeqIO.parse(handle, config.format)) for record in records: # TODO: If Biopython ever provides a nice check for undefined sequences, # replace the exception-based check here. run_download = False try: bytes(record.seq) except UndefinedSequenceError: run_download = True if run_download and ('wgs_scafld' in record.annotations or 'wgs' in record.annotations or 'tsa' in record.annotations): updated_records.extend(download_wgs_for_record(record, config)) elif run_download and 'contig' in record.annotations: updated_records.extend(fix_supercontigs(record, config)) else: updated_records.append(record) outhandle = StringIO() SeqIO.write(updated_records, outhandle, config.format) outhandle.seek(0) return outhandle def download_wgs_for_record(record, config): """Download all WGS records in a record.""" if 'wgs_scafld' in record.annotations: # Biopython splits on '-' for us, but doesn't actually calculate the range # Also this is somehow a list of lists wgs_range = WgsRange.from_string('-'.join(record.annotations['wgs_scafld'][0])) elif 'wgs' in record.annotations: # Biopython splits on '-' for us, but doesn't actually calculate the range # Unlike WGS_SCAFLD, this is just a list wgs_range = WgsRange.from_string('-'.join(record.annotations['wgs'])) elif 'tsa' in record.annotations: # Biopython splits on '-' for us, but doesn't actually calculate the range # Like WGS, this is just a list wgs_range = WgsRange.from_string('-'.join(record.annotations['tsa'])) else: return [record] handle = StringIO() id_list = wgs_range.get_ids() i = 0 while i < len(id_list): dl_id = ",".join(id_list[i:i + STEP_SIZE]) i += STEP_SIZE url = get_url_by_format(config) params = build_params(dl_id, config) try: r = get_stream(url, params) config.emit("Downloading {}\n".format(r.url)) except TooManyRequests as err: # Wait, and then retry config.emit("Server requested us to slow down, waiting {} seconds\n".format(err.retry_after)) time.sleep(int(err.retry_after)) r = get_stream(url, params) config.emit("Downloading {}\n".format(r.url)) write_stream(r, handle, dl_id, config) # Rewind, so Biopython can parse this handle.seek(0) return list(SeqIO.parse(handle, config.format)) def fix_supercontigs(record, config): """Fix a record containing a CONTIG entry instead of a seq.""" handle = StringIO() # Let the NCBI assemble the proper record for us by asking for the right format. dl_id = record.id url = get_url_by_format(config) params = build_params(dl_id, config) try: r = get_stream(url, params) config.emit("Downloading {}\n".format(r.url)) except TooManyRequests as err: # Wait, and then retry config.emit("Server requested us to slow down, waiting {} seconds\n".format(err.retry_after)) time.sleep(int(err.retry_after)) r = get_stream(url, params) config.emit("Downloading {}\n".format(r.url)) write_stream(r, handle, dl_id, config) # Rewind, so Biopython can parse this handle.seek(0) return list(SeqIO.parse(handle, config.format)) ncbi-acc-download-0.2.8/publish.sh000077500000000000000000000001411413446260000167660ustar00rootroot00000000000000#!/bin/bash set -euo pipefail rm -rf dist python setup.py sdist bdist_wheel twine upload dist/* ncbi-acc-download-0.2.8/requirements.txt000066400000000000000000000000051413446260000202440ustar00rootroot00000000000000-e . ncbi-acc-download-0.2.8/setup.cfg000066400000000000000000000015221413446260000166060ustar00rootroot00000000000000[metadata] # This will completely override (not add to, but replace) the install_requires # when building a wheel. Unfortunately this requires duplicating all of the # depenencies here, but this allows us to support back to an older version of # pip/setuptools inside the setup.py while still using the newer environment # marker technique inside of a wheel. requires-dist = # We require this gross hack of listing out every 2.7 version < 2.7.9 because # older versions of setuptools/pip used string comparisons and thus 2.7.12 < 2.7.9. pyOpenSSL; python_version < '2.7.9' or python_full_version in '2.7.0 2.7.1 2.7.2 2.7.3 2.7.4 2.7.5 2.7.6 2.7.7 2.7.8' requests >= 2.4.3 [options.extras_require] validate = biopython recursive = biopython [bdist_wheel] universal=1 [pycodestyle] max-line-length=120 [flake8] max-line-length=120 ncbi-acc-download-0.2.8/setup.py000066400000000000000000000046121413446260000165020ustar00rootroot00000000000000import os import sys from setuptools import setup from setuptools.command.test import test as TestCommand def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() long_description = read('README.md') install_requires = [ 'requests >= 2.4.3', ] # Can't use environment markers on old setuptools, so fix the requirements # dynamically here. For wheels, again override the requirements in setup.cfg # to not cause conflicts. if sys.version_info[:3] < (2, 7, 9): install_requires.extend(['pyOpenSSL', 'ndg-httpsclient']) tests_require = [ 'pytest', 'coverage', 'pytest-cov', 'requests-mock', 'pytest-mock', 'flake8', ] recursive_require = [ 'biopython >= 1.79', ] validate_require = [ 'biopython >= 1.79', ] def read_version(): for line in open(os.path.join('ncbi_acc_download', '__init__.py'), 'r'): if line.startswith('__version__'): return line.split('=')[-1].strip().strip("'") class PyTest(TestCommand): def finalize_options(self): TestCommand.finalize_options(self) self.test_args = [] self.test_suite = True def run_tests(self): import pytest errcode = pytest.main(self.test_args) sys.exit(errcode) setup( name='ncbi-acc-download', version=read_version(), author='Kai Blin', author_email='kblin@biosustain.dtu.dk', description='Download genome files from NCBI by accession.', long_description=long_description, long_description_content_type='text/markdown', install_requires=install_requires, tests_require=tests_require, cmdclass={'test': PyTest}, entry_points={ 'console_scripts': [ 'ncbi-acc-download=ncbi_acc_download.__main__:main', 'nad=ncbi_acc_download.__main__:main', ], }, packages=['ncbi_acc_download'], url='https://github.com/kblin/ncbi-acc-download/', license='Apache Software License', classifiers=[ 'Programming Language :: Python', 'Development Status :: 3 - Alpha', 'Intended Audience :: Science/Research', 'Topic :: Scientific/Engineering :: Bio-Informatics', 'License :: OSI Approved :: Apache Software License', 'Operating System :: OS Independent', ], extras_require={ 'recursive': recursive_require, 'testing': tests_require, 'validate': validate_require, }, ) ncbi-acc-download-0.2.8/tests/000077500000000000000000000000001413446260000161275ustar00rootroot00000000000000ncbi-acc-download-0.2.8/tests/conftest.py000066400000000000000000000002641413446260000203300ustar00rootroot00000000000000import pytest import requests_mock @pytest.fixture def req(): """Get requests_mock into the pytest infrastructure.""" with requests_mock.mock() as req: yield req ncbi-acc-download-0.2.8/tests/partialcontig.gbk000066400000000000000000000051171413446260000214600ustar00rootroot00000000000000LOCUS NC_007194 541 bp DNA linear CON 03-APR-2018 DEFINITION Aspergillus fumigatus Af293 chromosome 1, whole genome shotgun sequence. ACCESSION NC_007194 REGION: 60..600 VERSION NC_007194.1 FEATURES Location/Qualifiers source 1..541 /organism="Aspergillus fumigatus Af293" /mol_type="genomic DNA" /strain="Af293" /db_xref="taxon:330879" /chromosome="1" repeat_region <1..64 /rpt_type=tandem /rpt_unit_seq="ccctaa" /satellite="microsatellite" gene <157..>541 /locus_tag="AFUA_1G00100" /old_locus_tag="Afu1g00100" /db_xref="GeneID:3507995" mRNA <157..>541 /locus_tag="AFUA_1G00100" /old_locus_tag="Afu1g00100" /product="MFS monocarboxylate transporter, putative" /note="transcript AFUA_1G00100A" /transcript_id="XM_001481640.1" /db_xref="GeneID:3507995" CDS <157..>541 /locus_tag="AFUA_1G00100" /old_locus_tag="Afu1g00100" /note="encoded by transcript AFUA_1G00100A" /codon_start=1 /product="MFS monocarboxylate transporter, putative" /protein_id="XP_001481690.1" /db_xref="GeneID:3507995" /translation="MSVRQVPKQRRTLIEFSAFKEVPCMLFCVAMFFGYIGFFNPIFY IEAFAIQKHAMGETLAFHLISILNATSVPGRIVPGILGLRFGPLNILLGSAIISGILS LCWIAIYNAGPLIVLAVLYGFSGAFVSLLAVALTTLNLNLQTLRTRMGMCSLLCGFGS LCRAPVAGAVLDNTRSYLGVQLYSGLTIGTTGVLLFFANHLKRRTN" ORIGIN 1 ccctaaccct aaccctaacc ctaaccctaa ccctaaccct aaccctaacc ctaaccctaa 61 ccctttaggc aactgcagct tcaaaccaga tttggatggg ccacacgcgt gctaggtttc 121 ctggttcttg gaacgacatt gttctcacct agtgtgatga gcgttcgtca agttccaaag 181 caaagacgca ctttgatcga gttttcagcg ttcaaggagg tcccgtgtat gctgttctgc 241 gtagcgatgt tctttggtta tattggattc ttcaacccca ttttctacat cgaagcgttt 301 gctatccaaa agcatgcaat gggagagacg cttgcattcc accttatctc gatcctaaat 361 gccacctcag tcccaggtcg gattgttcct ggcattcttg gcttgcgctt tggtccatta 421 aatatccttc taggcagtgc aatcattagc ggcatccttt cactctgctg gatagccatc 481 tacaacgcgg ggcccctaat agtgttagct gtcttgtacg gcttctccgg cgctttcgtc 541 t // ncbi-acc-download-0.2.8/tests/supercontig.gbk000066400000000000000000000011531413446260000211560ustar00rootroot00000000000000LOCUS NC_007194 4918979 bp DNA linear CON 11-NOV-2009 DEFINITION Aspergillus fumigatus Af293 chromosome 1, whole genome shotgun sequence. ACCESSION NC_007194 VERSION NC_007194.1 FEATURES Location/Qualifiers source 1..4918979 /organism="Aspergillus fumigatus Af293" /mol_type="genomic DNA" /strain="Af293" /db_xref="taxon:330879" /chromosome="1" CONTIG join(AAHF01000007.1:1..2204071,gap(43100), AAHF01000004.1:1..2671808) // ncbi-acc-download-0.2.8/tests/supercontig_full.gbk000066400000000000000000000012211413446260000221740ustar00rootroot00000000000000LOCUS NC_007194 60 bp DNA linear CON 03-APR-2018 DEFINITION Aspergillus fumigatus Af293 chromosome 1, whole genome shotgun sequence. ACCESSION NC_007194 VERSION NC_007194.1 KEYWORDS . SOURCE ORGANISM . . FEATURES Location/Qualifiers source 1..60 /organism="Aspergillus fumigatus Af293" /mol_type="genomic DNA" /strain="Af293" /db_xref="taxon:330879" /chromosome="1" ORIGIN 1 cctaacccta accctaaccc taaccctaac cctaacccta accctaaccc taaccctaac // ncbi-acc-download-0.2.8/tests/test_core.py000066400000000000000000000206241413446260000204740ustar00rootroot00000000000000"""Tests for the core module.""" from argparse import Namespace from io import StringIO import pytest import requests from ncbi_acc_download import core from ncbi_acc_download.core import ( ENTREZ_URL, SVIEWER_URL, ) from ncbi_acc_download.errors import ( BadPatternError, DownloadError, InvalidIdError, TooManyRequests, ) def test_config(): """Test the config class.""" args = Namespace(molecule="nucleotide", verbose=False) config = core.Config.from_args(args) assert config.verbose is False assert config.molecule == 'nucleotide' assert config.extended_validation == 'none' args = Namespace(molecule="protein", verbose=True) config = core.Config.from_args(args) assert config.verbose is True assert config.molecule == 'protein' def test_config_no_biopython(monkeypatch): """Test the correct errors are raised if Biopython is not available.""" monkeypatch.setattr(core, 'HAVE_BIOPYTHON', False) assert core.HAVE_BIOPYTHON is False args = Namespace(extended_validation='all') with pytest.raises(ValueError): core.Config.from_args(args) def test_config_have_biopython(): """Test we detect Biopython.""" assert core.HAVE_BIOPYTHON args = Namespace(extended_validation='all') config = core.Config.from_args(args) assert config.extended_validation == 'all' def test_download_to_file(req, tmpdir): """Test downloading things from NCBI.""" req.get(ENTREZ_URL, text='This works.') outdir = tmpdir.mkdir('outdir') filename = outdir.join('foo') expected = outdir.join('foo.gbk') config = core.Config(molecule='nucleotide', verbose=False) core.download_to_file('FOO', config, filename=filename) assert expected.check() def test_download_to_file_append(req, tmpdir): """Test appending multiple downloads into a single file.""" req.get(ENTREZ_URL, text='This works.\n') outdir = tmpdir.mkdir('outdir') filename = outdir.join('foo.txt') expected = outdir.join('foo.txt') config = core.Config(molecule='nucleotide', verbose=False, out='foo.txt') core.download_to_file('FOO', config, filename=str(filename), append=False) core.download_to_file('BAR', config, filename=str(filename), append=True) core.download_to_file('BAZ', config, filename=str(filename), append=True) assert expected.check() assert len(expected.readlines()) == 3 def test_download_to_file_retry(req, tmpdir): """Test downloading things from NCBI, retrying after a 429 status.""" req.get(ENTREZ_URL, response_list=[ {"text": u'Whoa, slow down', "status_code": 429, "headers": {"Retry-After": "0"}}, {"text": 'This works.'}, ]) outdir = tmpdir.mkdir('outdir') filename = outdir.join('foo') expected = outdir.join('foo.gbk') config = core.Config(molecule='nucleotide', verbose=False) core.download_to_file('FOO', config, filename=filename) assert expected.check() def test_build_params(): """Test we build the right set of parameters.""" config = core.Config(molecule='nucleotide', verbose=False) dl_id = 'TEST' expected_params = { 'tool': 'ncbi-acc-download', 'retmode': 'text', 'rettype': 'gbwithparts', 'id': 'TEST', 'db': 'nucleotide' } params = core.build_params(dl_id, config) assert params == expected_params expected_params = { 'tool': 'ncbi-acc-download', 'retmode': 'text', 'rettype': 'fasta', 'id': 'TEST', 'db': 'nucleotide' } config.format = 'fasta' params = core.build_params(dl_id, config) assert params == expected_params expected_params = { 'tool': 'ncbi-acc-download', 'retmode': 'text', 'rettype': 'ft', 'id': 'TEST', 'db': 'nucleotide' } config.format = 'featuretable' params = core.build_params(dl_id, config) assert params == expected_params expected_params = { 'tool': 'ncbi-acc-download', 'retmode': 'text', 'report': 'gff3', 'id': 'TEST', 'db': 'nucleotide' } config.format = 'gff3' params = core.build_params(dl_id, config) assert params == expected_params config = core.Config(molecule='protein', verbose=False) expected_params = { 'tool': 'ncbi-acc-download', 'retmode': 'text', 'rettype': 'fasta', 'id': 'TEST', 'db': 'protein' } params = core.build_params(dl_id, config) assert params == expected_params def test_generate_filename(): """Test output file name generation.""" params = dict(id='TEST', db='nucleotide', rettype='gbwithparts') filename = core._generate_filename(params, 'foo') assert filename == 'foo.gbk' params['rettype'] = 'fasta' filename = core._generate_filename(params, 'foo') assert filename == 'foo.fa' params['rettype'] = 'ft' filename = core._generate_filename(params, 'foo') assert filename == 'foo.ft' del params['rettype'] params['report'] = 'gff3' filename = core._generate_filename(params, 'foo') assert filename == 'foo.gff' params = dict(id='TEST', db='protein', rettype='fasta') filename = core._generate_filename(params, None) assert filename == 'TEST.fa' def test_validate_and_write_error_pattern_raises(req): """Test scanning the download file for error patterns.""" handle = StringIO() req.get('http://fake/', text=u'ID list is empty') r = requests.get('http://fake/') config = core.Config() with pytest.raises(BadPatternError): core._validate_and_write(r, handle, 'FAKE', config) req.get('http://fake/', text=u'Error: CEFetchPApplication::proxy_stream(): Failed to retrieve sequence: NC_405534') r = requests.get('http://fake/') with pytest.raises(BadPatternError): core._validate_and_write(r, handle, 'FAKE', config) def test_validate_and_write_emit(req): """Test writing prints dots in verbose mode.""" handle = StringIO() req.get('http://fake/', text=u'This is a sequence file, honest.') r = requests.get('http://fake/') output = StringIO() config = core.Config() config.emit = output.write core._validate_and_write(r, handle, 'FAKE', config) assert output.getvalue() == u'.\n' assert handle.getvalue() == u'This is a sequence file, honest.' def test_validate_and_write_extended_validation(req): """Test extended validation before writing.""" handle = StringIO() req.get('http://fake/', text=u'>foo\nMAGIC') r = requests.get('http://fake/') config = core.Config(extended_validation='loads', molecule='protein') core._validate_and_write(r, handle, 'FAKE', config) assert handle.getvalue() == u'>foo\nMAGIC' def test_get_stream_exception(req): """Test getting a download stream handles exceptions.""" req.get(ENTREZ_URL, exc=requests.exceptions.RequestException) params = dict(id='FAKE') with pytest.raises(DownloadError): core.get_stream(ENTREZ_URL, params) def test_get_stream_bad_status(req): """Test getting a download stream handles bad status codes.""" req.get(ENTREZ_URL, text=u'Nope!', status_code=404) params = dict(id='FAKE') with pytest.raises(InvalidIdError): core.get_stream(ENTREZ_URL, params) def test_get_stream_too_many_requests(req): """Test getting a download stream handles bad status codes.""" req.get(ENTREZ_URL, text=u'Whoa, slow down', status_code=429, headers={"Retry-After": "2"}) params = dict(id='FAKE') with pytest.raises(TooManyRequests): core.get_stream(ENTREZ_URL, params) def test_generate_url(): """Test URL generation.""" config = core.Config() expected = "{}?{}".format(ENTREZ_URL, "retmode=text&id=FAKE&db=nucleotide&rettype=gbwithparts") assert expected == core.generate_url("FAKE", config) config.format = 'gff3' expected = "{}?{}".format(SVIEWER_URL, "retmode=text&id=FAKE&db=nucleotide&report=gff3") assert expected == core.generate_url("FAKE", config) def test_generate_url_with_api_key(): """Test URL generation for API key""" config = core.Config(api_key='FAKE') expected = "{}?{}".format(ENTREZ_URL, "retmode=text&id=FAKE&db=nucleotide&api_key=FAKE&rettype=gbwithparts") assert expected == core.generate_url("FAKE", config) config.format = 'gff3' expected = "{}?{}".format(SVIEWER_URL, "retmode=text&id=FAKE&db=nucleotide&api_key=FAKE&report=gff3") assert expected == core.generate_url("FAKE", config) ncbi-acc-download-0.2.8/tests/test_correct.py000066400000000000000000000021331413446260000212000ustar00rootroot00000000000000from io import StringIO import pytest import requests import os from ncbi_acc_download.core import Config from ncbi_acc_download.errors import DownloadError from ncbi_acc_download import download from ncbi_acc_download import validate def full_path(name): return os.path.abspath(os.path.join(os.path.dirname(__file__), name)) def test_download_and_validate_partial_wgs(req): handle = StringIO(open(full_path('partialcontig.gbk'), 'r').read()) assert validate.run_extended_validation(handle, 'genbank', 'loads') try: from Bio import SeqIO handle.seek(0) records = list(SeqIO.parse(handle, 'genbank')) assert len(records)==1 assert len(records[0].features)==5 except ModuleNotFoundError: pass ## overwrites data in stringIO handle assert validate.run_extended_validation(handle, 'genbank', 'correct') try: from Bio import SeqIO handle.seek(0) records = list(SeqIO.parse(handle, 'genbank')) assert len(records)==1 assert len(records[0].features)==1 except ModuleNotFoundError: pass ncbi-acc-download-0.2.8/tests/test_download.py000066400000000000000000000007301413446260000213470ustar00rootroot00000000000000from io import StringIO import pytest import requests from ncbi_acc_download.core import Config from ncbi_acc_download.errors import DownloadError from ncbi_acc_download import download def test_write_stream(mocker): req = mocker.Mock() req.iter_content = mocker.Mock(side_effect=requests.exceptions.ChunkedEncodingError) handle = StringIO() cfg = Config() with pytest.raises(DownloadError): download.write_stream(req, handle, "FAKE", cfg) ncbi-acc-download-0.2.8/tests/test_validate.py000066400000000000000000000024061413446260000213330ustar00rootroot00000000000000"""Tests for the validation functions.""" from io import StringIO from ncbi_acc_download import validate def test_run_extended_validation_no_biopython(monkeypatch): """Test extended validation returns False if Biopython is not available.""" monkeypatch.setattr(validate, 'HAVE_BIOPYTHON', False) handle = StringIO(u'>foo\nATGC\n>bar\nATGTGA\n') assert validate.run_extended_validation(handle, 'fasta', 'all') is False def test_run_extended_validation_raises(monkeypatch, mocker): """Test the "seqence loads" validator catches exceptions in SeqIO.parse().""" seqio_mock = mocker.MagicMock() seqio_mock.parse = mocker.MagicMock(side_effect=ValueError) monkeypatch.setattr(validate, 'SeqIO', seqio_mock) assert validate.run_extended_validation(StringIO(u''), None, 'all') is False seqio_mock.parse = mocker.MagicMock(side_effect=Exception) assert validate.run_extended_validation(StringIO(u''), None, 'all') is False def test_run_extended_validation_loads(): """Test the "sequence loads" validator.""" handle = StringIO(u'>foo\nATGC\n>bar\nATGTGA\n') assert validate.run_extended_validation(handle, 'fasta', 'loads') handle = StringIO(u'') assert validate.run_extended_validation(handle, 'fasta', 'loads') is False ncbi-acc-download-0.2.8/tests/test_wgs.py000066400000000000000000000130161413446260000203410ustar00rootroot00000000000000"""Test the WGS download helper functions.""" from io import StringIO import os import pytest from ncbi_acc_download.core import Config from ncbi_acc_download.core import ENTREZ_URL from ncbi_acc_download import wgs from ncbi_acc_download.wgs import WgsRange def full_path(name): return os.path.abspath(os.path.join(os.path.dirname(__file__), name)) def test_init(): wgs_range = WgsRange("ABCD", 5, 1, 3) assert wgs_range.identifier == "ABCD" assert wgs_range.width == 5 assert wgs_range.start == 1 assert wgs_range.end == 3 wgs_range = WgsRange("ABCD", 5, 1, 1) assert wgs_range.identifier == "ABCD" assert wgs_range.width == 5 assert wgs_range.start == 1 assert wgs_range.end == 1 def test_get_ids(): wgs_range = WgsRange("ABCD", 5, 1, 3) expected = ["ABCD00001", "ABCD00002", "ABCD00003"] assert wgs_range.get_ids() == expected wgs_range = WgsRange("ABCD", 5, 1, 1) expected = ["ABCD00001"] assert wgs_range.get_ids() == expected def test_from_string(): range_string = "ABCD01000001.1-ABCD01000022.1" wgs_range = WgsRange.from_string(range_string) assert wgs_range.identifier == "ABCD" assert wgs_range.width == 8 assert wgs_range.start == 1000001 assert wgs_range.end == 1000022 # Some records like JOAR00000000.1 only have a single entry in WGS_SCAFLD wgs_range = WgsRange.from_string("ABCD123") assert wgs_range.identifier == "ABCD" assert wgs_range.width == 3 assert wgs_range.start == 123 assert wgs_range.end == 123 assert wgs_range.get_ids() == ["ABCD123"] with pytest.raises(ValueError, match="More than one hyphen in input."): _ = WgsRange.from_string("ABCD123-ABCD234-ABCD345") with pytest.raises(ValueError, match="String identifier is too large."): _ = WgsRange.from_string("ABCD-ABCD") with pytest.raises(ValueError, match="Failed to find shared identifier."): _ = WgsRange.from_string("ABCD123-EFGH234") with pytest.raises(ValueError, match="Last identifier smaller than first."): _ = WgsRange.from_string("ABCD234-ABCD123") def test_download_wgs_parts_no_biopython(): old_have_biopython = wgs.HAVE_BIOPYTHON wgs.HAVE_BIOPYTHON = False cfg = Config(format="genbank") handle = StringIO() new_handle = wgs.download_wgs_parts(handle, cfg) wgs.HAVE_BIOPYTHON = old_have_biopython assert handle == new_handle def test_download_wgs_parts_wgs(req): cfg = Config(format="genbank") wgs_contig = open(full_path('wgs.gbk'), 'rt') req.get(ENTREZ_URL, body=open(full_path('wgs_full.gbk'), 'rt')) outhandle = wgs.download_wgs_parts(wgs_contig, cfg) wgs_full = open(full_path('wgs_full.gbk'), 'rt') assert outhandle.getvalue() == wgs_full.read() wgs_full.close() wgs_contig.close() def test_download_wgs_parts_wgs_retry(req): cfg = Config(format="genbank") wgs_contig = open(full_path('wgs.gbk'), 'rt') req.get(ENTREZ_URL, response_list=[ {"text": u'Whoa, slow down', "status_code": 429, "headers": {"Retry-After": "0"}}, {"body": open(full_path('wgs_full.gbk'), 'rt')}, ]) outhandle = wgs.download_wgs_parts(wgs_contig, cfg) wgs_full = open(full_path('wgs_full.gbk'), 'rt') assert outhandle.getvalue() == wgs_full.read() wgs_full.close() wgs_contig.close() def test_download_wgs_parts_wgs_scafld(req): cfg = Config(format="genbank") wgs_contig = open(full_path('wgs_scafld.gbk'), 'rt') with open(full_path('wgs_full.gbk'), 'rt') as handle: full_file = handle.read() req.get(ENTREZ_URL, text=full_file) outhandle = wgs.download_wgs_parts(wgs_contig, cfg) assert outhandle.getvalue() == full_file wgs_contig.close() def test_download_wgs_parts_supercontig(req): cfg = Config(format="genbank") supercontig = open(full_path('supercontig.gbk'), 'rt') req.get(ENTREZ_URL, body=open(full_path('supercontig_full.gbk'), 'rt')) outhandle = wgs.download_wgs_parts(supercontig, cfg) supercontig_full = open(full_path('supercontig_full.gbk'), 'rt') assert outhandle.getvalue() == supercontig_full.read() supercontig_full.close() supercontig.close() def test_download_wgs_parts_supercontig_retry(req): cfg = Config(format="genbank") supercontig = open(full_path('supercontig.gbk'), 'rt') req.get(ENTREZ_URL, response_list=[ {"text": u'Whoa, slow down', "status_code": 429, "headers": {"Retry-After": "0"}}, {"body": open(full_path('supercontig_full.gbk'), 'rt')} ]) outhandle = wgs.download_wgs_parts(supercontig, cfg) supercontig_full = open(full_path('supercontig_full.gbk'), 'rt') assert outhandle.getvalue() == supercontig_full.read() supercontig_full.close() supercontig.close() def test_download_wgs_no_parts(req): cfg = Config(format="genbank") supercontig = open(full_path('supercontig_full.gbk'), 'rt') req.get(ENTREZ_URL, status_code=404) outhandle = wgs.download_wgs_parts(supercontig, cfg) supercontig_full = open(full_path('supercontig_full.gbk'), 'rt') assert outhandle.getvalue() == supercontig_full.read() supercontig_full.close() supercontig.close() @pytest.mark.xfail def test_download_wgs_parts_tsa(req): cfg = Config(format="genbank") wgs_contig = open(full_path('tsa.gbk'), 'rt') req.get(ENTREZ_URL, body=open(full_path('tsa_full.gbk'), 'rt')) outhandle = wgs.download_wgs_parts(wgs_contig, cfg) wgs_full = open(full_path('tsa_full.gbk'), 'rt') assert outhandle.getvalue() == wgs_full.read() wgs_full.close() wgs_contig.close() ncbi-acc-download-0.2.8/tests/tsa.gbk000066400000000000000000000042431413446260000174060ustar00rootroot00000000000000LOCUS GHGH01000000 126539 rc RNA linear TSA 02-APR-2019 DEFINITION TSA: Acropora millepora, transcriptome shotgun assembly. ACCESSION GHGH00000000 VERSION GHGH00000000.1 DBLINK BioProject: PRJNA473876 BioSample: SAMN10491283 Sequence Read Archive: SRR8254540 KEYWORDS TSA; Transcriptome Shotgun Assembly. SOURCE Acropora millepora ORGANISM Acropora millepora Eukaryota; Metazoa; Cnidaria; Anthozoa; Hexacorallia; Scleractinia; Astrocoeniina; Acroporidae; Acropora. REFERENCE 1 (bases 1 to 126539) AUTHORS Ying,H., Foret,S., Ball,E., Hayward,D.C., Moya,A. and Miller,D.J. TITLE Acropora millepora genome sequencing and assembly JOURNAL Unpublished REFERENCE 2 (bases 1 to 126539) AUTHORS Ying,H., Foret,S., Ball,E., Hayward,D.C., Moya,A. and Miller,D.J. TITLE Direct Submission JOURNAL Submitted (19-FEB-2019) Research School of Biology, Australian National University, Sullivans Creek Road, Canberra, ACT 2617, Australia COMMENT The Acropora millepora transcriptome shotgun assembly (TSA) project has the project accession GHGH00000000. This version of the project (01) has the accession number GHGH01000000, and consists of sequences GHGH01000001-GHGH01126539. Raw RNA-seq reads were trimmed using libngs with minimum quality of 20 and minimum read size of 80 nt. Trinity r2013-02-16 was then employed for de novo transcriptome assembly. ##Assembly-Data-START## Assembly Method :: Trinity r2013-02-16 Assembly Name :: trinity.001 Sequencing Technology :: Illumina ##Assembly-Data-END## FEATURES Location/Qualifiers source 1..126539 /organism="Acropora millepora" /mol_type="transcribed RNA" /db_xref="taxon:45264" /tissue_type="late planula" /country="Australia: Queensland" /collection_date="2011" TSA GHGH01000001-GHGH01000003 // ncbi-acc-download-0.2.8/tests/tsa_full.gbk000066400000000000000000000206271413446260000204340ustar00rootroot00000000000000LOCUS GHGH01000001 507 bp RNA linear TSA 02-APR-2019 DEFINITION TSA: Acropora millepora comp83269_c2_seq5, transcribed RNA sequence. ACCESSION GHGH01000001 VERSION GHGH01000001.1 DBLINK BioProject: PRJNA473876 BioSample: SAMN10491283 Sequence Read Archive: SRR8254540 KEYWORDS TSA; Transcriptome Shotgun Assembly. SOURCE Acropora millepora ORGANISM Acropora millepora Eukaryota; Metazoa; Cnidaria; Anthozoa; Hexacorallia; Scleractinia; Astrocoeniina; Acroporidae; Acropora. REFERENCE 1 (bases 1 to 507) AUTHORS Ying,H., Foret,S., Ball,E., Hayward,D.C., Moya,A. and Miller,D.J. TITLE Acropora millepora genome sequencing and assembly JOURNAL Unpublished REFERENCE 2 (bases 1 to 507) AUTHORS Ying,H., Foret,S., Ball,E., Hayward,D.C., Moya,A. and Miller,D.J. TITLE Direct Submission JOURNAL Submitted (19-FEB-2019) Research School of Biology, Australian National University, Sullivans Creek Road, Canberra, ACT 2617, Australia COMMENT Raw RNA-seq reads were trimmed using libngs with minimum quality of 20 and minimum read size of 80 nt. Trinity r2013-02-16 was then employed for de novo transcriptome assembly. ##Assembly-Data-START## Assembly Method :: Trinity r2013-02-16 Assembly Name :: trinity.001 Sequencing Technology :: Illumina ##Assembly-Data-END## FEATURES Location/Qualifiers source 1..507 /organism="Acropora millepora" /mol_type="transcribed RNA" /submitter_seqid="comp83269_c2_seq5" /db_xref="taxon:45264" /tissue_type="late planula" /country="Australia: Queensland" /collection_date="2011" ORIGIN 1 tttttttttt ccagaacgaa gttcccaggc ttcaaaatcc aaagtccttc agttggttta 61 tgtgactttc ccgtgctgca tgggcgttgg aggtttagtt tccttgttgg taccaggatg 121 atcttgatta ataagaaata cgaagcgaag accgcagatc tgtcatgatg aaaggagcag 181 aaaaagggca aatatgaaaa cacgatcgct cgaaaagttt aagcgtgctt atcattggtt 241 agaacatgtt ggacggatcg gtagtccgca aatataacat gtcgagtctg ccgaaagaga 301 aaactgctcc agtgaccaag gtcaactggt tatcctcaca gggcaacata agtttcgcaa 361 gcatctagga tcgcataacc aaaatcaact cccttgctaa ttacatctgc ttgtggtgtt 421 gtgcgccaag atcagacatg gctgtgtagt ggctgccatg gggagaaact ggtattggac 481 tcagtttatt aatctccata gggtaca // LOCUS GHGH01000002 230 bp RNA linear TSA 02-APR-2019 DEFINITION TSA: Acropora millepora comp1264959_c0_seq1, transcribed RNA sequence. ACCESSION GHGH01000002 VERSION GHGH01000002.1 DBLINK BioProject: PRJNA473876 BioSample: SAMN10491283 Sequence Read Archive: SRR8254540 KEYWORDS TSA; Transcriptome Shotgun Assembly. SOURCE Acropora millepora ORGANISM Acropora millepora Eukaryota; Metazoa; Cnidaria; Anthozoa; Hexacorallia; Scleractinia; Astrocoeniina; Acroporidae; Acropora. REFERENCE 1 (bases 1 to 230) AUTHORS Ying,H., Foret,S., Ball,E., Hayward,D.C., Moya,A. and Miller,D.J. TITLE Acropora millepora genome sequencing and assembly JOURNAL Unpublished REFERENCE 2 (bases 1 to 230) AUTHORS Ying,H., Foret,S., Ball,E., Hayward,D.C., Moya,A. and Miller,D.J. TITLE Direct Submission JOURNAL Submitted (19-FEB-2019) Research School of Biology, Australian National University, Sullivans Creek Road, Canberra, ACT 2617, Australia COMMENT Raw RNA-seq reads were trimmed using libngs with minimum quality of 20 and minimum read size of 80 nt. Trinity r2013-02-16 was then employed for de novo transcriptome assembly. ##Assembly-Data-START## Assembly Method :: Trinity r2013-02-16 Assembly Name :: trinity.001 Sequencing Technology :: Illumina ##Assembly-Data-END## FEATURES Location/Qualifiers source 1..230 /organism="Acropora millepora" /mol_type="transcribed RNA" /submitter_seqid="comp1264959_c0_seq1" /db_xref="taxon:45264" /tissue_type="late planula" /country="Australia: Queensland" /collection_date="2011" ORIGIN 1 cgcctgtaac aagagactgt gaagatgagt ctcactttct tgactatgtt tttcattcaa 61 accttaaaaa aattttcttc aacaaaagtt aataagaaaa atcctttgcc ataaaatatt 121 gctctatttg tgagatgaaa tttcatcact gctctttgaa accctcacag taacaagttt 181 ttattaggtc atattatcaa tgattatcac tacttctttt ccataataat // LOCUS GHGH01000003 1386 bp RNA linear TSA 02-APR-2019 DEFINITION TSA: Acropora millepora comp76929_c0_seq2, transcribed RNA sequence. ACCESSION GHGH01000003 VERSION GHGH01000003.1 DBLINK BioProject: PRJNA473876 BioSample: SAMN10491283 Sequence Read Archive: SRR8254540 KEYWORDS TSA; Transcriptome Shotgun Assembly. SOURCE Acropora millepora ORGANISM Acropora millepora Eukaryota; Metazoa; Cnidaria; Anthozoa; Hexacorallia; Scleractinia; Astrocoeniina; Acroporidae; Acropora. REFERENCE 1 (bases 1 to 1386) AUTHORS Ying,H., Foret,S., Ball,E., Hayward,D.C., Moya,A. and Miller,D.J. TITLE Acropora millepora genome sequencing and assembly JOURNAL Unpublished REFERENCE 2 (bases 1 to 1386) AUTHORS Ying,H., Foret,S., Ball,E., Hayward,D.C., Moya,A. and Miller,D.J. TITLE Direct Submission JOURNAL Submitted (19-FEB-2019) Research School of Biology, Australian National University, Sullivans Creek Road, Canberra, ACT 2617, Australia COMMENT Raw RNA-seq reads were trimmed using libngs with minimum quality of 20 and minimum read size of 80 nt. Trinity r2013-02-16 was then employed for de novo transcriptome assembly. ##Assembly-Data-START## Assembly Method :: Trinity r2013-02-16 Assembly Name :: trinity.001 Sequencing Technology :: Illumina ##Assembly-Data-END## FEATURES Location/Qualifiers source 1..1386 /organism="Acropora millepora" /mol_type="transcribed RNA" /submitter_seqid="comp76929_c0_seq2" /db_xref="taxon:45264" /tissue_type="late planula" /country="Australia: Queensland" /collection_date="2011" ORIGIN 1 ccgatctggc gtcctcgttg ccgtagtcta tgcacaagct ccctgccttc ctgctcgcac 61 gtacgaccaa acaaattgca caatgtcaat cagtaggtga tagactacaa aagtgtgtga 121 ggcttcttcg atattccgat tggttgccga aaaaatctgt tttgaagttg gcgaattgac 181 tatttgcgca tcttcgtaaa tcgagagcgc gtaaaaggga aatattcgga atatcaaaaa 241 agcctcacac acttttgcgc attgataacc aaggaacatt ttggccaaat ttcataaaaa 301 tccgtcaaat ctacataccc tattatttcg cttgaagaag ggaaatcaag ttcctcagtt 361 ttctttcctt tacttttaag gcctggtttt taaaacgtaa gtgaaaactt ttacatgatc 421 gagtcatttg atgtcagatt ctatgacaga agcatcaaat gagtcgacac tgttcgtggc 481 ttgctttgga ttgcgactgt cctcaatttc ggtgagagag aagcgagcga ccacaataca 541 cgcgaaactc ggacacgcaa ctcgtgcgaa attagagact actcgcactc taggctagct 601 taaaagtgga agtcagccca agtgggctgg cggataagtt ttagcgggat atcgaataac 661 gatcatttac catgctgaaa tgacggttcg tttgtccaat aacgctcgct ggcctggttt 721 ccgagcaaac gtaacataaa agttcctgcc aggtttgatg agacgaaaaa ttttacaaca 781 taagcatgcg ttatcttgcc caaggaacgc ggttacccga tcacaagttt atttggtgtt 841 tttggtatag aaagaattca tgtaaaacat gaaaacactt cttcttactc aaaataaaat 901 ataagattga aaatatttcg gtgactttga agacagtagt aatatttcat cccggacctt 961 ccagaagagc cagcctgcct catttaaaca agctcttcga ccattgatgc tctcaccaaa 1021 cgggataaga aatagtcatc atccctagtc ctttgtgctg tacaaaagta tgccattttg 1081 atatcaacta cacattcaag taagagacaa atgctacttg tttatcagcg ttcatttcta 1141 ctgtgtgcaa agtttcattg aatcagccat tgttttcttg actgtgagag aaaaaagaag 1201 ctattaagct ctttttagtg actgaagatt ccattttgaa tgtcatgaat cacacaacaa 1261 tttcattgcg aactgccatt ttttaagcac attttagcgc gtgatagaat taaaagcgta 1321 gtactattcg ttaagctcct gatatcatcc tctttttttt ttcttttttt tttttttttt 1381 tttttt // ncbi-acc-download-0.2.8/tests/wgs.gbk000066400000000000000000000010031413446260000174060ustar00rootroot00000000000000LOCUS NZ_BASQ01000000 3396165 bp DNA linear BCT 24-APR-2017 DEFINITION Microbacterium sp. TS-1, whole genome shotgun sequencing project. ACCESSION NZ_BASQ00000000 VERSION NZ_BASQ00000000.1 FEATURES Location/Qualifiers source 1..3396165 /organism="Microbacterium sp. TS-1" /mol_type="genomic DNA" /strain="TS-1" /db_xref="taxon:1344956" WGS BASQ01000001-BASQ01000003 // ncbi-acc-download-0.2.8/tests/wgs_full.gbk000066400000000000000000000042721413446260000204430ustar00rootroot00000000000000LOCUS NZ_BASQ01000001 120 bp DNA linear CON 24-APR-2017 DEFINITION Microbacterium sp. TS-1, whole genome shotgun sequence. ACCESSION NZ_BASQ01000001 VERSION NZ_BASQ01000001.1 KEYWORDS . SOURCE Microbacterium sp. TS-1 ORGANISM Microbacterium sp. TS-1 Bacteria; Actinobacteria; Micrococcales; Microbacteriaceae; Microbacterium. FEATURES Location/Qualifiers source 1..120 /organism="Microbacterium sp. TS-1" /mol_type="genomic DNA" /strain="TS-1" /db_xref="taxon:1344956" ORIGIN 1 aagagggcca cccaatgttg ggtggccctt tttcgtgccc gtgtgaccgc agagttgact 61 ctcgcggctg tgtggtgtgc aatatattgc atgtcgctgg cccccgtctc ccgtccggcg // LOCUS NZ_BASQ01000002 60 bp DNA linear CON 24-APR-2017 DEFINITION Microbacterium sp. TS-1, whole genome shotgun sequence. ACCESSION NZ_BASQ01000002 VERSION NZ_BASQ01000002.1 KEYWORDS . SOURCE Microbacterium sp. TS-1 ORGANISM Microbacterium sp. TS-1 Bacteria; Actinobacteria; Micrococcales; Microbacteriaceae; Microbacterium. FEATURES Location/Qualifiers source 1..60 /organism="Microbacterium sp. TS-1" /mol_type="genomic DNA" /strain="TS-1" /db_xref="taxon:1344956" ORIGIN 1 ttgacctgtt cataaccaga gccggagacc gactcggttg tgaggtgtgg tggggaccct // LOCUS NZ_BASQ01000003 30 bp DNA linear CON 09-APR-2017 DEFINITION Microbacterium sp. TS-1, whole genome shotgun sequence. ACCESSION NZ_BASQ01000003 VERSION NZ_BASQ01000003.1 KEYWORDS . SOURCE Microbacterium sp. TS-1 ORGANISM Microbacterium sp. TS-1 Bacteria; Actinobacteria; Micrococcales; Microbacteriaceae; Microbacterium. FEATURES Location/Qualifiers source 1..30 /organism="Microbacterium sp. TS-1" /mol_type="genomic DNA" /strain="TS-1" /db_xref="taxon:1344956" ORIGIN 1 gggtggggta tggttgtgaa gttgccctct // ncbi-acc-download-0.2.8/tests/wgs_scafld.gbk000066400000000000000000000010571413446260000207330ustar00rootroot00000000000000LOCUS NZ_BASQ01000000 3396165 bp DNA linear BCT 24-APR-2017 DEFINITION Microbacterium sp. TS-1, whole genome shotgun sequencing project. ACCESSION NZ_BASQ00000000 VERSION NZ_BASQ00000000.1 FEATURES Location/Qualifiers source 1..3396165 /organism="Microbacterium sp. TS-1" /mol_type="genomic DNA" /strain="TS-1" /db_xref="taxon:1344956" WGS BASQ01000001-BASQ01000003 WGS_SCAFLD NZ_BASQ01000001-NZ_BASQ01000003 //