==> cutadapt-4.7/.codecov.yml <==
codecov:
  require_ci_to_pass: no

coverage:
  precision: 1
  round: down
  range: "90...100"
  status:
    project:
      default:
        target: auto
        threshold: 1%
        base: auto
    patch: no
    changes: no

comment: off

==> cutadapt-4.7/.editorconfig <==
[*.{py,pyx,rst}]
charset=utf-8
end_of_line=lf
insert_final_newline=true
indent_style=space
indent_size=4

==> cutadapt-4.7/.github/issue_template.md <==

==> cutadapt-4.7/.github/workflows/ci.yml <==
name: CI
on: [push, pull_request]

jobs:
  lint:
    # Run for PRs only if they come from a forked repo (avoids duplicate runs)
    if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name
    timeout-minutes: 10
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.10"]
        toxenv: [flake8, mypy, docs, black]
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install tox
        run: pip install tox
      - name: Run tox ${{ matrix.toxenv }}
        run: tox -e ${{ matrix.toxenv }}

  build:
    if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0  # required for setuptools_scm
      - name: Build sdist and temporary wheel
        run: pipx run build
      - uses: actions/upload-artifact@v4
        with:
          name: sdist
          path: dist/*.tar.gz

  test:
    if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name
    timeout-minutes: 10
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest]
        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
        include:
          - os: macos-latest
            python-version: "3.10"
          - os: windows-latest
            python-version: "3.10"
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install tox
        run: pip install tox
      - name: Test
        run: tox -e py$(echo ${{ matrix.python-version }} | tr -d .)
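      # The substitution above removes the dot from the Python version, so a
      # matrix entry such as "3.10" selects the tox environment "py310".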
      - name: Upload coverage report
        uses: codecov/codecov-action@v4

  wheels:
    if: github.event_name == 'push' && github.ref_type == 'tag'
    needs: [lint, build, test]
    timeout-minutes: 20
    strategy:
      matrix:
        os: [ubuntu-20.04, windows-2019, macos-latest]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0  # required for setuptools_scm
      - name: Build wheels
        uses: pypa/cibuildwheel@v2.17.0
        env:
          CIBW_BUILD: "cp*-manylinux_x86_64 cp3*-win_amd64 cp3*-macosx_x86_64"
          CIBW_SKIP: "cp37-*"
      - uses: actions/upload-artifact@v4
        with:
          name: wheels-${{ matrix.os }}
          path: wheelhouse/*.whl

  publish:
    if: github.event_name == 'push' && github.ref_type == 'tag'
    needs: [build, wheels]
    timeout-minutes: 10
    runs-on: ubuntu-latest
    steps:
      - uses: actions/download-artifact@v4
        with:
          name: sdist
          path: dist/
      - uses: actions/download-artifact@v4
        with:
          pattern: wheels-*
          path: dist/
          merge-multiple: true
      - name: Publish alpha release to test PyPI
        if: contains(github.ref_name, 'a')
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          user: __token__
          password: ${{ secrets.test_pypi_password }}
          repository-url: https://test.pypi.org/legacy/
      - name: Publish to PyPI
        if: "!contains(github.ref_name, '.dev') && !contains(github.ref_name, 'a')"
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          user: __token__
          password: ${{ secrets.pypi_password }}

==> cutadapt-4.7/.github/workflows/pyinstaller.yml <==
name: PyInstaller
on:
  schedule:
    - cron: "0 15 6 * *"
  workflow_dispatch:

jobs:
  build:
    runs-on: windows-latest
    steps:
      - uses: actions/checkout@v3
      - name: Python
        uses: actions/setup-python@v3
        with:
          python-version: '3.9'
      - name: Install
        run: |
          python -m venv venv
          venv/Scripts/pip install pyinstaller
          venv/Scripts/pip install .
      - name: Make exe
        run: |
          echo "import sys" > script.py
          echo "from cutadapt.__main__ import main_cli" >> script.py
          echo "sys.exit(main_cli())" >> script.py
          venv/Scripts/pyinstaller -F --hidden-import=cutadapt._match_tables -n cutadapt script.py
      - name: Run it
        run: dist/cutadapt.exe --version
      - name: Test multicore
        run: ( echo ">read" && echo "ACGT" ) | dist/cutadapt.exe -j 2 --quiet -
      - uses: actions/upload-artifact@v3
        with:
          name: cutadapt-exe
          path: dist/cutadapt.exe

==> cutadapt-4.7/.gitignore <==
# Editor-specific ignore patterns such as "*~" should be added to
# ~/.config/git/ignore, not here.
*.pyc
__pycache__/
/MANIFEST
/build/
/dist/
/.coverage*
/.tox
/.cache
/.pytest_cache
/src/cutadapt/*.c
/src/cutadapt/*.so
/doc/_build
/src/cutadapt.egg-info/
src/cutadapt/_version.py
.mypy_cache

==> cutadapt-4.7/.pre-commit-config.yaml <==
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: "v2.3.0"
    hooks:
      - id: end-of-file-fixer
      - id: trailing-whitespace
  - repo: https://github.com/psf/black
    rev: 22.3.0
    hooks:
      - id: black
  - repo: https://github.com/charliermarsh/ruff-pre-commit
    rev: "v0.0.261"
    hooks:
      - id: ruff

==> cutadapt-4.7/.readthedocs.yaml <==
version: 2
build:
  os: "ubuntu-22.04"
  tools:
    python: "3.11"
python:
  install:
    - requirements: doc/requirements.txt

==> cutadapt-4.7/CHANGES.rst <==
=========
Changelog
=========

v4.7 (2024-03-14)
-----------------

* :issue:`767`: Error messages are always sent to stderr. They were previously sent to stdout when using ``-o``.
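  For example, in a shell invocation like the following (adapter sequence and file names are placeholders), the report still goes to stdout while error messages end up in the log::

      cutadapt -a ACGTACGT -o trimmed.fastq input.fastq > report.txt 2> errors.log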
* Cutadapt can now read single-end data from unaligned BAM files (uBAM).
* Dropped support for Python 3.7.

v4.6 (2023-12-06)
-----------------

* :issue:`744`, :issue:`561`: Make ``--revcomp`` work with paired-end data.

v4.5 (2023-10-06)
-----------------

* :pr:`725`: Added a ``--max-average-error-rate``/``--max-aer`` option to add a filter that checks whether the number of expected errors divided by the read length is above a certain threshold. The expected errors are calculated in the same way as for ``--max-expected-errors``; dividing by read length helps for reads that have varying lengths.
* :issue:`696`: Added a histogram of the lengths of removed poly-A tails to the report.
* :issue:`696`: For paired-end data, ``--poly-a`` was changed to trim poly-T "heads" on R2 (this is still experimental as it is unclear whether that is the desired behavior; please give feedback!)
* A poly-A tail is only removed if it is at least three nucleotides long.
* :issue:`734`: Fixed misassignments during demultiplexing that would sometimes happen when there are collisions between adapter sequences (when the warning "sequence ... cannot be assigned uniquely" was printed). Previously, sequences could incorrectly be assigned to an adapter that is not actually the best match.

v4.4 (2023-04-28)
-----------------

* :issue:`695`: Fixed very slow k-mer heuristic initialization (hours instead of seconds) for degenerate adapter sequences such as ``A{100}`` as used when doing poly-A trimming.
* :issue:`696`: Added option ``--poly-a`` for trimming poly-A tails. This is more accurate and multiple times faster than using ``-a A{100}`` as previously recommended. This is currently experimental (that is, the algorithm may change in the next one or two releases).
* :issue:`685`: Sped up index generation somewhat. This is most noticeable when demultiplexing using thousands or more adapters. The speedup differs depending on whether indels are allowed or not because different algorithms are used.
* :issue:`685`: Sped up demultiplexing (when using an index) for the case when the read contains ``N`` bases within the region where the adapter matches. Previously, any ``N`` would disable the index for that read and trigger a fallback to the slow method of matching each adapter one by one. Now the index is used even in those cases.
* :pr:`700`: Sped up ``--max-expected-errors``. Thanks @rhpvorderman.

v4.3 (2023-03-17)
-----------------

* :pr:`663`: Cutadapt became significantly faster due to an added runtime heuristic that avoids running the full alignment algorithm if it can be proven that it cannot succeed. Thanks to @rhpvorderman for this great improvement!
* :issue:`665`: 5' adapters did not allow partial matches in the beginning when the :ref:`rightmost ` adapter-search parameter was used.
* :issue:`662`: Fixed an assertion error when ``--discard-untrimmed`` was used together with ``--json`` and demultiplexing.
* :issue:`674`: When reading 3' adapters from an external file, they can now all be anchored by using the syntax ``-a file$:adapters.fasta`` (note the ``$`` in ``file$:``).
* :issue:`669`: The ``--rename`` option now understands the ``\t`` escape sequence and will insert a tab character in its place. This is useful when transferring FASTQ header comments to SAM tags.

v4.2 (2022-12-09)
-----------------

* :issue:`654`: When determining the error rate for a partial match of an adapter with ``N`` wildcards, the number of non-``N`` bases was not computed correctly, which could lead to matches not being found (see the worked example below).
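  As an illustration of how the error-rate threshold interacts with ``N`` wildcards (the adapter and file names here are made up for the example): with the default maximum error rate of 0.1, an adapter such as ``NNNNACGTACGT`` has 8 non-``N`` bases, so a full-length match allows at most ``floor(0.1 * 8) = 0`` errors rather than ``floor(0.1 * 12) = 1``::

      cutadapt -e 0.1 -a NNNNACGTACGT -o trimmed.fastq input.fastq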
* :issue:`546`: Automatically replace ``I`` in adapter sequences with ``N``. ``I`` is used to encode inosine, which matches any base. Contributed by @peterjc.
* :issue:`528`: Cutadapt should now no longer hang in multicore mode when an error was raised in a worker process (for example, when an incorrectly formatted FASTQ file was encountered).

v4.1 (2022-06-07)
-----------------

* :issue:`624`: You can now combine reading adapter sequences from an external file with the search parameter notation. For example, ``-a "file:adapters.fasta;min_overlap=5"`` sets the minimum overlap to 5 for all adapters in ``adapters.fasta``.
* :issue:`361`: When reading 5' adapters from an external file, they can now all be anchored by using the syntax ``-g ^file:adapters.fasta`` (note the ``^`` before ``file:``).
* :issue:`254`: Finding the *rightmost* 5' adapter occurrence is now supported by using the ``rightmost`` search parameter (the default is to find the leftmost occurrence).
* :issue:`615`: Fix linked adapter statistics for the 5' and 3' ends not being reported separately.
* :issue:`616`: Report the correct number of quality-trimmed bases when both ``-q`` and ``--nextseq-trim`` are used.

v4.0 (2022-04-13)
-----------------

* :issue:`604`, :pr:`608`: The :ref:`alignment algorithm was tweaked ` to penalize indels more and to more accurately pick the leftmost adapter occurrence if there are multiple. This will normally affect very few reads, but should generally lead to fewer surprising results in cases where it matters. Because this changes trimming results, it was appropriate to bump the major version to 4.
* :issue:`607`: Print an error when an output file was specified multiple times (for example, for ``--untrimmed-output`` and ``--too-short-output``). Sending output from different filters to the same file is not supported at the moment.
* :issue:`603`: When ``-e`` was used with an absolute number of errors and there were ``N`` wildcards in the sequence, the actual number of allowed errors was too low.
* Speed up quality trimming (both ``-q`` and ``--nextseq-trim``) somewhat.
* Python 3.6 is no longer supported as it is end-of-life.

v3.7 (2022-02-23)
-----------------

* :issue:`600`: Fixed the ``{match_sequence}`` placeholder not working when renaming paired-end reads.

v3.6 (2022-02-18)
-----------------

* :issue:`437`: Add ``{match_sequence}`` to the placeholders that ``--rename`` accepts. This allows adding the sequence matching an adapter (including errors) to the read header. An empty string is inserted if there is no match.
* :issue:`589`: Windows wheels are now available on PyPI. That is, ``pip install`` will no longer attempt to compile things, but just install a pre-compiled binary.
* :issue:`592`: Clarify in documentation and error messages that anchored adapters need to match in full and that therefore setting an explicit minimum overlap (``min_overlap=``, ``o=``) for them is not possible.

v3.5 (2021-09-29)
-----------------

* :issue:`555`: Add support for dumping statistics in JSON format using ``--json``.
* :issue:`541`: Add a "Read fate breakdown" section heading to the report, and also add statistics for reads discarded because of ``--discard-untrimmed`` and ``--discard-trimmed``. With this, the numbers in that section should add up to 100%.
* Add option ``-Q``, which allows specifying a quality-trimming threshold for R2 that is different from the one for R1.
* :issue:`567`: Add the ``noindels`` adapter-trimming parameter.
  You can now write ``-a "ADAPTER;noindels"`` to disallow indels for a single adapter only.
* :issue:`570`: Fix ``--pair-adapters`` not finding some pairs when reads contain more than one adapter.
* :issue:`524`: Fix a memory leak when using ``--info-file`` with multiple cores.
* :issue:`559`: Fix adjacent base statistics not being shown for linked adapters.

v3.4 (2021-03-30)
-----------------

* :issue:`481`: An experimental single-file Windows executable of Cutadapt is `available for download on the GitHub "releases" page `_.
* :issue:`517`: Report the correct sequence in the info file if the read was reverse-complemented.
* :issue:`517`: Added a column to the info file that shows whether the read was reverse-complemented (if ``--revcomp`` was used).
* :issue:`320`: Fix (again) "Too many open files" when demultiplexing.

v3.3 (2021-03-04)
-----------------

* :issue:`504`: Fix a crash on Windows.
* :issue:`490`: When ``--rename`` is used with ``--revcomp``, disable adding the ``rc`` suffix to reads that were reverse-complemented.
* Also, there is now a ``{rc}`` template variable for the ``--rename`` option, which is replaced with "rc" if the read was reverse-complemented (and the empty string if not).
* :issue:`512`: Fix issue :issue:`128` once more (the “Reads written” figure in the report incorrectly included both trimmed and untrimmed reads if ``--untrimmed-output`` was used).
* :issue:`515`: The report is now sent to stderr if any output file is written to stdout.

v3.2 (2021-01-07)
-----------------

* :issue:`437`: Implement a ``--rename`` option for :ref:`flexible read name modifications ` such as moving a barcode sequence into the read name.
* :issue:`503`: The index for demultiplexing is now created a lot faster (within seconds instead of minutes) when allowing indels.
* :issue:`499`: Fix combinatorial demultiplexing not working when using multiple cores.

v3.1 (2020-12-03)
-----------------

* :issue:`443`: With ``--action=retain``, it is now possible to trim reads while leaving the adapter sequence itself in the read. That is, only the sequence before (for 5’ adapters) or after (for 3’ adapters) is removed. With linked adapters, both adapters are retained.
* :issue:`495`: Running with multiple cores did not work using macOS and Python 3.8+. To prevent problems like these in the future, automated testing has been extended to also run on macOS.
* :issue:`482`: Print statistics for ``--discard-casava`` and ``--max-ee`` in the report.
* :issue:`497`: The changelog for 3.0 previously forgot to mention that the following options, which were deprecated in version 2.0, have now been removed, and using them will lead to an error: ``--format``, ``--colorspace``, ``-c``, ``-d``, ``--double-encode``, ``-t``, ``--trim-primer``, ``--strip-f3``, ``--maq``, ``--bwa``, ``--no-zero-cap``. This frees up some single-character options, allowing them to be re-purposed for future Cutadapt features.

v3.0 (2020-11-10)
-----------------

* Demultiplexing on multiple cores is now supported. This was the last feature that only ran single-threaded.
* :issue:`478`: Demultiplexing now always generates all possible output files.
* :issue:`358`: You can now use ``-e`` also :ref:`to specify the maximum number of errors ` (instead of the maximum error rate). For example, write ``-e 2`` to allow two errors over a full-length adapter match.
* :pr:`486`: Trimming many anchored adapters (for example when demultiplexing) is now faster by using an index even when indels are allowed.
  Previously, Cutadapt would only be able to build an index with ``--no-indels``.
* :issue:`469`: Cutadapt did not run under Python 3.8 on recent macOS versions.
* :issue:`425`: Change the default compression level for ``.gz`` output files from 6 to 5. This reduces the time used for compression by about 50% while increasing file size by less than 10%. To get the old behavior, use ``--compression-level=6``. If you use Cutadapt to create intermediate files that are deleted anyway, consider also using the even faster option ``-Z`` (same as ``--compression-level=1``).
* :pr:`485`: Fix that, under some circumstances, in particular when trimming a 5' adapter and there was a mismatch in its last nucleotide(s), not the entire adapter sequence would be trimmed from the read. Since fixing this required changing the alignment algorithm slightly, this is a backwards-incompatible change.
* Fix that the report did not include the number of reads that are too long, too short or had too many ``N``. (This unintentionally disappeared in a previous version.)
* :issue:`487`: When demultiplexing, the reported number of written pairs was always zero.
* :issue:`497`: The following options, which were deprecated in version 2.0, have been removed, and using them will lead to an error: ``--format``, ``--colorspace``, ``-c``, ``-d``, ``--double-encode``, ``-t``, ``--trim-primer``, ``--strip-f3``, ``--maq``, ``--bwa``, ``--no-zero-cap``. This frees up some single-character options, allowing them to be re-purposed for future Cutadapt features.
* Ensure Cutadapt runs under Python 3.9.
* Drop support for Python 3.5.

v2.10 (2020-04-22)
------------------

* Fixed a performance regression introduced in version 2.9.
* :pr:`449`: ``--action=`` could not be used with ``--pair-adapters``. Fix contributed by wlokhorst.
* :issue:`450`: ``--untrimmed-output``, ``--too-short-output`` and ``--too-long-output`` can now be written interleaved.
* :issue:`453`: Fix the problem that ``N`` wildcards in adapters did not match ``N`` characters in the read. ``N`` characters now match any character in the read, independent of whether ``--match-read-wildcards`` is used or not.
* With ``--action=lowercase``/``mask``, print which sequences would have been removed in the “Overview of removed sequences” statistics. Previously, it would show that no sequences had been removed.

v2.9 (2020-03-18)
-----------------

* :issue:`441`: Add a ``--max-ee`` (or ``--max-expected-errors``) option for filtering reads whose number of expected errors exceeds the given threshold. The idea comes from `Edgar et al. (2015) `_.
* :issue:`438`: The info file now contains the ``rc`` suffix that is added to the names of reverse-complemented reads (with ``--revcomp``).
* :issue:`448`: ``.bz2`` and ``.xz`` output wasn’t possible in multi-core mode.

v2.8 (2020-01-13)
-----------------

* :issue:`220`: With option ``--revcomp``, Cutadapt now searches both the read and its reverse complement for adapters. The version that matches best is kept. This can be used to “normalize” strandedness.
* :issue:`430`: ``--action=lowercase`` now works with linked adapters.
* :issue:`431`: Info files can now be written even for linked adapters.

v2.7 (2019-11-22)
-----------------

* :issue:`427`: Multicore is now supported even when using ``--info-file``, ``--rest-file`` or ``--wildcard-file`` (see the example below). The only remaining feature that still does not work with multicore is demultiplexing.
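  A sketch of such an invocation (adapter sequence and file names are placeholders)::

      cutadapt --cores=4 --info-file=info.tsv -a ACGTACGT -o trimmed.fastq input.fastq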
* :issue:`290`: When running on a single core, Cutadapt no longer spawns external ``pigz`` processes for writing gzip-compressed files. This is a first step towards ensuring that using ``--cores=n`` uses at most *n* CPU cores.
* This release adds support for Python 3.8.

v2.6 (2019-10-26)
-----------------

* :issue:`395`: Do not show animated progress when ``--quiet`` is used.
* :issue:`399`: When two adapters align to a read equally well (in terms of the number of matches), prefer the alignment that has fewer errors.
* :issue:`401`: Give priority to adapters given earlier on the command line. Previously, the priority was: all 3' adapters, all 5' adapters, all anywhere adapters. In rare cases this could lead to different results.
* :issue:`404`: Fix an issue preventing Cutadapt from being used on Windows.
* This release no longer supports Python 3.4 (which has reached end of life).

v2.5 (2019-09-04)
-----------------

* :issue:`391`: Multicore is now supported even when using ``--untrimmed-output``, ``--too-short-output``, ``--too-long-output`` or the corresponding ``...-paired-output`` options.
* :issue:`393`: Using ``--info-file`` no longer crashes when processing paired-end data. However, the info file itself will only contain results for R1.
* :issue:`394`: Options ``-e``/``--no-indels``/``-O`` were ignored for linked adapters.
* :issue:`320`: When a “Too many open files” error occurs during demultiplexing, Cutadapt can now automatically raise the limit and re-try if the limit is a “soft” limit.

v2.4 (2019-07-09)
-----------------

* :issue:`292`: Implement support for demultiplexing paired-end reads that use :ref:`combinatorial indexing (“combinatorial demultiplexing”) `.
* :pr:`384`: Speed up reading compressed files by requiring an xopen version that uses an external pigz process even for *reading* compressed input files (not only for writing).
* :issue:`381`: Fix ``--report=minimal`` not working.
* :issue:`380`: Add a ``--fasta`` option for forcing that FASTA is written to standard output even when the input is FASTQ. Previously, forcing FASTA was only possible by providing an output file name.

v2.3 (2019-04-25)
-----------------

* :issue:`378`: The ``--pair-adapters`` option, added in version 2.1, was not actually usable for demultiplexing.

v2.2 (2019-04-20)
-----------------

* :issue:`376`: Fix a crash when using anchored 5' adapters together with ``--no-indels`` and trying to trim an empty read.
* :issue:`369`: Fix a crash when attempting to trim an empty read using a ``-g`` adapter with wildcards.

v2.1 (2019-03-15)
-----------------

* :issue:`366`: Fix problems when combining ``--cores`` with reading from standard input or writing to standard output.
* :issue:`347`: Support :ref:`“paired adapters” `. One use case is demultiplexing Illumina *Unique Dual Indices* (UDI).

v2.0 (2019-03-06)
-----------------

This is a major new release with lots of bug fixes and new features, but also some backwards-incompatible changes. These should hopefully not affect too many users, but please make sure to review them and possibly update your scripts!

Backwards-incompatible changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

* :issue:`329`: Linked adapters specified with ``-a ADAPTER1...ADAPTER2`` are no longer anchored by default. To get results consistent with the old behavior, use ``-a ^ADAPTER1...ADAPTER2`` instead (see the example below).
* Support for colorspace data was removed. Thus, the following command-line options can no longer be used: ``-c``, ``-d``, ``-t``, ``--strip-f3``, ``--maq``, ``--bwa``, ``--no-zero-cap``.
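As an example of the :issue:`329` change above (the adapter names are placeholders), the pre-2.0 behavior of ``-a ADAPTER1...ADAPTER2`` now has to be requested explicitly by anchoring the 5' component::

    cutadapt -a ^ADAPTER1...ADAPTER2 -o trimmed.fastq input.fastq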
* “Legacy mode” has been removed. This mode was enabled under certain conditions and would change the behavior such that the read-modifying options such as ``-q`` would only apply to the forward/R1 reads. This was necessary for compatibility with old Cutadapt versions, but became increasingly confusing.
* :issue:`360`: Computation of the error rate of an adapter match no longer counts the ``N`` wildcard bases. Previously, an adapter like ``N{18}CC`` (18 ``N`` wildcards followed by ``CC``) would effectively match anywhere because the default error rate of 0.1 (10%) would allow for two errors. The error rate of a match is now computed as the number of errors divided by the number of non-``N`` bases in the matching part of the adapter.
* This release of Cutadapt requires at least Python 3.4 to run. Python 2.7 is no longer supported.

Features
~~~~~~~~

* A progress indicator is printed while Cutadapt is working. If you redirect standard error to a file, the indicator is disabled.
* Reading of FASTQ files has gotten faster due to a new parser. The FASTA and FASTQ reading/writing functions are now available as part of the `dnaio library `_. This is a separate Python package that can be installed independently from Cutadapt. There is one regression at the moment: FASTQ files that use a second header (after the "+") will have that header removed in the output.
* Some other performance optimizations were made. Speedups of up to 15% are possible.
* Demultiplexing has become a lot faster :ref:`under certain conditions `.
* :issue:`335`: For linked adapters, it is now possible to :ref:`specify which of the two adapters should be required `, overriding the default.
* :issue:`166`: By specifying ``--action=lowercase``, it is now possible to not trim adapters, but to instead convert the section of the read that would have been trimmed to lowercase.

Bug fixes
~~~~~~~~~

* Removal of legacy mode also fixes :issue:`345`: ``--length`` would not enable legacy mode.
* The switch to ``dnaio`` also fixed :issue:`275`: Input files with non-standard names now no longer lead to a crash. Instead, the format is recognized from the file content.
* Fix :issue:`354`: Sequences given using ``file:`` can now be unnamed.
* Fix :issue:`257` and :issue:`242`: When only R1 or only R2 adapters are given, the ``--pair-filter`` setting is now forced to ``both`` for the ``--discard-untrimmed`` (and ``--untrimmed-(paired-)output``) filters. Otherwise, with the default ``--pair-filter=any``, all pairs would be considered untrimmed because one of the reads in the pair is always untrimmed.

Other
~~~~~

* :issue:`359`: The ``-f``/``--format`` option is now ignored and a warning will be printed if it is used. The input file format is always auto-detected.

v1.18 (2018-09-07)
------------------

Features
~~~~~~~~

* Close :issue:`327`: Maximum and minimum lengths can now be specified separately for R1 and R2 with ``-m LENGTH1:LENGTH2``. One of the lengths can be omitted, in which case only the length of the other read is checked (as in ``-m 17:`` or ``-m :17``).
* Close :issue:`322`: Use ``-j 0`` to auto-detect how many cores to run on. This should even work correctly on cluster systems when Cutadapt runs as a batch job to which fewer cores than exist on the machine have been assigned. Note that the number of threads used by ``pigz`` cannot be controlled at the moment, see :issue:`290`.
* Close :issue:`225`: Allow setting the maximum error rate and minimum overlap length per adapter.
  A new :ref:`syntax for adapter-specific parameters ` was added for this. Example: ``-a "ADAPTER;min_overlap=5"``.
* Close :issue:`152`: Using the new syntax for adapter-specific parameters, it is now possible to allow partial matches of a 3' adapter at the 5' end (and partial matches of a 5' adapter at the 3' end) by specifying the ``anywhere`` parameter (as in ``-a "ADAPTER;anywhere"``).
* Allow ``--pair-filter=first`` in addition to ``both`` and ``any``. If used, a read pair is discarded if the filtering criterion applies to R1; R2 is ignored.
* Close :issue:`112`: Implement a ``--report=minimal`` option for printing a succinct two-line report in tab-separated value (TSV) format. Thanks to :user:`jvolkening` for coming up with an initial patch!

Bug fixes
~~~~~~~~~

* Fix :issue:`128`: The “Reads written” figure in the report incorrectly included both trimmed and untrimmed reads if ``--untrimmed-output`` was used.

Other
~~~~~

* The options ``--no-trim`` and ``--mask-adapter`` should now be written as ``--action=none`` and ``--action=mask``, respectively. The old options still work.
* This is the last release to support `colorspace data `_.
* This is the last release to support Python 2.

v1.17 (2018-08-20)
------------------

* Close :issue:`53`: Implement adapters :ref:`that disallow internal matches `. This is a bit like anchoring, but less strict: the adapter sequence can appear at different lengths, but must always be at one of the ends. Use ``-a ADAPTERX`` (with a literal ``X``) to disallow internal matches for a 3' adapter. Use ``-g XADAPTER`` to disallow them for a 5' adapter.
* :user:`klugem` contributed PR :issue:`299`: The ``--length`` option (and its alias ``-l``) can now be used with negative lengths, which will remove bases from the beginning of the read instead of from the end.
* Close :issue:`107`: Add a ``--discard-casava`` option to remove reads that did not pass CASAVA filtering (this is possibly relevant only for older datasets).
* Fix :issue:`318`: Cutadapt should now be installable with Python 3.7.
* Running Cutadapt under Python 3.3 is no longer supported (Python 2.7 or 3.4+ is needed).
* Planned change: One of the next Cutadapt versions will drop support for Python 2 entirely, requiring Python 3.

v1.16 (2018-02-21)
------------------

* Fix :issue:`291`: When processing paired-end reads with multiple cores, there could be errors about incomplete FASTQs although the files are intact.
* Fix :issue:`280`: Quality trimming statistics incorrectly show the same values for R1 and R2.

v1.15 (2017-11-23)
------------------

* Cutadapt can now run on multiple CPU cores in parallel! To enable it, use the option ``-j N`` (or the long form ``--cores=N``), where ``N`` is the number of cores to use (see the example below). Multi-core support is only available on Python 3, and not yet with some command-line arguments. See :ref:`the new section about multi-core in the documentation ` for details. When writing ``.gz`` files, make sure you have ``pigz`` installed to get the best speedup.
* The plan is to make multi-core the default (automatically using as many cores as are available) in future releases, so please test it and `report an issue `_ if you find problems!
* Issue :issue:`256`: ``--discard-untrimmed`` did not have an effect on non-anchored linked adapters.
* Issue :issue:`118`: Added support for demultiplexing of paired-end data.
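A minimal multi-core invocation as introduced in v1.15 (the adapter sequence and file names are placeholders; ``pigz`` is optional but speeds up ``.gz`` output)::

    cutadapt -j 4 -a AACCGGTT -o trimmed.fastq.gz input.fastq.gz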
v1.14 (2017-06-16)
------------------

* Fix: Statistics for the 3' part of a linked adapter were reported incorrectly.
* Fix `issue #244 `_: Quality trimming with ``--nextseq-trim`` would not apply to R2 when trimming paired-end reads.
* ``--nextseq-trim`` now disables legacy mode.
* Fix `issue #246 `_: installation failed on a non-UTF-8 locale.

v1.13 (2017-03-16)
------------------

* The 3' adapter of linked adapters can now be anchored. Write ``-a ADAPTER1...ADAPTER2$`` to enable this. Note that the 5' adapter is always anchored in this notation.
* Issue #224: If you want the 5' part of a linked adapter *not* to be anchored, you can now write ``-g ADAPTER1...ADAPTER2`` (note ``-g`` instead of ``-a``). This feature is experimental and may change behavior in the next release.
* Issue #236: For more accurate statistics, it is now possible to specify the GC content of the input reads with ``--gc-content``. This does not change trimming results, only the number in the "expect" column of the report. Since this is probably not needed by many people, the option is not listed when running ``cutadapt --help``.
* Issue #235: Adapter sequences are now required to contain only valid IUPAC codes (lowercase is also allowed, ``U`` is an alias for ``T``). This should help to catch hard-to-find bugs, especially in scripts. Use option ``-N`` to match characters literally (possibly useful for amino acid sequences).
* Documentation updates and some refactoring of the code.

v1.12 (2016-11-28)
------------------

* Add the read modification option ``--length`` (short: ``-l``), which will shorten each read to the given length.
* Cutadapt will no longer complain that it has nothing to do when you do not give it any adapters. For example, you can use this to convert file formats: ``cutadapt -o output.fasta input.fastq.gz`` converts FASTQ to FASTA.
* The ``xopen`` module for opening compressed files was moved to a `separate package on PyPI `_.

v1.11 (2016-08-16)
------------------

* The ``--interleaved`` option no longer requires that both input and output are interleaved. It is now possible to have two-file input and interleaved output, and to have interleaved input and two-file output.
* Fix issue #202: First and second FASTQ header could get out of sync when options modifying the read name were used.

v1.10 (2016-05-19)
------------------

* Added a new “linked adapter” type, which can be used to search for a 5' and a 3' adapter at the same time (see the example below). Use ``-a ADAPTER1...ADAPTER2`` to search for a linked adapter. ADAPTER1 is interpreted as an anchored 5' adapter, which is searched for first. Only if ADAPTER1 is found will ADAPTER2 be searched for, which is a regular 3' adapter.
* Added the experimental ``--nextseq-trim`` option for quality trimming of NextSeq data. This is necessary because that machine cannot distinguish between G and reaching the end of the fragment (it encodes G as 'black').
* Even when trimming FASTQ files, output can now be FASTA (quality values are simply dropped). Use the ``-o``/``-p`` options with a file name that ends in ``.fasta`` or ``.fa`` to enable this.
* Cutadapt does not bundle pre-compiled C extension modules (``.so`` files) anymore. This affects only users that run cutadapt directly from an unpacked tarball. Install through ``pip`` or ``conda`` instead.
* Fix issue #167: Option ``--quiet`` was not entirely quiet.
* Fix issue #199: Be less strict when checking for properly-paired reads.
* This is the last version of cutadapt to support Python 2.6. Future versions will require at least Python 2.7.
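A sketch combining the two main v1.10 additions (the adapter sequences, quality cutoff and file names are placeholders)::

    cutadapt --nextseq-trim=20 -a ACGTACGT...TGCATGCA -o trimmed.fastq input.fastq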
v1.9.1 (2015-12-02)
-------------------

* Added the ``--pair-filter`` option, which :ref:`modifies how filtering criteria apply to paired-end reads `.
* Added the ``--too-short-paired-output`` and ``--too-long-paired-output`` options.
* Fix incorrect number of trimmed bases reported if the ``--times`` option was used.

v1.9 (2015-10-29)
-----------------

* Indels in the alignment can now be disabled for all adapter types (use ``--no-indels``).
* Quality values are now printed in the info file (``--info-file``) when trimming FASTQ files. Fixes issue #144.
* Options ``--prefix`` and ``--suffix``, which modify read names, now accept the placeholder ``{name}`` and will replace it with the name of the found adapter. Fixes issue #104.
* Interleaved FASTQ files: With the ``--interleaved`` switch, paired-end reads will be read from and written to interleaved FASTQ files. Fixes issue #113.
* Anchored 5' adapters can now be specified by writing ``-a SEQUENCE...`` (note the three dots).
* Fix ``--discard-untrimmed`` and ``--discard-trimmed`` not working as expected in paired-end mode (issue #146).
* The minimum overlap is now automatically reduced to the adapter length if it is too large. Fixes part of issue #153.
* Thanks to Wolfgang Gerlach, there is now a Dockerfile.
* The new ``--debug`` switch makes cutadapt print out the alignment matrix.

v1.8.3 (2015-07-29)
-------------------

* Fix issue #95: Untrimmed reads were not listed in the info file.
* Fix issue #138: ``pip install cutadapt`` did not work with new setuptools versions.
* Fix issue #137: Avoid a hang when writing to two or more gzip-compressed output files in Python 2.6.

v1.8.2 (2015-07-24)
-------------------

v1.8.1 (2015-04-09)
-------------------

* Fix #110: Counts for 'too short' and 'too long' reads were swapped in statistics.
* Fix #115: Make ``--trim-n`` work also on the second read for paired-end data.

v1.8 (2015-03-14)
-----------------

* Support single-pass paired-end trimming with the new ``-A``/``-G``/``-B``/``-U`` parameters. These work just like their ``-a``/``-g``/``-b``/``-u`` counterparts, but they specify sequences that are removed from the *second read* in a pair. Also, if you start using one of those options, the read modification options such as ``-q`` (quality trimming) are applied to *both* reads. For backwards compatibility, read modifications are applied to the first read only if neither of ``-A``/``-G``/``-B``/``-U`` is used. See `the documentation `_ for details. This feature has not been extensively tested, so please give feedback if something does not work.
* The report output has been re-worked in order to accommodate the new paired-end trimming mode. This also changes the way the report looks in single-end mode. It is hopefully now more accessible.
* Chris Mitchell contributed a patch adding two new options: ``--trim-n`` removes any ``N`` bases from the read ends, and the ``--max-n`` option can be used to filter out reads with too many ``N``.
* Support notation for repeated bases in the adapter sequence: Write ``A{10}`` instead of ``AAAAAAAAAA``. Useful for poly-A trimming: Use ``-a A{100}`` to get the longest possible tail.
* Quality trimming at the 5' end of reads is now supported. Use ``-q 15,10`` to trim the 5' end with a cutoff of 15 and the 3' end with a cutoff of 10.
* Fix incorrectly reported statistics (> 100% trimmed bases) when ``--times`` was set to a value greater than one.
* Support ``.xz``-compressed files (if running on Python 3.3 or later).
* Started to use the GitHub issue tracker instead of Google Code.
  All old issues have been moved.

v1.7 (2014-11-25)
-----------------

* IUPAC characters are now supported. For example, use ``-a YACGT`` for an adapter that matches both ``CACGT`` and ``TACGT`` with zero errors. Disable with ``-N``. By default, IUPAC characters in the read are not interpreted in order to avoid matches in reads that consist of many (low-quality) ``N`` bases. Use ``--match-read-wildcards`` to enable them also in the read.
* Support for demultiplexing was added. This means that reads can be written to different files depending on which adapter was found. See `the section in the documentation `_ for how to use it. This is currently only supported for single-end reads.
* Add support for anchored 3' adapters. Append ``$`` to the adapter sequence to force the adapter to appear at the end of the read (as a suffix). Closes issue #81.
* Option ``--cut`` (``-u``) can now be specified twice, once for each end of the read. Thanks to Rasmus Borup Hansen for the patch!
* Options ``--minimum-length``/``--maximum-length`` (``-m``/``-M``) can be used standalone. That is, cutadapt can be used to filter reads by length without trimming adapters.
* Fix bug: Adapters read from a FASTA file can now be anchored.

v1.6 (2014-10-07)
-----------------

* Fix bug: Ensure ``--format=...`` can be used even with paired-end input.
* Fix bug: Sometimes output files would be incomplete because they were not closed correctly.
* The alignment algorithm is a tiny bit faster.
* Extensive work on the documentation. It's now available at https://cutadapt.readthedocs.org/ .
* For 3' adapters, statistics about the bases preceding the trimmed adapter are collected and printed. If one of the bases is overrepresented, a warning is shown since this points to an incomplete adapter sequence. This happens, for example, when a TruSeq adapter is used but the A overhang is not taken into account when running cutadapt.
* Due to code cleanup, there is a change in behavior: If you use ``--discard-trimmed`` or ``--discard-untrimmed`` in combination with ``--too-short-output`` or ``--too-long-output``, then cutadapt now also writes the discarded reads to the output files given by the ``--too-short`` or ``--too-long`` options. If anyone complains, I will consider reverting this.
* Galaxy support files are now in `a separate repository `_.

v1.5 (2014-08-05)
-----------------

* Adapter sequences can now be read from a FASTA file. For example, write ``-a file:adapters.fasta`` to read 3' adapters from ``adapters.fasta``. This works also for ``-b`` and ``-g``.
* Add the option ``--mask-adapter``, which can be used to not remove adapters, but to instead mask them with ``N`` characters. Thanks to Vittorio Zamboni for contributing this feature!
* ``U`` characters in the adapter sequence are automatically converted to ``T``.
* Do not run Cython at installation time unless the ``--cython`` option is provided.
* Add the option ``-u``/``--cut``, which can be used to unconditionally remove a number of bases from the beginning or end of each read.
* Make ``--zero-cap`` the default for colorspace reads.
* When the new option ``--quiet`` is used, no report is printed after all reads have been processed.
* When processing paired-end reads, cutadapt now checks whether the reads are properly paired.
* To properly handle paired-end reads, an option ``--untrimmed-paired-output`` was added.

v1.4 (2014-03-13)
-----------------

* This release of cutadapt reduces the overhead of reading and writing files.
  On my test data set, a typical run of cutadapt (with a single adapter) takes 40% less time due to the following two changes.
* Reading and writing of FASTQ files is faster (thanks to Cython).
* Reading and writing of gzipped files is faster (up to 2x) on systems where the ``gzip`` program is available.
* The quality trimming function is four times faster (also due to Cython).
* Fix the statistics output for 3' colorspace adapters: The reported lengths were one too short. Thanks to Frank Wessely for reporting this.
* Support the ``--no-indels`` option. This disallows insertions and deletions while aligning the adapter. Currently, the option is only available for anchored 5' adapters. This fixes issue 69.
* As a side effect of implementing the ``--no-indels`` option: For colorspace, the length of a read (for ``--minimum-length`` and ``--maximum-length``) is now computed after primer base removal (when ``--trim-primer`` is specified).
* Added one column to the info file that contains the name of the found adapter.
* Add an explanation about colorspace ambiguity to the README.

v1.3 (2013-11-08)
-----------------

* Preliminary paired-end support with the ``--paired-output`` option (contributed by James Casbon). See the README section on how to use it.
* Improved statistics.
* Fix incorrectly reported amount of quality-trimmed Mbp (issue 57, fix by Chris Penkett).
* Add the ``--too-long-output`` option.
* Add the ``--no-trim`` option, contributed by Dave Lawrence.
* Port the handwritten C alignment module to Cython.
* Fix the ``--rest-file`` option (issue 56).
* Slightly speed up the alignment of 5' adapters.
* Support bzip2-compressed files.

v1.2 (2012-11-30)
-----------------

* At least 25% faster processing of .csfasta/.qual files due to a faster parser.
* Between 10% and 30% faster writing of gzip-compressed output files.
* Support 5' adapters in colorspace, even when no primer trimming is requested.
* Add the ``--info-file`` option, which has a line for each found adapter.
* Named adapters are possible. Usage: ``-a My_Adapter=ACCGTA`` assigns the name "My_Adapter".
* Improve the alignment algorithm for better poly-A trimming when there are sequencing errors. Previously, not the longest possible poly-A tail would be trimmed.
* James Casbon contributed the ``--discard-untrimmed`` option.

v1.1 (2012-06-18)
-----------------

* Allow to "anchor" 5' adapters (``-g``), forcing them to be a prefix of the read. To use this, add the special character ``^`` to the beginning of the adapter sequence (see the example below).
* Add the ``-N`` option, which allows ``N`` characters within adapters to match literally.
* Speedup of approx. 25% when reading from ``.gz`` files and using Python 2.7.
* Allow to only trim qualities when no adapter is given on the command line.
* Add a patch by James Casbon: include read names (ids) in the rest file.
* Use nosetest for testing. To run, install nose and run ``nosetests``.
* When using cutadapt without installing it, you now need to run ``bin/cutadapt`` due to a new directory layout.
* Allow to give a colorspace adapter in basespace (gets automatically converted).
* Allow to search for 5' adapters (those specified with ``-g``) in colorspace.
* Speed up the alignment by a factor of at least 3 by using Ukkonen's algorithm. The total runtime decreases by about 30% in the tested cases.
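An example of the v1.1 anchoring syntax in current notation (the adapter sequence and file names are placeholders)::

    cutadapt -g ^ACGTACGT -o trimmed.fastq input.fastq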
* Allow to deal with colorspace FASTQ files from the SRA that contain a fake additional quality in the beginning (use ``--format sra-fastq``).

v1.0 (2011-11-04)
-----------------

* ASCII-encoded quality values were assumed to be encoded as ascii(quality+33). With the new parameter ``--quality-base``, this can be changed to ascii(quality+64), as used in some versions of the Illumina pipeline. (Fixes issue 7.)
* Allow to specify that adapters were ligated to the 5' end of reads. This change is based on a patch contributed by James Casbon.
* Due to cutadapt being published in EMBnet.journal, I found it appropriate to call this release version 1.0. Please see http://journal.embnet.org/index.php/embnetjournal/article/view/200 for the article and I would be glad if you cite it.
* Add Galaxy support, contributed by Lance Parsons.
* Patch by James Casbon: Allow ``N`` wildcards in the read or adapter or both. Wildcard matching of ``N``'s in the adapter is always done. If ``N``'s within reads should also match without counting as errors, this needs to be explicitly requested via ``--match-read-wildcards``.

v0.9.5 (2011-07-20)
-------------------

* Fix issue 20: Make the report go to standard output when ``-o``/``--output`` is specified.
* Recognize ``.fq`` as an extension for FASTQ files.
* Many more unit tests.
* The alignment algorithm has changed. It will now find some adapters that previously were missed. Note that this will produce different output than older cutadapt versions!

  Before this change, finding an adapter would work as follows:

  - Find an alignment between adapter and read -- longer alignments are better.
  - If the number of errors in the alignment (divided by length) is above the maximum error rate, report the adapter as not being found.

  Sometimes, the long alignment that was found had too many errors, but a shorter alignment would not. The adapter was then incorrectly seen as "not found". The new alignment algorithm checks the error rate while aligning and only reports alignments that do not have too many errors.

v0.9.4 (2011-05-20)
-------------------

* Now compatible with Python 3.
* Add the ``--zero-cap`` option, which changes negative quality values to zero. This is a workaround to avoid segmentation faults in BWA. The option is now enabled by default when ``--bwa``/``--maq`` is used.
* Lots of unit tests added. Run them with ``cd tests && ./tests.sh``.
* Fix issue 16: ``--discard-trimmed`` did not work.
* Allow to override auto-detection of the input file format with the new ``-f``/``--format`` parameter. This mostly fixes issue 12.
* Don't break when the input file is empty.

v0.9.2 (2011-03-16)
-------------------

* Install a single ``cutadapt`` Python package instead of multiple Python modules. This avoids cluttering the global namespace and should lead to fewer problems with other Python modules. Thanks to Steve Lianoglou for pointing this out to me!
* Ignore case (ACGT vs acgt) when comparing the adapter with the read sequence.
* .FASTA/.QUAL files (not necessarily colorspace) can now be read (some 454 software uses this format).
* Move some functions into their own modules.
* Lots of refactoring: replace the fasta module with a much nicer seqio module.
* Allow FASTA/FASTQ input on standard input (FASTA/FASTQ is also autodetected).

v0.9 (2011-01-10)
-----------------

* Add ``--too-short-output`` and ``--untrimmed-output``, based on a patch by Paul Ryvkin (thanks!). See the example below for how these combine with a length filter.
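  A sketch of such an invocation, in current notation (the adapter, length threshold and file names are placeholders)::

      cutadapt -a ACGTACGT -m 20 --too-short-output=too_short.fastq --untrimmed-output=untrimmed.fastq -o trimmed.fastq input.fastq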
* Add the ``--maximum-length`` parameter: discard reads longer than a specified length.
* Group options by category in the ``--help`` output.
* Add the ``--length-tag`` option. Allows fixing the read length in FASTA/Q comment lines (e.g., ``length=123`` becomes ``length=58`` after trimming). (Requested by Paul Ryvkin.)
* Add the ``-q``/``--quality-cutoff`` option for trimming low-quality ends (uses the same algorithm as BWA).
* Some refactoring.
* The filename ``-`` is now interpreted as standard input or standard output.

v0.8 (2010-12-08)
-----------------

* Change the default behavior of searching for an adapter: The adapter is now assumed to be an adapter that has been ligated to the 3' end. This should be the correct behavior for at least the SOLiD small RNA protocol (SREK) and also for the Illumina protocol. To get the old behavior, which uses a heuristic to determine whether the adapter was ligated to the 5' or 3' end and then trimmed the read accordingly, use the new ``-b`` (``--anywhere``) option.
* Clear up how the statistics after processing all reads are printed.
* Fix incorrect statistics. Adapters starting at pos. 0 were correctly trimmed, but not counted.
* Modify the scoring scheme: improves trimming (some reads that should have been trimmed were not). Increases the number of trimmed reads in one of our SOLiD data sets from 36.5% to 37.6%.
* Speed improvements (20% less runtime on my test data set).

v0.7 (2010-12-03)
-----------------

* Useful exit codes.
* Better error reporting when malformed files are encountered.
* Add the ``--minimum-length`` parameter for discarding reads that are shorter than a specified length after trimming.
* Generalize the alignment function a bit. This is preparation for supporting adapters that are specific to either the 5' or 3' end.
* Pure Python fallback for the alignment function for when the C module cannot be used.

v0.6 (2010-11-18)
-----------------

* Support gzipped input and output.
* Print timing information in statistics.

v0.5 (2010-11-17)
-----------------

* Add the ``--discard`` option, which makes cutadapt discard reads in which an adapter occurs.

v0.4 (2010-11-17)
-----------------

* (More) correctly deal with multiple adapters: If a long adapter matches with lots of errors, then this could lead to a shorter adapter matching with few errors getting ignored.

v0.3 (2010-09-27)
-----------------

* Fix huge memory usage (the entire input file was unintentionally read into memory).

v0.2 (2010-09-14)
-----------------

* Allow FASTQ input.

v0.1 (2010-09-14)
-----------------

* Initial release.

==> cutadapt-4.7/CITATION.cff <==
cff-version: 1.2.0
title: Cutadapt
message: "If you use this software, please cite DOI 10.14806/ej.17.1.200 as below"
authors:
  - given-names: Marcel
    family-names: Martin
    orcid: 'https://orcid.org/0000-0002-0680-200X'
url: 'https://cutadapt.readthedocs.io/'
repository-code: 'https://github.com/marcelm/cutadapt/'
license: MIT
preferred-citation:
  type: article
  authors:
    - given-names: Marcel
      family-names: Martin
      orcid: 'https://orcid.org/0000-0002-0680-200X'
  doi: 10.14806/ej.17.1.200
  journal: EMBnet.journal
  month: 5
  year: 2011
  start: 10
  end: 12
  title: 'Cutadapt removes adapter sequences from high-throughput sequencing reads'
  volume: 17
  issue: 1

==> cutadapt-4.7/CONTRIBUTING.rst <==
Contributing
------------

Contributions to Cutadapt in the form of source code or documentation improvements, as well as help with responding to issues, are welcome!
To contribute to Cutadapt development, it is easiest to send in a pull request (PR) on GitHub.

Here are some guidelines for how to do this. They are not strict rules. When in doubt, send in a PR and we will sort it out.

* Limit a PR to a single topic. Submit multiple PRs if necessary. This way, it is easier to discuss the changes individually, and in case we find that one of them should not go in, the others can still be accepted.
* For larger changes, consider opening an issue first to plan what you want to do.
* Include appropriate unit or integration tests. Sometimes, tests are hard to write or don’t make sense. If you think this is the case, just leave the tests out initially and we can discuss whether to add any.
* Add documentation and a changelog entry if appropriate.

Code style
~~~~~~~~~~

* The source code needs to be formatted with `black `_. If you install `pre-commit `_, the formatting will be done for you.
* There are inconsistencies in the current code base since it’s a few years old already. New code should follow the current rules, however.
* Using an IDE is beneficial (PyCharm, for example). It helps to catch lots of style issues early (unused imports, spacing etc.).
* Use `Google-style docstrings `_ (this is not PyCharm’s default setting).
* Avoid unnecessary abbreviations for variable names. Code is more often read than written.
* When writing a help text for a new command-line option, look at the output of ``cutadapt --help`` and try to make it look nice and short.
* In comments and documentation, capitalize FASTQ, BWA, CPU etc.

==> cutadapt-4.7/LICENSE <==
Copyright (c) 2010 Marcel Martin and contributors

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

==> cutadapt-4.7/MANIFEST.in <==
include CHANGES.rst
include CITATION
include LICENSE
include README.rst
include pyproject.toml
include doc/*.rst
include doc/conf.py
include doc/Makefile
include src/cutadapt/*.c
include src/cutadapt/*.pyx
include tests/*.py
graft tests/data
graft tests/cut

==> cutadapt-4.7/README.rst <==
.. image:: https://github.com/marcelm/cutadapt/workflows/CI/badge.svg
   :alt:

.. image:: https://img.shields.io/pypi/v/cutadapt.svg
   :target: https://pypi.python.org/pypi/cutadapt
   :alt:

.. image:: https://codecov.io/gh/marcelm/cutadapt/branch/main/graph/badge.svg
   :target: https://codecov.io/gh/marcelm/cutadapt
   :alt:
.. image:: https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat
   :target: http://bioconda.github.io/recipes/cutadapt/README.html
   :alt: install with bioconda

========
Cutadapt
========

Cutadapt finds and removes adapter sequences, primers, poly-A tails and other types of unwanted sequence from your high-throughput sequencing reads.

Cleaning your data in this way is often required: Reads from small-RNA sequencing contain the 3’ sequencing adapter because the read is longer than the molecule that is sequenced. Amplicon reads start with a primer sequence. Poly-A tails are useful for pulling out RNA from your sample, but often you don’t want them to be in your reads.

Cutadapt helps with these trimming tasks by finding the adapter or primer sequences in an error-tolerant way. It can also modify and filter single-end and paired-end reads in various ways. Adapter sequences can contain IUPAC wildcard characters. Cutadapt can also demultiplex your reads.

Cutadapt is available under the terms of the MIT license.

Cutadapt development was started at `TU Dortmund University `_ in the group of `Prof. Dr. Sven Rahmann `_. It is currently being developed within `NBIS (National Bioinformatics Infrastructure Sweden) `_.

If you use Cutadapt, please cite `DOI:10.14806/ej.17.1.200 `_.

Links
-----

* `Documentation `_
* `Source code `_
* `Report an issue `_
* `Project page on PyPI (Python package index) `_
* `Wrapper for the Galaxy platform `_

==> cutadapt-4.7/doc/Makefile <==
# Makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS    = -W
SPHINXBUILD   = sphinx-build
PAPER         =
BUILDDIR      = _build

# User-friendly check for sphinx-build
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
endif

# Internal variables.
PAPEROPT_a4     = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext all: html help: @echo "Please use \`make ' where is one of" @echo " html to make standalone HTML files" @echo " dirhtml to make HTML files named index.html in directories" @echo " singlehtml to make a single large HTML file" @echo " pickle to make pickle files" @echo " json to make JSON files" @echo " htmlhelp to make HTML files and a HTML help project" @echo " qthelp to make HTML files and a qthelp project" @echo " devhelp to make HTML files and a Devhelp project" @echo " epub to make an epub" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @echo " latexpdf to make LaTeX files and run them through pdflatex" @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" @echo " text to make text files" @echo " man to make manual pages" @echo " texinfo to make Texinfo files" @echo " info to make Texinfo files and run them through makeinfo" @echo " gettext to make PO message catalogs" @echo " changes to make an overview of all changed/added/deprecated items" @echo " xml to make Docutils-native XML files" @echo " pseudoxml to make pseudoxml-XML files for display purposes" @echo " linkcheck to check all external links for integrity" @echo " doctest to run all doctests embedded in the documentation (if enabled)" clean: rm -rf $(BUILDDIR)/* html: $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." dirhtml: $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." singlehtml: $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml @echo @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." pickle: $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle @echo @echo "Build finished; now you can process the pickle files." json: $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json @echo @echo "Build finished; now you can process the JSON files." htmlhelp: $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp @echo @echo "Build finished; now you can run HTML Help Workshop with the" \ ".hhp project file in $(BUILDDIR)/htmlhelp." qthelp: $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp @echo @echo "Build finished; now you can run "qcollectiongenerator" with the" \ ".qhcp project file in $(BUILDDIR)/qthelp, like this:" @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/cutadapt.qhcp" @echo "To view the help file:" @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/cutadapt.qhc" devhelp: $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp @echo @echo "Build finished." @echo "To view the help file:" @echo "# mkdir -p $$HOME/.local/share/devhelp/cutadapt" @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/cutadapt" @echo "# devhelp" epub: $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub @echo @echo "Build finished. The epub file is in $(BUILDDIR)/epub." latex: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." @echo "Run \`make' in that directory to run these through (pdf)latex" \ "(use \`make latexpdf' here to do that automatically)." latexpdf: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through pdflatex..." 
	$(MAKE) -C $(BUILDDIR)/latex all-pdf
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

latexpdfja:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through platex and dvipdfmx..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

text:
	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
	@echo
	@echo "Build finished. The text files are in $(BUILDDIR)/text."

man:
	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
	@echo
	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."

texinfo:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo
	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
	@echo "Run \`make' in that directory to run these through makeinfo" \
	      "(use \`make info' here to do that automatically)."

info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	make -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."

gettext:
	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
	@echo
	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."

changes:
	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
	@echo
	@echo "The overview file is in $(BUILDDIR)/changes."

linkcheck:
	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
	@echo
	@echo "Link check complete; look for any errors in the above output " \
	      "or in $(BUILDDIR)/linkcheck/output.txt."

doctest:
	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
	@echo "Testing of doctests in the sources finished, look at the " \
	      "results in $(BUILDDIR)/doctest/output.txt."

xml:
	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
	@echo
	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."

pseudoxml:
	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
	@echo
	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
cutadapt-4.7/doc/_static/000077500000000000000000000000001457457704700154105ustar00rootroot00000000000000cutadapt-4.7/doc/_static/adapters.svg000066400000000000000000000433761457457704700177450ustar00rootroot00000000000000 image/svg+xml Removed sequence Adapter Read 5' Adapter 3' Adapter Anchored 5' adapter or or
cutadapt-4.7/doc/_static/cutadapt-logo.inkscape.svg000066400000000000000000000072761457457704700225020ustar00rootroot00000000000000 image/svg+xml
cutadapt-4.7/doc/_static/cutadapt-logo.png000066400000000000000000000136331457457704700206670ustar00rootroot00000000000000[binary PNG image data (Cutadapt logo), omitted]
cutadapt-4.7/doc/_static/cutadapt-logo.svg000066400000000000000000000027441457457704700207010ustar00rootroot00000000000000 image/svg+xml
cutadapt-4.7/doc/algorithms.rst000066400000000000000000000252711457457704700166720ustar00rootroot00000000000000=================
Algorithm details
=================

.. _adapter-alignment-algorithm:

Adapter alignment algorithm
===========================

Since the publication of the `EMBnet journal application note about
Cutadapt `_, the alignment algorithm used for finding adapters has changed
significantly. An overview of this new algorithm is given in this section.
An even more detailed description is available in Chapter 2 of my PhD thesis
`Algorithms and tools for the analysis of high-throughput DNA sequencing
data `_.

The algorithm is based on *semiglobal alignment*, also called *free-shift*,
*ends-free* or *overlap* alignment. In a regular (global) alignment, the two
sequences are compared from end to end and all differences occurring over
that length are counted. In semiglobal alignment, the sequences are allowed
to freely shift relative to each other and differences are only penalized in
the overlapping region between them::

       FANTASTIC
    ELEFANT

The prefix ``ELE`` and the suffix ``ASTIC`` do not have a counterpart in the
respective other row, but this is not counted as an error. The overlap
``FANT`` has a length of four characters.

Traditionally, *alignment scores* are used to find an optimal overlap
alignment: This means that the scoring function assigns a positive value to
matches, while mismatches, insertions and deletions get negative values. The
optimal alignment is then the one that has the maximal total score.
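To make this concrete, here is a small illustrative Python implementation of
score-based semiglobal alignment. It is a sketch for this documentation only
and not Cutadapt’s actual implementation (which is a compiled extension that
differs in many details); the function name ``semiglobal_score`` is made up::

    def semiglobal_score(s, t, match=1, mismatch=-1, gap=-2):
        """Best ends-free (overlap) alignment score of s and t."""
        m, n = len(s), len(t)
        # score[i][j] is the best score of an alignment of s[:i] and t[:j].
        # Row 0 and column 0 stay zero, so unaligned prefixes are free.
        score = [[0] * (n + 1) for _ in range(m + 1)]
        for i in range(1, m + 1):
            for j in range(1, n + 1):
                diag = score[i - 1][j - 1] + (
                    match if s[i - 1] == t[j - 1] else mismatch
                )
                score[i][j] = max(
                    diag, score[i - 1][j] + gap, score[i][j - 1] + gap
                )
        # Taking the maximum over the last row and last column makes
        # unaligned suffixes free as well.
        return max(max(score[m]), max(row[n] for row in score))

    semiglobal_score("FANTASTIC", "ELEFANT")  # 4: the overlap FANT

The zero-initialized first row and column, together with taking the maximum
over the last row and column, are exactly what make the overhangs free.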
Usage of scores has the disadvantage that they are not at all intuitive: What
does a total score of *x* mean? Is that good or bad? How should a threshold
be chosen in order to avoid finding alignments with too many errors?

For Cutadapt, the adapter alignment algorithm primarily uses *unit costs*
instead. This means that mismatches, insertions and deletions are counted as
one error, which is easier to understand and makes it possible to specify a
single parameter for the algorithm (the maximum error rate) in order to
describe how many errors are acceptable.

There is a problem with this: When using costs instead of scores, we would
like to minimize the total costs in order to find an optimal alignment. But
then the best alignment would always be the one in which the two sequences do
not overlap at all! This would be correct, but meaningless for the purpose of
finding an adapter sequence.

The optimization criteria are therefore a bit different. The basic idea is to
consider as optimal the alignment that maximizes the overlap between the two
sequences, as long as the allowed error rate is not exceeded.

Conceptually, the procedure is as follows:

1. Consider all possible overlaps between the two sequences and compute an
   alignment for each, minimizing the total number of errors in each one.
2. Keep only those alignments that do not exceed the specified maximum error
   rate.
3. Then, keep only those alignments that have a maximal number of matches
   (that is, there is no alignment with more matches). (Note: This has been
   changed, see the section below for an update.)
4. If there are multiple alignments with the same number of matches, then
   keep only those that have the smallest error rate.
5. If there are still multiple candidates left, choose the alignment that
   starts at the leftmost position within the read.

In Step 1, the different adapter types are taken into account: Only those
overlaps that are allowed by the adapter type are considered.

.. _algorithm-indel-scores:

Alignment algorithm changes in Cutadapt 4
=========================================

The above algorithm has been tweaked slightly in Cutadapt 4. The main problem
was that the idea of maximizing the number of matches (criterion 3 in the
section above) sometimes leads to unintuitive results.

For example, the previous algorithm would prefer an alignment such as this
one::

    CCAGTCCTTTCCTGAGAGT  Read
    ||||||||   ||
    CCAGTCCT---CT        5' adapter

This alignment was considered to be the best one because it contains 10
matches, which is the maximum possible. The three consecutive deletions are
ignored when making that decision. To the user, the unexpected result is
visible because the read would end up as ``GAGAGT`` after trimming.

With the tuned algorithm, the alignment is more sensible::

    CCAGTCCTTTCCTGAGAGT  Read
    ||||||||X|
    CCAGTCCTCT           5' adapter

The trimmed read is now ``CCTGAGAGT``, which is what one would likely expect.

The alignment algorithm in Cutadapt can perhaps now be described as a
*hybrid* algorithm that uses both edit distance and score:

- Edit distance is used to fill out the dynamic programming matrix.
  Conceptually, this can be seen as computing the edit distance for all
  possible overlaps between the read and the adapter. We need to use the edit
  distance as optimization criterion at this stage because we want to be able
  to let the user provide a maximum error rate (``-e``). Also, using edit
  distance (that is, unit costs) allows using some optimizations while
  filling in the matrix (Ukkonen’s trick).
- A second matrix with scores is filled in simultaneously. The value in a
  cell is the score of the corresponding edit-distance-based alignment; the
  score is not used as optimization criterion at this stage.
- Finally, the score is used to decide which of the overlaps between read and
  adapter is the best one. (This means looking into the last row and column
  of the score matrix.)

The score function is currently: match: +1, mismatch: -1, indel: -2

A second change in the alignment algorithm is relevant if there are multiple
adapter occurrences in a read (such as adapter dimers). With the new
algorithm, leftmost (earlier) adapter occurrences are now more reliably
preferred even if a later match has fewer errors.

Here are two examples from the SRR452441 dataset (R1 only), trimmed with the
standard Illumina adapter. The top row shows the alignment as found by the
previous algorithm, the middle row shows the sequencing read, and the last
row shows the alignment as found by the updated algorithm.

::

    @SRR452441.2151945
                                             AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC  Previous alignment
                                             ||||||||||||||||||||||||||||||||||
    -GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCACACGAGATCGGAAGAGCACACGTCTGAACTCCAGTCACGCACACGAATCTCGTATGCCGTCTTCT
    X|||||||||||||||||||||||||||||||||
    AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC  New alignment

Previously the read was trimmed to the first 40 bases, now the earlier,
nearly full-length occurrence is taken into account, and the read is empty
after trimming.

::

    @SRR452441.2157038
                                    AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC  Previous alignment
                                    ||||||||||||||||||||||||||||||||||
    -GATCGGAAGAGCACACGTCTGAACTCCAGTCAGATCGGAAGAGCACACGTCTGAACTCCAGTCACGCACACGAATCTCGTATGCCGTCTTCTGCTTGAAAA
    X||||||||||||||||||||||||||||||||X
    AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC  New alignment

Only very few reads should be affected by the above changes (in SRR452441,
which has 2.2 million reads, only four reads were trimmed differently). In
those cases where it matters, however, there should now be fewer surprises.

.. _quality-trimming-algorithm:

Quality trimming algorithm
==========================

The trimming algorithm implemented in Cutadapt is the same as the one used by
BWA, but applied to both ends of the read in turn (if requested). That is:
Subtract the given cutoff from all qualities; compute partial sums from all
indices to the end of the sequence; cut the sequence at the index at which
the sum is minimal. If both ends are to be trimmed, repeat this for the other
end.

The basic idea is to remove all bases starting from the end of the read whose
quality is smaller than the given threshold. This is refined a bit by
allowing some good-quality bases among the bad-quality ones. In the following
example, we assume that the 3' end is to be quality-trimmed.

Assume you use a threshold of 10 and have these quality values:

42, 40, 26, 27, 8, 7, 11, 4, 2, 3

Subtracting the threshold gives:

32, 30, 16, 17, -2, -3, 1, -6, -8, -7

Then sum up the numbers, starting from the end (partial sums). Stop early if
the sum is greater than zero:

(70), (38), 8, -8, -25, -23, -20, -21, -15, -7

The numbers in parentheses are not computed (because 8 is greater than zero),
but shown here for completeness. The position of the minimum (-25) is used as
the trimming position. Therefore, the read is trimmed to the first four
bases, which have quality values 42, 40, 26, 27.
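Expressed in Python, the procedure just described looks like the following
sketch. This is an illustration for this documentation only, not Cutadapt’s
actual implementation (which is a compiled extension); the function name
``quality_trim_index`` is made up::

    def quality_trim_index(qualities, cutoff):
        """Return the index at which to cut the 3' end of a read.

        Subtract the cutoff from each quality value, sum up the
        differences starting from the end, stop early once the running
        sum becomes positive, and cut at the position of the minimum.
        """
        running_sum = 0
        minimum = 0
        cut_index = len(qualities)
        for i in reversed(range(len(qualities))):
            running_sum += qualities[i] - cutoff
            if running_sum > 0:  # stop early
                break
            if running_sum < minimum:
                minimum = running_sum
                cut_index = i
        return cut_index

    quality_trim_index([42, 40, 26, 27, 8, 7, 11, 4, 2, 3], 10)  # -> 4

For the example above, the function returns 4, so the read is cut after the
first four bases, reproducing the result derived by hand.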
.. _poly-a-algorithm:

Poly-A trimming algorithm
=========================

The aim of the ``--poly-A`` trimming algorithm is to find a suffix of the
read that contains a high number of ``A`` nucleotides. Conceptually, we
consider all possible suffixes of the read. For each suffix, we count how
many ``A`` and non-``A`` nucleotides it contains. We then exclude all
suffixes from consideration that have more than 20% non-``A`` because we
assume these are not actually poly-A tails. For each remaining suffix, we
compute a score: Non-``A`` nucleotides get -2 and ``A`` nucleotides get +1,
which we add up to get the score for the suffix. Finally, we choose the
suffix that maximizes that score and remove it from the read. Shorter
suffixes win if there is a tie.

An implementation in Python (input string is ``s``)::

    n = len(s)
    best_index = n
    best_score = score = errors = 0
    for i, nuc in reversed(list(enumerate(s))):
        if nuc == "A":
            score += 1
        else:
            score -= 2
            errors += 1
        if score > best_score and errors <= 0.2 * (n - i):
            best_index = i
            best_score = score
    s = s[:best_index]

.. _expected-errors:

Expected errors
===============

The ``--max-expected-errors`` (short version: ``--max-ee``) option discards a
read if its number of expected errors exceeds the specified threshold. This
emulates a filtering option originally implemented in `USEARCH `_. The number
of expected errors is computed from the quality scores as described in the
USEARCH paper by `Edgar et al. (2015) `_ (Section 2.2). That is, it is the
sum of the error probabilities. The USEARCH manual page `has a lot more
background on expected errors `_ and how to choose a threshold.

The ``--max-average-error-rate`` (short version: ``--max-aer``) option works
similarly but divides the expected errors by the length of the read. The
resulting fraction is then used to filter the read. This is especially
helpful for reads with highly varying read lengths, such as those coming from
nanopore sequencing.
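As an illustration of this definition, expected errors can be computed
directly from the Phred quality scores. This is a sketch for this
documentation; ``expected_errors`` is an invented name and not part of
Cutadapt's API::

    def expected_errors(qualities):
        """Sum of error probabilities for a list of Phred quality scores."""
        return sum(10 ** (-q / 10) for q in qualities)

    expected_errors([30] * 100)        # 0.1 - passes --max-ee 1
    expected_errors([10] * 100)        # 10.0 - discarded with --max-ee 1
    expected_errors([10] * 100) / 100  # 0.1 - the fraction --max-aer compares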
cutadapt-4.7/doc/changes.rst000066400000000000000000000000521457457704700161210ustar00rootroot00000000000000.. _changes:

.. include:: ../CHANGES.rst
cutadapt-4.7/doc/conf.py000066400000000000000000000022261457457704700152630ustar00rootroot00000000000000import os

from setuptools_scm import get_version

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
# sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, "src")))

extensions = [
    "sphinx.ext.autodoc",
    "sphinx_issues",
    "sphinx_better_subsection",
]

source_suffix = ".rst"
master_doc = "index"
project = "Cutadapt"
copyright = "2010 Marcel Martin"

version = get_version(root="..", relative_to=__file__)

# Read The Docs modifies the conf.py script and we therefore get
# version numbers like 0.12+0.g27d0d31
if os.environ.get("READTHEDOCS") == "True":
    version = ".".join(version.split(".")[:2])

html_theme = "sphinx_rtd_theme"

# The full version, including alpha/beta/rc tags.
release = version

issues_uri = "https://github.com/marcelm/cutadapt/issues/{issue}"
issues_pr_uri = "https://github.com/marcelm/cutadapt/pull/{pr}"

suppress_warnings = ["image.nonlocal_uri"]
exclude_patterns = ["_build"]
pygments_style = "sphinx"
html_static_path = ["_static"]
cutadapt-4.7/doc/develop.rst000066400000000000000000000051101457457704700161470ustar00rootroot00000000000000Developing
==========

The `Cutadapt source code is on GitHub `_.
Cutadapt is written in Python 3 with some extension modules that are written
in Cython.

Development installation
------------------------

For development, make sure that you install Cython and tox. We also recommend
using a virtualenv. This sequence of commands should work::

    git clone https://github.com/marcelm/cutadapt.git  # or clone your own fork
    cd cutadapt
    virtualenv .venv
    source .venv/bin/activate
    pip install Cython pytest tox pre-commit
    pre-commit install
    pip install -e .

Then you should be able to run Cutadapt::

    cutadapt --version

Remember that you do not need to activate a virtualenv to run binaries in it,
so this works even when the environment is not activated::

    .venv/bin/cutadapt --version

The tests can then be run like this::

    pytest

Or with tox (but then you will need to have binaries for all tested Python
versions installed)::

    tox

Making a release
----------------

A new release is automatically deployed to PyPI whenever a new tag is pushed
to the Git repository.

Cutadapt uses `setuptools_scm `_ to automatically manage version numbers.
This means that the version is not stored in the source code but derived from
the most recent Git tag. The following procedure can be used to bump the
version and make a new release.

#. Update ``CHANGES.rst`` (version number and list of changes)

#. Ensure you have no uncommitted changes in the working copy.

#. Run a ``git pull``.

#. Run ``tox``, ensuring all tests pass.

#. Tag the current commit with the version number (there must be a ``v``
   prefix)::

       git tag v0.1

   To release a development version, use a ``dev`` version number such as
   ``v1.17.dev1``. Users will not automatically get these unless they use
   ``pip install --pre``.

#. Push the tag::

       git push --tags

#. Wait for the GitHub Action to finish and to deploy to PyPI.

#. The `bioconda recipe `_ also needs to be updated, but the bioconda bot
   will likely do this automatically if you just wait a little while. Ensure
   that the list of dependencies (the ``requirements:`` section in the
   recipe) is in sync with the ``setup.cfg`` file.

If something went wrong *after* a version has already been tagged and
published to PyPI, fix the problem and tag a new version. Do not change a
version that has already been uploaded.

.. include:: ../CONTRIBUTING.rst

.. include:: ideas.rst
cutadapt-4.7/doc/guide.rst000066400000000000000000003035631457457704700156210ustar00rootroot00000000000000==========
User guide
==========

Basic usage
===========

To trim a 3' adapter, the basic command line for Cutadapt is::

    cutadapt -a AACCGGTT -o output.fastq input.fastq

The sequence of the adapter is given with the ``-a`` option. You need to
replace ``AACCGGTT`` with the correct adapter sequence. Reads are read from
the input file ``input.fastq`` and are written to the output file
``output.fastq``.

Compressed in- and output files are also supported::

    cutadapt -a AACCGGTT -o output.fastq.gz input.fastq.gz

Cutadapt searches for the adapter in all reads and removes it when it finds
it. Unless you use a filtering option, all reads that were present in the
input file will also be present in the output file, some of them trimmed,
some of them not. Even reads that were trimmed to a length of zero are
output. All of this can be changed with command-line options, explained
further down.

:ref:`Trimming of paired-end data ` is also supported.

Input and output file formats
-----------------------------

Supported input formats are FASTA, FASTQ and unaligned BAM (uBAM, only for
single-end data at the moment). Supported output formats are FASTA and
FASTQ. Compression :ref:`is supported in multiple formats and detected
automatically `.

The input file format is recognized from the file name extension.
If the extension is not recognized or when Cutadapt reads from standard
input, the contents are inspected instead.

The output file format is also recognized from the file name extension. If
the extension is not recognized or when Cutadapt writes to standard output,
the same format as the input is used for the output.

When writing a FASTQ file, a second header (the text after the ``+`` on the
third line of a record) that possibly exists in the input is removed. When
writing a FASTA file, line breaks within the sequence are removed.

See also :ref:`file format conversion `.

.. _compressed-files:

Compressed files
----------------

Cutadapt supports compressed input and output files. Whether an input file
needs to be decompressed or an output file needs to be compressed is detected
automatically by inspecting the file name: For example, if it ends in
``.gz``, then gzip compression is assumed ::

    cutadapt -a AACCGGTT -o output.fastq.gz input.fastq.gz

All of Cutadapt's options that expect a file name support this.

The supported compression formats are gzip (``.gz``), bzip2 (``.bz2``),
xz (``.xz``) and zstandard (``.zst``).

The default compression level for gzip output is 4. Use option ``-Z`` to
change this to level 1. The files need more space, but it is faster and
therefore a good choice for short-lived intermediate files.

Cutadapt uses the `xopen library `_ to speed up reading and writing
compressed files.

.. _standard-input-output:

Standard input and output
-------------------------

If no output file is specified via the ``-o`` option, then the output is
sent to the standard output stream. Example::

    cutadapt -a AACCGGTT input.fastq > output.fastq

There is one difference in behavior if you use Cutadapt without ``-o``: The
report is sent to the standard error stream instead of standard output. You
can redirect it to a file like this::

    cutadapt -a AACCGGTT input.fastq > output.fastq 2> report.txt

Wherever Cutadapt expects a file name, you can also write a dash (``-``) in
order to specify that standard input or output should be used. For example::

    tail -n 4 input.fastq | cutadapt -a AACCGGTT - > output.fastq

The ``tail -n 4`` prints out only the last four lines of ``input.fastq``,
which are then piped into Cutadapt. Thus, Cutadapt will work only on the last
read in the input file.

In most cases, you should probably use ``-`` at most once for an input file
and at most once for an output file, in order not to get mixed output.

For the same reason, you should not use ``-`` for non-interleaved paired-end
data.

You cannot combine ``-`` and gzip compression since Cutadapt needs to know
the file name of the output or input file. If you want to have a
gzip-compressed output file, use ``-o`` with an explicit name.

One last "trick" is to use ``/dev/null`` as an output file name. This special
file discards everything you send into it. If you only want to see the
statistics output, for example, and do not care about the trimmed reads at
all, you could use something like this::

    cutadapt -a AACCGGTT -o /dev/null input.fastq

.. _multicore:

Multi-core support
------------------

Cutadapt supports parallel processing, that is, it can use multiple CPU
cores. Multi-core is not enabled by default. To enable it, use the option
``-j N`` (or the spelled-out version ``--cores=N``), where ``N`` is the
number of cores to use.

To automatically detect the number of available cores, use ``-j 0`` (or
``--cores=0``). The detection takes into account resource restrictions that
may be in place.
For example, if running Cutadapt as a batch job on a cluster system, the
actual number of cores assigned to the job will be used. (This works if the
cluster system uses the cpuset(1) mechanism to impose the resource
limitation.)

Also make sure that you have ``pigz`` (parallel gzip) installed if you use
multiple cores and write to a ``.gz`` output file. Otherwise, compression of
the output will be done in a single thread and therefore be a bottleneck.

.. versionadded:: 1.15

.. versionadded:: 1.18
    ``--cores=0`` for autodetection

.. versionadded:: 2.5
    Multicore works with ``--untrimmed/too-short/too-long-(paired)-output``

.. versionadded:: 2.7
    Multicore works with ``--info-file``, ``--rest-file``,
    ``--wildcard-file``

.. versionadded:: 3.0
    Multicore support for demultiplexing added.

.. _speedup:

Speed-up tricks
---------------

There are several tricks for limiting wall-clock time while using Cutadapt.

Option ``-Z`` (equivalent to ``--compression-level=1``) can be used to limit
the amount of CPU time that is spent on the compression of output files.
Alternatively, choosing filenames not ending with ``.gz``, ``.bz2`` or
``.xz`` will make sure no CPU time is spent on compression at all. On systems
with slow I/O, it can actually be faster to set a compression level higher
than 1.

Increasing the number of cores with ``-j`` will increase the number of reads
per minute at a near-linear rate.

It is also possible to use pipes in order to bypass the filesystem and pipe
Cutadapt's output into an aligner such as BWA. The ``mkfifo`` command allows
you to create named pipes in bash.

.. code-block:: bash

    mkfifo R1.fastq R2.fastq
    cutadapt -a ${ADAPTER_R1} -A ${ADAPTER_R2} -o R1.fastq -p R2.fastq ${READ1} ${READ2} > cutadapt.report & \
    bwa mem ${INDEX} R1.fastq R2.fastq

This command will run Cutadapt and BWA simultaneously, using Cutadapt’s
output as BWA’s input, and capturing Cutadapt’s report in
``cutadapt.report``.

Read processing stages
======================

Cutadapt can do a lot more in addition to removing adapters. There are
various command-line options that make it possible to modify and filter reads
and to redirect them to various output files. Each read is processed in the
following order:

1. :ref:`Read modification options ` are applied. This includes
   :ref:`adapter removal `, :ref:`quality trimming `, read name
   modifications etc. The order in which they are applied is the order in
   which they are listed in the help shown by ``cutadapt --help`` under the
   “Additional read modifications” heading. Adapter trimming itself does not
   appear in that list and is done after quality trimming and before length
   trimming (``--length``/``-l``).
2. :ref:`Filtering options ` are applied, such as removal of too short or
   untrimmed reads. Some of the filters also allow redirecting a read to a
   separate output file. The filters are applied in the order in which they
   are listed in the help shown by ``cutadapt --help`` under the “Filtering
   of processed reads” heading.
3. If the read has passed all the filters, it is written to the output file.

.. _adapter-types:

Adapter types
=============

Cutadapt can detect multiple adapter types. 5' adapters precede the sequence
of interest and 3' adapters follow it. Further distinctions are made
according to where in the read the adapter sequence is allowed to occur.
========================================================= ============================= Adapter type Command-line option ========================================================= ============================= :ref:`Regular 3' adapter ` ``-a ADAPTER`` :ref:`Regular 5' adapter ` ``-g ADAPTER`` :ref:`Non-internal 3' adapter ` ``-a ADAPTERX`` :ref:`Non-internal 5' adapter ` ``-g XADAPTER`` :ref:`Anchored 3' adapter ` ``-a ADAPTER$`` :ref:`Anchored 5' adapter ` ``-g ^ADAPTER`` :ref:`5' or 3' (both possible) ` ``-b ADAPTER`` :ref:`Linked adapter ` | ``-a ^ADAPTER1...ADAPTER2`` | ``-g ADAPTER1...ADAPTER2`` ========================================================= ============================= By default, all adapters :ref:`are searched error-tolerantly `. Adapter sequences :ref:`may also contain any IUPAC wildcard character (degenerate bases) ` (such as ``N``). In addition, it is possible to :ref:`remove a fixed number of bases ` from the beginning or end of each read, to :ref:`remove low-quality bases (quality trimming) ` from the 3' and 5' ends, and to :ref:`search for adapters also in the reverse-complemented reads `. Overview of adapter types ------------------------- 3' adapter types ~~~~~~~~~~~~~~~~ A 3' adapter is assumed to be ligated to the 3' end of your sequence of interest. When such an adapter is found, the adapter sequence itself and the sequence following it (if there is any) are trimmed. This table shows in which ways the different 3' adapter types are allowed to occur in a read in order to be recognized by the program. ================================== =================== ======================== ============================= ========================= Adapter location in read Read layout | Found by regular 3’ | Found by non-internal 3’ | Found by anchored 3’ | ``-a ADAPTER`` | ``-a ADAPTERX`` | ``-a ADAPTER$`` ================================== =================== ======================== ============================= ========================= Full adapter sequence anywhere acgtacgtADAPTERacgt yes no no Partial adapter sequence at 3’ end acgtacgtacgtADAP yes yes no Full adapter sequence at 3’ end acgtacgtacgtADAPTER yes yes yes ================================== =================== ======================== ============================= ========================= 5' adapter types ~~~~~~~~~~~~~~~~ A 5' adapter is assumed to be ligated to the 5' end of your sequence of interest. When such an adapter is found, the adapter sequence itself and the sequence preceding it (if there is any) are trimmed. This table shows in which ways the different 5' adapter types are allowed to occur in a read in order to be recognized by the program. ================================== =================== ======================== ============================= ========================= Adapter location in read Read layout | Found by regular 5’ | Found by non-internal 5’ | Found by anchored 5’ | ``-g ADAPTER`` | ``-g XADAPTER`` | ``-g ^ADAPTER`` ================================== =================== ======================== ============================= ========================= Full adapter sequence anywhere acgtADAPTERacgtacgt yes no no Partial adapter sequence at 5’ end PTERacgtacgtacgt yes yes no Full adapter sequence at 5’ end ADAPTERacgtacgtacgt yes yes yes ================================== =================== ======================== ============================= ========================= .. 
_three-prime-adapters:

Regular 3' adapters
-------------------

A 3' adapter is a piece of DNA ligated to the 3' end of the DNA fragment of
interest. The sequencer starts the sequencing process at the 5' end of the
fragment. If the fragment is shorter than the read length, the sequencer will
sequence into the adapter and the reads will thus contain some part of the
adapter.

Depending on how much longer the read is than the fragment of interest, the
adapter occurs 1) not at all, 2) partially or fully at the end of the read
(not followed by any other bases), or 3) in full somewhere within the read,
followed by some other bases.

Use Cutadapt’s ``-a`` option to find and trim such an adapter, allowing both
partial and full occurrences.

For example, assume your fragment of interest is *mysequence* and the adapter
is *ADAPTER*. Depending on the read length, you will get reads that look like
this::

    mysequen
    mysequenceADAP
    mysequenceADAPTER
    mysequenceADAPTERsomethingelse

Using ``-a ADAPTER`` to remove this type of adapter, this will be the
result::

    mysequen
    mysequence
    mysequence
    mysequence

As this example shows, Cutadapt allows regular 3' adapters to occur in full
anywhere within the read (preceded and/or succeeded by zero or more bases),
and also partially degraded at the 3' end.

Cutadapt deals with 3' adapters by removing the adapter itself and any
sequence that may follow. As a consequence, a sequence that starts with an
adapter, like this, will be trimmed to an empty read::

    ADAPTERsomething

By default, empty reads are kept and will appear in the output. If you do not
want this, use the ``--minimum-length``/``-m`` :ref:`filtering option `.

.. _five-prime-adapters:

Regular 5' adapters
-------------------

.. note::

    Unless your adapter may also occur in a degraded form, you probably want
    to use an :ref:`anchored 5' adapter `.

A 5' adapter is a piece of DNA ligated to the 5' end of the DNA fragment of
interest. For this type of adapter to be found, the adapter sequence needs to
either appear in full somewhere within the read (internal match) or at the
start (5' end) of it, where in the latter case also partial occurrences are
allowed. In all cases, the adapter itself and the sequence preceding it are
removed.

Assume your fragment of interest is *mysequence* and the adapter is
*ADAPTER*. The reads may look like this::

    ADAPTERmysequence
    DAPTERmysequence
    TERmysequence
    somethingADAPTERmysequence

All the above sequences are trimmed to ``mysequence`` when you use
``-g ADAPTER``. As with 3' adapters, the resulting read may have a length of
zero when the sequence ends with the adapter. For example, the read ::

    somethingADAPTER

will be empty after trimming.

.. _anchored-5adapters:

Anchored 5' adapters
--------------------

An anchored 5' adapter is an adapter that is expected to occur in full length
at the beginning of the read. Example::

    ADAPTERsomething

This is usually how forward PCR primers are found in the read in amplicon
sequencing, for instance. In Cutadapt’s terminology, this type of adapter is
called "anchored" to distinguish it from :ref:`"regular" 5' adapters `, which
are 5' adapters with a less strict placement requirement.

If the adapter sequence is ``ADAPTER``, use ``-g ^ADAPTER`` to remove an
anchored 5' adapter. The ``^`` is meant to indicate the "anchoring" to the
beginning of the read. With this, the example read ``ADAPTERsomething`` is
trimmed to just ``something``.

An anchored 5' adapter must occur in full at the beginning of the read.
If the read happens to be shorter than the adapter, partial occurrences such
as ``ADAPT`` are not found.

The requirement for a full match at the beginning of the read is relaxed when
Cutadapt searches error-tolerantly, as it does by default. In particular,
insertions and deletions may allow reads such as these to be trimmed,
assuming the maximum error rate is sufficiently high::

    BADAPTERsomething
    ADAPTE

The ``B`` in the beginning is seen as an insertion, and the missing ``R`` as
a deletion. If you also want to prevent this from happening, use the option
``--no-indels``, which disallows insertions and deletions entirely.

.. _anchored-3adapters:

Anchored 3' adapters
--------------------

It is also possible to anchor 3' adapters to the end of the read. This is
useful, for example, if you work with merged overlapping paired-end reads.
Add the ``$`` character to the end of an adapter sequence specified via
``-a`` in order to anchor the adapter to the end of the read, such as
``-a ADAPTER$``. The adapter will only be found if it occurs in full at the
end of the read (that is, it must be a *suffix* of the read).

The requirement for a full match exactly at the end of the read is relaxed
when Cutadapt searches error-tolerantly, as it does by default. You can
disable insertions and deletions with ``--no-indels``.

Anchored 3' adapters work as if you had reversed the sequence and used an
appropriate anchored 5' adapter.

As an example, assume you have these reads::

    mysequenceADAP
    mysequenceADAPTER
    mysequenceADAPTERsomethingelse

Using ``-a ADAPTER$`` will result in::

    mysequenceADAP
    mysequence
    mysequenceADAPTERsomethingelse

That is, only the middle read is trimmed at all.

.. _non-internal:

Non-internal 5' and 3' adapters
-------------------------------

The non-internal 5' and 3' adapter types disallow internal occurrences of the
adapter sequence. This is like a less strict version of anchoring: The
adapter must always be at one of the ends of the read, but - unlike anchored
adapters - partial occurrences are also OK.

Use ``-a ADAPTERX`` (replace ``ADAPTER`` with your actual adapter sequence,
but use a literal ``X``) to disallow internal matches for a 3' adapter.
Use ``-g XADAPTER`` to disallow them for a 5' adapter. Mnemonic: The ``X`` is
not allowed to “shift into” the read.

Here are some examples for trimming reads with ``-a ADAPTERX``:

================================== ==================================
Input read                         Processed read
================================== ==================================
``mysequenceADAP``                 ``mysequence``
``mysequenceADAPTER``              ``mysequence``
``mysequenceADAPTERsomethingelse`` ``mysequenceADAPTERsomethingelse``
================================== ==================================

Here are some examples for trimming reads with ``-g XADAPTER``:

================================== ===================================
Input read                         Processed read
================================== ===================================
``APTERmysequence``                ``mysequence``
``ADAPTERmysequence``              ``mysequence``
``somethingelseADAPTERmysequence`` ``somethingelseADAPTERmysequence``
================================== ===================================

.. versionadded:: 1.17

.. _linked-adapters:

Linked adapters (combined 5' and 3' adapter)
--------------------------------------------

If your sequence of interest is surrounded by a 5' and a 3' adapter, and you
want to remove both adapters, then you can use a *linked adapter*. A linked
adapter combines a 5' and a 3' adapter.
By default, the adapters are not anchored, but in many cases, you should
anchor the 5’ adapter by prefixing it with ``^``. :ref:`See the previous
sections ` for what anchoring means.

.. note::
    Cutadapt versions before 2.0 anchored the 5’ adapter within linked
    adapters automatically even if the initial ``^`` was not specified. If
    you have scripts written for Cutadapt versions earlier than 2.0, please
    add the ``^`` so that the behavior does not change!

Linked adapters are specified as two sequences separated by ``...``
(three dots)::

    cutadapt -a ^ADAPTER1...ADAPTER2 -o out.fastq.gz in.fastq.gz

If you anchor an adapter, it will also become marked as being *required*. If
a required adapter cannot be found, the read will not be trimmed at all even
if the other adapter occurs. If an adapter is not required, it is *optional*.

Also, when you use the ``--discard-untrimmed`` option (or
``--trimmed-only``) with a linked adapter, then a read is considered to be
trimmed only if all required adapters were found.

In the previous example, ``ADAPTER1`` was anchored and therefore required,
but ``ADAPTER2`` was optional. Anchoring also ``ADAPTER2`` (and making it
required as well) would look like this::

    cutadapt -a ^ADAPTER1...ADAPTER2$ -o out.fastq.gz in.fastq.gz

As an example, assume the 5' adapter is *FIRST*, the 3' adapter is *SECOND*
and you have these input reads::

    FIRSTmysequenceSECONDextrabases
    FIRSTmysequenceSEC
    FIRSTmyseque
    anotherreadSECOND

Trimming with ::

    cutadapt -a ^FIRST...SECOND -o output.fastq input.fastq

will result in ::

    mysequence
    mysequence
    myseque
    anotherreadSECOND

The 3' adapter in the last read is not trimmed because the anchored 5’
adapter is required, but missing in the read.

Linked adapters do not work when used in combination with ``--info-file``
and ``--mask-adapter``.

To provide :ref:`adapter-search parameters ` for linked adapters, they need
to be set for each constituent adapter separately, as in
``-g "ADAPTER1;min_overlap=5...ADAPTER2;min_overlap=6"``.

.. versionadded:: 1.10

.. versionadded:: 1.13
    Ability to anchor the 3' adapter.

.. versionadded:: 2.0
    The 5’ adapter is no longer anchored by default.

.. _linked-override:

Changing which adapters are required
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

As described, when you specify a linked adapter with ``-a``, the adapters
that are anchored become *required*, and the non-anchored adapters become
*optional*. To change this, you can instead use ``-g`` to specify a linked
adapter. In that case, *both* adapters are required (even if they are not
anchored). This type of linked adapter is especially suited for trimming
CRISPR screening reads. For example::

    cutadapt -g ADAPTER1...ADAPTER2 -o out.fastq.gz in.fastq.gz

Here, both ``ADAPTER1`` and ``ADAPTER2`` are not anchored, but they are
required because ``-g`` was used.

The ``-g`` option does not cover all cases, so you can also mark each adapter
explicitly as required or optional using the :ref:`search parameters `
``required`` and ``optional``. This is the only way to make an anchored
adapter optional. For example, to request that an anchored 5' adapter (here
``ADAPTER1``) should not be required, you can specify it like this ::

    cutadapt -a "^ADAPTER1;optional...ADAPTER2" -o output.fastq.gz input.fastq.gz

.. versionadded:: 1.13
    Option ``-g`` added.

.. versionchanged:: 1.15
    Option ``-g`` requires both adapters.
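To summarize the rules from this section, here is a tiny illustrative Python
sketch of the decision whether a read counts as trimmed. ``is_trimmed`` and
its parameters are invented names for this guide, and the actual logic inside
Cutadapt is more involved::

    def is_trimmed(found_front, found_back, front_required, back_required):
        """True if the read counts as trimmed, e.g. for --discard-untrimmed.

        A read counts as trimmed only if something was found and every
        required half of the linked adapter was among the matches.
        """
        if front_required and not found_front:
            return False
        if back_required and not found_back:
            return False
        return found_front or found_back

    # -a "^ADAPTER1...ADAPTER2": front anchored and thus required, back optional
    is_trimmed(found_front=True, found_back=False,
               front_required=True, back_required=False)  # True
    # -g "ADAPTER1...ADAPTER2": both halves required
    is_trimmed(found_front=True, found_back=False,
               front_required=True, back_required=True)   # False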
Linked adapter statistics ~~~~~~~~~~~~~~~~~~~~~~~~~ For linked adapters, the statistics report contains a line like this:: === Adapter 1 === Sequence: AAAAAAAAA...TTTTTTTTTT; Type: linked; Length: 9+10; Trimmed: 3 times; Half matches: 2 The value for “Half matches” tells you how often only the 5'-side of the adapter was found, but not the 3'-side of it. This applies only to linked adapters with regular (non-anchored) 3' adapters. .. _anywhere-adapters: 5' or 3' adapters ----------------- The last type of adapter is a combination of the 5' and 3' adapter. You can use it when your adapter is ligated to the 5' end for some reads and to the 3' end in other reads. This probably does not happen very often, and this adapter type was in fact originally implemented because the library preparation in an experiment did not work as it was supposed to. For this type of adapter, the sequence is specified with ``-b ADAPTER`` (or use the longer spelling ``--anywhere ADAPTER``). The adapter may appear in the beginning (even degraded), within the read, or at the end of the read (even partially). The decision which part of the read to remove is made as follows: If there is at least one base before the found adapter, then the adapter is considered to be a 3' adapter and the adapter itself and everything following it is removed. Otherwise, the adapter is considered to be a 5' adapter and it is removed from the read, but the sequence after it remains. Here are some examples. ============================== =================== ===================== Read before trimming Read after trimming Detected adapter type ============================== =================== ===================== ``MYSEQUENCEADAPTERSOMETHING`` ``MYSEQUENCE`` 3' adapter ``MYSEQUENCEADAPTER`` ``MYSEQUENCE`` 3' adapter ``MYSEQUENCEADAP`` ``MYSEQUENCE`` 3' adapter ``MADAPTER`` ``M`` 3' adapter ``ADAPTERMYSEQUENCE`` ``MYSEQUENCE`` 5' adapter ``PTERMYSEQUENCE`` ``MYSEQUENCE`` 5' adapter ``TERMYSEQUENCE`` ``MYSEQUENCE`` 5' adapter ============================== =================== ===================== .. _rightmost: Multiple adapter occurrences within a single read ------------------------------------------------- If a single read contains multiple copies of the same adapter, the basic rule is that the leftmost match is used for both 5' and 3' adapters. For example, when searching for a 3' adapter in :: cccccADAPTERgggggADAPTERttttt the read will be trimmed to :: ccccc When the adapter is a 5' adapter instead, the read will be trimmed to :: gggggADAPTERttttt For 5' adapters, this can be changed so that the *rightmost* occurrence is found by using the ``rightmost`` :ref:`search parameter `, as in ``-g "ACGT;rightmost"``. .. versionadded:: 4.1 The ``rightmost`` search parameter .. _trimming-parameters: .. _search-parameters: Adapter-search parameters ========================= The adapter search algorithm has a few parameters specific to each adapter that control how the adapter sequence is found. The command-line options ``-e`` and ``-O`` set the maximum error rate and minimum overlap parameters (see details in the following sections) for all adapters listed via the ``-a``/``-b``/``-g`` etc. options. When trimming more than one adapter, it may be necessary to change search parameters for each adapter individually. You can do so by adding a semicolon and ``parameter=value`` to the end of the adapter sequence, as in ``-a "ADAPTER;max_error_rate=0.2"``. There are also "flags" that enable certain behavior. These are written without the ``=value`` part. 
Multiple parameters can be set, as in
``-a "ADAPTER;max_error_rate=0.2;min_overlap=5"``.

For linked adapters, search parameters need to be specified separately for
each adapter as in ``-g "ADAPTER1;min_overlap=5...ADAPTER2;min_overlap=6"``.

Remember to add the quotation marks; otherwise the shell will interpret the
semicolon as a separator between two commands.

The following parameters are supported:

======================================================= =============== ================================
Parameter                                               Global option   Adapter-specific parameter
======================================================= =============== ================================
Maximum error rate (default: 0.1)                       ``-e 0.2``      | ``ADAPTER;e=0.2`` or
                                                                        | ``ADAPTER;max_errors=0.2`` or
                                                                        | ``ADAPTER;max_error_rate=0.2``
Minimum overlap (default: 3)                            ``-O 5``        | ``ADAPTER;o=5`` or
                                                                        | ``ADAPTER;min_overlap=5``
Disallow indels                                         ``--no-indels`` ``ADAPTER;noindels``
Allow indels (this is the default)                                      ``ADAPTER;indels``
Allow matches anywhere                                                  ``ADAPTER;anywhere``
:ref:`Linked adapter required `                                         ``ADAPTER;required``
:ref:`Linked adapter optional `                                         ``ADAPTER;optional``
:ref:`Find rightmost 5' adapter occurrence `                            ``ADAPTER;rightmost``
======================================================= =============== ================================

The minimum overlap length cannot be set for anchored adapters as these
always need to occur at full length.

When using the ``file:`` notation to read in adapters from a FASTA file, it
is possible to specify file-specific search parameters::

    cutadapt -a "file:adapters.fa;min_overlap=5;noindels"

The individual adapter specifications in the FASTA file can also contain
search parameters::

    >adapter1
    ^ACGT;min_overlap=3
    >adapter2
    AACCGGT;noindels

More specific parameters override less specific ones:

1. Adapter-specific parameters override the file-specific settings
2. File-specific search parameters override the global settings

.. versionadded:: 1.18
    Syntax for setting adapter-specific search parameters

.. versionadded:: 3.5
    The ``indels`` and ``noindels`` parameters.

.. versionadded:: 4.1
    Support file-specific search parameters (when using the ``file:``
    notation)

.. versionadded:: 4.1
    The ``rightmost`` search parameter

.. _error-tolerance:

Error tolerance
---------------

All searches for adapter sequences are error tolerant. Allowed errors are
mismatches, insertions and deletions. For example, if you search for the
adapter sequence ``ADAPTER`` and the error tolerance is set appropriately (as
explained below), then also ``ADABTER`` will be found (with 1 mismatch), as
well as ``ADAPTR`` (with 1 deletion), and also ``ADAPPTER`` (with 1
insertion). If insertions and deletions are disabled with ``--no-indels``,
then mismatches are the only type of errors.

The level of error tolerance is determined by a *maximum error rate*, which
is 0.1 (=10%) by default. An adapter occurrence is only found if the actual
error rate of the match does not exceed the maximum error rate. The actual
error rate is computed as the *number of errors in the match* divided by the
*length of the matching part of the adapter*.

For example, an adapter match of length 8 containing 1 error has an error
rate of 1/8=0.125. At the default maximum error rate 0.1, it would not be
found, but a match of length 10 containing 1 error has an error rate of
1/10=0.1 and would be found.
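In code, the acceptance rule just described amounts to the following sketch
(``allowed_errors`` is an invented name for this illustration; the actual
implementation may differ in details such as rounding)::

    def allowed_errors(match_length, max_error_rate=0.1):
        """Number of errors tolerated for a match of the given length."""
        return int(match_length * max_error_rate)  # round down

    allowed_errors(8)   # 0: a length-8 match with 1 error is rejected
    allowed_errors(10)  # 1: a length-10 match with 1 error is accepted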
Relating the number of errors to the length of the matching part of the
adapter is important because Cutadapt allows for partial adapter occurrences
(for the non-anchored adapter types). If only the absolute number of errors
were used, shorter matches would be favored unfairly. For example, assume an
adapter has 30 bases and we allow three errors over that length. If we
allowed these three errors even for a partial occurrence of, for example,
four bases, we can immediately see that this results in unexpected matches.
Using the error rate as a criterion helps to keep sensitivity and specificity
roughly the same over the possible lengths of the matches.

The ``-e`` option on the command line allows you to change the maximum error
rate. If the value is between 0 and 1 (but not 1 exactly), then this sets the
maximum error rate directly for all specified adapters. The default is
``-e 0.1``. You can also use the adapter-specific parameter
``max_error_rate`` or ``max_errors`` or just ``e`` to override the default
for a single adapter only. Examples: ``-a "ADAPTER;max_error_rate=0.15"``,
``-a "ADAPTER;e=0.15"`` (the quotation marks are necessary).

Alternatively, you can also specify a value of 1 or greater as the number of
allowed errors, which is then converted to a maximum error rate for each
adapter individually. For example, with an adapter of length 10, using
``-e 2`` will set the maximum error rate to 0.2. The value does not have to
be an integer, and if you use an adapter type that allows partial matches,
you may want to add 0.5 to the desired number of errors, which ensures that
even matches slightly shorter than full length are still allowed the
specified number of errors. In short, if you want to allow two errors, use
``-e 2.5``. This also works in the adapter-specific parameters. Examples:
``-a "ADAPTER;e=1"``, ``-a "ADAPTER;max_errors=2.5"``. Note that ``e``,
``max_error_rate`` and ``max_errors`` are all equivalent; whether a rate or
an absolute number is meant is decided by whether the given value is less
than 1 or not.

The number of errors allowed for a given adapter match length is also shown
under the “No. of allowed errors” heading in the report that Cutadapt
prints::

    Sequence: 'SOMEADAPTER'; Length: 11; Trimmed: 2 times.
    No. of allowed errors:
    0-9 bp: 0; 10-11 bp: 1

This tells us: For match lengths of 0-9 bases, zero errors are allowed and
for matches of length 10-11 bases, one error is allowed.

See also the :ref:`section on details of the alignment algorithm `.

.. versionadded:: 2.11
    Allow specifying the number of errors

N wildcard characters
~~~~~~~~~~~~~~~~~~~~~

Any ``N`` wildcard characters in the adapter sequence are skipped when
computing the error rate. That is, they do not contribute to the length of a
match. For example, the adapter sequence ``ACGTACNNNNNNNNGTACGT`` has a
length of 20, but only 12 non-``N`` characters. At a maximum error rate of
0.1, only one error is allowed if this sequence is found in full in a read
because 12·0.1=1.2, which is 1 when rounded down. This is done because ``N``
bases cannot contribute to the number of errors.

In previous versions, ``N`` wildcard characters did contribute to the match
length, but this artificially inflates the number of allowed errors. For
example, an adapter like ``N{18}CC`` (18 ``N`` wildcards followed by ``CC``)
would effectively match anywhere because the default error rate of 0.1 would
allow for two errors, but there are only two non-``N`` bases in the
particular adapter.
However, even in previous versions, the location with the greatest number of matching bases is chosen as the best location for an adapter, so in many cases the adapter would still be placed properly. .. versionadded:: 2.0 Ignore ``N`` wildcards when computing the error rate. .. _minimum-overlap: .. _random-matches: Minimum overlap (reducing random matches) ----------------------------------------- Since Cutadapt allows partial matches between the read and the adapter sequence for most adapter types, short matches can occur by chance, leading to erroneously trimmed bases. For example, just by chance, we expect that roughly 25% of all reads end with a base that is identical to the first base of the adapter. To reduce the number of falsely trimmed bases, the alignment algorithm requires that at least *three bases* of the adapter are aligned to the read. This minimum overlap length can be changed globally (for all adapters) with the parameter ``--overlap`` (or its short version ``-O``). The option is ignored for anchored adapters since these do not allow partial matches. Alternatively, use the adapter-specific parameter ``min_overlap`` to change it for a single adapter only. Example: ``-a "ADAPTER;min_overlap=5"`` (the quotation marks are necessary). For anchored adapters, attempting to set a minimum overlap this way will result in an error. In :ref:`linked adapters `, the minimum overlap length is applied separately to the 5' and the 3' adapter. If a read contains a partial adapter sequence shorter than the minimum overlap length, no match will be found (and therefore no bases are trimmed). Requiring at least three bases to match is quite conservative. Even if no minimum overlap was required, we can compute that we lose only about 0.44 bases per read on average, see `Section 2.3.3 in my thesis `_. With the default minimum overlap length of 3, only about 0.07 bases are lost per read. When choosing an appropriate minimum overlap length, take into account that true adapter matches are also lost when the overlap length is higher than zero, reducing Cutadapt's sensitivity. It is possible that fewer bases are removed from a read than the minimum overlap length seems to imply. The overlap length is the number of bases in the adapter that got aligned to the read, which means that if there are deletions in the adapter, the corresponding part in the read will be shorter. (This is only relevant when the maximum allowed error rate and/or the minimum overlap length are changed such that at least one error is allowed over the given length.) Allowing partial matches at both ends ------------------------------------- The regular 5' and 3' adapter types allow partial adapter occurrences only at the 5' and 3' end of the read, respectively. To allow partial matches at both ends, you can use the ``anywhere`` adapter-specific parameter. A 3' adapter specified via ``-a ADAPTER`` will be found even when it occurs partially at the 3' end, as in ``mysequenceADAPT``. However, it will by default not be found if it occurs partially at the 5' end, as in ``APTERmysequence``. To find the adapter in both cases, specify the adapter as ``-a "ADAPTER;anywhere"``. Similarly, for a 5' adapter specified via ``-g ADAPTER``, partial matches at the 3' end are not found, as in ``mysequenceADAPT``. To allow partial matches at both ends, use ``-g "ADAPTER;anywhere"``. .. note:: With ``anywhere``, partial matches at the end that is usually not allowed to be matched will result in empty reads! 
.. _reverse-complement:

Searching reverse complements
-----------------------------

By default, Cutadapt expects adapters to be given in the same orientation (5' to 3') as the reads. That is, Cutadapt considers neither the reverse complement of the reads nor of the adapters.

To make Cutadapt consider the reverse complement as well, use option ``--revcomp`` or its abbreviation ``--rc``. If given, both the input sequence and its reverse complement are searched for adapters. Whichever of the two versions matches best is kept. That is, if the reverse-complemented sequence yields a better match, the output file will contain the reverse-complemented version. This can be used to “normalize” read orientation/strandedness. For paired-end reads, reverse complementing is done by swapping R1 and R2.

To determine whether the forward or reverse-complemented sequence yields the better match, the full adapter search is done independently on both versions, and the version that results in the higher number of matching nucleotides is considered to be the better one.

If the reverse-complemented version of a read or read pair was chosen, a space and the text ``rc`` are added to the read name. To not change the read name, add option ``--rename="{header}"``. (Please file an issue if you would like this to be configurable.)

The report will show the number of reads or read pairs that were reverse-complemented, like this::

    Total reads processed:          60
    Reads with adapters:            50 (83.3%)
    Reverse-complemented:           20 (33.3%)

Here, 20 reverse-complemented reads contain an adapter and 50 - 20 = 30 reads that did not need to be reverse-complemented contain an adapter.

.. versionadded:: 2.8

.. versionadded:: 4.6
   The ``--revcomp`` option now works for both single-end and paired-end data.

Specifying adapter sequences
============================

.. _wildcards:

Wildcards
---------

All `IUPAC nucleotide codes `_ (wildcard characters, degenerate bases) are supported. For example, use an ``N`` in the adapter sequence to match any nucleotide in the read, or use ``-a YACGT`` for an adapter that matches both ``CACGT`` and ``TACGT``. The wildcard character ``N`` is useful for trimming adapters with an embedded variable barcode::

    cutadapt -a ACGTAANNNNTTAGC -o output.fastq input.fastq

Even the ``X`` wildcard that does not match any nucleotide is supported. If used as in ``-a ADAPTERX`` or ``-g XADAPTER``, it acquires a special meaning for the matching algorithm :ref:`and disallows internal adapter matches `.

The character ``I``, used to encode the base inosine, is automatically replaced with ``N`` within the adapter sequence.

Wildcard characters are by default only allowed in adapter sequences and are not recognized when they occur in a read. This is to avoid matches in reads that consist of many (often low-quality) ``N`` bases. Use ``--match-read-wildcards`` to enable wildcards also in reads.

Use the option ``-N`` to disable interpretation of wildcard characters even in the adapters. If wildcards are disabled entirely, that is, when you use ``-N`` and *do not* use ``--match-read-wildcards``, then Cutadapt compares characters by their ASCII value. Thus, both the read and adapter can be arbitrary strings (such as ``SEQUENCE`` or ``ADAPTER`` as used here in the examples).

.. versionadded:: 4.2
   Inosine ``I``
Repeated bases
--------------

If the adapter sequence contains many repeated bases, such as a run of ``N`` or ``A`` characters, you do not have to spell them out. For example, instead of writing ten ``A`` in a row (``AAAAAAAAAA``), you can write ``A{10}``. The number within the curly braces specifies how often the character that precedes it will be repeated. This also works for IUPAC wildcard characters, as in ``N{5}``.

It is recommended that you use quotation marks around your adapter sequence if you use this feature, as in ``-a "N{5}ACGT"``.

.. _modifying-reads:

Modifying reads
===============

This section describes in which ways reads can be modified other than adapter removal.

.. seealso:: :ref:`Read modification order `

.. _changing-what-is-done-when-an-adapter-is-found:
.. _action:

``--action`` changes what is done when an adapter is found
----------------------------------------------------------

The ``--action`` option can be used to change what is done when an adapter match is found in a read.

The default is ``--action=trim``, which will remove the adapter and the sequence before or after it from the read. For 5' adapters, the adapter and the sequence preceding it are removed. For 3' adapters, the adapter and the sequence following it are removed. Since linked adapters are a combination of a 5' and 3' adapter, in effect only the sequence between the 5' and the 3' adapter matches is kept.

With ``--action=retain``, the read is trimmed, but the adapter sequence itself is not removed. Up- and downstream sequences are removed in the same way as for the ``trim`` action. For linked adapters, both adapter sequences are kept.

.. note::
   Because it is somewhat unclear what should happen, ``--action=retain`` cannot currently be combined with ``--times`` (multiple rounds of adapter removal).

Use ``--action=none`` to not change the read even if there is a match. This is useful because the statistics will still be updated as before and because the read will still be considered "trimmed" for the read filtering options. Combining this with ``--untrimmed-output``, for example, can be used to copy reads without adapters to a different file. Other read modification options, if used, may still change the read.

Use ``--action=mask`` to write ``N`` characters to those parts of the read that would otherwise have been removed.

Use ``--action=lowercase`` to change to lowercase those parts of the read that would otherwise have been removed. The rest is converted to uppercase.

.. versionadded:: 3.1
   The ``retain`` action.

.. _cut-bases:

Removing a fixed number of bases
--------------------------------

By using the ``--cut`` option or its abbreviation ``-u``, it is possible to unconditionally remove bases from the beginning or end of each read. If the given length is positive, the bases are removed from the beginning of each read. If it is negative, the bases are removed from the end.

For example, to remove the first five bases of each read::

    cutadapt -u 5 -o trimmed.fastq reads.fastq

To remove the last seven bases of each read::

    cutadapt -u -7 -o trimmed.fastq reads.fastq

The ``-u``/``--cut`` option can be combined with the other options, but ``--cut`` is applied *before* any adapter trimming.
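As a sketch with made-up file names (assuming a Cutadapt version that accepts ``-u`` twice when the two lengths have different signs), bases can be removed from both ends in a single run::

    cutadapt -u 5 -u -7 -o trimmed.fastq reads.fastq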
.. _quality-trimming:

Quality trimming
----------------

The ``-q`` (or ``--quality-cutoff``) parameter can be used to trim low-quality ends from reads. If you specify a single cutoff value, the 3' end of each read is trimmed::

    cutadapt -q 10 -o output.fastq input.fastq

For Illumina reads, this is sufficient as their quality is high at the beginning, but degrades towards the 3' end.

It is also possible to trim from the 5' end by specifying two comma-separated cutoffs as *5' cutoff,3' cutoff*. For example, ::

    cutadapt -q 15,10 -o output.fastq input.fastq

will quality-trim the 5' end with a cutoff of 15 and the 3' end with a cutoff of 10. To only trim the 5' end, use a cutoff of 0 for the 3' end, as in ``-q 15,0``.

Quality trimming is done before any adapter trimming.

For paired-end data, quality trimming is by default applied to both reads using the same cutoff(s). Use option ``-Q`` to specify different cutoffs for R2::

    cutadapt -q 5 -Q 15,20 -o out.1.fastq -p out.2.fastq in.1.fastq in.2.fastq

To disable quality-trimming of R2, use ``-Q 0``.

By default, quality values are assumed to be encoded as ascii(phred quality + 33). Nowadays, this should always be the case. Some old Illumina FASTQ files encode qualities as ascii(phred quality + 64). For those, you must add ``--quality-base=64`` to the command line.

A :ref:`description of the quality-trimming algorithm is also available `. The algorithm is the same as used by BWA.

.. versionadded:: 3.5
   The ``-Q`` option

.. _nextseq-trim:

Quality trimming of reads using two-color chemistry (NextSeq)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Some Illumina instruments use a two-color chemistry to encode the four bases. This includes the NextSeq and the NovaSeq. In those instruments, a 'dark cycle' (with no detected color) encodes a ``G``. However, dark cycles also occur when sequencing "falls off" the end of the fragment. The read then `contains a run of high-quality, but incorrect “G” calls `_ at its 3' end.

Since the regular quality-trimming algorithm cannot deal with this situation, you need to use the ``--nextseq-trim`` option::

    cutadapt --nextseq-trim=20 -o out.fastq input.fastq

This works like regular quality trimming (where one would use ``-q 20`` instead), except that the qualities of ``G`` bases are ignored.

.. versionadded:: 1.10

.. _poly-a:

Poly-A/poly-T trimming
~~~~~~~~~~~~~~~~~~~~~~

Use ``--poly-a`` to trim poly-A tails. Poly-A trimming is done after adapter trimming.

On paired-end reads, ``--poly-a`` removes poly-A tails from R1 and poly-T "heads" from R2.

.. seealso:: :ref:`Description of the poly-A trimming algorithm `.

.. versionadded:: 4.4

.. versionchanged:: 4.5
   Remove poly-T heads from R2. (Version 4.4 attempted to remove poly-A tails even from R2.)
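A minimal sketch with made-up file names; ``--poly-a`` is a flag and takes no argument::

    cutadapt --poly-a -o trimmed.fastq reads.fastq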
Shortening reads to a fixed length
----------------------------------

To shorten each read down to a certain length, use the ``--length`` option or the short version ``-l``::

    cutadapt -l 10 -o output.fastq.gz input.fastq.gz

This shortens all reads from ``input.fastq.gz`` down to 10 bases. The removed bases are those on the 3' end.

If you want to remove a fixed number of bases from each read, use :ref:`the --cut option instead `.

.. _modifying-read-names:

Modifying read names
--------------------

If you feel the need to modify the names of processed reads, some of the following options may be useful; they are explained in more detail in the following sections:

- ``--rename`` changes a read name according to a template.
- ``--prefix`` (or ``-x``) adds a prefix to read names.
- ``--suffix`` (or ``-y``) adds a suffix to read names.
- ``--length-tag`` updates a “length tag” such as ``length=`` with the correct read length.
- ``--strip-suffix`` removes a known suffix from read names.

The ``--prefix`` and ``--suffix`` options are outdated as they do not ensure that paired-end read names remain consistent; you should prefer ``--rename``. ``--prefix`` and ``--suffix`` cannot currently be used together with ``--rename``.

.. _rename:
.. _read-renaming:

``--rename`` renames reads
~~~~~~~~~~~~~~~~~~~~~~~~~~

The ``--rename`` option can be used to rename both single-end and paired-end reads. This section describes how it can be used to rename single-end reads.

We use the following terminology: The FASTQ or FASTA record header line consists of a *read ID* and is optionally followed by a separator (whitespace) and a *comment*.

For example, in this FASTQ header, the read ID is ``read1234`` and the comment is ``value=17`` (sequence and qualities not shown)::

    @read1234 value=17

The ``--rename`` option expects a *template string* such as ``{id} extra_info {adapter_name}`` as a parameter. It can contain regular text and placeholders that consist of a name enclosed in curly braces (``{placeholdername}``). The character sequence ``\t`` will be replaced by a tab character (this is currently the only allowed escape sequence).

The read name will be set to the template string in which the placeholders are replaced with the actual values relevant for the current read.

The following placeholders are currently available for single-end reads:

* ``{header}`` -- the full, unchanged header
* ``{id}`` -- the read ID, that is, the part of the header before the first whitespace
* ``{comment}`` -- the part of the header after the whitespace following the ID
* ``{adapter_name}`` -- the name of adapter that was found in this read or ``no_adapter`` if there was no adapter match. If you use ``--times`` to do multiple rounds of adapter matching, this is the name of the *last* found adapter.
* ``{match_sequence}`` -- the sequence of the read that matched the adapter (including errors). If there was no adapter match, this is set to an empty string. If you use a linked adapter, this is set to the two matching strings, separated by a comma.
* ``{cut_prefix}`` -- the prefix removed by the ``--cut`` (or ``-u``) option (that is, when used with a positive length argument)
* ``{cut_suffix}`` -- the suffix removed by the ``--cut`` (or ``-u``) option (that is, when used with a negative length argument)
* ``{rc}`` -- this is replaced with the string ``rc`` if the read was reverse complemented. This only applies when :ref:`reverse complementing ` was requested.
* ``\t`` -- not a placeholder, but will be replaced with the tab character.

For example, assume you have this input read in ``in.fasta``::

    >myread extra info
    ACGTAAAATTTTCCCC

Running the command ::

    cutadapt -a myadapter=TTTT -u 4 --rename='{id} barcode={cut_prefix} adapter={adapter_name} {comment}' in.fasta

will result in this modified read::

    >myread barcode=ACGT adapter=myadapter extra info
    AAAA

.. versionadded:: 3.2
   The ``{rn}`` placeholder.

.. versionadded:: 3.3
   The ``{rc}`` placeholder.

.. versionadded:: 3.6
   The ``{match_sequence}`` placeholder.

.. versionadded:: 4.3
   The ``\t`` escape sequence.
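As a further sketch (adapter sequence, adapter name and file names are made up), placeholders can be combined with the ``\t`` escape sequence to produce tab-separated annotations in the header::

    cutadapt -a myadapter=TTTT --rename='{id}\tadapter={adapter_name}\tmatch={match_sequence}' -o out.fastq in.fastq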
``--rename`` also renames paired-end reads
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If the ``--rename`` option is used with paired-end data, the template is applied separately to both R1 and R2. That is, for R1, the placeholders are replaced with values from R1, and for R2, the placeholders are replaced with values from R2. For example, ``{comment}`` becomes R1’s comment in R1 and it becomes R2’s comment in R2.

As another example, using ``--rename='{id} please note: {comment}'``, the paired-end reads ::

    >myread important comment
    ...

    >myread also quite important
    ...

are renamed to ::

    >myread please note: important comment
    ...

    >myread please note: also quite important
    ...

For paired-end data, the placeholder ``{rn}`` is available (“read number”), and it is replaced with ``1`` in R1 and with ``2`` in R2.

In addition, it is possible to write a placeholder as ``{r1.placeholdername}`` or ``{r2.placeholdername}``, which always takes the replacement value from R1 or R2, respectively.

For example, assume R1 starts with a 4 nt barcode that you want to “move” from the sequence into the ID of both reads. You can use ``--cut=4 --rename='{id}_{r1.cut_prefix} {comment}'`` and the read pair ::

    >myread this is R1
    ACGTAAAATTTT

    >myread this is R2
    GGGGCCCC

will be changed to ::

    >myread_ACGT this is R1
    AAAATTTT

    >myread_ACGT this is R2
    GGGGCCCC

The ``{r1.placeholder}`` and ``{r2.placeholder}`` notation is available for all placeholders except ``{rn}`` and ``{id}`` because the read ID needs to be identical for both reads.

In general, the read IDs of R1 and R2 need to be identical. Cutadapt enforces this when reading paired-end FASTQ files, except that it allows a single trailing "1" or "2" as the only difference between the read IDs. This allows for read IDs ending in ``/1`` and ``/2`` (some old formats are like this) or ``.1`` and ``.2`` (``fastq-dump`` produces this). If you use ``--rename``, Cutadapt will also enforce this when *writing* paired-end reads.

.. versionadded:: 3.2
   The ``--rename`` option

Other read name modifications
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Use ``-y`` (or its alias ``--suffix``) to append text to read names. The given string can contain the placeholder ``{name}``, which will be replaced with the name of the adapter found in that read. For example, writing ::

    cutadapt -a adapter1=ACGT -y ' we found {name}' input.fastq

changes a read named ``read1`` to ``read1 we found adapter1`` if the adapter ``ACGT`` was found.

The option ``-x`` (and its alias ``--prefix``) works the same, except that the text is added in front of the read name. For both options, spaces need to be specified explicitly, as in the above example. If no adapter was found in a read, the text ``no_adapter`` is inserted for ``{name}``.

We recommend that you no longer use the ``-x``/``--prefix``/``-y``/``--suffix`` options and use ``--rename`` instead, which is more general.

In order to remove a suffix of each read name, use ``--strip-suffix``.

Some old 454 read files contain the length of the read in the name::

    >read1 length=17
    ACGTACGTACAAAAAAA

If you want to update this to the correct length after trimming, use the option ``--length-tag``. In this example, this would be ``--length-tag 'length='``. After trimming, the read would perhaps look like this::

    >read1 length=10
    ACGTACGTAC

.. _filtering:

Filtering reads
===============

By default, all processed reads, no matter whether they were trimmed or not, are written to the output file specified by the ``-o`` option (or to standard output if ``-o`` was not provided). For paired-end reads, the second read in a pair is always written to the file specified by the ``-p`` option.
The options described here make it possible to filter reads by either discarding them entirely or by redirecting them to other files. When redirecting reads, the basic rule is that *each read is written to at most one file*. You cannot write reads to more than one output file.

Filters are applied to *all* processed reads, no matter whether they have been modified by adapter- or quality trimming.

``--minimum-length LENGTH`` or ``-m LENGTH``
    Discard processed reads that are shorter than LENGTH.

    If you do not use this option, reads that have a length of zero (empty reads) are kept in the output. Some downstream tools may have problems with zero-length sequences. In that case, specify at least ``-m 1``.

``--too-short-output FILE``
    Instead of discarding the reads that are too short according to ``-m``, write them to *FILE* (in FASTA/FASTQ format).

``--maximum-length LENGTH`` or ``-M LENGTH``
    Discard processed reads that are longer than LENGTH.

``--too-long-output FILE``
    Instead of discarding reads that are too long (according to ``-M``), write them to *FILE* (in FASTA/FASTQ format).

``--untrimmed-output FILE``
    Write all reads without adapters to *FILE* (in FASTA/FASTQ format) instead of writing them to the regular output file.

``--discard-trimmed``
    Discard reads in which an adapter was found.

``--discard-untrimmed``
    Discard reads in which *no* adapter was found. This has the same effect as specifying ``--untrimmed-output /dev/null``.

The options ``--too-short-output`` and ``--too-long-output`` are applied first. This means, for example, that a read that is too long will never end up in the ``--untrimmed-output`` file when ``--too-long-output`` was given, no matter whether it was trimmed or not.

The options ``--untrimmed-output``, ``--discard-trimmed`` and ``--discard-untrimmed`` are mutually exclusive.

The following filtering options do not have a corresponding option for redirecting reads. They always discard those reads for which the filtering criterion applies.

``--max-n COUNT_or_FRACTION``
    Discard reads with more than COUNT ``N`` bases. If ``COUNT_or_FRACTION`` is a number between 0 and 1, it is interpreted as a fraction of the read length.

``--max-expected-errors ERRORS`` or ``--max-ee ERRORS``
    Discard reads with more than ERRORS :ref:`expected errors `.

``--max-average-error-rate ERROR_RATE`` or ``--max-aer``
    Discard reads with more than ERROR_RATE average expected errors (total expected errors divided by the read length). ERROR_RATE must be between 0.0 and 1.0.

``--discard-casava``
    Discard reads that did not pass CASAVA filtering. Illumina’s CASAVA pipeline in version 1.8 adds an *is_filtered* header field to each read. If this option is specified, reads that did not pass filtering (these are the reads that have a ``Y`` for *is_filtered*) are discarded. Reads for which the header cannot be recognized are kept.
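To illustrate (option values, adapter and file names made up), several of these filters can be combined; each read still ends up in at most one output file::

    cutadapt -a ADAPTER -m 20 -M 100 --too-short-output too_short.fastq -o output.fastq input.fastq

Here, reads shorter than 20 nt are redirected to ``too_short.fastq``, reads longer than 100 nt are discarded, and all remaining reads are written to ``output.fastq``.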
.. _paired-end:

Trimming paired-end reads
=========================

Cutadapt supports trimming of paired-end reads. To enable this, provide two input files and a second output file with the ``-p`` option (this is the short form of ``--paired-output``). This is the basic command line syntax::

    cutadapt -a ADAPTER_FWD -A ADAPTER_REV -o out.1.fastq -p out.2.fastq reads.1.fastq reads.2.fastq

Here, the input reads are in ``reads.1.fastq`` and ``reads.2.fastq``, and the result will be written to ``out.1.fastq`` and ``out.2.fastq``.

In paired-end mode, the options ``-a``, ``-b``, ``-g`` and ``-u`` that also exist in single-end mode are applied to the forward reads only. To modify the reverse read, these options have uppercase versions ``-A``, ``-B``, ``-G`` and ``-U`` that work just like their counterparts. In the example above, ``ADAPTER_FWD`` will therefore be trimmed from the forward reads and ``ADAPTER_REV`` from the reverse reads.

====================== ===========================
Single-end/R1 option   Corresponding option for R2
====================== ===========================
``--adapter``, ``-a``  ``-A``
``--front``, ``-g``    ``-G``
``--anywhere``, ``-b`` ``-B``
``--cut``, ``-u``      ``-U``
``--output``, ``-o``   ``--paired-output``, ``-p``
====================== ===========================

In paired-end mode, Cutadapt checks whether the input files are properly paired. An error is raised if one of the files contains more reads than the other or if the read names in the two files do not match. The read name comparison ignores a trailing ``/1`` or ``/2`` to allow processing some old Illumina paired-end files.

In some cases, it works to run Cutadapt twice in single-end mode on the input files, but we recommend against it as this skips the consistency checks that Cutadapt can do otherwise.

Also, as soon as you start to use one of the filtering options that discard reads, it is mandatory that you process both files at the same time to make sure that the output files are kept synchronized. If a read is removed from one of the files, Cutadapt will always ensure that it is also removed from the other file.

The following command-line options are applied to *both* reads:

* ``-q`` (along with ``--quality-base``)
* ``--times`` applies to all the adapters given
* ``--trim-n``
* ``--action``
* ``--length``
* ``--length-tag``
* ``--prefix``, ``--suffix``

The following limitations still exist:

* The ``--info-file``, ``--rest-file`` and ``--wildcard-file`` options write out information only from the first read.
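As a sketch (adapter sequences and file names made up), combining the lowercase and uppercase options gives R1 and R2 different treatment in a single run::

    cutadapt -q 15 -Q 20 -a ADAPTER_FWD -A ADAPTER_REV -o out.1.fastq -p out.2.fastq in.1.fastq in.2.fastq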
.. _filtering-paired:

Filtering paired-end reads
--------------------------

The :ref:`filtering options listed above ` can also be used when trimming paired-end data.

Importantly, Cutadapt *always discards both reads of a pair* if it determines that the pair should be discarded. This ensures that the reads in the output files are in sync. (If you don’t want or need this, you can run Cutadapt separately on the R1 and R2 files.)

The same applies also to the options that redirect reads to other files if they fulfill a filtering criterion, such as ``--too-short-output``/``--too-short-paired-output``. That is, the reads are always sent in pairs to these alternative output files.

The ``--pair-filter`` option determines how to combine the filters for R1 and R2 into a single decision about the read pair. The default is ``--pair-filter=any``, which means that a read pair is discarded (or redirected) if at least *one of* the reads (R1 or R2) fulfills the filtering criterion. As an example, if option ``--minimum-length=20`` is used and paired-end data is processed, a read pair is discarded if at least one of the reads is shorter than 20 nt.

With ``--pair-filter=both``, you can require that filtering criteria must apply to *both* reads in order for a read pair to be discarded.

Finally, ``--pair-filter=first`` will make a decision about the read pair by inspecting whether the filtering criterion applies to the first read, ignoring the second read.

The following table describes the effect for some filtering options.

+----------------------------+------------------------------------------------+-----------------------------------------+
| Filtering option           | With ``--pair-filter=any``, the pair           | With ``--pair-filter=both``, the pair   |
|                            | is discarded if ...                            | is discarded if ...                     |
+============================+================================================+=========================================+
| ``--minimum-length``       | one of the reads is too short                  | both reads are too short                |
+----------------------------+------------------------------------------------+-----------------------------------------+
| ``--maximum-length``       | one of the reads is too long                   | both reads are too long                 |
+----------------------------+------------------------------------------------+-----------------------------------------+
| ``--discard-trimmed``      | one of the reads contains an adapter           | both reads contain an adapter           |
+----------------------------+------------------------------------------------+-----------------------------------------+
| ``--discard-untrimmed``    | one of the reads does not contain an adapter   | both reads do not contain an adapter    |
+----------------------------+------------------------------------------------+-----------------------------------------+
| ``--max-n``                | one of the reads contains too many ``N`` bases | both reads contain too many ``N`` bases |
+----------------------------+------------------------------------------------+-----------------------------------------+

There is currently no way to change the pair-filter mode for each filter individually.

.. note::
   As an exception, when you specify adapters *only* for R1 (``-a``/``-g``/``-b``) or *only* for R2 (``-A``/``-G``/``-B``), then the ``--pair-filter`` mode for ``--discard-untrimmed`` is forced to be ``both`` (and accordingly, also for the ``--untrimmed-(paired-)output`` options). Otherwise, with the default ``--pair-filter=any`` setting, all pairs would be considered untrimmed because it would always be the case that one of the reads in the pair does not contain an adapter. The pair-filter mode for the other filtering options, such as ``--minimum-length``, is not overridden in the same way and remains ``any`` unless changed explicitly with the ``--pair-filter`` option.

These are the paired-end specific filtering and output options:

``--minimum-length LENGTH1:LENGTH2`` or ``-m LENGTH1:LENGTH2``
    When trimming paired-end reads, the minimum lengths for R1 and R2 can be specified separately by separating them with a colon (``:``). If the colon syntax is not used, the same minimum length applies to both reads, as discussed above. Also, one of the values can be omitted to impose no restrictions. For example, with ``-m 17:``, the length of R1 must be at least 17, but the length of R2 is ignored.

``--maximum-length LENGTH1:LENGTH2`` or ``-M LENGTH1:LENGTH2``
    Maximum lengths can also be specified separately, see the explanation of ``-m`` above.

``--paired-output FILE`` or ``-p FILE``
    Write the second read of each processed pair to *FILE* (in FASTA/FASTQ format).

``--untrimmed-paired-output FILE``
    Used together with ``--untrimmed-output``. The second read in a pair is written to this file when the processed pair was *not* trimmed.

``--too-short-paired-output FILE``
    Write the second read in a pair to this file if the pair is too short. Use together with ``--too-short-output``.

``--too-long-paired-output FILE``
    Write the second read in a pair to this file if the pair is too long. Use together with ``--too-long-output``.

``--pair-filter=(any|both|first)``
    Which of the reads in a read pair has to fulfill the filtering criterion in order for the pair to be filtered.

Note that the option names can be abbreviated as long as it is clear which option is meant (unique prefix). For example, instead of ``--untrimmed-output`` and ``--untrimmed-paired-output``, you can write ``--untrimmed-o`` and ``--untrimmed-p``.

.. versionadded:: 1.18
   ``--pair-filter=first``
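As a sketch (lengths and file names made up), the colon syntax and the pair-filter mode can be combined::

    cutadapt --pair-filter=both -m 20:25 -o out.1.fastq -p out.2.fastq in.1.fastq in.2.fastq

With these settings, a pair is discarded only if R1 is shorter than 20 nt *and* R2 is shorter than 25 nt.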
.. _paired-adapters:
.. _pair-adapters:

Paired adapters
---------------

When processing paired-end data, Cutadapt has two sets of adapters to work with: The ones that are to be found and removed in the forward read (R1), specified with ``-a``/``-g``/``-b``, and the ones to be found and removed in the reverse read (R2), specified with ``-A``/``-G``/``-B``.

Normally, the program looks at the R1 and R2 reads independently. That is, the best matching R1 adapter is removed from R1 and the best matching R2 adapter is removed from R2.

To change this, the option ``--pair-adapters`` can be used. It causes each R1 adapter to be paired up with its corresponding R2 adapter. The first R1 adapter will be paired up with the first R2 adapter, and so on. The adapters are then always removed in pairs from a read pair. It is an error if the number of provided adapters is not identical for the R1 and R2 sets.

Example::

    cutadapt --pair-adapters -a AAAAA -a GGGG -A CCCCC -A TTTT -o out.1.fastq -p out.2.fastq in.1.fastq in.2.fastq

Here, the adapter pairs are (``AAAAA``, ``CCCCC``) and (``GGGG``, ``TTTT``). That is, paired-end reads will only be trimmed if either

* ``AAAAA`` is found in R1 *and* ``CCCCC`` is found in R2,
* or ``GGGG`` is found in R1 *and* ``TTTT`` is found in R2.

There is one limitation of the algorithm at the moment: The program looks for the best-matching R1 adapter first and then checks whether the corresponding R2 adapter can be found. If not, the read pair remains unchanged. However, it is in theory possible that a different R1 adapter that does not fit as well would have a partner that *can* be found. Some read pairs may therefore remain untrimmed.

This option was added to help with :ref:`demultiplexing Illumina unique dual indices (UDIs) `.

.. versionadded:: 2.1

Interleaved paired-end reads
----------------------------

Cutadapt supports reading and writing paired-end reads from a single FASTQ file in which the entries for the first and second read from each pair alternate. The first read in each pair comes before the second. This is called “interleaved” format. Enable this file format by adding the ``--interleaved`` option to the command-line. Then, if you provide only a single file where usually two would be expected, reads are automatically read or written interleaved.
For example, to read interleaved from ``reads.fastq`` and to write interleaved to ``trimmed.fastq``::

    cutadapt --interleaved -q 20 -a ACGT -A TGCA -o trimmed.fastq reads.fastq

In the following example, the input ``reads.fastq`` is interleaved, but output is written to two files ``trimmed.1.fastq`` and ``trimmed.2.fastq``::

    cutadapt --interleaved -q 20 -a ACGT -A TGCA -o trimmed.1.fastq -p trimmed.2.fastq reads.fastq

Reading two-file input and writing interleaved is also possible by providing a second input file::

    cutadapt --interleaved -q 20 -a ACGT -A TGCA -o trimmed.fastq reads.1.fastq reads.2.fastq

The following options also support interleaved output:

* ``--untrimmed-output`` (omit ``--untrimmed-paired-output``)
* ``--too-short-output`` (omit ``--too-short-paired-output``)
* ``--too-long-output`` (omit ``--too-long-paired-output``)

If you omit ``--interleaved`` but trim paired-end files, the above options must be used in pairs.

Cutadapt will detect if an input file is not properly interleaved by checking whether read names match and whether the file contains an even number of entries.

Trimming paired-end reads separately
------------------------------------

.. warning::
   Trimming paired-end data in this way is not recommended as it bypasses all paired-end error-checking, such as checking whether the number of reads is the same in both files. You should use the normal paired-end trimming mode with the ``-o``/``-p`` options described above.

If you do not use any of the filtering options that discard reads, such as ``--discard``, ``--minimum-length`` or ``--maximum-length``, you can run Cutadapt on each file separately::

    cutadapt -a ADAPTER_FWD -o trimmed.1.fastq.gz reads1.fastq.gz
    cutadapt -a ADAPTER_REV -o trimmed.2.fastq.gz reads2.fastq.gz

You can use the options that are listed under 'Additional modifications' in Cutadapt's help output without problems. For example, if you want to quality-trim the first read in each pair with a threshold of 10, and the second read in each pair with a threshold of 15, then the commands could be::

    cutadapt -q 10 -a ADAPTER_FWD -o trimmed.1.fastq reads1.fastq
    cutadapt -q 15 -a ADAPTER_REV -o trimmed.2.fastq reads2.fastq

.. note::
   Previous Cutadapt versions (up to 1.18) had a “legacy mode” that was activated under certain conditions and in which the read-modifying options such as ``-q`` would only apply to the forward/R1 reads. This mode no longer exists.

.. _multiple-adapters:

Multiple adapters
=================

It is possible to specify more than one adapter sequence by using the options ``-a``, ``-b`` and ``-g`` more than once. Any combination is allowed, such as five ``-a`` adapters and two ``-g`` adapters. Each read will be searched for all given adapters, but **only the best matching adapter is removed**. (But it is possible to :ref:`trim more than one adapter from each read `.) A command to trim one of two possible 3' adapters may look like this::

    cutadapt -a TGAGACACGCA -a AGGCACACAGGG -o output.fastq input.fastq

The adapter sequences can also be read from a FASTA file. Instead of giving an explicit adapter sequence, you need to write ``file:`` followed by the name of the FASTA file::

    cutadapt -a file:adapters.fasta -o output.fastq input.fastq

All of the sequences in the file ``adapters.fasta`` will be used as 3' adapters. The other adapter options ``-b`` and ``-g`` also support this. With ``-g``, you can also write ``-g ^file:adapters.fasta`` to specify that all adapters read from ``adapters.fasta`` should be anchored.
Similarly, with ``-a``, you can also write ``-a file$:adapters.fasta`` to anchor all adapters to the 3' end.

The ``file:`` syntax can be combined with the regular way of specifying an adapter. But no matter how you specify multiple adapter sequences, remember that only the best matching adapter is trimmed from each read.

When Cutadapt has multiple adapter sequences to work with, either specified explicitly on the command line or via a FASTA file, it decides in the following way which adapter should be trimmed:

* All given adapter sequences are matched to the read.
* Adapter matches where the overlap length (see the ``-O`` parameter) is too small or where the error rate is too high (``-e``) are removed from further consideration.
* Among the remaining matches, the one with the largest alignment score is chosen.
* If there is a tie, the first adapter wins. The order of adapters is the order in which they are given on the command line or in which they are found in the FASTA file.

If your adapter sequences are all similar and differ only by a variable barcode sequence, you can use a single adapter sequence instead that :ref:`contains wildcard characters `. If you want to search for a combination of a 5' and a 3' adapter, you may want to provide them as a single so-called :ref:`"linked adapter" ` instead.

.. versionadded:: 4.1
   Ability to anchor 5’ adapters from an external file with ``-g ^file:``

.. versionadded:: 4.3
   Ability to anchor 3' adapters from an external file with ``-a file$:``

.. _named-adapters:

Named adapters
--------------

Cutadapt reports statistics for each adapter separately. To identify the adapters, they are numbered and the adapter sequence is also printed::

    === Adapter 1 ===

    Sequence: AACCGGTT; Length 8; Trimmed: 5 times.

If you want this to look a bit nicer, you can give each adapter a name in this way::

    cutadapt -a My_Adapter=AACCGGTT -o output.fastq input.fastq

The actual adapter sequence in this example is ``AACCGGTT`` and the name assigned to it is ``My_Adapter``. The report will then contain this name in addition to the other information::

    === Adapter 'My_Adapter' ===

    Sequence: AACCGGTT; Length 8; Trimmed: 5 times.

When adapters are read from a FASTA file, the sequence header is used as the adapter name.

Adapter names are also used in column 8 of :ref:`info files `.
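For example, a hypothetical ``adapters.fasta`` (names and sequences made up) could look like this::

    >adapter1
    TGAGACACGCA
    >adapter2
    AGGCACACAGGG

Used with ``-a file:adapters.fasta``, the report would then refer to these adapters as ``adapter1`` and ``adapter2``.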
.. _more-than-one:

Trimming more than one adapter from each read
---------------------------------------------

By default, at most one adapter sequence is removed from each read, even if multiple adapter sequences were provided. This can be changed by using the ``--times`` option (or its abbreviated form ``-n``). Cutadapt will then search for all the given adapter sequences repeatedly, either until no adapter match was found or until the specified number of rounds was reached.

As an example, assume you have a protocol in which a 5' adapter gets ligated to your DNA fragment, but it's possible that the adapter is ligated more than once. So your sequence could look like this::

    ADAPTERADAPTERADAPTERmysequence

To be on the safe side, you assume that there are at most five copies of the adapter sequence. This command can be used to trim the reads correctly::

    cutadapt -g ^ADAPTER -n 5 -o output.fastq.gz input.fastq.gz

To search for a combination of a 5' and a 3' adapter, have a look at the :ref:`support for "linked adapters" ` instead, which works better for that particular case because it allows you to require that the 3' adapter is trimmed only when the 5' adapter also occurs, and it cannot happen that the same adapter is trimmed twice.

Before Cutadapt supported linked adapters, the ``--times`` option was the recommended way to search for 5'/3' linked adapters. For completeness, we describe how it was done. For example, when the 5' adapter is *FIRST* and the 3' adapter is *SECOND*, then the read could look like this::

    FIRSTmysequenceSECOND

That is, the sequence of interest is framed by the 5' and the 3' adapter. The following command would be used to trim such a read::

    cutadapt -g ^FIRST -a SECOND -n 2 ...

.. _demultiplexing:

Demultiplexing
==============

Cutadapt supports demultiplexing, which means that reads are written to different output files depending on which adapter was found in them. To use this, include the string ``{name}`` in the name of the output file and :ref:`give each adapter a name `. The path is then interpreted as a template and each trimmed read is written to the path in which ``{name}`` is replaced with the name of the adapter that was found in the read. Reads in which no adapter was found will be written to a file in which ``{name}`` is replaced with ``unknown``.

Example::

    cutadapt -a one=TATA -a two=GCGC -o trimmed-{name}.fastq.gz input.fastq.gz

This command will create the three files ``trimmed-one.fastq.gz``, ``trimmed-two.fastq.gz`` and ``trimmed-unknown.fastq.gz``.

More realistically, your “adapters” would actually be barcode sequences that you will want to :ref:`provide in a FASTA file `. Here is a made-up example for such a ``barcodes.fasta`` file::

    >barcode01
    TTAAGGCC
    >barcode02
    TAGCTAGC
    >barcode03
    ATGATGAT

Since our barcodes are located at the 5’ end of the R1 read, we use the ``-g`` option to provide Cutadapt with the adapter sequences, as in ``-g ^file:barcodes.fasta``. Also, we prefix ``file:`` with the ``^`` character to specify that we want to :ref:`anchor the 5’ adapters `.

Since these barcode sequences have a length of 8 and the default maximum error rate is 10%, Cutadapt would by default not allow any errors when matching them (a single error would result in an error rate of 1/8=12.5%). We therefore use ``-e 1`` to allow one error. Here is the final command::

    cutadapt -e 1 -g ^file:barcodes.fasta -o "trimmed-{name}.fastq.gz" input.fastq.gz

Demultiplexing is also supported for paired-end data if you provide the ``{name}`` template in both output file names (``-o`` and ``-p``). Example::

    cutadapt -e 1 -g ^file:barcodes.fasta -o trimmed-{name}.1.fastq.gz -p trimmed-{name}.2.fastq.gz input.1.fastq.gz input.2.fastq.gz

Paired-end demultiplexing always uses the adapter matches of the *first* read to decide where a read should be written. If adapters for read 2 are given (``-A``/``-G``), they are detected and removed as normal, but these matches do not influence where the read pair is written. This is to ensure that read 1 and read 2 are always synchronized.
To demultiplex using a barcode that is located on read 2, you can "cheat" and swap the roles of R1 and R2 for both the input and output files::

    cutadapt -e 1 -g ^file:barcodes.fasta -o trimmed-{name}.2.fastq.gz -p trimmed-{name}.1.fastq.gz input.2.fastq.gz input.1.fastq.gz

If you do this in a script or pipeline, it may be a good idea to add a comment to clarify that this reversal of R1 and R2 is intended.

More advice on demultiplexing:

* You can use ``--untrimmed-output`` to change the name of the output file that receives the untrimmed reads (those in which no barcode could be found).
* Similarly, you can use ``--untrimmed-paired-output`` to change the name of the output file that receives the untrimmed R2 reads.
* If you want to demultiplex, but keep the barcode in the reads, use the option ``--action=none``.

.. _combinatorial-demultiplexing:

Demultiplexing paired-end reads with combinatorial dual indexes
---------------------------------------------------------------

`Illumina’s combinatorial dual indexing strategy `_ uses a set of indexed adapters on R1 and another one on R2. Unlike with unique dual indexes (UDIs), which are described on the same page, all combinations of indexes are possible.

For demultiplexing this type of data ("combinatorial demultiplexing"), it is necessary to write each read pair to an output file depending on the adapters found on R1 *and* R2.

Doing this is similar to normal demultiplexing as described above, but you need to use ``{name1}`` and ``{name2}`` in both output file name templates. For example::

    cutadapt \
        -e 0.15 --no-indels \
        -g ^file:barcodes_fwd.fasta \
        -G ^file:barcodes_rev.fasta \
        -o {name1}-{name2}.1.fastq.gz -p {name1}-{name2}.2.fastq.gz \
        input.1.fastq.gz input.2.fastq.gz

The ``{name1}`` will be replaced with the name of the best-matching R1 adapter and ``{name2}`` will be replaced with the name of the best-matching R2 adapter.

If there was no match of an R1 adapter, ``{name1}`` is set to "unknown", and if there is no match of an R2 adapter, ``{name2}`` is set to "unknown". To discard read pairs for which one or both adapters could not be found, use ``--discard-untrimmed``. The ``--untrimmed-output`` and ``--untrimmed-paired-output`` options cannot be used.

Read the :ref:`demultiplexing ` section for how to choose the error rate etc. Also, the tips below about how to speed up demultiplexing apply even with combinatorial demultiplexing.

When doing the above, you will end up with lots of files named ``first-second.x.fastq.gz``, where *first* is the name of the first indexed adapter, *second* is the name of the second indexed adapter, and *x* is 1 or 2. Each indexed adapter combination may correspond to a sample name, and you may want to name your files according to the sample name, not the name of the adapters. Cutadapt does not have built-in functionality to achieve this, but you can use an external tool such as ``mmv`` (“multiple move”). First, create a list of patterns in ``patterns.txt``::

    fwdindex1-revindex1.[12].fastq.gz sampleA.#1.fastq.gz
    fwdindex1-revindex2.[12].fastq.gz sampleB.#1.fastq.gz
    fwdindex1-revindex3.[12].fastq.gz sampleC.#1.fastq.gz
    fwdindex2-revindex1.[12].fastq.gz sampleD.#1.fastq.gz
    fwdindex2-revindex2.[12].fastq.gz sampleE.#1.fastq.gz
    ...

Here, *fwdindex1*/*revindex1* etc. are the names of indexes, and *sampleA* etc. are your sample names. Then rename all files at once with::

    mmv < patterns.txt

.. versionadded:: 2.4

.. _paired-adapters-dual-indices:
.. _unique-dual-indices:
Demultiplexing unique dual indices
----------------------------------

`Illumina’s unique dual indexing (UDI) scheme `_ (“non-redundant indexing”) uses 96 unique i5 indices and 96 unique i7 indices, which are only used in pairs. That is, the first i5 index is always used with the first i7 index and so on. To demultiplex this type of data, the :ref:`--pair-adapters option ` needs to be used. Example::

    cutadapt -j 8 -e 1 --no-indels --pair-adapters -g ^file:i5indices.fasta -G ^file:i7indices.fasta -o 'demultiplexed-{name}_R1.fastq.gz' -p 'demultiplexed-{name}_R2.fastq.gz' input.R1.fastq.gz input.R2.fastq.gz

.. note::
   If the adapters do not come in pairs, but all combinations are possible, use :ref:`combinatorial demultiplexing `.

.. _speed-up-demultiplexing:

Speeding up demultiplexing
--------------------------

Finding many adapters/barcodes simultaneously (which is what demultiplexing in Cutadapt is about) can be sped up tremendously by using the right options since Cutadapt will then be able to create an index of the barcode sequences instead of checking for each barcode separately.

Currently, the following conditions need to be met in order for index creation to be enabled:

* The barcodes/adapters must be anchored: For 5’ adapters, use ``-g ^ADAPTER`` or ``-g ^file:adapters.fasta``. For 3’ adapters, use ``-a ADAPTER$`` or ``-a file$:adapters.fasta``.
* The maximum error rate (``-e``) must be set such that at most 2 errors are allowed: Use ``-e 0``, ``-e 1`` or ``-e 2``.
* The barcodes/adapters must not contain IUPAC wildcards. Also, you cannot use the option ``--match-read-wildcards``.

An index will be built for all the adapters that fulfill these criteria if there are at least two of them. You can provide additional adapters/barcodes, and they will just not be included in the index. Whether an index is created or not should not affect the results, only how fast you get them.

To see whether an index is created, look for a message like this in the first few lines of Cutadapt’s output::

    Building index of 23 adapters ...
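Putting these conditions together, a command like the following (repeating the demultiplexing example from above, with made-up file names) is eligible for index creation: the adapters are anchored 5' adapters, at most one error is allowed, and no wildcard options are used::

    cutadapt -e 1 -g ^file:barcodes.fasta -o "trimmed-{name}.fastq.gz" input.fastq.gz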
Hopefully some of the above restrictions will be lifted in the future.

.. versionadded:: 1.15
   Demultiplexing of paired-end data.

.. versionadded:: 2.0
   Added ability to use an index of adapters for speeding up demultiplexing

.. versionadded:: 3.0
   An index can be built even when indels are allowed (that is, ``--no-indels`` is no longer required).

Demultiplexing paired-end reads in mixed orientation
----------------------------------------------------

For some protocols, the barcode will be located either on R1 or on R2 depending on the orientation in which the DNA fragment was sequenced. For example, the read layout could be either this ::

    R1: barcode-forwardprimer-sequence
    R2: reverseprimer-sequence

or this ::

    R1: reverseprimer-sequence
    R2: barcode-forwardprimer-sequence

To demultiplex such data, use :ref:`the --revcomp option `. When used with paired-end reads, Cutadapt searches both R1/R2 as given, but then swaps R1 and R2 and searches that as well. It then keeps the swapped or unswapped version depending on where a barcode could be found. Example::

    cutadapt --revcomp \
        -g ^file:barcodes.fasta \
        -o demultiplexed-{name}.R1.fastq.gz \
        -p demultiplexed-{name}.R2.fastq.gz \
        R1.fastq.gz R2.fastq.gz

Option ``--revcomp`` is only supported starting with Cutadapt 4.6. For earlier versions, the following instructions can be used. The idea is to run Cutadapt twice, once with R1 and R2 as normal and then with R1 and R2 swapped.

For Cutadapt versions before 4.6, choose one of the orientations first and demultiplex the reads as if only that existed in the data, using a command like this::

    cutadapt -g ^file:barcodes.fasta \
        -o round1-{name}.R1.fastq.gz \
        -p round1-{name}.R2.fastq.gz \
        R1.fastq.gz R2.fastq.gz

Then all the read pairs in which no barcode could be found will end up in ``round1-unknown.R1.fastq.gz`` and ``round1-unknown.R2.fastq.gz``. This will also include the pairs in which the barcode was not actually in R1, but in R2. To demultiplex these reads as well, run Cutadapt a second time with those “unknown” files as input, but also reverse the roles of R1 and R2::

    cutadapt -g ^file:barcodes.fasta \
        -o round2-{name}.R2.fastq.gz \
        -p round2-{name}.R1.fastq.gz \
        round1-unknown.R2.fastq.gz round1-unknown.R1.fastq.gz

.. _truseq:

Illumina TruSeq
===============

Illumina makes their adapter sequences available in the `Illumina Adapter Sequences Document `_. As an example for how to use that information with Cutadapt, we show how to trim TruSeq adapters.

The document gives the adapter sequence for read 1 as ``AGATCGGAAGAGCACACGTCTGAACTCCAGTCA`` and for read 2 as ``AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT``. When using Cutadapt, this means you should trim your paired-end data as follows::

    cutadapt \
        -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA \
        -A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT \
        -o trimmed.R1.fastq.gz -p trimmed.R2.fastq.gz \
        reads.R1.fastq.gz reads.R2.fastq.gz

See also the :ref:`section about paired-end adapter trimming above `.

Keep in mind that Cutadapt removes the adapter that it finds and also the sequence following it, so even if the actual adapter sequence that is used in a protocol is longer than that (and possibly contains a variable index), it is sufficient to specify a prefix of the sequence(s).
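For single-end data, only the read 1 adapter is needed; a corresponding sketch (file names made up)::

    cutadapt -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA -o trimmed.fastq.gz reads.fastq.gz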
.. note::
   Previous versions of this document also recommended using ``AGATCGGAAGAGC`` as adapter sequence for both read 1 and read 2, but you should avoid doing so as that sequence occurs multiple times in the human genome.

To understand the structure of Illumina libraries and what the i5, i7, P5, P7 sequences are, see `this overview `_.

Some older information is also available in the document `Illumina TruSeq Adapters De-Mystified `_, but keep in mind that it does not cover newer protocols.

Under some circumstances, you may want to consider not trimming adapters at all. For example, a good library prepared for exome, genome or transcriptome sequencing should contain very few reads with adapters anyway. Also, some read mapping programs including BWA-MEM and STAR will soft-clip bases at the 3' ends of reads that do not match the reference, which will take care of adapters implicitly.

.. _warnbase:

Warning about incomplete adapter sequences
------------------------------------------

Sometimes Cutadapt’s report ends with these lines::

    WARNING: One or more of your adapter sequences may be incomplete.
    Please see the detailed output above.

Further up, you’ll see a message like this::

    Bases preceding removed adapters:
      A: 95.5%
      C: 1.0%
      G: 1.6%
      T: 1.6%
      none/other: 0.3%
    WARNING:
      The adapter is preceded by "A" extremely often.
      The provided adapter sequence may be incomplete.
      To fix the problem, add "A" to the beginning of the adapter sequence.

This means that in 95.5% of the cases in which an adapter was removed from a read, the base coming *before* that was an ``A``. If your DNA fragments are not random, such as in amplicon sequencing, then this is to be expected and the warning can be ignored. If the DNA fragments are supposed to be random, then the message may be genuine: The adapter sequence may be incomplete and should include an additional ``A`` in the beginning.

This warning exists because some documents list the Illumina TruSeq adapters as starting with ``GATCGGA...``. While that is technically correct, the library preparation actually results in an additional ``A`` before that sequence, which also needs to be removed. See the :ref:`previous section ` for the correct sequence.

.. _n-bases:
.. _dealing-with-ns:

Dealing with ``N`` bases
========================

Cutadapt supports the following options to deal with ``N`` bases in your reads:

``--max-n COUNT``
    Discard reads containing more than *COUNT* ``N`` bases. A fractional *COUNT* between 0 and 1 can also be given and will be treated as the maximum allowed proportion of ``N`` bases in the read. For example, ``--max-n 0`` removes all reads that contain any ``N`` bases.

``--trim-n``
    Remove flanking ``N`` bases from each read. That is, a read such as this::

        NNACGTACGTNNNN

    is trimmed to just ``ACGTACGT``. This option is applied *after* adapter trimming. If you want to get rid of ``N`` bases before adapter removal, use quality trimming: ``N`` bases typically also have a low quality value associated with them.

.. _cutadapt-s-output:

Cutadapt's output
=================

Reporting
---------

Cutadapt will by default print a full report after it has finished processing the reads. To suppress all output except error messages, use the option ``--quiet``.

The report type can be changed to a one-line summary with the option ``--report=minimal``. The output will be a tab-separated table (tsv) with one header row and one row of content. Here is an example::

    $ cutadapt --report=minimal -a ... -m 20 -q 10 -o ... -p ... in.[12].fastq.gz
    status in_reads in_bp too_short too_long too_many_n out_reads w/adapters qualtrim_bp out_bp w/adapters2 qualtrim2_bp out2_bp
    OK 1000000 202000000 24827 0 0 975173 28968 1674222 97441426 0 0 98492473

This is the meaning of each column:

=============== ==========================================================
Column heading  Explanation
=============== ==========================================================
status          Incomplete adapter warning (``OK`` or ``WARN``)
in_reads        Number of processed reads (read pairs for paired-end)
in_bp           Number of processed basepairs
too_short       Number of reads/read pairs that were too short
too_long        Number of reads/read pairs that were too long
too_many_n      Number of reads/read pairs that contained too many ``N``
out_reads       Number of reads written
w/adapters      Number of reads containing at least one adapter
qualtrim_bp     Number of bases removed from R1 reads by quality trimming
out_bp          Number of bases written to R1 reads
w/adapters2     Number of R2 reads containing at least one adapter
qualtrim2_bp    Number of bases removed from R2 reads by quality trimming
out2_bp         Number of bases written
=============== ==========================================================

The last three fields are omitted for single-end data.

.. versionadded:: 1.18

How to read the report
----------------------

After every run, Cutadapt prints out per-adapter statistics. The output starts with something like this::

    Sequence: 'ACGTACGTACGTTAGCTAGC'; Length: 20; Trimmed: 2402 times.

If option ``--revcomp`` was used, this line will additionally contain something like ``Reverse-complemented: 984 times``. This indicates how many of the 2402 total matches were found on the reverse complement of the read.
The next piece of information is this::

    No. of allowed errors:
    0-7 bp: 0; 8-15 bp: 1; 16-20 bp: 2

The adapter, as was shown above, has a length of 20 characters. We are using a custom error rate of 0.125. What this implies is shown above: Matches up to a length of 7 bp are allowed to have no errors, matches of lengths 8-15 bp are allowed to have 1 error, and matches of length 16 or more can have 2 errors. See also :ref:`the section about error-tolerant matching `.

Finally, a table is output that gives more detailed information about the lengths of the removed sequences. The following is only an excerpt; some rows are left out::

    Overview of removed sequences
    length  count   expect  max.err error counts
    3       140     156.2   0       140
    4       57      39.1    0       57
    5       50      9.8     0       50
    6       35      2.4     0       35
    7       13      0.3     0       1 12
    8       31      0.1     1       0 31
    ...
    100     397     0.0     3       358 36 3

The first row tells us the following: Three bases were removed in 140 reads; randomly, one would expect this to occur 156.2 times; the maximum number of errors at that match length is 0 (this is actually redundant since we know already that no errors are allowed at lengths 0-7 bp).

The last column shows the number of reads that had 0, 1, 2 ... errors. In the last row, for example, 358 reads matched the adapter with zero errors, 36 with 1 error, and 3 matched with 2 errors.

The row for length 7 shows an apparent anomaly: the max.err column is 0, yet 12 reads matched with 1 error. This is because these matches are actually contributed by alignments to the first 8 bases of the adapter that contain one deletion, so 7 bases are removed from the read, but the error cut-off that is applied is the one for length 8.

The "expect" column gives only a rough estimate of the number of sequences that is expected to match randomly, but it can help to estimate whether the matches that were found are true adapter matches or if they are due to chance. At length 6, for example, only 2.4 reads are expected, but 35 do match, which hints that most of these matches are due to actual adapters.

For slightly more accurate estimates, you can provide the correct GC content (as a percentage) of your reads with the option ``--gc-content``. The default is ``--gc-content=50``.

Note that the "length" column refers to the length of the removed sequence. That is, the actual length of the match in the above row at length 100 is 20 since that is the adapter length. Assuming the read length is 100, the adapter was found in the beginning of 397 reads and therefore those reads were trimmed to a length of zero.

The table may also be useful in case the given adapter sequence contains an error. In that case, it may look like this::

    ...
    length  count   expect  max.err error counts
    10      53      0.0     1       51 2
    11      45      0.0     1       42 3
    12      51      0.0     1       48 3
    13      39      0.0     1       0 39
    14      40      0.0     1       0 40
    15      36      0.0     1       0 36
    ...

We can see that no matches longer than 12 have zero errors. In this case, this indicates that the 13th base of the given adapter sequence is incorrect.

JSON report
-----------

With ``--json=filename.cutadapt.json``, a report in JSON format is written to the given file. We strongly recommend that you use the ``.cutadapt.json`` file name extension for this file for easier discoverability by log-parsing tools such as `MultiQC `_.

See the :ref:`description of the JSON report file format `.

.. versionadded:: 3.5

.. _info-file:

Info file
---------

When the ``--info-file=info.tsv`` command-line parameter is given, detailed information about where adapters were found in each read is written to the given text file as tab-separated values. See the :ref:`description of the info file format `.
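Both machine-readable outputs can be requested in the same run (a sketch; adapter sequence and file names made up)::

    cutadapt -a ADAPTER --json=run.cutadapt.json --info-file=info.tsv -o output.fastq input.fastq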
See the :ref:`description of the info file format `. cutadapt-4.7/doc/ideas.rst000066400000000000000000000023431457457704700156030ustar00rootroot00000000000000Ideas/To Do ----------- This is a rather unsorted list of features that would be nice to have, of things that could be improved in the source code, and of possible algorithmic improvements. - show average error rate - length histogram - ``--detect`` prints out best guess which of the given adapters is the correct one - warn when given adapter sequence contains non-IUPAC characters Specifying adapters ~~~~~~~~~~~~~~~~~~~ Allow something such as ``-a ADAP$TER`` or ``-a ADAPTER$NNN``. This would be a way to specify less strict anchoring. Allow ``N{3,10}`` as in regular expressions (for a variable-length sequence). Use parentheses to specify the part of the sequence that should be kept: * ``-a (...)ADAPTER`` (default) * ``-a (...ADAPTER)`` (default) * ``-a ADAPTER(...)`` (default) * ``-a (ADAPTER...)`` (??) Or, specify the part that should be removed: ``-a ...(ADAPTER...)`` ``-a ...ADAPTER(...)`` ``-a (ADAPTER)...`` Available letters for command-line options ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * Lowercase letters: i, k, s, w * Uppercase letters: C, D, E, F, H, I, J, K, L, P, R, S, T, V, W * Deprecated, could be re-used: c, d, t * Planned/reserved: Q (paired-end quality trimming), V (alias for --version) cutadapt-4.7/doc/index.rst000066400000000000000000000003301457457704700156270ustar00rootroot00000000000000.. include:: ../README.rst .. toctree:: :maxdepth: 2 :hidden: /installation /guide /reference /recipes /algorithms /develop /changes Source code cutadapt-4.7/doc/installation.rst000066400000000000000000000157231457457704700172250ustar00rootroot00000000000000============ Installation ============ Because Cutadapt development happens on Linux, this is the best supported platform, but it should also run on macOS and Windows. Installation with Conda ----------------------- Cutadapt is available as a Conda package from the `Bioconda channel `_. 1. Install Conda. For example, by `installing miniforge `_. 2. Configure the Bioconda channel by following the `Bioconda setup instructions `_. In short:: conda config --add channels bioconda conda config --add channels conda-forge conda config --set channel_priority strict (The Bioconda instructions mention the ``defaults`` channel, but it is not needed.) 3. Install Cutadapt into a new Conda environment:: conda create -n cutadapt cutadapt The first ``cutadapt`` in this command is the name of the Conda environment. You can choose a different name. If you are on macOS and your machine uses an M1/M2 processor (Apple Silicon), you may need to run this command instead:: CONDA_SUBDIR=osx-64 conda create -n cutadapt cutadapt (If you have problems, see `this issue for troubleshooting `_.) 4. Activate the Conda environment. This needs to be done every time you open a new shell in order to be able to use Cutadapt:: conda activate cutadapt 5. Finally, check whether the installation was successful:: cutadapt --version This should show the Cutadapt version number. Installation with pipx ---------------------- This works on Ubuntu 20.04 and later:: sudo apt install pipx python3-venv pipx install cutadapt cutadapt --version Installation with pip --------------------- Ensure you have virtualenv installed.
On Ubuntu/Debian:: sudo apt install python3-virtualenv Create a new virtual environment and install Cutadapt into it:: virtualenv cutadapt-venv cutadapt-venv/bin/pip install --upgrade pip cutadapt-venv/bin/pip install cutadapt Cutadapt is now available as ``cutadapt-venv/bin/cutadapt``:: cutadapt-venv/bin/cutadapt --version Optionally, you can *activate* the virtual environment, which allows you to just type ``cutadapt`` without the full path:: source cutadapt-venv/bin/activate cutadapt --version Activation must be re-done whenever you open a new shell/terminal window. Installation on Debian/Ubuntu ----------------------------- Cutadapt is also included in Debian-based Linux distributions, such as Ubuntu. Simply use your favorite package manager to install Cutadapt. On the command-line, this should work :: sudo apt install cutadapt or possibly :: sudo apt install python3-cutadapt Please be aware that distribution packages are very likely to be outdated. If you encounter unexpected behavior or need newer features, please use one of the other installation methods to get an up-to-date version before reporting bugs. Installation on Windows ----------------------- For some releases of Cutadapt, a single-file executable (``cutadapt.exe``) is made available on the `GitHub releases page `_. Try that first, and if it does not work for you, please report the issue. To install Cutadapt manually, keep reading. There is no Bioconda package for Windows because Bioconda does not produce Windows packages. To install Cutadapt, you can use ``pip``, but because Cutadapt contains components that need to be compiled, you also need to install a compiler. 1. Download a recent version (at least 3.7) of Python for Windows from and install it. 2. Download and install “Build Tools for Visual Studio 2019” from . (There are many similarly named downloads on that page, ensure you get the right one.) During installation, when the dialog about which components to install pops up, ensure that “C++ Build tools” is ticked. The download is quite big and can take a long time. 3. Open the command line (``cmd.exe``) and run ``py -m pip install cutadapt``. 4. Test whether it worked by running ``py -m cutadapt --version``. You should see the version number of Cutadapt. When running Cutadapt this way, you will need to remember to write ``py -m cutadapt`` instead of just ``cutadapt``. Shared installation (on a cluster) ---------------------------------- If you have a larger installation and want to provide Cutadapt as a module that can be loaded and unloaded (with the Lmod system, for example), we recommend that you create a virtual environment and 'pip install' Cutadapt into it. These instructions work on a SLURM cluster that uses the Lmod system (replace ``3.1`` with the actual version you want to use):: BASE=/software/cutadapt-3.1 virtualenv $BASE/venv $BASE/venv/bin/pip install cutadapt==3.1 mkdir $BASE/bin cd $BASE/bin ln -s ../venv/bin/cutadapt Then add the directory ``$BASE/bin/`` to the ``$PATH`` when a user loads the module, somewhat like this (this is for the Lmod system):: conflict("cutadapt") whatis("adapter trimming tool") prepend_path("PATH", "/software/cutadapt-3.1/bin") Make sure that you **do not** add ``$BASE/venv/bin/`` to the ``$PATH``! Otherwise, a user trying to run ``python`` who also has the cutadapt module loaded would get the python from the virtual environment, which leads to confusing error messages. The ``$BASE/bin/`` directory only contains the ``cutadapt`` script and nothing else, avoiding this problem.
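For reference, the commands above should produce a layout roughly like this (only the relevant entries are shown; the virtual environment contains further files)::

    /software/cutadapt-3.1/
    ├── bin/
    │   └── cutadapt -> ../venv/bin/cutadapt
    └── venv/
        └── bin/
            ├── cutadapt
            ├── pip
            └── python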
Please note that there is no need to “activate” virtual environments. Installing the development version ---------------------------------- We recommend that you install Cutadapt into a so-called virtual environment if you decide to use the development version. The virtual environment is a single directory that contains everything needed to run the software. Nothing else on your system is changed, so you can uninstall this particular version of Cutadapt by just removing the directory with the virtual environment. The following instructions work on Linux using Python 3. Make sure you have installed the ``python3-dev`` and ``build-essential`` packages on Ubuntu. First, choose where you want to place the directory with the virtual environment and what you want to call it. Let us assume you chose the path ``~/cutadapt-venv``. Then use these commands for the installation:: python3 -m venv ~/cutadapt-venv ~/cutadapt-venv/bin/python3 -m pip install --upgrade pip ~/cutadapt-venv/bin/pip install git+https://github.com/marcelm/cutadapt.git To run Cutadapt and see the version number, type :: ~/cutadapt-venv/bin/cutadapt --version The reported version number will be something like ``2.2.dev5+gf564208``. This means that you are now running the version of Cutadapt that will become 2.2, and that it contains 5 changes (*commits*) since the previous release (2.1 in this case). cutadapt-4.7/doc/recipes.rst000066400000000000000000000332701457457704700161530ustar00rootroot00000000000000======= Recipes ======= This section contains short how-to guides for doing certain tasks. Remove more than one adapter ---------------------------- If you want to remove a 5' and 3' adapter at the same time, :ref:`use the support for linked adapters `. If your situation is different, for example, when you have many 5' adapters but only one 3' adapter, then you have two options. First, you can specify the adapters and also ``--times=2`` (or the short version ``-n 2``). For example:: cutadapt -g ^TTAAGGCC -g ^AAGCTTA -a TACGGACT -n 2 -o output.fastq input.fastq This instructs Cutadapt to run two rounds of adapter finding and removal. That means that, after the first round and only when an adapter was actually found, another round is performed. In both rounds, all given adapters are searched and removed. The problem is that it could happen that one adapter is found twice (so the 3' adapter, for example, could be removed twice). The second option is to not use the ``-n`` option, but to run Cutadapt twice, first removing one adapter and then the other. It is easiest if you use a pipe as in this example:: cutadapt -g ^TTAAGGCC -g ^AAGCTTA input.fastq | cutadapt -a TACGGACT - > output.fastq Separating trimmed and untrimmed reads -------------------------------------- To send trimmed and untrimmed reads to separate output files, use the ``--untrimmed-output`` option:: cutadapt -a TTAAGGCC --untrimmed-output=untrimmed.fastq.gz -o trimmed.fastq.gz input.fastq.gz For paired-end data, use also the ``--untrimmed-paired-output`` option:: cutadapt \ -g ^AAGGCC \ -G ^TTGGAA \ --untrimmed-output=untrimmed.1.fastq.gz \ --untrimmed-paired-output=untrimmed.2.fastq.gz \ -o trimmed.1.fastq.gz \ -p trimmed.2.fastq.gz \ input.1.fastq.gz \ input.2.fastq.gz Trim poly-A tails ----------------- Use ``--poly-a``, see :ref:`poly-A trimming `. In versions of Cutadapt earlier than 4.4, the recommendation was to use ``-a "A{100}"`` for poly-A trimming, but the ``--poly-a`` option is more accurate and much faster.
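To illustrate the effect of poly-A trimming, here is a deliberately simplistic Python sketch. It is not Cutadapt's actual algorithm (the built-in ``--poly-a`` option is more sophisticated); it merely strips a perfect terminal run of ``A`` bases::

    def trim_poly_a(sequence: str) -> str:
        # Remove a trailing run of A bases from a read sequence.
        return sequence.rstrip("A")

    assert trim_poly_a("ACGTACGTAAAAAAAA") == "ACGTACGT"

For real data, prefer the built-in ``--poly-a`` option.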
Trim a fixed number of bases preceding each adapter --------------------------------------------------- If the adapters you want to remove are preceded by some unknown sequence (such as a random tag/molecular identifier), you can specify this as part of the adapter sequence in order to remove both in one go. For example, assume you want to trim Illumina adapters preceded by 10 bases that you want to trim as well. Instead of this command:: cutadapt -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC ... Use this command:: cutadapt -O 13 -a N{10}AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC ... The ``-O 13`` is the minimum overlap for an adapter match, where the 13 is computed as 3 plus 10 (where 3 is the default minimum overlap and 10 is the length of the unknown section). If you do not specify it, the adapter sequence would match the end of every read (because ``N`` matches anything), and ten bases would then be removed from every read. Trimming (amplicon-) primers from paired-end reads -------------------------------------------------- If the reads are shorter than the amplicon, use :: cutadapt -g ^FWDPRIMER -G ^REVPRIMER --discard-untrimmed -o out.1.fastq.gz -p out.2.fastq.gz in.1.fastq.gz in.2.fastq.gz If the reads can be longer than the amplicon, use a :ref:`"linked adapter" `:: cutadapt -a ^FWDPRIMER...RCREVPRIMER -A ^REVPRIMER...RCFWDPRIMER --discard-untrimmed -o out.1.fastq.gz -p out.2.fastq.gz in.1.fastq.gz in.2.fastq.gz You need to insert your own sequences as follows. The three dots ``...`` need to be written as they are. FWDPRIMER Sequence of the forward primer REVPRIMER Sequence of the reverse primer RCFWDPRIMER Reverse-complemented sequence of the forward primer RCREVPRIMER Reverse-complemented sequence of the reverse primer Explanation ~~~~~~~~~~~ The full DNA fragment that is put on the sequencer looks like this (looking only at the forward strand): 5' sequencing primer -- forward primer -- sequence of interest -- reverse complement of reverse primer -- reverse complement of 3' sequencing primer Since sequencing of R1 starts after the 5' sequencing primer, R1 will start with the forward primer and then continue into the sequence of interest and possibly into the two primers to the right of it, depending on the read length and how long the sequence of interest is. If the reads are sufficiently short, R1 will not extend into the reverse primer, and R2 will not extend into the forward primer. In that case, only the forward primer on R1 and the reverse primer on R2 need to be removed:: -g ^FWDPRIMER -G ^REVPRIMER --discard-untrimmed If the reads are so long that they can possibly extend into the primer on the respective other side, :ref:`linked adapters ` for both R1 and R2 can be used. For R1:: -a ^FWDPRIMER...RCREVPRIMER Sequencing of R2 starts before the 3' sequencing primer and proceeds along the reverse-complementary strand. For the correct linked adapter, the sequences from above therefore need to be swapped and reverse-complemented:: -A ^REVPRIMER...RCFWDPRIMER The uppercase ``-A`` specifies that this option is meant to work on R2. Cutadapt does not reverse-complement any sequences of its own; you will have to do that yourself. Finally, you may want to filter the trimmed read pairs. Option ``--discard-untrimmed`` throws away all read pairs in which R1 doesn’t start with ``FWDPRIMER`` or in which R2 does not start with ``REVPRIMER``. A note on how the filtering works: In linked adapters, by default the first part (before the ``...``) is anchored. Anchored sequences *must* occur. 
If they don’t, then the other sequence (after the ``...``) is not even searched for and the entire read is internally marked as “untrimmed”. This is done for both R1 and R2 and as soon as any of them is marked as “untrimmed”, the entire pair is considered to be “untrimmed”. If ``--discard-untrimmed`` is used, this means that the entire pair is discarded if R1 or R2 are untrimmed. (Option ``--pair-filter=both`` can be used to change this to require that *both* were marked as untrimmed.) Piping paired-end data ---------------------- Sometimes it is necessary to run Cutadapt twice on your data. For example, when you want to change the order in which read modification or filtering options are applied. To simplify this, you can use Unix pipes (``|``), but this is more difficult with paired-end data since then input and output consists of two files each. The solution is to interleave the paired-end data, send it over the pipe and then de-interleave it in the other process. Here is how this looks in principle:: cutadapt [options] --interleaved in.1.fastq.gz in.2.fastq.gz | \ cutadapt [options] --interleaved -o out.1.fastq.gz -p out.2.fastq.gz - Note the ``-`` character in the second invocation to Cutadapt. Support for concatenated compressed files ----------------------------------------- Cutadapt supports concatenated gzip and bzip2 input files. Check whether a FASTQ file is properly formatted ------------------------------------------------ cutadapt -o /dev/null input.fastq Any problems with the FASTQ file will be detected and reported. Check whether FASTQ files are properly paired --------------------------------------------- cutadapt -o /dev/null -p /dev/null input.R1.fastq input.R2.fastq Any problems with the individual FASTQ files or improperly paired reads (mismatching read ids) will be detected and reported. See :ref:`the requirements for properly paired reads `. Rescuing single reads from paired-end reads that were filtered -------------------------------------------------------------- When trimming and filtering paired-end reads, Cutadapt always discards entire read pairs. If you want to keep one of the reads, you need to write the filtered read pairs to an output file and postprocess it. For example, assume you are using ``-m 30`` to discard too short reads. Cutadapt discards all read pairs in which just one of the reads is too short (but see the ``--pair-filter`` option). To recover those (individual) reads that are long enough, you can first use the ``--too-short-(paired)-output`` options to write the filtered pairs to a file, and then postprocess those files to keep only the long enough reads. cutadapt -m 30 -q 20 -o out.1.fastq.gz -p out.2.fastq.gz --too-short-output=tooshort.1.fastq.gz --too-short-paired-output=tooshort.2.fastq.gz in.1.fastq.gz in.2.fastq.gz cutadapt -m 30 -o rescued.a.fastq.gz tooshort.1.fastq.gz cutadapt -m 30 -o rescued.b.fastq.gz tooshort.2.fastq.gz The two output files ``rescued.a.fastq.gz`` and ``rescued.b.fastq.gz`` contain those individual reads that are long enough. Note that the file names do not end in ``.1.fastq.gz`` and ``.2.fastq.gz`` to make it very clear that these files no longer contain synchronized paired-end reads. .. _bisulfite: Bisulfite sequencing (RRBS) --------------------------- When trimming reads that come from a library prepared with the RRBS (reduced representation bisulfite sequencing) protocol, the last two 3' bases must be removed in addition to the adapter itself. 
This can be achieved by using not the adapter sequence itself, but by adding two wildcard characters to its beginning. If the adapter sequence is ``ADAPTER``, the command for trimming should be:: cutadapt -a NNADAPTER -o output.fastq input.fastq Details can be found in `Babraham bioinformatics' "Brief guide to RRBS" `_. A summary follows. During RRBS library preparation, DNA is digested with the restriction enzyme MspI, generating a two-base overhang on the 5' end (``CG``). MspI recognizes the sequence ``CCGG`` and cuts between ``C`` and ``CGG``. A double-stranded DNA fragment is cut in this way:: 5'-NNNC|CGGNNN-3' 3'-NNNGGC|CNNN-5' The fragment between two MspI restriction sites looks like this:: 5'-CGGNNN...NNNC-3' 3'-CNNN...NNNGGC-5' Before sequencing (or PCR) adapters can be ligated, the missing base positions must be filled in with GTP and CTP:: 5'-ADAPTER-CGGNNN...NNNCcg-ADAPTER-3' 3'-ADAPTER-gcCNNN...NNNGGC-ADAPTER-5' The filled-in bases, marked in lowercase above, do not contain any original methylation information, and must therefore not be used for methylation calling. By prefixing the adapter sequence with ``NN``, the bases will be automatically stripped during adapter trimming. .. _file-format-conversion: Convert FASTQ to FASTA ---------------------- Cutadapt detects the output format from the output file name extension. Convert FASTQ to FASTA format:: cutadapt -o output.fasta.gz input.fastq.gz Cutadapt detects FASTA output and omits the qualities. .. _force-fasta: If output is written to standard output, no output file name is available, so the same format as the input is used. To force FASTA output even in this case, use the ``--fasta`` option:: cutadapt --fasta input.fastq.gz > out.fasta Trim qualities -------------- Quality-trim 3' ends:: cutadapt -q 20 -o output.fastq.gz input.fastq.gz .. _json-jq: Extract information from the JSON report with ``jq`` ---------------------------------------------------- The :ref:`JSON report ` that is written when using the ``--json`` option can be read by `jq `_. Get the number of reads (or read pairs) written:: jq '.read_counts.output' mysample.cutadapt.json Get the percentage of reads that contain an adapter:: jq '.read_counts.read1_with_adapter / .read_counts.input * 100' mysample.cutadapt.json Get how often the first adapter was found:: jq '.adapters_read1[0].total_matches' mysample.cutadapt.json Quickly test how Cutadapt trims a single sequence ------------------------------------------------- Use ``echo`` to write the sequence in FASTA format, and run Cutadapt with ``--quiet``:: echo -e ">r\nAACCGGTT" | cutadapt --quiet -a CCGGTTGGAA - Output:: >r AA .. _many-samples: Processing many samples (with a for loop) ----------------------------------------- Cutadapt can only process one set of input reads or read pairs at a time. If you have multiple inputs, possibly corresponding to multiple samples, Cutadapt needs to be run once for each input. Instead of typing in each command, this can be done with a ``for`` loop in the shell. Single-end reads ~~~~~~~~~~~~~~~~ We start with the simpler case of processing many single-end files. Let’s say these are the input files:: Sample1_L001_R1_001.fastq.gz Sample2_L001_R1_001.fastq.gz Sample3_L001_R1_001.fastq.gz ... Then this loop will run ``cutadapt`` on each file and produce output files named ``trimmed-Sample1_L001_R1_001.fastq.gz`` etc.:: for f in *.fastq.gz; do cutadapt ... -o trimmed-${f} ${f} done Of course the ``...`` need to be replaced with the trimming options you want to use. 
You can write this on one line as well:: for f in *.fastq.gz; do cutadapt ... -o trimmed-${f} ${f}; done Paired-end reads ~~~~~~~~~~~~~~~~ Let us assume the input consists of paired-end files named like this:: Sample1_L001_R1_001.fastq.gz Sample1_L001_R2_001.fastq.gz Sample2_L001_R1_001.fastq.gz Sample2_L001_R2_001.fastq.gz Sample3_L001_R1_001.fastq.gz Sample3_L001_R2_001.fastq.gz ... Then this loop can be used to trim each file pair:: for r1 in *_R1_*.fastq.gz; do r2=${r1/_R1_/_R2_} cutadapt ... -o trimmed-${r1} -p trimmed-${r2} ${r1} ${r2} done Here, the trick is to loop only over the R1 files to ensure we run Cutadapt only once for each pair. In the ``r2=...`` line, we re-create the R2 file name from the R1 file name by replacing ``_R1_`` with ``_R2_``. cutadapt-4.7/doc/reference.rst000066400000000000000000000761401457457704700164620ustar00rootroot00000000000000=============== Reference guide =============== Command-line options ==================== General options --------------- ``-h``, ``--help`` Show help ``--version`` Show version number and exit ``--debug`` Print debug log. Use twice to also print the dynamic programming matrices computed when aligning an adapter against a read. This is highly verbose, it is recommended to use this only for a single read. ``-j CORES``, ``--cores CORES`` (default: 1) Run on the :ref:`given number of CPU cores `. Use 0 to auto-detect the number of available cores. Adapter-finding options ----------------------- ``-a ADAPTER``, ``--adapter ADAPTER`` Specification of a :ref:`3' adapter ` or a :ref:`linked adapter `. ``-g ADAPTER``, ``--front ADAPTER`` Specification of a :ref:`5' adapter ` or a :ref:`linked adapter `. ``-b ADAPTER``, ``--anywhere ADAPTER`` Specification of an adapter that can be :ref:`5' or 3' ("anywhere") `. ``-e E``, ``--error-rate E``, ``--errors E`` (default: 0.1) This sets the :ref:`error tolerance ` used when searching for adapters. If E is an integer >= 1, then E errors in a full-length adapter match are allowed. For each specified adapter, this is converted to a maximum allowed error rate. This allows proportionally fewer errors for shorter (partial) adapter matches. If E is a floating-point value with 0 <= E < 1, this sets the maximum allowed error rate directly. ``--no-indels`` (default: allow indels) Do not allow insertions and deletions when matching adapters against reads. ``-n COUNT``, ``--times COUNT`` (default: 1) Repeat the adapter finding and removal step up to COUNT times. :ref:`The default is to search for only one adapter in each read `. ``-O MINLENGTH``, ``--overlap MINLENGTH`` (default: 3) Set the :ref:`minimum overlap ` to MINLENGTH. ``--match-read-wildcards`` Interpret :ref:`IUPAC wildcards in reads ` (such as ``N``). ``-N``, ``--no-match-adapter-wildcards`` Do not interpret :ref:`IUPAC wildcards in adapters `. ``--action {trim,retain,mask,lowercase,none}`` (default: ``trim``) Specify what to do if an adapter match was found. ``trim``: Trim the adapter itself and up- or downstream sequence (depending on adapter type). ``retain``: Trim the up- or downstream sequence (depending on adapter type), but retain the adapter sequence itself. ``mask``: Replace the adapter sequence and up- or downstream sequence with 'N' characters ``lowercase``: Convert the adapter and up- or downstream sequence to lowercase. ``none``: Do not change the read. Found matches are still tracked and can be used for renaming the read or demultiplexing. 
``--rc``, ``--revcomp`` :ref:`Check both the read and its reverse complement for adapter matches `. If the reverse-complemented version yields a better match, output that one. For paired-end reads, the reverse complement is obtained by swapping R1 and R2. If the reverse-complemented version was chosen, the string `` rc`` is added to the read name. Additional read modifications ----------------------------- .. seealso:: :ref:`Read modification order ` ``-u LENGTH``, ``--cut LENGTH`` :ref:`Remove a fixed number of bases from each read `. If LENGTH is positive, remove bases from the beginning. If LENGTH is negative, remove bases from the end. Can be used twice if LENGTHs have different signs. This is applied *before* adapter trimming. ``-q [5'CUTOFF,]3'CUTOFF``, ``--quality-cutoff [5'CUTOFF,]3'CUTOFF`` :ref:`Trim low-quality bases ` from 5' and/or 3' ends of each read before adapter removal. This is applied to both reads if data is paired (use ``-Q`` to provide a different cutoff for R2). If one value is given, only the 3' end is trimmed. If two comma-separated cutoffs are given, the 5' end is trimmed with the first cutoff, the 3' end with the second. .. seealso:: :ref:`Description of the quality-trimming algorithm ` ``--nextseq-trim 3'CUTOFF`` :ref:`NextSeq-specific quality trimming ` that also trims dark cycles appearing as high-quality G bases. ``--quality-base N`` (default: 33) Assume that quality values in FASTQ files are encoded as ascii(quality + N). This needs to be set to 64 for some very old Illumina FASTQ files. ``--poly-a`` :ref:`Trim poly-A tails ` from R1 and poly-T heads from R2. ``--length LENGTH``, ``-l LENGTH`` Shorten reads to LENGTH, where LENGTH is an integer. Positive values remove bases at the end while negative ones remove bases at the beginning. ``--trim-n`` Trim N's from 5' and 3' ends of reads. See: :ref:`Dealing with N bases `. ``--length-tag TAG`` Search for TAG followed by a decimal number in the header of the FASTQ or FASTA record. Replace the decimal number with the correct length of the trimmed read. For example, use ``--length-tag 'length='`` to correct fields like 'length=123'. ``--strip-suffix SUFFIX`` Remove this suffix from read names if present. Can be given multiple times. ``-x PREFIX``, ``--prefix PREFIX`` Add this prefix to read names. Use ``{name}`` to insert the name of the matching adapter. Deprecated, use ``--rename`` instead. ``-y SUFFIX``, ``--suffix SUFFIX`` Add this suffix to read names. Use ``{name}`` to insert the name of the matching adapter. Deprecated, use ``--rename`` instead. ``--rename TEMPLATE`` :ref:`Rename reads ` using the TEMPLATE, which can contain placeholders such as ``{id}``, ``{adapter_name}`` etc. ``--zero-cap``, ``-z`` Change negative quality values to zero. Filtering of processed reads ---------------------------- Filters are applied after the above read modifications. Paired-end reads are always discarded pairwise (see also ``--pair-filter``). The default is to not apply any filters. ``-m LEN[:LEN2]``, ``--minimum-length LEN[:LEN2]`` Discard reads shorter than LEN. If LEN2 is given for paired-end data, it is applied to R2. ``-M LEN[:LEN2]``, ``--maximum-length LEN[:LEN2]`` Discard reads longer than LEN. If LEN2 is given for paired-end data, it is applied to R2. ``--max-n COUNT`` Discard reads with more than COUNT 'N' bases. If COUNT is a number between 0 and 1, it is interpreted as a fraction of the read length. See :ref:`Dealing with N bases `.
``--max-expected-errors E``, ``--max-ee E`` Discard reads whose :ref:`expected number of errors ` exceeds the value *E*. ``--discard-trimmed``, ``--discard`` Discard reads in which an adapter match was found. Use also ``-O`` to avoid discarding too many randomly matching reads. ``--discard-untrimmed``, ``--trimmed-only`` Discard reads in which no adapter match was found. ``--discard-casava`` Discard reads that did not pass CASAVA filtering (that is, the record header has ``:Y:``). Output ------ ``-o FILE``, ``--output FILE`` Write processed output to FILE (FASTA or FASTQ). :ref:`Compressed file formats are supported `. Including the special placeholder string ``{name}`` in the file name activates :ref:`demultiplexing`. Including ``{name1}`` and ``{name2}`` activates :ref:`combinatorial demultiplexing `. For paired-end data, this option is typically combined with ``-p``. If this option is omitted, :ref:`processed reads are written to standard output `. ``--quiet`` Print only error messages. ``--report {full,minimal}`` (default: full) Which type of report to print: 'full' or 'minimal'. ``--json FILE`` Write :ref:`a report in JSON format ` to FILE. ``--fasta`` :ref:`Force writing FASTA to standard output `. This option is usually not needed as FASTA output can be selected by using an appropriate output file name (``.fasta``, ``.fasta.gz`` etc.) with the ``-o`` option. However, when processing FASTQ files *and* when not using ``-o``, FASTQ format is written to standard output by default. Use this option to force FASTA even in such a case. ``-Z`` Use compression level 1 for gzipped output files. This is a shorthand for ``--compression-level=1``. See: :ref:`speed-up tricks ` ``--info-file FILE`` Write information about each read and its adapter matches to FILE. See: :ref:`Info file format `. ``-r FILE``, ``--rest-file FILE`` When the adapter matches in the middle of a read, write the "rest" to FILE. For 3' adapters, the "rest" is the part of the read after the adapter match. For 5' adapters, the "rest" is the part of the read before the adapter match. ``--wildcard-file FILE`` When the adapter has N wildcard bases, write adapter bases matching wildcard positions to FILE. This is unreliable unless you also use ``--no-indels``. Does not work with linked adapters. ``--too-short-output FILE`` Write reads that are too short (according to the length specified by ``-m``) to FILE. Default: discard too short reads ``--too-long-output FILE`` Write reads that are too long (according to the length specified by ``-M``) to FILE. Default: discard too long reads ``--untrimmed-output FILE`` Write reads that do not contain any adapter to FILE. Default: output to the same file as trimmed reads. Paired-end options ------------------ .. seealso:: :ref:`Trimming paired-end reads ` The ``-A``, ``-G``, ``-B``, ``-U``, ``-Q`` options work like their lowercase counterparts, but are applied to the second read in each pair (R2). ``-A ADAPTER`` 3' adapter to be removed from R2 ``-G ADAPTER`` 5' adapter to be removed from R2 ``-B ADAPTER`` 5'/3' adapter to be removed from R2 ``-U LENGTH`` Remove LENGTH bases from R2 ``-Q [5'CUTOFF,]3'CUTOFF`` Quality-trimming cutoff for R2. Default: same as for R1 .. -p FILE, --paired-output FILE Write R2 to FILE. --pair-adapters Treat adapters given with -a/-A etc. as pairs. Either both or none are removed from each read pair. --pair-filter {any,both,first} Which of the reads in a paired-end read have to match the filtering criterion in order for the pair to be filtered.
Default: any --interleaved Read and/or write interleaved paired-end reads. --untrimmed-paired-output FILE Write second read in a pair to this FILE when no adapter was found. Use with --untrimmed-output. Default: output to same file as trimmed reads --too-short-paired-output FILE Write second read in a pair to this file if pair is too short. --too-long-paired-output FILE Write second read in a pair to this file if pair is too long. (To Do: needs to be finished, see ``cutadapt --help`` for now) .. _json-report-format: JSON report format ================== The JSON report is generated if ``--json=filename.cutadapt.json`` is used. The file name extension must be ``.cutadapt.json`` for the file to be recognized by log-parsing tools such as `MultiQC `_. (However, at the time of writing, MultiQC does not support Cutadapt’s JSON report format.) See how to :ref:`extract information from the JSON report with jq `. Example ------- This example was reformatted to use less vertical space:: { "tag": "Cutadapt report", "schema_version": [0, 3], "cutadapt_version": "4.5", "python_version": "3.8.10", "command_line_arguments": [ "--json=out.cutadapt.json", "--poly-a", "-m", "20", "-a", "AACCGGTTACGTTGCA", "-q", "20", "--discard-trimmed", "-o", "out.fastq.gz", "reads.fastq"], "cores": 1, "input": { "path1": "reads.fastq", "path2": null, "paired": false, "interleaved": null }, "read_counts": { "input": 100000, "filtered": { "too_short": 251, "too_long": null, "too_many_n": null, "too_many_expected_errors": null, "casava_filtered": null, "discard_trimmed": 2061, "discard_untrimmed": null }, "output": 97688, "reverse_complemented": null, "read1_with_adapter": 2254, "read2_with_adapter": null }, "basepair_counts": { "input": 10100000, "input_read1": 10100000, "input_read2": null, "quality_trimmed": 842048, "quality_trimmed_read1": 842048, "quality_trimmed_read2": null, "poly_a_trimmed": 1028, "poly_a_trimmed_read1": 1028, "poly_a_trimmed_read2": null, "output": 9037053, "output_read1": 9037053, "output_read2": null }, "adapters_read1": [ { "name": "1", "total_matches": 2254, "on_reverse_complement": null, "linked": false, "five_prime_end": null, "three_prime_end": { "type": "regular_three_prime", "sequence": "AACCGGTTACGTTGCA", "error_rate": 0.1, "indels": true, "error_lengths": [6], "matches": 2254, "adjacent_bases": { "A": 473, "C": 1240, "G": 328, "T": 207, "": 6 }, "dominant_adjacent_base": null, "trimmed_lengths": [ {"len": 3, "expect": 1562.5, "counts": [1220]}, {"len": 4, "expect": 390.6, "counts": [319]}, {"len": 5, "expect": 97.7, "counts": [30]}, {"len": 6, "expect": 24.4, "counts": [4]}, {"len": 7, "expect": 24.4, "counts": [5]}, {"len": 8, "expect": 24.4, "counts": [7]}, {"len": 9, "expect": 24.4, "counts": [4]}, {"len": 10, "expect": 24.4, "counts": [7]}, {"len": 11, "expect": 24.4, "counts": [7]}, {"len": 12, "expect": 24.4, "counts": [6]}, {"len": 13, "expect": 24.4, "counts": [8, 2]}, {"len": 14, "expect": 24.4, "counts": [1, 1]}, {"len": 15, "expect": 24.4, "counts": [2, 0]}, {"len": 16, "expect": 24.4, "counts": [3, 1]} ] } } ], "adapters_read2": null, "poly_a_trimmed_read1": [ {"len": 23, "count": 10}, {"len": 42, "count": 19} ], "poly_a_trimmed_read2": null } Schema ------ Some concepts used in the JSON file: * Keys are always included. If a key is not applicable, its value is set to null. * Single-end data appears as "paired-end data without read 2". That is, values for read 1 are filled in and values for read 2 are set to null. The file defines the following keys.
For nested objects (dictionaries), a dot notation is used, as in "outer_key.inner_key". tag : string Always ``"Cutadapt report"``. A marker so that this can be recognized as a file produced by Cutadapt. schema_version : list of two integers Major and minor version of the schema. If additions are made to the schema, the minor version is increased. If backwards incompatible changes are made, the major version is increased. Example: ``[0, 1]`` cutadapt_version : str The version of Cutadapt that generated the report. Example: ``"4.4"`` python_version : str The Python version used to run Cutadapt. Example: ``"3.10"`` command_line_arguments : list of strings The command-line arguments for this invocation. Only for information, do not parse this. Example: ``["-a", "ACGT", "-o", "out.fastq", "input.fastq"]`` cores : int Number of cores used input : dictionary Input files input.path1 : str Path to the first input file. Example: ``"reads.1.fastq"`` input.path2 : str | null Path to the second input file if given, null otherwise. input.paired : bool True if input was paired-end reads, false if input was single-end reads. If this is true and input.path2 is null, input was interleaved. read_counts : dictionary Read count statistics read_counts.input : int Number of reads (for single-end data) or read pairs (for paired-end data) in the input. read_counts.filtered : dictionary Statistics about filtered reads. Keys of the dictionary correspond to a filter. If a filter was not used, its value is set to null. read_counts.filtered.too_short : int | null Number of reads or read pairs that were filtered because they were too short read_counts.filtered.too_long : int | null Number of reads or read pairs that were filtered because they were too long read_counts.filtered.too_many_n : int | null Number of reads or read pairs that were filtered because they had too many N bases read_counts.filtered.too_many_expected_errors : int | null Number of reads or read pairs that were filtered because they had too many expected errors read_counts.filtered.casava_filtered : int | null Number of reads or read pairs that were filtered because the CASAVA filter was ``Y`` read_counts.filtered.discard_trimmed : int | null Number of reads or read pairs that were filtered because at least one adapter match was found for them read_counts.filtered.discard_untrimmed : int | null Number of reads or read pairs that were filtered because no adapter match was found for them read_counts.output : int Number of reads written to the final output. This plus the sum of all filtered reads/read pairs will equal the number of input reads. read_counts.reverse_complemented : int | null If ``--revcomp`` was used, the number of reads or read pairs that were output reverse-complemented, null otherwise. read_counts.read1_with_adapter : int | null Number of R1 reads (or single-end reads) with at least one adapter match, null if no adapter trimming was done. read_counts.read2_with_adapter : int | null Number of R2 reads with at least one adapter match, null if input is single end or no adapter trimming was done. basepair_counts : dictionary Statistics about the number of basepairs. basepair_counts.input : int Total number of basepairs in the input. (The sum of the lengths of all input reads.) basepair_counts.input_read1 : int Number of basepairs in the input, read 1 only. basepair_counts.input_read2 : int | null If paired-end, number of basepairs in the input counting read 2 only, null otherwise.
basepair_counts.quality_trimmed : int | null Total number of basepairs removed due to quality trimming or null if no quality trimming was done. basepair_counts.quality_trimmed_read1 : int | null Number of basepairs removed from read 1 due to quality trimming or null if no quality trimming was done. basepair_counts.quality_trimmed_read2 : int | null Number of basepairs removed from read 2 due to quality trimming or null if no quality trimming was done or if input was single end. basepair_counts.poly_a_trimmed : int | null Total number of basepairs removed due to poly-A trimming or null if no poly-A trimming was done. basepair_counts.poly_a_trimmed_read1 : int | null Number of basepairs removed from read 1 due to poly-A trimming or null if no poly-A trimming was done. basepair_counts.poly_a_trimmed_read2 : int | null Number of basepairs removed from read 2 due to poly-T trimming or null if no poly-T trimming was done or if input was single end. basepair_counts.output : int Total number of basepairs in the final output. basepair_counts.output_read1 : int Number of basepairs written to the read 1 final output. basepair_counts.output_read2 : int | null Number of basepairs written to the read 2 final output. adapters_read1 : list of dictionaries A list with statistics about all adapters that were matched against read 1. The list is empty if no adapter trimming was done. The schema for the items in this list is described below. adapters_read2 : list of dictionaries | null A list with statistics about all adapters that were matched against read 2. The list is empty if no adapter trimming was done against R2. The value is set to null if the input was single-end reads. The schema for the items in this list is described below. poly_a_trimmed_read1 : list of dictionaries | null A histogram of the lengths of poly-A tails removed from read 1. Each item in the list is a dictionary with keys ``len`` and ``count``. This value is null if no poly-A trimming was done. poly_a_trimmed_read2 : list of dictionaries | null A histogram of the lengths of poly-T "heads" removed from read 2, see above. This value is null if no poly-A/poly-T trimming was done or the input was single-end reads. Adapter statistics ------------------ The statistics about each adapter (items in the adapters_read1 and adapters_read2 list) are dictionaries with the following keys. name : str The adapter name. If no adapter name was given, a name is automatically generated as "1", "2", "3" etc. total_matches : int Number of times this adapter was found on a read. If ``--times`` is used, multiple matches per read are possible. on_reverse_complement : int | null If ``--revcomp`` was used, the number of times the adapter was found on the reverse-complemented read, null otherwise. linked : bool Whether this is a linked adapter. If true, then both ``five_prime_end`` and ``three_prime_end`` (below) are filled in and describe the 5' and 3' components, respectively, of the linked adapter. five_prime_end : dictionary | null Statistics about matches of this adapter to the 5' end, that is, causing a prefix of the read to be removed. If the adapter is of type regular_five_prime, noninternal_five_prime or anchored_five_prime, all its matches are summarized here. If the adapter is a linked adapter (``linked`` is true), the matches of its 5' component are summarized here. If the adapter is of type "anywhere", the matches that were determined to be 5' matches are summarized here. This is null for the other adapter types.
three_prime_end : dictionary | null Statistics about matches of this adapter to the 3' end, that is, causing a suffix of the read to be removed. If the adapter is of type regular_three_prime, noninternal_three_prime or anchored_three_prime, all its matches are summarized here. If the adapter is a linked adapter (``linked`` is true), the matches of its 3' component are summarized here. If the adapter is of type "anywhere", the matches that were determined to be 3' matches are summarized here. This is null for the other adapter types. three/five_prime_end.type : str Type of the adapter. One of these strings: - ``"regular_five_prime"`` - ``"regular_three_prime"`` - ``"noninternal_five_prime"`` - ``"noninternal_three_prime"`` - ``"anchored_five_prime"`` - ``"anchored_three_prime"`` - ``"anywhere"`` For linked adapters, this is the type of its 5' or 3' component. three/five_prime_end.sequence : str Sequence of this adapter. For linked adapters, this is the sequence of its 5' or 3' component. Example: ``"AACCGGTT"`` three/five_prime_end.error_rate : float Error rate for this adapter. For linked adapters, the error rate for the respective end. three/five_prime_end.indels : bool Whether indels are allowed when matching this adapter against the read. three/five_prime_end.error_lengths : list of ints If the adapter type allows partial matches, this lists the lengths up to which 0, 1, 2 etc. errors are allowed. Example: ``[9, 16]`` means: 0 errors allowed up to a match of length 9, 1 error up to a match of length 16. The last number in this list is the length of the adapter sequence. For anchored adapter types, this is null. three/five_prime_end.matches : int The number of matches of this adapter against the 5' or 3' end. three/five_prime_end.adjacent_bases : dictionary | null For 3' adapter types, this shows which bases occurred adjacent to (upstream of) the 3' adapter match. It is a dictionary mapping the strings "A", "C", "G", "T" and "" (empty string) to the number of occurrences. The empty string covers those cases in which the adjacent base was not one of A, C, G or T or in which there was no adjacent base (3' adapter found at the beginning of the read). This is null for 5' adapters (adjacent base statistics are currently not tracked for those). three/five_prime_end.dominant_adjacent_base : str | null This is set to the dominant adjacent base if adjacent_bases exist and were determined to be sufficiently skewed, corresponding to the :ref:`warning `: "The adapter is preceded by "x" extremely often." This is null otherwise. three/five_prime_end.trimmed_lengths : list of dictionaries The histogram of the lengths of removed sequences. Each item in the list is a dictionary that describes how often a sequence of a certain length was removed, broken down by the number of errors in the adapter match. Example:: "trimmed_lengths": [ {"len": 4, "expect": 390.6, "counts": [319]}, {"len": 5, "expect": 97.7, "counts": [30]}, {"len": 6, "expect": 24.4, "counts": [4]}, {"len": 7, "expect": 24.4, "counts": [5]}, {"len": 15, "expect": 24.4, "counts": [2, 1]}, ] three/five_prime_end.trimmed_lengths.expect : float How often a sequence of length *len* would be expected to be removed due to random chance. three/five_prime_end.trimmed_lengths.counts : list of int Element at index *i* in this list gives how often a sequence of length *len* was removed due to an adapter match with *i* errors. Sum these values to get the total count. 
Example (5 sequences had 0 errors in the adapter matches, 3 had 1 and 1 had 2):: [5, 3, 1] .. _info-file-format: Info file format ================ When the ``--info-file`` command-line parameter is given, detailed information about where adapters were found in each read is written to the given file. It is a tab-separated text file that contains at least one row per input read. Normally, there is exactly one row per input read, but in the following cases, multiple rows may be output: - The option ``--times`` is in use. - A linked adapter is used. A row is written for *all* input reads, even those that are discarded from the final FASTA/FASTQ output due to filtering options. .. note:: Paired-end reads are not supported. The info file currently does not contain any info about read 2 when Cutadapt is run in paired-end mode. Which fields are output in each row depends on whether an adapter match was found in the read or not. If an adapter match was found, these fields are output in a row: 1. Read name 2. Number of errors 3. 0-based start coordinate of the adapter match 4. 0-based end coordinate of the adapter match 5. Sequence of the read to the left of the adapter match (can be empty) 6. Sequence of the read that was matched to the adapter 7. Sequence of the read to the right of the adapter match (can be empty) 8. Name of the found adapter. 9. Quality values corresponding to sequence left of the adapter match (can be empty) 10. Quality values corresponding to sequence matched to the adapter (can be empty) 11. Quality values corresponding to sequence to the right of the adapter match (can be empty) 12. Flag indicating whether the read was reverse complemented: 1 if yes, 0 if not, and empty if ``--revcomp`` was not used. The concatenation of the fields 5-7 yields the full read sequence. Column 8 identifies the found adapter. `The section about named adapters ` describes how to give a name to an adapter. Adapters without a name are numbered starting from 1. Fields 9-11 are empty if quality values are not available. Concatenating them yields the full sequence of quality values. If the adapter match was found on the reverse complement of the read, fields 5 to 7 show the reverse-complemented sequence, and fields 9-11 contain the qualities in reversed order. If no adapter was found, the format is as follows: 1. Read name 2. The value -1 (use this to distinguish between match and non-match) 3. The read sequence 4. Quality values When parsing the file, be aware that additional columns may be added in the future. Also, some fields can be empty, resulting in consecutive tabs within a line. If the ``--times`` option is used and greater than 1, each read can appear more than once in the info file. There will be one line for each found adapter, all with identical read names. Only for the first of those lines will the concatenation of columns 5-7 be identical to the original read sequence (and accordingly for columns 9-11). For subsequent lines, the shown sequences are the ones that were used in subsequent rounds of adapter trimming, that is, they get successively shorter. Linked adapters appear with up to two rows for each read, one for each constituent adapter for which a match has been found. To be able to see which of the two adapters a row describes, the adapter name in column 8 is modified: If the row describes a match of the 5' adapter, the string ``;1`` is added. If it describes a match of the 3' adapter, the string ``;2`` is added. If there are two rows, the 5' match always comes first. ..
versionadded:: 1.9 Columns 9-11 were added. .. versionadded:: 2.8 Linked adapters in info files work. .. versionadded:: 3.4 Column 12 (revcomp flag) added .. _properly-paired-reads: Properly paired reads ===================== When reading paired-end reads, Cutadapt compares the read IDs of R1 and R2. It prints an error message and aborts if they do not match. Comments in the FASTQ or FASTA header are ignored when doing the comparison. Also, if the read ID ends with ``1`` or ``2`` or ``3``, then that is also ignored. For example, two FASTQ headers that would be considered to denote properly paired reads are:: @my_read/1 a comment and:: @my_read/2 another comment This is an example for *improperly paired* reads:: @my_read/1;1 and:: @my_read/2;1 Since the ``1`` and ``2`` are ignored only if they occur at the end of the read name, and since the ``;1`` is considered to be part of the read name, these reads will not be considered to be properly paired. .. _read-modification-order: Read modification order ======================= Read modifications are applied in the following order to each read. Steps not requested on the command-line are skipped. 1. Unconditional base removal with ``--cut`` 2. Quality trimming (``-q``) 3. Adapter trimming (``-a``, ``-b``, ``-g`` and uppercase versions) 4. Poly-A/poly-T trimming (``--poly-a``) 5. Read shortening (``--length``) 6. N-end trimming (``--trim-n``) 7. Length tag modification (``--length-tag``) 8. Read name suffix removal (``--strip-suffix``) 9. Addition of prefix and suffix to read name (``-x``/``--prefix`` and ``-y``/``--suffix``) 10. Read renaming according to ``--rename`` 11. Replacing of negative quality values with zero (zero capping) cutadapt-4.7/doc/requirements.txt000066400000000000000000000001161457457704700172440ustar00rootroot00000000000000sphinx sphinx-rtd-theme sphinx_issues sphinx_better_subsection setuptools_scm cutadapt-4.7/pyproject.toml000066400000000000000000000032021457457704700161260ustar00rootroot00000000000000[build-system] requires = ["setuptools >= 63", "setuptools_scm[toml] >= 6.2", "Cython >= 0.29.20"] build-backend = "setuptools.build_meta" [project] name = "cutadapt" authors = [ {name = "Marcel Martin", email = "marcel.martin@scilifelab.se"} ] description = "Adapter trimming and other preprocessing of high-throughput sequencing reads" readme = "README.rst" license = {text = "MIT"} classifiers = [ "Development Status :: 5 - Production/Stable", "Environment :: Console", "Intended Audience :: Science/Research", "License :: OSI Approved :: MIT License", "Programming Language :: Cython", "Programming Language :: Python :: 3", "Topic :: Scientific/Engineering :: Bio-Informatics" ] requires-python = ">=3.8" dynamic = ["version"] dependencies = [ "dnaio >= 1.2.0", "xopen >= 1.6.0", ] [project.urls] "Homepage" = "https://cutadapt.readthedocs.io/" "Changelog" = "https://cutadapt.readthedocs.io/en/stable/changes.html" "Repository" = "https://github.com/marcelm/cutadapt/" [project.scripts] cutadapt = "cutadapt.cli:main_cli" [project.optional-dependencies] dev = [ "Cython", "pytest", "pytest-timeout", "pytest-mock", "sphinx", "sphinx_issues", "sphinx_better_subsection" ] [tool.setuptools.exclude-package-data] cutadapt = ["*.pyx", "*.c", "*.h"] [tool.pytest.ini_options] filterwarnings = [ "error", ] testpaths = ["tests"] xfail_strict = true [tool.setuptools_scm] write_to = "src/cutadapt/_version.py" [tool.mypy] warn_unused_configs = true [tool.cibuildwheel] environment = "CFLAGS=-g0" test-extras = ["dev"] test-command = ["pytest 
{project}"] [tool.ruff] line-length = 130 cutadapt-4.7/setup.py000066400000000000000000000006701457457704700147320ustar00rootroot00000000000000from setuptools import setup, Extension import setuptools_scm # noqa Ensure it’s installed extensions = [ Extension("cutadapt._align", sources=["src/cutadapt/_align.pyx"]), Extension("cutadapt.qualtrim", sources=["src/cutadapt/qualtrim.pyx"]), Extension("cutadapt.info", sources=["src/cutadapt/info.pyx"]), Extension("cutadapt._kmer_finder", sources=["src/cutadapt/_kmer_finder.pyx"]), ] setup(ext_modules=extensions) cutadapt-4.7/src/000077500000000000000000000000001457457704700140045ustar00rootroot00000000000000cutadapt-4.7/src/cutadapt/000077500000000000000000000000001457457704700156115ustar00rootroot00000000000000cutadapt-4.7/src/cutadapt/__init__.py000066400000000000000000000001101457457704700177120ustar00rootroot00000000000000__all__ = ["__version__"] from ._version import version as __version__ cutadapt-4.7/src/cutadapt/__main__.py000066400000000000000000000001421457457704700177000ustar00rootroot00000000000000import sys from cutadapt.cli import main_cli if __name__ == "__main__": sys.exit(main_cli()) cutadapt-4.7/src/cutadapt/_align.pyi000066400000000000000000000025251457457704700175710ustar00rootroot00000000000000from typing import Optional, Tuple, Iterable class DPMatrix: m: int n: int def __init__(self, reference: str, query: str): ... def set_entry(self, i: int, j: int, cost: int) -> None: ... def __str__(self) -> str: ... class Aligner: def __init__( self, reference: str, max_error_rate: float, flags: int = 15, wildcard_ref: bool = False, wildcard_query: bool = False, indel_cost: int = 1, min_overlap: int = 1, ): ... def __repr__(self) -> str: ... def _set_reference(self, reference: str) -> None: ... @property def dpmatrix(self) -> DPMatrix: ... def enable_debug(self) -> None: ... def locate(self, query: str) -> Optional[Tuple[int, int, int, int, int, int]]: ... class PrefixComparer: @property def effective_length(self) -> int: ... def __init__( self, reference: str, max_error_rate: float, wildcard_ref: bool = False, wildcard_query: bool = False, min_overlap: int = 1, ): ... def __repr__(self) -> str: ... def locate(self, query: str) -> Optional[Tuple[int, int, int, int, int, int]]: ... class SuffixComparer(PrefixComparer): ... def hamming_sphere(s: str, k: int) -> Iterable[str]: ... def edit_environment(t: str, k: int) -> Iterable[Tuple[str, int, int]]: ... cutadapt-4.7/src/cutadapt/_align.pyx000066400000000000000000000777151457457704700176250ustar00rootroot00000000000000# cython: profile=False, emit_code_comments=False, language_level=3 from cpython.ref cimport PyObject from cpython.bytes cimport PyBytes_FromStringAndSize, PyBytes_AS_STRING from cpython.mem cimport PyMem_Malloc, PyMem_Free, PyMem_Realloc from cpython.unicode cimport PyUnicode_GET_LENGTH from libc.string cimport memcpy, memset from ._match_tables import _upper_table, _acgt_table, _iupac_table cdef extern from "Python.h": unsigned char * PyUnicode_1BYTE_DATA(object o) void * PyUnicode_DATA(object o) bint PyUnicode_IS_COMPACT_ASCII(object o) object PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) DEF MATCH_SCORE = +1 DEF MISMATCH_SCORE = -1 DEF INSERTION_SCORE = -2 DEF DELETION_SCORE = -2 # structure for a DP matrix entry ctypedef struct _Entry: int cost int score # score for this alignment (mostly keeps track of matches) int origin # where the alignment originated: negative for positions within seq1, positive for pos. 
within seq2 ctypedef struct _Match: int origin int cost int score int ref_stop int query_stop cdef: bytes ACGT_TABLE = _acgt_table() bytes IUPAC_TABLE = _iupac_table() bytes UPPER_TABLE = _upper_table() cdef translate(object string, bytes table): if not PyUnicode_IS_COMPACT_ASCII(string): raise ValueError("String must contain only ASCII characters") cdef: unsigned char * string_chars = PyUnicode_DATA(string) Py_ssize_t string_length = PyUnicode_GET_LENGTH(string) char * char_table = PyBytes_AS_STRING(table) object retval = PyBytes_FromStringAndSize(NULL, string_length) char * translated_chars = PyBytes_AS_STRING(retval) Py_ssize_t i for i in range(string_length): translated_chars[i] = char_table[string_chars[i]] return retval class DPMatrix: """ Representation of the dynamic-programming matrix. This is used only when debugging is enabled in the Aligner class since the matrix is normally not stored in full. Entries in the matrix may be None, in which case that value was not computed. """ def __init__(self, reference, query): m = len(reference) n = len(query) self._rows = [ [None] * (n+1) for _ in range(m + 1) ] self.reference = reference self.query = query def set_entry(self, int i, int j, cost): """ Set an entry in the dynamic programming matrix. """ self._rows[i][j] = cost def __str__(self): """ Return a representation of the matrix as a string. """ rows = [' ' + ' '.join(c.rjust(2) for c in self.query)] for c, row in zip(' ' + self.reference, self._rows): r = c + ' ' + ' '.join(' ' if v is None else '{:2d}'.format(v) for v in row) rows.append(r) return '\n'.join(rows) cdef class Aligner: """ Find a full or partial occurrence of a query string in a reference string allowing errors (mismatches, insertions, deletions). This is a hybrid alignment algorithm that uses both costs and scores. By default, unit costs are used, meaning that mismatches, insertions and deletions are counted as one error (edit distance). Semi-global alignments allow skipping a suffix and/or prefix of query or reference at no cost. Combining semi-global alignment with edit distance is a bit unusual because the trivial “optimal” solution at edit distance 0 would be to skip all of the reference and all of the query, like this: REFERENCE----- ---------QUERY Conceptually, the algorithm used here instead tests all possible overlaps between the two sequences and chooses the overlap which maximizes the score in the overlapping part, while the error rate must not go above a threshold. TODO working here To allow skipping of a prefix of the reference at no cost, set the START_IN_REFERENCE flag. To allow skipping of a prefix of the query at no cost, set the START_IN_QUERY flag. If both are set, a prefix of the reference or the query is skipped, never both. Similarly, set STOP_IN_REFERENCE and STOP_IN_QUERY to allow skipping of suffixes of the reference or of the query. Again, it is never the case that both suffixes are skipped. If all flags are set, this results in standard semiglobal alignment. The aligned parts are described with two intervals (ref_start, ref_stop), (query_start, query_stop). For example, an optimal semiglobal alignment of MISSISSIPPI and SISSI looks like this: MISSISSIPPI (reference) ---SISSI--- (query) query_start, query_stop = 0, 5 ref_start, ref_stop = 3, 8 (with zero errors) The aligned parts are reference[ref_start:ref_stop] and query[query_start:query_stop]. The error rate is: errors / length where length is (reference_stop - reference_start). 
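    For the MISSISSIPPI/SISSI example above, the length of the aligned
    reference interval is 8 - 3 = 5, so a single error within that interval
    would give an error rate of 1/5 = 0.2, while the zero-error alignment
    shown has an error rate of 0.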
An optimal alignment fulfills all of these criteria: - its error_rate is at most max_error_rate - Among those alignments with error_rate <= max_error_rate, the alignment has highest score - If there are multiple alignments with the same score, then one that has minimal no. of errors is chosen. - If there are still multiple candidates, choose the alignment that starts at the leftmost position within the read. The alignment itself is not returned, only the tuple (ref_start, ref_stop, query_start, query_stop, score, errors). score is the total score and errors is the number of errors in the alignment. It is always the case that at least one of query_start and reference_start is zero. IUPAC wildcard characters can be allowed in the reference and the query by setting the appropriate flags. If neither flag is set, the full ASCII alphabet is used for comparison. If any of the flags is set, all non-IUPAC characters in the sequences compare as 'not equal'. """ cdef: int m _Entry* column # one column of the DP matrix double max_error_rate bint start_in_reference bint start_in_query bint stop_in_reference bint stop_in_query int _insertion_cost int _deletion_cost int _min_overlap bint wildcard_ref bint wildcard_query bint debug object _dpmatrix str reference # reference as set by the user (as str) bytes _reference # internal, bytes version of reference (possibly translated to a non-ASCII representation) readonly int effective_length int* n_counts # n_counts[i] == number of N characters in reference[:i] int _match_score int _mismatch_score int _insertion_score int _deletion_score def __cinit__( self, str reference, double max_error_rate, int flags=15, bint wildcard_ref=False, bint wildcard_query=False, int indel_cost=1, int min_overlap=1, ): self.max_error_rate = max_error_rate self.start_in_reference = flags & 1 self.start_in_query = flags & 2 self.stop_in_reference = flags & 4 self.stop_in_query = flags & 8 self.wildcard_ref = wildcard_ref self.wildcard_query = wildcard_query self._set_reference(reference) self._min_overlap = min_overlap self.debug = False self._dpmatrix = None if indel_cost < 1: raise ValueError('indel_cost must be at least 1') self._insertion_cost = indel_cost self._deletion_cost = indel_cost self._match_score = MATCH_SCORE self._mismatch_score = MISMATCH_SCORE self._insertion_score = INSERTION_SCORE self._deletion_score = DELETION_SCORE def _compute_flags(self): cdef int flags = 0 if self.start_in_reference: flags |= 1 if self.start_in_query: flags |= 2 if self.stop_in_reference: flags |= 4 if self.stop_in_query: flags |= 8 return flags def __reduce__(self): return (Aligner, (self.reference, self.max_error_rate, self._compute_flags(), self.wildcard_ref, self.wildcard_query, self._insertion_cost, self._min_overlap)) def __repr__(self): return ( f"Aligner(reference='{self.reference}', max_error_rate={self.max_error_rate}, " f"flags={self._compute_flags()}, wildcard_ref={self.wildcard_ref}, " f"wildcard_query={self.wildcard_query}, indel_cost={self._insertion_cost}, " f"min_overlap={self._min_overlap})" ) def _set_reference(self, str reference): mem = <_Entry*> PyMem_Realloc(self.column, (len(reference) + 1) * sizeof(_Entry)) if not mem: raise MemoryError() mem_nc = PyMem_Realloc(self.n_counts, (len(reference) + 1) * sizeof(int)) if not mem_nc: raise MemoryError() self.column = mem self.n_counts = mem_nc self.m = len(reference) self.effective_length = self.m n_count = 0 for i in range(self.m): self.n_counts[i] = n_count if reference[i] == 'n' or reference[i] == 'N': n_count += 1 
        self.n_counts[self.m] = n_count
        assert self.n_counts[self.m] == reference.count('N') + reference.count('n')
        if self.wildcard_ref:
            self.effective_length = self.m - self.n_counts[self.m]
            if self.effective_length == 0:
                raise ValueError("Cannot have only N wildcards in the sequence")
            self._reference = translate(reference, IUPAC_TABLE)
        elif self.wildcard_query:
            self._reference = translate(reference, ACGT_TABLE)
        else:
            self._reference = reference.encode('ascii')
        self.reference = reference

    property dpmatrix:
        """
        The dynamic programming matrix as a DPMatrix object. This attribute is
        usually None, unless debugging has been enabled with enable_debug().
        """
        def __get__(self):
            return self._dpmatrix

    def enable_debug(self):
        """
        Store the dynamic programming matrix while running the locate() method
        and make it available in the .dpmatrix attribute.
        """
        self.debug = True

    def locate(self, str query):
        """
        locate(query) -> (refstart, refstop, querystart, querystop, score, errors)

        Find the query within the reference associated with this aligner. The
        intervals (querystart, querystop) and (refstart, refstop) give the
        location of the match.

        That is, the substrings query[querystart:querystop] and
        self.reference[refstart:refstop] were found to align best to each other.

        The alignment itself is not returned.
        """
        cdef:
            const char* s1 = PyBytes_AS_STRING(self._reference)
            bytes query_bytes
            const char* s2
            int m = self.m
            int n = len(query)
            _Entry* column = self.column  # Current column of the DP matrix
            double max_error_rate = self.max_error_rate
            bint stop_in_query = self.stop_in_query
            bint compare_ascii = False

        if self.wildcard_query:
            query_bytes = translate(query, IUPAC_TABLE)
        elif self.wildcard_ref:
            query_bytes = translate(query, ACGT_TABLE)
        else:
            query_bytes = translate(query, UPPER_TABLE)
            compare_ascii = True
        s2 = query_bytes
        """
        DP Matrix:
                   query (j)
                 ----------> n
                |
        ref (i) |
                |
                V
               m
        """
        cdef int i, j
        # maximum no. of errors
        cdef int k = <int> (max_error_rate * m)
        # Determine largest and smallest column we need to compute
        cdef int max_n = n
        cdef int min_n = 0
        if not self.start_in_query:
            # costs can only get worse after column m
            max_n = min(n, m + k)
        if not self.stop_in_query:
            min_n = max(0, n - m - k)

        # Fill column min_n.
# # Four cases: # not startin1, not startin2: c(i,j) = max(i,j); origin(i, j) = 0 # startin1, not startin2: c(i,j) = j ; origin(i, j) = min(0, j - i) # not startin1, startin2: c(i,j) = i ; origin(i, j) = # startin1, startin2: c(i,j) = min(i,j) # TODO (later) # fill out columns only until 'last' if not self.start_in_reference and not self.start_in_query: for i in range(m + 1): column[i].score = 0 column[i].cost = max(i, min_n) * self._insertion_cost column[i].origin = 0 elif self.start_in_reference and not self.start_in_query: for i in range(m + 1): column[i].score = 0 column[i].cost = min_n * self._insertion_cost column[i].origin = min(0, min_n - i) elif not self.start_in_reference and self.start_in_query: for i in range(m + 1): column[i].score = 0 column[i].cost = i * self._insertion_cost column[i].origin = max(0, min_n - i) else: for i in range(m + 1): column[i].score = 0 column[i].cost = min(i, min_n) * self._insertion_cost column[i].origin = min_n - i if self.debug: self._dpmatrix = DPMatrix(self.reference, query) for i in range(m + 1): self._dpmatrix.set_entry(i, min_n, column[i].cost) cdef _Match best best.ref_stop = m best.query_stop = n best.cost = m + n + 1 best.origin = 0 best.score = 0 # Ukkonen's trick: index of the last cell that is at most k cdef int last = min(m, k + 1) if self.start_in_reference: last = m cdef: int cost_diag int cost_deletion int cost_insertion int origin, cost, score int length int ref_start int cur_effective_length int last_filled_i = 0 int best_length int origin_increment = 1 if self.start_in_query else 0 int insertion_cost_increment = self._insertion_cost if not self.start_in_query else 0 bint characters_equal bint is_acceptable int insertion_cost = self._insertion_cost int deletion_cost = self._deletion_cost int match_score = self._match_score int mismatch_score = self._mismatch_score int insertion_score = self._insertion_score int deletion_score = self._deletion_score # We keep only a single column of the DP matrix in memory. # To access the diagonal cell to the upper left, # we store it here before overwriting it. _Entry diag_entry _Entry current_entry _Entry previous_entry with nogil: # iterate over columns for j in range(min_n + 1, max_n + 1): # remember first entry before overwriting diag_entry = column[0] # fill in first entry in this column column[0].origin += origin_increment column[0].cost += insertion_cost_increment for i in range(1, last + 1): if compare_ascii: characters_equal = (s1[i-1] == s2[j-1]) else: characters_equal = (s1[i-1] & s2[j-1]) != 0 if characters_equal: # If the characters match, we can skip computing costs for # insertion and deletion as they are at least as high. cost = diag_entry.cost origin = diag_entry.origin # Among the optimal alignments whose edit distance is within the # maximum allowed error rate, we prefer those with maximal score. score = diag_entry.score + match_score else: # Characters do not match. 
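                        # Edit-distance recurrence: choose the cheapest of a
                        # mismatch (diag_entry, i.e. cell (i-1, j-1)), a deletion
                        # (column[i], which still holds cell (i, j-1)) and an
                        # insertion (column[i-1], already updated to cell (i-1, j)).
                        # Ties prefer a mismatch, then an insertion.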
current_entry = column[i] previous_entry = column[i-1] cost_diag = diag_entry.cost + 1 cost_deletion = current_entry.cost + deletion_cost cost_insertion = previous_entry.cost + insertion_cost if cost_diag <= cost_deletion and cost_diag <= cost_insertion: # MISMATCH cost = cost_diag origin = diag_entry.origin score = diag_entry.score + mismatch_score elif cost_insertion <= cost_deletion: # INSERTION cost = cost_insertion origin = previous_entry.origin # penalize insertions slightly score = previous_entry.score + insertion_score else: # DELETION cost = cost_deletion origin = current_entry.origin # penalize deletions slightly score = current_entry.score + deletion_score # Remember the current cell for next iteration diag_entry = column[i] column[i].cost = cost column[i].origin = origin column[i].score = score last_filled_i = last if self.debug: with gil: for i in range(last + 1): self._dpmatrix.set_entry(i, j, column[i].cost) while last >= 0 and column[last].cost > k: last -= 1 # last can be -1 here, but will be incremented next. # TODO if last is -1, can we stop searching? if last < m: last += 1 elif stop_in_query: # Found a match. If requested, find best match in last row. # length of the aligned part of the reference cost = column[m].cost score = column[m].score origin = column[m].origin length = m + min(origin, 0) cur_effective_length = length if self.wildcard_ref: if length < m: # Recompute effective length so that it only takes into # account the matching part of the reference cur_effective_length = length - (self.n_counts[m] - self.n_counts[m - length]) else: cur_effective_length = self.effective_length is_acceptable = ( length >= self._min_overlap and cost <= cur_effective_length * max_error_rate ) best_length = m + min(best.origin, 0) # Update if # - this is the first occurrence # - or this occurrence is longer # - or if this occurrence overlaps the previous best one and has a higher score if is_acceptable and ( (best.cost == m + n + 1) # No best match recorded so far, this is the first one or (origin <= best.origin + m // 2 and score > best.score) # This match overlaps the previous best one sufficiently (and has higher score) or (length > best_length and score > best.score) # Length is greater than best length so far ): best.score = score best.cost = cost best.origin = origin best.ref_stop = m best.query_stop = j if cost == 0 and origin >= 0: # exact match, stop early break # column finished if max_n == n: first_i = 0 if self.stop_in_reference else m # search in last column for i in reversed(range(first_i, last_filled_i + 1)): length = i + min(column[i].origin, 0) cost = column[i].cost score = column[i].score if self.wildcard_ref: if length < m: # Recompute effective length so that it only takes into # account the matching part of the reference ref_start = -min(column[i].origin, 0) assert 0 <= ref_start <= m cur_effective_length = length - (self.n_counts[i] - self.n_counts[ref_start]) else: cur_effective_length = self.effective_length else: cur_effective_length = length assert 0 <= cur_effective_length and cur_effective_length <= length assert cur_effective_length <= self.effective_length is_acceptable = ( length >= self._min_overlap and cost <= cur_effective_length * max_error_rate ) best_length = best.ref_stop + min(best.origin, 0) if is_acceptable and ( (best.cost == m + n + 1) or (origin <= best.origin + m // 2 and score > best.score) or (length > best_length and score > best.score) ): best.score = score best.cost = cost best.origin = column[i].origin best.ref_stop = i 
                        best.query_stop = n

        if best.cost == m + n + 1:
            # best.cost was initialized with this value.
            # If it is unchanged, no alignment was found that has
            # an error rate within the allowed range.
            return None

        cdef int query_start
        if best.origin >= 0:
            ref_start = 0
            query_start = best.origin
        else:
            ref_start = -best.origin
            query_start = 0
        return (ref_start, best.ref_stop, query_start, best.query_stop, best.score, best.cost)

    def __dealloc__(self):
        PyMem_Free(self.column)
        PyMem_Free(self.n_counts)


cdef class PrefixComparer:
    """
    A version of the Aligner that is specialized in the following way:

    - it does not allow indels
    - it allows only 5' anchored adapters

    This is a separate class, not simply a function, so that the reference
    can be cached (avoiding the conversion from str to bytes on every
    invocation).
    """
    cdef:
        bytes reference
        bint wildcard_ref
        bint wildcard_query
        int m
        int max_k  # max. number of errors
        readonly int effective_length
        int min_overlap

    # __init__ instead of __cinit__ because we need to override this in SuffixComparer
    def __init__(
        self,
        str reference,
        double max_error_rate,
        bint wildcard_ref=False,
        bint wildcard_query=False,
        int min_overlap=1,
    ):
        self.wildcard_ref = wildcard_ref
        self.wildcard_query = wildcard_query
        self.m = len(reference)
        self.effective_length = self.m
        if self.wildcard_ref:
            self.effective_length -= reference.count('N') + reference.count('n')
            if self.effective_length == 0:
                raise ValueError("Cannot have only N wildcards in the sequence")
        if not (0 <= max_error_rate <= 1.):
            raise ValueError("max_error_rate must be between 0 and 1")
        self.max_k = int(max_error_rate * self.effective_length)
        if min_overlap < 1:
            raise ValueError("min_overlap must be at least 1")
        self.min_overlap = min_overlap
        if self.wildcard_ref:
            self.reference = translate(reference, IUPAC_TABLE)
        elif self.wildcard_query:
            self.reference = translate(reference, ACGT_TABLE)
        else:
            self.reference = translate(reference, UPPER_TABLE)

    def __repr__(self):
        return "{}(reference={!r}, max_k={}, wildcard_ref={}, "\
            "wildcard_query={})".format(
                self.__class__.__name__, self.reference, self.max_k,
                self.wildcard_ref, self.wildcard_query)

    def locate(self, str query):
        """
        Find out whether one string is the prefix of the other one, allowing
        IUPAC wildcards in ref and/or query if the appropriate flag is set.

        This is used to find an anchored 5' adapter (type 'FRONT') in the
        'no indels' mode. This is very simple as only the number of errors
        needs to be counted.

        This function returns a tuple compatible with what Aligner.locate returns.
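
        For example (illustrative): with reference "ACGT" and
        max_error_rate=0.25, locating the query "ACGTTTTT" yields
        (0, 4, 0, 4, 4, 0), and locating "ACCTTTTT" (one mismatch)
        yields (0, 4, 0, 4, 2, 1).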
""" cdef: bytes query_bytes char* r_ptr = self.reference char* q_ptr int i int n = len(query) int length = min(self.m, n) bint compare_ascii = False int errors = 0 int score if self.wildcard_query: query_bytes = translate(query, IUPAC_TABLE) elif self.wildcard_ref: query_bytes = translate(query, ACGT_TABLE) else: query_bytes = translate(query, UPPER_TABLE) compare_ascii = True q_ptr = query_bytes if compare_ascii: for i in range(length): if r_ptr[i] != q_ptr[i]: errors += 1 else: for i in range(length): if (r_ptr[i] & q_ptr[i]) == 0: errors += 1 if errors > self.max_k or length < self.min_overlap: return None score = (length - errors) * MATCH_SCORE + errors * MISMATCH_SCORE return (0, length, 0, length, score, errors) cdef class SuffixComparer(PrefixComparer): def __init__( self, str reference, double max_error_rate, bint wildcard_ref=False, bint wildcard_query=False, int min_overlap=1, ): super().__init__(reference[::-1], max_error_rate, wildcard_ref, wildcard_query, min_overlap) def locate(self, str query): cdef int n = len(query) result = super().locate(query[::-1]) if result is None: return None _, length, _, _, score, errors = result return (self.m - length, self.m, n - length, n, score, errors) def hamming_sphere(str s, int k): """ Yield all strings t for which the hamming distance between s and t is exactly k, assuming the alphabet is A, C, G, T. """ if k == 0: yield s return if not PyUnicode_IS_COMPACT_ASCII(s): raise ValueError("String must contain only ASCII characters") cdef: Py_ssize_t n = PyUnicode_GET_LENGTH(s) unsigned char* s_ptr = PyUnicode_DATA(s) Py_ssize_t i, j #str prefix, prefix2 unsigned char ch, ch1, ch2 unsigned char* result_ptr if k == 1: for i in range(n): for ch in "ACGT": if s_ptr[i] == ch: continue result = PyUnicode_New(n, 255) if result == NULL: raise MemoryError() result_ptr = PyUnicode_1BYTE_DATA(result) memcpy(result_ptr, s_ptr, n) result_ptr[i] = ch yield result return if k == 2: for i in range(n): for ch1 in "ACGT": if s_ptr[i] == ch1: continue for j in range(i + 1, n): for ch2 in "ACGT": if s[j] == ch2: continue result = PyUnicode_New(n, 255) if result == NULL: raise MemoryError() result_ptr = PyUnicode_1BYTE_DATA(result) memcpy(result_ptr, s_ptr, n) result_ptr[i] = ch1 result_ptr[j] = ch2 yield result return # Recursive solution for k > 2 # i is the first position that is varied for i in range(n - k + 1): prefix = s[:i] c = s[i] suffix = s[i + 1 :] for pch in "ACGT": if pch == c: continue for t in hamming_sphere(suffix, k - 1): yield prefix + pch + t def edit_environment(str t_str, int k): """ Find all strings s for which the edit distance between s and t is at most k, assuming the alphabet is A, C, G, T. Yield tuples (s, e, m), where e is the edit distance between s and t and m is the number of matches in the optimal alignment. 
""" cdef bytes t = t_str.encode().translate(bytes.maketrans(b"ACGTacgt", b"\0\1\2\3\0\1\2\3")) cdef unsigned char* t_ptr = t # len(s) = m, indexed by i # len(t) = n, indexed by j cdef: int j, match, diag, left, up, c, m int i = 0 int n = len(t) char ch int min_cost cdef int* costs = PyMem_Malloc((n + 1) * (n + k + 1) * sizeof(int)) if not costs: raise MemoryError() memset(costs, k+1, (n+1) * (n+k+1) * sizeof(int)) for i in range(n + k + 1): costs[i*(n+1)] = i for j in range(n + 1): costs[j] = j cdef int* matches = PyMem_Malloc((n + 1) * (n + k + 1) * sizeof(int)) if not matches: raise MemoryError() memset(matches, 0, (n+1) * (n+k+1) * sizeof(int)) trans = bytes.maketrans(b"\0\1\2\3", b"ACGT") cdef char* s = PyMem_Malloc((n + k + 1) * sizeof(char)) i = 0 if not s: raise MemoryError() while True: # Fill in row i (unless it is row 0, which is already filled in) if i > 0: ch = s[i - 1] min_cost = 999999999 for j in range(max(1, i - k), min(n + 1, i + k + 1)): match = 0 if t_ptr[j - 1] == ch else 1 diag = costs[(i - 1) * (n+1) + j - 1] + match left = costs[i * (n+1) + j - 1] + 1 up = costs[(i - 1) * (n+1) + j] + 1 if diag <= left and diag <= up: c = diag m = matches[(i - 1) * (n+1) + j - 1] + (1 - match) elif left <= up: c = left m = matches[i * (n+1) + j - 1] else: c = up m = matches[(i - 1) * (n+1) + j] costs[i*(n+1) + j] = c matches[i*(n+1) + j] = m min_cost = min(min_cost, c) else: min_cost = 0 if costs[i * (n+1) + n] <= k: # The costs of an optimal alignment of t against s are at most k, # so s is within the edit environment. result = PyBytes_FromStringAndSize(s, i) if result == NULL: raise MemoryError() yield result.translate(trans).decode(), costs[i * (n+1) + n], matches[i * (n+1) + n] # Next string if min_cost <= k and i < n + k: # When all entries are greater than k, we can skip remaining prefixes since costs in # subsequent rows cannot get lower s[i] = 0 i += 1 else: while True: if i == 0: PyMem_Free(costs) PyMem_Free(matches) return i -= 1 ch = s[i] if ch < 3: break s[i] = ch + 1 i += 1 assert False cutadapt-4.7/src/cutadapt/_kmer_finder.pyi000066400000000000000000000005161457457704700207620ustar00rootroot00000000000000from typing import List, Optional, Tuple MAXIMUM_WORD_SIZE: int class KmerFinder: def __init__( self, positions_and_kmers: List[Tuple[int, Optional[int], List[str]]], ref_wildcards: bool = False, query_wildcards: bool = False, ): ... def kmers_present(self, __sequence: str) -> bool: ... cutadapt-4.7/src/cutadapt/_kmer_finder.pyx000066400000000000000000000247631457457704700210130ustar00rootroot00000000000000# cython: profile=False, emit_code_comments=False, language_level=3 from cpython.mem cimport PyMem_Realloc, PyMem_Free from libc.string cimport memcpy, memset, strlen from cpython.unicode cimport PyUnicode_CheckExact, PyUnicode_GET_LENGTH from libc.stdint cimport uint8_t, uint64_t from ._match_tables import matches_lookup """ Kmer finder that works using an enhanced shift-and algorithm. Shift-and works by using a bitmatrix to determine matches in words. For the four-letter alphabet we can make the following bitmatrix for the word ACGTA ATGCA 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00010001 A 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000010 C 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00000100 G 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00001000 T However, this leaves a lot of bits unused in the machine word. 
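
A minimal pure-Python sketch of plain single-word shift-and (illustrative
only, not part of the implementation below):

    def shift_and(haystack: str, word: str) -> bool:
        masks = {}
        for i, c in enumerate(word):
            masks[c] = masks.get(c, 0) | (1 << i)
        R = 0
        for c in haystack:
            # Bit i of R is set iff word[:i+1] matches ending at position c
            R = ((R << 1) | 1) & masks.get(c, 0)
            if R & (1 << (len(word) - 1)):
                return True
        return False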
It is also possible to use as many bits in a bitmask as possible simply concatenating all the words together. For example here, with appended GATTACA. ACAT_TAGATGCA 0b00000000_00000000_00000000_00000000_00000000_00000000_00001010_01010001 A 0b00000000_00000000_00000000_00000000_00000000_00000000_00000100_00000010 C 0b00000000_00000000_00000000_00000000_00000000_00000000_00000000_00100100 G 0b00000000_00000000_00000000_00000000_00000000_00000000_00000001_10001000 T Normal shift-and starts with initializing a 1 in bit 1. The same can be achieved for multiword by multiplying with a initmask with a one at each word start position. Normal shift-and only checks a single bit. The bit that is at exactly the length of the word. We can check multiple bits simultaneously by using a mask that checks for each bit that is at the end of a word. This way we can check for multiple words simultaneously. It does not scale if the combined length of the words exceeds a machine integer, but for cutadapt the words that are searched are usually smaller. (Illumina adapter is 33 bases for example). """ # Dnaio conveniently ensures that all sequences are ASCII only. DEF BITMASK_INDEX_SIZE = 128 ctypedef uint64_t bitmask_t DEF MAX_WORD_SIZE = 64 # "sizeof(bitmask_t) * 8" does not work for some reason MAXIMUM_WORD_SIZE = MAX_WORD_SIZE cdef extern from "Python.h": void *PyUnicode_DATA(object o) bint PyUnicode_IS_COMPACT_ASCII(object o) ctypedef struct KmerSearchEntry: size_t mask_offset ssize_t search_start ssize_t search_stop # 0 if going to end of sequence. bitmask_t init_mask bitmask_t found_mask cdef class KmerFinder: """ Find kmers in strings. Allows case-independent and IUPAC matching. ``ref_wildcards=True`` allows IUPAC characters in the kmer sequences. ``query_wildcards=True`` allows IUPAC characters in the sequences fed to the ``kmers_present`` method. Replaces the following code: kmers_and_positions = [("AGA", -10, None), ("AGCATGA", 0, None)] for sequence in sequences: for kmer, start, stop in kmers_and_positions: if sequence.find(kmer, start, stop) != -1: # do something pass This has a lot of python overhead. The following code accomplishes the same task and allows for case-independent matching: positions_and_kmers = [(-10, None, ["AGA"]), (0, None, ["AGCATGA"])] kmer_finder = KmerFinder(positions_and_kmers) for sequence in sequences: if kmer_finder.kmers_present(sequence): # do something continue This is more efficient as the kmers_present method can be applied to a lot of sequences and all the necessary unpacking for each kmer into C variables happens only once. Note that multiple kmers can be given per position. Kmerfinder finds all of these simultaneously using a multiple pattern matching algorithm. 
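
    A pure-Python reference for the semantics of kmers_present() (an
    illustrative, hypothetical helper that ignores case-folding and wildcard
    handling; the real method uses the bitmask algorithm described above):

        def kmers_present_reference(sequence, positions_and_kmers):
            for start, stop, kmers in positions_and_kmers:
                for kmer in kmers:
                    if sequence.find(kmer, start, stop) != -1:
                        return True
            return False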
""" cdef: KmerSearchEntry *search_entries bitmask_t *search_masks size_t number_of_searches readonly object positions_and_kmers readonly bint ref_wildcards readonly bint query_wildcards def __cinit__(self, positions_and_kmers, ref_wildcards=False, query_wildcards=False): cdef char[64] search_word self.search_masks = NULL self.search_entries = NULL self.number_of_searches = 0 self.ref_wildcards = ref_wildcards self.query_wildcards = query_wildcards self.number_of_searches = 0 cdef size_t mask_offset = 0 cdef char *kmer_ptr cdef size_t offset cdef bitmask_t init_mask, found_mask cdef Py_ssize_t kmer_length match_lookup = matches_lookup(ref_wildcards, query_wildcards) for (start, stop, kmers) in positions_and_kmers: index = 0 while index < len(kmers): memset(search_word, 0, 64) offset = 0 init_mask = 0 found_mask = 0 # Run an inner loop in case all words combined are larger than # the maximum bitmask size. In that case we create multiple # bitmasks to hold all the words. while index < len(kmers): kmer = kmers[index] if not PyUnicode_CheckExact(kmer): raise TypeError(f"Kmer should be a string not {type(kmer)}") if not PyUnicode_IS_COMPACT_ASCII(kmer): raise ValueError("Only ASCII strings are supported") kmer_length = PyUnicode_GET_LENGTH(kmer) if kmer_length > MAX_WORD_SIZE: raise ValueError(f"{kmer} of length {kmer_length} is longer " f"than the maximum of {MAX_WORD_SIZE}.") if (offset + kmer_length) > MAX_WORD_SIZE: break init_mask |= 1ULL << offset kmer_ptr = PyUnicode_DATA(kmer) memcpy(search_word + offset, kmer_ptr, kmer_length) # Set the found bit at the last character. found_mask |= 1ULL << (offset + kmer_length - 1) offset = offset + kmer_length index += 1 i = self.number_of_searches # Save the index position for the mask and entry self.number_of_searches += 1 self.search_entries = PyMem_Realloc(self.search_entries, self.number_of_searches * sizeof(KmerSearchEntry)) self.search_masks = PyMem_Realloc(self.search_masks, self.number_of_searches * sizeof(bitmask_t) * BITMASK_INDEX_SIZE) mask_offset = i * BITMASK_INDEX_SIZE self.search_entries[i].search_start = start if stop is None: # Encode 'end of sequence' as 0. stop = 0 self.search_entries[i].search_stop = stop self.search_entries[i].mask_offset = mask_offset self.search_entries[i].init_mask = init_mask self.search_entries[i].found_mask = found_mask # Offset -1 because we don't count the last NULL byte populate_needle_mask(self.search_masks + mask_offset, search_word, offset, match_lookup) self.positions_and_kmers = positions_and_kmers def __reduce__(self): return KmerFinder, (self.positions_and_kmers, self.ref_wildcards, self.query_wildcards) def kmers_present(self, str sequence): cdef: KmerSearchEntry entry size_t i size_t kmer_offset bitmask_t init_mask bitmask_t found_mask ssize_t start, stop const bitmask_t *mask_ptr const char *search_ptr bint search_result ssize_t search_length if not PyUnicode_IS_COMPACT_ASCII(sequence): raise ValueError("Only ASCII strings are supported") cdef const char *seq = PyUnicode_DATA(sequence) cdef Py_ssize_t seq_length = PyUnicode_GET_LENGTH(sequence) for i in range(self.number_of_searches): entry = self.search_entries[i] start = entry.search_start stop = entry.search_stop if start < 0: start = seq_length + start if start < 0: start = 0 elif start > seq_length: continue if stop < 0: stop = seq_length + stop if stop <= 0: # No need to search continue elif stop == 0: # stop == 0 means go to end of sequence. 
stop = seq_length search_length = stop - start if search_length <= 0: continue search_ptr = seq + start init_mask = entry.init_mask found_mask = entry.found_mask mask_ptr = self.search_masks + entry.mask_offset search_result = shift_and_multiple_is_present( search_ptr, search_length, mask_ptr, init_mask, found_mask) if search_result: return True return False def __dealloc__(self): PyMem_Free(self.search_masks) PyMem_Free(self.search_entries) cdef void set_masks(bitmask_t *needle_mask, size_t pos, const char *chars): cdef char c cdef size_t i for i in range(strlen(chars)): needle_mask[chars[i]] |= 1ULL << pos cdef populate_needle_mask(bitmask_t *needle_mask, const char *needle, size_t needle_length, match_lookup): cdef size_t i cdef char c cdef uint8_t j if needle_length > MAX_WORD_SIZE: raise ValueError("The pattern is too long!") memset(needle_mask, 0, sizeof(bitmask_t) * BITMASK_INDEX_SIZE) for i in range(needle_length): c = needle[i] if c == 0: continue set_masks(needle_mask, i, match_lookup[c]) cdef bint shift_and_multiple_is_present( const char *haystack, size_t haystack_length, const bitmask_t *needle_mask, bitmask_t init_mask, bitmask_t found_mask): cdef: bitmask_t R = 0 size_t i for i in range(haystack_length): R <<= 1 R |= init_mask R &= needle_mask[haystack[i]] if (R & found_mask): return True return False cutadapt-4.7/src/cutadapt/_match_tables.py000066400000000000000000000053241457457704700207540ustar00rootroot00000000000000import operator def _acgt_table(): """ Return a translation table that maps A, C, G, T characters to the lower four bits of a byte. Other characters (including possibly IUPAC characters) are mapped to the most significant bit (0x80). Lowercase versions are also translated, and U is treated the same as T. """ d = dict(A=1, C=2, G=4, T=8, U=8) t = bytearray([0x80]) * 256 for c, v in d.items(): t[ord(c)] = v t[ord(c.lower())] = v return bytes(t) def _iupac_table(): """ Return a translation table for IUPAC characters. The table maps ASCII-encoded IUPAC nucleotide characters to bytes in which the four least significant bits are used to represent one nucleotide each. For the "N" wildcard, additionally the most significant bit is set (0x80), which allows it to match characters that are not A, C, G or T if _acgt_table was used to encode them. Whether two encoded characters x and y match can then be checked with the expression "x & y != 0". """ A = 1 C = 2 G = 4 T = 8 iupac = dict( X=0, A=A, C=C, G=G, T=T, U=T, R=A | G, Y=C | T, S=G | C, W=A | T, K=G | T, M=A | C, B=C | G | T, D=A | G | T, H=A | C | T, V=A | C | G, N=A | C | G | T + 0x80, ) t = bytearray(b"\0") * 256 for c, v in iupac.items(): t[ord(c)] = v t[ord(c.lower())] = v return bytes(t) def _upper_table(): table = bytes(range(256)).upper() return table def all_matches_generator(ref: bytes, query: bytes, comp_op): for i, ref_char in enumerate(ref): matches = "" for j, query_char in enumerate(query): if j >= 128: # Only ASCII characters supported. 
break if bool(comp_op(ref_char, query_char)): matches += chr(j) # NULL byte should not match anything yield matches.encode("ascii").replace(b"\00", b"") def matches_lookup(ref_wildcards, query_wildcards): if (not ref_wildcards) and (not query_wildcards): ref_table = _upper_table() query_table = _upper_table() comp_op = operator.eq elif ref_wildcards and (not query_wildcards): ref_table = _iupac_table() query_table = _acgt_table() comp_op = operator.and_ elif (not ref_wildcards) and query_wildcards: ref_table = _acgt_table() query_table = _iupac_table() comp_op = operator.and_ else: ref_table = _iupac_table() query_table = _iupac_table() comp_op = operator.and_ return list(all_matches_generator(ref_table, query_table, comp_op)) cutadapt-4.7/src/cutadapt/adapters.py000066400000000000000000001407371457457704700200020ustar00rootroot00000000000000""" Adapter finding and trimming classes The ...Adapter classes are responsible for finding adapters. The ...Match classes trim the reads. """ import logging from enum import IntFlag from collections import defaultdict from typing import Optional, Tuple, Sequence, Dict, Any, List, Union from abc import ABC, abstractmethod import time from ._kmer_finder import KmerFinder from .align import ( EndSkip, Aligner, PrefixComparer, SuffixComparer, edit_environment, hamming_sphere, ) from .kmer_heuristic import create_positions_and_kmers, kmer_probability_analysis logger = logging.getLogger() class MockKmerFinder: def kmers_present(self, sequence: str): return True class InvalidCharacter(Exception): pass # TODO remove this enum, this should be within each Adapter class class Where(IntFlag): """ Aligner flag combinations for all adapter types. "REFERENCE" is the adapter sequence, "QUERY" is the read sequence """ BACK = EndSkip.QUERY_START | EndSkip.QUERY_STOP | EndSkip.REFERENCE_END FRONT = EndSkip.QUERY_START | EndSkip.QUERY_STOP | EndSkip.REFERENCE_START PREFIX = EndSkip.QUERY_STOP SUFFIX = EndSkip.QUERY_START # Just like FRONT/BACK, but without internal matches FRONT_NOT_INTERNAL = EndSkip.REFERENCE_START | EndSkip.QUERY_STOP BACK_NOT_INTERNAL = EndSkip.QUERY_START | EndSkip.REFERENCE_END ANYWHERE = EndSkip.SEMIGLOBAL def returns_defaultdict_int(): # We need this function to make EndStatistics picklable. # Even a @staticmethod of EndStatistics is not sufficient # as that is not picklable before Python 3.5. 
return defaultdict(int) class EndStatistics: """Statistics about the 5' or 3' end""" def __init__(self, adapter: "SingleAdapter"): self.max_error_rate: float = adapter.max_error_rate self.sequence: str = adapter.sequence self.effective_length: int = adapter.effective_length self.has_wildcards: bool = adapter.adapter_wildcards self.indels: bool = adapter.indels self.adapter_type: str = adapter.descriptive_identifier() self.allows_partial_matches: bool = adapter.allows_partial_matches # self.errors[l][e] == n iff a sequence of length l matching at e errors was removed n times self.errors: Dict[int, Dict[int, int]] = defaultdict(returns_defaultdict_int) self.adjacent_bases = {"A": 0, "C": 0, "G": 0, "T": 0, "": 0} # TODO avoid hard-coding the list of classes self._remove_prefix = isinstance(adapter, FrontAdapter) def __repr__(self): errors = {k: dict(v) for k, v in self.errors.items()} return "EndStatistics(max_error_rate={}, errors={}, adjacent_bases={})".format( self.max_error_rate, errors, self.adjacent_bases, ) def __iadd__(self, other: Any): if not isinstance(other, self.__class__): raise ValueError("Cannot compare") if ( self.max_error_rate != other.max_error_rate or self.sequence != other.sequence or self.effective_length != other.effective_length or self.indels != other.indels ): raise RuntimeError("Incompatible EndStatistics, cannot be added") for base in ("A", "C", "G", "T", ""): self.adjacent_bases[base] += other.adjacent_bases[base] for length, error_dict in other.errors.items(): for errors in error_dict: self.errors[length][errors] += other.errors[length][errors] return self @property def lengths(self): d = {length: sum(errors.values()) for length, errors in self.errors.items()} return d def random_match_probabilities(self, gc_content: float) -> List[float]: """ Estimate probabilities that this adapter end matches a random sequence. Indels are not taken into account. Returns a list p, where p[i] is the probability that i bases of this adapter match a random sequence with GC content gc_content. """ assert 0.0 <= gc_content <= 1.0 seq = self.sequence # FIXME this is broken for 'anywhere' adapters if self._remove_prefix: seq = seq[::-1] allowed_bases = "CGRYSKMBDHVN" if self.has_wildcards else "GC" p = 1.0 probabilities = [p] for i, c in enumerate(seq): if c in allowed_bases: p *= gc_content / 2.0 else: p *= (1.0 - gc_content) / 2.0 probabilities.append(p) return probabilities class AdapterStatistics(ABC): reverse_complemented: int = 0 name: str adapter: "Adapter" @abstractmethod def __iadd__(self, other): pass @abstractmethod def end_statistics(self) -> Tuple[Optional[EndStatistics], Optional[EndStatistics]]: pass @abstractmethod def add_match(self, match) -> None: pass class SingleAdapterStatistics(AdapterStatistics, ABC): """ Statistics about a 5' or 3' adapter, where we only need to keep track of sequences removed from one "end". 
""" def __init__(self, adapter: "SingleAdapter"): self.name = adapter.name self.adapter = adapter self.end = EndStatistics(adapter) def __repr__(self): return f"SingleAdapterStatistics(name={self.name}, end={self.end})" def __iadd__(self, other: "SingleAdapterStatistics"): if not isinstance(other, self.__class__): raise ValueError("Cannot iadd") self.end += other.end self.reverse_complemented += other.reverse_complemented return self class FrontAdapterStatistics(SingleAdapterStatistics): def add_match(self, match: "RemoveBeforeMatch"): self.end.errors[match.removed_sequence_length()][match.errors] += 1 def end_statistics(self) -> Tuple[Optional[EndStatistics], Optional[EndStatistics]]: return self.end, None class BackAdapterStatistics(SingleAdapterStatistics): def add_match(self, match: "RemoveAfterMatch"): adjacent_base = match.adjacent_base() self.end.errors[match.removed_sequence_length()][match.errors] += 1 try: self.end.adjacent_bases[adjacent_base] += 1 except KeyError: self.end.adjacent_bases[""] += 1 def end_statistics(self) -> Tuple[Optional[EndStatistics], Optional[EndStatistics]]: return None, self.end class LinkedAdapterStatistics(AdapterStatistics): """ Statistics about sequences removed by a lined adapter. """ def __init__( self, adapter: "LinkedAdapter", front: "SingleAdapter", back: "SingleAdapter", ): self.name = adapter.name self.adapter = adapter self.front = EndStatistics(front) self.back = EndStatistics(back) self.reverse_complemented = 0 def __repr__(self): return f"LinkedAdapterStatistics(name={self.name}, front={self.front}, back={self.back})" def __iadd__(self, other: "LinkedAdapterStatistics"): if not isinstance(other, self.__class__): raise ValueError("Cannot iadd") self.front += other.front self.back += other.back self.reverse_complemented += other.reverse_complemented return self def add_match(self, match: "LinkedMatch"): # TODO this is duplicated code if match.front_match: self.front.errors[match.front_match.removed_sequence_length()][ match.front_match.errors ] += 1 if match.back_match: adjacent_base = match.back_match.adjacent_base() self.back.errors[match.back_match.removed_sequence_length()][ match.back_match.errors ] += 1 try: self.back.adjacent_bases[adjacent_base] += 1 except KeyError: self.back.adjacent_bases[""] += 1 def end_statistics(self) -> Tuple[Optional[EndStatistics], Optional[EndStatistics]]: return self.front, self.back class AnywhereAdapterStatistics(AdapterStatistics): """ Statistics about sequences removed by a lined adapter. 
""" def __init__(self, adapter: "AnywhereAdapter"): self.name = adapter.name self.adapter = adapter self.front = EndStatistics(adapter) self.back = EndStatistics(adapter) self.reverse_complemented = 0 def __repr__(self): return f"AnywhereAdapterStatistics(name={self.name}, front={self.front}, back={self.back})" def __iadd__(self, other: "AnywhereAdapterStatistics"): if not isinstance(other, AnywhereAdapterStatistics): raise ValueError("Cannot add") self.front += other.front self.back += other.back self.reverse_complemented += other.reverse_complemented return self def add_match(self, match: Union["RemoveBeforeMatch", "RemoveAfterMatch"]) -> None: # TODO contains duplicated code from the other add_match() methods if isinstance(match, RemoveBeforeMatch): self.front.errors[match.removed_sequence_length()][match.errors] += 1 else: adjacent_base = match.adjacent_base() self.back.errors[match.removed_sequence_length()][match.errors] += 1 try: self.back.adjacent_bases[adjacent_base] += 1 except KeyError: self.back.adjacent_bases[""] += 1 def end_statistics(self) -> Tuple[Optional[EndStatistics], Optional[EndStatistics]]: return self.front, self.back class Match(ABC): adapter: "Adapter" @abstractmethod def remainder_interval(self) -> Tuple[int, int]: pass @abstractmethod def retained_adapter_interval(self) -> Tuple[int, int]: pass @abstractmethod def get_info_records(self, read) -> List[List]: pass @abstractmethod def trimmed(self, read): pass @abstractmethod def match_sequence(self): pass class SingleMatch(Match, ABC): """ Representation of a single adapter matched to a single string """ __slots__ = [ "astart", "astop", "rstart", "rstop", "score", "errors", "adapter", "sequence", "length", "adjacent_base", ] def __init__( self, astart: int, astop: int, rstart: int, rstop: int, score: int, errors: int, adapter: "SingleAdapter", sequence: str, ): self.astart: int = astart self.astop: int = astop self.rstart: int = rstart self.rstop: int = rstop self.score: int = score self.errors: int = errors self.adapter: SingleAdapter = adapter self.sequence = sequence # Number of aligned characters in the adapter. If there are # indels, this may be different from the number of characters # in the read. self.length: int = astop - astart def __repr__(self): return ( f"{self.__class__.__name__}(astart={self.astart}, astop={self.astop}, " f"rstart={self.rstart}, rstop={self.rstop}, " f"score={self.score}, errors={self.errors})" ) def __eq__(self, other) -> bool: return ( other.__class__ is self.__class__ and self.astart == other.astart and self.astop == other.astop and self.rstart == other.rstart and self.rstop == other.rstop and self.score == other.score and self.errors == other.errors and self.adapter is other.adapter and self.sequence == other.sequence ) def wildcards(self, wildcard_char: str = "N") -> str: """ Return a string that contains, for each wildcard character, the character that it matches. For example, if the adapter ATNGNA matches ATCGTA, then the string 'CT' is returned. If there are indels, this is not reliable as the full alignment is not available. 
""" wildcards = [ self.sequence[self.rstart + i] for i in range(self.length) if self.adapter.sequence[self.astart + i] == wildcard_char and self.rstart + i < len(self.sequence) ] return "".join(wildcards) def get_info_records(self, read) -> List[List]: seq = read.sequence qualities = read.qualities info = [ "", self.errors, self.rstart, self.rstop, seq[0 : self.rstart], seq[self.rstart : self.rstop], seq[self.rstop :], self.adapter.name, ] if qualities: info += [ qualities[0 : self.rstart], qualities[self.rstart : self.rstop], qualities[self.rstop :], ] else: info += ["", "", ""] return [info] def match_sequence(self): return self.sequence[self.rstart : self.rstop] @abstractmethod def removed_sequence_length(self) -> int: pass class RemoveBeforeMatch(SingleMatch): """A match that removes sequence before the match""" def rest(self) -> str: """ Return the part of the read before this match if this is a 'front' (5') adapter, return the part after the match if this is not a 'front' adapter (3'). This can be an empty string. """ return self.sequence[: self.rstart] def remainder_interval(self) -> Tuple[int, int]: """ Return an interval (start, stop) that describes the part of the read that would remain after trimming """ return self.rstop, len(self.sequence) def retained_adapter_interval(self) -> Tuple[int, int]: return self.rstart, len(self.sequence) def trim_slice(self): # Same as remainder_interval, but as a slice() object return slice(self.rstop, None) def trimmed(self, read): return read[self.rstop :] def removed_sequence_length(self) -> int: return self.rstop class RemoveAfterMatch(SingleMatch): """A match that removes sequence after the match""" def rest(self) -> str: """ Return the part of the read before this match if this is a 'front' (5') adapter, return the part after the match if this is not a 'front' adapter (3'). This can be an empty string. """ return self.sequence[self.rstop :] def remainder_interval(self) -> Tuple[int, int]: """ Return an interval (start, stop) that describes the part of the read that would remain after trimming """ return 0, self.rstart def retained_adapter_interval(self) -> Tuple[int, int]: return 0, self.rstop def trim_slice(self): # Same as remainder_interval, but as a slice() object return slice(None, self.rstart) def trimmed(self, read): return read[: self.rstart] def adjacent_base(self) -> str: return self.sequence[self.rstart - 1 : self.rstart] def removed_sequence_length(self) -> int: return len(self.sequence) - self.rstart def _generate_adapter_name(_start=[1]) -> str: name = str(_start[0]) _start[0] += 1 return name class Matchable(ABC): """Something that has a match_to() method.""" def __init__(self, name: Optional[str], *args, **kwargs): self.name = name @abstractmethod def match_to(self, sequence: str): pass class Adapter(Matchable, ABC): description = "adapter with one component" # this is overriden in subclasses @abstractmethod def spec(self) -> str: """Return string representation of this adapter""" @abstractmethod def create_statistics(self) -> AdapterStatistics: pass @abstractmethod def descriptive_identifier(self) -> str: pass @abstractmethod def enable_debug(self) -> None: pass class SingleAdapter(Adapter, ABC): """ This class is used to find a single adapter characterized by sequence, error rate, type etc. within reads. Arguments: sequence (str): The adapter sequence. Will be converted to uppercase. Also, Us will be converted to Ts. max_errors: Maximum allowed errors (non-negative float). 
If the values is less than 1, this is interpreted as a rate and passed to the aligner. If it is 1 or greater, the value is converted to a rate by dividing it by the number of non-N characters in the sequence. The error rate is the number of errors in the alignment divided by the length of the part of the alignment that matches the adapter. min_overlap: Report a match only if at least this number of bases of the adapter are aligned to the read. read_wildcards: Whether IUPAC wildcards in the read are allowed. adapter_wildcards: Whether IUPAC wildcards in the adapter are allowed. name: Optional name of the adapter. If not provided, the name is set to a unique number. indels: Whether indels are allowed in the alignment. """ allows_partial_matches: bool = True def __init__( self, sequence: str, max_errors: float = 0.1, min_overlap: int = 3, read_wildcards: bool = False, adapter_wildcards: bool = True, name: Optional[str] = None, indels: bool = True, ): self.name: str = _generate_adapter_name() if name is None else name super().__init__(self.name) self._debug: bool = False self.sequence: str = sequence.upper().replace("U", "T").replace("I", "N") if not self.sequence: raise ValueError("Adapter sequence is empty") if max_errors >= 1 and self.sequence.count("N") != len(self.sequence): max_errors /= len(self.sequence) - self.sequence.count("N") self.max_error_rate: float = max_errors self.min_overlap: int = min(min_overlap, len(self.sequence)) iupac = frozenset("ABCDGHKMNRSTUVWXY") if adapter_wildcards and not set(self.sequence) <= iupac: for c in self.sequence: if c not in iupac: raise InvalidCharacter( f"Character '{c}' in adapter sequence '{self.sequence}' is " f"not a valid IUPAC code. Use only characters 'ABCDGHIKMNRSTUVWXY'." ) # Optimization: Use non-wildcard matching if only ACGT is used self.adapter_wildcards: bool = adapter_wildcards and not set( self.sequence ) <= set("ACGT") self.read_wildcards: bool = read_wildcards self.indels: bool = indels self.aligner = self._aligner() self.kmer_finder = self._kmer_finder() def _make_aligner(self, sequence: str, flags: int) -> Aligner: # TODO # Indels are suppressed by setting their cost very high, but a different algorithm # should be used instead. indel_cost = 1 if self.indels else 100000 return Aligner( sequence, self.max_error_rate, flags=flags, wildcard_ref=self.adapter_wildcards, wildcard_query=self.read_wildcards, indel_cost=indel_cost, min_overlap=self.min_overlap, ) def _make_kmer_finder( self, sequence: str, back_adapter: bool, front_adapter: bool, internal: bool = True, ) -> Union[KmerFinder, MockKmerFinder]: positions_and_kmers = create_positions_and_kmers( sequence, self.min_overlap, self.max_error_rate, back_adapter, front_adapter, internal, ) if self._debug: print(kmer_probability_analysis(positions_and_kmers)) try: return KmerFinder( positions_and_kmers, self.adapter_wildcards, self.read_wildcards ) except ValueError: # Kmers too long. return MockKmerFinder() def __repr__(self): return ( "<{cls}(name={name!r}, sequence={sequence!r}, " "max_error_rate={max_error_rate}, min_overlap={min_overlap}, " "read_wildcards={read_wildcards}, " "adapter_wildcards={adapter_wildcards}, " "indels={indels})>".format(cls=self.__class__.__name__, **vars(self)) ) @property def effective_length(self) -> int: return self.aligner.effective_length def enable_debug(self) -> None: """ Print out the dynamic programming matrix after matching a read to an adapter. 
""" self._debug = True self.aligner.enable_debug() @abstractmethod def _aligner(self): pass @abstractmethod def _kmer_finder(self): pass @abstractmethod def match_to(self, sequence: str): """ Attempt to match this adapter to the given string. Return a Match instance if a match was found; return None if no match was found given the matching criteria (minimum overlap length, maximum error rate). """ def __len__(self) -> int: return len(self.sequence) class FrontAdapter(SingleAdapter): """A 5' adapter""" description = "regular 5'" def __init__(self, *args, **kwargs): self._force_anywhere = kwargs.pop("force_anywhere", False) super().__init__(*args, **kwargs) def descriptive_identifier(self) -> str: return "regular_five_prime" def _aligner(self) -> Aligner: return self._make_aligner( self.sequence, Where.ANYWHERE.value if self._force_anywhere else Where.FRONT.value, ) def _kmer_finder(self): return self._make_kmer_finder( self.sequence, back_adapter=self._force_anywhere, front_adapter=True ) def match_to(self, sequence: str): """ Attempt to match this adapter to the given read. Return a Match instance if a match was found; return None if no match was found given the matching criteria (minimum overlap length, maximum error rate). """ if not self.kmer_finder.kmers_present(sequence): return None alignment: Optional[Tuple[int, int, int, int, int, int]] = self.aligner.locate( sequence ) if self._debug: print(self.aligner.dpmatrix) if alignment is None: return None return RemoveBeforeMatch(*alignment, adapter=self, sequence=sequence) def spec(self) -> str: return f"{self.sequence}..." def create_statistics(self) -> FrontAdapterStatistics: return FrontAdapterStatistics(self) class RightmostFrontAdapter(FrontAdapter): """A 5' adapter that prefers rightmost matches""" description = "rightmost 5'" # def __init__(self, *args, **kwargs): # self._force_anywhere = kwargs.pop("force_anywhere", False) # super().__init__(*args, **kwargs) def descriptive_identifier(self) -> str: return "rightmost_five_prime" def _aligner(self) -> Aligner: aligner = self._make_aligner( self.sequence[::-1], Where.ANYWHERE.value if self._force_anywhere else Where.BACK.value, ) return aligner def _kmer_finder(self): kmer_finder = self._make_kmer_finder( self.sequence[::-1], back_adapter=True, front_adapter=self._force_anywhere ) return kmer_finder def match_to(self, sequence: str): """ Attempt to match this adapter to the given read. Return a Match instance if a match was found; return None if no match was found given the matching criteria (minimum overlap length, maximum error rate). 
""" reversed_sequence = sequence[::-1] if not self.kmer_finder.kmers_present(reversed_sequence): return None alignment: Optional[Tuple[int, int, int, int, int, int]] = self.aligner.locate( reversed_sequence ) if self._debug: print(self.aligner.dpmatrix) if alignment is None: return None ref_start, ref_end, query_start, query_end, score, errors = alignment alignment = ( len(self.sequence) - ref_end, len(self.sequence) - ref_start, len(sequence) - query_end, len(sequence) - query_start, score, errors, ) return RemoveBeforeMatch(*alignment, adapter=self, sequence=sequence) def spec(self) -> str: return f"{self.sequence}...;rightmost" class BackAdapter(SingleAdapter): """A 3' adapter""" description = "regular 3'" def __init__(self, *args, **kwargs): self._force_anywhere = kwargs.pop("force_anywhere", False) super().__init__(*args, **kwargs) def descriptive_identifier(self) -> str: return "regular_three_prime" def _aligner(self): return self._make_aligner( self.sequence, Where.ANYWHERE.value if self._force_anywhere else Where.BACK.value, ) def _kmer_finder(self): return self._make_kmer_finder( self.sequence, back_adapter=True, front_adapter=self._force_anywhere ) def match_to(self, sequence: str): """ Attempt to match this adapter to the given read. Return a Match instance if a match was found; return None if no match was found given the matching criteria (minimum overlap length, maximum error rate). """ if not self.kmer_finder.kmers_present(sequence): return None alignment: Optional[Tuple[int, int, int, int, int, int]] = self.aligner.locate( sequence ) if self._debug: print(self.aligner.dpmatrix) # pragma: no cover if alignment is None: return None return RemoveAfterMatch(*alignment, adapter=self, sequence=sequence) def spec(self) -> str: return f"{self.sequence}" def create_statistics(self) -> BackAdapterStatistics: return BackAdapterStatistics(self) class AnywhereAdapter(SingleAdapter): """ An adapter that can be 5' or 3'. If a match involves the first base of the read, it is assumed to be a 5' adapter and a 3' otherwise. """ description = "variable 5'/3'" def descriptive_identifier(self) -> str: return "anywhere" def _aligner(self): return self._make_aligner(self.sequence, Where.ANYWHERE.value) def _kmer_finder(self): return self._make_kmer_finder( self.sequence, back_adapter=True, front_adapter=True ) def match_to(self, sequence: str): """ Attempt to match this adapter to the given string. Return a Match instance if a match was found; return None if no match was found given the matching criteria (minimum overlap length, maximum error rate). """ if not self.kmer_finder.kmers_present(sequence): return None alignment = self.aligner.locate(sequence.upper()) if self._debug: print(self.aligner.dpmatrix) if alignment is None: return None # guess: if alignment starts at pos 0, it’s a 5' adapter if alignment[2] == 0: # index 2 is rstart match = RemoveBeforeMatch(*alignment, adapter=self, sequence=sequence) # type: ignore else: match = RemoveAfterMatch(*alignment, adapter=self, sequence=sequence) # type: ignore return match def spec(self) -> str: return f"...{self.sequence}..." 
def create_statistics(self) -> AnywhereAdapterStatistics: return AnywhereAdapterStatistics(self) class NonInternalFrontAdapter(FrontAdapter): """A non-internal 5' adapter""" description = "non-internal 5'" def descriptive_identifier(self) -> str: return "noninternal_five_prime" def _aligner(self): return self._make_aligner(self.sequence, Where.FRONT_NOT_INTERNAL.value) def _kmer_finder(self): return self._make_kmer_finder( self.sequence, front_adapter=True, back_adapter=self._force_anywhere, internal=False, ) def match_to(self, sequence: str): if not self.kmer_finder.kmers_present(sequence): return None # The locate function takes care of uppercasing the sequence alignment = self.aligner.locate(sequence) if self._debug: try: print(self.aligner.dpmatrix) except AttributeError: pass if alignment is None: return None return RemoveBeforeMatch(*alignment, adapter=self, sequence=sequence) # type: ignore def spec(self) -> str: return f"X{self.sequence}..." class NonInternalBackAdapter(BackAdapter): """A non-internal 3' adapter""" description = "non-internal 3'" def descriptive_identifier(self) -> str: return "noninternal_three_prime" def _aligner(self): return self._make_aligner(self.sequence, Where.BACK_NOT_INTERNAL.value) def _kmer_finder(self): return self._make_kmer_finder( self.sequence, back_adapter=True, front_adapter=self._force_anywhere, internal=False, ) def match_to(self, sequence: str): if not self.kmer_finder.kmers_present(sequence): return None # The locate function takes care of uppercasing the sequence alignment = self.aligner.locate(sequence) if self._debug: try: print(self.aligner.dpmatrix) # pragma: no cover except AttributeError: pass if alignment is None: return None return RemoveAfterMatch(*alignment, adapter=self, sequence=sequence) # type: ignore def spec(self) -> str: return f"{self.sequence}X" class PrefixAdapter(NonInternalFrontAdapter): """An anchored 5' adapter""" description = "anchored 5'" allows_partial_matches = False def __init__(self, sequence: str, *args, **kwargs): kwargs["min_overlap"] = len(sequence) super().__init__(sequence, *args, **kwargs) def descriptive_identifier(self) -> str: return "anchored_five_prime" def _aligner(self): if not self.indels: # TODO or if error rate allows 0 errors anyway return PrefixComparer( self.sequence, self.max_error_rate, wildcard_ref=self.adapter_wildcards, wildcard_query=self.read_wildcards, min_overlap=self.min_overlap, ) else: return self._make_aligner(self.sequence, Where.PREFIX.value) def _kmer_finder(self): if isinstance(self.aligner, PrefixComparer): # Prefix comparer does not create a dynamic programming matrix # so the heuristic will be slow and unnecessary. return MockKmerFinder() else: return super()._kmer_finder() def spec(self) -> str: return f"^{self.sequence}..." 
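
# Overview of the spec() notation used by the adapter classes above and
# below (illustrative; the strings follow directly from the spec() methods):
#   FrontAdapter("ACGT").spec()            -> "ACGT..."
#   RightmostFrontAdapter("ACGT").spec()   -> "ACGT...;rightmost"
#   NonInternalFrontAdapter("ACGT").spec() -> "XACGT..."
#   PrefixAdapter("ACGT").spec()           -> "^ACGT..."
#   BackAdapter("ACGT").spec()             -> "ACGT"
#   NonInternalBackAdapter("ACGT").spec()  -> "ACGTX"
#   SuffixAdapter("ACGT").spec()           -> "ACGT$"
#   AnywhereAdapter("ACGT").spec()         -> "...ACGT..."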
class SuffixAdapter(NonInternalBackAdapter):
    """An anchored 3' adapter"""

    description = "anchored 3'"
    allows_partial_matches = False

    def __init__(self, sequence: str, *args, **kwargs):
        kwargs["min_overlap"] = len(sequence)
        super().__init__(sequence, *args, **kwargs)

    def descriptive_identifier(self) -> str:
        return "anchored_three_prime"

    def _aligner(self):
        if not self.indels:  # TODO or if error rate allows 0 errors anyway
            return SuffixComparer(
                self.sequence,
                self.max_error_rate,
                wildcard_ref=self.adapter_wildcards,
                wildcard_query=self.read_wildcards,
                min_overlap=self.min_overlap,
            )
        else:
            return self._make_aligner(self.sequence, Where.SUFFIX.value)

    def _kmer_finder(self):
        if isinstance(self.aligner, SuffixComparer):
            # Suffix comparer does not create a dynamic programming matrix
            # so the heuristic will be slow and unnecessary.
            return MockKmerFinder()
        else:
            return super()._kmer_finder()

    def spec(self) -> str:
        return f"{self.sequence}$"


class LinkedMatch(Match):
    """
    Represent a match of a LinkedAdapter
    """

    def __init__(
        self,
        front_match: RemoveBeforeMatch,
        back_match: RemoveAfterMatch,
        adapter: "LinkedAdapter",
    ):
        assert front_match is not None or back_match is not None
        self.front_match: RemoveBeforeMatch = front_match
        self.back_match: RemoveAfterMatch = back_match
        self.adapter: LinkedAdapter = adapter

    def __repr__(self):
        return "<LinkedMatch(front_match={!r}, back_match={!r}, adapter={})>".format(
            self.front_match, self.back_match, self.adapter
        )

    @property
    def score(self):
        """Number of matching bases"""
        s = 0
        if self.front_match is not None:
            s += self.front_match.score
        if self.back_match is not None:
            s += self.back_match.score
        return s

    @property
    def errors(self):
        e = 0
        if self.front_match is not None:
            e += self.front_match.errors
        if self.back_match is not None:
            e += self.back_match.errors
        return e

    def trimmed(self, read):
        if self.front_match:
            read = self.front_match.trimmed(read)
        if self.back_match:
            read = self.back_match.trimmed(read)
        return read

    def remainder_interval(self) -> Tuple[int, int]:
        matches = [
            match for match in [self.front_match, self.back_match] if match is not None
        ]
        return remainder(matches)

    def retained_adapter_interval(self) -> Tuple[int, int]:
        if self.front_match:
            start = self.front_match.rstart
            offset = self.front_match.rstop
        else:
            start = offset = 0
        if self.back_match:
            end = self.back_match.rstop + offset
        else:
            end = len(self.front_match.sequence)
        return start, end

    def get_info_records(self, read) -> List[List]:
        records = []
        for match, namesuffix in [
            (self.front_match, ";1"),
            (self.back_match, ";2"),
        ]:
            if match is None:
                continue
            record = match.get_info_records(read)[0]
            record[7] = (
                "none" if self.adapter.name is None else self.adapter.name
            ) + namesuffix
            records.append(record)
            read = match.trimmed(read)
        return records

    def match_sequence(self):
        return (
            (self.front_match.match_sequence() if self.front_match else "")
            + ","
            + (self.back_match.match_sequence() if self.back_match else "")
        )


class LinkedAdapter(Adapter):
    """A 5' adapter combined with a 3' adapter"""

    description = "linked"

    def __init__(
        self,
        front_adapter: SingleAdapter,
        back_adapter: SingleAdapter,
        front_required: bool,
        back_required: bool,
        name: Optional[str],
    ):
        super().__init__(name)
        self.front_required = front_required
        self.back_required = back_required

        # The following attributes are needed for the report
        self.where = "linked"
        self.name: str = _generate_adapter_name() if name is None else name
        self.front_adapter = front_adapter
        self.front_adapter.name = self.name
        self.back_adapter = back_adapter

    def __repr__(self):
        return
f"{self.__class__.__name__}(front_adapter={self.front_adapter}, back_adapter={self.back_adapter})" def descriptive_identifier(self) -> str: return "linked" def enable_debug(self): self.front_adapter.enable_debug() self.back_adapter.enable_debug() def match_to(self, sequence: str) -> Optional[LinkedMatch]: """ Match the two linked adapters against a string """ front_match = self.front_adapter.match_to(sequence) if self.front_required and front_match is None: return None if front_match is not None: sequence = sequence[front_match.trim_slice()] back_match = self.back_adapter.match_to(sequence) if back_match is None and (self.back_required or front_match is None): return None return LinkedMatch(front_match, back_match, self) def create_statistics(self) -> LinkedAdapterStatistics: return LinkedAdapterStatistics( self, front=self.front_adapter, back=self.back_adapter ) @property def sequence(self): return self.front_adapter.sequence + "..." + self.back_adapter.sequence @property def remove(self): return None def spec(self) -> str: return f"{self.front_adapter.spec()}...{self.back_adapter.spec()}" class MultipleAdapters(Matchable): """ Represent multiple adapters at once """ def __init__(self, adapters: Sequence[Matchable]): super().__init__(name="multiple_adapters") self._adapters = adapters def enable_debug(self): for a in self._adapters: a.enable_debug() def __getitem__(self, item): return self._adapters[item] def __len__(self): return len(self._adapters) def match_to(self, sequence: str) -> Optional[SingleMatch]: """ Find the adapter that best matches the sequence. Return either a Match instance or None if there are no matches. """ best_match = None for adapter in self._adapters: match = adapter.match_to(sequence) if match is None: continue # the score determines which adapter fits best if ( best_match is None or match.score > best_match.score or ( match.score == best_match.score and match.errors < best_match.errors ) ): best_match = match return best_match class AdapterIndex: """ Index of multiple adapters Represent multiple adapters of the same type at once and use an index data structure to speed up matching. This is faster than iterating over multiple adapters. There are quite a few restrictions: - the error rate allows at most 2 mismatches - wildcards in the adapter are not allowed - wildcards in the read are not allowed Use the is_acceptable() method to check individual adapters. 
""" AdapterIndexDict = Dict[str, Tuple[SingleAdapter, int, int]] def __init__(self, adapters, prefix: bool): """All given adapters must be of the same type""" if not adapters: raise ValueError("Adapter list is empty") for adapter in adapters: self._accept(adapter, prefix) self._adapters = adapters self._lengths, self._index = self._make_index() logger.debug( "String lengths in the index: %s", sorted(self._lengths, reverse=True) ) if len(self._lengths) == 1: self._length = self._lengths[0] self.match_to = self._match_to_one_length else: self.match_to = self._match_to_multiple_lengths if prefix: self._make_affix = self._make_prefix self._make_match = self._make_prefix_match else: self._make_affix = self._make_suffix self._make_match = self._make_suffix_match def __repr__(self): return f"{self.__class__.__name__}(adapters={self._adapters!r})" @staticmethod def _make_suffix(s, n): return s[-n:] @staticmethod def _make_prefix(s, n): return s[:n] @staticmethod def _make_prefix_match(adapter, length, score, errors, sequence): return RemoveBeforeMatch( astart=0, astop=len(adapter.sequence), rstart=0, rstop=length, score=score, errors=errors, adapter=adapter, sequence=sequence, ) @staticmethod def _make_suffix_match(adapter, length, score, errors, sequence): return RemoveAfterMatch( astart=0, astop=len(adapter.sequence), rstart=len(sequence) - length, rstop=len(sequence), score=score, errors=errors, adapter=adapter, sequence=sequence, ) @classmethod def _accept(cls, adapter: SingleAdapter, prefix: bool): """Raise a ValueError if the adapter is not acceptable""" if prefix and not isinstance(adapter, PrefixAdapter): raise ValueError("Only 5' anchored adapters are allowed") elif not prefix and not isinstance(adapter, SuffixAdapter): raise ValueError("Only 3' anchored adapters are allowed") if adapter.read_wildcards: raise ValueError("Wildcards in the read not supported") if adapter.adapter_wildcards: raise ValueError("Wildcards in the adapter not supported") k = int(len(adapter) * adapter.max_error_rate) if k > 2: raise ValueError("Error rate too high") @classmethod def is_acceptable(cls, adapter: SingleAdapter, prefix: bool): """ Return whether this adapter is acceptable for being used in an index Adapters are not acceptable if they allow wildcards, allow too many errors, or would lead to a very large index. 
""" try: cls._accept(adapter, prefix) except ValueError: return False return True def _make_index(self) -> Tuple[List[int], "AdapterIndexDict"]: start_time = time.time() logger.info("Building index of %s adapters ...", len(self._adapters)) index: Dict[str, Tuple[SingleAdapter, int, int]] = dict() lengths = set() has_warned = False for adapter in self._adapters: sequence = adapter.sequence k = int(adapter.max_error_rate * len(sequence)) if adapter.indels: for s, errors, matches in edit_environment(sequence, k): if s in index: other_adapter, other_errors, other_matches = index[s] if matches < other_matches: continue if other_matches == matches and not has_warned: self._warn_similar(adapter, other_adapter, k, s, matches) has_warned = True index[s] = (adapter, errors, matches) lengths.add(len(s)) else: n = len(sequence) for errors in range(k + 1): for s in hamming_sphere(sequence, errors): matches = n - errors if s in index: other_adapter, other_errors, other_matches = index[s] if matches < other_matches: continue if other_matches == matches and not has_warned: self._warn_similar( adapter, other_adapter, k, s, matches ) has_warned = True index[s] = (adapter, errors, matches) lengths.add(n) elapsed = time.time() - start_time logger.info( "Built an index containing %s strings in %.1f s.", len(index), elapsed ) return sorted(lengths, reverse=True), index @staticmethod def _warn_similar(adapter, other_adapter, k, s, matches): logger.warning( "Adapters %s %r and %s %r are very similar. At %s allowed errors, " "the sequence %r cannot be assigned uniquely because the number of " "matches is %s compared to both adapters.", other_adapter.name, other_adapter.sequence, adapter.name, adapter.sequence, k, s, matches, ) def _match_to_one_length(self, sequence: str): """ Match a query string against all adapters and return a Match that represents the best match or None if no match was found """ affix = self._make_affix(sequence.upper(), self._length) if "N" in affix: result = self._lookup_with_n(affix) if result is None: return None adapter, e, m = result else: try: adapter, e, m = self._index[affix] except KeyError: return None return self._make_match(adapter, self._length, m, e, sequence) def _match_to_multiple_lengths(self, sequence: str): """ Match the adapters against a string and return a Match that represents the best match or None if no match was found """ affix = sequence.upper() # Check all the prefixes or suffixes (affixes) that could match best_adapter: Optional[SingleAdapter] = None best_length = 0 best_m = -1 best_e = 1000 # Check successively shorter affixes for length in self._lengths: if length < best_m: # No chance of getting the same or a higher number of matches, so we can stop early break affix = self._make_affix(affix, length) if "N" in affix: result = self._lookup_with_n(affix) if result is None: continue adapter, e, m = result else: try: adapter, e, m = self._index[affix] except KeyError: continue if m > best_m or (m == best_m and e < best_e): # TODO this could be made to work: # assert best_m == -1 best_adapter = adapter best_e = e best_m = m best_length = length if best_m == -1: return None else: return self._make_match(best_adapter, best_length, best_m, best_e, sequence) def _lookup_with_n(self, affix): # N wildcards need to be counted as mismatches (read wildcards aren’t allowed). # We can thus look up an affix where we replace N with an arbitrary nucleotide. 
        affix_without_n = affix.replace("N", "A")
        try:
            result = self._index[affix_without_n]
        except KeyError:
            return None
        # The looked up number of matches and errors is too low if
        # the adapter actually has an A where the N is in the query.
        # Fix this by re-doing the alignment.
        adapter = result[0]
        match = adapter.match_to(affix)
        if match is None:
            return None
        return adapter, match.errors, match.score


class IndexedPrefixAdapters(Matchable):
    def __init__(self, adapters):
        super().__init__(name="indexed_prefix_adapters")
        self._index = AdapterIndex(adapters, prefix=True)
        self.match_to = self._index.match_to

    def match_to(self, sequence: str):
        pass


class IndexedSuffixAdapters(Matchable):
    def __init__(self, adapters):
        super().__init__(name="indexed_suffix_adapters")
        self._index = AdapterIndex(adapters, prefix=False)
        self.match_to = self._index.match_to

    def match_to(self, sequence: str):
        pass


def warn_duplicate_adapters(adapters):
    d = dict()
    for adapter in adapters:
        key = (adapter.__class__, adapter.sequence)
        if key in d:
            logger.warning(
                "Adapter %r (%s) was specified multiple times! "
                "Please make sure that this is what you want.",
                adapter.sequence,
                adapter.description,
            )
        d[key] = adapter.name


def remainder(matches: Sequence[Match]) -> Tuple[int, int]:
    """
    Determine which section of the read would not be trimmed. Return a tuple
    (start, stop) that gives the interval of the untrimmed part relative to the
    original read.

    matches must be non-empty
    """
    if not matches:
        raise ValueError("matches must not be empty")
    start = 0
    for match in matches:
        match_start, match_stop = match.remainder_interval()
        start += match_start
        length = match_stop - match_start
    return (start, start + length)
cutadapt-4.7/src/cutadapt/align.py000066400000000000000000000133001457457704700172540ustar00rootroot00000000000000__all__ = [
    "EndSkip",
    "Aligner",
    "PrefixComparer",
    "SuffixComparer",
    "hamming_sphere",
    "hamming_environment",
    "edit_environment",
    "edit_distance",
]

from enum import IntFlag
from typing import Iterator, Tuple

from cutadapt._align import (
    Aligner,
    PrefixComparer,
    SuffixComparer,
    hamming_sphere,
    edit_environment,
)


class EndSkip(IntFlag):
    """
    Flags for the Aligner that indicate which ends of reference or query
    may be skipped at no cost.

    Setting all four flags at the same time results in semiglobal alignment.
    """

    REFERENCE_START = 1  # a prefix of the reference may be skipped at no cost
    QUERY_START = 2  # a prefix of the query may be skipped at no cost
    REFERENCE_END = 4  # a suffix of the reference may be skipped at no cost
    QUERY_STOP = 8  # a suffix of the query may be skipped at no cost
    SEMIGLOBAL = 15  # all of the above


def edit_distance(s: str, t: str) -> int:
    """
    Return the edit distance between the strings s and t.
    The edit distance is the sum of the numbers of insertions, deletions,
    and mismatches that is minimally necessary to transform one string
    into the other.
    """
    m = len(s)  # index i
    n = len(t)  # index j
    costs = list(range(m + 1))
    for j in range(1, n + 1):
        prev = costs[0]
        costs[0] += 1
        for i in range(1, m + 1):
            match = int(s[i - 1] == t[j - 1])
            c = min(
                prev + 1 - match,
                costs[i] + 1,
                costs[i - 1] + 1,
            )
            prev = costs[i]
            costs[i] = c
    return costs[-1]


def hamming_environment(s: str, k: int) -> Iterator[Tuple[str, int, int]]:
    """
    Find all strings t for which the hamming distance between s and t is at most k,
    assuming the alphabet is A, C, G, T.

    Yield tuples (t, e, m), where e is the hamming distance between s and t and
    m is the number of matches (equal to len(t) - e).
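
    For example, hamming_environment("AC", 1) yields ("AC", 0, 2) and then
    the six strings at Hamming distance exactly 1 (three substitutions at
    each of the two positions), such as ("CC", 1, 1) and ("AT", 1, 1).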
""" n = len(s) for e in range(k + 1): for t in hamming_sphere(s, e): yield t, e, n - e def naive_edit_environment(s: str, k: int) -> Iterator[str]: """ Apply all possible edits up to edit distance k to string s and yield the resulting strings. A string may be returned more than once. """ yield s if k == 0: return for s in naive_edit_environment(s, k - 1): n = len(s) for ch in "ACGT": for i in range(n): prefix = s[:i] + ch yield prefix + s[i:] # insertion yield prefix + s[i + 1 :] # substitution yield s + ch # insertion into final position # all deletions for i in range(n): yield s[:i] + s[i + 1 :] def py_edit_environment(s: str, k: int) -> Iterator[Tuple[str, int, int]]: """ Find all strings t for which the edit distance between s and t is at most k, assuming the alphabet is A, C, G, T. Yield tuples (t, e, score), where e is the edit distance between s and t and score is the score of the optimal alignment. """ rate = k / len(s) if s else 0 aligner = Aligner(s, max_error_rate=rate, flags=0, min_overlap=len(s)) seen = set() for t in naive_edit_environment(s, k): if t in seen: continue seen.add(t) result = aligner.locate(t) score, errors = result[-2:] # type: ignore yield t, errors, score def slow_edit_environment(s: str, k: int) -> Iterator[Tuple[str, int, int]]: """ Find all strings t for which the edit distance between s and t is at most k, assuming the alphabet is A, C, G, T. Yield tuples (t, e, m), where e is the edit distance between s and t and m is the number of matches in the optimal alignment. This is slow and only used in testing. """ n = len(s) alphabet = "TGCA" work_stack = [ ( "", list(range(n + 1)), [0] * (n + 1), ) ] while work_stack: # t is the current prefix # costs is a row at index len(t) in the DP matrix # matches is a row in the corresponding matrix of the no. of matches t, costs, matches = work_stack.pop() # The row is the last row of the DP matrix for aligning t against s i = len(t) if costs[-1] <= k: # The costs of an optimal alignment of t against s are at most k, # so t is within the edit environment. yield t, costs[-1], matches[-1] if i == n + k: # Last row reached continue # Runtime heuristic: The entries in the DP matrix cannot get lower # in subsequent rows, so don’t try longer suffixs if all entries are # greater than k. 
if min(costs) > k: continue # compute next row in DP matrix for all characters of the alphabet for ch in alphabet: # create a new DP matrix row for each character of the alphabet next_costs = [0] * (n + 1) next_costs[0] = len(t) + 1 next_matches = [0] * (n + 1) for j in range(1, n + 1): match = 0 if s[j - 1] == ch else 1 assert j > 0 diag = costs[j - 1] + match left = next_costs[j - 1] + 1 up = costs[j] + 1 if diag <= left and diag <= up: c, m = diag, matches[j - 1] + (1 - match) elif left <= up: c, m = left, next_matches[j - 1] else: c, m = up, matches[j] next_costs[j] = c next_matches[j] = m work_stack.append((t + ch, next_costs, next_matches)) cutadapt-4.7/src/cutadapt/cli.py000066400000000000000000001512061457457704700167370ustar00rootroot00000000000000#!/usr/bin/env python # # Copyright (c) 2010 Marcel Martin and contributors # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. """ cutadapt version {version} Copyright (C) 2010 Marcel Martin and contributors Cutadapt removes adapter sequences from high-throughput sequencing reads. Usage: cutadapt -a ADAPTER [options] [-o output.fastq] input.fastq For paired-end reads: cutadapt -a ADAPT1 -A ADAPT2 [options] -o out1.fastq -p out2.fastq in1.fastq in2.fastq Replace "ADAPTER" with the actual sequence of your 3' adapter. IUPAC wildcard characters are supported. All reads from input.fastq will be written to output.fastq with the adapter sequence removed. Adapter matching is error-tolerant. Multiple adapter sequences can be given (use further -a options), but only the best-matching adapter will be removed. Input may also be in FASTA format. Compressed input and output is supported and auto-detected from the file name (.gz, .xz, .bz2). Use the file name '-' for standard input/output. Without the -o option, output is sent to standard output. Citation: Marcel Martin. Cutadapt removes adapter sequences from high-throughput sequencing reads. EMBnet.Journal, 17(1):10-12, May 2011. http://dx.doi.org/10.14806/ej.17.1.200 Run "cutadapt --help" to see all command-line options. See https://cutadapt.readthedocs.io/ for full documentation. 
""" import copy import sys import time import shutil import logging import platform import itertools import multiprocessing from pathlib import Path from typing import Tuple, Optional, Sequence, List, Any, Iterator, Union, Dict from argparse import ArgumentParser, SUPPRESS, HelpFormatter import dnaio import xopen as xopen from cutadapt import __version__ from cutadapt.adapters import warn_duplicate_adapters, Adapter, InvalidCharacter from cutadapt.json import OneLine, dumps as json_dumps from cutadapt.parser import make_adapters_from_specifications from cutadapt.modifiers import ( SingleEndModifier, LengthTagModifier, SuffixRemover, PrefixSuffixAdder, ZeroCapper, QualityTrimmer, UnconditionalCutter, NEndTrimmer, AdapterCutter, PairedAdapterCutterError, PairedAdapterCutter, NextseqQualityTrimmer, Shortener, ReverseComplementer, PairedEndRenamer, Renamer, InvalidTemplate, PolyATrimmer, PairedReverseComplementer, ) from cutadapt.predicates import ( TooShort, TooLong, TooManyN, TooManyExpectedErrors, TooHighAverageErrorRate, CasavaFiltered, IsTrimmed, IsUntrimmed, ) from cutadapt.report import full_report, minimal_report, Statistics from cutadapt.pipeline import SingleEndPipeline, PairedEndPipeline from cutadapt.runners import make_runner from cutadapt.files import InputPaths, OutputFiles, FileOpener from cutadapt.steps import ( InfoFileWriter, PairedSingleEndStep, RestFileWriter, WildcardFileWriter, SingleEndFilter, PairedEndFilter, Demultiplexer, CombinatorialDemultiplexer, PairedDemultiplexer, PairedEndSink, SingleEndSink, ) from cutadapt.utils import available_cpu_count, Progress, DummyProgress from cutadapt.log import setup_logging, REPORT from cutadapt.qualtrim import HasNoQualities logger = logging.getLogger() class CutadaptArgumentParser(ArgumentParser): """ This ArgumentParser customizes two things: - The usage message is not prefixed with 'usage:' - A brief message is shown on errors, not full usage """ class CustomUsageHelpFormatter(HelpFormatter): def __init__(self, *args, **kwargs): kwargs["width"] = min(24 + 80, shutil.get_terminal_size().columns) super().__init__(*args, **kwargs) def add_usage(self, usage, actions, groups, prefix=None): if usage is not SUPPRESS: # pragma: no cover args = usage, actions, groups, "" self._add_item(self._format_usage, args) def __init__(self, *args, **kwargs): kwargs["formatter_class"] = self.CustomUsageHelpFormatter kwargs["usage"] = kwargs["usage"].replace("{version}", __version__) super().__init__(*args, **kwargs) def error(self, message): """ If you override this in a subclass, it should not return -- it should either exit or raise an exception. """ print('Run "cutadapt --help" to see command-line options.', file=sys.stderr) print( "See https://cutadapt.readthedocs.io/ for full documentation.", file=sys.stderr, ) self.exit(2, f"\n{self.prog}: error: {message}\n") class CommandLineError(Exception): pass # fmt: off def get_argument_parser() -> ArgumentParser: # noqa: E131 parser = CutadaptArgumentParser(usage=__doc__, add_help=False) group = parser.add_argument_group("Options") group.add_argument("-h", "--help", action="help", help="Show this help message and exit") group.add_argument("--version", action="version", help="Show version number and exit", version=__version__) group.add_argument("--debug", action="count", default=0, help="Print debug log. 
Use twice to also print DP matrices") group.add_argument("--profile", action="store_true", default=False, help=SUPPRESS) group.add_argument("-j", "--cores", type=int, default=1, help='Number of CPU cores to use. Use 0 to auto-detect. Default: %(default)s') # Hidden options # GC content as a percentage group.add_argument("--gc-content", type=float, default=50, help=SUPPRESS) # Buffer size for the reader process when running in parallel group.add_argument("--buffer-size", type=int, default=4000000, help=SUPPRESS) # Compression level for gzipped output files. Not exposed since we have -Z group.add_argument("--compression-level", type=int, default=5, help=SUPPRESS) # Disable adapter index creation group.add_argument("--no-index", dest="index", default=True, action="store_false", help=SUPPRESS) group = parser.add_argument_group("Finding adapters", description="Parameters -a, -g, -b specify adapters to be removed from " "each read (or from R1 if data is paired-end. " "If specified multiple times, only the best matching adapter is " "trimmed (but see the --times option). Use notation " "'file:FILE' to read adapter sequences from a FASTA file.") group.add_argument("-a", "--adapter", type=lambda x: ("back", x), action="append", default=[], metavar="ADAPTER", dest="adapters", help="Sequence of an adapter ligated to the 3' end (paired data: of the " "first read). The adapter and subsequent bases are trimmed. If a " "'$' character is appended ('anchoring'), the adapter is only " "found if it is a suffix of the read.") group.add_argument("-g", "--front", type=lambda x: ("front", x), action="append", default=[], metavar="ADAPTER", dest="adapters", help="Sequence of an adapter ligated to the 5' end (paired data: of the " "first read). The adapter and any preceding bases are trimmed. " "Partial matches at the 5' end are allowed. If a '^' character is " "prepended ('anchoring'), the adapter is only found if it is a " "prefix of the read.") group.add_argument("-b", "--anywhere", type=lambda x: ("anywhere", x), action="append", default=[], metavar="ADAPTER", dest="adapters", help="Sequence of an adapter that may be ligated to the 5' or 3' end " "(paired data: of the first read). Both types of matches as " "described under -a and -g are allowed. If the first base of the " "read is part of the match, the behavior is as with -g, otherwise " "as with -a. This option is mostly for rescuing failed library " "preparations - do not use if you know which end your adapter was " "ligated to!") group.add_argument("-e", "--error-rate", "--errors", type=float, metavar="E", default=0.1, help="Maximum allowed error rate (if 0 <= E < 1), or absolute number of errors " "for full-length adapter match (if E is an integer >= 1). Error rate = " "no. of errors divided by length of matching region. Default: %(default)s (10%%)") group.add_argument("--no-indels", action='store_false', dest='indels', default=True, help="Allow only mismatches in alignments. " "Default: allow both mismatches and indels") group.add_argument("-n", "--times", type=int, metavar="COUNT", default=1, help="Remove up to COUNT adapters from each read. Default: %(default)s") group.add_argument("-O", "--overlap", type=int, metavar="MINLENGTH", default=3, help="Require MINLENGTH overlap between read and adapter for an adapter " "to be found. Default: %(default)s") group.add_argument("--match-read-wildcards", action="store_true", default=False, help="Interpret IUPAC wildcards in reads. 
Default: %(default)s") group.add_argument("-N", "--no-match-adapter-wildcards", action="store_false", default=True, dest="match_adapter_wildcards", help="Do not interpret IUPAC wildcards in adapters.") group.add_argument("--action", choices=("trim", "retain", "mask", "lowercase", "none"), default="trim", help="What to do if a match was found. " "trim: trim adapter and up- or downstream sequence; " "retain: trim, but retain adapter; " "mask: replace with 'N' characters; " "lowercase: convert to lowercase; " "none: leave unchanged. Default: %(default)s") group.add_argument("--rc", "--revcomp", dest="reverse_complement", default=False, action="store_true", help="Check both the read and its reverse complement for adapter matches. If " "match is on reverse-complemented version, output that one. " "Default: check only read") group.add_argument("--no-trim", dest='action', action='store_const', const='none', help=SUPPRESS) # Deprecated, use --action=none group.add_argument("--mask-adapter", dest='action', action='store_const', const='mask', help=SUPPRESS) # Deprecated, use --action=mask group = parser.add_argument_group("Additional read modifications") group.add_argument("-u", "--cut", action='append', default=[], type=int, metavar="LEN", help="Remove LEN bases from each read (or R1 if paired; use -U option for R2). " "If LEN is positive, remove bases from the beginning. " "If LEN is negative, remove bases from the end. " "Can be used twice if LENs have different signs. " "Applied *before* adapter trimming.") group.add_argument("--nextseq-trim", type=int, default=None, metavar="3'CUTOFF", help="NextSeq-specific quality trimming (each read). Trims also dark " "cycles appearing as high-quality G bases.") group.add_argument("-q", "--quality-cutoff", default=None, metavar="[5'CUTOFF,]3'CUTOFF", help="Trim low-quality bases from 5' and/or 3' ends of each read before " "adapter removal. Applied to both reads if data is paired. If one " "value is given, only the 3' end is trimmed. If two " "comma-separated cutoffs are given, the 5' end is trimmed with " "the first cutoff, the 3' end with the second.") group.add_argument("--quality-base", type=int, default=33, metavar='N', help="Assume that quality values in FASTQ are encoded as ascii(quality " "+ N). This needs to be set to 64 for some old Illumina " "FASTQ files. Default: %(default)s") group.add_argument("--poly-a", action="store_true", default=False, help="Trim poly-A tails") group.add_argument("--length", "-l", type=int, default=None, metavar="LENGTH", help="Shorten reads to LENGTH. Positive values remove bases at the end " "while negative ones remove bases at the beginning. This and the " "following modifications are applied after adapter trimming.") group.add_argument("--trim-n", action='store_true', default=False, help="Trim N's on ends of reads.") group.add_argument("--length-tag", metavar="TAG", help="Search for TAG followed by a decimal number in the description " "field of the read. Replace the decimal number with the correct " "length of the trimmed read. For example, use --length-tag 'length=' " "to correct fields like 'length=123'.") group.add_argument("--strip-suffix", action='append', default=[], help="Remove this suffix from read names if present. Can be given multiple times.") group.add_argument("-x", "--prefix", default='', help="Add this prefix to read names. 
Use {name} to insert the name of the matching " "adapter.") group.add_argument("-y", "--suffix", default='', help="Add this suffix to read names; can also include {name}") group.add_argument("--rename", metavar="TEMPLATE", help="Rename reads using TEMPLATE containing variables such as {id}, {adapter_name} " "etc. (see documentation)") group.add_argument("--zero-cap", "-z", action='store_true', default=False, help="Change negative quality values to zero.") group = parser.add_argument_group("Filtering of processed reads", description="Filters are applied after above read modifications. " "Paired-end reads are always discarded pairwise (see also " "--pair-filter).") group.add_argument("-m", "--minimum-length", default=None, metavar="LEN[:LEN2]", help="Discard reads shorter than LEN. Default: 0") group.add_argument("-M", "--maximum-length", default=None, metavar="LEN[:LEN2]", help="Discard reads longer than LEN. Default: no limit") group.add_argument("--max-n", type=float, default=None, metavar="COUNT", help="Discard reads with more than COUNT 'N' bases. If COUNT is a number " "between 0 and 1, it is interpreted as a fraction of the read length.") group.add_argument("--max-expected-errors", "--max-ee", type=float, default=None, metavar="ERRORS", help="Discard reads whose expected number of errors (computed " "from quality values) exceeds ERRORS.") group.add_argument("--max-average-error-rate", "--max-aer", type=float, default=None, metavar="ERROR_RATE", help="as --max-expected-errors (see above), but divided by length to " "account for reads of varying length.") group.add_argument("--discard-trimmed", "--discard", action='store_true', default=False, help="Discard reads that contain an adapter. Use also -O to avoid " "discarding too many randomly matching reads.") group.add_argument("--discard-untrimmed", "--trimmed-only", action='store_true', default=False, help="Discard reads that do not contain an adapter.") group.add_argument("--discard-casava", action='store_true', default=False, help="Discard reads that did not pass CASAVA filtering (header has :Y:).") group = parser.add_argument_group("Output") group.add_argument("--quiet", default=False, action='store_true', help="Print only error messages.") group.add_argument("--report", choices=('full', 'minimal'), default=None, help="Which type of report to print: 'full' or 'minimal'. Default: full") group.add_argument("--json", metavar="FILE", help="Dump report in JSON format to FILE") group.add_argument("-o", "--output", metavar="FILE", help="Write trimmed reads to FILE. FASTQ or FASTA format is chosen " "depending on input. Summary report is sent to standard output. " "Use '{name}' for demultiplexing (see docs). " "Default: write to standard output") group.add_argument("--fasta", default=False, action='store_true', help="Output FASTA to standard output even on FASTQ input.") group.add_argument("-Z", action="store_const", const=1, dest="compression_level", help="Use compression level 1 for gzipped output files (faster, but uses more space)") group.add_argument("--info-file", metavar="FILE", help="Write information about each read and its adapter matches into FILE. " "See the documentation for the file format.") group.add_argument("-r", "--rest-file", metavar="FILE", help="When the adapter matches in the middle of a read, write the " "rest (after the adapter) to FILE.") group.add_argument("--wildcard-file", metavar="FILE", help="When the adapter has N wildcard bases, write adapter bases " "matching wildcard positions to FILE. 
(Inaccurate with indels.)")
    group.add_argument("--too-short-output", metavar="FILE",
        help="Write reads that are too short (according to length specified by "
            "-m) to FILE. Default: discard reads")
    group.add_argument("--too-long-output", metavar="FILE",
        help="Write reads that are too long (according to length specified by "
            "-M) to FILE. Default: discard reads")
    group.add_argument("--untrimmed-output", default=None, metavar="FILE",
        help="Write reads that do not contain any adapter to FILE. Default: "
            "output to same file as trimmed reads")

    group = parser.add_argument_group("Paired-end options", description="The "
        "-A/-G/-B/-U/-Q options work like their lowercase counterparts, but "
        "are applied to R2 (second read in pair)")
    group.add_argument("-A", type=lambda x: ("back", x), dest='adapters2',
        action='append', default=[], metavar='ADAPTER',
        help="3' adapter to be removed from R2")
    group.add_argument("-G", type=lambda x: ("front", x), dest='adapters2',
        action='append', default=[], metavar='ADAPTER',
        help="5' adapter to be removed from R2")
    group.add_argument("-B", type=lambda x: ("anywhere", x), dest='adapters2',
        action='append', default=[], metavar='ADAPTER',
        help="5'/3' adapter to be removed from R2")
    group.add_argument("-U", dest='cut2', action='append', default=[], type=int,
        metavar="LENGTH", help="Remove LENGTH bases from R2")
    group.add_argument("-Q", dest="quality_cutoff2", default=None, metavar="[5'CUTOFF,]3'CUTOFF",
        help="Quality-trimming cutoff for R2. Default: same as for R1")
    group.add_argument("-p", "--paired-output", metavar="FILE",
        help="Write R2 to FILE.")
    group.add_argument("--pair-adapters", action="store_true",
        help="Treat adapters given with -a/-A etc. as pairs. Either both "
            "or none are removed from each read pair.")
    # Setting the default for pair_filter to None allows us to find out whether
    # the option was used at all.
    group.add_argument("--pair-filter", default=None, choices=("any", "both", "first"),
        help="Which of the reads in a paired-end read have to match the "
            "filtering criterion in order for the pair to be filtered. "
            "Default: any")
    group.add_argument("--interleaved", action='store_true', default=False,
        help="Read and/or write interleaved paired-end reads.")
    group.add_argument("--untrimmed-paired-output", metavar="FILE",
        help="Write second read in a pair to this FILE when no adapter "
            "was found. Use with --untrimmed-output. Default: output "
            "to same file as trimmed reads")
    group.add_argument("--too-short-paired-output", metavar="FILE", default=None,
        help="Write second read in a pair to this file if pair is too short.")
    group.add_argument("--too-long-paired-output", metavar="FILE", default=None,
        help="Write second read in a pair to this file if pair is too long.")

    # We could have two positional arguments here, with the second one optional, but
    # we want custom, more helpful error messages.
parser.add_argument("inputs", nargs='*', help=SUPPRESS) return parser # fmt: on def parse_cutoffs(s: str) -> Tuple[int, int]: """Parse a string INT[,INT] into a pair of integers >>> parse_cutoffs("5") (0, 5) >>> parse_cutoffs("6,7") (6, 7) """ try: cutoffs = [int(value) for value in s.split(",")] except ValueError as e: raise CommandLineError(f"Quality cutoff value not recognized: {e}") if len(cutoffs) == 1: cutoffs = [0, cutoffs[0]] elif len(cutoffs) != 2: raise CommandLineError( "Expected one value or two values separated by comma for " "the quality cutoff" ) return (cutoffs[0], cutoffs[1]) def parse_lengths(s: str) -> Tuple[Optional[int], ...]: """Parse [INT][:[INT]] into a pair of integers. If a value is omitted, use None >>> parse_lengths('25') (25,) >>> parse_lengths('17:25') (17, 25) >>> parse_lengths('25:') (25, None) >>> parse_lengths(':25') (None, 25) """ fields = s.split(":") if len(fields) not in (1, 2): raise CommandLineError("Only at most one colon is allowed") try: values = tuple(int(f) if f != "" else None for f in fields) except ValueError as e: raise CommandLineError(f"Value not recognized: {e}") if len(values) == 2 and values[0] is None and values[1] is None: raise CommandLineError( f"Cannot parse '{s}': At least one length needs to be given" ) return tuple(values) def complain_about_duplicate_paths(paths: List[str]): if sys.platform == "win32" and sys.version_info < (3, 8): # Bug in handling of NUL return seen = set() for path in paths: if path is None: continue p = Path(path) if p.exists() and not p.is_file(): # assumed to be FIFO, /dev/null etc. continue if path in seen: raise CommandLineError( f"Path {path} specified more than once as an output file. " f"This is not supported at the moment." ) seen.add(path) def determine_demultiplex_mode( output: Optional[str], paired_output: Optional[str] ) -> Union[str, bool]: """Return one of "normal", "combinatorial" or False""" demultiplex = output is not None and "{name}" in output if paired_output is not None and (demultiplex != ("{name}" in paired_output)): raise CommandLineError( 'When demultiplexing paired-end data, "{name}" must appear in ' "both output file names (-o and -p)" ) demultiplex_combinatorial = ( output is not None and paired_output is not None and "{name1}" in output and "{name2}" in output and "{name1}" in paired_output and "{name2}" in paired_output ) if demultiplex and demultiplex_combinatorial: raise CommandLineError("You cannot combine {name} with {name1} and {name2}") if demultiplex: return "normal" elif demultiplex_combinatorial: return "combinatorial" else: return False def determine_paired(args) -> bool: """ Determine whether we should work in paired-end mode. """ # Any of these options enable paired-end mode return bool( args.paired_output or args.interleaved or args.adapters2 or args.cut2 or args.pair_filter or args.untrimmed_paired_output or args.too_short_paired_output or args.too_long_paired_output or args.quality_cutoff2 ) def make_input_paths( inputs: Sequence[str], paired: bool, interleaved: bool ) -> InputPaths: """ Do some other error checking of the input file names and return InputPaths. """ if len(inputs) == 0: raise CommandLineError( "You did not provide any input file names. Please give me something to do!" ) elif len(inputs) > 2: raise CommandLineError( f"You provided {len(inputs)} input file names, but either one or two are expected. 
" + "The file names were:\n - " + "\n - ".join(f"'{p}'" for p in inputs) + "\nHint: If your path contains spaces, you need to enclose it in quotes" ) input_filename = inputs[0] if paired and not interleaved: # Two file names required if len(inputs) == 1: raise CommandLineError( "You used an option that enables paired-end mode (such as -p, -A, -G, -B, -U), but " "only provided one input file. Pleise either provide two input files or use " "use --interleaved as appropriate." ) else: input_paired_filename = inputs[1] # type: Optional[str] else: if len(inputs) == 2: raise CommandLineError( "It appears you want to trim paired-end data because you provided two input files, " "but then you also need to provide two output files (with -o and -p) or use the " "--interleaved option." ) input_paired_filename = None if input_paired_filename: return InputPaths( input_filename, input_paired_filename, interleaved=interleaved ) else: return InputPaths(input_filename, interleaved=interleaved) def check_arguments(args, paired: bool) -> None: if not paired: if args.untrimmed_paired_output: raise CommandLineError( "Option --untrimmed-paired-output can only be used when " "trimming paired-end reads." ) if args.pair_adapters: raise CommandLineError( "Option --pair-adapters can only be used when trimming " "paired-end reads" ) if paired and not args.interleaved: if not args.paired_output: raise CommandLineError( "When a paired-end trimming option such as -A/-G/-B/-U, " "is used, a second output file needs to be specified via -p (--paired-output)." ) if not args.output: raise CommandLineError( "When you use -p or --paired-output, you must also " "use the -o option." ) for out, paired_out, argname in [ (args.untrimmed_output, args.untrimmed_paired_output, "untrimmed"), (args.too_short_output, args.too_short_paired_output, "too-short"), (args.too_long_output, args.too_long_paired_output, "too-long"), ]: if bool(out) != bool(paired_out): raise CommandLineError( "When trimming paired-end data, you must use either none or both of the" " --{name}-output/--{name}-paired-output options.".format( name=argname ) ) if args.overlap < 1: raise CommandLineError("The overlap must be at least 1.") if not (0 <= args.gc_content <= 100): raise CommandLineError( "GC content must be given as percentage between 0 and 100" ) if args.pair_adapters and args.times != 1: raise CommandLineError("--pair-adapters cannot be used with --times") def make_pipeline_from_args( # noqa: C901 args, input_file_format, outfiles, paired, adapters, adapters2 ): """ Set up a processing pipeline from parsed command-line arguments. If there are any problems parsing the arguments, raise a CommandLineError. 
Return an instance of Pipeline (SingleEndPipeline or PairedEndPipeline) """ action = None if args.action == "none" else args.action pair_filter_mode = None if paired: pair_filter_mode = "any" if args.pair_filter is None else args.pair_filter def make_filter( predicate1, predicate2, path1, path2, pair_filter_mode=pair_filter_mode ): record_writer = None if path1 or path2: paths = [path1, path2] if paired else [path1] if paired and path2 is None: interleaved = True paths = paths[:1] else: interleaved = False record_writer = outfiles.open_record_writer(*paths, interleaved=interleaved) if paired: step = PairedEndFilter( predicate1, predicate2, record_writer, pair_filter_mode=pair_filter_mode ) else: step = SingleEndFilter(predicate1, record_writer) return step adapter_names: List[Optional[str]] = [a.name for a in adapters] adapter_names2: List[Optional[str]] = [a.name for a in adapters2] steps = [] for step_class, path in ( (RestFileWriter, args.rest_file), (InfoFileWriter, args.info_file), (WildcardFileWriter, args.wildcard_file), ): if path is None: continue step: Any = step_class(outfiles.open_text(path)) if paired: step = PairedSingleEndStep(step) steps.append(step) # Add filtering steps for length, path1, path2, predicate_class in [ ( args.minimum_length, args.too_short_output, args.too_short_paired_output, TooShort, ), ( args.maximum_length, args.too_long_output, args.too_long_paired_output, TooLong, ), ]: if length is None: if path1 or path2: if predicate_class is TooShort: raise CommandLineError( "When --too-short-output or --too-short-paired-output are used, " "a minimum length must be provided with -m/--minimum-length" ) if predicate_class is TooLong: raise CommandLineError( "When --too-long-output or --too-long-paired-output are used, " "a maximum length must be provided with -M/--maximum-length" ) continue if not paired and path2: raise CommandLineError( "--too-short/long-paired-output cannot be used with single-end data" ) lengths = parse_lengths(length) if not paired and len(lengths) == 2: raise CommandLineError( "Two minimum or maximum lengths given for single-end data" ) if paired and len(lengths) == 1: lengths = (lengths[0], lengths[0]) predicate1 = predicate_class(lengths[0]) if lengths[0] is not None else None if len(lengths) == 2 and lengths[1] is not None: predicate2 = predicate_class(lengths[1]) else: predicate2 = None steps.append(make_filter(predicate1, predicate2, path1, path2)) if args.max_n is not None: predicate = TooManyN(args.max_n) if paired: step = PairedEndFilter( predicate, predicate, pair_filter_mode=pair_filter_mode ) else: step = SingleEndFilter(predicate) steps.append(step) if args.max_expected_errors is not None: if not input_file_format.has_qualities(): logger.warning( "Ignoring option --max-ee because input does not provide quality values" ) else: predicate = TooManyExpectedErrors(args.max_expected_errors) if paired: step = PairedEndFilter( predicate, predicate, pair_filter_mode=pair_filter_mode ) else: step = SingleEndFilter(predicate) steps.append(step) if args.max_average_error_rate is not None: if not input_file_format.has_qualities(): logger.warning( "Ignoring option --max-er because input does not contain quality values" ) else: predicate = TooHighAverageErrorRate(args.max_average_error_rate) if paired: step = PairedEndFilter( predicate, predicate, pair_filter_mode=pair_filter_mode ) else: step = SingleEndFilter(predicate) steps.append(step) if args.discard_casava: predicate = CasavaFiltered() if paired: step = PairedEndFilter( predicate, 
predicate, pair_filter_mode=pair_filter_mode ) else: step = SingleEndFilter(predicate) steps.append(step) # Add the last step that writes the records that made it through the pipeline # to the final output(s) if ( int(args.discard_trimmed) + int(args.discard_untrimmed) + int( args.untrimmed_output is not None or args.untrimmed_paired_output is not None ) > 1 ): raise CommandLineError( "Only one of the --discard-trimmed, --discard-untrimmed " "and --untrimmed-output options can be used at the same time." ) demultiplex_mode = determine_demultiplex_mode(args.output, args.paired_output) if demultiplex_mode and args.discard_trimmed: raise CommandLineError("Do not use --discard-trimmed when demultiplexing.") if demultiplex_mode == "combinatorial" and args.pair_adapters: raise CommandLineError( "With --pair-adapters, you can only use {name} in your output file name template, " "not {name1} and {name2} (no combinatorial demultiplexing)." ) if demultiplex_mode == "normal": if paired: step = PairedDemultiplexer( adapter_names, template1=args.output, template2=args.paired_output, untrimmed_output=args.untrimmed_output, untrimmed_paired_output=args.untrimmed_paired_output, discard_untrimmed=args.discard_untrimmed, outfiles=outfiles, ) else: step = Demultiplexer( adapter_names, template=args.output, untrimmed_output=args.untrimmed_output, discard_untrimmed=args.discard_untrimmed, outfiles=outfiles, ) steps.append(step) elif demultiplex_mode == "combinatorial": assert "{name1}" in args.output and "{name2}" in args.output assert "{name1}" in args.paired_output and "{name2}" in args.paired_output if args.untrimmed_output or args.untrimmed_paired_output: raise CommandLineError( "Combinatorial demultiplexing (with {name1} and {name2})" " cannot be combined with --untrimmed-output or --untrimmed-paired-output" ) step = CombinatorialDemultiplexer( adapter_names, adapter_names2, template1=args.output, template2=args.paired_output, discard_untrimmed=args.discard_untrimmed, outfiles=outfiles, ) steps.append(step) else: # When adapters are being trimmed only in R1 or R2, override the pair filter mode # as using the default of 'any' would regard all read pairs as untrimmed. override_pair_filter_mode = ( paired and (not adapters2 or not adapters) and ( args.discard_untrimmed or args.untrimmed_output or args.untrimmed_paired_output ) ) # Set up the remaining filters to deal with --discard-trimmed, # --discard-untrimmed and --untrimmed-output. These options # are mutually exclusive to help prevent brain damage. 
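        # Summary of the mutually exclusive cases handled below:
        #   --discard-trimmed           -> drop read (pairs) for which IsTrimmed() holds
        #   --discard-untrimmed         -> drop read (pairs) for which IsUntrimmed() holds
        #   --untrimmed-(paired-)output -> redirect untrimmed reads to separate file(s)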
if args.discard_trimmed: predicate = IsTrimmed() if paired: step = PairedEndFilter( predicate, predicate, pair_filter_mode=pair_filter_mode ) else: step = SingleEndFilter(predicate) steps.append(step) elif args.discard_untrimmed: predicate = IsUntrimmed() if paired: step = PairedEndFilter( predicate, predicate, pair_filter_mode="both" if override_pair_filter_mode else pair_filter_mode, ) else: step = SingleEndFilter(predicate) steps.append(step) elif args.untrimmed_output or args.untrimmed_paired_output: predicate1 = IsUntrimmed() predicate2 = IsUntrimmed() steps.append( make_filter( predicate1, predicate2 if paired else None, args.untrimmed_output, args.untrimmed_paired_output, pair_filter_mode="both" if override_pair_filter_mode else pair_filter_mode, ) ) if paired: paths = [args.output, args.paired_output] if args.paired_output is None: interleaved = True paths = paths[:1] else: interleaved = False steps.append( PairedEndSink( outfiles.open_record_writer(*paths, interleaved=interleaved) ) ) else: if args.output is None: out = outfiles.open_stdout_record_writer( interleaved=paired and args.interleaved, force_fasta=args.fasta, ) else: out = outfiles.open_record_writer(args.output, force_fasta=args.fasta) steps.append(SingleEndSink(out)) logger.debug("Pipeline steps:") for step in steps: logger.debug("- %s", step) modifiers = [] modifiers.extend(make_unconditional_cutters(args.cut, args.cut2, paired)) if args.nextseq_trim is not None: trimmer = NextseqQualityTrimmer(args.nextseq_trim, args.quality_base) if paired: modifiers.append((trimmer, copy.copy(trimmer))) else: modifiers.append(trimmer) modifiers.extend( make_quality_trimmers( args.quality_cutoff, args.quality_cutoff2, args.quality_base, paired, ) ) modifiers.extend( make_adapter_cutter( adapters, adapters2, paired, args.pair_adapters, action, args.times, args.reverse_complement, not args.rename, # no "rc" suffix if --rename is used args.index, ) ) if args.poly_a: if paired: modifiers.append((PolyATrimmer(), PolyATrimmer(revcomp=True))) else: modifiers.append(PolyATrimmer()) for modifier in modifiers_applying_to_both_ends_if_paired(args): if paired: modifiers.append((modifier, copy.copy(modifier))) else: modifiers.append(modifier) if args.rename and (args.prefix or args.suffix): raise CommandLineError( "Option --rename cannot be combined with --prefix (-x) or --suffix (-y)" ) if args.rename and args.rename != "{header}": try: renamer = PairedEndRenamer(args.rename) if paired else Renamer(args.rename) modifiers.append(renamer) except InvalidTemplate as e: raise CommandLineError(e) # Create the processing pipeline if paired: pipeline = PairedEndPipeline(modifiers, steps) # type: Any else: pipeline = SingleEndPipeline(modifiers, steps) return pipeline def adapters_from_args(args) -> Tuple[List[Adapter], List[Adapter]]: search_parameters = dict( max_errors=args.error_rate, min_overlap=args.overlap, read_wildcards=args.match_read_wildcards, adapter_wildcards=args.match_adapter_wildcards, indels=args.indels, ) try: adapters = make_adapters_from_specifications(args.adapters, search_parameters) adapters2 = make_adapters_from_specifications(args.adapters2, search_parameters) except ( KeyError, ValueError, InvalidCharacter, ) as e: raise CommandLineError(e.args[0]) warn_duplicate_adapters(adapters) warn_duplicate_adapters(adapters2) if args.debug > 1: for adapter in adapters + adapters2: adapter.enable_debug() return adapters, adapters2 def make_unconditional_cutters(cut1: List[int], cut2: List[int], paired: bool): for i, cut_arg in 
enumerate([cut1, cut2]): # cut_arg is a list if not cut_arg: continue if len(cut_arg) > 2: raise CommandLineError("You cannot remove bases from more than two ends.") if len(cut_arg) == 2 and cut_arg[0] * cut_arg[1] > 0: raise CommandLineError("You cannot remove bases from the same end twice.") for c in cut_arg: if c == 0: continue if i == 0: # R1 if paired: yield (UnconditionalCutter(c), None) else: yield UnconditionalCutter(c) else: # R2 assert paired yield (None, UnconditionalCutter(c)) def make_quality_trimmers( cutoff1: Optional[str], cutoff2: Optional[str], quality_base: int, paired: bool, ): qtrimmers = [ QualityTrimmer(*parse_cutoffs(cutoff), quality_base) if cutoff is not None and cutoff != "0" else None for cutoff in (cutoff1, cutoff2) ] if paired: if cutoff1 is not None and cutoff2 is None: qtrimmers[1] = copy.copy(qtrimmers[0]) if qtrimmers[0] is not None or qtrimmers[1] is not None: yield tuple(qtrimmers) elif qtrimmers[0] is not None: assert not paired yield qtrimmers[0] def make_adapter_cutter( adapters, adapters2, paired: bool, pair_adapters: bool, action: Optional[str], times: int, reverse_complement: bool, add_rc_suffix: bool, allow_index: bool, ): if pair_adapters: if reverse_complement: raise CommandLineError("Cannot use --revcomp with --pair-adapters") try: cutter = PairedAdapterCutter(adapters, adapters2, action) except PairedAdapterCutterError as e: raise CommandLineError("--pair-adapters: " + str(e)) yield cutter else: adapter_cutter, adapter_cutter2 = None, None try: if adapters: adapter_cutter = AdapterCutter(adapters, times, action, allow_index) if adapters2: adapter_cutter2 = AdapterCutter(adapters2, times, action, allow_index) except ValueError as e: raise CommandLineError(e) if paired: if adapter_cutter or adapter_cutter2: if reverse_complement: yield PairedReverseComplementer( adapter_cutter, adapter_cutter2, rc_suffix=" rc" if add_rc_suffix else None, ) else: yield (adapter_cutter, adapter_cutter2) elif adapter_cutter: if reverse_complement: yield ReverseComplementer( adapter_cutter, rc_suffix=" rc" if add_rc_suffix else None, ) else: yield adapter_cutter def modifiers_applying_to_both_ends_if_paired(args) -> Iterator[SingleEndModifier]: if args.length is not None: yield Shortener(args.length) if args.trim_n: yield NEndTrimmer() if args.length_tag: yield LengthTagModifier(args.length_tag) for suffix in args.strip_suffix: yield SuffixRemover(suffix) if args.prefix or args.suffix: yield PrefixSuffixAdder(args.prefix, args.suffix) if args.zero_cap: yield ZeroCapper(quality_base=args.quality_base) def log_header(cmdlineargs): """Print the "This is cutadapt ..." header""" implementation = platform.python_implementation() opt = " (" + implementation + ")" if implementation != "CPython" else "" logger.info( "This is cutadapt %s with Python %s%s", __version__, platform.python_version(), opt, ) logger.info("Command line parameters: %s", " ".join(cmdlineargs)) def main_cli(): # pragma: no cover """Entry point for command-line script""" multiprocessing.freeze_support() main(sys.argv[1:]) return 0 def main(cmdlineargs) -> Statistics: """ Set up a processing pipeline from the command-line arguments, run it and return a Statistics object. 
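
    Programmatic use (sketch; the file names are placeholders):

        stats = main(["-a", "ACGTACGT", "-o", "out.fastq", "in.fastq"])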
""" start_time = time.time() parser = get_argument_parser() if not cmdlineargs: parser.print_usage() sys.exit(2) args, leftover_args = parser.parse_known_args(args=cmdlineargs) # Setup logging only if there are not already any handlers (can happen when # this function is being called externally such as from unit tests) if not logging.root.handlers: setup_logging( logger, log_to_stderr=is_any_output_stdout(args), quiet=args.quiet, minimal=args.report == "minimal", debug=args.debug, ) log_header(cmdlineargs) profiler = setup_profiler_if_requested(args.profile) log_system_info() if args.quiet and args.report: parser.error("Options --quiet and --report cannot be used at the same time") if leftover_args: warn_if_en_dashes(cmdlineargs) parser.error("unrecognized arguments: " + " ".join(leftover_args)) if args.cores < 0: parser.error("Value for --cores cannot be negative") cores = available_cpu_count() if args.cores == 0 else args.cores file_opener = FileOpener( compression_level=args.compression_level, threads=estimate_compression_threads(cores), ) if sys.stderr.isatty() and not args.quiet and not args.debug: progress = Progress() else: progress = DummyProgress() paired = determine_paired(args) try: is_interleaved_input = args.interleaved and len(args.inputs) == 1 input_paths = make_input_paths(args.inputs, paired, is_interleaved_input) check_arguments(args, paired) adapters, adapters2 = adapters_from_args(args) log_adapters(adapters, adapters2 if paired else None) complain_about_duplicate_paths( [ args.rest_file, args.info_file, args.wildcard_file, args.too_short_output, args.too_short_paired_output, args.too_long_output, args.too_long_paired_output, args.untrimmed_output, args.untrimmed_paired_output, args.output, args.paired_output, ] ) with make_runner(input_paths, cores, args.buffer_size) as runner: outfiles = OutputFiles( proxied=cores > 1, qualities=runner.input_file_format().has_qualities(), file_opener=file_opener, interleaved=args.interleaved, ) pipeline = make_pipeline_from_args( args, runner.input_file_format(), outfiles, paired, adapters, adapters2, ) logger.info( "Processing %s reads on %d core%s ...", {False: "single-end", True: "paired-end"}[pipeline.paired], cores, "s" if cores > 1 else "", ) stats = runner.run(pipeline, progress, outfiles) except KeyboardInterrupt: if args.debug: raise else: print("Interrupted", file=sys.stderr) sys.exit(130) except BrokenPipeError: sys.exit(1) except ( OSError, EOFError, HasNoQualities, dnaio.UnknownFileFormat, dnaio.FileFormatError, CommandLineError, ) as e: logger.debug("Command line error. Traceback:", exc_info=True) logger.error("%s", e) exit_code = 2 if isinstance(e, CommandLineError) else 1 sys.exit(exit_code) finally: # TODO ... 
try: outfiles.close() except UnboundLocalError: pass elapsed = time.time() - start_time if args.report == "minimal": report = minimal_report else: report = full_report logger.log(REPORT, "%s", report(stats, elapsed, args.gc_content / 100.0)) if args.json is not None: with open(args.json, "w") as f: json_dict = json_report( stats=stats, cmdlineargs=cmdlineargs, path1=input_paths.paths[0], path2=input_paths.paths[1] if len(input_paths.paths) > 1 else None, cores=cores, paired=paired, gc_content=args.gc_content / 100.0, ) f.write(json_dumps(json_dict)) f.write("\n") if profiler is not None: import pstats profiler.disable() pstats.Stats(profiler).sort_stats("time").print_stats(20) return stats def log_system_info(): logger.debug("Python executable: %s", sys.executable) logger.debug("dnaio version: %s", dnaio.__version__) logger.debug("xopen version: %s", xopen.__version__) def log_adapters(adapters, adapters2): paired = adapters2 is not None logger.debug("R1 adapters (%d):" if paired else "Adapters (%d):", len(adapters)) for a in itertools.islice(adapters, 20): logger.debug("- %s", a) if len(adapters) > 20: logger.debug("- (%d more)", len(adapters) - 20) if paired: logger.debug("R2 adapters (%d):", len(adapters2)) for a in itertools.islice(adapters2, 20): logger.debug("- %s", a) if len(adapters2) > 20: logger.debug("- (%d more)", len(adapters2) - 20) def setup_profiler_if_requested(requested): if requested: import cProfile profiler = cProfile.Profile() profiler.enable() else: profiler = None return profiler def warn_if_en_dashes(args): for arg in args: if arg.startswith("–"): logger.warning( "The first character in argument '%s' is '–' (an en-dash, Unicode U+2013)" " and will therefore be interpreted as a file name. If you wanted to" " provide an option, use a regular hyphen '-'.", arg, ) def estimate_compression_threads(cores: int) -> Optional[int]: return max(0, min(cores - 1, 4)) def is_any_output_stdout(args): return any( [ args.output is None, args.output == "-", args.paired_output == "-", args.untrimmed_output == "-", args.untrimmed_paired_output == "-", args.too_short_output == "-", args.too_short_paired_output == "-", args.too_long_output == "-", args.too_long_paired_output == "-", args.rest_file == "-", args.info_file == "-", args.wildcard_file == "-", ] ) def json_report( stats: Statistics, cmdlineargs: List[str], path1: str, path2: Optional[str], cores: int, paired: bool, gc_content: float, ) -> Dict: d = { "tag": "Cutadapt report", "schema_version": OneLine([0, 3]), "cutadapt_version": __version__, "python_version": platform.python_version(), "command_line_arguments": cmdlineargs, "cores": cores, "input": { "path1": path1, "path2": path2, "paired": paired, }, } d.update(stats.as_json(gc_content, one_line=True)) return d if __name__ == "__main__": # pragma: no cover sys.exit(main_cli()) cutadapt-4.7/src/cutadapt/expected_errors.h000066400000000000000000000146541457457704700211710ustar00rootroot00000000000000#include #ifdef __SSE2__ #include "emmintrin.h" #endif static const double SCORE_TO_ERROR_RATE[94] = { 1.0L, // 0 0.7943282347242815L, // 1 0.6309573444801932L, // 2 0.5011872336272722L, // 3 0.3981071705534972L, // 4 0.31622776601683794L, // 5 0.251188643150958L, // 6 0.19952623149688797L, // 7 0.15848931924611134L, // 8 0.12589254117941673L, // 9 0.1L, // 10 0.07943282347242814L, // 11 0.06309573444801933L, // 12 0.05011872336272722L, // 13 0.039810717055349734L, // 14 0.03162277660168379L, // 15 0.025118864315095794L, // 16 0.0199526231496888L, // 17 0.015848931924611134L, // 
18 0.012589254117941675L, // 19 0.01L, // 20 0.007943282347242814L, // 21 0.00630957344480193L, // 22 0.005011872336272725L, // 23 0.003981071705534973L, // 24 0.0031622776601683794L, // 25 0.0025118864315095794L, // 26 0.001995262314968879L, // 27 0.001584893192461114L, // 28 0.0012589254117941675L, // 29 0.001L, // 30 0.0007943282347242813L, // 31 0.000630957344480193L, // 32 0.0005011872336272725L, // 33 0.00039810717055349735L, // 34 0.00031622776601683794L, // 35 0.00025118864315095795L, // 36 0.00019952623149688788L, // 37 0.00015848931924611142L, // 38 0.00012589254117941674L, // 39 0.0001L, // 40 7.943282347242822E-05L, // 41 6.309573444801929E-05L, // 42 5.011872336272725E-05L, // 43 3.9810717055349695E-05L, // 44 3.1622776601683795E-05L, // 45 2.5118864315095822E-05L, // 46 1.9952623149688786E-05L, // 47 1.584893192461114E-05L, // 48 1.2589254117941661E-05L, // 49 1E-05L, // 50 7.943282347242822E-06L, // 51 6.30957344480193E-06L, // 52 5.011872336272725E-06L, // 53 3.981071705534969E-06L, // 54 3.162277660168379E-06L, // 55 2.5118864315095823E-06L, // 56 1.9952623149688787E-06L, // 57 1.584893192461114E-06L, // 58 1.2589254117941661E-06L, // 59 1E-06L, // 60 7.943282347242822E-07L, // 61 6.30957344480193E-07L, // 62 5.011872336272725E-07L, // 63 3.981071705534969E-07L, // 64 3.162277660168379E-07L, // 65 2.5118864315095823E-07L, // 66 1.9952623149688787E-07L, // 67 1.584893192461114E-07L, // 68 1.2589254117941662E-07L, // 69 1E-07L, // 70 7.943282347242822E-08L, // 71 6.30957344480193E-08L, // 72 5.011872336272725E-08L, // 73 3.981071705534969E-08L, // 74 3.162277660168379E-08L, // 75 2.511886431509582E-08L, // 76 1.9952623149688786E-08L, // 77 1.5848931924611143E-08L, // 78 1.2589254117941661E-08L, // 79 1E-08L, // 80 7.943282347242822E-09L, // 81 6.309573444801943E-09L, // 82 5.011872336272715E-09L, // 83 3.981071705534969E-09L, // 84 3.1622776601683795E-09L, // 85 2.511886431509582E-09L, // 86 1.9952623149688828E-09L, // 87 1.584893192461111E-09L, // 88 1.2589254117941663E-09L, // 89 1E-09L, // 90 7.943282347242822E-10L, // 91 6.309573444801942E-10L, // 92 5.011872336272714E-10L, // 93 }; static inline double expected_errors_from_phreds(const uint8_t *phreds, size_t phreds_length, uint8_t base) { const uint8_t *end_ptr = phreds + phreds_length; const uint8_t *cursor = phreds; double expected_errors = 0.0; uint8_t max_phred = 126 - base; #ifdef __SSE2__ const uint8_t *vec_end_ptr = end_ptr - sizeof(__m128i); __m128d accumulator = _mm_set1_pd(0.0); while (cursor < vec_end_ptr) { __m128i phred_array = _mm_loadu_si128((__m128i *)cursor); __m128i illegal_phreds = _mm_cmpgt_epi8(phred_array, _mm_set1_epi8(126)); illegal_phreds = _mm_or_si128( illegal_phreds, _mm_cmplt_epi8(phred_array, _mm_set1_epi8(base))); if (_mm_movemask_epi8(illegal_phreds)) { return -1.0; } /* By explicitly setting multiple accumulators, the processor can perform out of order execution for increased speed See also: https://stackoverflow.com/a/36591776/16437839 */ __m128d accumulator1 = _mm_add_pd( _mm_set_pd( SCORE_TO_ERROR_RATE[cursor[0] - base], SCORE_TO_ERROR_RATE[cursor[1] - base] ), _mm_set_pd( SCORE_TO_ERROR_RATE[cursor[2] - base], SCORE_TO_ERROR_RATE[cursor[3] - base] ) ); __m128d accumulator2 = _mm_add_pd( _mm_set_pd( SCORE_TO_ERROR_RATE[cursor[4] - base], SCORE_TO_ERROR_RATE[cursor[5] - base] ), _mm_set_pd( SCORE_TO_ERROR_RATE[cursor[6] - base], SCORE_TO_ERROR_RATE[cursor[7] - base] ) ); __m128d accumulator3 = _mm_add_pd( _mm_set_pd( SCORE_TO_ERROR_RATE[cursor[8] - base], SCORE_TO_ERROR_RATE[cursor[9] 
- base] ), _mm_set_pd( SCORE_TO_ERROR_RATE[cursor[10] - base], SCORE_TO_ERROR_RATE[cursor[11] - base] ) ); __m128d accumulator4 = _mm_add_pd( _mm_set_pd( SCORE_TO_ERROR_RATE[cursor[12] - base], SCORE_TO_ERROR_RATE[cursor[13] - base] ), _mm_set_pd( SCORE_TO_ERROR_RATE[cursor[14] - base], SCORE_TO_ERROR_RATE[cursor[15] - base] ) ); accumulator = _mm_add_pd(accumulator, accumulator1); accumulator = _mm_add_pd(accumulator, accumulator2); accumulator = _mm_add_pd(accumulator, accumulator3); accumulator = _mm_add_pd(accumulator, accumulator4); cursor += sizeof(__m128i); } double double_store[2]; _mm_store_pd(double_store, accumulator); expected_errors = double_store[0] + double_store[1]; #endif while (cursor < end_ptr) { uint8_t phred = *cursor - base; if (phred > max_phred) { return -1.0; } expected_errors += SCORE_TO_ERROR_RATE[phred]; cursor += 1; } return expected_errors; } cutadapt-4.7/src/cutadapt/files.py000066400000000000000000000247571457457704700173040ustar00rootroot00000000000000import errno import io import sys from abc import ABC, abstractmethod from enum import Enum from typing import BinaryIO, Optional, Dict, List, TextIO, Any import dnaio from xopen import xopen from cutadapt.utils import logger try: import resource except ImportError: # Windows resource = None # type: ignore def xopen_rb_raise_limit(path: str): """ Open a (possibly compressed) file for reading in binary mode, trying to avoid the "Too many open files" problem using `open_raise_limit`. """ mode = "rb" f = open_raise_limit(xopen, path, mode, threads=0) logger.debug("Opening '%s', mode '%s' with xopen resulted in %s", path, mode, f) return f def open_raise_limit(func, *args, **kwargs): """ Run 'func' (which should be some kind of open() function) and return its result. If "Too many open files" occurs, increase limit and try again. """ try: f = func(*args, **kwargs) except OSError as e: if e.errno == errno.EMFILE: # Too many open files logger.debug("Too many open files, attempting to raise soft limit") raise_open_files_limit(8) f = func(*args, **kwargs) else: raise return f def raise_open_files_limit(n): if resource is None: return soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE) soft = min(soft + n, hard) resource.setrlimit(resource.RLIMIT_NOFILE, (soft, hard)) class FileOpener: def __init__(self, compression_level: int = 1, threads: Optional[int] = None): """ threads -- no. of external compression threads. 
0: write in-process None: min(cpu_count(), 4) """ self.compression_level = compression_level self.threads = threads def xopen(self, path, mode): threads = self.threads if "w" in mode else 0 f = open_raise_limit( xopen, path, mode, compresslevel=self.compression_level, threads=threads ) if "w" in mode: extra = f" (compression level {self.compression_level}, {threads} threads)" else: extra = "" logger.debug( "Opening '%s', mode '%s'%s with xopen resulted in %s", path, mode, extra, f ) return f def xopen_or_none(self, path, mode): """Return opened file or None if the path is None""" if path is None: return None return self.xopen(path, mode) def xopen_pair(self, path1: str, path2: Optional[str], mode): if path1 is None and path2 is not None: raise ValueError( "When giving paths for paired-end files, only providing the second" " file is not supported" ) file1 = self.xopen_or_none(path1, mode) file2 = self.xopen_or_none(path2, mode) return file1, file2 def dnaio_open(self, *args, **kwargs): kwargs["opener"] = self.xopen f = dnaio.open(*args, **kwargs) if not isinstance(args[0], io.BytesIO): logger.debug( "Opening %r, mode '%s' with dnaio resulted in %s", args[0], kwargs["mode"], f, ) return f def dnaio_open_raise_limit(self, *args, **kwargs): """ Open a FASTA/FASTQ file for writing. If it fails because the number of open files would be exceeded, try to raise the soft limit and re-try. """ return open_raise_limit(self.dnaio_open, *args, **kwargs) class InputFiles: def __init__( self, *files: BinaryIO, interleaved: bool = False, ): self._files = files self.interleaved = interleaved for f in self._files: assert f is not None def open(self): return dnaio.open(*self._files, interleaved=self.interleaved, mode="r") def close(self) -> None: for file in self._files: file.close() class InputPaths: def __init__(self, *paths: str, interleaved: bool = False): self.paths = paths self.interleaved = interleaved def open(self) -> InputFiles: files = [xopen_rb_raise_limit(path) for path in self.paths] return InputFiles(*files, interleaved=self.interleaved) class ProxyWriter(ABC): @abstractmethod def drain(self) -> List[bytes]: pass class ProxyTextFile(ProxyWriter): """ A file object for writing in text mode that is backed by a BytesIO object """ def __init__(self): self._buffer = io.BytesIO() self._file = io.TextIOWrapper(self._buffer) def write(self, text): self._file.write(text) def drain(self) -> List[bytes]: self._file.flush() chunk = self._buffer.getvalue() self._buffer.seek(0) self._buffer.truncate() return [chunk] def __getstate__(self): """TextIOWrapper cannot be pickled. Just don’t include our state.""" return True # ensure __setstate__ is called def __setstate__(self, state): self.__init__() class ProxyRecordWriter(ProxyWriter): """ A writer for FASTA, FASTQ records etc. 
that is backed by a BytesIO object """ def __init__(self, n_files: int, **kwargs): self._n_files = n_files self._kwargs = kwargs self._buffers = [io.BytesIO() for _ in range(n_files)] self._writer = open_raise_limit(dnaio.open, *self._buffers, mode="w", **kwargs) def write(self, *args, **kwargs): self._writer.write(*args, **kwargs) def drain(self) -> List[bytes]: chunks = [buf.getvalue() for buf in self._buffers] for buf in self._buffers: buf.seek(0) buf.truncate() return chunks def __getstate__(self): """Exclude the dnaio writer from the state (it cannot be pickled)""" return (self._n_files, self._kwargs) def __setstate__(self, state): n_files, kwargs = state self.__init__(n_files, **kwargs) class OutputFiles: def __init__( self, *, proxied: bool, qualities: bool, interleaved: bool, file_opener: Optional[FileOpener] = None, ): self._file_opener: FileOpener = ( file_opener if file_opener is not None else FileOpener() ) self._binary_files: List[BinaryIO] = [] self._binary_files_to_close: List[BinaryIO] = [] self._text_files: List[TextIO] = [] self._writers: List[Any] = [] self._proxy_files: List[ProxyWriter] = [] self._proxied = proxied self._to_close: List[BinaryIO] = [] self._qualities = qualities self._interleaved = interleaved def open_text(self, path): # TODO # - serial runner needs only text_file # - parallel runner needs binary_file and proxy_file # split into SerialOutputFiles and ParallelOutputFiles? if self._proxied: binary_file = self._file_opener.xopen(path, "wb") self._binary_files.append(binary_file) self._binary_files_to_close.append(binary_file) proxy_file = ProxyTextFile() self._proxy_files.append(proxy_file) return proxy_file else: text_file = self._file_opener.xopen(path, "wt") self._text_files.append(text_file) return text_file def open_record_writer( self, *paths, interleaved: bool = False, force_fasta: bool = False ): kwargs: Dict[str, Any] = dict( qualities=self._qualities, interleaved=interleaved ) if len(paths) not in (1, 2): raise ValueError("Expected one or two paths") if interleaved and len(paths) != 1: raise ValueError("Cannot write to two files when interleaved is True") if len(paths) == 1 and paths[0] == "-" and force_fasta: kwargs["fileformat"] = "fasta" for path in paths: assert path is not None binary_files = [] for path in paths: binary_file = self._file_opener.xopen(path, "wb") binary_files.append(binary_file) self._binary_files.append(binary_file) self._binary_files_to_close.append(binary_file) if self._proxied: proxy_writer = ProxyRecordWriter(len(paths), **kwargs) self._proxy_files.append(proxy_writer) return proxy_writer else: writer = self._file_opener.dnaio_open(*binary_files, mode="w", **kwargs) self._writers.append(writer) return writer def open_stdout_record_writer( self, interleaved: bool = False, force_fasta: bool = False ): self._binary_files.append(sys.stdout.buffer) kwargs: Dict[str, Any] = dict( qualities=self._qualities, interleaved=interleaved ) if force_fasta: kwargs["fileformat"] = "fasta" if self._proxied: proxy_writer = ProxyRecordWriter(1, **kwargs) self._proxy_files.append(proxy_writer) return proxy_writer else: writer = self._file_opener.dnaio_open(sys.stdout.buffer, mode="w", **kwargs) self._writers.append(writer) return writer def binary_files(self) -> List[BinaryIO]: return self._binary_files[:] def proxy_files(self) -> List[ProxyWriter]: return self._proxy_files def close(self) -> None: """Close all output files that are not stdout""" if not self._proxied: for f in self._text_files: f.close() for f in self._writers: f.close() for bf in
self._binary_files_to_close: bf.close() class FileFormat(Enum): FASTA = 1 FASTQ = 2 BAM = 3 def has_qualities(self) -> bool: return self is FileFormat.FASTQ or self is FileFormat.BAM # TODO BAM? # TODO copied and adjusted from dnaio; upstream this def detect_file_format(file: BinaryIO) -> Optional[FileFormat]: if file.seekable(): original_position = file.tell() magic = file.read(4) file.seek(original_position) else: # We cannot always use peek() because BytesIO objects do not support it magic = file.peek(4)[0:4] # type: ignore if magic.startswith(b"@") or magic == b"": # Pretend FASTQ for empty input return FileFormat.FASTQ elif magic.startswith(b">") or magic.startswith(b"#"): # Some FASTA variants allow comments return FileFormat.FASTA elif magic == b"BAM\1": return FileFormat.BAM return None cutadapt-4.7/src/cutadapt/info.pyi000066400000000000000000000004731457457704700172730ustar00rootroot00000000000000from dnaio import SequenceRecord from cutadapt.adapters import Match class ModificationInfo: matches: list[Match] = ... original_read: SequenceRecord cut_prefix: str cut_suffix: str is_rc: bool | None def __init__(self, read: SequenceRecord) -> None: ... def __repr__(self) -> str: ... cutadapt-4.7/src/cutadapt/info.pyx000066400000000000000000000017001457457704700173040ustar00rootroot00000000000000cdef class ModificationInfo: """ An object of this class is created for each read that passes through the pipeline. Any information (except the read itself) that needs to be passed from one modifier to one later in the pipeline or from one modifier to the filters is recorded here. """ cdef: public object matches public object original_read public object cut_prefix public object cut_suffix public object is_rc def __init__(self, read): self.matches = [] self.original_read = read self.cut_prefix = None self.cut_suffix = None self.is_rc = None def __repr__(self): return ( "ModificationInfo(" f"matches={self.matches!r}, " f"original_read={self.original_read}, " f"cut_prefix={self.cut_prefix}, " f"cut_suffix={self.cut_suffix}, " f"is_rc={self.is_rc})" ) cutadapt-4.7/src/cutadapt/json.py000066400000000000000000000034601457457704700171370ustar00rootroot00000000000000import json class OneLine: """Wrap any value in this class to print it on one line in the JSON file""" def __init__(self, value): self.value = value def dumps(obj, indent: int = 2, _level: int = 0) -> str: """ Encode an object hierarchy as JSON string. In addition to what json.dumps in the standard library provides, this function allows disabling indentation for selected parts of the hierarchy by marking lists or dicts with the "OneLine" class.
Arguments: obj: object to encode indent: indentation level >>> print(dumps({"a": [1, 2], "b": OneLine([3, 4]), "c": dict(x=5, y=6), "d": OneLine(dict(x=7, y=8))})) { "a": [ 1, 2 ], "b": [3, 4], "c": { "x": 5, "y": 6 }, "d": {"x": 7, "y": 8} } >>> print(dumps({"a": []})) { "a": [] } """ if isinstance(obj, (float, int, str, bool, OneLine)) or obj is None: if isinstance(obj, OneLine): obj = obj.value return json.dumps(obj) start = "\n" + (_level + 1) * indent * " " sep = "," + start end = "\n" + _level * indent * " " if isinstance(obj, (tuple, list)): if not obj: return "[]" return ( "[" + start + sep.join(dumps(elem, indent, _level + 1) for elem in obj) + end + "]" ) elif isinstance(obj, dict): if not obj: return "{}" return ( "{" + start + sep.join( json.dumps(k) + ": " + dumps(v, indent, _level + 1) for k, v in obj.items() ) + end + "}" ) else: raise ValueError(f"cannot serialize type {obj.__class__.__name__}") cutadapt-4.7/src/cutadapt/kmer_heuristic.py000066400000000000000000000220311457457704700211760ustar00rootroot00000000000000import io from typing import List, Optional, Set, Tuple from collections import defaultdict def kmer_chunks(sequence: str, chunks: int) -> Set[str]: """ Partition a sequence in almost equal sized chunks. Returns the shortest possibility. AABCABCABC, 3 returns {"AABC", "ABC"} """ chunk_size = len(sequence) // (chunks) remainder = len(sequence) % (chunks) chunk_sizes: List[int] = remainder * [chunk_size + 1] + (chunks - remainder) * [ chunk_size ] offset = 0 chunk_set = set() for size in chunk_sizes: chunk_set.add(sequence[offset : offset + size]) offset += size return chunk_set # A SearchSet is a start and stop combined with a set of strings to search # for at that position SearchSet = Tuple[int, Optional[int], Set[str]] def minimize_kmer_search_list( kmer_search_list: List[Tuple[str, int, Optional[int]]] ) -> List[Tuple[str, int, Optional[int]]]: kmer_and_offsets_dict = defaultdict(list) for kmer, start, stop in kmer_search_list: # type: ignore kmer_and_offsets_dict[kmer].append((start, stop)) kmers_and_positions: List[Tuple[str, int, Optional[int]]] = [] for kmer, positions in kmer_and_offsets_dict.items(): if len(positions) == 1: start, stop = positions[0] kmers_and_positions.append((kmer, start, stop)) continue if (0, None) in positions: kmers_and_positions.append((kmer, 0, None)) continue front_searches = [(start, stop) for start, stop in positions if start == 0] back_searches = [(start, stop) for start, stop in positions if stop is None] middle_searches = [ (start, stop) for start, stop in positions if start != 0 and stop is not None ] if middle_searches: raise NotImplementedError( "Situations with searches starting in the middle have not been considered." ) if front_searches: # (0, None) condition is already caught, so stop is never None. kmers_and_positions.append( (kmer, 0, max(stop for start, stop in front_searches)) # type: ignore ) if back_searches: kmers_and_positions.append( (kmer, min(start for start, stop in back_searches), None) ) return kmers_and_positions def remove_redundant_kmers( search_sets: List[SearchSet], ) -> List[Tuple[int, Optional[int], List[str]]]: """ This removes kmers that are searched in multiple search sets and makes sure they are only searched in the larger search set. This reduces the amount of searched patterns and therefore the number of false positives. 
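For example, a minimal illustration (an added doctest; a single kmer searched in both a limited and an unlimited range collapses into one unlimited search):

>>> remove_redundant_kmers([(0, 10, {"ABC"}), (0, None, {"ABC"})])
[(0, None, ['ABC'])]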
""" kmer_search_list = [] for start, stop, kmer_set in search_sets: for kmer in kmer_set: kmer_search_list.append((kmer, start, stop)) minimized_search_list = minimize_kmer_search_list(kmer_search_list) result_dict = defaultdict(list) for kmer, start, stop in minimized_search_list: result_dict[(start, stop)].append(kmer) return [(start, stop, kmers) for (start, stop), kmers in result_dict.items()] def create_back_overlap_searchsets( adapter: str, min_overlap: int, error_rate: float ) -> List[SearchSet]: adapter_length = len(adapter) error_lengths = [] max_error = 0 search_sets: List[SearchSet] = [] for i in range(adapter_length + 1): if int(i * error_rate) > max_error: error_lengths.append((max_error, i - 1)) max_error += 1 error_lengths.append((max_error, adapter_length)) minimum_length = min_overlap for max_errors, length in error_lengths: if minimum_length > length: continue if max_errors == 0: # Add a couple of directly matching 1, 2, 3 and 4-mer searches. # The probability of a false positive is just to high when for # example a 3-mer is evaluated in more than one position. min_overlap_kmer_length = 5 if minimum_length < min_overlap_kmer_length: for i in range(minimum_length, min_overlap_kmer_length): search_set = (-i, None, {adapter[:i]}) search_sets.append(search_set) minimum_length = min_overlap_kmer_length kmer_sets = kmer_chunks(adapter[:minimum_length], max_errors + 1) search_sets.append((-length, None, kmer_sets)) minimum_length = length + 1 return search_sets def create_positions_and_kmers( adapter: str, min_overlap: int, error_rate: float, back_adapter: bool, front_adapter: bool, internal: bool = True, ) -> List[Tuple[int, Optional[int], List[str]]]: """ Create a set of position and words combinations where at least one of the words needs to occur at its specified position. If not an alignment algorithm will not be able to find a solution. This can be checked very quickly and allows for skipping alignment in cases where the adapter would not align anyway. Example: looking for AAAAATTTTT with at most one error. This means either AAAAA or TTTTT (or both) must be present, otherwise alignment will not succeed. This function returns the positions and the accompanying words while also taking into account partial overlap for back and front adapters. """ max_errors = int(len(adapter) * error_rate) search_sets = [] if back_adapter: search_sets.extend( create_back_overlap_searchsets(adapter, min_overlap, error_rate) ) if front_adapter: # To create a front adapter the code is practically the same except # with some parameters set differently. Reversing the adapter, running # the back adapter code and reversing all the kmers and positions has # the same effect without needing to duplicate the code. 
reversed_back_search_sets = create_back_overlap_searchsets( adapter[::-1], min_overlap, error_rate ) front_search_sets = [] for start, stop, kmer_set in reversed_back_search_sets: new_kmer_set = {kmer[::-1] for kmer in kmer_set} front_search_sets.append((0, -start, new_kmer_set)) search_sets.extend(front_search_sets) if internal: kmer_sets = kmer_chunks(adapter, max_errors + 1) search_sets.append((0, None, kmer_sets)) return remove_redundant_kmers(search_sets) def kmer_probability_analysis( kmers_and_offsets: List[Tuple[int, Optional[int], List[str]]], default_length: int = 150, ) -> str: # pragma: no cover # only for debugging use """ Returns a tab separated table with for each kmer a start, stop, the number of considered sites and the hit chance on a randomly generated sequence containing only A, C, G and T. Assumes kmers only consist of A, C, G and T too. Useful for investigating whether the create_positions_and_kmers function creates a useful runtime heuristic. """ out = io.StringIO() out.write( "kmer\tstart\tstop\tconsidered sites\thit chance by random sequence (%)\n" ) accumulated_not_hit_chance = 1.0 for start, stop, kmers in kmers_and_offsets: if stop is None: check_length = -start if start < 0 else default_length - start else: start = default_length - start if start < 0 else start check_length = max(stop - start, 0) for kmer in kmers: kmer_length = len(kmer) considered_sites = check_length - kmer_length + 1 single_kmer_hit_chance = 1 / 4**kmer_length not_hit_chance = (1 - single_kmer_hit_chance) ** considered_sites accumulated_not_hit_chance *= not_hit_chance out.write( f"{kmer:10}\t{start}\t{stop}\t{considered_sites}\t{(1 - not_hit_chance) * 100:.2f}\n" ) out.write( f"Chance for profile hit by random sequence: {(1 - accumulated_not_hit_chance) * 100:.2f}%\n" ) return out.getvalue() if __name__ == "__main__": # This allows for easy debugging and benchmarking of the kmer heuristic code. import argparse from ._kmer_finder import KmerFinder import dnaio parser = argparse.ArgumentParser() parser.add_argument("--adapter") parser.add_argument("--anywhere", action="store_true") parser.add_argument("fastq") args = parser.parse_args() kmers_and_offsets = create_positions_and_kmers( args.adapter, 3, 0.1, back_adapter=True, front_adapter=args.anywhere ) kmer_finder = KmerFinder(kmers_and_offsets) print(kmer_probability_analysis(kmers_and_offsets)) with dnaio.open(args.fastq, mode="r", open_threads=0) as reader: # type: ignore number_of_records = 0 possible_adapters_found = 0 for number_of_records, record in enumerate(reader, start=1): if kmer_finder.kmers_present(record.sequence): possible_adapters_found += 1 print( f"Percentage possible adapters: " f"{possible_adapters_found * 100 / number_of_records:.2f}%" ) cutadapt-4.7/src/cutadapt/log.py000066400000000000000000000035071457457704700167510ustar00rootroot00000000000000import sys import logging # Custom log level REPORT = 25 class CrashingHandler(logging.StreamHandler): def emit(self, record): """Unlike the method it overrides, this will not catch exceptions""" msg = self.format(record) stream = self.stream stream.write(msg) stream.write(self.terminator) self.flush() class NiceFormatter(logging.Formatter): """ Do not prefix "INFO:" to info-level log messages (but do it for all other levels). Based on http://stackoverflow.com/a/9218261/715090 . 
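For example (illustrative): a message logged at WARNING level is emitted
as "WARNING: ...", while INFO- and REPORT-level messages are emitted
unchanged.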
""" def format(self, record): if record.levelno not in (logging.INFO, REPORT): record.msg = f"{record.levelname}: {record.msg}" return super().format(record) def setup_logging(logger, log_to_stderr=True, minimal=False, quiet=False, debug=0): """ Attach handler to the global logger object """ # For --report=minimal, we need this custom log level because we want to # print nothing except the minimal report and therefore cannot use the # INFO level (and the ERROR level would give us an 'ERROR:' prefix). logging.addLevelName(REPORT, "REPORT") stream_handler = CrashingHandler(sys.stderr if log_to_stderr else sys.stdout) stream_handler.setFormatter(NiceFormatter()) # debug overrides quiet overrides minimal if debug > 0: level = logging.DEBUG elif quiet: level = logging.ERROR elif minimal: level = REPORT else: level = logging.INFO stream_handler.setLevel(level) stderr_level = logging.WARNING stream_handler.addFilter(lambda record: record.levelno < stderr_level) logger.setLevel(level) logger.addHandler(stream_handler) stderr_handler = CrashingHandler(sys.stderr) stderr_handler.setLevel(stderr_level) logger.addHandler(stderr_handler) cutadapt-4.7/src/cutadapt/modifiers.py000066400000000000000000000775111457457704700201570ustar00rootroot00000000000000""" This module implements all the read modifications that cutadapt supports. A modifier must be callable and typically implemented as a class with a __call__ method. """ import re import logging from collections import defaultdict from types import SimpleNamespace from typing import Sequence, List, Tuple, Optional, Set from abc import ABC, abstractmethod from dnaio import record_names_match, SequenceRecord from .qualtrim import quality_trim_index, nextseq_trim_index, poly_a_trim_index from .adapters import ( MultipleAdapters, SingleAdapter, IndexedPrefixAdapters, IndexedSuffixAdapters, Match, remainder, Adapter, AdapterIndex, ) from .tokenizer import tokenize_braces, TokenizeError, Token, BraceToken from .info import ModificationInfo logger = logging.getLogger() # If the number of prefix or suffix adapters is higher than this, switch to using an index INDEXING_THRESHOLD = 5 class SingleEndModifier(ABC): @abstractmethod def __call__(self, read: SequenceRecord, info: ModificationInfo): pass class PairedEndModifier(ABC): @abstractmethod def __call__( self, read1: SequenceRecord, read2: SequenceRecord, info1: ModificationInfo, info2: ModificationInfo, ) -> Tuple[SequenceRecord, SequenceRecord]: pass class PairedEndModifierWrapper(PairedEndModifier): """ Wrap two SingleEndModifiers that work on both reads in a paired-end read """ paired = True def __init__( self, modifier1: Optional[SingleEndModifier], modifier2: Optional[SingleEndModifier], ): """Set one of the modifiers to None to work on R1 or R2 only""" self._modifier1 = modifier1 self._modifier2 = modifier2 if self._modifier1 is None and self._modifier2 is None: raise ValueError("Not both modifiers may be None") def __repr__(self): return ( "PairedEndModifierWrapper(" f"modifier1={self._modifier1!r}, modifier2={self._modifier2!r})" ) def __call__(self, read1, read2, info1: ModificationInfo, info2: ModificationInfo): if self._modifier1 is None: return read1, self._modifier2(read2, info2) # type: ignore if self._modifier2 is None: return self._modifier1(read1, info1), read2 return self._modifier1(read1, info1), self._modifier2(read2, info2) class AdapterCutter(SingleEndModifier): """ Repeatedly find one of multiple adapters in reads. 
Arguments: adapters: Adapters to be searched times: Repeat the search this number of times. action: What to do with a found adapter. - *None*: Do nothing, only update the ModificationInfo appropriately - "trim": Remove the adapter and down- or upstream sequence depending on adapter type - "mask": Replace the part of the sequence that would have been removed with "N" bases - "lowercase": Convert the part of the sequence that would have been removed to lowercase - "retain": Like "trim", but leave the adapter sequence itself in the read index: If True, attempt to create an index to speed up the search (if possible) """ def __init__( self, adapters: Sequence[Adapter], times: int = 1, action: Optional[str] = "trim", index: bool = True, ): self.times = times assert action in ("trim", "mask", "lowercase", "retain", None) self.action = action self.with_adapters = 0 self.adapter_statistics = {a: a.create_statistics() for a in adapters} if index: self.adapters = MultipleAdapters( self._regroup_into_indexed_adapters(adapters) ) else: self.adapters = MultipleAdapters(adapters) if action == "retain" and times > 1: raise ValueError("'retain' cannot be combined with times > 1") if self.times == 1 and self.action == "trim": self.match_and_trim = self._match_and_trim_once_action_trim # type: ignore def __repr__(self): return ( "AdapterCutter(" f"adapters={self.adapters!r}, times={self.times}, action='{self.action}')" ) def _regroup_into_indexed_adapters(self, adapters): prefix, suffix, single = self._split_adapters(adapters) # For somewhat better backwards compatibility, avoid re-ordering # the adapters when we don’t need to if len(prefix) > INDEXING_THRESHOLD or len(suffix) > INDEXING_THRESHOLD: result = single if len(prefix) > 1: result.append(IndexedPrefixAdapters(prefix)) else: result.extend(prefix) if len(suffix) > 1: result.append(IndexedSuffixAdapters(suffix)) else: result.extend(suffix) return result else: return adapters @staticmethod def _split_adapters( adapters: Sequence[SingleAdapter], ) -> Tuple[ Sequence[SingleAdapter], Sequence[SingleAdapter], Sequence[SingleAdapter] ]: """ Split adapters into three different categories so that they can possibly be used with a MultiAdapter. Return a tuple (prefix, suffix, other), where - prefix is a list of all anchored 5' adapters that MultiAdapter would accept - suffix is a list of all anchored 3' adapters that MultiAdapter would accept - other is a list of all remaining adapters. 
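Illustrative example (with hypothetical adapters): given a list
containing an anchored 5' adapter ^AAA, an anchored 3' adapter TTT$ and
a regular 3' adapter ACGT, the result would be ([^AAA], [TTT$], [ACGT]),
assuming the two anchored adapters are acceptable to AdapterIndex.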
""" prefix: List[SingleAdapter] = [] suffix: List[SingleAdapter] = [] other: List[SingleAdapter] = [] for a in adapters: if AdapterIndex.is_acceptable(a, prefix=True): prefix.append(a) elif AdapterIndex.is_acceptable(a, prefix=False): suffix.append(a) else: other.append(a) return prefix, suffix, other @staticmethod def trim_but_retain_adapter(read, matches: Sequence[Match]): start, stop = matches[-1].retained_adapter_interval() return read[start:stop] @staticmethod def masked_read(read, matches: Sequence[Match]): start, stop = remainder(matches) result = read[:] result.sequence = ( "N" * start + read.sequence[start:stop] + "N" * (len(read) - stop) ) return result @staticmethod def lowercased_read(read, matches: Sequence[Match]): start, stop = remainder(matches) result = read[:] result.sequence = ( read.sequence[:start].lower() + read.sequence[start:stop].upper() + read.sequence[stop:].lower() ) return result def __call__(self, read, info: ModificationInfo): trimmed_read, matches = self.match_and_trim(read) if matches: self.with_adapters += 1 for match in matches: self.adapter_statistics[match.adapter].add_match(match) info.matches.extend(matches) # TODO extend or overwrite? return trimmed_read def match_and_trim(self, read): """ Search for the best-matching adapter in a read, perform the requested action ('trim', 'mask' etc. as determined by self.action) and return the (possibly) modified read. *self.times* adapter removal rounds are done. During each round, only the best-matching adapter is trimmed. If no adapter was found in a round, no further rounds are attempted. Return a pair (trimmed_read, matches), where matches is a list of Match instances. """ matches = [] if self.action == "lowercase": # TODO this should not be needed read.sequence = read.sequence.upper() trimmed_read = read for _ in range(self.times): match = self.adapters.match_to(trimmed_read.sequence) if match is None: # if nothing found, attempt no further rounds break matches.append(match) trimmed_read = match.trimmed(trimmed_read) if not matches: return trimmed_read, [] if self.action == "trim": # read is already trimmed, nothing to do pass elif self.action == "retain": trimmed_read = self.trim_but_retain_adapter(read, matches) elif self.action == "mask": trimmed_read = self.masked_read(read, matches) elif self.action == "lowercase": trimmed_read = self.lowercased_read(read, matches) assert len(trimmed_read.sequence) == len(read) elif self.action is None: trimmed_read = read[:] return trimmed_read, matches def _match_and_trim_once_action_trim(self, read): """ Specalization of match_and_trim for the case that self.times == 1 and self.action == 'trim' """ match = self.adapters.match_to(read.sequence) if match is not None: return match.trimmed(read), [match] else: return read, [] class ReverseComplementer(SingleEndModifier): """Trim adapters from a read and its reverse complement""" def __init__(self, adapter_cutter: AdapterCutter, rc_suffix: Optional[str] = " rc"): """ rc_suffix -- suffix to add to the read name if sequence was reverse-complemented """ self.adapter_cutter = adapter_cutter self.reverse_complemented = 0 self._suffix = rc_suffix def __repr__(self): return f"ReverseComplementer(adapter_cutter={self.adapter_cutter})" def __call__(self, read: SequenceRecord, info: ModificationInfo): reverse_read = read.reverse_complement() forward_trimmed_read, forward_matches = self.adapter_cutter.match_and_trim(read) reverse_trimmed_read, reverse_matches = self.adapter_cutter.match_and_trim( reverse_read ) forward_score = 
sum(m.score for m in forward_matches) reverse_score = sum(m.score for m in reverse_matches) use_reverse_complement = reverse_score > forward_score if use_reverse_complement: self.reverse_complemented += 1 assert reverse_matches trimmed_read, matches = reverse_trimmed_read, reverse_matches info.is_rc = True if self._suffix: trimmed_read.name += self._suffix else: info.is_rc = False trimmed_read, matches = forward_trimmed_read, forward_matches if matches: self.adapter_cutter.with_adapters += 1 for match in matches: stats = self.adapter_cutter.adapter_statistics[match.adapter] stats.add_match(match) stats.reverse_complemented += bool(use_reverse_complement) info.matches.extend(matches) # TODO extend or overwrite? return trimmed_read class PairedReverseComplementer(PairedEndModifier): """Trim adapters from a read pair or its reverse complement (R1/R2 swapped)""" def __init__( self, adapter_cutter1: Optional[AdapterCutter], adapter_cutter2: Optional[AdapterCutter], rc_suffix: Optional[str] = " rc", ): """ rc_suffix -- suffix to add to the read name if sequence was reverse-complemented """ self.adapter_cutter1 = adapter_cutter1 self.adapter_cutter2 = adapter_cutter2 self.reverse_complemented = 0 # counter self._suffix = rc_suffix def __repr__(self): return ( "PairedReverseComplementer(" f"adapter_cutter1={self.adapter_cutter1}, adapter_cutter2={self.adapter_cutter2})" ) def __call__( self, r1: SequenceRecord, r2: SequenceRecord, info1: ModificationInfo, info2: ModificationInfo, ): # Run normally if self.adapter_cutter1 is not None: r1_trimmed, r1_matches = self.adapter_cutter1.match_and_trim(r1) else: r1_trimmed, r1_matches = r1, [] if self.adapter_cutter2 is not None: r2_trimmed, r2_matches = self.adapter_cutter2.match_and_trim(r2) else: r2_trimmed, r2_matches = r2, [] unswapped_score = sum(m.score for m in r1_matches) + sum( m.score for m in r2_matches ) # Run with R1 and R2 swapped (equivalent to reverse complementing) if self.adapter_cutter1 is not None: ( r1_trimmed_swapped, r1_matches_swapped, ) = self.adapter_cutter1.match_and_trim(r2) else: r1_trimmed_swapped, r1_matches_swapped = r2, [] if self.adapter_cutter2 is not None: ( r2_trimmed_swapped, r2_matches_swapped, ) = self.adapter_cutter2.match_and_trim(r1) else: r2_trimmed_swapped, r2_matches_swapped = r1, [] swapped_score = sum(m.score for m in r1_matches_swapped) + sum( m.score for m in r2_matches_swapped ) # Compare and pick the variant that is better use_reverse_complement = swapped_score > unswapped_score if use_reverse_complement: self.reverse_complemented += 1 r1_trimmed = r1_trimmed_swapped r2_trimmed = r2_trimmed_swapped r1_matches = r1_matches_swapped r2_matches = r2_matches_swapped info1.is_rc = info2.is_rc = True if self._suffix: r1_trimmed.name += self._suffix r2_trimmed.name += self._suffix else: info1.is_rc = info2.is_rc = False if r1_matches: self.adapter_cutter1.with_adapters += 1 # type: ignore for match in r1_matches: stats = self.adapter_cutter1.adapter_statistics[match.adapter] # type: ignore stats.add_match(match) stats.reverse_complemented += bool(use_reverse_complement) info1.matches.extend(r1_matches) # TODO extend or overwrite? if r2_matches: self.adapter_cutter2.with_adapters += 1 # type: ignore for match in r2_matches: stats = self.adapter_cutter2.adapter_statistics[match.adapter] # type: ignore stats.add_match(match) stats.reverse_complemented += bool(use_reverse_complement) info2.matches.extend(r2_matches) # TODO extend or overwrite? 
return r1_trimmed, r2_trimmed class PairedAdapterCutterError(Exception): pass class PairedAdapterCutter(PairedEndModifier): """ Trim adapters in pairs from R1 and R2. """ def __init__(self, adapters1, adapters2, action="trim"): """ adapters1 -- list of Adapters to be removed from R1 adapters2 -- list of Adapters to be removed from R2 Both lists must have the same, non-zero length. read pair is trimmed if adapters1[i] is found in R1 and adapters2[i] in R2. action -- What to do with a found adapter: None, 'trim', 'lowercase' or 'mask' """ if len(adapters1) != len(adapters2): raise PairedAdapterCutterError( "The number of adapters to trim from R1 and R2 must be the same. " "Given: {} for R1, {} for R2".format(len(adapters1), len(adapters2)) ) if not adapters1: raise PairedAdapterCutterError("No adapters given") self._adapter_pairs = list(zip(adapters1, adapters2)) logger.debug("Adapter pairs:") for a1, a2 in self._adapter_pairs: logger.debug(" • %s=%s -- %s=%s", a1.name, a1.spec(), a2.name, a2.spec()) self.action = action self.with_adapters = 0 self.adapter_statistics = [None, None] self.adapter_statistics[0] = {a: a.create_statistics() for a in adapters1} self.adapter_statistics[1] = {a: a.create_statistics() for a in adapters2} def __repr__(self): return f"PairedAdapterCutter(adapter_pairs={self._adapter_pairs!r})" def __call__(self, read1, read2, info1, info2): """ """ best_matches = self._find_best_match_pair(read1.sequence, read2.sequence) if best_matches is None: return read1, read2 match1, match2 = best_matches self.with_adapters += 1 result = [] for i, match, read in zip([0, 1], [match1, match2], [read1, read2]): trimmed_read = read if self.action == "lowercase": trimmed_read.sequence = trimmed_read.sequence.upper() trimmed_read = match.trimmed(trimmed_read) self.adapter_statistics[i][match.adapter].add_match(match) if self.action == "trim": # read is already trimmed, nothing to do pass elif self.action == "mask": trimmed_read = AdapterCutter.masked_read(read, [match]) elif self.action == "lowercase": trimmed_read = AdapterCutter.lowercased_read(read, [match]) assert len(trimmed_read.sequence) == len(read) elif self.action == "retain": trimmed_read = AdapterCutter.trim_but_retain_adapter(read, [match]) elif self.action is None: # --no-trim trimmed_read = read[:] result.append(trimmed_read) info1.matches.append(match1) info2.matches.append(match2) return result def _find_best_match_pair( self, sequence1: str, sequence2: str ) -> Optional[Tuple[Match, Match]]: best = None best_score = None best_errors = None for adapter1, adapter2 in self._adapter_pairs: match1 = adapter1.match_to(sequence1) if match1 is None: continue match2 = adapter2.match_to(sequence2) if match2 is None: continue total_score = match1.score + match2.score total_errors = match1.errors + match2.errors if ( best is None or total_score > best_score or (total_score == best_score and total_errors < best_errors) ): best = match1, match2 best_score = total_score best_errors = total_errors return best class UnconditionalCutter(SingleEndModifier): """ A modifier that unconditionally removes the first n or the last n bases from a read. If the length is positive, the bases are removed from the beginning of the read. If the length is negative, the bases are removed from the end of the read. 
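For example, UnconditionalCutter(5) removes the first five bases of each
read and UnconditionalCutter(-5) removes the last five (this is the
modifier behind the -u/--cut command-line option).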
""" def __init__(self, length: int): self.length = length def __repr__(self): return f"UnconditionalCutter(length={self.length})" def __call__(self, read, info: ModificationInfo): if self.length > 0: info.cut_prefix = read.sequence[: self.length] return read[self.length :] elif self.length < 0: info.cut_suffix = read.sequence[self.length :] return read[: self.length] class LengthTagModifier(SingleEndModifier): """ Replace "length=..." strings in read names. """ def __init__(self, length_tag): self.regex = re.compile(r"\b" + length_tag + r"[0-9]*\b") self.length_tag = length_tag def __repr__(self): return f"LengthTagModifier(length_tag='{self.length_tag}')" def __call__(self, read, info: ModificationInfo): read = read[:] if read.name.find(self.length_tag) >= 0: read.name = self.regex.sub( self.length_tag + str(len(read.sequence)), read.name ) return read class SuffixRemover(SingleEndModifier): """ Remove a given suffix from read names. """ def __init__(self, suffix): self.suffix = suffix def __repr__(self): return f"SuffixRemover('{self.suffix}')" def __call__(self, read, info: ModificationInfo): read = read[:] if read.name.endswith(self.suffix): read.name = read.name[: -len(self.suffix)] return read class PrefixSuffixAdder(SingleEndModifier): """ Add a suffix and a prefix to read names """ def __init__(self, prefix, suffix): self.prefix = prefix self.suffix = suffix def __repr__(self): return f"PrefixSuffixAdder(prefix='{self.prefix}', suffix='{self.suffix}')" def __call__(self, read, info): read = read[:] adapter_name = info.matches[-1].adapter.name if info.matches else "no_adapter" read.name = ( self.prefix.replace("{name}", adapter_name) + read.name + self.suffix.replace("{name}", adapter_name) ) return read class InvalidTemplate(Exception): pass class Renamer(SingleEndModifier): """ Rename reads using a template The template string can contain the following placeholders: - {header} -- full, unchanged header - {id} -- the part of the header before the first whitespace - {comment} -- the part of the header after the ID, excluding initial whitespace - {cut_prefix} -- prefix removed by UnconditionalCutter (with positive length argument) - {cut_suffix} -- suffix removed by UnconditionalCutter (with negative length argument) - {adapter_name} -- name of the *last* adapter match or no_adapter if there was none - {match_sequence} -- the sequence that matched the adapter (this includes possible errors) or an empty string if there was no match - {rc} -- the string 'rc' if the read was reverse complemented (with --revcomp) or '' otherwise """ variables = { "header", "id", "comment", "cut_prefix", "cut_suffix", "adapter_name", "rc", "match_sequence", } def __init__(self, template: str): template = template.replace(r"\t", "\t") try: self._tokens = list(tokenize_braces(template)) except TokenizeError as e: raise InvalidTemplate(f"Error in template '{template}': {e}") self.raise_if_invalid_variable(self._tokens, self.variables) self._template = template self._rename = self.compile_rename_function() def __repr__(self): return f"{self.__class__.__name__}('{self._template}')" def __reduce__(self): return Renamer, (self._template,) def compile_rename_function(self): """ Create the function that computes a new name By creating the code dynamically, we can ensure that only those placeholder values are computed that are actually used in the template. 
""" code = { "header": "read.name", "id": "id_", "comment": "comment", "cut_prefix": "info.cut_prefix if info.cut_prefix else ''", "cut_suffix": "info.cut_suffix if info.cut_suffix else ''", "adapter_name": "info.matches[-1].adapter.name if info.matches else 'no_adapter'", "rc": "'rc' if info.is_rc else ''", "match_sequence": "info.matches[-1].match_sequence() if info.matches else ''", } placeholders = set( token.value for token in self._tokens if isinstance(token, BraceToken) ) lines = ["def rename(self, read, info):"] if "id" in placeholders or "header" in placeholders: lines.append(" id_, comment = self.parse_name(read.name)") lines.append(" return self._template.format(") for placeholder in placeholders: lines.append(f" {placeholder}={code[placeholder]},") lines.append(" )") logger.debug("Generated code of rename function:\n%s", "\n".join(lines)) namespace = dict() exec("\n".join(lines), namespace) return namespace["rename"] @staticmethod def raise_if_invalid_variable(tokens: List[Token], allowed: Set[str]) -> None: for token in tokens: if not isinstance(token, BraceToken): continue value = token.value if value not in allowed: raise InvalidTemplate( f"Error in template: Variable '{value}' not recognized" ) @staticmethod def parse_name(read_name: str) -> Tuple[str, str]: """Parse read header and return (id, comment) tuple""" fields = read_name.split(maxsplit=1) if len(fields) == 2: return (fields[0], fields[1]) else: return (read_name, "") def __call__(self, read: SequenceRecord, info: ModificationInfo) -> SequenceRecord: read.name = self._rename(self, read, info) return read class PairedEndRenamer(PairedEndModifier): """ Rename paired-end reads using a template. The template is applied to both R1 and R2, and the same template variables as in the (single-end) renamer are allowed. However, these variables are evaluated separately for each read. For example, if `{comment}` is used, it gets replaced with the R1 comment in the R1 header, and with the R2 comment in the R2 header. Additionally, all template variables except `id` can be used in the read-specific forms `{r1.variablename}` and `{r2.variablename}`. For example, `{r1.comment}` always gets replaced with the R1 comment, even in R2. """ def __init__(self, template: str): try: self._tokens = list(tokenize_braces(template)) except TokenizeError as e: raise InvalidTemplate(f"Error in template '{template}': {e}") Renamer.raise_if_invalid_variable(self._tokens, self._get_allowed_variables()) self._template = template.replace(r"\t", "\t") @staticmethod def _get_allowed_variables() -> Set[str]: allowed = (Renamer.variables - {"rc"}) | {"rn"} for v in Renamer.variables - {"id", "rc"}: allowed.add("r1." + v) allowed.add("r2." + v) return allowed def __call__( self, read1: SequenceRecord, read2: SequenceRecord, info1: ModificationInfo, info2: ModificationInfo, ) -> Tuple[SequenceRecord, SequenceRecord]: if not record_names_match(read1.name, read2.name): id1 = Renamer.parse_name(read1.name)[0] id2 = Renamer.parse_name(read2.name)[0] raise ValueError(f"Input read IDs not identical: '{id1}' != '{id2}'") name1, name2 = self._rename(read1, read2, info1, info2) if not record_names_match(name1, name2): new_id1 = Renamer.parse_name(name1)[0] new_id2 = Renamer.parse_name(name2)[0] id1 = Renamer.parse_name(read1.name)[0] raise InvalidTemplate( "After renaming R1 and R2, their IDs are no longer identical: " f"'{new_id1}' != '{new_id2}'. Original read ID: '{id1}'.
" ) read1.name = name1 read2.name = name2 return read1, read2 def _rename( self, read1: SequenceRecord, read2: SequenceRecord, info1: ModificationInfo, info2: ModificationInfo, ) -> Tuple[str, str]: id1, comment1 = Renamer.parse_name(read1.name) id2, comment2 = Renamer.parse_name(read2.name) header1 = read1.name header2 = read2.name d = [] for id_, comment, header, info in ( (id1, comment1, header1, info1), (id2, comment2, header2, info2), ): if info.matches: adapter_name = info.matches[-1].adapter.name match_sequence = info.matches[-1].match_sequence() else: adapter_name = "no_adapter" match_sequence = "" d.append( dict( comment=comment, header=header, cut_prefix=info.cut_prefix if info.cut_prefix else "", cut_suffix=info.cut_suffix if info.cut_suffix else "", adapter_name=adapter_name, match_sequence=match_sequence, ) ) name1 = self._template.format( id=id1, rn=1, **d[0], r1=SimpleNamespace(**d[0]), r2=SimpleNamespace(**d[1]), ) name2 = self._template.format( id=id2, rn=2, **d[1], r1=SimpleNamespace(**d[0]), r2=SimpleNamespace(**d[1]), ) return name1, name2 class ZeroCapper(SingleEndModifier): """ Change negative quality values of a read to zero """ def __init__(self, quality_base=33): self.quality_base = quality_base qb = quality_base self.zero_cap_trans = str.maketrans("".join(map(chr, range(qb))), chr(qb) * qb) def __repr__(self): return f"ZeroCapper(quality_base={self.quality_base})" def __call__(self, read, info: ModificationInfo): read = read[:] read.qualities = read.qualities.translate(self.zero_cap_trans) return read class NextseqQualityTrimmer(SingleEndModifier): def __init__(self, cutoff: int, base: int = 33): self.cutoff = cutoff self.base = base self.trimmed_bases = 0 def __repr__(self): return f"NextseqQualityTrimmer(cutoff={self.cutoff}, base={self.base})" def __call__(self, read, info: ModificationInfo): stop = nextseq_trim_index(read, self.cutoff, self.base) self.trimmed_bases += len(read) - stop return read[:stop] class QualityTrimmer(SingleEndModifier): def __init__(self, cutoff_front: int, cutoff_back: int, base: int = 33): self.cutoff_front = cutoff_front self.cutoff_back = cutoff_back self.base = base self.trimmed_bases = 0 def __repr__(self): return ( f"QualityTrimmer(cutoff_front={self.cutoff_front}, " f"cutoff_back={self.cutoff_back}, base={self.base})" ) def __call__(self, read, info: ModificationInfo): start, stop = quality_trim_index( read.qualities, self.cutoff_front, self.cutoff_back, self.base ) self.trimmed_bases += len(read) - (stop - start) return read[start:stop] class PolyATrimmer(SingleEndModifier): """Trim poly-A tails or poly-T heads""" def __init__(self, revcomp=False): self.trimmed_bases = defaultdict(int) self.revcomp = revcomp def __repr__(self): return "PolyATrimmer()" def __call__(self, record: SequenceRecord, info: ModificationInfo): if self.revcomp: index = poly_a_trim_index(record.sequence, revcomp=True) self.trimmed_bases[index] += 1 return record[index:] else: index = poly_a_trim_index(record.sequence) self.trimmed_bases[len(record) - index] += 1 return record[:index] class Shortener(SingleEndModifier): """Unconditionally shorten a read to the given length If the length is positive, the bases are removed from the end of the read. If the length is negative, the bases are removed from the beginning of the read. 
""" def __init__(self, length): self.length = length def __repr__(self): return f"Shortener(length={self.length})" def __call__(self, read, info: ModificationInfo): if self.length >= 0: return read[: self.length] else: return read[self.length :] class NEndTrimmer(SingleEndModifier): """Trims Ns from the 3' and 5' end of reads""" def __init__(self): self.start_trim = re.compile(r"^N+") self.end_trim = re.compile(r"N+$") def __repr__(self): return "NEndTrimmer()" def __call__(self, read, info: ModificationInfo): sequence = read.sequence start_cut = self.start_trim.match(sequence) end_cut = self.end_trim.search(sequence) start_cut = start_cut.end() if start_cut else 0 end_cut = end_cut.start() if end_cut else len(read) return read[start_cut:end_cut] cutadapt-4.7/src/cutadapt/parser.py000066400000000000000000000453271457457704700174720ustar00rootroot00000000000000""" Parse adapter specifications """ import re import logging from pathlib import Path from typing import Type, Optional, List, Tuple, Any, Dict, Iterable from xopen import xopen from dnaio.readers import FastaReader from .adapters import ( Adapter, FrontAdapter, NonInternalFrontAdapter, BackAdapter, NonInternalBackAdapter, AnywhereAdapter, PrefixAdapter, SuffixAdapter, LinkedAdapter, InvalidCharacter, RightmostFrontAdapter, ) logger = logging.getLogger(__name__) def parse_search_parameters(spec: str): """Parse key=value;key=value;key=value into a dict""" allowed_parameters = { # abbreviations "e": "max_error_rate", "error_rate": "max_errors", "max_error_rate": "max_errors", "o": "min_overlap", # allowed parameters "max_errors": None, "min_overlap": None, "anywhere": None, "required": None, "optional": None, # If this is specified, 'required' will be set to False "indels": None, "noindels": None, "rightmost": None, } fields = spec.split(";") result: Dict[str, Any] = dict() for field in fields: field = field.strip() if not field: continue key, equals, value = field.partition("=") # type: (str, str, Any) key = key.strip() if key not in allowed_parameters: raise KeyError(f"Unknown parameter '{key}'") if equals == "=" and value == "": raise ValueError(f"No value given for key '{key}'") # unabbreviate while allowed_parameters[key] is not None: key = allowed_parameters[key] # type: ignore value = value.strip() if value == "": value = True else: try: value = int(value) except ValueError: value = float(value) if key in result: raise KeyError(f"Key '{key}' specified twice") result[key] = value if "optional" in result and "required" in result: raise ValueError( "'optional' and 'required' cannot be specified at the same time" ) if "indels" in result and "noindels" in result: raise ValueError("'indels' and 'noindels' cannot be specified at the same time") if "optional" in result: result["required"] = False del result["optional"] if "noindels" in result: result["indels"] = False del result["noindels"] return result def expand_braces(sequence: str) -> str: """ Replace all occurrences of ``x{n}`` (where x is any character) with n occurrences of x. Raise ValueError if the expression cannot be parsed. 
>>> expand_braces('TGA{5}CT') 'TGAAAAACT' """ # Simple DFA with four states, encoded in prev result = "" prev = None for s in re.split("([{}])", sequence): if s == "": continue if prev is None: if s == "{": raise ValueError('"{" must be used after a character') if s == "}": raise ValueError('"}" cannot be used here') prev = s result += s elif prev == "{": prev = int(s) if not 0 <= prev <= 10000: raise ValueError(f"Value {prev} invalid") elif isinstance(prev, int): if s != "}": raise ValueError('"}" expected') result = result[:-1] + result[-1] * prev prev = None else: if s != "{": raise ValueError('Expected "{"') prev = "{" # Check if we are in a non-terminating state if isinstance(prev, int) or prev == "{": raise ValueError("Unterminated expression") return result def _normalize_ellipsis(spec1: str, spec2: str, adapter_type) -> Tuple[str, str]: if adapter_type == "anywhere": raise ValueError('No ellipsis ("...") allowed in "anywhere" adapters') if not spec1: if adapter_type == "back": # -a ...ADAPTER spec = spec2 else: # -g ...ADAPTER raise ValueError("Invalid adapter specification") elif not spec2: if adapter_type == "back": # -a ADAPTER... adapter_type = "front" spec = spec1 else: # -g ADAPTER... spec = spec1 else: raise ValueError("Expected either spec1 or spec2") return spec, adapter_type class AdapterSpecification: """# noqa: E501 Description of a single non-linked adapter. These are the attributes: - name (None or str) - restriction (None, 'anchored', or 'noninternal') - sequence (nucleotide sequence as string) - search parameters (dict with keys such as 'max_errors', 'min_overlap') - adapter_type ('front' for -g, 'back' for -a and 'anywhere' for -b) >>> AdapterSpecification.parse('a_name=ACGT;anywhere', 'back') AdapterSpecification(name='a_name', restriction=None, sequence='ACGT', parameters={'anywhere': True}, adapter_type='back') """ def __init__( self, name: Optional[str], restriction: Optional[str], sequence: str, parameters, adapter_type: str, rightmost: bool, ): assert restriction in (None, "anchored", "noninternal") assert adapter_type in ("front", "back", "anywhere") self.name = name self.restriction = restriction self.sequence = sequence self.parameters = parameters self.adapter_type = adapter_type self.rightmost = rightmost def __repr__(self): return "{}(name={!r}, restriction={!r}, sequence={!r}, parameters={!r}, adapter_type={!r})".format( self.__class__.__name__, self.name, self.restriction, self.sequence, self.parameters, self.adapter_type, ) def __eq__(self, other): return ( self.name == other.name and self.restriction == other.restriction and self.sequence == other.sequence and self.parameters == other.parameters and self.adapter_type == other.adapter_type ) @staticmethod def _extract_name(spec: str) -> Tuple[Optional[str], str]: """ Parse an adapter specification given as 'name=adapt' into 'name' and 'adapt'. """ fields = spec.split("=", 1) name: Optional[str] = None if len(fields) > 1: name, spec = fields name = name.strip() spec = spec.strip() return name, spec @classmethod def parse(cls, spec: str, adapter_type: str) -> "AdapterSpecification": """ Parse an adapter specification for a non-linked adapter (without '...') and return an AdapterSpecification instance.
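An added illustration:

>>> AdapterSpecification.parse('^ACGT', 'front')
AdapterSpecification(name=None, restriction='anchored', sequence='ACGT', parameters={}, adapter_type='front')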
Allow: 'back' and ADAPTER 'back' and ADAPTERX 'back' and ADAPTER$ 'front' and ADAPTER 'front' and XADAPTER 'front' and ^ADAPTER 'anywhere' and ADAPTER """ if adapter_type not in ("front", "back", "anywhere"): raise ValueError("adapter_type must be front, back or anywhere") spec, middle, parameters_spec = spec.partition(";") name, spec = cls._extract_name(spec) spec = spec.strip() parameters = parse_search_parameters(parameters_spec) spec = expand_braces(spec) rightmost = parameters.pop("rightmost", False) # Special case for adapters consisting of only X characters: # This needs to be supported for backwards-compatibility if len(spec.strip("X")) == 0: return cls(name, None, spec, {}, adapter_type, False) try: front_restriction, back_restriction, spec = cls._parse_restrictions(spec) except ValueError: raise ValueError( "You cannot use multiple placement restrictions for an adapter at the same time. " "Choose one of ^ADAPTER, ADAPTER$, XADAPTER or ADAPTERX" ) from None if adapter_type == "front" and back_restriction: raise ValueError( "Allowed placement restrictions for a 5' adapter are XADAPTER and ^ADAPTER" ) if adapter_type == "back" and front_restriction: raise ValueError( "Allowed placement restrictions for a 3' adapter are ADAPTERX and ADAPTER$" ) if front_restriction is not None: restriction: Optional[str] = front_restriction else: restriction = back_restriction if adapter_type == "anywhere" and restriction is not None: raise ValueError( "Placement restrictions (with X, ^, $) not supported for 'anywhere' (-b) adapters" ) if "min_overlap" in parameters and restriction == "anchored": raise ValueError( "Setting 'min_overlap=' (or 'o=') for anchored adapters is not possible because " "anchored adapters always need to match in full." ) if parameters.get("min_overlap", 0) > len(spec): raise ValueError( f"min_overlap={parameters['min_overlap']}" f" exceeds length of adapter {spec}" ) if rightmost and (adapter_type != "front" or restriction is not None): raise ValueError("'rightmost' only allowed with regular 5' adapters") return cls(name, restriction, spec, parameters, adapter_type, rightmost) @staticmethod def _parse_restrictions(spec: str) -> Tuple[Optional[str], Optional[str], str]: front_restriction = None if spec.startswith("^"): front_restriction = "anchored" spec = spec[1:] if spec.upper().startswith("X"): if front_restriction is not None: raise ValueError("two front restrictions") front_restriction = "noninternal" spec = spec.lstrip("xX") back_restriction = None if spec.endswith("$"): back_restriction = "anchored" spec = spec[:-1] if spec.upper().endswith("X"): if back_restriction is not None: raise ValueError("two back restrictions") back_restriction = "noninternal" spec = spec.rstrip("xX") n_placement_restrictions = int(bool(front_restriction)) + int( bool(back_restriction) ) if n_placement_restrictions > 1: raise ValueError("front and back restrictions") assert front_restriction is None or back_restriction is None return front_restriction, back_restriction, spec @staticmethod def _restriction_to_class(adapter_type, restriction, rightmost): """ restriction: None, "anchored", or "noninternal" """ if adapter_type == "front": if rightmost: assert restriction is None return RightmostFrontAdapter elif restriction is None: return FrontAdapter elif restriction == "anchored": return PrefixAdapter elif restriction == "noninternal": return NonInternalFrontAdapter else: raise ValueError( f"Value {restriction} for a front restriction not allowed" ) elif adapter_type == "back": if restriction
is None: return BackAdapter elif restriction == "anchored": return SuffixAdapter elif restriction == "noninternal": return NonInternalBackAdapter else: raise ValueError( f"Value {restriction} for a back restriction not allowed" ) else: assert adapter_type == "anywhere" if restriction is None: return AnywhereAdapter else: raise ValueError( 'No placement may be specified for "anywhere" adapters' ) def adapter_class(self): return self._restriction_to_class( self.adapter_type, self.restriction, self.rightmost ) def make_adapters_from_specifications( type_spec_pairs: List[Tuple[str, str]], search_parameters: Dict[str, Any], ) -> List[Adapter]: """ Create a list of Adapter classes from specification strings and adapter types. type_spec_pairs -- a list of (str, str) pairs, where the first is the adapter type (either 'front', 'back' or 'anywhere') and the second is the adapter specification string, such as "ACGT;o=3" or "file:adapters.fasta" search_parameters -- A dict with default search parameters. These can be overridden by the adapter specifications. They are passed as **kwargs when instantiating the adapter classes. Possible keys: max_error_rate, min_overlap, read_wildcards, adapter_wildcards, indels Return a list of appropriate Adapter instances. """ adapters: List[Adapter] = [] for adapter_type, spec in type_spec_pairs: adapters.extend( make_adapters_from_one_specification(spec, adapter_type, search_parameters) ) return adapters def make_adapters_from_one_specification( spec: str, adapter_type: str, search_parameters: Dict[str, Any], ) -> Iterable[Adapter]: """ Parse an adapter specification and yield appropriate Adapter classes. """ if ( spec.startswith("file:") or spec.startswith("^file:") or spec.startswith("file$:") ): anchoring_prefix = "" anchoring_suffix = "" if spec.startswith("^"): spec = spec[1:] anchoring_prefix = "^" elif spec.startswith("file$:"): spec = "file:" + spec[6:] anchoring_suffix = "$" path, _, parameters_spec = spec[5:].partition(";") parameters = search_parameters.copy() parameters.update(parse_search_parameters(parameters_spec)) for name, spec in read_adapters_fasta(path): yield make_adapter( anchoring_prefix + spec + anchoring_suffix, adapter_type, parameters, name=name, ) else: try: yield make_adapter(spec, adapter_type, search_parameters) except InvalidCharacter as e: if Path(spec).exists(): extra_message = ( f"A file exists named '{spec}'. " "To use the sequences in that file as adapter sequences, write 'file:' " f"before the path, as in 'file:{spec}'." ) raise InvalidCharacter(e.args[0] + "\n" + extra_message) else: raise def make_adapter( spec: str, adapter_type: str, search_parameters: Dict[str, Any], name: Optional[str] = None, ) -> Adapter: """ Parse an adapter specification not using ``file:`` notation and return an object of an appropriate Adapter class. name -- Adapter name if not included as part of the spec. (If spec is 'name=ADAPTER', name will be 'name'.) adapter_type -- describes which commandline parameter was used (``-a`` is 'back', ``-b`` is 'anywhere', and ``-g`` is 'front'). search_parameters -- dict with default search parameters """ if adapter_type not in ("front", "back", "anywhere"): raise ValueError("adapter_type must be front, back or anywhere") spec1, middle, spec2 = spec.partition("...") if middle == "..."
and spec1 and spec2: return _make_linked_adapter(spec1, spec2, name, adapter_type, search_parameters) if middle == "...": spec, adapter_type = _normalize_ellipsis(spec1, spec2, adapter_type) else: spec = spec1 return _make_not_linked_adapter(spec, name, adapter_type, search_parameters) def _make_linked_adapter( spec1: str, spec2: str, name: Optional[str], adapter_type: str, search_parameters: Dict[str, Any], ) -> LinkedAdapter: """Return a linked adapter from two specification strings""" if adapter_type == "anywhere": raise ValueError("'anywhere' (-b) adapters may not be linked") front_spec = AdapterSpecification.parse(spec1, "front") back_spec = AdapterSpecification.parse(spec2, "back") if name is None: name = front_spec.name front_anchored = front_spec.restriction is not None back_anchored = back_spec.restriction is not None front_parameters = search_parameters.copy() front_parameters.update(front_spec.parameters) back_parameters = search_parameters.copy() back_parameters.update(back_spec.parameters) if adapter_type == "front": # -g requires both adapters to be present front_required = True back_required = True else: # -a requires only the anchored adapters to be present front_required = front_anchored back_required = back_anchored # Handle parameters overriding whether an adapter is required front_required = front_parameters.pop("required", front_required) back_required = back_parameters.pop("required", back_required) front_adapter = front_spec.adapter_class()( front_spec.sequence, name="linked_front", **front_parameters ) back_adapter = back_spec.adapter_class()( back_spec.sequence, name="linked_back", **back_parameters ) return LinkedAdapter( front_adapter=front_adapter, back_adapter=back_adapter, front_required=front_required, back_required=back_required, name=name, ) def _make_not_linked_adapter( spec: str, name: Optional[str], adapter_type: str, search_parameters: Dict[str, Any], ) -> Adapter: aspec = AdapterSpecification.parse(spec, adapter_type) adapter_class: Type[Adapter] = aspec.adapter_class() if aspec.parameters.pop("anywhere", False) and adapter_class in ( FrontAdapter, BackAdapter, RightmostFrontAdapter, ): aspec.parameters["force_anywhere"] = True if "required" in aspec.parameters: raise ValueError( "'required' and 'optional' can only be used within linked adapters" ) parameters = search_parameters.copy() parameters.update(aspec.parameters) return adapter_class( sequence=aspec.sequence, name=aspec.name if name is None else name, **parameters, ) def read_adapters_fasta(path): """ Read adapter sequences from a FASTA file """ with xopen(path, mode="rb", threads=0) as f: fasta = FastaReader(f) # type: ignore for record in fasta: header = record.name.split(None, 1) name = header[0] if header else None yield name, record.sequence cutadapt-4.7/src/cutadapt/pipeline.py000066400000000000000000000110551457457704700177720ustar00rootroot00000000000000import logging from abc import ABC, abstractmethod from typing import List, Optional, Tuple, Union from .files import InputFiles from .utils import Progress from .modifiers import ( SingleEndModifier, PairedEndModifier, PairedEndModifierWrapper, ModificationInfo, ) from .steps import SingleEndStep logger = logging.getLogger() class Pipeline(ABC): """ Processing pipeline that loops over reads and applies modifiers and filters """ @abstractmethod def process_reads( self, infiles: InputFiles, progress: Optional[Progress] = None, ) -> Tuple[int, int, Optional[int]]: pass class SingleEndPipeline(Pipeline): """ Processing pipeline for 
single-end reads """ paired = False def __init__( self, modifiers: List[SingleEndModifier], steps: List[SingleEndStep], ): self._modifiers: List[SingleEndModifier] = modifiers self._steps = steps def process_reads( self, infiles: InputFiles, progress: Optional[Progress] = None, ) -> Tuple[int, int, Optional[int]]: """Run the pipeline. Return statistics""" reader = infiles.open() for i, step in enumerate(self._steps, 1): logger.debug("Pipeline step %d: %s", i, step) n = 0 # no. of processed reads total_bp = 0 modifiers_and_steps = self._modifiers + self._steps for read in reader: n += 1 if n % 10000 == 0 and progress is not None: progress.update(10000) total_bp += len(read) info = ModificationInfo(read) for step in modifiers_and_steps: # type: ignore[assignment] read = step(read, info) if read is None: break if progress is not None: progress.update(n % 10000) infiles.close() return (n, total_bp, None) class PairedEndPipeline(Pipeline): """ Processing pipeline for paired-end reads. """ paired = True def __init__( self, modifiers: List[ Union[ PairedEndModifier, Tuple[Optional[SingleEndModifier], Optional[SingleEndModifier]], ] ], steps, ): self._modifiers: List[PairedEndModifier] = [] self._steps = steps self._reader = None # Whether to ignore pair_filter mode for discard-untrimmed filter self.override_untrimmed_pair_filter = False self._add_modifiers(modifiers) def _add_modifiers(self, modifiers): for modifier in modifiers: if isinstance(modifier, tuple): self._add_two_single_modifiers(*modifier) else: self._add_modifier(modifier) def _add_two_single_modifiers( self, modifier1: Optional[SingleEndModifier], modifier2: Optional[SingleEndModifier], ) -> None: """ Add two single-end modifiers that modify R1 and R2, respectively. One of them can be None, in which case the modifier is only applied to the respective other read. """ if modifier1 is None and modifier2 is None: raise ValueError("Not both modifiers can be None") self._modifiers.append(PairedEndModifierWrapper(modifier1, modifier2)) def _add_modifier(self, modifier: PairedEndModifier) -> None: """Add a Modifier (without wrapping it in a PairedEndModifierWrapper)""" self._modifiers.append(modifier) def process_reads( self, infiles: InputFiles, progress: Optional[Progress] = None, ) -> Tuple[int, int, Optional[int]]: self._infiles = infiles self._reader = infiles.open() n = 0 # no. of processed reads total1_bp = 0 total2_bp = 0 assert self._reader is not None modifiers_and_steps = self._modifiers + self._steps for reads in self._reader: n += 1 if n % 10000 == 0 and progress is not None: progress.update(10000) read1, read2 = reads total1_bp += len(read1) total2_bp += len(read2) info1 = ModificationInfo(read1) info2 = ModificationInfo(read2) for step in modifiers_and_steps: reads = step(*reads, info1, info2) # type: ignore if reads is None: break if progress is not None: progress.update(n % 10000) infiles.close() return (n, total1_bp, total2_bp) cutadapt-4.7/src/cutadapt/predicates.py000066400000000000000000000112561457457704700203130ustar00rootroot00000000000000""" Filtering criteria (predicates) """ from abc import ABC, abstractmethod from .qualtrim import expected_errors from .modifiers import ModificationInfo class Predicate(ABC): @abstractmethod def test(self, read, info: ModificationInfo) -> bool: """ Return True if the filtering criterion matches. """ @classmethod def descriptive_identifier(cls) -> str: """ Return a short name for this predicate based on the class name such as "too_long", "too_many_expected_errors". 
This is used as identifier in the JSON report. """ return "".join( ("_" + ch.lower() if ch.isupper() else ch) for ch in cls.__name__ )[1:] class TooShort(Predicate): """Select reads that are shorter than the specified minimum length""" def __init__(self, minimum_length: int): self.minimum_length = minimum_length def __repr__(self): return f"TooShort(minimum_length={self.minimum_length})" def test(self, read, info: ModificationInfo): return len(read) < self.minimum_length class TooLong(Predicate): """Select reads that are longer than the specified maximum length""" def __init__(self, maximum_length: int): self.maximum_length = maximum_length def __repr__(self): return f"TooLong(maximum_length={self.maximum_length})" def test(self, read, info: ModificationInfo): return len(read) > self.maximum_length class TooManyExpectedErrors(Predicate): """ Select reads whose expected number of errors, according to the quality values, exceeds a threshold. The idea comes from usearch's -fastq_maxee parameter (http://drive5.com/usearch/). """ def __init__(self, max_errors: float): self.max_errors = max_errors def __repr__(self): return f"TooManyExpectedErrors(max_errors={self.max_errors})" def test(self, read, info: ModificationInfo): return expected_errors(read.qualities) > self.max_errors class TooHighAverageErrorRate(Predicate): """ Select reads that have an average error rate above the threshold. This works better than TooManyExpectedErrors for reads that are expected to have varying lengths, such as for long read sequencing technologies. """ def __init__(self, max_error_rate: float): if not 0.0 < max_error_rate < 1.0: raise ValueError( f"max_error_rate must be between 0.0 and 1.0, got {max_error_rate}." ) self.max_error_rate = max_error_rate def __repr__(self): return f"TooHighAverageErrorRate(max_error_rate={self.max_error_rate})" def test(self, read, info: ModificationInfo): return (expected_errors(read.qualities) / len(read)) > self.max_error_rate class TooManyN(Predicate): """ Select reads that have too many 'N' bases. Either a raw count or a proportion (relative to the sequence length) can be used. """ def __init__(self, count: float): """ count -- if it is below 1.0, it will be considered a proportion, and above and equal to 1 will be considered as discarding reads with a number of N's greater than this cutoff. """ assert count >= 0 self.is_proportion = count < 1.0 self.cutoff = count def __repr__(self): return f"TooManyN(cutoff={self.cutoff}, is_proportion={self.is_proportion})" def test(self, read, info: ModificationInfo): n_count = read.sequence.lower().count("n") if self.is_proportion: if len(read) == 0: return False return n_count / len(read) > self.cutoff else: return n_count > self.cutoff class CasavaFiltered(Predicate): """ Select reads that have failed the CASAVA filter according to the read header. The headers look like ``xxxx x:Y:x:x`` (with a ``Y``). Reads that pass the filter have an ``N`` instead of ``Y``. Reads with unrecognized headers are not selected.
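A doctest-style sketch (the read objects cutadapt processes are dnaio.SequenceRecord instances; the ModificationInfo argument is unused by this predicate, so None suffices):

>>> from dnaio import SequenceRecord
>>> CasavaFiltered().test(SequenceRecord('M01:23:XX 1:Y:0:ATCACG', 'ACGT'), None)
True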
""" def __repr__(self): return "CasavaFiltered()" def test(self, read, info: ModificationInfo): _, _, right = read.name.partition(" ") return right[1:4] == ":Y:" # discard if :Y: found class IsUntrimmed(Predicate): """ Select reads for which no adapter match was found """ def __repr__(self): return "DiscardUntrimmed()" def test(self, read, info: ModificationInfo): return not info.matches class IsTrimmed(Predicate): """ Select reads for which at least one adapter match was found """ def __repr__(self): return "DiscardTrimmed()" def test(self, read, info: ModificationInfo): return bool(info.matches) cutadapt-4.7/src/cutadapt/qualtrim.pyi000066400000000000000000000007701457457704700201760ustar00rootroot00000000000000from typing import Tuple from typing_extensions import Protocol class _Sequence(Protocol): sequence: str qualities: str class HasNoQualities(Exception): ... def quality_trim_index( qualities: str, cutoff_front: int, cutoff_back: int, base: int = 33 ) -> Tuple[int, int]: ... def nextseq_trim_index(sequence: _Sequence, cutoff: int, base: int = 33) -> int: ... def expected_errors(qualities: str, base: int = 33) -> float: ... def poly_a_trim_index(s: str, revcomp: bool = ...) -> int: ... cutadapt-4.7/src/cutadapt/qualtrim.pyx000066400000000000000000000131101457457704700202050ustar00rootroot00000000000000# cython: profile=False, emit_code_comments=False, language_level=3 """ Quality trimming. """ from cpython.unicode cimport PyUnicode_GET_LENGTH from libc.stdint cimport uint8_t cdef extern from *: unsigned char * PyUnicode_1BYTE_DATA(object o) void *PyUnicode_DATA(object o) bint PyUnicode_IS_COMPACT_ASCII(object o) int PyUnicode_KIND(object o) int PyUnicode_1BYTE_KIND cdef extern from "expected_errors.h": float expected_errors_from_phreds(const uint8_t *phreds, size_t phreds_length, uint8_t base) cdef class HasNoQualities(Exception): pass def quality_trim_index(str qualities, int cutoff_front, int cutoff_back, int base=33): """ Find the positions at which to trim low-quality ends from a nucleotide sequence. Return tuple (start, stop) that indicates the good-quality segment. Qualities are assumed to be ASCII-encoded as chr(qual + base). The algorithm is the same as the one used by BWA within the function 'bwa_trim_read': - Subtract the cutoff value from all qualities. - Compute partial sums from all indices to the end of the sequence. - Trim sequence at the index at which the sum is minimal. """ if qualities is None: raise HasNoQualities("Cannot do quality trimming when no qualities are available") cdef: int s int max_qual int n = len(qualities) int stop = n int start = 0 int i char* qual if not PyUnicode_KIND(qualities) == PyUnicode_1BYTE_KIND: raise ValueError("Quality data is not ASCII") qual = PyUnicode_1BYTE_DATA(qualities) # find trim position for 5' end s = 0 max_qual = 0 for i in range(n): s += cutoff_front - (qual[i] - base) if s < 0: break if s > max_qual: max_qual = s start = i + 1 # same for 3' end max_qual = 0 s = 0 for i in reversed(range(n)): s += cutoff_back - (qual[i] - base) if s < 0: break if s > max_qual: max_qual = s stop = i if start >= stop: start, stop = 0, 0 return (start, stop) def nextseq_trim_index(sequence, int cutoff, int base=33): """ Variant of the above quality trimming routine that works on NextSeq data. With Illumina NextSeq, bases are encoded with two colors. 'No color' (a dark cycle) usually means that a 'G' was sequenced, but that also occurs when sequencing falls off the end of the fragment. 
The read then contains a run of high-quality G bases in the end. This routine works as the one above, but counts qualities belonging to 'G' bases as being equal to cutoff - 1. """ bases = sequence.sequence qualities = sequence.qualities if qualities is None: raise HasNoQualities() cdef: int s = 0 int max_qual = 0 int max_i int i, q char* qual if not PyUnicode_KIND(qualities) == PyUnicode_1BYTE_KIND: raise ValueError("Quality data is not ASCII") qual = PyUnicode_1BYTE_DATA(qualities) s = 0 max_qual = 0 max_i = len(qualities) for i in reversed(range(max_i)): q = qual[i] - base if bases[i] == 'G': q = cutoff - 1 s += cutoff - q if s < 0: break if s > max_qual: max_qual = s max_i = i return max_i def poly_a_trim_index(str s, bint revcomp = False): """ Return start index of poly-A tail If revcomp is True, return end of poly-T head instead. Poly-A tails shorter than 3 are ignored. """ if not PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND: raise ValueError("Sequence is not ASCII") cdef: char* s_ptr = PyUnicode_1BYTE_DATA(s) int n = len(s) int best_score = 0 int score = 0 int i char c int errors = 0 int best_index if revcomp: best_index = 0 for i in range(n): if s_ptr[i] == b"T": score += 1 else: score -= 2 errors += 1 if score > best_score and errors * 5 <= i + 1: # max error rate 0.2 best_score = score best_index = i + 1 if best_index < 3: best_index = 0 else: best_index = n for i in reversed(range(n)): if s_ptr[i] == b"A": score += 1 else: score -= 2 errors += 1 if score > best_score and errors * 5 <= n - i: # max error rate 0.2 best_score = score best_index = i if best_index > n - 3: best_index = n return best_index def expected_errors(str qualities, uint8_t base=33): """ Return the number of expected errors (as double) from a read’s qualities. This uses the formula in Edgar et al. (2015), see Section 2.2 in . qualities -- ASCII-encoded qualities (chr(qual + base)) """ if not PyUnicode_IS_COMPACT_ASCII(qualities): raise ValueError(f"Quality string contains non-ASCII values: {qualities}") cdef: uint8_t *quals = PyUnicode_DATA(qualities) size_t qual_length = PyUnicode_GET_LENGTH(qualities) double e = expected_errors_from_phreds(quals, qual_length, base) if e < 0.0: for q in qualities: if ord(q) < base or ord(q) > 126: raise ValueError(f"Not a valid phred value {ord(q)} for character {q}") return e cutadapt-4.7/src/cutadapt/report.py000066400000000000000000000776241457457704700175160ustar00rootroot00000000000000""" Routines for printing a report. """ from dataclasses import dataclass from io import StringIO import textwrap from collections import defaultdict, Counter from typing import Any, Optional, List, Dict, Iterator, Tuple, Mapping from .adapters import ( EndStatistics, AdapterStatistics, FrontAdapter, BackAdapter, AnywhereAdapter, LinkedAdapter, SingleAdapter, LinkedAdapterStatistics, FrontAdapterStatistics, BackAdapterStatistics, AnywhereAdapterStatistics, ) from .json import OneLine from .modifiers import ( QualityTrimmer, NextseqQualityTrimmer, AdapterCutter, PairedAdapterCutter, ReverseComplementer, PairedEndModifierWrapper, PolyATrimmer, ) from .statistics import ReadLengthStatistics from .steps import HasStatistics, HasFilterStatistics from .utils import MICRO FILTERS = { "too_short": "that were too short", "too_long": "that were too long", "too_many_n": "with too many N", "too_many_expected_errors": "with too many exp. 
errors", "casava_filtered": "failed CASAVA filter", "discard_trimmed": "discarded as trimmed", "discard_untrimmed": "discarded as untrimmed", } def safe_divide(numerator: Optional[int], denominator: int) -> float: if numerator is None or not denominator: return 0.0 else: return numerator / denominator def add_if_not_none(a: Optional[int], b: Optional[int]) -> Optional[int]: if a is None: return b if b is None: return a return a + b class Statistics: def __init__(self) -> None: """ """ self.paired: Optional[bool] = None # Map a filter name to the number of filtered reads/read pairs self.filtered: Dict[str, int] = defaultdict(int) self.reverse_complemented: Optional[int] = None self.n = 0 self.total_bp = [0, 0] self.read_length_statistics = ReadLengthStatistics() self.with_adapters: List[Optional[int]] = [None, None] self.quality_trimmed_bp: List[Optional[int]] = [None, None] self.poly_a_trimmed_lengths: List[Optional[defaultdict[int, int]]] = [ None, None, ] self.adapter_stats: List[List[AdapterStatistics]] = [[], []] self._collected: bool = False def __iadd__(self, other: Any): if not isinstance(other, Statistics): raise ValueError(f"Cannot add {other.__type__.__name__}") self.n += other.n self.read_length_statistics += other.read_length_statistics if self.paired is None: self.paired = other.paired elif self.paired != other.paired: raise ValueError("Incompatible Statistics: paired is not equal") self.reverse_complemented = add_if_not_none( self.reverse_complemented, other.reverse_complemented ) for filter_name, count in other.filtered.items(): self.filtered[filter_name] += count for i in (0, 1): self.total_bp[i] += other.total_bp[i] self.with_adapters[i] = add_if_not_none( self.with_adapters[i], other.with_adapters[i] ) self.quality_trimmed_bp[i] = add_if_not_none( self.quality_trimmed_bp[i], other.quality_trimmed_bp[i] ) if self.poly_a_trimmed_lengths[i] is None: self.poly_a_trimmed_lengths[i] = other.poly_a_trimmed_lengths[i] elif other.poly_a_trimmed_lengths[i] is not None: self.poly_a_trimmed_lengths[i] = defaultdict( int, Counter(self.poly_a_trimmed_lengths[i]) + Counter(other.poly_a_trimmed_lengths[i]), ) if self.adapter_stats[i] and other.adapter_stats[i]: if len(self.adapter_stats[i]) != len(other.adapter_stats[i]): raise ValueError( "Incompatible Statistics objects (adapter_stats length)" ) for j in range(len(self.adapter_stats[i])): self.adapter_stats[i][j] += other.adapter_stats[i][j] elif other.adapter_stats[i]: assert self.adapter_stats[i] == [] self.adapter_stats[i] = other.adapter_stats[i] return self def collect( self, n: int, total_bp1: int, total_bp2: Optional[int], modifiers, steps ): """ n -- total number of reads total_bp1 -- number of bases in first reads total_bp2 -- number of bases in second reads. None for single-end data. 
""" if self._collected: raise ValueError("Cannot call Statistics.collect more than once") self.n = n self.total_bp[0] = total_bp1 if total_bp2 is None: self.paired = False else: self.paired = True self.total_bp[1] = total_bp2 for step in steps: self._collect_step(step) for modifier in modifiers: self._collect_modifier(modifier) self._collected = True # For chaining return self def _collect_step(self, step) -> None: if isinstance(step, HasStatistics): self.read_length_statistics += step.get_statistics() if isinstance(step, HasFilterStatistics): name = step.descriptive_identifier() self.filtered[name] = step.filtered() def _collect_modifier(self, m) -> None: if isinstance(m, PairedAdapterCutter): for i in 0, 1: self.with_adapters[i] = m.with_adapters self.adapter_stats[i] = list(m.adapter_statistics[i].values()) return if isinstance(m, PairedEndModifierWrapper): modifiers_list = [(0, m._modifier1), (1, m._modifier2)] else: modifiers_list = [(0, m)] for i, modifier in modifiers_list: if isinstance(modifier, (QualityTrimmer, NextseqQualityTrimmer)): self.quality_trimmed_bp[i] = add_if_not_none( self.quality_trimmed_bp[i], modifier.trimmed_bases ) if isinstance(modifier, PolyATrimmer): self.poly_a_trimmed_lengths[i] = modifier.trimmed_bases elif isinstance(modifier, AdapterCutter): assert self.with_adapters[i] is None self.with_adapters[i] = modifier.with_adapters self.adapter_stats[i] = list(modifier.adapter_statistics.values()) elif isinstance(modifier, ReverseComplementer): assert self.with_adapters[i] is None self.with_adapters[i] = modifier.adapter_cutter.with_adapters self.adapter_stats[i] = list( modifier.adapter_cutter.adapter_statistics.values() ) self.reverse_complemented = modifier.reverse_complemented def as_json(self, gc_content: float = 0.5, one_line: bool = False) -> Dict: """ Return a dict representation suitable for dumping in JSON format To achieve a more compact representation, set one_line to True, which will wrap some items in a `cutadapt.json.OneLine` object, and use `cutadapt.json.dumps` instead of `json.dumps` to dump the dict. 
""" filtered = {name: self.filtered.get(name) for name in FILTERS.keys()} filtered_total = sum(self.filtered.values()) written_reads = self.read_length_statistics.written_reads() written_bp = self.read_length_statistics.written_bp() assert written_reads + filtered_total == self.n return { "read_counts": { # pairs or reads "input": self.n, "filtered": filtered, "output": self.read_length_statistics.written_reads(), "reverse_complemented": self.reverse_complemented, "read1_with_adapter": self.with_adapters[0], "read2_with_adapter": self.with_adapters[1] if self.paired else None, }, "basepair_counts": { "input": self.total, "input_read1": self.total_bp[0], "input_read2": self.total_bp[1] if self.paired else None, "quality_trimmed": self.quality_trimmed, "quality_trimmed_read1": self.quality_trimmed_bp[0], "quality_trimmed_read2": self.quality_trimmed_bp[1], "poly_a_trimmed": self.poly_a_trimmed, "poly_a_trimmed_read1": self.poly_a_trimmed_bp[0], "poly_a_trimmed_read2": self.poly_a_trimmed_bp[1], "output": self.total_written_bp, "output_read1": written_bp[0], "output_read2": written_bp[1] if self.paired else None, }, "adapters_read1": [ self._adapter_statistics_as_json( astats, self.n, gc_content, one_line=one_line ) for astats in self.adapter_stats[0] ], "adapters_read2": [ self._adapter_statistics_as_json( astats, self.n, gc_content, one_line=one_line ) for astats in self.adapter_stats[1] ] if self.paired else None, "poly_a_trimmed_read1": self._poly_a_trimmed_as_json( self.poly_a_trimmed_lengths[0] ), "poly_a_trimmed_read2": self._poly_a_trimmed_as_json( self.poly_a_trimmed_lengths[1] ), } def _adapter_statistics_as_json( self, adapter_statistics: AdapterStatistics, n: int, gc_content: float, one_line: bool = False, ): adapter = adapter_statistics.adapter ends: List[Optional[Dict[str, Any]]] = [] total_trimmed_reads = 0 make_line = OneLine if one_line else list for end_statistics in adapter_statistics.end_statistics(): if end_statistics is None: ends.append(None) continue total = sum(end_statistics.lengths.values()) if end_statistics.allows_partial_matches: eranges = ErrorRanges( length=end_statistics.effective_length, error_rate=end_statistics.max_error_rate, ).lengths() else: eranges = None base_stats = AdjacentBaseStatistics(end_statistics.adjacent_bases) trimmed_lengths = [ make_line( { "len": row.length, "expect": round(row.expect, 1), "counts": row.error_counts, } ) for row in histogram_rows(end_statistics, n, gc_content) ] ends.append( { "type": end_statistics.adapter_type, "sequence": end_statistics.sequence, "error_rate": end_statistics.max_error_rate, "indels": end_statistics.indels, "error_lengths": make_line(eranges), "matches": total, "adjacent_bases": base_stats.as_json(), "dominant_adjacent_base": base_stats.warnbase, "trimmed_lengths": trimmed_lengths, } ) total_trimmed_reads += total on_reverse_complement = ( adapter_statistics.reverse_complemented if self.reverse_complemented else None ) return { "name": adapter_statistics.name, "total_matches": total_trimmed_reads, "on_reverse_complement": on_reverse_complement, "linked": isinstance(adapter, LinkedAdapter), "five_prime_end": ends[0], "three_prime_end": ends[1], } @staticmethod def _poly_a_trimmed_as_json(poly_a): if poly_a is None: return None return [ OneLine({"len": length, "count": poly_a[length]}) for length in sorted(poly_a) ] @property def total(self) -> int: return sum(self.total_bp) @property def quality_trimmed(self) -> Optional[int]: return add_if_not_none(*self.quality_trimmed_bp) @property def 
poly_a_trimmed_bp(self) -> Tuple[Optional[int], Optional[int]]: def trimmed(i: int) -> Optional[int]: lengths = self.poly_a_trimmed_lengths[i] if lengths is None: return None return sum(length * count for length, count in lengths.items()) return (trimmed(0), trimmed(1)) @property def poly_a_trimmed(self) -> Optional[int]: return add_if_not_none(*self.poly_a_trimmed_bp) @property def total_written_bp(self) -> int: return sum(self.read_length_statistics.written_bp()) @property def written(self) -> int: return self.read_length_statistics.written_reads() @property def written_fraction(self) -> float: return safe_divide(self.read_length_statistics.written_reads(), self.n) @property def with_adapters_fraction(self) -> List[float]: return [safe_divide(v, self.n) for v in self.with_adapters] @property def quality_trimmed_fraction(self) -> float: return safe_divide(self.quality_trimmed, self.total) @property def written_bp(self) -> Tuple[int, int]: return self.read_length_statistics.written_bp() @property def total_written_bp_fraction(self) -> float: return safe_divide(self.total_written_bp, self.total) @property def reverse_complemented_fraction(self) -> float: return safe_divide(self.reverse_complemented, self.n) def filtered_fraction(self, filter_name: str) -> float: return safe_divide(self.filtered.get(filter_name), self.n) @property def poly_a_trimmed_fraction(self) -> float: return safe_divide(self.poly_a_trimmed, self.total) class ErrorRanges: """ Representation of the lengths up to which a number of errors is allowed for partial adapter matches. >>> ErrorRanges(length=8, error_rate=0.1).lengths() [8] >>> ErrorRanges(length=19, error_rate=0.1).lengths() [9, 19] >>> ErrorRanges(length=20, error_rate=0.1).lengths() [9, 19, 20] >>> ErrorRanges(length=21, error_rate=0.1).lengths() [9, 19, 21] The entry at index i in the returned list is the length up to which i errors are allowed. For example, the list [9, 19, 23] describes that - 0 errors are allowed up to length 9 - 1 error is allowed up to length 19 - 2 errors are allowed up to length 23 The last number in the list is always the length of the adapter sequence. """ def __init__(self, length: int, error_rate: float): self.length = length self.error_rate = error_rate self._lengths = self._compute_lengths() def _compute_lengths(self) -> List[int]: lengths = [ int(errors / self.error_rate) - 1 for errors in range(1, int(self.error_rate * self.length) + 1) ] if not lengths or lengths[-1] < self.length: lengths.append(self.length) return lengths def __repr__(self): return ( "ErrorRanges(" f"length={self.length}, error_rate={self.error_rate}, _lengths={self._lengths})" ) def __str__(self): """ >>> str(ErrorRanges(length=8, error_rate=0.1)) '1-8 bp: 0' >>> str(ErrorRanges(length=20, error_rate=0.1)) '1-9 bp: 0; 10-19 bp: 1; 20 bp: 2' >>> str(ErrorRanges(length=23, error_rate=0.1)) '1-9 bp: 0; 10-19 bp: 1; 20-23 bp: 2' """ prev = 1 s = "" for errors, r in enumerate(self._lengths[:-1]): s += f"{prev}-{r} bp: {errors}; " prev = r + 1 if prev == self._lengths[-1]: s += f"{prev} bp: {len(self._lengths) - 1}" else: s += f"{prev}-{self._lengths[-1]} bp: {len(self._lengths) - 1}" return s def lengths(self): return self._lengths def error_ranges(end_statistics: EndStatistics) -> str: length = end_statistics.effective_length error_rate = end_statistics.max_error_rate if end_statistics.allows_partial_matches: s = "\n" + str(ErrorRanges(length, error_rate)) else: s = f" {int(error_rate * length)}" return "No. 
of allowed errors:" + s + "\n" def histogram(end_statistics: EndStatistics, n: int, gc_content: float) -> str: """ Return a formatted histogram. Include the no. of reads expected to be trimmed by chance (assuming a uniform distribution of nucleotides in the reads). adapter_statistics -- EndStatistics object adapter_length -- adapter length n -- total no. of reads. """ sio = StringIO() print("length", "count", "expect", "max.err", "error counts", sep="\t", file=sio) for row in histogram_rows(end_statistics, n, gc_content): print( row.length, row.count, f"{row.expect:.1F}", row.max_err, " ".join(str(e) for e in row.error_counts), sep="\t", file=sio, ) return sio.getvalue() + "\n" @dataclass class HistogramRow: """One row in the "trimmed lengths" histogram""" length: int count: int expect: float max_err: int error_counts: List[int] def histogram_rows( end_statistics: EndStatistics, n: int, gc_content: float, ) -> Iterator[HistogramRow]: """ Yield histogram rows Include the no. of reads expected to be trimmed by chance (assuming a uniform distribution of nucleotides in the reads). n -- total no. of reads. """ d = end_statistics.lengths errors = end_statistics.errors match_probabilities = end_statistics.random_match_probabilities( gc_content=gc_content ) for length in sorted(d): # when length surpasses adapter_length, the # probability does not increase anymore expect = n * match_probabilities[min(len(end_statistics.sequence), length)] count = d[length] max_errors = max(errors[length].keys()) error_counts = [errors[length][e] for e in range(max_errors + 1)] row = HistogramRow( length=length, count=count, expect=expect, max_err=int( end_statistics.max_error_rate * min(length, end_statistics.effective_length) ), error_counts=error_counts, ) yield row class AdjacentBaseStatistics: def __init__(self, bases: Dict[str, int]): """ """ self.bases: Dict[str, int] = bases self._warnbase: Optional[str] = None total = sum(self.bases.values()) if total == 0: self._fractions = None else: self._fractions = [] for base in ["A", "C", "G", "T", ""]: text = base if base != "" else "none/other" fraction = 1.0 * self.bases[base] / total self._fractions.append((text, 1.0 * self.bases[base] / total)) if fraction > 0.8 and base != "": self._warnbase = text if total < 20: self._warnbase = None def __repr__(self): return f"AdjacentBaseStatistics(bases={self.bases})" @property def should_warn(self) -> bool: return self._warnbase is not None @property def warnbase(self) -> Optional[str]: return self._warnbase def __str__(self) -> str: if not self._fractions: return "" sio = StringIO() print("Bases preceding removed adapters:", file=sio) for text, fraction in self._fractions: print(f" {text}: {fraction:.1%}", file=sio) if self.should_warn: print("WARNING:", file=sio) print( f" The adapter is preceded by '{self._warnbase}' extremely often.", file=sio, ) print( " The provided adapter sequence could be incomplete at its 5' end.", file=sio, ) print(" Ignore this warning when trimming primers.", file=sio) return sio.getvalue() def as_json(self) -> Optional[Dict[str, int]]: if self._fractions: return {b: self.bases.get(b, 0) for b in ["A", "C", "G", "T", ""]} else: return None def full_report(stats: Statistics, time: float, gc_content: float) -> str: # noqa: C901 """Print report to standard output.""" if stats.n == 0: return "No reads processed!" 
if time == 0: time = 1e-6 sio = StringIO() def print_s(*args, **kwargs): kwargs["file"] = sio print(*args, **kwargs) print_s( "Finished in {:.3F} s ({:.3F} {}s/read; {:.2F} M reads/minute).".format( time, 1e6 * time / stats.n, MICRO, stats.n / time * 60 / 1e6 ) ) report = "\n=== Summary ===\n\n" if stats.paired: report += f"Total read pairs processed: {stats.n:13,d}\n" for i in (0, 1): if stats.with_adapters[i] is not None: report += ( f" Read {i+1} with adapter: " f"{stats.with_adapters[i]:13,d} ({stats.with_adapters_fraction[i]:.1%})\n" ) else: report += f"Total reads processed: {stats.n:13,d}\n" if stats.with_adapters[0] is not None: report += ( f"Reads with adapters: " f"{stats.with_adapters[0]:13,d} ({stats.with_adapters_fraction[0]:.1%})\n" ) if stats.reverse_complemented is not None: report += ( "Reverse-complemented: " "{o.reverse_complemented:13,d} ({o.reverse_complemented_fraction:.1%})\n" ) filter_report = format_filter_report(stats) if filter_report: report += "\n== Read fate breakdown ==\n" report += filter_report report += textwrap.dedent( """\ {pairs_or_reads} written (passing filters): {o.written:13,d} ({o.written_fraction:.1%}) Total basepairs processed: {o.total:13,d} bp """ ) if stats.paired: report += " Read 1: {o.total_bp[0]:13,d} bp\n" report += " Read 2: {o.total_bp[1]:13,d} bp\n" if stats.quality_trimmed is not None: report += ( "Quality-trimmed: " f"{stats.quality_trimmed:13,d} bp ({stats.quality_trimmed_fraction:.1%})\n" ) if stats.paired: for i in (0, 1): if stats.quality_trimmed_bp[i] is not None: report += f" Read {i + 1}: {stats.quality_trimmed_bp[i]:13,d} bp\n" if stats.poly_a_trimmed is not None: report += ( "Poly-A-trimmed: " f"{stats.poly_a_trimmed:13,d} bp ({stats.poly_a_trimmed_fraction:.1%})\n" ) if stats.paired: for i in (0, 1): if stats.poly_a_trimmed_bp[i] is not None: report += f" Read {i + 1}: {stats.poly_a_trimmed_bp[i]:13,d} bp\n" report += ( "Total written (filtered): " "{o.total_written_bp:13,d} bp ({o.total_written_bp_fraction:.1%})\n" ) if stats.paired: report += " Read 1: {o.written_bp[0]:13,d} bp\n" report += " Read 2: {o.written_bp[1]:13,d} bp\n" pairs_or_reads = "Pairs" if stats.paired else "Reads" report = report.format(o=stats, pairs_or_reads=pairs_or_reads) print_s(report) warning = False for which_in_pair in (0, 1): for adapter_statistics in stats.adapter_stats[which_in_pair]: end_statistics = adapter_statistics.end_statistics() if end_statistics[0] is not None: total_front = sum(end_statistics[0].lengths.values()) else: total_front = 0 if end_statistics[1] is not None: total_back = sum(end_statistics[1].lengths.values()) else: total_back = 0 total = total_front + total_back reverse_complemented = adapter_statistics.reverse_complemented adapter = adapter_statistics.adapter if isinstance(adapter, BackAdapter): assert total_front == 0 if isinstance(adapter, FrontAdapter): assert total_back == 0 if stats.paired: extra = "First read: " if which_in_pair == 0 else "Second read: " else: extra = "" print_s("=" * 3, extra + "Adapter", adapter_statistics.name, "=" * 3) print_s() if isinstance(adapter_statistics, LinkedAdapterStatistics): print_s( "Sequence: {}...{}; Type: linked; Length: {}+{}; " "5' trimmed: {} times; 3' trimmed: {} times".format( adapter_statistics.front.sequence, adapter_statistics.back.sequence, len(adapter_statistics.front.sequence), len(adapter_statistics.back.sequence), total_front, total_back, ), end="", ) else: assert isinstance(adapter, (SingleAdapter, AnywhereAdapter)) print_s( "Sequence: {}; Type: {}; Length: {}; 
Trimmed: {} times".format( adapter.sequence, adapter.description, len(adapter.sequence), total, ), end="", ) if stats.reverse_complemented is not None: print_s(f"; Reverse-complemented: {reverse_complemented} times") else: print_s() if total == 0: print_s() continue if isinstance(adapter_statistics, AnywhereAdapterStatistics): assert isinstance(adapter, AnywhereAdapter) print_s(total_front, "times, it overlapped the 5' end of a read") print_s( total_back, "times, it overlapped the 3' end or was within the read" ) print_s() print_s("Minimum overlap:", adapter.min_overlap) print_s(error_ranges(adapter_statistics.front)) print_s("Overview of removed sequences (5')") print_s(histogram(adapter_statistics.front, stats.n, gc_content)) print_s() print_s("Overview of removed sequences (3' or within)") print_s(histogram(adapter_statistics.back, stats.n, gc_content)) elif isinstance(adapter_statistics, LinkedAdapterStatistics): assert isinstance(adapter, LinkedAdapter) print_s() print_s( f"Minimum overlap: " f"{adapter.front_adapter.min_overlap}+{adapter.back_adapter.min_overlap}" ) print_s(error_ranges(adapter_statistics.front)) print_s(error_ranges(adapter_statistics.back)) print_s("Overview of removed sequences at 5' end") print_s(histogram(adapter_statistics.front, stats.n, gc_content)) print_s() print_s("Overview of removed sequences at 3' end") print_s(histogram(adapter_statistics.back, stats.n, gc_content)) elif isinstance(adapter_statistics, FrontAdapterStatistics): assert isinstance(adapter, FrontAdapter) print_s() if adapter.allows_partial_matches: print_s("Minimum overlap:", adapter.min_overlap) print_s(error_ranges(adapter_statistics.end)) print_s("Overview of removed sequences") print_s(histogram(adapter_statistics.end, stats.n, gc_content)) else: assert isinstance(adapter_statistics, BackAdapterStatistics) assert isinstance(adapter, BackAdapter) print_s() if adapter.allows_partial_matches: print_s("Minimum overlap:", adapter.min_overlap) print_s(error_ranges(adapter_statistics.end)) base_stats = AdjacentBaseStatistics( adapter_statistics.end.adjacent_bases ) warning = warning or base_stats.should_warn print_s(base_stats) print_s("Overview of removed sequences") print_s(histogram(adapter_statistics.end, stats.n, gc_content)) poly_a = stats.poly_a_trimmed_lengths[which_in_pair] if poly_a is not None: print_s(poly_a_report(poly_a, which_in_pair if stats.paired else None)) if warning: print_s("WARNING:") print_s(" One or more of your adapter sequences may be incomplete.") print_s(" Please see the detailed output above.") return sio.getvalue().rstrip() def poly_a_report(poly_a: Mapping[int, int], which_in_pair: Optional[int]) -> str: sio = StringIO() if which_in_pair is None: title = "Poly-A" elif which_in_pair == 0: title = "R1 poly-A" else: assert which_in_pair == 1 title = "R2 poly-A" print(f"=== {title} trimmed ===", file=sio) print(file=sio) print("length", "count", sep="\t", file=sio) for length in sorted(poly_a): count = poly_a[length] print(length, count, sep="\t", file=sio) return sio.getvalue() + "\n" def format_filter_report(stats): report = "" for name, description in FILTERS.items(): if name not in stats.filtered: continue value = stats.filtered[name] fraction = stats.filtered_fraction(name) line = ( "{pairs_or_reads} " + (description + ":").ljust(27) + f"{value:13,d} ({fraction:.1%})\n" ) report += line return report def minimal_report(stats: Statistics, time: float, gc_content: float) -> str: """Create a minimal tabular report suitable for concatenation""" _ = time _ = 
gc_content fields = [ "OK", stats.n, # reads/pairs in stats.total, # bases in stats.filtered.get("too_short", 0), # reads/pairs stats.filtered.get("too_long", 0), # reads/pairs stats.filtered.get("too_many_n", 0), # reads/pairs stats.read_length_statistics.written_reads(), # reads/pairs out stats.with_adapters[0] if stats.with_adapters[0] is not None else 0, # reads stats.quality_trimmed_bp[0] if stats.quality_trimmed_bp[0] is not None else 0, # bases stats.read_length_statistics.written_bp()[0], # bases out ] if stats.paired: fields += [ stats.with_adapters[1] if stats.with_adapters[1] is not None else 0, # reads/pairs stats.quality_trimmed_bp[1] if stats.quality_trimmed_bp[1] is not None else 0, # bases stats.read_length_statistics.written_bp()[1], # bases ] warning = False for which_in_pair in (0, 1): for adapter_statistics in stats.adapter_stats[which_in_pair]: if isinstance(adapter_statistics, BackAdapterStatistics): if AdjacentBaseStatistics( adapter_statistics.end.adjacent_bases ).should_warn: warning = True break if warning: fields[0] = "WARN" header = [ "status", "in_reads", "in_bp", "too_short", "too_long", "too_many_n", "out_reads", "w/adapters", "qualtrim_bp", "out_bp", ] if stats.paired: header += ["w/adapters2", "qualtrim2_bp", "out2_bp"] return "\t".join(header) + "\n" + "\t".join(str(x) for x in fields) cutadapt-4.7/src/cutadapt/runners.py000066400000000000000000000366171457457704700176740ustar00rootroot00000000000000import io import logging import multiprocessing import os import sys import traceback from abc import ABC, abstractmethod from contextlib import ExitStack from multiprocessing.connection import Connection from typing import Any, List, Optional, Tuple, Sequence, Iterator, TYPE_CHECKING import dnaio from cutadapt.files import ( InputFiles, OutputFiles, InputPaths, xopen_rb_raise_limit, detect_file_format, FileFormat, ProxyWriter, ) from cutadapt.pipeline import Pipeline from cutadapt.report import Statistics from cutadapt.utils import Progress logger = logging.getLogger() mpctx = multiprocessing.get_context("spawn") # See https://github.com/python/typeshed/issues/9860 if TYPE_CHECKING: mpctx_Process = multiprocessing.Process else: mpctx_Process = mpctx.Process class ReaderProcess(mpctx_Process): """ Read chunks of FASTA or FASTQ data (single-end or paired) and send them to a worker. The reader repeatedly - reads a chunk from the file(s) - reads a worker index from the Queue - sends the chunk to connections[index] and finally sends the stop token -1 ("poison pills") to all connections. """ def __init__( self, *paths: str, file_format_connection: Connection, connections: Sequence[Connection], queue: multiprocessing.Queue, buffer_size: int, stdin_fd, ): """ Args: paths: path to input files connections: a list of Connection objects, one for each worker. queue: a Queue of worker indices. A worker writes its own index into this queue to notify the reader that it is ready to receive more data. buffer_size: stdin_fd: Note: This expects the paths to the input files as strings because these can be pickled while file-like objects such as BufferedReader cannot. When using multiprocessing with the "spawn" method, which is the default method on macOS, function arguments must be picklable. 
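A sketch of the readiness handshake described above, reduced to a single worker and built from the standard library only (illustrative; these are not the actual cutadapt classes):

    import multiprocessing as mp

    def worker(index, queue, conn):
        while True:
            queue.put(index)       # announce readiness to the reader
            chunk = conn.recv()
            if chunk == -1:        # poison pill: no more data
                break

    if __name__ == "__main__":
        queue = mp.Queue()
        recv_conn, send_conn = mp.Pipe(duplex=False)
        p = mp.Process(target=worker, args=(0, queue, recv_conn))
        p.start()
        for chunk in [b"chunk 1", b"chunk 2"]:
            ready = queue.get()    # index of a worker that wants data
            send_conn.send(chunk)  # with one worker, ready is always 0
        queue.get()
        send_conn.send(-1)         # shut the worker down
        p.join()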
""" super().__init__() if len(paths) > 2: raise ValueError("Reading from more than two files currently not supported") if not paths: raise ValueError("Must provide at least one file") self._paths = paths self._file_format_connection = file_format_connection self.connections = connections self.queue = queue self.buffer_size = buffer_size self.stdin_fd = stdin_fd def run(self): if self.stdin_fd != -1: sys.stdin.close() sys.stdin = os.fdopen(self.stdin_fd) try: with ExitStack() as stack: files = [ stack.enter_context(xopen_rb_raise_limit(path)) for path in self._paths ] file_format = detect_file_format(files[0]) self._file_format_connection.send(file_format) if file_format is not None: for index, chunks in enumerate(self._read_chunks(*files)): self.send_to_worker(index, *chunks) self.shutdown() except Exception as e: # TODO better send this to a common "something went wrong" Queue for connection in self.connections: connection.send(-2) connection.send((e, traceback.format_exc())) def _read_chunks(self, *files) -> Iterator[Tuple[memoryview, ...]]: if len(files) == 1: for chunk in dnaio.read_chunks(files[0], self.buffer_size): yield (chunk,) elif len(files) == 2: for chunks in dnaio.read_paired_chunks( files[0], files[1], self.buffer_size ): yield chunks else: raise NotImplementedError def send_to_worker(self, chunk_index, chunk1, chunk2=None): worker_index = self.queue.get() connection = self.connections[worker_index] connection.send(chunk_index) connection.send_bytes(chunk1) if chunk2 is not None: connection.send_bytes(chunk2) def shutdown(self): # Send poison pills to all workers for _ in range(len(self.connections)): worker_index = self.queue.get() self.connections[worker_index].send(-1) class WorkerProcess(mpctx_Process): """ The worker repeatedly reads chunks of data from the read_pipe, runs the pipeline on it and sends the processed chunks to the write_pipe. To notify the reader process that it wants data, it puts its own identifier into the need_work_queue before attempting to read data from the read_pipe. 
""" def __init__( self, id_: int, pipeline: Pipeline, inpaths: InputPaths, proxy_files: List[ProxyWriter], read_pipe: Connection, write_pipe: Connection, need_work_queue: multiprocessing.Queue, ): super().__init__() self._id = id_ self._pipeline = pipeline self._n_input_files = len(inpaths.paths) self._interleaved_input = inpaths.interleaved self._read_pipe = read_pipe self._write_pipe = write_pipe self._need_work_queue = need_work_queue self._proxy_files = proxy_files def run(self): try: stats = Statistics() while True: # Notify reader that we need data self._need_work_queue.put(self._id) chunk_index = self._read_pipe.recv() if chunk_index == -1: # reader is done break elif chunk_index == -2: # An exception has occurred in the reader e, tb_str = self._read_pipe.recv() logger.error("%s", tb_str) raise e files = [ io.BytesIO(self._read_pipe.recv_bytes()) for _ in range(self._n_input_files) ] infiles = InputFiles(*files, interleaved=self._interleaved_input) (n, bp1, bp2) = self._pipeline.process_reads(infiles) stats += Statistics().collect(n, bp1, bp2, [], []) self._send_outfiles(chunk_index, n) stats += Statistics().collect( 0, 0, 0 if self._pipeline.paired else None, self._pipeline._modifiers, self._pipeline._steps, ) self._write_pipe.send(-1) self._write_pipe.send(stats) except Exception as e: self._write_pipe.send(-2) self._write_pipe.send((e, traceback.format_exc())) def _send_outfiles(self, chunk_index: int, n_reads: int): self._write_pipe.send(chunk_index) self._write_pipe.send(n_reads) for pf in self._proxy_files: for chunk in pf.drain(): self._write_pipe.send_bytes(chunk) class OrderedChunkWriter: """ We may receive chunks of processed data from worker processes in any order. This class writes them to an output file in the correct order. """ def __init__(self, outfile): self._chunks = dict() self._current_index = 0 self._outfile = outfile def write(self, data: bytes, index: int): """ """ self._chunks[index] = data while self._current_index in self._chunks: self._outfile.write(self._chunks[self._current_index]) del self._chunks[self._current_index] self._current_index += 1 def wrote_everything(self): return not self._chunks class PipelineRunner(ABC): """ A read processing pipeline """ @abstractmethod def run(self, pipeline, progress: Progress, outfiles: OutputFiles) -> Statistics: """ progress: Use an object that supports .update() and .close() such as DummyProgress, cutadapt.utils.Progress or a tqdm instance """ @abstractmethod def close(self): pass @abstractmethod def input_file_format(self) -> FileFormat: pass def __enter__(self): return self def __exit__(self, *args): self.close() class ParallelPipelineRunner(PipelineRunner): """ Run a Pipeline in parallel - At construction, a reader process is spawned. - When run() is called, as many worker processes as requested are spawned. - In the main process, results are written to the output files in the correct order, and statistics are aggregated. If a worker needs work, it puts its own index into a Queue() (_need_work_queue). The reader process listens on this queue and sends the raw data to the worker that has requested work. For sending the data from reader to worker, a Connection() is used. There is one such connection for each worker (self._pipes). For sending the processed data from the worker to the main process, there is a second set of connections, again one for each worker. When the reader is finished, it sends 'poison pills' to all workers. 
When a worker receives this, it sends a poison pill to the main process, followed by a Statistics object that contains statistics about all the reads processed by that worker. """ def __init__( self, inpaths: InputPaths, n_workers: int, buffer_size: Optional[int] = None, ): self._n_workers = n_workers self._need_work_queue: multiprocessing.Queue = mpctx.Queue() self._buffer_size = 4 * 1024**2 if buffer_size is None else buffer_size self._inpaths = inpaths # the workers read from these connections connections = [mpctx.Pipe(duplex=False) for _ in range(self._n_workers)] self._connections, connw = zip(*connections) try: fileno = sys.stdin.fileno() except io.UnsupportedOperation: # This happens during tests: pytest sets sys.stdin to an object # that does not have a file descriptor. fileno = -1 file_format_connection_r, file_format_connection_w = mpctx.Pipe(duplex=False) self._reader_process = ReaderProcess( *inpaths.paths, file_format_connection=file_format_connection_w, connections=connw, queue=self._need_work_queue, buffer_size=self._buffer_size, stdin_fd=fileno, ) self._reader_process.daemon = True self._reader_process.start() file_format: Optional[FileFormat] = file_format_connection_r.recv() if file_format is None: raise dnaio.exceptions.UnknownFileFormat( f"Format of input file '{self._inpaths.paths[0]}' not recognized." ) self._input_file_format = file_format def _start_workers( self, pipeline, proxy_files ) -> Tuple[List[WorkerProcess], List[Connection]]: workers = [] connections = [] for index in range(self._n_workers): conn_r, conn_w = mpctx.Pipe(duplex=False) connections.append(conn_r) worker = WorkerProcess( index, pipeline, self._inpaths, proxy_files, self._connections[index], conn_w, self._need_work_queue, ) worker.daemon = True worker.start() workers.append(worker) return workers, connections def run(self, pipeline, progress, outfiles: OutputFiles) -> Statistics: workers, connections = self._start_workers(pipeline, outfiles.proxy_files()) chunk_writers = [] for f in outfiles.binary_files(): chunk_writers.append(OrderedChunkWriter(f)) stats = Statistics() while connections: ready_connections: List[Any] = multiprocessing.connection.wait(connections) for connection in ready_connections: chunk_index: int = self._try_receive(connection) if chunk_index == -1: # the worker is done cur_stats = self._try_receive(connection) stats += cur_stats connections.remove(connection) continue number_of_reads: int = self._try_receive(connection) progress.update(number_of_reads) for writer in chunk_writers: data = connection.recv_bytes() writer.write(data, chunk_index) for writer in chunk_writers: assert writer.wrote_everything() for w in workers: w.join() self._reader_process.join() progress.close() return stats @staticmethod def _try_receive(connection): """ Try to receive data over `self.connection` and return it. If an exception was received, raise it. """ result = connection.recv() if result == -2: # An exception has occurred on the other end e, tb_str = connection.recv() # The other end does not send an actual traceback object because these are # not picklable, but a string representation. 
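# Log the remote traceback, then terminate the remaining children before re-raising so the main process does not hang on join().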
logger.debug("%s", tb_str) for child in multiprocessing.active_children(): child.terminate() raise e return result def close(self) -> None: pass def input_file_format(self) -> FileFormat: return self._input_file_format class SerialPipelineRunner(PipelineRunner): """ Run a Pipeline on a single core """ def __init__( self, infiles: InputFiles, ): self._infiles = infiles self._input_file_format = infiles def run( self, pipeline: Pipeline, progress: Progress, outfiles: OutputFiles ) -> Statistics: (n, total1_bp, total2_bp) = pipeline.process_reads( self._infiles, progress=progress ) if progress is not None: progress.close() # TODO modifiers = getattr(pipeline, "_modifiers", None) assert modifiers is not None return Statistics().collect(n, total1_bp, total2_bp, modifiers, pipeline._steps) # type: ignore[attr-defined] def close(self): self._infiles.close() def input_file_format(self) -> FileFormat: detected = detect_file_format(self._infiles._files[0]) if detected is None: raise dnaio.exceptions.UnknownFileFormat( f"Format of input file '{self._infiles._files[0].name}' not recognized." ) return detected def make_runner( inpaths: InputPaths, cores: int, buffer_size: Optional[int] = None, ) -> PipelineRunner: """ Run a pipeline. This uses a SerialPipelineRunner if cores is 1 and a ParallelPipelineRunner otherwise. Args: inpaths: cores: number of cores to run the pipeline on (this is actually the number of worker processes, there will be one extra process for reading the input file(s)) buffer_size: Forwarded to `ParallelPipelineRunner()`. Ignored if cores is 1. """ runner: PipelineRunner if cores > 1: runner = ParallelPipelineRunner( inpaths, n_workers=cores, buffer_size=buffer_size, ) else: runner = SerialPipelineRunner(inpaths.open()) return runner cutadapt-4.7/src/cutadapt/statistics.py000066400000000000000000000033531457457704700203610ustar00rootroot00000000000000from collections import defaultdict, Counter from typing import DefaultDict, Tuple class ReadLengthStatistics: """ Keep track of the lengths of written reads or read pairs """ def __init__(self) -> None: # It would be more natural to use a Counter, but a # defaultdict is much faster self._written_lengths1: DefaultDict[int, int] = defaultdict(int) self._written_lengths2: DefaultDict[int, int] = defaultdict(int) def update(self, read) -> None: """Add a single-end read to the statistics""" self._written_lengths1[len(read)] += 1 def update2(self, read1, read2) -> None: """Add a paired-end read to the statistics""" self._written_lengths1[len(read1)] += 1 self._written_lengths2[len(read2)] += 1 def written_reads(self) -> int: """Return number of written reads or read pairs""" return sum(self._written_lengths1.values()) def written_bp(self) -> Tuple[int, int]: return ( self._compute_total_bp(self._written_lengths1), self._compute_total_bp(self._written_lengths2), ) def written_lengths(self) -> Tuple[Counter, Counter]: return (Counter(self._written_lengths1), Counter(self._written_lengths2)) @staticmethod def _compute_total_bp(counts: DefaultDict[int, int]) -> int: return sum(length * count for length, count in counts.items()) def __iadd__(self, other): written_lengths1, written_lengths2 = other.written_lengths() for length, count in written_lengths1.items(): self._written_lengths1[length] += count for length, count in written_lengths2.items(): self._written_lengths2[length] += count return self cutadapt-4.7/src/cutadapt/steps.py000066400000000000000000000443671457457704700173370ustar00rootroot00000000000000""" Steps of the read output 
pipeline After all read modifications have been done, a read is written to at most one output file. For this, a pipeline represented as a list of "steps" (SingleEndSteps or PairedEndSteps) is used. Each pipeline step can consume (discard) a read or pass it on to the next step. Steps are added to the pipeline in a certain order: 1. First RestFileWriter, InfoFileWriter, WildcardFileWriter because they should see all reads before filtering. 2. Filters come next. These are implemented as SingleEndFilter or PairedEndFilter instances with an appropriate Predicate. Filters can optionally send each consumed/filtered read to an output file. 3. The last pipeline step should be one of the "Sinks", which consume all reads. Demultiplexers are sinks, for example. """ import itertools from abc import ABC, abstractmethod from typing import Tuple, Optional, Any, TextIO, Sequence, List from dnaio import SequenceRecord from .files import OutputFiles from .predicates import Predicate from .modifiers import ModificationInfo from .statistics import ReadLengthStatistics RecordPair = Tuple[SequenceRecord, SequenceRecord] class SingleEndStep(ABC): @abstractmethod def __call__(self, read, info: ModificationInfo) -> Optional[SequenceRecord]: """ Process a single read. Return the processed read or None to indicate that the read has been consumed and should thus not be passed on to subsequent steps. """ class PairedEndStep(ABC): @abstractmethod def __call__( self, read1, read2, info1: ModificationInfo, info2: ModificationInfo ) -> Optional[RecordPair]: """ Process (read1, read2). Return the processed read pair or None if the read pair has been "consumed" (filtered or written to an output file) and should thus not be passed on to subsequent steps. """ class HasStatistics(ABC): """ Used for the final steps (sinks), which need to keep track of read length statistics """ @abstractmethod def get_statistics(self) -> ReadLengthStatistics: pass class HasFilterStatistics(ABC): @abstractmethod def filtered(self) -> int: """Return number of filtered reads or read pairs""" @abstractmethod def descriptive_identifier(self) -> str: """Name used in statistics""" class SingleEndFilter(SingleEndStep, HasFilterStatistics): """ A pipeline step that can filter reads, can redirect filtered ones to a writer, and counts how many were filtered. """ def __init__(self, predicate: Predicate, writer=None): self._filtered = 0 self._predicate = predicate self._writer = writer def __repr__(self): return f"SingleEndFilter(predicate={self._predicate}, writer={self._writer})" def descriptive_identifier(self) -> str: return self._predicate.descriptive_identifier() def filtered(self) -> int: return self._filtered def __call__(self, read, info: ModificationInfo) -> Optional[SequenceRecord]: if self._predicate.test(read, info): self._filtered += 1 if self._writer is not None: self._writer.write(read) return None return read class PairedEndFilter(PairedEndStep, HasFilterStatistics): """ A pipeline step that can filter paired-end reads, redirect them to a file, and counts how many read pairs were filtered. Different filtering styles are supported, differing by which of the two reads in a pair have to fulfill the filtering criterion. """ def __init__( self, predicate1: Optional[Predicate], predicate2: Optional[Predicate], writer=None, pair_filter_mode="any", ): """ pair_filter_mode -- these values are allowed: 'any': The pair is discarded if any read matches. 'both': The pair is discarded if both reads match. 
'first': The pair is discarded if the first read matches. """ if pair_filter_mode not in ("any", "both", "first"): raise ValueError("pair_filter_mode must be 'any', 'both' or 'first'") self._pair_filter_mode = pair_filter_mode self._filtered = 0 self.predicate1 = predicate1 self.predicate2 = predicate2 self.writer = writer self._is_filtered: Any if predicate2 is None: self._is_filtered = self._is_filtered_first elif predicate1 is None: self._is_filtered = self._is_filtered_second elif pair_filter_mode == "any": self._is_filtered = self._is_filtered_any elif pair_filter_mode == "both": self._is_filtered = self._is_filtered_both else: self._is_filtered = self._is_filtered_first def __repr__(self): return ( f"PairedEndFilter(predicate1={self.predicate1}, predicate2={self.predicate2}, " f"writer={self.writer}, pair_filter_mode='{self._pair_filter_mode}')" ) def descriptive_identifier(self) -> str: if self.predicate1 is not None: return self.predicate1.descriptive_identifier() else: assert self.predicate2 is not None return self.predicate2.descriptive_identifier() def filtered(self) -> int: return self._filtered def _is_filtered_any( self, read1, read2, info1: ModificationInfo, info2: ModificationInfo ) -> bool: return self.predicate1.test(read1, info1) or self.predicate2.test(read2, info2) # type: ignore def _is_filtered_both( self, read1, read2, info1: ModificationInfo, info2: ModificationInfo ) -> bool: return self.predicate1.test(read1, info1) and self.predicate2.test(read2, info2) # type: ignore def _is_filtered_first( self, read1, read2, info1: ModificationInfo, info2: ModificationInfo ) -> bool: return self.predicate1.test(read1, info1) # type: ignore def _is_filtered_second( self, read1, read2, info1: ModificationInfo, info2: ModificationInfo ) -> bool: return self.predicate2.test(read2, info2) # type: ignore def __call__( self, read1, read2, info1: ModificationInfo, info2: ModificationInfo ) -> Optional[RecordPair]: if self._is_filtered(read1, read2, info1, info2): self._filtered += 1 if self.writer is not None: self.writer.write(read1, read2) return None return (read1, read2) class RestFileWriter(SingleEndStep): def __init__(self, file: TextIO): self._file = file def __repr__(self): return f"RestFileWriter(file={self._file})" def __call__(self, read, info) -> Optional[SequenceRecord]: # TODO this fails with linked adapters if info.matches: rest = info.matches[-1].rest() if len(rest) > 0: print(rest, read.name, file=self._file) return read class WildcardFileWriter(SingleEndStep): def __init__(self, file: TextIO): self._file = file def __repr__(self): return f"WildcardFileWriter(file={self._file})" def __call__(self, read, info) -> Optional[SequenceRecord]: # TODO this fails with linked adapters if info.matches: print(info.matches[-1].wildcards(), read.name, file=self._file) return read class InfoFileWriter(SingleEndStep): RC_MAP = {None: "", True: "1", False: "0"} def __init__(self, file: TextIO): self._file = file def __repr__(self): return f"InfoFileWriter(file={self._file})" def __call__(self, read, info: ModificationInfo) -> Optional[SequenceRecord]: current_read = info.original_read if info.is_rc: current_read = current_read.reverse_complement() if info.matches: for match in info.matches: for info_record in match.get_info_records(current_read): # info_record[0] is the read name suffix print( read.name + info_record[0], *info_record[1:], self.RC_MAP[info.is_rc], sep="\t", file=self._file, ) current_read = match.trimmed(current_read) else: seq = read.sequence qualities = 
read.qualities if read.qualities is not None else "" print(read.name, -1, seq, qualities, sep="\t", file=self._file) return read class PairedSingleEndStep(PairedEndStep): """ Wrap a SingleEndStep as a PairedEndStep The wrapped step is called with the first read """ def __init__(self, step: SingleEndStep): self._step = step def __repr__(self): return f"PairedSingleEndStep(step={self._step})" def __call__(self, read1, read2, info1, info2) -> Optional[RecordPair]: _ = read2 # intentionally ignored _ = info2 result = self._step(read1, info1) if result is None: return None return (result, read2) # The following steps are used as final step in a pipeline. # They send each read or read pair to the final intended output file, # and they all track the lengths of written reads. class SingleEndSink(SingleEndStep, HasStatistics): """ Send each read to a writer and keep read length statistics. This is used as the last step in a pipeline. """ def __init__(self, writer): super().__init__() self.writer = writer self._statistics = ReadLengthStatistics() def __repr__(self): return f"SingleEndSink({self.writer})" def __call__(self, read, info: ModificationInfo) -> Optional[SequenceRecord]: self.writer.write(read) self._statistics.update(read) return None def get_statistics(self) -> ReadLengthStatistics: return self._statistics class PairedEndSink(PairedEndStep, HasStatistics): """ Send each read pair to a writer and keep read length statistics. This is used as the last step in a pipeline. """ def __init__(self, writer): super().__init__() self.writer = writer self._statistics = ReadLengthStatistics() def __repr__(self): return f"PairedEndSink({self.writer})" def __call__( self, read1, read2, info1: ModificationInfo, info2: ModificationInfo ) -> Optional[RecordPair]: self.writer.write(read1, read2) self._statistics.update2(read1, read2) return None def get_statistics(self) -> ReadLengthStatistics: return self._statistics class Demultiplexer(SingleEndStep, HasStatistics, HasFilterStatistics): """ Demultiplex trimmed reads. Reads are written to different output files depending on which adapter matches. 
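    The output path for each adapter is derived from a template in which the
    "{name}" placeholder is replaced with the adapter name; reads without a
    match go to a path with "{name}" replaced by "unknown" unless untrimmed
    reads are discarded or redirected to untrimmed_output. For example (an
    illustrative template), "trimmed-{name}.fastq" with adapters named "a"
    and "b" yields trimmed-a.fastq, trimmed-b.fastq and trimmed-unknown.fastq.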
""" def __init__( self, adapter_names: Sequence[str], template: str, untrimmed_output: Optional[str], discard_untrimmed: bool, outfiles: OutputFiles, ): """ writers maps an adapter name to a writer """ self._writers, self._untrimmed_writer = self._open_writers( adapter_names, template, untrimmed_output, discard_untrimmed, outfiles ) self._statistics = ReadLengthStatistics() self._filtered = 0 def __repr__(self): return f"" @staticmethod def _open_writers( adapter_names: Sequence[str], template: str, untrimmed_output: Optional[str], discard_untrimmed: bool, outfiles: OutputFiles, ): writers = dict() for name in adapter_names: path = template.replace("{name}", name) writers[name] = outfiles.open_record_writer(path) if discard_untrimmed: untrimmed = None else: untrimmed_path: Optional[str] if untrimmed_output: untrimmed_path = untrimmed_output else: untrimmed_path = template.replace("{name}", "unknown") untrimmed = outfiles.open_record_writer(untrimmed_path) return writers, untrimmed def __call__(self, read, info) -> Optional[SequenceRecord]: """ Write the read to the proper output file according to the most recent match """ if info.matches: name = info.matches[-1].adapter.name self._statistics.update(read) self._writers[name].write(read) elif self._untrimmed_writer is not None: self._statistics.update(read) self._untrimmed_writer.write(read) else: self._filtered += 1 return None def descriptive_identifier(self) -> str: return "discard_untrimmed" def get_statistics(self) -> ReadLengthStatistics: return self._statistics def filtered(self) -> int: return self._filtered class PairedDemultiplexer(PairedEndStep, HasStatistics, HasFilterStatistics): """ Demultiplex trimmed paired-end reads. Reads are written to different output files depending on which adapter (in read 1) matches. 
""" def __init__( self, adapter_names: Sequence[str], template1: str, template2: str, untrimmed_output: Optional[str], untrimmed_paired_output: Optional[str], discard_untrimmed: bool, outfiles: OutputFiles, ): self._writers, self._untrimmed_writer = self._open_writers( adapter_names, template1, template2, untrimmed_output, untrimmed_paired_output, discard_untrimmed, outfiles, ) self._statistics = ReadLengthStatistics() self._filtered = 0 @staticmethod def _open_writers( adapter_names: Sequence[str], template1: str, template2: str, untrimmed_output: Optional[str], untrimmed_paired_output: Optional[str], discard_untrimmed: bool, outfiles: OutputFiles, ): demultiplex_out = dict() for name in adapter_names: path1 = template1.replace("{name}", name) path2 = template2.replace("{name}", name) demultiplex_out[name] = outfiles.open_record_writer(path1, path2) if discard_untrimmed: untrimmed = None else: if untrimmed_output is not None: untrimmed_path1 = untrimmed_output else: untrimmed_path1 = template1.replace("{name}", "unknown") if untrimmed_paired_output is not None: untrimmed_path2 = untrimmed_paired_output else: untrimmed_path2 = template2.replace("{name}", "unknown") untrimmed = outfiles.open_record_writer(untrimmed_path1, untrimmed_path2) return demultiplex_out, untrimmed def __call__( self, read1, read2, info1: ModificationInfo, info2: ModificationInfo ) -> Optional[RecordPair]: assert read2 is not None if info1.matches: name = info1.matches[-1].adapter.name # type: ignore self._statistics.update2(read1, read2) self._writers[name].write(read1, read2) elif self._untrimmed_writer is not None: self._statistics.update2(read1, read2) self._untrimmed_writer.write(read1, read2) else: self._filtered += 1 return None def descriptive_identifier(self) -> str: return "discard_untrimmed" def get_statistics(self) -> ReadLengthStatistics: return self._statistics def filtered(self) -> int: return self._filtered class CombinatorialDemultiplexer(PairedEndStep, HasStatistics): """ Demultiplex paired-end reads depending on which adapter matches, taking into account matches on R1 and R2. """ def __init__( self, adapter_names, adapter_names2, template1: str, template2: str, discard_untrimmed: bool, outfiles: OutputFiles, ): """ Adapter names of the matches on R1 and R2 will be used to look up the writer in the writers dict. If there is no match on a read, None is used in the lookup instead of the name. Missing dictionary keys are ignored and can be used to discard read pairs. 
""" self._writers = self._open_writers( adapter_names, adapter_names2, template1, template2, discard_untrimmed, outfiles, ) self._statistics = ReadLengthStatistics() @staticmethod def _open_writers( adapter_names: Sequence[str], adapter_names2: Sequence[str], template1: str, template2: str, discard_untrimmed: bool, outfiles: OutputFiles, ): writers = dict() extra: List[Tuple[Optional[str], Optional[str]]] if discard_untrimmed: extra = [] else: extra = [(None, None)] extra += [(None, name2) for name2 in adapter_names2] extra += [(name1, None) for name1 in adapter_names] for name1, name2 in ( list(itertools.product(adapter_names, adapter_names2)) + extra ): # type: ignore fname1 = name1 if name1 is not None else "unknown" fname2 = name2 if name2 is not None else "unknown" path1 = template1.replace("{name1}", fname1).replace("{name2}", fname2) path2 = template2.replace("{name1}", fname1).replace("{name2}", fname2) writers[(name1, name2)] = outfiles.open_record_writer(path1, path2) return writers def __call__(self, read1, read2, info1, info2) -> Optional[RecordPair]: """ Write the read to the proper output file according to the most recent matches both on R1 and R2 """ assert read2 is not None name1 = info1.matches[-1].adapter.name if info1.matches else None name2 = info2.matches[-1].adapter.name if info2.matches else None key = (name1, name2) if key in self._writers: self._statistics.update2(read1, read2) self._writers[key].write(read1, read2) return None def get_statistics(self) -> ReadLengthStatistics: return self._statistics cutadapt-4.7/src/cutadapt/tokenizer.py000066400000000000000000000022021457457704700201710ustar00rootroot00000000000000import re from dataclasses import dataclass from typing import Iterator, Type @dataclass class Token: value: str def __repr__(self) -> str: return f'{self.__class__.__name__}("{self.value}")' class StringToken(Token): pass class BraceToken(Token): pass class TokenizeError(Exception): pass def tokenize_braces(s: str) -> Iterator[Token]: """ >>> list(tokenize_braces("")) [] >>> list(tokenize_braces("before {braced} after")) [StringToken("before "), BraceToken("braced"), StringToken(" after")] >>> list(tokenize_braces("ab{cd}{ef}")) [StringToken("ab"), BraceToken("cd"), BraceToken("ef")] """ for value in re.split("({[^}]*})", s): if value == "": continue if value.startswith("{") and value.endswith("}"): value = value[1:-1] token_class: Type[Token] = BraceToken else: token_class = StringToken if "{" in value: raise TokenizeError("Unexpected '{' encountered") if "}" in value: raise TokenizeError("Unexpected '}' encountered") yield token_class(value) cutadapt-4.7/src/cutadapt/utils.py000066400000000000000000000100251457457704700173210ustar00rootroot00000000000000import re import sys import time import multiprocessing import logging import locale logger = logging.getLogger(__name__) try: "µ".encode(locale.getpreferredencoding()) MICRO = "µ" except UnicodeEncodeError: MICRO = "u" def available_cpu_count(): """ Return the number of available virtual or physical CPUs on this system. The number of available CPUs can be smaller than the total number of CPUs when the cpuset(7) mechanism is in use, as is the case on some cluster systems. 
Adapted from http://stackoverflow.com/a/1006301/715090 """ try: with open("/proc/self/status") as f: status = f.read() m = re.search(r"(?m)^Cpus_allowed:\s*(.*)$", status) if m: res = bin(int(m.group(1).replace(",", ""), 16)).count("1") if res > 0: return min(res, multiprocessing.cpu_count()) except OSError: pass return multiprocessing.cpu_count() class Progress: """ Print an animated progress report to sys.stderr """ def __init__(self, every=1): """ every: minimum time to wait in seconds between progress updates """ self._every = every self._animation = self.scissors() self._n = 0 self._start_time = time.time() # Time at which the progress was last updated self._last_time = self._start_time self._last_n = 0 def __repr__(self): return ( f"Progress(_n={self._n}, elapsed={self._last_time - self._start_time:.3f})" ) @staticmethod def scissors(width=10): while True: for is_reverse, rang in [ (False, range(width + 1)), (True, range(width + 1)), ]: for position in rang: for is_open in (True, False): left = " " * position right = "-" * (width - position) if is_reverse: sc = ">8" if is_open else "=8" left, right = right, left else: sc = "8<" if is_open else "8=" yield "[" + left + sc + right + "]" def update(self, increment, _final=False): self._n += increment current_time = time.time() if _final: time_delta = current_time - self._start_time delta = self._n else: time_delta = current_time - self._last_time delta = self._n - self._last_n if delta < 1: return if time_delta == 0: return if not _final: if time_delta < self._every: return t = current_time - self._start_time hours = int(t) // 3600 minutes = (int(t) - hours * 3600) // 60 seconds = int(t) % 60 per_second = delta / time_delta per_item = time_delta / delta animation = next(self._animation) if _final: animation = "Done".ljust(len(animation)) print( "\r" "{animation} {hours:02d}:{minutes:02d}:{seconds:02d} " "{total:13,d} reads @ {per_item:5.1F} {micro}s/read; {per_minute:6.2F} M reads/minute" "".format( hours=hours, minutes=minutes, seconds=seconds, total=self._n, per_item=per_item * 1e6, micro=MICRO, per_minute=per_second * 60 / 1e6, animation=animation, ), end="", file=sys.stderr, ) self._last_time = current_time self._last_n = self._n def close(self): """ Print final progress reflecting the final total """ self.update(0, _final=True) print(file=sys.stderr) class DummyProgress(Progress): """ Does not print anything """ def update(self, increment, _final=False): pass def close(self): pass cutadapt-4.7/tests/000077500000000000000000000000001457457704700143575ustar00rootroot00000000000000cutadapt-4.7/tests/.gitignore000066400000000000000000000000351457457704700163450ustar00rootroot00000000000000tmp.log tmp.fastaq tmp.fastq cutadapt-4.7/tests/conftest.py000066400000000000000000000013671457457704700165650ustar00rootroot00000000000000import os import pytest from utils import assert_files_equal, datapath, cutpath from cutadapt.cli import main from cutadapt.report import Statistics @pytest.fixture(params=[1, 2]) def cores(request): return request.param @pytest.fixture def run(tmp_path): def _run(params, expected, inpath) -> Statistics: if type(params) is str: params = params.split() params += ["--json", os.fspath(tmp_path / "stats.cutadapt.json")] tmp_fastaq = tmp_path / expected params += ["-o", tmp_fastaq] params += [datapath(inpath)] stats = main([str(p) for p in params]) # TODO redirect standard output assert_files_equal(cutpath(expected), tmp_fastaq) return stats return _run 
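

# A minimal sketch of how the fixtures above might be used in a test module
# (the test function below is hypothetical, not part of this repository):
#
#     def test_trim_example(run):
#         # Trim with -b ADAPTER and compare against tests/cut/example.fa
#         run("-b ADAPTER", "example.fa", "example.fa")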
cutadapt-4.7/tests/cut/000077500000000000000000000000001457457704700151525ustar00rootroot00000000000000cutadapt-4.7/tests/cut/454.fa000066400000000000000000000160741457457704700160060ustar00rootroot00000000000000>000163_1255_2627 length=8 uaccno=E0R4ISW01DCIQD GTGTGGTG >000652_1085_0667 length=80 uaccno=E0R4ISW01CXJXP ATTGAAGAGGTTGGTAAGTTTTAAGTTGGTAGGTGGTTGGGGAGTGGTTGGAGAGGAGTTGTTGGGAGTTTGTGTCCTGC >000653_1285_1649 length=92 uaccno=E0R4ISW01DE4SJ AATTAGTCGAGCGTTGTGGTGGGTATTTGTAATTTTAGCTACTCTGAAGGCTGAGGCAGGAGAACTGCTTGAACCCGGGAGGCGGAGGTTGC >000902_0715_2005 length=50 uaccno=E0R4ISW01B03K3 GGGTGTTGAATTTAATATGTAGTATATTGATTTGTGATGATTATTTTGCC >001146_1255_0340 length=50 uaccno=E0R4ISW01DCGYU GGGTGTTGAATTTAATATGTAGTATATTGATTTGTGATGATTATTTTGCC >001210_1147_1026 length=124 uaccno=E0R4ISW01C2Z5W GAGGTGGTGAGTGTTGTGTGTTTAGATTGTGTGTGGTGGTTGGGAGTGGGAGTTGTATTTTAGGGTGTGGGTTGGGAGAGTGAAAGTTGTGGGTGTTTTGGATGGTGGGTTAGGTGGTTGTGCC >001278_1608_2022 length=66 uaccno=E0R4ISW01D7HW4 CACACACACTCTTCCCCATACCTACTCACACACACACACACACACACAAACATACACAAATAATTC >001333_1518_1176 length=100 uaccno=E0R4ISW01DZKTM AATTGTCGTTTGATTGTTGGAAAGTAGAGGGTCGGGTTGGGGTAGATTCGAAAGGGGAATTTTGAGAAAAGAAATGGAGGGAGGTAGGAAAATTTTTTGC >001398_1584_1549 length=112 uaccno=E0R4ISW01D5DPB TAATGAAATGGAATGGAATGGAATGGAATGAAATGGAATGGAATGGAATGGAATGGAATGGAATGGAATGGAATGGAATGAAATGGAATGGAGTATAAAGGAATGGAATTAC >001455_1136_2179 length=50 uaccno=E0R4ISW01C12AD GGGTGTTGAATTTAATATGTAGTATATTGATTTGTGATGATTATTTTGCC >001481_1165_0549 length=50 uaccno=E0R4ISW01C4KON GGGTGTTGAATTTAATATGTAGTATATTGATTTGTGATGATTATTTTGCC >001744_1376_3512 length=101 uaccno=E0R4ISW01DM5T2 TAAGTAGGGAAGGTTTGAGGTTGTTGGTGTTGGTAGTAGGGGTGTTTTAGTTAGGGGTTGTAGTTTGTTAAGGGAATTTTATTTGAGTTTAGAATTGAGGC >001893_1084_1137 length=120 uaccno=E0R4ISW01CXG4Z TGTATATTTTGTTGGGTTTGTATATATTGTTAGGTGTGGTTGGTGAGTTGTATTGGTGGTGGTGTAAGGTGAGTGGAAATGGGAATGGATTGTAGATATGTTGGATTTGTGGTTTTTGGT >001927_0254_0706 length=139 uaccno=E0R4ISW01AWLLG TGGAATCATCTAAGGGACACAAATAGAATCATCATTGAATGGAATCGAATGGAATCATCTAATGTACTCGAATGGAATTATTATTGAATAGAATAGAATGGAATTATCGAATGGAATCAAATGGAATGTAATGGAATGC >002007_1338_1037 length=95 uaccno=E0R4ISW01DJRTR GGGTTGTGTATTTGGATAGTATGTGGAAAATGGTATTAAAAAGAATTTGTAGTTGGATTGTTGGTGGTTATTTAGTTTTTGGGTAATGGGTAGAT >002186_1130_0654 length=50 uaccno=E0R4ISW01C1H5C GGGTGTTGAATTTAATATGTAGTATATTGATTTGTGATGATTATTTTGCC >002282_1237_2702 length=92 uaccno=E0R4ISW01DAXWG AATTAGCCGGGCGTGATGGCGGGCGTTTGTAGTTTTAGTTATTCGGGAGGTTGAGGTAGGAGAATGGCGTGAATTCGGGAAGCGGAGTTTGC >002382_1259_0997 length=64 uaccno=E0R4ISW01DCT37 TAAGGGTTGAAGCGAGGTAGGTAGTTTGTTTGTGGTTTTGTTTCGTATTTTTGTTTCGTATCCC >002477_0657_0655 length=131 uaccno=E0R4ISW01BVY8H TTTTTGGAAAGTTGGGTGGGTATAGTTTTGAGTAGTTAGAGGTATTATAATAGTATTAGGAAGTTGAATGTGAGGGTATAAGAGTTAATTTGATTTTTCGTTGATATGTTTGTTGTTTGAAGTTAGAGTGC >003149_1553_2333 length=128 uaccno=E0R4ISW01D2OBZ TATTTAGTTTTAGTTTGTTTAGGTGGTTATAGAATACGGAGTTTATGAAGTTGATTAGGAATATTATTAGTTGAATTAAGAATTGGGAAGAGAGGGGAACGGGAAGGGACGTGAGTGATTATTATTGC >003194_1475_2845 length=58 uaccno=E0R4ISW01DVT7J TATTTTGGGTTAAGTCGGGTTTAGTTGTTAGGGCGAGAAGTTAGTTGTTGACCCCTGC >003206_1315_0479 length=52 uaccno=E0R4ISW01DHQPD GGGTTGGATAATATGATGGTGTTGGGGAATATTTAGGTATGTGGTTTGTGGC >003271_0173_0314 length=82 uaccno=E0R4ISW01APHAK GTTTATTTGTTATTTATTTTTAGGTTTAGAAGAGTGTTTGGTATTTATTGAGGATTTAGTATTTGTTAGAAGGATTGGATTC >003443_1737_2250 length=21 uaccno=E0R4ISW01EITSS TGTAGGTTGTGTTGTAGGTTG >002633_1776_1582 length=40 uaccno=E0R4ISW01EL8JK CAGGGTGGATTGGGGAACACACAGTGTGGCCGCGTGATTC >002663_0725_3154 length=84 uaccno=E0R4ISW01B1Z2S 
GCGTTTTATATTATAATTTAATATTTTGGAGGTTGGGTGCGGTGGTTTACGTTTGTAGTTTAGTATTTGGGAGGTTAAGGTAGC >002761_1056_4055 length=72 uaccno=E0R4ISW01CU2V9 AATTTTATTCGATTTATGTGATGATTTATTTATTTTATTTGAAGATGATTTTATTCGAGATTATTCGATGAT >002843_0289_2275 length=80 uaccno=E0R4ISW01AZPE9 ATTGAAGAGGTTGGTAAGTTTTAAGTTGGTAGGTGGTTGGGGAGTGGTTGGAGAGGAGTTGTTGGGAGTTTGTGTCCTGC >002934_1762_2177 length=50 uaccno=E0R4ISW01EK0Q7 GGGTGTTGAATTTAATATGTAGTATATTGATTTGTGATGATTATTTTGCC >003515_1711_1058 length=79 uaccno=E0R4ISW01EGIPG AATTGAATGGAATTATTATTGAATGGATTCGAATGGAATTATTATTGAATGGAATCATCGAGTGGAATCGAATGGAATC >003541_1276_1589 length=70 uaccno=E0R4ISW01DECAV TAGTTTAGGGTGGTAGTTTGGATAAGGTAGTTTTACGGTTTAGTAGTAGTAGGTTAAGTAGGAAAACTGC >003587_1522_1804 length=109 uaccno=E0R4ISW01DZXX6 AATTTATGTAGTGGAAGTAGGATATAAAGAATAGGTTAATGGATTTTGAGATATTAAAAAGAGTAGGAAATTAGTTGAGAGGTTAAGTAGTAGTTTATTTTAGCCACCC >003592_0076_0430 length=92 uaccno=E0R4ISW01AGYTC AATTAGTTAGGCGTGGTGGCGGGTGTTTGTAGTTTTAGTTATTCGGGAGGTTGAGGTAGGAGAATGTTGTGAATTTAGGAGGTGGAGTTTGC >003957_0595_0965 length=130 uaccno=E0R4ISW01BQJIV TAATATTAGGTGTCAATTTGACTGGATCGAGGGATGTGTGTCGGTGAGAGTCTCACTAGAGGTTGATATTTGAGTCGTTAGACTGGGAGAGGAAGACCGAACTGTCAAGTGTATGGGCGCCATCCAATTC >003986_1127_2937 length=61 uaccno=E0R4ISW01C1AFF TAATGGAATGGAATTTTCGGAATGGAATGGAATGGAATGGAATGGAATGGAATGGAATTAC >004012_1559_1491 length=72 uaccno=E0R4ISW01D26M9 TAGTGGATATAAATGGAATGGATTGGAATGGAATGGATACGAATGGAATGGATTGGAGTGGAATGGATTGAC >004030_1508_2061 length=123 uaccno=E0R4ISW01DYPWF TACGTATATACGCGTACGCGTATACGTATATACGCGTATACGTATACGCGTACGTATATATACGCGTATACGTTTACGTACGTACGCGTATATACGTACGTATACACACACGCATATGCATAC >004038_1061_2047 length=109 uaccno=E0R4ISW01CVG5D AATTGATTCGAATGGAATGGATTGGAATGGAACGGATTTGAATGGAATGGATTGGAATGGAATGGATTGAATGGAATGGATTGGAGAGGATTGGATTTGAATGGAATTC >004105_1121_0391 length=92 uaccno=E0R4ISW01C0PH1 AATTAGTTGGGCGTGGTGGCGAGTGTTTGTAATTTTAGTTATTTAGGAGGTTGAGGTAGGAGAATTATTTGAACCCGGTAGACGGAAGTTGC >004129_1618_3423 length=79 uaccno=E0R4ISW01D8ELT AATTGAATGGTATTGAAAGGTATTAATTTAGTGGAATGGAATGGAATGTATTGGAATGGAAAATAATGGAATGGAGTGC >004203_0451_0902 length=72 uaccno=E0R4ISW01BDWC4 TAGTTGGTGTGTTGTAATCGAGACGTAGTTGGTTGGTACGGGTTAGGGTTTTGATTGGGTTGTTGTGTTTGC >004626_1937_0919 length=180 uaccno=E0R4ISW01E0CVD TAGAGTAGATAGTAGGGTTAGAGAAGGTAGGGTACGTTTAGTTTGTTAGTAAGGTTTAAGTTTTGGGTGGGAAAGGTTAGTGGCGGGAAGGGACGAAGGTGGTAATCGAGAGTAGATTTAGAGAAGTTTTTGAAGTGGGCGTTGGGAGTTTTCGAAGTATTGAGAGAGAGGAGCTTGTGC >004913_0641_2071 length=92 uaccno=E0R4ISW01BULRD AATTAGTCGAGCGTTGTGGTGGGTATTTGTAATTTTAGCTACTCTGAAGGCTGAGGCAGGAGAACTGCTTGAACCCGGGAGGCGGAGGTTGC >005063_0599_1983 length=84 uaccno=E0R4ISW01BQWX9 ATGTGGTGAAGATTGGTTTTAGGTGTTTTAATGTGGATTTTCAGGGGTTTTAAAAGGGTTGGGAGAGTGAAATATATATAAGGC >005140_0759_3209 length=74 uaccno=E0R4ISW01B4ZKR TAGTATAGAGGGTTTGTGGTCGTGAGGGTGTTGATGGCGGGAGGGTTTTGATGGTAGGAGGGCCCGTGCTGTGC >005351_0883_3221 length=95 uaccno=E0R4ISW01CFVHJ TTAGGTGTTATAGTTGAGTGAGATGTTAGTGTTTAATGGTTTTATTTAGGTTGATGGGTTAATGAGGGGGTATTTGATAGTTTTGAAGATTTGAC >005380_1702_1187 length=160 uaccno=E0R4ISW01EFQC1 GTTTTTCGAGTATATATTTAGTAGTACGCTCGACTTCTCTTATATAAAGGTTTTGGTTTTTATAGGTTTTTCCATTGTGTCTGCCTGGGGGAGGGCCCTTCTCCTTCAGGATACTGTAGCTTCTCTGCGTGATAAGCCAGCATTCACGGCTTTCAGGTGC >005568_1060_1943 length=20 uaccno=E0R4ISW01CVDWP ATAGCGTATTTCTCACCTGC >005740_1536_2697 length=116 uaccno=E0R4ISW01D06VV TAAAGAGGTGTTATTATTAGTTAGGAGAGGAGGTGGTTAGATAGTAGTGGGATTATAGGGGAATATAGAGTTGTTAGTTTAGGGATAAGGGATTGATCGATGGGTTAGGTCTCTGC >005753_1884_3877 length=53 uaccno=E0R4ISW01EVRNB AAACTGAGTTGTGATGTTTGCATTCAACTCACAGAGTTCAACATTCCTTTAAC >read_equals_adapter 1a >read_equals_start_of_adapter 1b 
>read_equals_end_of_adapter 1c >read_equals_middle_of_adapter 1d >read_ends_with_adapter 2a GCTACTCTGAAGGCTGAGGCAGGAGAACTGCTTGAACCCGGGAGGCG >read_ends_with_start_of_adapter 2b GCTACTCTGAAGGCTGAGGCAGGAGAACTGCTTGAACCCGGGAGGCG >read_contains_adapter_in_the_middle 3 CGTAGTTGGTTGGTACG >read_starts_with_adapter 4a AAAGGTTTTGGTTTTTATAGGTTTTT >read_starts_with_end_of_adapter 4b AAAGGTTTTGGTTTTTATAGGTTTTT cutadapt-4.7/tests/cut/SRR2040271_1.fastq000066400000000000000000000005601457457704700176410ustar00rootroot00000000000000@SRR2040271.1 SN603_WBP007_8_1101_63.30_99.90 length=50 NTCATTCCATGACATTGTCTGTTGGTTGCTTTTTGAGTATATTTTCTCAT +SRR2040271.1 SN603_WBP007_8_1101_63.30_99.90 length=50 !1=DDFFFGHHHHIJJJJJIIJJJGGHJGJIJJGBGGHEGHJIIIIEHIJ @SRR2040271.2 SN603_WBP007_8_1101_79.90_99.30 length=20 NTAAAGACCCTTCAACTCAG +SRR2040271.2 SN603_WBP007_8_1101_79.90_99.30 length=20 !4=BBDDDFHHHH@FHIIII cutadapt-4.7/tests/cut/action_lowercase.fasta000066400000000000000000000004211457457704700215100ustar00rootroot00000000000000>r1 CGTCCGAANTAGCTACCACCCTGATTAGACAAAT >r2 caagAcaagacctgccacattgccctagtattaa >r3 caagAcaagacctgccacattgccctagtcaaga >r4 caagATGTCCCCTGCCACATTGCCCTAGTcaaga >r5 caagATGTCCCCTGCCACATTGCCCTAGTTTATT >r6 GTTCATGTCCCCTGCCACATTGCCCTAGTTTATT >r7 ATGGCTGTCCCCTGCCACATTGCCCTAGTcaaga cutadapt-4.7/tests/cut/action_retain.fasta000066400000000000000000000002131457457704700210050ustar00rootroot00000000000000>r1 CGTCCGAAcaag >r2 caag >r3 TGCCCTAGTcaag >r4 TGCCCTAGTcaa >r5 ggttaaCCGCCTTGA >r6 ggttaaCATTGCCCTAGTTTATT >r7 ttaaGTTCATGT >r8 ACGTACGT cutadapt-4.7/tests/cut/adapterorder-ag.fasta000066400000000000000000000000071457457704700212300ustar00rootroot00000000000000>r AAA cutadapt-4.7/tests/cut/adapterorder-ga.fasta000066400000000000000000000000071457457704700212300ustar00rootroot00000000000000>r GGG cutadapt-4.7/tests/cut/adapterx.fasta000066400000000000000000000004601457457704700200020ustar00rootroot00000000000000>r1 CGTCCGAAGTAGCTAtccgaatagaCCACCCTGATTAGACAAAT >r2 tccgaatagaAGCCGCTACGACGGGTTGGCCCTTAGACGTATCT >r3 atagaCAAGATCTACCCTGCCACATTGCCCTAGTTAAAC >r4 CtccgaatagaAAGATCTACCCTGCCACATTGCCCTAGTTAAAC >r5 CAAGATCTACCCTGCCACATTGCCCTAGTTAAAC >r6 CAAGATCTACCCTGCCACATTGCCCTAGTTAAAC >r7 CAAGATCTACCCTGCCACATTGCCCTAGTT cutadapt-4.7/tests/cut/anchored-back.fasta000066400000000000000000000001301457457704700206450ustar00rootroot00000000000000>read1 sequence >read2 sequenceBACKADAPTERblabla >read3 sequenceBACKADA >read4 sequence cutadapt-4.7/tests/cut/anchored.fasta000066400000000000000000000001271457457704700177550ustar00rootroot00000000000000>read1 sequence >read2 blablaFRONTADAPTsequence >read3 NTADAPTsequence >read4 sequence cutadapt-4.7/tests/cut/anchored_no_indels.fasta000066400000000000000000000003141457457704700220050ustar00rootroot00000000000000>no_mismatch (adapter: TTAGACATAT) GAGGTCAG >one_mismatch GAGGTCAG >two_mismatches TAAGACGTATGAGGTCAG >insertion ATTAGACATATGAGGTCAG >deletion TAGACATATGAGGTCAG >mismatch_plus_wildcard TNAGACGTATGAGGTCAG cutadapt-4.7/tests/cut/anchored_no_indels_wildcard.fasta000066400000000000000000000003021457457704700236530ustar00rootroot00000000000000>no_mismatch (adapter: TTAGACATAT) GAGGTCAG >one_mismatch GAGGTCAG >two_mismatches TAAGACGTATGAGGTCAG >insertion ATTAGACATATGAGGTCAG >deletion TAGACATATGAGGTCAG >mismatch_plus_wildcard GAGGTCAG cutadapt-4.7/tests/cut/anywhere_repeat.fastq000066400000000000000000000012041457457704700213710ustar00rootroot00000000000000@prefix:1_13_1400/1 CGTCCGAANTAGCTACCACCCTGATTAGACAAAT + )3%)&&&&!.1&(6:<'67..*,:75)'77&&&5 @prefix:1_13_1500/1 
NNNNANNNNNNNNNNNNNNNNNNNNNNNNNNNNN + <=A:A=57!7<';<6?5;;6:+:=)71>70<,=: @prefix:1_13_1550/1 NNNNANNNNNNNNNNNNNNNNNNNNNNNNNNNNN + <=A:A=57!7<';<6?5;;6:+:=)71>70<,=: @prefix:1_13_1600/1 NNNNATGTCCCCTGCCACATTGCCCTAGTNNNNN + <=A:A=57!7<';<6?5;;6:+:=)71>70<,=: @prefix:1_13_1700/1 NNNNATGTCCCCTGCCACATTGCCCTAGTTTATT + <=A:A=57!7<';<6?5;;6:+:=)71>70<,=: @prefix:1_13_1800/1 GTTCATGTCCCCTGCCACATTGCCCTAGTTTATT + <=A:A=57!7<';<6?5;;6:+:=)71>70<,=: @prefix:1_13_1900/1 ATGGCTGTCCCCTGCCACATTGCCCTAGTNNNNN + <=A:A=57!7<';<6?5;;6:+:=)71>70<,=: cutadapt-4.7/tests/cut/casava.fastq000066400000000000000000000002231457457704700174450ustar00rootroot00000000000000@M123:14:000000000-AOEUI:1:1102:18260:1280 1:N:0:9 GGTGGCAGCAGCTACAGGTGTCCAGTCCCAGGTGCAGCTGGTGCAG + CC-BCEFFGGGGGFCFGFFGF<9EDGEAGFC@F<8@FE@EG,CFGG cutadapt-4.7/tests/cut/combinatorial/000077500000000000000000000000001457457704700177755ustar00rootroot00000000000000cutadapt-4.7/tests/cut/combinatorial/combinatorial.A_G.1.fastq000066400000000000000000000000641457457704700244440ustar00rootroot00000000000000@r1 TTATTTGTCTCCAGCTTAGACA + ;5/6+&1)4>=*-(976;9&:; cutadapt-4.7/tests/cut/combinatorial/combinatorial.A_G.2.fastq000066400000000000000000000000661457457704700244470ustar00rootroot00000000000000@r1 GCTGGAGACAAATAACAGTGGAG + -:1/190:8.2+6-20)7>-8>3 cutadapt-4.7/tests/cut/combinatorial/combinatorial.A_T.1.fastq000066400000000000000000000000641457457704700244610ustar00rootroot00000000000000@r2 CAACAGGCCACATTAGACATAT + .;:))9;74(=.99(2';&/9.4.<&>,*5 cutadapt-4.7/tests/cut/combinatorial/combinatorial.A_unknown.1.fastq000066400000000000000000000000641457457704700257550ustar00rootroot00000000000000@r6 AGTGCAACGGTCCACACTGCAG + ->5)'13950-1*&/1;05+:/ cutadapt-4.7/tests/cut/combinatorial/combinatorial.A_unknown.2.fastq000066400000000000000000000000661457457704700257600ustar00rootroot00000000000000@r6 GTCACTGTTCCGTCAACCAGTTT + '8/5;5)39.89>45*5;0/?9' cutadapt-4.7/tests/cut/combinatorial/combinatorial.C_G.1.fastq000066400000000000000000000000641457457704700244460ustar00rootroot00000000000000@r3 AACTTCCGATATTAATAACATT + ,2=.(?''<4:.3:(/3<684' cutadapt-4.7/tests/cut/combinatorial/combinatorial.C_G.2.fastq000066400000000000000000000000661457457704700244510ustar00rootroot00000000000000@r3 TGTTATTAATATCAAGTTGGCAG + '<9039+87)<&&*1+,:638<< cutadapt-4.7/tests/cut/combinatorial/combinatorial.C_T.1.fastq000066400000000000000000000000641457457704700244630ustar00rootroot00000000000000@r4 GACAGGCCGTTTGAATGTTGAC + 5*,,='0?*0/03,7>70,?,- cutadapt-4.7/tests/cut/combinatorial/combinatorial.C_T.2.fastq000066400000000000000000000000661457457704700244660ustar00rootroot00000000000000@r4 CATCCCGTCAACATTCAAACGGC + =;+5.091<,80),<>3<:46'. cutadapt-4.7/tests/cut/combinatorial/combinatorial.C_unknown.1.fastq000066400000000000000000000000001457457704700257450ustar00rootroot00000000000000cutadapt-4.7/tests/cut/combinatorial/combinatorial.C_unknown.2.fastq000066400000000000000000000000001457457704700257460ustar00rootroot00000000000000cutadapt-4.7/tests/cut/combinatorial/combinatorial.unknown_G.1.fastq000066400000000000000000000000641457457704700257630ustar00rootroot00000000000000@r5 TTATTTGTCTCCAGCTTAGACA + <9(/-+3'/--?'88.*<82>. 
cutadapt-4.7/tests/cut/combinatorial/combinatorial.unknown_G.2.fastq000066400000000000000000000000661457457704700257660ustar00rootroot00000000000000@r5 CAACAGGCCACATTAGACATATC + &05(-2*3*)2(=?21+256(/7 cutadapt-4.7/tests/cut/combinatorial/combinatorial.unknown_T.1.fastq000066400000000000000000000000001457457704700257660ustar00rootroot00000000000000cutadapt-4.7/tests/cut/combinatorial/combinatorial.unknown_T.2.fastq000066400000000000000000000000001457457704700257670ustar00rootroot00000000000000cutadapt-4.7/tests/cut/combinatorial/combinatorial.unknown_unknown.1.fastq000066400000000000000000000000001457457704700272620ustar00rootroot00000000000000cutadapt-4.7/tests/cut/combinatorial/combinatorial.unknown_unknown.2.fastq000066400000000000000000000000001457457704700272630ustar00rootroot00000000000000cutadapt-4.7/tests/cut/demultiplexed.first.1.fastq000066400000000000000000000000551457457704700223440ustar00rootroot00000000000000@read3/1 CCAACTTGATATTAAT + HHHHHHHHHHHHHHHH cutadapt-4.7/tests/cut/demultiplexed.first.2.fastq000066400000000000000000000000651457457704700223460ustar00rootroot00000000000000@read3/2 TGTTATTAATATCAAGTTGG + #HHHHHHHHHHHHHHHHHHH cutadapt-4.7/tests/cut/demultiplexed.second.1.fastq000066400000000000000000000000411457457704700224630ustar00rootroot00000000000000@read2/1 CAACAGGCCA + HHHHHHHHHH cutadapt-4.7/tests/cut/demultiplexed.second.2.fastq000066400000000000000000000000451457457704700224700ustar00rootroot00000000000000@read2/2 TGTGGCCTGTTG + ###HHHHHHHHH cutadapt-4.7/tests/cut/demultiplexed.unknown.1.fastq000066400000000000000000000003431457457704700227140ustar00rootroot00000000000000@read1/1 some text TTATTTGTCTCCAGCTTAGACATATCGCCT + ##HHHHHHHHHHHHHHHHHHHHHHHHHHHH @read4/1 GACAGGCCGTTTGAATGTTGACGGGATGTT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH @read5 TTATTTGTCTCCAGCTTAGACATATCGCCT + #####HHHHHHHHHHHHHHHHHHHHHHHHH cutadapt-4.7/tests/cut/demultiplexed.unknown.2.fastq000066400000000000000000000002741457457704700227200ustar00rootroot00000000000000@read1/2 other text GCTGGAGACA + HHHHHHHHHH @read4/2 CATCCCGTCAACATTCAAACGGCCTGTCCA + HH############################ @read5 CAACAGGCCACATTAGACATATCGGATGGT + HHHHHHHH##HHHHHHHHHHHHHHHHHHHH cutadapt-4.7/tests/cut/discard-untrimmed.fastq000066400000000000000000000001201457457704700216160ustar00rootroot00000000000000@prefix:1_13_1440/1 CTNCCCTGCCACATTGCCCTAGTTAAAC + 57!7<';<6?5;;6:+:=)71>70<,=: cutadapt-4.7/tests/cut/discard.fastq000066400000000000000000000001341457457704700176210ustar00rootroot00000000000000@prefix:1_13_1440/1 CAAGATCTNCCCTGCCACATTGCCCTAGTTAAAC + <=A:A=57!7<';<6?5;;6:+:=)71>70<,=: cutadapt-4.7/tests/cut/dos.fastq000066400000000000000000000003471457457704700170030ustar00rootroot00000000000000@prefix:1_13_573/1 CGTCCGAANTAGCTACCACCCTGA + )3%)&&&&!.1&(6:<'67..*,: @prefix:1_13_1259/1 AGCCGCTANGACGGGTTGGCCC + ;<:&:A;A!9<<<,7:<=3=;: @prefix:1_13_1440/1 CAAGATCTNCCCTGCCACATTGCCCTAGTTAAAC + <=A:A=57!7<';<6?5;;6:+:=)71>70<,=: cutadapt-4.7/tests/cut/dual-i1.1.fastq000066400000000000000000000000621457457704700176030ustar00rootroot00000000000000@id1 ACGTACGT + zzzzzzzz @id3 ACGTACGT + zzzzzzzz cutadapt-4.7/tests/cut/dual-i1.2.fastq000066400000000000000000000000621457457704700176040ustar00rootroot00000000000000@id1 TGCATGCA + zzzzzzzz @id3 TGCATGCA + zzzzzzzz cutadapt-4.7/tests/cut/dual-i2.1.fastq000066400000000000000000000000621457457704700176040ustar00rootroot00000000000000@id2 ACGTACGT + zzzzzzzz @id4 ACGTACGT + zzzzzzzz 
cutadapt-4.7/tests/cut/dual-i2.2.fastq000066400000000000000000000000621457457704700176050ustar00rootroot00000000000000@id2 TGCATGCA + zzzzzzzz @id4 TGCATGCA + zzzzzzzz cutadapt-4.7/tests/cut/dual-unknown.1.fastq000066400000000000000000000000411457457704700207660ustar00rootroot00000000000000@id5 GGGGACGTACGT + zzzzzzzzzzzz cutadapt-4.7/tests/cut/dual-unknown.2.fastq000066400000000000000000000000411457457704700207670ustar00rootroot00000000000000@id5 TTTTTGCATGCA + zzzzzzzzzzzz cutadapt-4.7/tests/cut/empty.fasta000066400000000000000000000000001457457704700173160ustar00rootroot00000000000000cutadapt-4.7/tests/cut/empty.fastq000066400000000000000000000000001457457704700173360ustar00rootroot00000000000000cutadapt-4.7/tests/cut/example.fa000066400000000000000000000002661457457704700171210ustar00rootroot00000000000000>read1 MYSEQUENCE >read2 MYSEQUENCE >read3 MYSEQUENCE >read4 MYSEQUENCEADABTER >read5 MYSEQUENCEADAPTR >read6 MYSEQUENCEADAPPTER >read7 MYSEQUENCE >read8 MYSEQUENCE >read9 SOMETHING cutadapt-4.7/tests/cut/examplefront.fa000066400000000000000000000002641457457704700201700ustar00rootroot00000000000000>read1 >read2 MYSEQUENCEADAP >read3 SOMETHINGELSE >read4 MYSEQUENCEADABTER >read5 MYSEQUENCEADAPTR >read6 MYSEQUENCEADAPPTER >read7 MYSEQUENCE >read8 MYSEQUENCE >read9 MYSEQUENCE cutadapt-4.7/tests/cut/illumina.fastq000066400000000000000000000425641457457704700200370ustar00rootroot00000000000000@SEQ:1:1101:9010:3891#0/1 adapter start: 51 ATAACCGGAGTAGTTGAAATGGTAATAAGACGACCAATCTGACCAGCAAGG + FFFFFEDBE@79@@>@CBCBFDBDFDDDDD<@C>ADD@B;5:978@CBDDF @SEQ:1:1101:9240:3898#0/1 CCAGCAAGGAAGCCAAGATGGGAAAGGTCATGCGGCATACGCTCGGCGCCAGTTTGAATATTAGACATAATTTATCCTCAAGTAAGGGGCCGAAGCCCCTG + GHGHGHHHHGGGDHHGDCGFEEFHHGDFGEHHGFHHHHHGHEAFDHHGFHHEEFHGHFHHFHGEHFBHHFHHHH@GGGDGDFEEFC@=D?GBGFGF:FB6D @SEQ:1:1101:9207:3899#0/1 adapter start: 64 TTAACTTCTCAGTAACAGATACAAACTCATCACGAACGTCAGAAGCAGCCTTATGGCCGTCAAC + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHCFHHF @SEQ:1:1101:9148:3908#0/1 adapter start: 28 ACGACGCAATGGAGAAAGACGGAGAGCG + HHHHHHHHHHHHGHHHHGHHHHHHHHHH @SEQ:1:1101:9044:3916#0/1 adapter start: 78 AACAGAAGGAGTCTACTGCTCGCGTTGCGTCTATTATGGAAAACACCAATCTTTCCAAGCAACAGCAGGTTTCCGAGA + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHGHHHHHHHHHHHHFHEBFHFFEFHE @SEQ:1:1101:9235:3923#0/1 TTGATGCGGTTATCCATCTGCTTATGGAAGCCAAGCATTGGGGATTGAGAAAGAGTAGAAATGCCACAAGCCTCAATAGCAGGTTTAAGAGCCTCGATACG + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHBHHFHFHHHHHFHHCHHFFHHHHEHHFDHCEEHHHFHHFHFEHHHHHHHHHEHHGFHHCDCEEEFDFFHHHCFFEFE?EBFEB?3 @SEQ:1:1101:9185:3939#0/1 CGTTGAGGCTTGCGTTTATGGTACGCTGGACTTTGTAGGATACCCTCGCTTTCCTGCTCCTGTTGAGTTTATTGCTGCCGTCATTGCTTATTATGTTCATC + HHHHHHHHHHHHHHFHHEHHHDHHFGHHHCHHHHHDHHHHFECEGBD@@?A?DAFF9F<@@08?< @SEQ:1:1101:9140:3961#0/1 adapter start: 66 CAGGAGAAACATACGAAGGCGCATAACGATACCACTGACCCTCAGCAATCTTAAACTTCTTAGACG + HHHHHHHGHHHHHHHHHHHGHHHHHHHHHHHHHHHHFHHHHHHFGHHHHHHHHHHHHHHHHDHHFH @SEQ:1:1101:9073:3961#0/1 adapter start: 49 GTGGCAAGTCTGCCGCTGATAAAGGAAAGGATACTCGTGATTATCTTGC + HHHHHHHHFHHHHHHGHHHHHHHHHEHHGHHGHHHHHHHHHHGEHHHHH @SEQ:1:1101:9196:3971#0/1 adapter start: 18 ACCAGAAGGCGGTTCCTG + HHHHHHHHHFHHHHHHHH @SEQ:1:1101:9053:3973#0/1 TTCACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAGCCGCTTAAAGCTACCAGGTTTATTGCTGTTTGTTTCTATGTGGCTTAAAACGTTACCA + A39>A################################################################################################ @SEQ:1:1101:9120:3979#0/1 GGCGTTGACAGATGTATCCATCTGAATGCAATGAAGAAAACCACCATTACCAGCATTAACCGTCAAACTATCAAAATATAACGTTGACGATGTAGCTTTAG + 
HHHHHHHHHHHGHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFFGFFDHBHHHFGEHHHFGHHHEHHHGH @SEQ:1:1101:9045:3988#0/1 adapter start: 91 TAACCCTGAAACAAATGCTTAGGGATTTTATTGGTATCAGGGTTAATCGTGCCAAGAAAAGCGGCATGGTCAATATAACCAGCAGTGTTAA + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHHFHHHHHHHHHHHFHHHHHHDHHHHHHHFHFFHHGHEHHGHHHGHGHHFH @SEQ:1:1101:9418:3756#0/1 TAATCGTGCCAAGAAAAGCGGCATGGTCAATATAACCAGTAGTGTTAACAGTCGGGAGAGGAGTGGCATTAACACCATCCTTCATGAACTTAATCCACTGT + HHHHHHHHHHHHHHHHFHHHGHEHHHFHHHHFFEHHFHHHHGHHFHFHHHGHHHDHFHCHFCFBCFEFDEHHHHHG@GGGGHHGHFFEG=AB@C:EDEEEH @SEQ:1:1101:9394:3759#0/1 CCCTCGCTTTCCTGCTCCTGTTGAGGTTATTGCTGCCGTCATTGCTTATTATGTTCATCTCGGCAACATTCATACGGTCTGGCTTATCCGTGCAGAGACTG + ##################################################################################################### @SEQ:1:1101:9365:3766#0/1 AAGCACATCACCTTGAATGCCACCGGAGGCGGCTTTTTGACCGCCTCCAAACAATTTAGACATGGCGCCACCAGCAAGAGCAGAAGCAATACCGCCAGCAA + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFFHHHHFHHHHEHHFGHHHHFEHHHHFEHHFDFFAFHEFHFHDFFFFHHDH?DFABFDHADFDHHHFBF @SEQ:1:1101:9436:3776#0/1 GAAGGACGTCAATAGTCACACAGTCCTTGACGGTATAATAACCACCATCATGGCGACCATCCAAAGGATAAACATCATAGGCAGTCGGGAGGGGAGTCGGA + HHHHHHHHHHHHGHHHHHHHHHHHHHHHHHHHHFHGHHHHHHHGHHHHHHFDHHHHHHHHHHHHHFH?HHHHHFBHEH@GHHGD=EEEE88==%893A@@; @SEQ:1:1101:9354:3801#0/1 CCAGCAAGAGCAGAAGCAATACCGCCAGCAATAGCACCAAACATAAATCACCTCACTTAAGTGGCTGGAGACAAATAATCTCTTTAATAACCTGATTCAGC + HHHHHHHHHGHHGHHEGHHEHFGFEHHGHGGHHHHHHHFHGHHFHHEFFFHEHHFHHHDHE5EDFCAC+C)4&27DDA?7HFHDHEFGFG,<@7>?>??::@<5DDDDDCDCBEDCDDDDBDDDBAA1 @SEQ:1:1101:9286:3846#0/1 TGATTAAACTCCTAAGCAGAAAACCTACCGCGCTTCGCTTGGTCAACCCCTCAGCGGCAAAAATTAAAATTTTTACCGCTTCGGCGTTATAACCTCACACT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHFHHDGCEGGHHHHFHHFHEHHFHEGHGHGF @SEQ:1:1101:9403:3867#0/1 adapter start: 1 G + H @SEQ:1:1101:9341:3873#0/1 adapter start: 88 CCTAAGCAGAAAACCTACCGCGCTTCGCTTGGTCAACCCCTCAGCGGCAAAAATTAAAATTTTTACCGCTTCGGCGTTATAACCTCAC + HHHHHHHGGFHGHHHHHGHHHHFGHGHHHHEHHHFHFHFHFHH?CEEEDFCEFCDFFHFEABEDF.ECDCDFEEEEEGGFADACDHHH @SEQ:1:1101:9381:3881#0/1 adapter start: 41 ACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAGC + HHHHHHHHHHHHGHGHDHHHHHHHHFEHHHGGGGFFBGFFF @SEQ:1:1101:9360:3884#0/1 TAATACCTTTCTTTTTGGGGTAATTATACTCATCGCGAATATCCTTAAGAGGGCGTTCAGCAGCCAGCTTGCGGCAAAACTGCGTAACCGTCTTCTCGTTC + HGDEHGHDGHFGFGHFDFFF7EEEEGGFGGEGHEGHHHHFFFEHHHFHEHFBFFF>?DEEBF=?CDB:DFBGFBBGDFFHF?FAFGGABFGGFAFE6EDDC @SEQ:1:1101:9323:3894#0/1 adapter start: 100 ATACCGATATTGCTGGCGACCCTGTTTTGTATGGCAACTTGCCGCCGCGTGAAATTTCTATGAAGGATGTTTTCCGTTCTGGTGATTCGTCTAAGAAGTTG + HHGHHHHHHHHHHHHHHHHHHHEHDHHHHHGEHHFFHHFFFHHHHHHHHFHDHHBHGHB?HHDFFF?EFEHFHBFGEGGFFFDFBHFHHHHHFHHEFFFCF @SEQ:1:1101:9267:3900#0/1 adapter start: 89 GTTTTGGATTTAACCGAAGATGATTTCGATTTTCTGACGAGTAACAAAGTTTGGATTGCTACTGACCGCTCTCGTGCTCGTCGCTGCGT + HHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHHHHHHHHHFHHHHEHHEHHHFHHHHHHHHHHFHFHECFFHABGGGIGHHHGGFFGF @SEQ:1:1101:9416:3909#0/1 TAAACGTGACGATGAGGGACATAAAAAGTAAAAATGTCTACAGTAGAGTCAATAGCAAGGCCACGACGCAATGGAGAAAGACGGAGAGCGCCAACGGCGTC + HHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHHHHHHHHHHHHHHEHHGHHFEFHEFHFFDHEFHFAFFFA?GDFGFE@FFFB?B7EEFEFE?DAA## @SEQ:1:1101:9360:3917#0/1 adapter start: 68 ATGCTTGGGAGCGTGCTGGTGCTGATGCTTCCTCTGCTGGTATGGTTGACGCCGGATTTGAGAATCAA + HHHHHHHHHHHHHHHHHHHFHHHHHHHHHHFHHHHHHHFHEFHHHEHHCFFEFEE9AFFBBDCDCAEE @SEQ:1:1101:9337:3918#0/1 adapter start: 14 CATCAGCACCAGCA + FDEGGGCDBEFCDF @SEQ:1:1101:9307:3927#0/1 adapter start: 15 TCAGCGCCTTCCATG + FFFFFFFFFFFFFDF @SEQ:1:1101:9479:3929#0/1 adapter start: 9 GACAAATTA + HHHHHHHHH @SEQ:1:1101:9277:3934#0/1 adapter start: 71 
CTGTCTTTTCGTATGCAGGGCGTTGAGTTCGATAATGGTGATATGTATGTTGACGGCCATAAGGCTGCTTC + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHEHFHHHHFHHHHHFHHEHFHHHFHHFDHHFHHE @SEQ:1:1101:9442:3934#0/1 AGACCCATAATGTCAATAGATGTGGTAGAAGTCGTCATTTGGCGAGAAAGCTCAGTCTCAGGAGGAAGCGGAGCAGTCCAAATGTTTTTGAGATGGCAGCA + HHHHHHHHHGHHHHHFGHHBHHEHGFHHDHGDEGDHHHHHFHHHHHAHHH?FEEBEFDFBEBEEFEHFE7ECCDCG=FDFFDFFFHHHHFEEBEF;BEAEG @SEQ:1:1101:9329:3935#0/1 AGATGGATAACCGCATCAAGCTCTTGGAAGAGATTCTGTCTTTTCGTATGCAGGGCGTTGAGTTCGATAATGGTGATATGTATGTTGACGGCCATAAGGCT + GFGGGEEGDHHHGGEHHHHHHGGFHHEAHHAGDEGEGGEDG@GGGHHGHHFGGH6@CADDHHBEEE@8EBGEEFGGGHFHHHHGEGFGGEFBGEDDE?E7E @SEQ:1:1101:9445:3956#0/1 adapter start: 81 TGCAACAACTGAACGGACTGGAAACACTGGTCATAATCATGGTGGCGAATAAGTACGCGTTCTTGCAAATCACCAGAAGGC + HHHHHHHHHGFHHHHHHHHHHHHHHGHHHHFHHHHHHHHHHHFGHHHFGHHHHFGHHFHEHHHHHHHHHHHHGBHHHHGFG @SEQ:1:1101:9357:3957#0/1 TTAATCGTGCCAAGAAAAGCGGCATGGTCAATATAACCAGTAGTGTTAACAGTCGGGAGAGGAGTGGCATTAACACCATCCTTCATGAACTTAATCCACTG + HHHHHHGHHHHHHHHHHGHEHHHHHGHEHHHHHHHHHHHHHHGHEBGGFGFFFFFBH?HCEEEDEFHFHBHFHCFHHGGGHEGHEGHEF@GHHFHEDHH;H @SEQ:1:1101:9309:3957#0/1 adapter start: 72 GTCAGATATGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTC + HHHHHHHHHHHHHHHHHHHHHHHHHGHFHHHFHHHHHHHHGHHHFHHHHHHHFHDHHHHHHFHCHHEAHHDG @SEQ:1:1101:9425:3960#0/1 CTGACGCAGAAGAAAACGTGCGTCAAAAATTACGTGCAGAAGGAGTGATGTAATGTCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGC + 8?8?C?BC@BD=ABB==BD?CADD=AD>C@@CCBBDD@B/143'3.>>@9BCBDDDC8@@;@???FB=DFB=>C=EEFFFFFEFFFFF:FEF@FEF @SEQ:1:1101:9363:3989#0/1 adapter start: 95 CCTCCAAGATTTGGAGGCATGAAAACATACAATTGGGAGGGTGTCAATCCTGACGGTTATTTCCTAGACAAATTAGAGCCAATACCATCAGCTTTGCCTAA + HHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHGHHHHHHHGGEEGB;5 @SEQ:1:1101:9554:3781#0/1 CACGCTCTTTTAAAATGTCAACAAGAGAATCTCTACCATGAACAAAATGTGACTCATATCTAAACCAGTCCTTGACGAACGTGCCAAGCATATTAAGCCAC + HHHHHHHHHHHHHGGHHHHHHGHFHHHHHHEHHFHHHEHHHHHHHEHHGHHHHEHHHGFHHHEHHHHHHEEFFEDFEDFF>ACBAHGHHHHECEGHBCFEE @SEQ:1:1101:9695:3783#0/1 adapter start: 52 AATAACCCTGAAACAAATGCTTAGGGATTTTATTGGTATCAGGGTTAATCGT + HHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHHHHGHHHHHHHHHHF @SEQ:1:1101:9572:3788#0/1 ACCAACACGGCGAGTACAACGGCCCAGCTCAGAAGCGAGAACCAGCTGCGCTTGGGTGGGGCGATGGTGATGGTTTGCATGTTTGGCTCCGGTCTGTAGGC + FFFFFFFFF=EBEB0A@A@>A?;FED;;<7??A>>9A>?DA1ADD?D:FF:BC;@############## @SEQ:1:1101:9601:3793#0/1 GCCGCTAATCAGGTTGTTTCTGTTGGTGCTGATATTGCTTTTGATGCCGACCCTAAATTTTTTGCCTGTTTGGTTCGCTTTGAGTCTTCTTCGGTTCCGAC + HHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHEHEGHFHHHHHHHHFHFHCHHHFHFFHHHHHH@HHHHHHGHHHFHHGFHHCFHEGGGFEGE?GCDAD6AD @SEQ:1:1101:9634:3800#0/1 TTTATGCGGACACTTCCTACAGGTAGCGTTGACCCTAATTTTGGTCGTCGGGTACGCAATCGCCGCCAGTTAAATAGCTTGCAAAATACGTGGCCTTATGG + HHGHFHFHHHHCGHHFHHHHHHGEHHHHHGFBEFHHFEHDHHHGFHHEHHFF9ECD?CEEHEDF?GEEDEEG @SEQ:1:1101:9501:3800#0/1 adapter start: 42 TGACCACCTACATACCAAAGACGAGCGCCTTTACGCTTGCCT + HHHHHHHHHHHHHHHHFHHHHHHHHFHHHHHHHHHHHHHHHH @SEQ:1:1101:9703:3807#0/1 adapter start: 27 TAATAACCTGATTCAGCGAAACCAATC + HHHHHHHHHHHHHHHHHHHHHHGHHHG @SEQ:1:1101:9728:3808#0/1 adapter start: 7 CAGAAAA + HHHFHHH @SEQ:1:1101:9676:3812#0/1 adapter start: 1 T + H @SEQ:1:1101:9620:3815#0/1 TCGCGTAGAGGCTTTGCTATTCAGCGTTTGATGAATGCAATGCGACAGGCTCATGCTGATGGTTGGTTTATCGTTTTTGACACTCTCACGTTGGCTGACGA + HHHHHHHHHHGGHHHGHHGHHHHHHHHHHGFHGHHHHHHHHHFHDHHHDDHFHFHFHHHHFF9EFF>DG?FCBCDFFFEBFFE@DFEGGEEG?GF>>:;@A @SEQ:1:1101:9720:3834#0/1 adapter start: 74 TAGACATTTTTACTTTTTATGTCCCTCATCGTCACGTTTATGGTGAACAGTGGATTAAGTTCATGAAGGATGGT + HGHHHHHHHHHHHHHHHGGHEGGFGHFGHFHHDGHGHGHHHHHHHHHHFHHHHHFHFHFFHEFHF=FFHFHHFF @SEQ:1:1101:9635:3844#0/1 adapter start: 4 GACC + HHHH @SEQ:1:1101:9744:3849#0/1 adapter start: 
55 AAACATAGTGCCATGCTCAGGAACAAAGAAACGCGGCACAGAATGTTTATAGGTC + HHHHHHHGCHHFHHFHHFFHEHFGCHHGDGHEFFHFHEHHGBBGFCDGFEEFDCF @SEQ:1:1101:9725:3850#0/1 ATAACCCTGAAACAAATGCTTAGGGATTTTATTGGTATCAGGGTTAATCGTGCCAAGAAAAGCGGCATGGTCAATATAACCAGTAGTGTTAACAGTCGGGA + FDGGGDGGGEGGGGGBGBEGFFFDFFFFGGFGGGGFBGGGGGEFDFFGEGFFEFEDGGEEF9DCF?EFBBEDBBGFGGEGGGGCFGFEB@B7C>CDEEE## @SEQ:1:1101:9544:3854#0/1 TAGCGGTAAAGTTAGACCAAACCATGAAACCAACATAAACATTATTGCCCGGCGTACGGGGAAGGACGTCAATAGTCACACAGTCCTTGACGGTATAATAA + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHHHHHHHFFHHHHHHHHHBFHHHHHFHHHHHHHHHHHHHHFCHHHBHE @SEQ:1:1101:9581:3856#0/1 GGGCGGTGGTCTATAGTGTTATTAATATCAAGTTGGGGGAGCACATTGTAGCATTGTGCCAATTCATCCATTAACTTCTCAGTAACAGATACAAACTCATC + HHHHHHEHHHHHHHGHHHHHHHHHHHHHHHHHHHHHHHFHHHHGHHHHHHHHHHHHHHHGGHHHFHHHHHGHFGHGEGHHHHHHFEHFHGDGGFFGHH@DH @SEQ:1:1101:9649:3858#0/1 adapter start: 33 CCTCCAAACAATTTAGACATGGCGCCACCAGCA + BFEEEE@@BA@3>8<>CCDDBEE@ @SEQ:1:1101:9616:3862#0/1 adapter start: 91 GAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAAATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGC + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHHEHHHHHHHHHHHHHHFHHHHHHHFFFHFDHHEHHHGHHHHGDEHHGHHEGH @SEQ:1:1101:9696:3866#0/1 CAAGTTGCCATACAAAACAGGGTCGCCAGCAATATCGGTATAAGTCAAAGCACCTTTAGCGTTAAGGTACTGAATCTCTTTAGTCGCAGTAGGCGGAAAAC + HHHHHHHHHHHHHHHHHHHHEHEHHHEHHHHFHHHHHHFHHHFHFHHHHHHHHFHHHHFHHFEHBHFEHHHHCEEHHFHHHHHHHHHHHHEHHHHCAFEFG @SEQ:1:1101:9512:3869#0/1 GCTCGACGCCATTAATAATGTTTTCCGTAAATTCAGCGCCTTCCATGATGAGACAGGCCGTTTGAATGTTGACGGGATGAACATAATAAGCAATGACGGCA + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHFHHHDHHHEHHFFFFFFHFAFEFH?E@FFGGGFGHFHAEFGFFFCEEFF @SEQ:1:1101:9723:3870#0/1 adapter start: 66 CTTTAGCAGCAAGGTATATATCTGACTTTTTGTTAACGTATTTAGCCACATAGCAACCAACAGACA + ################################################################## @SEQ:1:1101:9667:3874#0/1 CTGCTGCATTTCCTGAGCTTAATGCTTGGGAGCGTGCTGGTGCTGATGCTTCCTCTGCTGGTATGGTTGACGCCGGATTTGAGAATCAAAAAGAGCTTACT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHAHHHHEHHD=DAD>D6ADGE@EBE;@?BCGGE?4>ADAAC @SEQ:1:1101:9565:3879#0/1 adapter start: 24 AGCCTTATGGCCGTCAACATACAT + HHHHHHHHHHHHHHHHHFHHGFFH @SEQ:1:1101:9721:3885#0/1 adapter start: 51 TTCCTCAAACGCTTGTTCGGTGGATAAGTTATGGCATTAATCGATTTATTT + >BC?:A?=<>::A=528882.53)5.77;407)*9@:AA8CAA######## @SEQ:1:1101:9707:3894#0/1 adapter start: 40 AACACCATCCTTCATGAACTTAATCCACTGTTCACCATAA + F@F8DEE@EEBCCCCFFEFDDC=DCCFFF=ADD=D>@AA@ @SEQ:1:1101:9560:3900#0/1 adapter start: 6 AGAAGT + GGGGGF @SEQ:1:1101:9696:3913#0/1 adapter start: 2 CC + HH @SEQ:1:1101:9574:3914#0/1 adapter start: 5 GAACA + HHHHH @SEQ:1:1101:9508:3931#0/1 adapter start: 91 TAGAGAACGAGAAGACGGTTACGCAGTTTTGCCGCAAGCTGGCTGCTGAACGCCCTCTTAAGGATATTCGCGATGAGTATAATTACCCCAA + HGHHHHHHHHHHHHHHHHHHGHHHHHFHHHGHHHHFHHHHHHHHHD?ACFEF9FFEEBHBAEFB?E><>B@CBCD==BB @SEQ:1:1101:9903:3754#0/1 ACCAAAATTAGGGTCAACGCTACCTGTAGGAAGTGTCCGCATAAAGTGCACCGCATGGAAATGAAGACGGCCATCAGCTGTACCATACTCAGGCACACAAA + GFEGGGGGBGE@EAEEGGFGGEGGFGEFFGFGFFGGEGGGGEFGCFCEFBF7FGEGEF?BFEEFDFFE??AADD+D@C@CGFCE6FDFFDFBGFDD@DAAD @SEQ:1:1101:9878:3755#0/1 adapter start: 32 AGAACGTGAAAAAGCGTCCTGCGTGTAGCGAA + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH @SEQ:1:1101:9833:3756#0/1 adapter start: 65 TCATCGTCACGTTTATGGTGAACAGTGGATTAAGTTCATGAAGGATGGTGTTAATGCCACTCCTC + HHHHHHHHHHHHHHHHHFHHHHHHHHHHHHHHHHFHHHHGHHFHHHHHEHEHHHHFHEHHHEHFH @SEQ:1:1101:9991:3777#0/1 GCTTTGAGTCTTCTTCGGTTCCGACTACCCTCCCGACTGCCTATGATGTTTATCCTTTGGATGGTCGCCATGATGGTGGTTATTATACCGTCAAGGACTGT + HHHHHHHHHHHHHHHHHHHHHHGHHHGHHHHHHHGHHHHHHGHHHHHHHHHHHHHFHHFFDFFFCFFDHCFF;BFGEFGEGFGGFFF.CFDCCEDB=CBC@ 
cutadapt-4.7/tests/cut/illumina.info.txt000066400000000000000000000610051457457704700204610ustar00rootroot00000000000000SEQ:1:1101:9010:3891#0/1 adapter start: 51 1 51 81 ATAACCGGAGTAGTTGAAATGGTAATAAGACGACCAATCTGACCAGCAAGG GCCTAACTTCTTAGACTGCCTTAAGGACGT AAGCCAAGATGGGAAAGGTC adapt FFFFFEDBE@79@@>@CBCBFDBDFDDDDD<@C>ADD@B;5:978@CBDDF FDB4B?DB21;84?DDBC9DEBAB;=@<@@ B@@@@B>CCBBDE98>>0@7 SEQ:1:1101:9240:3898#0/1 -1 CCAGCAAGGAAGCCAAGATGGGAAAGGTCATGCGGCATACGCTCGGCGCCAGTTTGAATATTAGACATAATTTATCCTCAAGTAAGGGGCCGAAGCCCCTG GHGHGHHHHGGGDHHGDCGFEEFHHGDFGEHHGFHHHHHGHEAFDHHGFHHEEFHGHFHHFHGEHFBHHFHHHH@GGGDGDFEEFC@=D?GBGFGF:FB6D SEQ:1:1101:9207:3899#0/1 adapter start: 64 1 64 94 TTAACTTCTCAGTAACAGATACAAACTCATCACGAACGTCAGAAGCAGCCTTATGGCCGTCAAC GCCTAACTTCTTAGACTGCCTTAAGGACGT ATACATA adapt HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHCFHHF HHFHFFFFFBHHGHHHFFHHFHGGHHDEBF GCDCEEEFDFFHHHCFFEFE?EBFEB?3 SEQ:1:1101:9185:3939#0/1 -1 CGTTGAGGCTTGCGTTTATGGTACGCTGGACTTTGTAGGATACCCTCGCTTTCCTGCTCCTGTTGAGTTTATTGCTGCCGTCATTGCTTATTATGTTCATC HHHHHHHHHHHHHHFHHEHHHDHHFGHHHCHHHHHDHHHHFECEGBD@@?A?DAFF9F<@@08?< SEQ:1:1101:9140:3961#0/1 adapter start: 66 1 66 96 CAGGAGAAACATACGAAGGCGCATAACGATACCACTGACCCTCAGCAATCTTAAACTTCTTAGACG GCCTAACTTCTTAGACTGCCTTAAGGACGT AATCA adapt HHHHHHHGHHHHHHHHHHHGHHHHHHHHHHHHHHHHFHHHHHHFGHHHHHHHHHHHHHHHHDHHFH HHHEHHFHFHHHHHGHHHHHFHGHGHHHHH EHCFG SEQ:1:1101:9073:3961#0/1 adapter start: 49 1 49 79 GTGGCAAGTCTGCCGCTGATAAAGGAAAGGATACTCGTGATTATCTTGC GCCTAACTTCTTAGACTGCCTTAAGGACGT TGCTGCATTTCCTGAGCTTAAT adapt HHHHHHHHFHHHHHHGHHHHHHHHHEHHGHHGHHHHHHHHHHGEHHHHH GFHFFGHFHHGHHCHHFDGHHHHHFHHHFC DFGHHHHHHCFGHHEGEFBGGB SEQ:1:1101:9196:3971#0/1 adapter start: 18 1 18 48 ACCAGAAGGCGGTTCCTG GCCTAACTTCTTAGACTGCCTTAAGGACGT AATGAATGGGAAGCCTTCAAGAAGGTGATAAGCAGGAGAAACATACGAAGGCG adapt HHHHHHHHHFHHHHHHHH HGHHHGHHHHHHHFHHHHHHHHHHHEHHHH HHHHHHHHFHHGHHHHHEHFHHHHBHEHHGEHFHFHHFHHHHFBDFHF?HHHH SEQ:1:1101:9053:3973#0/1 -1 TTCACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAGCCGCTTAAAGCTACCAGGTTTATTGCTGTTTGTTTCTATGTGGCTTAAAACGTTACCA A39>A################################################################################################ SEQ:1:1101:9120:3979#0/1 -1 GGCGTTGACAGATGTATCCATCTGAATGCAATGAAGAAAACCACCATTACCAGCATTAACCGTCAAACTATCAAAATATAACGTTGACGATGTAGCTTTAG HHHHHHHHHHHGHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFFGFFDHBHHHFGEHHHFGHHHEHHHGH SEQ:1:1101:9045:3988#0/1 adapter start: 91 1 91 101 TAACCCTGAAACAAATGCTTAGGGATTTTATTGGTATCAGGGTTAATCGTGCCAAGAAAAGCGGCATGGTCAATATAACCAGCAGTGTTAA GCCTAACTTC adapt HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHHFHHHHHHHHHHHFHHHHHHDHHHHHHHFHFFHHGHEHHGHHHGHGHHFH GHHFFFEFFE SEQ:1:1101:9418:3756#0/1 -1 TAATCGTGCCAAGAAAAGCGGCATGGTCAATATAACCAGTAGTGTTAACAGTCGGGAGAGGAGTGGCATTAACACCATCCTTCATGAACTTAATCCACTGT HHHHHHHHHHHHHHHHFHHHGHEHHHFHHHHFFEHHFHHHHGHHFHFHHHGHHHDHFHCHFCFBCFEFDEHHHHHG@GGGGHHGHFFEG=AB@C:EDEEEH SEQ:1:1101:9394:3759#0/1 -1 CCCTCGCTTTCCTGCTCCTGTTGAGGTTATTGCTGCCGTCATTGCTTATTATGTTCATCTCGGCAACATTCATACGGTCTGGCTTATCCGTGCAGAGACTG ##################################################################################################### SEQ:1:1101:9365:3766#0/1 -1 AAGCACATCACCTTGAATGCCACCGGAGGCGGCTTTTTGACCGCCTCCAAACAATTTAGACATGGCGCCACCAGCAAGAGCAGAAGCAATACCGCCAGCAA HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFFHHHHFHHHHEHHFGHHHHFEHHHHFEHHFDFFAFHEFHFHDFFFFHHDH?DFABFDHADFDHHHFBF SEQ:1:1101:9436:3776#0/1 -1 GAAGGACGTCAATAGTCACACAGTCCTTGACGGTATAATAACCACCATCATGGCGACCATCCAAAGGATAAACATCATAGGCAGTCGGGAGGGGAGTCGGA HHHHHHHHHHHHGHHHHHHHHHHHHHHHHHHHHFHGHHHHHHHGHHHHHHFDHHHHHHHHHHHHHFH?HHHHHFBHEH@GHHGD=EEEE88==%893A@@; 
SEQ:1:1101:9354:3801#0/1 -1 CCAGCAAGAGCAGAAGCAATACCGCCAGCAATAGCACCAAACATAAATCACCTCACTTAAGTGGCTGGAGACAAATAATCTCTTTAATAACCTGATTCAGC HHHHHHHHHGHHGHHEGHHEHFGFEHHGHGGHHHHHHHFHGHHFHHEFFFHEHHFHHHDHE5EDFCAC+C)4&27DDA?7HFHDHEFGFG,<@7>?>??EDC@FDDDDCDFE?DEEFGFCC@;@D SEQ:1:1101:9477:3819#0/1 adapter start: 28 1 28 58 ATAAAGGAAAGGATACTCGTGATTATCT GCCTAACTTCTTAGACTGCCTTAAGGACGT TGCTGCTGCATTTCCTGAGCTTAATGCTTGGGAGCGTGCTGGT adapt HHHHHHHHHHHHHHHHGHHHHHHHHHHH HHHHHHHHHHHHHFHHFHFHHHHHHHEHHH HHEHHHHHHEHHDHDHBHHGCEHHHHHGGEFGG=DGDGCGC68 SEQ:1:1101:9428:3823#0/1 -1 CGTCAGTAAGAACGTCAGTGTTTCCTGCGCGTACACGCAAGGTAAACGCGAACAATTCAGCGGCTTTAACCGGACGCTCGACGCCATTAATAATGTTTTCC HHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHFGHGHHHHHHHEHHHHFHHHHHFHHHFHH?FHEFFFDGFDAFDCFAFDBFGBFGFHHHHHHHHHFHFH;8 SEQ:1:1101:9403:3824#0/1 adapter start: 70 1 70 100 GCTCAGGAACAAAGAAACGCGGCACAGAATGTTTATAGGTCTGTTGAACACGACCAGAAAACTGGCCTAA GCCTAACTTCTTAGACTGCCTTAAGGACGT C adapt HHHHHHHHHHHHHHHHHHHEHHHHHHHHHHHHHHHHGDHDHHHHHHHHHGHHHHGHEHGHHHHFFHHHHH EHFHFEHHFGBFFFDHCEHHHHGH=HHH=G E SEQ:1:1101:9362:3824#0/1 -1 ACCATGAAACCAACATAAACATTATTGCCCGGCGTACGGGGAAGGACGTCAATAGTCACACAGTCCTTGACGGTATAATAACCACCATCATGGCGACCATC HHHHHHHGHHHHHHHHHHHHHHHGHHHHHFHHHHHHHHFHHFHHHFHHHHHHHHHFHEHHHFHBHFHHHFCEFDEHHHHGHHHHHHHHHEFFFHHFFFDAG SEQ:1:1101:9480:3842#0/1 adapter start: 54 1 54 84 GTACGGATTGTTCAGTAACTTGACTCATGATTTCTTACCTATTAGTGGTTGAAC GCCTAACTTCTTAGACTGCCTTAAGGACGT CGCATCGGACTCAGATA adapt BDCCC@5<<<@BBB7DDDDD<<<9>::@<5DDDDDCDCBEDCDDDDBDDDBAA1 /82638?D=CD2*><6BCFGFF?E?FEFFHBBFEE3E, ;/97-0(6,?=BB@A@D9D########### SEQ:1:1101:9360:3884#0/1 -1 TAATACCTTTCTTTTTGGGGTAATTATACTCATCGCGAATATCCTTAAGAGGGCGTTCAGCAGCCAGCTTGCGGCAAAACTGCGTAACCGTCTTCTCGTTC HGDEHGHDGHFGFGHFDFFF7EEEEGGFGGEGHEGHHHHFFFEHHHFHEHFBFFF>?DEEBF=?CDB:DFBGFBBGDFFHF?FAFGGABFGGFAFE6EDDC SEQ:1:1101:9323:3894#0/1 adapter start: 100 -1 ATACCGATATTGCTGGCGACCCTGTTTTGTATGGCAACTTGCCGCCGCGTGAAATTTCTATGAAGGATGTTTTCCGTTCTGGTGATTCGTCTAAGAAGTTG HHGHHHHHHHHHHHHHHHHHHHEHDHHHHHGEHHFFHHFFFHHHHHHHHFHDHHBHGHB?HHDFFF?EFEHFHBFGEGGFFFDFBHFHHHHHFHHEFFFCF SEQ:1:1101:9267:3900#0/1 adapter start: 89 1 89 101 GTTTTGGATTTAACCGAAGATGATTTCGATTTTCTGACGAGTAACAAAGTTTGGATTGCTACTGACCGCTCTCGTGCTCGTCGCTGCGT GCCTAACTTCTT adapt HHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHHHHHHHHHFHHHHEHHEHHHFHHHHHHHHHHFHFHECFFHABGGGIGHHHGGFFGF FCACFECEB5<; SEQ:1:1101:9416:3909#0/1 -1 TAAACGTGACGATGAGGGACATAAAAAGTAAAAATGTCTACAGTAGAGTCAATAGCAAGGCCACGACGCAATGGAGAAAGACGGAGAGCGCCAACGGCGTC HHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHHHHHHHHHHHHHHEHHGHHFEFHEFHFFDHEFHFAFFFA?GDFGFE@FFFB?B7EEFEFE?DAA## SEQ:1:1101:9360:3917#0/1 adapter start: 68 1 68 98 ATGCTTGGGAGCGTGCTGGTGCTGATGCTTCCTCTGCTGGTATGGTTGACGCCGGATTTGAGAATCAA GCCTAACTTCTTAGACTGCCTTAAGGACGT AAA adapt HHHHHHHHHHHHHHHHHHHFHHHHHHHHHHFHHHHHHHFHEFHHHEHHCFFEFEE9AFFBBDCDCAEE EFHD??>E4@EC>74<-5@############## SEQ:1:1101:9307:3927#0/1 adapter start: 15 1 15 45 TCAGCGCCTTCCATG GCCTAACTTCTTAGACTGCCTTAAGGACGT ATGAGACAGGCCGTTTGAATGTTGACGGGATGAACATAATAAGCAATGACGGCAGC adapt FFFFFFFFFFFFFDF =EEEEDFFFFBEEEEFFFFFFFFFFFDEEB DFFFFDFFFFEF@FFFBEFFBFFEF--@@EFHFHBHFHCFHHGGGHEGHEGHEF@GHHFHEDHH;H SEQ:1:1101:9309:3957#0/1 adapter start: 72 1 72 101 GTCAGATATGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTC GCCTAACTTCTTAGACTGCCTTAAGGACG adapt HHHHHHHHHHHHHHHHHHHHHHHHHGHFHHHFHHHHHHHHGHHHFHHHHHHHFHDHHHHHHFHCHHEAHHDG GHFHFHDHHHGHHEHHFFH?HHHFDGGG? 
SEQ:1:1101:9425:3960#0/1 -1 CTGACGCAGAAGAAAACGTGCGTCAAAAATTACGTGCAGAAGGAGTGATGTAATGTCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGC 8?8?C?BC@BD=ABB==BD?CADD=AD>C@@CCBBDD@B/143'3.>>@9BCBDDDC8@@;@???FB=DFB=>C=EEFFFFFEFFFFF:FEF@FEF EFBGGGFFGHFFHD5DGB=>>@;A>C5?A SEQ:1:1101:9363:3989#0/1 adapter start: 95 -1 CCTCCAAGATTTGGAGGCATGAAAACATACAATTGGGAGGGTGTCAATCCTGACGGTTATTTCCTAGACAAATTAGAGCCAATACCATCAGCTTTGCCTAA HHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHGHHHHHHHGB B;FBFFEGGEGB==EGFHHGEEGB;5 SEQ:1:1101:9554:3781#0/1 -1 CACGCTCTTTTAAAATGTCAACAAGAGAATCTCTACCATGAACAAAATGTGACTCATATCTAAACCAGTCCTTGACGAACGTGCCAAGCATATTAAGCCAC HHHHHHHHHHHHHGGHHHHHHGHFHHHHHHEHHFHHHEHHHHHHHEHHGHHHHEHHHGFHHHEHHHHHHEEFFEDFEDFF>ACBAHGHHHHECEGHBCFEE SEQ:1:1101:9695:3783#0/1 adapter start: 52 1 52 82 AATAACCCTGAAACAAATGCTTAGGGATTTTATTGGTATCAGGGTTAATCGT GCCTAACTTCTTAGACTGCCTTAAGGACGT GCCAAGAAAAGCGGCATGG adapt HHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHHHHGHHHHHHHHHHF HHHHHHFHGEHEHHHHHGHHHHHHHHHFHH FHGGHHHHHHGGHGFHHHG SEQ:1:1101:9572:3788#0/1 -1 ACCAACACGGCGAGTACAACGGCCCAGCTCAGAAGCGAGAACCAGCTGCGCTTGGGTGGGGCGATGGTGATGGTTTGCATGTTTGGCTCCGGTCTGTAGGC FFFFFFFFF=EBEB0A@A@>A?;FED;;<7??A>>9A>?DA1ADD?D:FF:BC;@############## SEQ:1:1101:9601:3793#0/1 -1 GCCGCTAATCAGGTTGTTTCTGTTGGTGCTGATATTGCTTTTGATGCCGACCCTAAATTTTTTGCCTGTTTGGTTCGCTTTGAGTCTTCTTCGGTTCCGAC HHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHEHEGHFHHHHHHHHFHFHCHHHFHFFHHHHHH@HHHHHHGHHHFHHGFHHCFHEGGGFEGE?GCDAD6AD SEQ:1:1101:9634:3800#0/1 -1 TTTATGCGGACACTTCCTACAGGTAGCGTTGACCCTAATTTTGGTCGTCGGGTACGCAATCGCCGCCAGTTAAATAGCTTGCAAAATACGTGGCCTTATGG HHGHFHFHHHHCGHHFHHHHHHGEHHHHHGFBEFHHFEHDHHHGFHHEHHFF9ECD?CEEHEDF?GEEDEEG SEQ:1:1101:9501:3800#0/1 adapter start: 42 1 42 72 TGACCACCTACATACCAAAGACGAGCGCCTTTACGCTTGCCT GCCTAACTTCTTAGACTGCCTTAAGGACGT TTAGTACCTCGCAACGGCTGCGGACGACC adapt HHHHHHHHHHHHHHHHFHHHHHHHHFHHHHHHHHHHHHHHHH HHHHHFHHHHHHHHHHHHHHFBHAEDBEFB BEF=ADEEGGGEFCC>B1CCDCB7FGFFE SEQ:1:1101:9703:3807#0/1 adapter start: 27 1 27 57 TAATAACCTGATTCAGCGAAACCAATC GCCTAACTTCTTAGACTGCCTTAAGGACGT CGCGGCATTTAGTAGCGGTAAAGTTAGACCAAACCATGAAACCA adapt HHHHHHHHHHHHHHHHHHHHHHGHHHG HHHFHGFHHHHFFHHHHHDHHHHBGFEFHH HFHFHFDHFDFFFEHHGHDHHGHHEHHG@E?FDGBEBDGGFFGF SEQ:1:1101:9728:3808#0/1 adapter start: 7 1 7 37 CAGAAAA GCCTAACTTCTTAGACTGCCTTAAGGACGT CCTACCGCGCTTCGCTTGGTCAACCCCTCAGCGGCAAAAATTAAAATTTTTACCGCTTCGGCGT adapt HHHFHHH HHHHHHHHHHHHHHHFHHHHHHHHHHHFB8 @B9C?CC@CHCFFFHF=FEED<4:?:>@,@;@>.>6;+?&@><:BC?DE@=7@### SEQ:1:1101:9676:3812#0/1 adapter start: 1 1 1 31 T GCCTAACTTCTTAGACTGCCTTAAGGACGT TATTGCCCGGCGTACGGGGAAGGACGTCAATAGTCACACAGTCCTTGACGGTATAATAACCACCATCATG adapt H HHHHHHHHHHHHHHHHHHHHHHHFHHHHHH HDHFHHHHHHHECHHEHEHHH=HHFHHFHFHHFHFHGFFEECFFHEFFGFGHFFEHHFHHFFFHFDG?FCBCDFFFEBFFE@DFEGGEEG?GF>>:;@A SEQ:1:1101:9720:3834#0/1 adapter start: 74 1 74 101 TAGACATTTTTACTTTTTATGTCCCTCATCGTCACGTTTATGGTGAACAGTGGATTAAGTTCATGAAGGATGGT GCCTAACTTCTTAGACTGCCTTAAGGA adapt HGHHHHHHHHHHHHHHHGGHEGGFGHFGHFHHDGHGHGHHHHHHHHHHFHHHHHFHFHFFHEFHF=FFHFHHFF HFGAGGHHDHGHBHHHEGDGC>FEC@D SEQ:1:1101:9635:3844#0/1 adapter start: 4 1 4 34 GACC GCCTAACTTCTTAGACTGCCTTAAGGACGT ATCCAAAGGATAAACATCATAGGCAGTCGGGAGGGTAGTCGGAACCGAAGAAGACTCAAAGCGAACC adapt HHHH GHHHHHHHHHGHHHHHHGHHGHHHGHGHHH HFHHH;GGCGFH?HHFHEHHFFHFHFFFHHFDHHHHHHHHHEGHHHHGHGHEHHHHC@?GFEGBGHH SEQ:1:1101:9744:3849#0/1 adapter start: 55 1 55 85 AAACATAGTGCCATGCTCAGGAACAAAGAAACGCGGCACAGAATGTTTATAGGTC GCCTAACTTCTTAGACTGCCTTAAGGACGT TGTTGAACACGACCAG adapt HHHHHHHGCHHFHHFHHFFHEHFGCHHGDGHEFFHFHEHHGBBGFCDGFEEFDCF FGEEEHEHFHHHCFF?EEFDEFD6FHGEHH HEHHHBBE?:CCDA7G SEQ:1:1101:9725:3850#0/1 -1 
ATAACCCTGAAACAAATGCTTAGGGATTTTATTGGTATCAGGGTTAATCGTGCCAAGAAAAGCGGCATGGTCAATATAACCAGTAGTGTTAACAGTCGGGA FDGGGDGGGEGGGGGBGBEGFFFDFFFFGGFGGGGFBGGGGGEFDFFGEGFFEFEDGGEEF9DCF?EFBBEDBBGFGGEGGGGCFGFEB@B7C>CDEEE## SEQ:1:1101:9544:3854#0/1 -1 TAGCGGTAAAGTTAGACCAAACCATGAAACCAACATAAACATTATTGCCCGGCGTACGGGGAAGGACGTCAATAGTCACACAGTCCTTGACGGTATAATAA HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHHHHHHHFFHHHHHHHHHBFHHHHHFHHHHHHHHHHHHHHFCHHHBHE SEQ:1:1101:9581:3856#0/1 -1 GGGCGGTGGTCTATAGTGTTATTAATATCAAGTTGGGGGAGCACATTGTAGCATTGTGCCAATTCATCCATTAACTTCTCAGTAACAGATACAAACTCATC HHHHHHEHHHHHHHGHHHHHHHHHHHHHHHHHHHHHHHFHHHHGHHHHHHHHHHHHHHHGGHHHFHHHHHGHFGHGEGHHHHHHFEHFHGDGGFFGHH@DH SEQ:1:1101:9649:3858#0/1 adapter start: 33 1 33 63 CCTCCAAACAATTTAGACATGGCGCCACCAGCA GCCTAACTTCTTAGACTGCCTTAAGGACGT AGAGCAGAAGCAATACCGCCAGCAATAGCAACAAACAT adapt BFEEEE@@BA@3>8<>CCDDBEE@ DEFFDDFE=EEB@EDEEFDFDECEEBEB:C -@<698<@BBA@DCBDDFCEBFCCD;DC=D@C###### SEQ:1:1101:9616:3862#0/1 adapter start: 91 1 91 101 GAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAAATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGC GCCTAACTTC adapt HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHHEHHHHHHHHHHHHHHFHHHHHHHFFFHFDHHEHHHGHHHHGDEHHGHHEGH GCHHHHEHFG SEQ:1:1101:9696:3866#0/1 -1 CAAGTTGCCATACAAAACAGGGTCGCCAGCAATATCGGTATAAGTCAAAGCACCTTTAGCGTTAAGGTACTGAATCTCTTTAGTCGCAGTAGGCGGAAAAC HHHHHHHHHHHHHHHHHHHHEHEHHHEHHHHFHHHHHHFHHHFHFHHHHHHHHFHHHHFHHFEHBHFEHHHHCEEHHFHHHHHHHHHHHHEHHHHCAFEFG SEQ:1:1101:9512:3869#0/1 -1 GCTCGACGCCATTAATAATGTTTTCCGTAAATTCAGCGCCTTCCATGATGAGACAGGCCGTTTGAATGTTGACGGGATGAACATAATAAGCAATGACGGCA HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHFHHHDHHHEHHFFFFFFHFAFEFH?E@FFGGGFGHFHAEFGFFFCEEFF SEQ:1:1101:9723:3870#0/1 adapter start: 66 1 66 96 CTTTAGCAGCAAGGTATATATCTGACTTTTTGTTAACGTATTTAGCCACATAGCAACCAACAGACA GCCTAACTTCTTAGACTGCCTTAAGGACGT TATAA adapt ################################################################## ############################## ##### SEQ:1:1101:9667:3874#0/1 -1 CTGCTGCATTTCCTGAGCTTAATGCTTGGGAGCGTGCTGGTGCTGATGCTTCCTCTGCTGGTATGGTTGACGCCGGATTTGAGAATCAAAAAGAGCTTACT HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHAHHHHEHHD=DAD>D6ADGE@EBE;@?BCGGE?4>ADAAC SEQ:1:1101:9565:3879#0/1 adapter start: 24 1 24 54 AGCCTTATGGCCGTCAACATACAT GCCTAACTTCTTAGACTGCCTTAAGGACGT ATCACCATTATCGAACTCAACGCCCTGCATACGAAAAGACAGAATCT adapt HHHHHHHHHHHHHHHHHFHHGFFH HHHHHHHHHDGHHFHFHHHHHFECHFFHHH HHEHFCFFFFHEHDEFHHCHHEG?GFEGGEGHHHHHH?HH?EFFFFF SEQ:1:1101:9721:3885#0/1 adapter start: 51 1 51 81 TTCCTCAAACGCTTGTTCGGTGGATAAGTTATGGCATTAATCGATTTATTT GCCTAACTTCTTAGACTGCCTTAAGGACGT ATCTCGCGGAAGAAAAACAC adapt >BC?:A?=<>::A=528882.53)5.77;407)*9@:AA8CAA######## ############################## #################### SEQ:1:1101:9707:3894#0/1 adapter start: 40 1 40 70 AACACCATCCTTCATGAACTTAATCCACTGTTCACCATAA GCCTAACTTCTTAGACTGCCTTAAGGACGT ACGTGACGATGAGGGACATAAAAAGTAAAAA adapt F@F8DEE@EEBCCCCFFEFDDC=DCCFFF=ADD=D>@AA@ FFFDE99>,>>@=856>;6C<@1:39@>6@ =??:BAEEEBEDFBF69:<8B5D>@DEDEEF?F><>B@CBCD==BB DCDCCDD=8A>@<3A499:1@@@@CDC@@= @=<6@<@:=>189<16 SEQ:1:1101:9903:3754#0/1 -1 ACCAAAATTAGGGTCAACGCTACCTGTAGGAAGTGTCCGCATAAAGTGCACCGCATGGAAATGAAGACGGCCATCAGCTGTACCATACTCAGGCACACAAA GFEGGGGGBGE@EAEEGGFGGEGGFGEFFGFGFFGGEGGGGEFGCFCEFBF7FGEGEF?BFEEFDFFE??AADD+D@C@CGFCE6FDFFDFBGFDD@DAAD SEQ:1:1101:9878:3755#0/1 adapter start: 32 1 32 62 AGAACGTGAAAAAGCGTCCTGCGTGTAGCGAA GCCTAACTTCTTAGACTGCCTTAAGGACGT CTGCGATGGGCATACTGTAACCATAAGGCCACGTATTTT adapt HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH HFHHHBHHHHHHHHHFHFHEHHHHHHHHHH HHHEFHFHHHDFHHHHFGHHHHHFCEHECHHF?D5D7@D SEQ:1:1101:9833:3756#0/1 adapter start: 65 1 65 95 
TCATCGTCACGTTTATGGTGAACAGTGGATTAAGTTCATGAAGGATGGTGTTAATGCCACTCCTC GCCTAACTTCTTAGACTGCCTTAAGGACGT TCCCGA adapt HHHHHHHHHHHHHHHHHFHHHHHHHHHHHHHHHHFHHHHGHHFHHHHHEHEHHHHFHEHHHEHFH HHFHHFEFFB=;,01:99;;HHHHHHEFGE EFFBFB SEQ:1:1101:9991:3777#0/1 -1 GCTTTGAGTCTTCTTCGGTTCCGACTACCCTCCCGACTGCCTATGATGTTTATCCTTTGGATGGTCGCCATGATGGTGGTTATTATACCGTCAAGGACTGT HHHHHHHHHHHHHHHHHHHHHHGHHHGHHHHHHHGHHHHHHGHHHHHHHHHHHHHFHHFFDFFFCFFDHCFF;BFGEFGEGFGGFFF.CFDCCEDB=CBC@ cutadapt-4.7/tests/cut/illumina5.fastq000066400000000000000000000015421457457704700201130ustar00rootroot00000000000000@SEQ:1:1101:9010:3891#0/1 adapter start: 51 ATAACCGGAGTAGTTGAAATGGTAATAAGACGACCAATCTGACCAGCAAGG + FFFFFEDBE@79@@>@CBCBFDBDFDDDDD<@C>ADD@B;5:978@CBDDF @SEQ:1:1101:9240:3898#0/1 CCAGCAAGGAAGCCAAGATGGGAAAGGTCATGCGGCATACGCTCGGCGCCAGTTTGAATATTAGACATAATTTATCCTCAAGTAAGGGGCCGAAGCCCCTG + GHGHGHHHHGGGDHHGDCGFEEFHHGDFGEHHGFHHHHHGHEAFDHHGFHHEEFHGHFHHFHGEHFBHHFHHHH@GGGDGDFEEFC@=D?GBGFGF:FB6D @SEQ:1:1101:9207:3899#0/1 adapter start: 64 TTAACTTCTCAGTAACAGATACAAACTCATCACGAACGTCAGAAGCAGCCTTATGGCCGTCAAC + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHCFHHF @SEQ:1:1101:9148:3908#0/1 adapter start: 28 ACGACGCAATGGAGAAAGACGGAGAGCG + HHHHHHHHHHHHGHHHHGHHHHHHHHHH @SEQ:1:1101:9044:3916#0/1 adapter start: 78 AACAGAAGGAGTCTACTGCTCGCGTTGCGTCTATTATGGAAAACACCAATCTTTCCAAGCAACAGCAGGTTTCCGAGA + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHGHHHHHHHHHHHHFHEBFHFFEFHE cutadapt-4.7/tests/cut/illumina5.info.txt000066400000000000000000000034701457457704700205500ustar00rootroot00000000000000SEQ:1:1101:9010:3891#0/1 adapter start: 51 0 64 81 ATAACCGGAGTAGTTGAAATGGTAATAAGACGACCAATCTGACCAGCAAGGGCCTAACTTCTTA GACTGCCTTAAGGACGT AAGCCAAGATGGGAAAGGTC adapt2 FFFFFEDBE@79@@>@CBCBFDBDFDDDDD<@C>ADD@B;5:978@CBDDFFDB4B?DB21;84 ?DDBC9DEBAB;=@<@@ B@@@@B>CCBBDE98>>0@7 SEQ:1:1101:9010:3891#0/1 adapter start: 51 1 51 64 ATAACCGGAGTAGTTGAAATGGTAATAAGACGACCAATCTGACCAGCAAGG GCCTAACTTCTTA adapt FFFFFEDBE@79@@>@CBCBFDBDFDDDDD<@C>ADD@B;5:978@CBDDF FDB4B?DB21;84 SEQ:1:1101:9240:3898#0/1 -1 CCAGCAAGGAAGCCAAGATGGGAAAGGTCATGCGGCATACGCTCGGCGCCAGTTTGAATATTAGACATAATTTATCCTCAAGTAAGGGGCCGAAGCCCCTG GHGHGHHHHGGGDHHGDCGFEEFHHGDFGEHHGFHHHHHGHEAFDHHGFHHEEFHGHFHHFHGEHFBHHFHHHH@GGGDGDFEEFC@=D?GBGFGF:FB6D SEQ:1:1101:9207:3899#0/1 adapter start: 64 0 77 94 TTAACTTCTCAGTAACAGATACAAACTCATCACGAACGTCAGAAGCAGCCTTATGGCCGTCAACGCCTAACTTCTTA GACTGCCTTAAGGACGT ATACATA adapt2 HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHCFHHFHHFHFFFFFBHHG HHHFFHHFHGGHHDEBF Greadname A cutadapt-4.7/tests/cut/linked-anchored.fasta000066400000000000000000000006251457457704700212240ustar00rootroot00000000000000>r1 5' adapter and 3' adapter AAAAAAAAAACCCCCCCCCCTTTTTTTTTTGGGGGGG >r2 without any adapter GGGGGGGGGGGGGGGGGGG >r3 5' adapter, partial 3' adapter CCCGGCCCCC >r4 only 3' adapter GGGGGGGGGGCCCCCCCCCCTTTTTTTTTTGGGGGGG >r5 only 5' adapter AAAAAAAAAACCCCCCCCCCGGGGGGG >r6 partial 5' adapter AAAAAACCCCCCCCCCTTTTTTTTTTGGGGGGG >r7 5' adapter plus preceding bases AACCGGTTTTAAAAAAAAAACCCCCCCCCCTTTTTTTTTTGGGGGGG cutadapt-4.7/tests/cut/linked-discard-g.fasta000066400000000000000000000002501457457704700212700ustar00rootroot00000000000000>r1 5' adapter and 3' adapter CCCCCCCCCC >r3 5' adapter, partial 3' adapter CCCGGCCCCC >r6 partial 5' adapter CCCCCCCCCC >r7 5' adapter plus preceding bases CCCCCCCCCC cutadapt-4.7/tests/cut/linked-discard.fasta000066400000000000000000000001751457457704700210520ustar00rootroot00000000000000>r1 5' adapter and 3' adapter CCCCCCCCCC >r3 5' adapter, partial 3' adapter CCCGGCCCCC >r5 only 5' 
adapter CCCCCCCCCCGGGGGGG cutadapt-4.7/tests/cut/linked-info.txt000066400000000000000000000012461457457704700201150ustar00rootroot00000000000000r1 5' adapter and 3' adapter 0 0 10 AAAAAAAAAA CCCCCCCCCCTTTTTTTTTTGGGGGGG linkedadapter;1 r1 5' adapter and 3' adapter 0 10 20 CCCCCCCCCC TTTTTTTTTT GGGGGGG linkedadapter;2 r2 without any adapter -1 GGGGGGGGGGGGGGGGGGG r3 5' adapter, partial 3' adapter 0 0 10 AAAAAAAAAA CCCGGCCCCCTTTTT linkedadapter;1 r3 5' adapter, partial 3' adapter 0 10 15 CCCGGCCCCC TTTTT linkedadapter;2 r4 only 3' adapter -1 GGGGGGGGGGCCCCCCCCCCTTTTTTTTTTGGGGGGG r5 only 5' adapter 0 0 10 AAAAAAAAAA CCCCCCCCCCGGGGGGG linkedadapter;1 r6 partial 5' adapter -1 AAAAAACCCCCCCCCCTTTTTTTTTTGGGGGGG r7 5' adapter plus preceding bases -1 AACCGGTTTTAAAAAAAAAACCCCCCCCCCTTTTTTTTTTGGGGGGG cutadapt-4.7/tests/cut/linked-lowercase.fasta000066400000000000000000000006441457457704700214260ustar00rootroot00000000000000>r1 5' adapter and 3' adapter AAAAAAAAAACCCCCCCCCCTTTTTTTTTTGGGGGGG >r2 without any adapter GGGGGGGGGGGGGGGGGGG >r3 5' adapter, partial 3' adapter aaaaAAAAAACCCGGCCCCCTtttt >r4 only 3' adapter GGGGGGGGGGCCCCCCCCCCTTTTTTTTTTGGGGGGG >r5 only 5' adapter AAAAAAAAAACCCCCCCCCCGGGGGGG >r6 partial 5' adapter AAAAAACCCCCCCCCCTTTTTTTTTTGGGGGGG >r7 5' adapter plus preceding bases aaccggttttaaaaAAAAAACCCCCCCCCCTTTTTTttttggggggg cutadapt-4.7/tests/cut/linked-not-anchored.fasta000066400000000000000000000004761457457704700220260ustar00rootroot00000000000000>r1 5' adapter and 3' adapter CCCCCCCCCC >r2 without any adapter GGGGGGGGGGGGGGGGGGG >r3 5' adapter, partial 3' adapter CCCGGCCCCC >r4 only 3' adapter GGGGGGGGGGCCCCCCCCCCTTTTTTTTTTGGGGGGG >r5 only 5' adapter AAAAAAAAAACCCCCCCCCCGGGGGGG >r6 partial 5' adapter CCCCCCCCCC >r7 5' adapter plus preceding bases CCCCCCCCCC cutadapt-4.7/tests/cut/linked.fasta000066400000000000000000000005601457457704700174410ustar00rootroot00000000000000>r1 5' adapter and 3' adapter CCCCCCCCCC >r2 without any adapter GGGGGGGGGGGGGGGGGGG >r3 5' adapter, partial 3' adapter CCCGGCCCCC >r4 only 3' adapter GGGGGGGGGGCCCCCCCCCCTTTTTTTTTTGGGGGGG >r5 only 5' adapter CCCCCCCCCCGGGGGGG >r6 partial 5' adapter AAAAAACCCCCCCCCCTTTTTTTTTTGGGGGGG >r7 5' adapter plus preceding bases AACCGGTTTTAAAAAAAAAACCCCCCCCCCTTTTTTTTTTGGGGGGG cutadapt-4.7/tests/cut/lowercase.fastq000066400000000000000000000003471457457704700202020ustar00rootroot00000000000000@prefix:1_13_573/1 CGTCCGAANTAGCTACCACCCTGA + )3%)&&&&!.1&(6:<'67..*,: @prefix:1_13_1259/1 AGCCGCTANGACGGGTTGGCCC + ;<:&:A;A!9<<<,7:<=3=;: @prefix:1_13_1440/1 CAAGATCTNCCCTGCCACATTGCCCTAGTTAAAC + <=A:A=57!7<';<6?5;;6:+:=)71>70<,=: cutadapt-4.7/tests/cut/lowqual.fastq000066400000000000000000000000511457457704700176720ustar00rootroot00000000000000@first_sequence + @second_sequence + cutadapt-4.7/tests/cut/lowqual.unchanged.fastq000066400000000000000000000001151457457704700216260ustar00rootroot00000000000000@first_sequence SEQUENCE1 + ######### @second_sequence SEQUENCE2 + ######### cutadapt-4.7/tests/cut/maxee.fastq000066400000000000000000000000471457457704700173120ustar00rootroot00000000000000@empty + @ee_0.8 ACGTTGCA + ++++++++ cutadapt-4.7/tests/cut/maxlen.fa000066400000000000000000000001731457457704700167470ustar00rootroot00000000000000>trimmed_length0 >trimmed_length1 c >trimmed_length2 ca >trimmed_length3 caa >trimmed_length4 caaa >trimmed_length5 caaac cutadapt-4.7/tests/cut/maxn0.2.fasta000066400000000000000000000000301457457704700173460ustar00rootroot00000000000000>r1 >r3 AAAA >r4 AAAAN 
cutadapt-4.7/tests/cut/maxn0.4.fasta000066400000000000000000000000421457457704700173530ustar00rootroot00000000000000>r1 >r3 AAAA >r4 AAAAN >r5 AAANN cutadapt-4.7/tests/cut/maxn0.fasta000066400000000000000000000000161457457704700172120ustar00rootroot00000000000000>r1 >r3 AAAA cutadapt-4.7/tests/cut/maxn1.fasta000066400000000000000000000000361457457704700172150ustar00rootroot00000000000000>r1 >r2 N >r3 AAAA >r4 AAAAN cutadapt-4.7/tests/cut/maxn2.fasta000066400000000000000000000000501457457704700172120ustar00rootroot00000000000000>r1 >r2 N >r3 AAAA >r4 AAAAN >r5 AAANN cutadapt-4.7/tests/cut/minlen.fa000066400000000000000000000002321457457704700167410ustar00rootroot00000000000000>trimmed_length5 caaac >trimmed_length6 caaacc >trimmed_length7 caaacca >trimmed_length8 caaaccag >trimmed_length9 caaaccagt >trimmed_length10 caaaccagtt cutadapt-4.7/tests/cut/minlen.noprimer.fa000066400000000000000000000002441457457704700205760ustar00rootroot00000000000000>read_length6 23302 >read_length7 023302 >read_length8 1023302 >read_length9 11023302 >read_length10 111023302 >read_length11 2111023302 >read_length12 02111023302 cutadapt-4.7/tests/cut/multiprefix.fasta000066400000000000000000000006361457457704700205470ustar00rootroot00000000000000>r1 ATAACCGGAGTAGTTGAAATGGTAATAAGACGACCAATCTGACCAGCAAGG >r2 ACGACGCAATGGAGAAAGACGGAGAGCG >r3 CCATCCAAAGGATAAACATCATAGGCAGTCGGGAGGGTAGTCGGAA >r4 GTGGCAAGTCTGCCGCTGATAAAGGAAAGGATACTCGTGATTATCTTGC >r5 ATTAGAGCCAATACCATCAGCTTTACCG >r6 ATAAAGGAAAGGATACTCGTGATTATCT >r7 ACTTGACTCATGATTTCTTACCTATTAGTGGTTGAAC >r8 ACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAGC >r9 AGGCTTCTGCCGTTTTGGATTTAACCGAAGATGAT >r10 TCTGTTGAACACGACCAGAA cutadapt-4.7/tests/cut/multisuffix.fasta000066400000000000000000000006361457457704700205560ustar00rootroot00000000000000>r1 ATAACCGGAGTAGTTGAAATGGTAATAAGACGACCAATCTGACCAGCAAGG >r2 ACGACGCAATGGAGAAAGACGGAGAGCG >r3 CCATCCAAAGGATAAACATCATAGGCAGTCGGGAGGGTAGTCGGAA >r4 GTGGCAAGTCTGCCGCTGATAAAGGAAAGGATACT >r5 ATTAGAGCCAATACCATCAGCTTTACCG >r6 ATAAAGGAAAGGATACTCGTGATTATCT >r7 GTACGGATTGTTCAGTAACTTGACTCATGATTTCTTA >r8 ACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAGC >r9 TATTAAGCTCATTCAGGCTTCTGCCGTTTTGGATTTAACCGAAGATGAT >r10 TCTGTTGAACACGACCAGAA cutadapt-4.7/tests/cut/nextseq.fastq000066400000000000000000000006261457457704700177050ustar00rootroot00000000000000@NS500350:251:HLM7JBGXX:1:11101:12075:1120 1:N:0:TACAGC GATCGGAAGAGCACACGTCTGAACTCCAGTCACTACAGCATCTCGTATTCCGTCTTCTGCTTGAAAAAAAA + AAAAAEEEEEEAEEEEAEAEEEEEEAEEEEEEEEEEEEEEE///E/EE////AAEE/E//////EEEEEEE @NS500350:251:HLM7JBGXX:1:11101:22452:1121 1:N:0:TACAGC GATCGGAAGAGCACACGTCTGAACTCCAGTCACTACAGCATCGCGTATGCCGTCTTATGCTTGAAAAAAAAA + AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/////E/EE//E6///E//A//E//EEEEEEEE cutadapt-4.7/tests/cut/no-trim.fastq000066400000000000000000000001341457457704700175750ustar00rootroot00000000000000@prefix:1_13_1440/1 CAAGATCTNCCCTGCCACATTGCCCTAGTTAAAC + <=A:A=57!7<';<6?5;;6:+:=)71>70<,=: cutadapt-4.7/tests/cut/no_indels.fasta000066400000000000000000000004041457457704700201420ustar00rootroot00000000000000>3p_orig TGAACATAGC >3p_mism TGAACATAGC >3p_del TGAACATAGCTTAACATATAACCG >3p_ins TGAACATAGCTTAGGACATATAACCG >3p_frontins TAGACATATAACCG >5p_orig TACTGCTTCTCGAA >5p_mism TACTGCTTCTCGAA >5p_del TCCTCGAGATGCCATACTGCTTCTCGAA >5p_ins TCCTCGAGATATGCCATACTGCTTCTCGAA cutadapt-4.7/tests/cut/overlapb.fa000066400000000000000000000010211457457704700172660ustar00rootroot00000000000000>adaptlen18 ATACTTACCCGTA >adaptlen17 ATACTTACCCGTA >adaptlen16 ATACTTACCCGTA >adaptlen15 ATACTTACCCGTA >adaptlen14 ATACTTACCCGTA >adaptlen13 
ATACTTACCCGTA >adaptlen12 ATACTTACCCGTA >adaptlen11 ATACTTACCCGTA >adaptlen10 ATACTTACCCGTA >adaptlen9 TCTCCGTCGATACTTACCCGTA >adaptlen8 CTCCGTCGATACTTACCCGTA >adaptlen7 TCCGTCGATACTTACCCGTA >adaptlen6 CCGTCGATACTTACCCGTA >adaptlen5 CGTCGATACTTACCCGTA >adaptlen4 GTCGATACTTACCCGTA >adaptlen3 TCGATACTTACCCGTA >adaptlen2 CGATACTTACCCGTA >adaptlen1 GATACTTACCCGTA >adaptlen0 ATACTTACCCGTA cutadapt-4.7/tests/cut/pair-adapters.1.fastq000066400000000000000000000004771457457704700211150ustar00rootroot00000000000000@read1/1 some text TTATTT + ##HHHH @read2/1 CAACAGGCCACATTAGACATATCGGATGGT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH @read3/1 CCAACTTGATATTAATAACATTAGACA + HHHHHHHHHHHHHHHHHHHHHHHHHHH @read4/1 GACAGGCCGTTTGAATGTTGACGGGATGTT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH @read5 TTATTTGTCTCCAGCTTAGACATATCGCCT + #####HHHHHHHHHHHHHHHHHHHHHHHHH cutadapt-4.7/tests/cut/pair-adapters.2.fastq000066400000000000000000000004741457457704700211130ustar00rootroot00000000000000@read1/2 other text GCTGGA + HHHHHH @read2/2 TGTGGCCTGTTGCAGTGGAGTAACTCCAGC + ###HHHHHHHHHHHHHHHHHHHHHHHHHHH @read3/2 TGTTATTAATATCAAGTTGGCAGTG + #HHHHHHHHHHHHHHHHHHHHHHHH @read4/2 CATCCCGTCAACATTCAAACGGCCTGTCCA + HH############################ @read5 CAACAGGCCACATTAGACATATCGGATGGT + HHHHHHHH##HHHHHHHHHHHHHHHHHHHH cutadapt-4.7/tests/cut/paired-filterboth.1.fastq000066400000000000000000000004011457457704700217500ustar00rootroot00000000000000@read1/1 some text TTATTTGTCTCCAGC + ##HHHHHHHHHHHHH @read2/1 CAACAGGCCACA + HHHHHHHHHHHH @read3/1 CCAACTTGATATTAATAACA + HHHHHHHHHHHHHHHHHHHH @read4/1 GACAGGCCGTTTGAATGTTGACGGGATGTT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH @read5 TTATTTGTCTCCAGC + #####HHHHHHHHHH cutadapt-4.7/tests/cut/paired-filterboth.2.fastq000066400000000000000000000004721457457704700217610ustar00rootroot00000000000000@read1/2 other text GCTGGAGACAAATAACAGT + HHHHHHHHHHHHHHHHHHH @read2/2 TGTGGCCTGTTGCAGT + ###HHHHHHHHHHHHH @read3/2 TGTTATTAATATCAAGTTGGCAGTG + #HHHHHHHHHHHHHHHHHHHHHHHH @read4/2 CATCCCGTCAACATTCAAACGGCCTGTCCA + HH############################ @read5 CAACAGGCCACATTAGACATATCGGATGGT + HHHHHHHH##HHHHHHHHHHHHHHHHHHHH cutadapt-4.7/tests/cut/paired-filterfirst.1.fastq000066400000000000000000000003341457457704700221500ustar00rootroot00000000000000@read1/1 some text TTATTTGTCTCCAGC + ##HHHHHHHHHHHHH @read3/1 CCAACTTGATATTAATAACA + HHHHHHHHHHHHHHHHHHHH @read4/1 GACAGGCCGTTTGAATGTTGACGGGATGTT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH @read5 TTATTTGTCTCCAGC + #####HHHHHHHHHH cutadapt-4.7/tests/cut/paired-filterfirst.2.fastq000066400000000000000000000004151457457704700221510ustar00rootroot00000000000000@read1/2 other text GCTGGAGACAAATAACAGT + HHHHHHHHHHHHHHHHHHH @read3/2 TGTTATTAATATCAAGTTGGCAGTG + #HHHHHHHHHHHHHHHHHHHHHHHH @read4/2 CATCCCGTCAACATTCAAACGGCCTGTCCA + HH############################ @read5 CAACAGGCCACATTAGACATATCGGATGGT + HHHHHHHH##HHHHHHHHHHHHHHHHHHHH cutadapt-4.7/tests/cut/paired-m27.1.fastq000066400000000000000000000004541457457704700202230ustar00rootroot00000000000000@read1/1 some text TTATTTGTCTCCAGCTTAGACATATCGCCT + ##HHHHHHHHHHHHHHHHHHHHHHHHHHHH @read2/1 CAACAGGCCACATTAGACATATCGGATGGT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH @read4/1 GACAGGCCGTTTGAATGTTGACGGGATGTT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH @read5 TTATTTGTCTCCAGCTTAGACATATCGCCT + #####HHHHHHHHHHHHHHHHHHHHHHHHH cutadapt-4.7/tests/cut/paired-m27.2.fastq000066400000000000000000000004551457457704700202250ustar00rootroot00000000000000@read1/2 other text GCTGGAGACAAATAACAGTGGAGTAGTTTT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH @read2/2 TGTGGCCTGTTGCAGTGGAGTAACTCCAGC + ###HHHHHHHHHHHHHHHHHHHHHHHHHHH @read4/2 
CATCCCGTCAACATTCAAACGGCCTGTCCA + HH############################ @read5 CAACAGGCCACATTAGACATATCGGATGGT + HHHHHHHH##HHHHHHHHHHHHHHHHHHHH cutadapt-4.7/tests/cut/paired-onlyA.1.fastq000066400000000000000000000005571457457704700207040ustar00rootroot00000000000000@read1/1 some text TTATTTGTCTCCAGCTTAGACATATCGCCT + ##HHHHHHHHHHHHHHHHHHHHHHHHHHHH @read2/1 CAACAGGCCACATTAGACATATCGGATGGT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH @read3/1 CCAACTTGATATTAATAACATTAGACA + HHHHHHHHHHHHHHHHHHHHHHHHHHH @read4/1 GACAGGCCGTTTGAATGTTGACGGGATGTT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH @read5 TTATTTGTCTCCAGCTTAGACATATCGCCT + #####HHHHHHHHHHHHHHHHHHHHHHHHH cutadapt-4.7/tests/cut/paired-onlyA.2.fastq000066400000000000000000000004401457457704700206740ustar00rootroot00000000000000@read1/2 other text GCTGGAGACAAATAA + HHHHHHHHHHHHHHH @read2/2 TGTGGCCTGTTG + ###HHHHHHHHH @read3/2 TGTTATTAATATCAAGTTGG + #HHHHHHHHHHHHHHHHHHH @read4/2 CATCCCGTCAACATTCAAACGGCCTGTCCA + HH############################ @read5 CAACAGGCCACATTAGACATATCGGATGGT + HHHHHHHH##HHHHHHHHHHHHHHHHHHHH cutadapt-4.7/tests/cut/paired-separate.1.fastq000066400000000000000000000004011457457704700214120ustar00rootroot00000000000000@read1/1 some text TTATTTGTCTCCAGC + ##HHHHHHHHHHHHH @read2/1 CAACAGGCCACA + HHHHHHHHHHHH @read3/1 CCAACTTGATATTAATAACA + HHHHHHHHHHHHHHHHHHHH @read4/1 GACAGGCCGTTTGAATGTTGACGGGATGTT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH @read5 TTATTTGTCTCCAGC + #####HHHHHHHHHH cutadapt-4.7/tests/cut/paired-separate.2.fastq000066400000000000000000000004401457457704700214160ustar00rootroot00000000000000@read1/2 other text GCTGGAGACAAATAA + HHHHHHHHHHHHHHH @read2/2 TGTGGCCTGTTG + ###HHHHHHHHH @read3/2 TGTTATTAATATCAAGTTGG + #HHHHHHHHHHHHHHHHHHH @read4/2 CATCCCGTCAACATTCAAACGGCCTGTCCA + HH############################ @read5 CAACAGGCCACATTAGACATATCGGATGGT + HHHHHHHH##HHHHHHHHHHHHHHHHHHHH cutadapt-4.7/tests/cut/paired-too-short.1.fastq000066400000000000000000000000451457457704700215500ustar00rootroot00000000000000@read2/1 CAACAGGCCACA + HHHHHHHHHHHH cutadapt-4.7/tests/cut/paired-too-short.2.fastq000066400000000000000000000000451457457704700215510ustar00rootroot00000000000000@read2/2 TGTGGCCTGTTG + ###HHHHHHHHH cutadapt-4.7/tests/cut/paired-trimmed.1.fastq000066400000000000000000000002701457457704700212530ustar00rootroot00000000000000@read1/1 some text TTATTTGTCTCCAGC + ##HHHHHHHHHHHHH @read2/1 CAACAGGCCACA + HHHHHHHHHHHH @read3/1 CCAACTTGATATTAATAACA + HHHHHHHHHHHHHHHHHHHH @read5 TTATTTGTCTCCAGC + #####HHHHHHHHHH cutadapt-4.7/tests/cut/paired-trimmed.2.fastq000066400000000000000000000004431457457704700212560ustar00rootroot00000000000000@read1/2 other text GCTGGAGACAAATAACAGTGGAGTAGTTTT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH @read2/2 TGTGGCCTGTTGCAGTGGAGTAACTCCAGC + ###HHHHHHHHHHHHHHHHHHHHHHHHHHH @read3/2 TGTTATTAATATCAAGTTGGCAGTG + #HHHHHHHHHHHHHHHHHHHHHHHH @read5 CAACAGGCCACATTAGACATATCGGATGGT + HHHHHHHH##HHHHHHHHHHHHHHHHHHHH cutadapt-4.7/tests/cut/paired-untrimmed.1.fastq000066400000000000000000000001111457457704700216100ustar00rootroot00000000000000@read4/1 GACAGGCCGTTTGAATGTTGACGGGATGTT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH cutadapt-4.7/tests/cut/paired-untrimmed.2.fastq000066400000000000000000000001111457457704700216110ustar00rootroot00000000000000@read4/2 CATCCCGTCAACATTCAAACGGCCTGTCCA + HH############################ cutadapt-4.7/tests/cut/paired.1.fastq000066400000000000000000000003341457457704700176150ustar00rootroot00000000000000@read1/1 some text TTATTTGTCTCCAGC + ##HHHHHHHHHHHHH @read3/1 CCAACTTGATATTAATAACA + HHHHHHHHHHHHHHHHHHHH @read4/1 GACAGGCCGTTTGAATGTTGACGGGATGTT + 
HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH @read5 TTATTTGTCTCCAGC + #####HHHHHHHHHH cutadapt-4.7/tests/cut/paired.2.fastq000066400000000000000000000003731457457704700176210ustar00rootroot00000000000000@read1/2 other text GCTGGAGACAAATAA + HHHHHHHHHHHHHHH @read3/2 TGTTATTAATATCAAGTTGG + #HHHHHHHHHHHHHHHHHHH @read4/2 CATCCCGTCAACATTCAAACGGCCTGTCCA + HH############################ @read5 CAACAGGCCACATTAGACATATCGGATGGT + HHHHHHHH##HHHHHHHHHHHHHHHHHHHH cutadapt-4.7/tests/cut/paired.m14.1.fastq000066400000000000000000000002231457457704700202120ustar00rootroot00000000000000@read1/1 some text TTATTTGTCTCCAGC + ##HHHHHHHHHHHHH @read3/1 CCAACTTGATATTAATAACA + HHHHHHHHHHHHHHHHHHHH @read5 TTATTTGTCTCCAGC + #####HHHHHHHHHH cutadapt-4.7/tests/cut/paired.m14.2.fastq000066400000000000000000000003321457457704700202140ustar00rootroot00000000000000@read1/2 other text GCTGGAGACAAATAACAGTGGAGTAGTTTT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH @read3/2 TGTTATTAATATCAAGTTGGCAGTG + #HHHHHHHHHHHHHHHHHHHHHHHH @read5 CAACAGGCCACATTAGACATATCGGATGGT + HHHHHHHH##HHHHHHHHHHHHHHHHHHHH cutadapt-4.7/tests/cut/pairedq.1.fastq000066400000000000000000000002231457457704700177730ustar00rootroot00000000000000@read1/1 some text TTATTTGTCTCCAGC + ##HHHHHHHHHHHHH @read3/1 CCAACTTGATATTAATAACA + HHHHHHHHHHHHHHHHHHHH @read5 TTATTTGTCTCCAGC + #####HHHHHHHHHH cutadapt-4.7/tests/cut/pairedq.2.fastq000066400000000000000000000002621457457704700177770ustar00rootroot00000000000000@read1/2 other text GCTGGAGACAAATAA + HHHHHHHHHHHHHHH @read3/2 TGTTATTAATATCAAGTTGG + #HHHHHHHHHHHHHHHHHHH @read5 CAACAGGCCACATTAGACATATCGGATGGT + HHHHHHHH##HHHHHHHHHHHHHHHHHHHH cutadapt-4.7/tests/cut/pairedu.1.fastq000066400000000000000000000005071457457704700200040ustar00rootroot00000000000000@read1/1 some text TTTGTCTCCAGCTTAGACATATCGCC + HHHHHHHHHHHHHHHHHHHHHHHHHH @read2/1 CAGGCCACATTAGACATATCGGATGG + HHHHHHHHHHHHHHHHHHHHHHHHHH @read3/1 ACTTGATATTAATAACATTAGAC + HHHHHHHHHHHHHHHHHHHHHHH @read4/1 AGGCCGTTTGAATGTTGACGGGATGT + HHHHHHHHHHHHHHHHHHHHHHHHHH @read5 TTTGTCTCCAGCTTAGACATATCGCC + ##HHHHHHHHHHHHHHHHHHHHHHHH cutadapt-4.7/tests/cut/pairedu.2.fastq000066400000000000000000000004601457457704700200030ustar00rootroot00000000000000@read1/2 other text GAGACAAATAACAGTGGAGTAGTT + HHHHHHHHHHHHHHHHHHHHHHHH @read2/2 GCCTGTTGCAGTGGAGTAACTCCA + HHHHHHHHHHHHHHHHHHHHHHHH @read3/2 ATTAATATCAAGTTGGCAG + HHHHHHHHHHHHHHHHHHH @read4/2 CCGTCAACATTCAAACGGCCTGTC + ######################## @read5 AGGCCACATTAGACATATCGGATG + HHHH##HHHHHHHHHHHHHHHHHH cutadapt-4.7/tests/cut/plus.fastq000066400000000000000000000002461457457704700171770ustar00rootroot00000000000000@first_sequence some other text SEQUENCE1 +first_sequence some other text :6;;8<=:< @second_sequence and more text SEQUENCE2 +second_sequence and more text 83polyA AAACTTCAGAACAG >polyAlong CTTAGTTCAATWTTAACCAAACTTCAGAACAG >polyA2 AAACTTAACAAGAACAAG >nopoly GAAGAGTATCTCTCTGTCCTCTTGTCCGGCGTTACAGTAATGATCG cutadapt-4.7/tests/cut/polya.2.fasta000066400000000000000000000002011457457704700174470ustar00rootroot00000000000000>polyA CTGTTCTGAAGTTT >polyAlong CTGTTCTGAAGTTTGGTTAAWATTGAACTAAG >polyA2 CTTGTTCTTGTTAAGTTT >nopoly TCTGAAGTTTGGTTAAWATTGAACTAA cutadapt-4.7/tests/cut/polya.legacy.1.fasta000066400000000000000000000002061457457704700207160ustar00rootroot00000000000000>polyA AAACTTCAG >polyAlong CTTAGTTCAATWTTAACCAAACTTCAGAACAG >polyA2 AAACTTAAC >nopoly GAAGAGTATCTCTCTGTCCTCTTGTCCGGCGTTACAGTAATGATCG cutadapt-4.7/tests/cut/rename.1.fastq000066400000000000000000000006101457457704700176150ustar00rootroot00000000000000@read1/1 TTAT TTAT some text R1adapter 
R2adapter TT + HH @read2/1 CAAC CAAC no_adapter no_adapter AGGCCACATTAGACATATCGGATGGT + HHHHHHHHHHHHHHHHHHHHHHHHHH @read3/1 CCAA CCAA no_adapter no_adapter CTTGATATTAATAACATTAGACA + HHHHHHHHHHHHHHHHHHHHHHH @read4/1 GACA GACA no_adapter no_adapter GGCCGTTTGAATGTTGACGGGATGTT + HHHHHHHHHHHHHHHHHHHHHHHHHH @read5 TTAT TTAT R1adapter no_adapter TT + #H cutadapt-4.7/tests/cut/rename.2.fastq000066400000000000000000000007121457457704700176210ustar00rootroot00000000000000@read1/2 TTAT other text R2adapter R2adapter GCTGGA + HHHHHH @read2/2 CAAC no_adapter no_adapter TGTGGCCTGTTGCAGTGGAGTAACTCCAGC + ###HHHHHHHHHHHHHHHHHHHHHHHHHHH @read3/2 CCAA no_adapter no_adapter TGTTATTAATATCAAGTTGGCAGTG + #HHHHHHHHHHHHHHHHHHHHHHHH @read4/2 GACA no_adapter no_adapter CATCCCGTCAACATTCAAACGGCCTGTCCA + HH############################ @read5 TTAT no_adapter no_adapter CAACAGGCCACATTAGACATATCGGATGGT + HHHHHHHH##HHHHHHHHHHHHHHHHHHHH cutadapt-4.7/tests/cut/rename.fastq000066400000000000000000000002311457457704700174550ustar00rootroot00000000000000@Read1_TCCC Read1 extra OnlyT AAAAAAAAAA + FFGHHHHIJJ @Read2_GCCC Read2 OnlyG AAAA + JIJJ @Read3_AAAA Read3 no_adapter AAAAAAAAAAAAAAA + JGFHCG<read1 TESTING >read2 TESTING >read3 TESTING >read4 TESTING >read5 TESTING >read6 SOMETHING >read7 SOMETHING >read8 REST >read9 NOREST cutadapt-4.7/tests/cut/restfront.fa000066400000000000000000000002041457457704700175040ustar00rootroot00000000000000>read1 REST1 >read2 RESTING >read3 >read4 RESTLESS >read5 RESTORE >read6 SOMETHING >read7 SOMETHING >read8 SOMETHING >read9 NOREST cutadapt-4.7/tests/cut/revcomp-r1r2.1.fastq000066400000000000000000000006401457457704700206100ustar00rootroot00000000000000@read1/1 CCAGCTTAGACATATCGCCT + G=1C(C1=J1J=C(18C(1( @read2/2 rc TGTGGCCTGTTGCAGTGGAGTAACTCCAGC + '<07I'FIB'<first_sequence SEQUENCE1 >second_sequence SEQUENCE2 cutadapt-4.7/tests/cut/small-no-trim.fasta000066400000000000000000000002441457457704700206650ustar00rootroot00000000000000>prefix:1_13_573/1 CGTCCGAANTAGCTACCACCCTGATTAGACAAAT >prefix:1_13_1259/1 AGCCGCTANGACGGGTTGGCCCTTAGACGTATCT >prefix:1_13_1440/1 CAAGATCTNCCCTGCCACATTGCCCTAGTTAAAC cutadapt-4.7/tests/cut/small.fasta000066400000000000000000000002161457457704700173010ustar00rootroot00000000000000>prefix:1_13_573/1 CGTCCGAANTAGCTACCACCCTGA >prefix:1_13_1259/1 AGCCGCTANGACGGGTTGGCCC >prefix:1_13_1440/1 CAAGATCTNCCCTGCCACATTGCCCTAGTTAAAC cutadapt-4.7/tests/cut/small.fastq000066400000000000000000000003471457457704700173260ustar00rootroot00000000000000@prefix:1_13_573/1 CGTCCGAANTAGCTACCACCCTGA + )3%)&&&&!.1&(6:<'67..*,: @prefix:1_13_1259/1 AGCCGCTANGACGGGTTGGCCC + ;<:&:A;A!9<<<,7:<=3=;: @prefix:1_13_1440/1 CAAGATCTNCCCTGCCACATTGCCCTAGTTAAAC + <=A:A=57!7<';<6?5;;6:+:=)71>70<,=: cutadapt-4.7/tests/cut/small.trimmed.fastq000066400000000000000000000002131457457704700207560ustar00rootroot00000000000000@prefix:1_13_573/1 CGTCCGAANTAGCTACCACCCTGA + )3%)&&&&!.1&(6:<'67..*,: @prefix:1_13_1259/1 AGCCGCTANGACGGGTTGGCCC + ;<:&:A;A!9<<<,7:<=3=;: cutadapt-4.7/tests/cut/small.untrimmed.fastq000066400000000000000000000001341457457704700213230ustar00rootroot00000000000000@prefix:1_13_1440/1 CAAGATCTNCCCTGCCACATTGCCCTAGTTAAAC + <=A:A=57!7<';<6?5;;6:+:=)71>70<,=: cutadapt-4.7/tests/cut/solid-no-zerocap.fastq000066400000000000000000000046171457457704700214070ustar00rootroot00000000000000@1_13_85_F3 T110020300.0113010210002110102330021 + 7&9<&77)& <7))%4'657-1+9;9,.<8);.;8 @1_13_573_F3 T312311200.30213011011132 + 6)3%)&&&& .1&(6:<'67..*, @1_13_1259_F3 T002112130.201222332211 + =;<:&:A;A 9<<<,7:<=3=; 
@1_13_1440_F3 T110020313.1113211010332111302330001 + =<=A:A=57 7<';<6?5;;6:+:=)71>70<,=: @1_14_177_F3 T31330222020233321121323302013303311 + :8957;;54)'98924905;;)6:7;1:3<88(9: @1_14_238_F3 T0133103120031002212223 + ?><5=;<<<12>=<;1;;=5); @1_15_1098_F3 T32333033222233020223032312232220332 + #,##(#5##*#($$'#.##)$&#%)$1##-$&##% @1_16_404_F3 T03310320002130202331112 + 78;:;;><>9=9;<<2=><<1;5 @1_16_904_F3 T21230102331022312232132021122111212 + 9>=::6;;99=+/'$+#.#&%$&'(($1*$($.#. @1_16_1315_F3 T032312311122103330103103 + <9<8A?>?::;6&,%;6/)8<<#/ @1_16_1595_F3 T22323211312111230022210011213302012 + >,<=<>@6<;?<=>:/=.>&;;8;)17:=&,>1=+ @1_17_1379_F3 T32011212111223230232132311321200123 + /-1179<1;>>8:':7-%/::0&+=<29,7<8(,2 @1_18_1692_F3 T12322233031100211233323300112200210 + .#(###5%)%2)',2&:+#+&5,($/1#&4&))$6 @1_19_171_F3 T10101101220213201111011320201230032 + )6:65/=3*:(8%)%2>&8&%;%0&#;$3$&:$#& @1_22_72_F3 T13303032323221212301322233320210233 + 3/#678<:.=9::6:(<538295;9+;&*;)+',& @1_22_1377_F3 T22221333311222312201132312022322300 + )##0%.$.1*%,)95+%%14%$#8-###9-()#9+ @1_23_585_F3 T300103103101303121221 + >55;8><96/18?)<3<58<5 @1_23_809_F3 T13130101101021211013220302223302112 + :7<59@;<<5;/9;=<;7::.)&&&827(+221%( @1_24_138_F3 T33211130100120323002 + 6)68/;906#,25/&;<$0+ @1_24_206_F3 T33330332002223002020303331321221000 + ))4(&)9592)#)694(,)292:(=7$.18,()65 @1_25_143_F3 T23202003031200220301303302012203132 + :4;/#&<9;&*;95-7;85&;587#16>%&,9<2& @1_25_1866_F3 T03201321022131101112012330221130311 + =<>9;<@7?(=6,<&?=6=(=<641:?'<1=;':4 @1_27_584_F3 T10010330110103213112323303012103101 + 82'('*.-8+%#2)(-&3.,.2,),+.':&,'(&/ @1_27_1227_F3 T02003022123001003201002031303302011 + 492:;>A:<;34<<=);:<<;9=7<3::<::3=>' @1_27_1350_F3 T13130101101021211013220222221301231 + 95,)<(4./;<938=64=+2/,.4),3':97#33& @1_29_477_F3 T13130101101021211013300302223003030 + 94=55:75=+:/7><968;;#&+$#3&6,#1#4#' @1_30_882_F3 T20102033000233 + 2(+-:-3<;5##/; @1_31_221_F3 T03301311201100030300100233220102031 + 89>9>5<139/,&:7969972.274&%:78&&746 @1_31_1313_F3 T0133113130033012232100010101 + ;3<7=7::)5*4=&;<7>4;795065;9 @1_529_129_F3 T132222301020322102101322221322302.3302.3.3..221..3 + >>%/((B6-&5A0:6)>;'1)B*38/?(5=%B+ &<-9 % @ )%) ( cutadapt-4.7/tests/cut/solidqual.fastq000066400000000000000000000047571457457704700202240ustar00rootroot00000000000000@1_13_85_F3 T110020300.0113010210002110102330021 + 7&9<&77)&!<7))%4'657-1+9;9,.<8);.;8 @1_13_573_F3 T312311200.3021301101113203302010003 + 6)3%)&&&&!.1&(6:<'67..*,:75)'77&&&5 @1_13_1259_F3 T002112130.201222332211133020123031 + =;<:&:A;A!9<<<,7:<=3=;:<&70<,=: @1_14_177_F3 T31330222020233321121323302013303311 + :8957;;54)'98924905;;)6:7;1:3<88(9: @1_14_238_F3 T01331031200310022122230330201030313 + ?><5=;<<<12>=<;1;;=5);.;14:0>2;:3;7 @1_15_1098_F3 T + @1_16_404_F3 T03310320002130202331112133020103031 + 78;:;;><>9=9;<<2=><<1;58;9<<;>(<;<; @1_16_904_F3 T21230102331022312232132021122111212 + 9>=::6;;99=+/'$+#.#&%$&'(($1*$($.#. @1_16_1315_F3 T0323123111221033301031032330201000 + <9<8A?>?::;6&,%;6/)8<<#/;79(448&*. 
@1_16_1595_F3 T22323211312111230022210011213302012 + >,<=<>@6<;?<=>:/=.>&;;8;)17:=&,>1=+ @1_17_1379_F3 T32011212111223230232132311321200123 + /-1179<1;>>8:':7-%/::0&+=<29,7<8(,2 @1_18_1692_F3 T12322233031100211233323300112200210 + .#(###5%)%2)',2&:+#+&5,($/1#&4&))$6 @1_19_171_F3 T10101101220213201111011320201230 + )6:65/=3*:(8%)%2>&8&%;%0&#;$3$&: @1_22_72_F3 T133030323232212123013222333202 + 3/#678<:.=9::6:(<538295;9+;&*; @1_22_1377_F3 T22221333311222312201132312022322300 + )##0%.$.1*%,)95+%%14%$#8-###9-()#9+ @1_23_585_F3 T30010310310130312122123302013303131 + >55;8><96/18?)<3<58<5:;96=7:1=8=:-< @1_23_809_F3 T131301011010212110132203022233021 + :7<59@;<<5;/9;=<;7::.)&&&827(+221 @1_24_138_F3 T3321113010012032300203302012303131 + 6)68/;906#,25/&;<$0+250#2,<)5,9/+7 @1_24_206_F3 T33330332002223002020303331321221000 + ))4(&)9592)#)694(,)292:(=7$.18,()65 @1_25_143_F3 T2320200303120022030130330201220313 + :4;/#&<9;&*;95-7;85&;587#16>%&,9<2 @1_25_1866_F3 T03201321022131101112012330221130311 + =<>9;<@7?(=6,<&?=6=(=<641:?'<1=;':4 @1_27_584_F3 T10010330110103213112323303012103101 + 82'('*.-8+%#2)(-&3.,.2,),+.':&,'(&/ @1_27_1227_F3 T0200302212300100320100203130330201 + 492:;>A:<;34<<=);:<<;9=7<3::<::3=> @1_27_1350_F3 T1313010110102121101322022222130123 + 95,)<(4./;<938=64=+2/,.4),3':97#33 @1_29_477_F3 T13130101101021211013300302223 + 94=55:75=+:/7><968;;#&+$#3&6, @1_30_882_F3 T20102033000233133320103031311233200 + 2(+-:-3<;5##/;:(%&84'#:,?3&&8>-();5 @1_31_221_F3 T03301311201100030300100233220102031 + 89>9>5<139/,&:7969972.274&%:78&&746 @1_31_1313_F3 T01331131300330122321000101010330201 + ;3<7=7::)5*4=&;<7>4;795065;9';896'= @1_529_129_F3 T132222301020322102101322221322302.3302 + >>%/((B6-&5A0:6)>;'1)B*38/?(5=%B+!&<-9 cutadapt-4.7/tests/cut/sra.fastq000066400000000000000000000007071457457704700170030ustar00rootroot00000000000000@1_13_85_F3 T110020300.0113010210002110102330021 + 7&9<&77)&!<7))%4'657-1+9;9,.<8);.;8 @1_13_573_F3 T312311200.30213011011132 + 6)3%)&&&&!.1&(6:<'67..*, @1_13_1259_F3 T002112130.201222332211 + =;<:&:A;A!9<<<,7:<=3=; @1_13_1440_F3 T110020313.1113211010332111302330001 + =<=A:A=57!7<';<6?5;;6:+:=)71>70<,=: @1_14_177_F3 T31330222020233321121323302013303311 + :8957;;54)'98924905;;)6:7;1:3<88(9: @1_14_238_F3 T0133103120031002212223 + ?><5=;<<<12>=<;1;;=5); cutadapt-4.7/tests/cut/stripped.fasta000066400000000000000000000000431457457704700200210ustar00rootroot00000000000000>first SEQUENCE1 >second SEQUENCE2 cutadapt-4.7/tests/cut/suffix.fastq000066400000000000000000000002001457457704700175060ustar00rootroot00000000000000@Read1 extra OnlyT AAAAAAAAAA + FFGHHHHIJJ @Read2 OnlyG AAAA + JIJJ @Read3 no_adapter AAAAAAAAAAAAAAAAAAA + JGFHCG<r ACGT cutadapt-4.7/tests/cut/trimN3.fasta000066400000000000000000000000471457457704700173470ustar00rootroot00000000000000>read1 CAGTCGGTCCTGAGAGATGGGCGAGCGCTGG cutadapt-4.7/tests/cut/trimN5.fasta000066400000000000000000000000601457457704700173440ustar00rootroot00000000000000>read1 GGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCAG cutadapt-4.7/tests/cut/twoadapters.fasta000066400000000000000000000003231457457704700205250ustar00rootroot00000000000000>read1 GATCCTCCTGGAGCTGGCTGATACCAGTATACCAGTGCTGATTGTTG >read2 CTCGAGAATTCTGGATCCTCTCTTCTGCTACCTTTGGGATTTGCTTGCTCTTG >read3 (no adapter) AATGAAGGTTGTAACCATAACAGGAAGTCATGCGCATTTAGTCGAGCACGTAAGTTCATACGGAAATGGGTAAG cutadapt-4.7/tests/cut/twoadapters.first.fasta000066400000000000000000000000671457457704700216600ustar00rootroot00000000000000>read1 GATCCTCCTGGAGCTGGCTGATACCAGTATACCAGTGCTGATTGTTG 
cutadapt-4.7/tests/cut/twoadapters.second.fasta000066400000000000000000000000751457457704700220030ustar00rootroot00000000000000>read2 CTCGAGAATTCTGGATCCTCTCTTCTGCTACCTTTGGGATTTGCTTGCTCTTG cutadapt-4.7/tests/cut/twoadapters.unknown.fasta000066400000000000000000000001371457457704700222260ustar00rootroot00000000000000>read3 (no adapter) AATGAAGGTTGTAACCATAACAGGAAGTCATGCGCATTTAGTCGAGCACGTAAGTTCATACGGAAATGGGTAAG cutadapt-4.7/tests/cut/unconditional-back.fastq000066400000000000000000000003651457457704700217620ustar00rootroot00000000000000@prefix:1_13_573/1 CGTCCGAANTAGCTACCACCCTGATTAGA + )3%)&&&&!.1&(6:<'67..*,:75)'7 @prefix:1_13_1259/1 AGCCGCTANGACGGGTTGGCCCTTAGACG + ;<:&:A;A!9<<<,7:<=3=;:<&7 cutadapt-4.7/tests/cut/unconditional-both.fastq000066400000000000000000000003271457457704700220140ustar00rootroot00000000000000@prefix:1_13_573/1 GAANTAGCTACCACCCTGATTAGA + &&&!.1&(6:<'67..*,:75)'7 @prefix:1_13_1259/1 CTANGACGGGTTGGCCCTTAGACG + A;A!9<<<,7:<=3=;:<&7 cutadapt-4.7/tests/cut/unconditional-front.fastq000066400000000000000000000003651457457704700222120ustar00rootroot00000000000000@prefix:1_13_573/1 GAANTAGCTACCACCCTGATTAGACAAAT + &&&!.1&(6:<'67..*,:75)'77&&&5 @prefix:1_13_1259/1 CTANGACGGGTTGGCCCTTAGACGTATCT + A;A!9<<<,7:<=3=;:<&70<,=: cutadapt-4.7/tests/cut/wildcard.fa000066400000000000000000000000301457457704700172440ustar00rootroot00000000000000>1 TGCATGCA >2 TGCATGCA cutadapt-4.7/tests/cut/wildcardN.fa000066400000000000000000000000521457457704700173660ustar00rootroot00000000000000>perfect TTT >withN TTT >1mism TTTGGGGCGG cutadapt-4.7/tests/cut/wildcard_adapter.fa000066400000000000000000000000441457457704700207510ustar00rootroot00000000000000>1 >2 >3b TGGCTGGCC >4b TGGCTGGCC cutadapt-4.7/tests/cut/wildcard_adapter_anywhere.fa000066400000000000000000000000641457457704700226550ustar00rootroot00000000000000>1 TGCATGCA >2 TGCATGCA >3b TGGCTGGCC >4b TGGCTGGCC cutadapt-4.7/tests/cut/xadapter.fasta000066400000000000000000000004601457457704700200020ustar00rootroot00000000000000>r1 CGTCCGAAGTAGCTAtccgaatagaCCACCCTGATTAGACAAAT >r2 AGCCGCTACGACGGGTTGGCCCTTAGACGTATCT >r3 CAAGATCTACCCTGCCACATTGCCCTAGTTAAAC >r4 AAGATCTACCCTGCCACATTGCCCTAGTTAAAC >r5 CAAGATCTACCCTGCCACATTGCCCTAGTTAAACtccgaataga >r6 CAAGATCTACCCTGCCACATTGCCCTAGTTAAACtccga >r7 CAAGATCTACCCTGCCACATTGCCCTAGTTtccgaatagaA cutadapt-4.7/tests/data/000077500000000000000000000000001457457704700152705ustar00rootroot00000000000000cutadapt-4.7/tests/data/454.fa000066400000000000000000000227771457457704700161330ustar00rootroot00000000000000>000163_1255_2627 length=52 uaccno=E0R4ISW01DCIQD CCATCTCATCCCTGCGTGTCCCATCTGTTCCCTTCCTTGTCTCAGTGTGGTG >000652_1085_0667 length=122 uaccno=E0R4ISW01CXJXP ATTGAAGAGGTTGGTAAGTTTTAAGTTGGTAGGTGGTTGGGGAGTGGTTGGAGAGGAGTTGTTGGGAGTTTGTGTCCTGCTGAGACACGCAACGGGGATAGGCAAGGCACACAGGGGATAGG >000653_1285_1649 length=135 uaccno=E0R4ISW01DE4SJ AATTAGTCGAGCGTTGTGGTGGGTATTTGTAATTTTAGCTACTCTGAAGGCTGAGGCAGGAGAACTGCTTGAACCCGGGAGGCGGAGGTTGCTGAGACACGCAACAGGAGATAGGCAAGGCACACAGGGGATAGG >000902_0715_2005 length=92 uaccno=E0R4ISW01B03K3 GGGTGTTGAATTTAATATGTAGTATATTGATTTGTGATGATTATTTTGCCTGAGACACGCAACAGGGGTAGGCAAGGCACACAGGGGATAGG >001146_1255_0340 length=92 uaccno=E0R4ISW01DCGYU GGGTGTTGAATTTAATATGTAGTATATTGATTTGTGATGATTATTTTGCCTGAGACACGCAACAGGGGTAGGCAAGGCACACAGGGGATAGG >001210_1147_1026 length=171 uaccno=E0R4ISW01C2Z5W TAGGGAGGTGGTGAGTGTTGTGTGTTTAGATTGTGTGTGGTGGTTGGGAGTGGGAGTTGTATTTTAGGGTGTGGGTTGGGAGAGTGAAAGTTGTGGGTGTTTTGGATGGTGGGTTAGGTGGTTGTGCCTGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG >001278_1608_2022 length=109 uaccno=E0R4ISW01D7HW4 
CACACACACTCTTCCCCATACCTACTCACACACACACACACACACACAAACATACACAAATAATTCTGAGACACGCAACAGGAGATAGGCAAGGCACACAGGGGATAGG >001333_1518_1176 length=142 uaccno=E0R4ISW01DZKTM AATTGTCGTTTGATTGTTGGAAAGTAGAGGGTCGGGTTGGGGTAGATTCGAAAGGGGAATTTTGAGAAAAGAAATGGAGGGAGGTAGGAAAATTTTTTGCTGAGACACGCAACAGGGGTAGGCAAGGCACACAGGGGATAGG >001398_1584_1549 length=154 uaccno=E0R4ISW01D5DPB TAATGAAATGGAATGGAATGGAATGGAATGAAATGGAATGGAATGGAATGGAATGGAATGGAATGGAATGGAATGGAATGAAATGGAATGGAGTATAAAGGAATGGAATTACTGAGACACGCAACAGGGGAAGGCAAGGCACACAGGGGATAGG >001455_1136_2179 length=92 uaccno=E0R4ISW01C12AD GGGTGTTGAATTTAATATGTAGTATATTGATTTGTGATGATTATTTTGCCTGAGACACGCAACAGGGGTAGGCAAGGCACACAGGGGATAGG >001481_1165_0549 length=92 uaccno=E0R4ISW01C4KON GGGTGTTGAATTTAATATGTAGTATATTGATTTGTGATGATTATTTTGCCTGAGACACGCAACAGGGGTAGGCAAGGCACACAGGGGATAGG >001744_1376_3512 length=144 uaccno=E0R4ISW01DM5T2 TAAGTAGGGAAGGTTTGAGGTTGTTGGTGTTGGTAGTAGGGGTGTTTTAGTTAGGGGTTGTAGTTTGTTAAGGGAATTTTATTTGAGTTTAGAATTGAGGCTGAGACACGCAAAAGGGGATAGGCAAGGCACACAGGGGATAGG >001893_1084_1137 length=162 uaccno=E0R4ISW01CXG4Z TGTATATTTTGTTGGGTTTGTATATATTGTTAGGTGTGGTTGGTGAGTTGTATTGGTGGTGGTGTAAGGTGAGTGGAAATGGGAATGGATTGTAGATATGTTGGATTTGTGGTTTTTGGTTGAGACACGAACAGGGGATAGGCAAGGCACACAGGGGATAGG >001927_0254_0706 length=182 uaccno=E0R4ISW01AWLLG TGGAATCATCTAAGGGACACAAATAGAATCATCATTGAATGGAATCGAATGGAATCATCTAATGTACTCGAATGGAATTATTATTGAATAGAATAGAATGGAATTATCGAATGGAATCAAATGGAATGTAATGGAATGCTGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG >002007_1338_1037 length=139 uaccno=E0R4ISW01DJRTR GGGTTGTGTATTTGGATAGTATGTGGAAAATGGTATTAAAAAGAATTTGTAGTTGGATTGTTGGTGGTTATTTAGTTTTTGGGTAATGGGTAGATTCCTGAGACACGCAAAGGGATAGGCAAGGCACACAGGGGATAGG >002186_1130_0654 length=92 uaccno=E0R4ISW01C1H5C GGGTGTTGAATTTAATATGTAGTATATTGATTTGTGATGATTATTTTGCCTGAGACACGCAACAGGGGTAGGCAAGGCACACAGGGGATAGG >002282_1237_2702 length=134 uaccno=E0R4ISW01DAXWG AATTAGCCGGGCGTGATGGCGGGCGTTTGTAGTTTTAGTTATTCGGGAGGTTGAGGTAGGAGAATGGCGTGAATTCGGGAAGCGGAGTTTGCTGAGACACGCAACAGGGGTAGGCAAGGCACACAGGGGATAGG >002382_1259_0997 length=107 uaccno=E0R4ISW01DCT37 TAAGGGTTGAAGCGAGGTAGGTAGTTTGTTTGTGGTTTTGTTTCGTATTTTTGTTTCGTATCCCTGAGACACGCAACAGAGGATAGGCAAGGCACACAGGGGATAGG >002477_0657_0655 length=174 uaccno=E0R4ISW01BVY8H TTTTTGGAAAGTTGGGTGGGTATAGTTTTGAGTAGTTAGAGGTATTATAATAGTATTAGGAAGTTGAATGTGAGGGTATAAGAGTTAATTTGATTTTTCGTTGATATGTTTGTTGTTTGAAGTTAGAGTGCTGAGACACGCAACAGGAGATAGGCAAGGCACACAGGGGATAGG >003149_1553_2333 length=170 uaccno=E0R4ISW01D2OBZ TATTTAGTTTTAGTTTGTTTAGGTGGTTATAGAATACGGAGTTTATGAAGTTGATTAGGAATATTATTAGTTGAATTAAGAATTGGGAAGAGAGGGGAACGGGAAGGGACGTGAGTGATTATTATTGCTGAGACACGCAAAGGGGATAGGCAAGGCACACAGGGGATAGG >003194_1475_2845 length=101 uaccno=E0R4ISW01DVT7J TATTTTGGGTTAAGTCGGGTTTAGTTGTTAGGGCGAGAAGTTAGTTGTTGACCCCTGCTGAGACACGCAAAAGGGGATAGGCAAGGCACACAGGGGATAGG >003206_1315_0479 length=95 uaccno=E0R4ISW01DHQPD GGGTTGGATAATATGATGGTGTTGGGGAATATTTAGGTATGTGGTTTGTGGCTGAGACACGCAACAGAGGATAGGCAAGGCACACAGGGGATAGG >003271_0173_0314 length=125 uaccno=E0R4ISW01APHAK GTTTATTTGTTATTTATTTTTAGGTTTAGAAGAGTGTTTGGTATTTATTGAGGATTTAGTATTTGTTAGAAGGATTGGATTCTGAGACACGCAACAGGGGGTAGGCAAGGCACACAGGGGATAGG >003443_1737_2250 length=67 uaccno=E0R4ISW01EITSS TGTAGGTTGTGTTGTAGGTTGTCCTGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG >002633_1776_1582 length=81 uaccno=E0R4ISW01EL8JK CAGGGTGGATTGGGGAACACACAGTGTGGCCGCGTGATTCTGAGACACGCAACAGGGAAGGCAAGGCACACAGGGGATAGG >002663_0725_3154 length=126 uaccno=E0R4ISW01B1Z2S GCGTTTTATATTATAATTTAATATTTTGGAGGTTGGGTGCGGTGGTTTACGTTTGTAGTTTAGTATTTGGGAGGTTAAGGTAGCTGAGACACGCAACGGGGATAGGCAAGGCACACAGGGGATAGG >002761_1056_4055 length=121 uaccno=E0R4ISW01CU2V9 
AATTTTATTCGATTTATGTGATGATTTATTTATTTTATTTGAAGATGATTTTATTCGAGATTATTCGATGATTCCATTCCTGAGACACGCAAGGGGATAGGCAAGGCACACAGGGGATAGG >002843_0289_2275 length=122 uaccno=E0R4ISW01AZPE9 ATTGAAGAGGTTGGTAAGTTTTAAGTTGGTAGGTGGTTGGGGAGTGGTTGGAGAGGAGTTGTTGGGAGTTTGTGTCCTGCTGAGACACGCAACGGGGATAGGCAAGGCACACAGGGGATAGG >002934_1762_2177 length=92 uaccno=E0R4ISW01EK0Q7 GGGTGTTGAATTTAATATGTAGTATATTGATTTGTGATGATTATTTTGCCTGAGACACGCAACAGGGGTAGGCAAGGCACACAGGGGATAGG >003515_1711_1058 length=122 uaccno=E0R4ISW01EGIPG AATTGAATGGAATTATTATTGAATGGATTCGAATGGAATTATTATTGAATGGAATCATCGAGTGGAATCGAATGGAATCTGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG >003541_1276_1589 length=112 uaccno=E0R4ISW01DECAV TAGTTTAGGGTGGTAGTTTGGATAAGGTAGTTTTACGGTTTAGTAGTAGTAGGTTAAGTAGGAAAACTGCTGAGACACGCAAAGGGGATAGGCAAGGCACACAGGGGATAGG >003587_1522_1804 length=152 uaccno=E0R4ISW01DZXX6 AATTTATGTAGTGGAAGTAGGATATAAAGAATAGGTTAATGGATTTTGAGATATTAAAAAGAGTAGGAAATTAGTTGAGAGGTTAAGTAGTAGTTTATTTTAGCCACCCTGAGACACGCAACAGGAGATAGGCAAGGCACACAGGGGATAGG >003592_0076_0430 length=134 uaccno=E0R4ISW01AGYTC AATTAGTTAGGCGTGGTGGCGGGTGTTTGTAGTTTTAGTTATTCGGGAGGTTGAGGTAGGAGAATGTTGTGAATTTAGGAGGTGGAGTTTGCTGAGACACGCAACAGGGGAAGGCAAGGCACACAGGGGATAGG >003957_0595_0965 length=173 uaccno=E0R4ISW01BQJIV TAATATTAGGTGTCAATTTGACTGGATCGAGGGATGTGTGTCGGTGAGAGTCTCACTAGAGGTTGATATTTGAGTCGTTAGACTGGGAGAGGAAGACCGAACTGTCAAGTGTATGGGCGCCATCCAATTCTGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG >003986_1127_2937 length=103 uaccno=E0R4ISW01C1AFF TAATGGAATGGAATTTTCGGAATGGAATGGAATGGAATGGAATGGAATGGAATGGAATTACTGAGACACGCAACAGGGGAAGGCAAGGCACACAGGGGATAGG >004012_1559_1491 length=111 uaccno=E0R4ISW01D26M9 TAGTGGATATAAATGGAATGGATTGGAATGGAATGGATACGAATGGAATGGATTGGAGTGGAATGGATTGACTGAGACACGCAACAGGGGGCAAGGCACACAGGGGATAGG >004030_1508_2061 length=166 uaccno=E0R4ISW01DYPWF TACGTATATACGCGTACGCGTATACGTATATACGCGTATACGTATACGCGTACGTATATATACGCGTATACGTTTACGTACGTACGCGTATATACGTACGTATACACACACGCATATGCATACTGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG >004038_1061_2047 length=152 uaccno=E0R4ISW01CVG5D AATTGATTCGAATGGAATGGATTGGAATGGAACGGATTTGAATGGAATGGATTGGAATGGAATGGATTGAATGGAATGGATTGGAGAGGATTGGATTTGAATGGAATTCTGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG >004105_1121_0391 length=135 uaccno=E0R4ISW01C0PH1 AATTAGTTGGGCGTGGTGGCGAGTGTTTGTAATTTTAGTTATTTAGGAGGTTGAGGTAGGAGAATTATTTGAACCCGGTAGACGGAAGTTGCTGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG >004129_1618_3423 length=122 uaccno=E0R4ISW01D8ELT AATTGAATGGTATTGAAAGGTATTAATTTAGTGGAATGGAATGGAATGTATTGGAATGGAAAATAATGGAATGGAGTGCTGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG >004203_0451_0902 length=115 uaccno=E0R4ISW01BDWC4 TAGTTGGTGTGTTGTAATCGAGACGTAGTTGGTTGGTACGGGTTAGGGTTTTGATTGGGTTGTTGTGTTTGCTGAGACACGCAACATGGGATAGGCAAGGCACACAGGGGATAGG >004626_1937_0919 length=223 uaccno=E0R4ISW01E0CVD TAGAGTAGATAGTAGGGTTAGAGAAGGTAGGGTACGTTTAGTTTGTTAGTAAGGTTTAAGTTTTGGGTGGGAAAGGTTAGTGGCGGGAAGGGACGAAGGTGGTAATCGAGAGTAGATTTAGAGAAGTTTTTGAAGTGGGCGTTGGGAGTTTTCGAAGTATTGAGAGAGAGGAGCTTGTGCTGAGACATGCAACAGAGGATAGGCAAGGCACACAGGGGATAGG >004913_0641_2071 length=135 uaccno=E0R4ISW01BULRD AATTAGTCGAGCGTTGTGGTGGGTATTTGTAATTTTAGCTACTCTGAAGGCTGAGGCAGGAGAACTGCTTGAACCCGGGAGGCGGAGGTTGCTGAGACACGCAACAGGAGATAGGCAAGGCACACAGGGGATAGG >005063_0599_1983 length=127 uaccno=E0R4ISW01BQWX9 ATGTGGTGAAGATTGGTTTTAGGTGTTTTAATGTGGATTTTCAGGGGTTTTAAAAGGGTTGGGAGAGTGAAATATATATAAGGCTGAGACACGCAAAAGGGGATAGGCAAGGCACACAGGGGATAGG >005140_0759_3209 length=116 uaccno=E0R4ISW01B4ZKR TAGTATAGAGGGTTTGTGGTCGTGAGGGTGTTGATGGCGGGAGGGTTTTGATGGTAGGAGGGCCCGTGCTGTGCTGAGACACGCAACAGGGGAAGGCAAGGCACACAGGGGATAGG >005351_0883_3221 length=137 uaccno=E0R4ISW01CFVHJ 
TTAGGTGTTATAGTTGAGTGAGATGTTAGTGTTTAATGGTTTTATTTAGGTTGATGGGTTAATGAGGGGGTATTTGATAGTTTTGAAGATTTGACTGAGACACGCAACGGGGATAGGCAAGGCACACAGGGGATAGG >005380_1702_1187 length=207 uaccno=E0R4ISW01EFQC1 TAGGGTTTTTCGAGTATATATTTAGTAGTACGCTCGACTTCTCTTATATAAAGGTTTTGGTTTTTATAGGTTTTTCCATTGTGTCTGCCTGGGGGAGGGCCCTTCTCCTTCAGGATACTGTAGCTTCTCTGCGTGATAAGCCAGCATTCACGGCTTTCAGGTGCTGAGACATGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG >005568_1060_1943 length=63 uaccno=E0R4ISW01CVDWP ATAGCGTATTTCTCACCTGCTGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG >005740_1536_2697 length=159 uaccno=E0R4ISW01D06VV TAAAGAGGTGTTATTATTAGTTAGGAGAGGAGGTGGTTAGATAGTAGTGGGATTATAGGGGAATATAGAGTTGTTAGTTTAGGGATAAGGGATTGATCGATGGGTTAGGTCTCTGCTGAGACACGCAAAAGGGGATAGGCAAGGCACACAGGGGATAGG >005753_1884_3877 length=95 uaccno=E0R4ISW01EVRNB AAACTGAGTTGTGATGTTTGCATTCAACTCACAGAGTTCAACATTCCTTTAACTGAGACACGCAACAGGGTTAGGCAAGGCACACAGGGTATAGG >read_equals_adapter 1a TGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG >read_equals_start_of_adapter 1b TGAGACACGCAACAGGGGAAAG >read_equals_end_of_adapter 1c GAAAGGCAAGGCACACAGGGGATAGG >read_equals_middle_of_adapter 1d GCAACAGGGGAAAGGCAAGGCACACAGG >read_ends_with_adapter 2a GCTACTCTGAAGGCTGAGGCAGGAGAACTGCTTGAACCCGGGAGGCGTGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG >read_ends_with_start_of_adapter 2b GCTACTCTGAAGGCTGAGGCAGGAGAACTGCTTGAACCCGGGAGGCGTGAGACACGCAACAGGGGAAAGGCAAGG >read_contains_adapter_in_the_middle 3 CGTAGTTGGTTGGTACGTGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGGGGTTAGGGTTTTGATTGGGTTGT >read_starts_with_adapter 4a TGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGGAAAGGTTTTGGTTTTTATAGGTTTTT >read_starts_with_end_of_adapter 4b AACAGGGGAAAGGCAAGGCACACAGGGGATAGGAAAGGTTTTGGTTTTTATAGGTTTTT cutadapt-4.7/tests/data/E3M.fasta000066400000000000000000000066331457457704700167040ustar00rootroot00000000000000>E3MFGYR02JWQ7T length=260 xy=3946_2103 region=2 run=R_2008_01_09_16_16_00_ tcagGGTCTACATGTTGGTTAACCCGTACTGATTTGAATTGGCTCTTTGTCTTTCCAAAG GGAATTCATCTTCTTATGGCACACATAAAGGATAAATACAAGAATCTTCCTATTTACATC ACTGAAAATGGCATGGCTGAATCAAGGAATGACTCAATACCAGTCAATGAAGCCCGCAAG GATAGTATAAGGATTAGATACCATGATGGCCATCTTAAATTCCTTCTTCAAGCGATCAAG GAAGGTGTTAATTTGAAGGGGCTTa >E3MFGYR02JA6IL length=265 xy=3700_3115 region=2 run=R_2008_01_09_16_16_00_ tcagTTTTTTTTGGAAAGGAAAACGGACGTACTCATAGATGGATCATACTGACGTTAGGA AAATAATTCATAAGACAATAAGGAAACAAAGTGTAAAAAAAAAACCTAAATGCTCAAGGA AAATACATAGCCATCTGAACAGATTTCTGCTGGAAGCCACATTTCTCGTAGAACGCCTTG TTCTCGACGCTGCAATCAAGAATCACCTTGTAGCATCCCATTGAACGCGCATGCTCCGTG AGGAACTTGATGATTCTCTTTCCCAAATGcc >E3MFGYR02JHD4H length=292 xy=3771_2095 region=2 run=R_2008_01_09_16_16_00_ tcagAAAGACAAGTGGTATCAACGCAGAGTGGCCATTACGCCGGGGACTAGGTCATGTTA AGAGTGTAGCTTTGTGATGCTCTGCATCCGTCTTATGATAAAATTGAGGTTATCCTGAAA TAAAGTGTCTCAAACGATTTATTTTCCATTTATTGTATTTAATTTGAGTCCAAACTAGAT TAGAGATCTCTGTAATAAAACATGTTTGTTAGTTTAATTTCAATAACATTTAGTATTGTG TCGTAAAAAAAAAAAAAACGAAAAAAAAAAAAACAAAAAAAAAAACAAATGTACGGccgg ctagagaacg >E3MFGYR02GFKUC length=295 xy=2520_2738 region=2 run=R_2008_01_09_16_16_00_ tcagCGGCCGGGCCTCTCATCGGTGGTGGAATCACTGGCCTTGTTTACGAGGTTGTCTTT ATCAGCCACACCCACGAGCAGCTTCCCACCACTGACTACTAGAGGGGGGGAAATGAAAAA TAAAAAAAAAAAATTGTGTATTATTGAATTTCTCTGGAATCTTCTTCTGTGTATGGTTTT CCTTCCTTGTGTTTTCTTCCTAATTCACTTTCGAGGGTTGTACTTGTTCCTTTCGTCTTA AATCCTTGGATGGTTGATGATCATGAAGTTCTCTTTAAAGTTAAATTATTATCATTTTG >E3MFGYR02FTGED length=277 xy=2268_2739 region=2 run=R_2008_01_09_16_16_00_ tcagTGGTAATGGGGGGAAATTTAATTTTCTGATTTTATTATATATAGTTAATTGATGCT TTCGACGGTTTATATTTATGCGATTTGGTTTAGGTTTCAATGGAATTTTGTTGGTAGTTT ATATGATTGTATATAGTTATCAGCAACCTTATATTGTTTGCTTGCCTTTCTAGAGCACTC 
AGTGGAGATTTGAAACTTTGTTAGTGGAAAATTTGCAATTGTATGTTAATTGGAGATGGA GACAAAAAAGGAGGCAGATATTAATATTTATTTGGATATCA >E3MFGYR02FR9G7 length=256 xy=2255_0361 region=2 run=R_2008_01_09_16_16_00_ tcagCTCCGTAAGAAGGTGCTGCCCGCCGTCATCGTCCGCCAGCGCAAGCCTTGGCGCCG AAAGGACGGTGTTTACATGTACTTCGAAGATAATGCTGGTGTTATCGTGAATCCCAAGGG TGAAATGAAAGGTTCTGCTATCACTGGTCCAATTGGGAAGGAGTGTGCTGATCTGTGGCC CAGGATTGCAAGTGCTGCCAATGCTATTGTTTAAGCTAGGATTTTAGTTTTTGTAATGTT TCAGCTTCTTGAAGTTGTTTc >E3MFGYR02GAZMS length=271 xy=2468_1618 region=2 run=R_2008_01_09_16_16_00_ tcagAAAGAAGTAAGGTAAATAACAAACGACAGAGTGGCACATACTCCGGCAGTTCATGG GCAGTGACCCAGTTCAGAGAACCAAAGAACCTGAATAAGAATCTATGTCTACTGTGAATT TTGTGGCTTTCGTTGGAACGAAGGTAGCTTCGAAACAATAAAGTTATCTACTTCGCAATA TGAAGTGTTTCTGTTAGTTCTATGGTTCCTACTCCTAGCACCTCTTTTTCTTATAGAAAT GGACCACCGTGATTGGTACAAAAGNTGTACCTAGAtga >E3MFGYR02HHZ8O length=150 xy=2958_1574 region=2 run=R_2008_01_09_16_16_00_ tcagACTTTCTTCTTTACCGTAACGTTGTTAAATTATCTGAGTATATGAAGGACCCTATT TGGGTTCTATAACTACAGAACATATCTCAGTCCAATAGTGACGGAATAACAATATTATAA ACTAGTTTAACGCTTTATGAAAAAAAAAAAAAAAgaaaaaaaaacatgtcggccgctgag acacgcaacaggggataggcaaggcacacaggggataggnn >E3MFGYR02GPGB1 length=221 xy=2633_0607 region=2 run=R_2008_01_09_16_16_00_ tcagAAGCAGTGGTATCAACGCAGAGTGGCCATTACGGCCGGGTCTGATGAGTATGTGTC GAAGATCCCAAATAACAAGGTTGGTCTTGTAATTGGTAAAGGTGGAGAAACAATAAAGAA TATGCAAGCTTCAACTGGAGCAAGAATTCAGGTGATTCCTCTTCATCTTCCACCTGGTGA CACATCTACCAAAAAAAAAAAAAAAAAAAAACCAAATGTCGGCCGctgagacacgcaaca gggataggcaaggcacacaggggataggn >E3MFGYR02F7Z7G length=130 xy=2434_1658 region=2 run=R_2008_01_09_16_16_00_ tcagAATCATCCACTTTTTAACGTTTTGTTTTGTTCATCTCTTAACAACAATTCTAGGGC GACAGAGAGAGTAAGTACCCACTAACCAGTCCCCAAGTACCAAAATAACAATTTAAACAA CAAAACACAAACAGatcttatcaacaaaactcaaagttcctaactgagacacgcaacagg ggataagacaaggcacacaggggataggnnnnnnnnnnn cutadapt-4.7/tests/data/E3M.qual000066400000000000000000000206661457457704700165520ustar00rootroot00000000000000>E3MFGYR02JWQ7T length=260 xy=3946_2103 region=2 run=R_2008_01_09_16_16_00_ 23 24 26 38 31 11 27 28 25 28 22 25 27 28 36 27 32 22 33 23 27 16 40 33 18 28 28 24 25 20 26 26 37 31 10 21 27 16 36 28 32 22 27 26 28 37 30 9 28 27 26 36 29 8 33 23 37 30 9 37 30 9 34 26 32 22 28 28 28 22 33 23 28 31 21 28 26 33 23 28 27 28 28 28 21 25 37 33 16 34 28 25 28 37 33 17 28 28 27 34 27 25 30 25 26 24 34 27 34 27 23 28 36 32 14 24 28 27 27 23 26 25 27 25 36 32 18 1 27 29 21 26 24 27 31 22 27 26 26 34 26 28 27 33 26 34 26 33 26 28 26 27 27 27 27 28 19 25 25 31 23 28 28 28 27 33 26 26 26 27 18 21 35 31 12 21 28 34 28 32 26 27 27 23 25 27 28 26 34 28 34 28 27 34 28 28 26 28 26 19 32 27 28 25 27 27 26 33 25 34 28 24 28 21 30 21 37 33 16 23 12 27 18 27 18 25 34 28 24 30 22 22 23 28 27 25 26 34 28 33 26 19 6 34 28 25 25 32 27 34 28 37 33 17 25 34 28 36 32 18 2 17 24 14 17 >E3MFGYR02JA6IL length=265 xy=3700_3115 region=2 run=R_2008_01_09_16_16_00_ 24 24 26 28 45 32 22 17 12 9 5 1 36 28 40 34 15 36 27 42 35 21 6 28 34 24 27 28 28 21 28 28 28 28 25 27 28 28 28 27 36 28 27 28 28 24 28 28 28 28 28 24 28 28 36 27 28 36 28 43 36 22 10 28 19 5 36 28 28 25 28 37 28 28 12 28 33 26 28 24 11 35 26 41 34 15 27 40 33 18 28 28 24 24 44 26 17 13 10 7 6 4 2 1 22 9 27 36 33 17 27 26 26 27 28 30 22 33 26 36 33 19 4 25 18 27 24 22 24 26 31 23 27 24 28 25 25 31 23 27 27 28 26 32 28 7 27 23 24 25 26 33 25 32 24 24 34 26 25 23 27 33 29 8 25 25 26 25 26 25 27 29 20 28 26 32 24 33 25 25 29 20 24 26 28 23 25 26 26 27 25 27 27 27 18 27 28 31 23 27 31 23 27 23 27 33 27 34 27 27 26 28 26 27 28 27 37 33 15 24 33 26 27 27 18 26 25 27 27 27 25 28 26 27 25 34 28 27 24 27 25 34 28 31 23 22 34 28 26 27 27 28 27 34 
28 25 25 23 36 32 14 37 33 17 37 33 17 23 25 25 15 >E3MFGYR02JHD4H length=292 xy=3771_2095 region=2 run=R_2008_01_09_16_16_00_ 19 23 27 28 41 34 16 27 27 27 27 16 28 22 33 23 23 28 27 27 36 28 28 28 28 22 26 26 28 26 34 24 36 27 26 37 28 28 27 28 36 28 43 36 22 9 24 21 26 28 36 27 27 28 28 28 27 37 28 36 27 28 24 28 27 27 28 24 28 28 40 33 14 26 21 28 27 28 27 28 23 27 27 28 27 27 26 33 25 27 26 25 34 27 28 28 27 28 28 38 34 22 10 34 28 27 27 34 27 34 28 27 27 33 27 27 28 35 30 11 28 37 33 17 27 28 26 27 27 23 25 36 32 14 27 27 24 32 28 7 28 36 32 19 3 30 21 22 37 33 15 21 34 27 28 22 26 36 33 17 34 28 37 33 17 26 21 26 24 34 27 35 31 12 20 27 27 28 25 34 28 27 25 27 27 25 27 28 27 28 23 28 27 28 20 28 38 34 22 9 23 24 28 28 36 32 13 27 19 7 20 26 37 33 17 21 9 37 33 17 23 32 25 22 29 21 27 24 34 30 10 28 26 25 28 33 26 23 21 27 28 27 26 23 32 20 11 7 5 3 2 1 1 1 1 1 1 1 20 25 33 21 13 8 6 4 3 2 2 1 1 1 1 23 34 25 16 11 9 7 5 4 3 1 1 21 37 33 17 21 27 25 28 28 34 27 32 27 21 9 17 25 20 27 18 17 32 24 17 16 >E3MFGYR02GFKUC length=295 xy=2520_2738 region=2 run=R_2008_01_09_16_16_00_ 24 23 24 27 28 36 28 37 28 39 32 13 34 25 22 28 27 28 26 28 28 37 28 28 36 28 26 36 28 36 28 27 28 27 28 26 36 28 36 28 35 26 28 41 34 17 28 28 28 27 36 28 37 28 28 27 28 41 34 16 25 28 28 26 27 36 28 28 27 28 41 34 17 28 25 28 28 27 28 27 26 27 34 27 37 33 17 25 33 27 26 27 27 28 25 28 28 27 27 25 27 26 28 38 32 23 17 12 8 2 37 33 17 28 26 38 34 23 12 1 28 34 23 15 10 8 6 4 3 2 1 1 1 31 23 28 26 26 28 26 34 27 24 34 27 28 34 27 37 33 16 27 24 25 28 34 27 34 27 28 28 34 26 26 34 28 27 27 28 27 28 27 28 28 34 28 38 34 23 11 34 28 34 27 34 26 34 28 28 27 26 38 35 22 9 27 30 22 33 26 28 34 28 34 28 28 27 28 37 33 15 25 27 23 32 27 6 32 25 28 22 26 26 32 24 27 33 26 26 17 34 30 11 28 26 27 22 33 26 34 30 10 26 30 22 34 28 33 25 26 27 34 28 31 26 24 28 28 28 28 26 28 28 27 28 32 24 26 34 26 27 28 26 34 30 10 32 28 7 27 33 25 35 31 12 34 27 25 30 22 23 28 27 23 38 34 23 11 26 >E3MFGYR02FTGED length=277 xy=2268_2739 region=2 run=R_2008_01_09_16_16_00_ 21 24 28 24 28 35 27 28 35 28 28 44 35 24 16 9 2 41 34 17 40 34 15 34 26 43 36 22 9 28 25 26 26 41 34 20 5 26 37 28 27 27 28 28 28 28 28 28 37 28 36 28 37 28 28 28 27 26 26 38 31 11 28 24 28 28 36 27 36 29 8 26 27 28 36 29 8 27 28 27 28 28 24 34 27 5 32 22 40 33 14 28 37 28 41 34 16 28 32 24 23 34 28 34 27 38 34 22 9 27 34 28 34 27 27 26 26 36 32 13 28 27 26 28 28 25 34 26 27 28 28 27 28 23 27 28 34 26 27 25 27 26 28 23 32 24 34 28 33 26 28 26 27 27 18 25 36 32 13 27 27 32 24 27 32 25 35 31 12 27 28 26 27 21 27 27 27 26 28 28 27 26 28 33 25 22 28 28 37 33 17 26 37 33 17 20 36 32 14 28 34 27 26 27 28 34 28 38 34 22 8 37 33 15 27 28 34 27 33 26 27 26 27 28 28 33 25 34 28 34 28 34 26 24 24 28 25 34 28 28 27 25 23 38 33 24 17 11 5 34 28 25 31 26 22 27 27 27 26 22 34 26 34 27 26 24 34 30 11 19 37 33 15 34 28 27 25 28 25 27 27 >E3MFGYR02FR9G7 length=256 xy=2255_0361 region=2 run=R_2008_01_09_16_16_00_ 21 22 26 28 28 24 35 26 27 28 36 28 28 37 28 36 27 28 28 26 25 24 37 30 9 28 36 28 28 21 28 26 28 28 28 28 36 28 28 35 26 27 25 25 28 28 36 28 23 31 20 32 22 29 18 27 27 34 25 28 39 33 13 36 27 28 28 35 25 28 28 40 34 15 27 28 28 27 27 28 28 28 34 28 27 27 34 28 27 27 27 34 27 28 28 28 27 34 27 27 28 34 26 28 27 27 27 27 28 34 27 27 35 31 11 34 27 34 30 10 28 27 34 30 10 27 28 37 33 15 33 25 33 26 26 28 26 27 27 27 28 26 26 28 27 34 27 26 31 23 34 28 34 28 37 33 15 34 28 34 28 27 23 27 28 27 27 28 23 28 27 25 27 24 27 22 34 28 37 33 16 26 33 26 25 34 26 25 28 33 25 27 27 23 27 28 
28 32 24 34 27 27 27 27 28 27 29 20 27 33 28 8 32 27 23 28 25 24 34 28 26 38 34 22 9 27 26 38 34 23 13 3 27 26 34 28 26 28 36 32 14 23 28 27 20 33 25 28 30 22 26 33 25 23 34 28 23 34 30 10 27 >E3MFGYR02GAZMS length=271 xy=2468_1618 region=2 run=R_2008_01_09_16_16_00_ 18 25 28 28 40 34 17 19 33 26 21 17 34 24 31 21 28 41 34 17 28 37 28 28 41 34 17 27 27 21 28 18 24 23 26 25 31 20 28 26 27 28 23 25 27 25 33 23 30 20 28 28 26 31 21 27 28 23 38 31 11 28 28 28 28 28 26 39 33 13 28 28 35 25 28 26 28 27 28 35 26 36 27 35 31 11 28 32 24 34 28 26 25 34 28 28 34 28 24 33 25 27 27 28 26 27 27 26 27 27 27 27 27 26 27 28 34 27 38 34 22 10 25 23 32 25 28 37 33 16 26 26 29 20 33 26 27 18 27 25 23 13 32 24 27 22 24 27 34 28 27 27 36 32 14 27 27 18 26 33 29 8 28 34 27 23 26 28 27 28 27 32 24 28 27 23 34 26 25 27 27 24 34 28 26 25 27 36 32 17 25 25 27 33 27 27 27 34 28 28 28 27 25 34 28 33 27 34 28 28 27 23 25 34 28 27 27 27 28 27 34 27 20 23 38 34 24 15 7 26 22 11 28 27 23 26 36 32 14 22 34 28 28 33 27 27 30 22 25 22 24 27 34 28 34 28 26 26 27 37 33 20 6 28 0 25 28 27 24 34 28 25 28 28 27 25 26 26 >E3MFGYR02HHZ8O length=150 xy=2958_1574 region=2 run=R_2008_01_09_16_16_00_ 22 22 25 23 25 28 41 34 17 28 37 28 28 35 28 6 24 30 19 28 25 32 22 27 25 37 28 28 27 15 38 31 11 36 28 27 24 28 28 27 20 28 23 26 25 22 19 28 35 26 34 25 26 41 34 17 26 28 36 29 7 36 29 8 35 26 28 28 28 24 33 23 28 24 27 27 23 25 34 24 26 24 28 27 22 28 26 28 24 27 28 34 27 34 27 26 27 28 26 27 28 28 34 28 31 23 25 30 22 27 29 21 26 27 34 27 28 26 37 33 17 17 26 18 28 34 30 11 19 6 27 24 27 35 30 11 27 22 28 32 19 11 6 4 3 2 1 1 1 1 1 1 1 1 27 36 28 19 14 11 8 6 4 2 19 19 27 27 28 27 33 26 33 26 25 27 25 28 26 22 28 25 27 27 28 25 34 28 28 24 38 34 21 7 28 25 17 33 26 26 31 26 34 27 27 27 27 26 26 28 38 34 23 12 27 28 25 33 27 0 0 >E3MFGYR02GPGB1 length=221 xy=2633_0607 region=2 run=R_2008_01_09_16_16_00_ 21 24 27 28 36 28 28 28 26 28 28 36 28 28 27 24 28 36 27 28 28 28 23 27 27 28 28 37 28 36 27 27 37 28 28 28 37 28 36 27 41 34 17 28 28 28 28 27 28 28 28 26 28 28 28 28 28 28 28 28 28 37 28 28 27 28 39 32 13 41 34 16 28 37 28 28 34 28 34 28 34 28 34 27 27 26 34 28 27 27 34 28 34 28 34 28 27 37 33 15 34 27 28 34 28 28 28 37 33 16 28 34 26 27 37 33 16 27 34 27 26 27 27 27 28 34 28 26 23 34 27 25 34 27 28 26 34 28 27 25 28 34 27 27 33 26 34 28 27 28 34 27 27 27 27 34 28 34 27 25 26 34 27 26 24 27 28 34 27 32 24 27 31 23 28 34 27 27 25 28 27 25 27 27 27 28 27 17 32 24 35 16 8 4 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 21 9 36 31 13 24 27 26 28 34 28 34 27 19 22 23 19 28 28 26 26 20 23 22 26 34 27 25 25 36 32 17 27 27 24 24 14 21 34 27 31 23 23 28 22 27 27 28 36 32 18 2 27 27 22 25 15 0 >E3MFGYR02F7Z7G length=130 xy=2434_1658 region=2 run=R_2008_01_09_16_16_00_ 22 21 23 28 26 15 12 21 28 21 36 28 27 27 43 35 23 12 1 36 28 27 27 41 34 20 5 28 43 36 22 9 27 35 26 28 26 27 26 28 22 33 26 37 28 26 36 27 28 35 27 31 20 26 28 13 38 32 12 26 23 24 27 28 27 22 25 28 19 27 28 20 36 27 25 20 26 41 34 17 28 28 17 36 28 35 27 20 28 28 43 36 22 8 33 26 25 27 27 31 26 38 34 22 10 25 34 28 26 34 27 32 27 5 37 33 17 20 23 13 27 37 33 19 4 27 28 20 37 33 17 24 26 23 27 21 26 33 26 26 27 28 34 27 21 38 34 21 7 28 25 24 37 33 17 28 34 28 32 24 27 33 27 27 20 28 27 27 22 28 19 25 22 28 32 26 27 23 37 33 20 5 24 24 34 28 28 11 26 30 25 33 26 28 25 22 26 27 27 38 34 23 11 28 26 28 34 26 0 0 0 0 0 0 0 0 0 0 0 cutadapt-4.7/tests/data/action_lowercase.fasta000066400000000000000000000004211457457704700216260ustar00rootroot00000000000000>r1 CGtccgaantagctACCACCCTGATTAGACAAAT 
>r2 CAAGACAAGACCTGCCACAttgccCTAGTATTAA >r3 CAAGACAAGACCTGCCACATTGCCCTAGTCAAGA >r4 CAAGATGTCCCCtgccacattgccCTAGTCAAGA >r5 CAAGATGTCCCCTGCCaCATTGCCCTAGTTTATT >r6 GTTCATGTCCCCTGCCACATTGCCCTAGTTTATT >r7 ATGGCTGTCCCCTGCCACATTGCCCTAGTCAAGA cutadapt-4.7/tests/data/action_retain.fasta000066400000000000000000000002431457457704700211260ustar00rootroot00000000000000>r1 CGTCCGAAcaagCCTGCCACAT >r2 caagACAAGACCT >r3 TGCCCTAGTcaag >r4 TGCCCTAGTcaa >r5 TGTTGggttaaCCGCCTTGA >r6 ggttaaCATTGCCCTAGTTTATT >r7 ttaaGTTCATGT >r8 ACGTACGT cutadapt-4.7/tests/data/adapter-empty-name.fasta000066400000000000000000000000701457457704700217770ustar00rootroot00000000000000>adapter1 GCCGAACTTCTTAGACTGCCTTAAGGACGT > CAGGTATATCGA cutadapt-4.7/tests/data/adapter.fasta000066400000000000000000000001001457457704700177170ustar00rootroot00000000000000>adapter1 GCCGAACTTCTTAGACTGCCTTAAGGACGT >adapter2 CAGGTATATCGA cutadapt-4.7/tests/data/adapterorder.fasta000066400000000000000000000000141457457704700207570ustar00rootroot00000000000000>r AAACCGGG cutadapt-4.7/tests/data/anchored-back.fasta000066400000000000000000000001561457457704700207730ustar00rootroot00000000000000>read1 sequenceBACKADAPTER >read2 sequenceBACKADAPTERblabla >read3 sequenceBACKADA >read4 sequenceBECKADAPTER cutadapt-4.7/tests/data/anchored.fasta000066400000000000000000000001531457457704700200720ustar00rootroot00000000000000>read1 FRONTADAPTsequence >read2 blablaFRONTADAPTsequence >read3 NTADAPTsequence >read4 FRINTADAPTsequence cutadapt-4.7/tests/data/anchored_no_indels.fasta000066400000000000000000000003401457457704700221220ustar00rootroot00000000000000>no_mismatch (adapter: TTAGACATAT) TTAGACATATGAGGTCAG >one_mismatch TAAGACATATGAGGTCAG >two_mismatches TAAGACGTATGAGGTCAG >insertion ATTAGACATATGAGGTCAG >deletion TAGACATATGAGGTCAG >mismatch_plus_wildcard TNAGACGTATGAGGTCAG cutadapt-4.7/tests/data/anywhere_repeat.fastq000066400000000000000000000012031457457704700215060ustar00rootroot00000000000000@prefix:1_13_1400/1 CGTCCGAANTAGCTACCACCCTGATTAGACAAAT + )3%)&&&&!.1&(6:<'67..*,:75)'77&&&5 @prefix:1_13_1500/1 CAAGACAAGACCTGCCACATTGCCCTAGTATTAA + <=A:A=57!7<';<6?5;;6:+:=)71>70<,=: @prefix:1_13_1550/1 CAAGACAAGACCTGCCACATTGCCCTAGTCAAGA + <=A:A=57!7<';<6?5;;6:+:=)71>70<,=: @prefix:1_13_1600/1 CAAGATGTCCCCTGCCACATTGCCCTAGTCAAGA + <=A:A=57!7<';<6?5;;6:+:=)71>70<,=: @prefix:1_13_1700/1 CAAGATGTCCCCTGCCACATTGCCCTAGTTTATT + <=A:A=57!7<';<6?5;;6:+:=)71>70<,=: @prefix:1_13_1800/1 GTTCATGTCCCCTGCCACATTGCCCTAGTTTATT + <=A:A=57!7<';<6?5;;6:+:=)71>70<,=: @prefix:1_13_1900/1 ATGGCTGTCCCCTGCCACATTGCCCTAGTCAAGA + <=A:A=57!7<';<6?5;;6:+:=)71>70<,=:cutadapt-4.7/tests/data/casava.fastq000066400000000000000000000004461457457704700175720ustar00rootroot00000000000000@M123:14:000000000-AOEUI:1:1102:18260:1280 1:N:0:9 GGTGGCAGCAGCTACAGGTGTCCAGTCCCAGGTGCAGCTGGTGCAG + CC-BCEFFGGGGGFCFGFFGF<9EDGEAGFC@F<8@FE@EG,CFGG @M123:14:000000000-AOEUI:1:1102:18600:1290 1:Y:0:9 CCTGCTGCTGACCATCCCTTCCTGGGTCTTGTCCCAGATCACCTTG + CCCCCGGGGGFGGGGFGGGGGGGFGGAFG@9F@FFG=*-(976;9&:; @r2 aaaaaaaaaaCAACAGGCCACATTAGACATAT + 14&+?')+4;.;:))9;74(=70,?,- @r5 TTATTTGTCTCCAGCTTAGACA + <9(/-+3'/--?'88.*<82>. 
@r6 aaaaaaaaaaAGTGCAACGGTCCACACTGCAG + ?61&.53(7:->5)'13950-1*&/1;05+:/ cutadapt-4.7/tests/data/combinatorial.2.fastq000066400000000000000000000006501457457704700213140ustar00rootroot00000000000000@r1 ggggggggggGCTGGAGACAAATAACAGTGGAG + +:*(=<(,;,-:1/190:8.2+6-20)7>-8>3 @r2 ttttttttttTGTGGCCTGTTGCAGTGGAGTAA + )'4;/5*9)64?(>.99(2';&/9.4.<&>,*5 @r3 ggggggggggTGTTATTAATATCAAGTTGGCAG + &=0)88/)-''<9039+87)<&&*1+,:638<< @r4 ttttttttttCATCCCGTCAACATTCAAACGGC + .,(,)2,((+=;+5.091<,80),<>3<:46'. @r5 ggggggggggCAACAGGCCACATTAGACATATC + *2111.57.(&05(-2*3*)2(=?21+256(/7 @r6 GTCACTGTTCCGTCAACCAGTTT + '8/5;5)39.89>45*5;0/?9' cutadapt-4.7/tests/data/dos.fastq000066400000000000000000000004371457457704700171210ustar00rootroot00000000000000@prefix:1_13_573/1 CGTCCGAANTAGCTACCACCCTGATTAGACAAAT + )3%)&&&&!.1&(6:<'67..*,:75)'77&&&5 @prefix:1_13_1259/1 AGCCGCTANGACGGGTTGGCCCTTAGACGTATCT + ;<:&:A;A!9<<<,7:<=3=;:<&70<,=: cutadapt-4.7/tests/data/dual-index.1.fastq000066400000000000000000000002451457457704700205220ustar00rootroot00000000000000@id1 AAAAACGTACGT + zzzzzzzzzzzz @id2 CCCCACGTACGT + zzzzzzzzzzzz @id3 AAAAACGTACGT + zzzzzzzzzzzz @id4 CCCCACGTACGT + zzzzzzzzzzzz @id5 GGGGACGTACGT + zzzzzzzzzzzz cutadapt-4.7/tests/data/dual-index.2.fastq000066400000000000000000000002451457457704700205230ustar00rootroot00000000000000@id1 GGGGTGCATGCA + zzzzzzzzzzzz @id2 TTTTTGCATGCA + zzzzzzzzzzzz @id3 GGGGTGCATGCA + zzzzzzzzzzzz @id4 TTTTTGCATGCA + zzzzzzzzzzzz @id5 TTTTTGCATGCA + zzzzzzzzzzzz cutadapt-4.7/tests/data/empty.fasta000066400000000000000000000000001457457704700174340ustar00rootroot00000000000000cutadapt-4.7/tests/data/empty.fastq000066400000000000000000000000001457457704700174540ustar00rootroot00000000000000cutadapt-4.7/tests/data/example.fa000066400000000000000000000003611457457704700172330ustar00rootroot00000000000000>read1 MYSEQUENCEADAPTER >read2 MYSEQUENCEADAP >read3 MYSEQUENCEADAPTERSOMETHINGELSE >read4 MYSEQUENCEADABTER >read5 MYSEQUENCEADAPTR >read6 MYSEQUENCEADAPPTER >read7 ADAPTERMYSEQUENCE >read8 PTERMYSEQUENCE >read9 SOMETHINGADAPTERMYSEQUENCE cutadapt-4.7/tests/data/format-error.fastq000066400000000000000000000000461457457704700207470ustar00rootroot00000000000000@too_many_quality_values ACG + ##### cutadapt-4.7/tests/data/illumina.fastq.gz000066400000000000000000000157711457457704700205740ustar00rootroot00000000000000S\ےHn}WlĆ;;=x{ҏ6~w(T=*4Bvpe.vfo?nc'kMZ;e:zO8ts4ى__~KO_71hƘx] .ކ`ti^ !G7]MκkgZcw.Ħ6^,fkߏi  '>kh9x=&+bZ ZHŐ#Gz)s1QʑѦ O<cr|@ß@]N9];ۊl4|>n,Yt'I&:fy^"QXNFV$!1zO`%n &Y:$&^GwrhzeOlV 폤</]+m׋^ oL.昳/3O k?H^ DELpC p z*3~~>ʼ/tGNc}M جu3yخix@XŶh$,spؾl| cG"$g5l ,@j,;x`aѼj_Hg>%nClp9˶kG4yB0_q AG2٧LkvS,w<})<b n8,W͖|hDN$_:pQaG,ItakMB&'9% eH63B<[s$/wpga::x``Sjm=(My9"3Y,鑋<ԈN{c~ H7S.aѳC="VmKɏ&YhX$e'E B7XTo0,o;wfO8K4 yCY/yѫZ|ǙY0[ElxKʲdyՆ=D[YˋRqRA1 |u^5k}C[^z3Bq>$9B{JWl!2r1(OІ|m*P$h/{ 1a#?F07Grq]"-*lHIlWa5Y^T *@ \Sz>) T7J,>J~QN?o0DPh.궂\2ij069Ө_XivB,`$H(+wBÇrj~]!h0.Wy@ԉxL\ERIkIEeAY4Կ(D.U5VIނOmXdhPEe R@)(Tșg mz 0`zh~ _gk5!ǗSX>_56td`.y"~&PܨYѻ_E B̕ϹSCȑ9![b.Yea?BCŊr_ZSC? 
]Y7,U/`߫Ĉ2,E/Ax:#LbɬoBR\I >(f 3p$ } ހ"C$t{s5*X1[ev kQd0$`5&L醎 <3/_rPrs Xk/rey@boɇY%)[#8RR>UZYxwV~cd"6QDHSx]ʹ=]m7E#V.lvh% {ellT_<.O0"9Q$,#j ć\.e6ϒ KP DZ:T4_ogMY  N+1*)&.jc,qʿ7"!h!It慱RAdQ>'a{Irh!ͩm[㜫uoۦ\L{  ŇPa=;z_]߼%6}k V}Tk&d)vR/s)u\C3 a-VAR;o7jݼA?ۈfoXJwN ˁvMDTJ"pܑR.s\ (MuT !\`ϥhӷ葔ll#)R8t͒Oo"VCg hQTu@Crnz2tkB>("8ǹ(D0iu8 b .iw %U74!3̨ R"CWgQK^ 2>P~5v S8G;p9E=]uf\W<N#{e1s^^@&sW@z> -͔ؐ1\hD1-; h,$?mVټqcFg?̥ 6GI"q,qA2yOڬ}Hٴ"d(!r4O06(d},(EKgm`hBnnr}d~@HHZ"Є9\\Հ.Y:2^ M源J7~G:%;Yxl[$'SAf/?/i!:0jb6bYeY$wrBԐy2ϋ>ZBVB"7g^OӨrR+AG_ѴPdD$ul6O1y`V1OX䘊*Zbr~xB9†, ic0Oh:ه[x<7LloW;R|i1Фp:\׹0pl1 ;Mx4ژkkl`/\_㗶_!ڮAsSO[Y/bBGu UK J."tR}&1I܍0Zvu<+ު'눰!+6#1ۼ@O8LڌnZ H&q[ ւ vr+!S,:4sR-jVP%w@IeG 2a[$Fc~=5_=bt[,cjϒ5Kdц4edft萄%3gqM蚾ղڎҥZcM…uf Wqej\Jڼ^jl7ݑT>o9 72#SV4V2Um•]W)C!joze.8NR+Os4u{TLihȞl22azo$@Y!Bb[檅0+ä%/J{X:PJhY) }MTV*S.JGQ151CGSxɡc(Ozxo&#eRF&3z翘+~kv~m*Y3Wmr7K"qƶ~0V=ҫHTTat͈eZ-,[LB^eE]UxOm’ymZ; xœvۄU ]g#GCAjG8{c-μE]b*;|,$$+[ Ν3|s_w@S,6W\Ie@9Mɾm8%פwRC*YW*ĒRMEHEcV۞؋\lwuv:rVUoX[{XR"˩v`Ҥ}ۜxj1I5[C T . P#_Ͼ5g Ås3ӈ:RFiF3ƒIhEFmP[%x?-x*%#e>@%6$㱭գ\ ` .T}Ds=tn.mݺoz^sCy7a ggݳAty" 7?Ӓ-Vm q5(@beݣW9;s;ozuOs*L8_ܰd|q>+=f*u } -zu8Yé!mk [hzJ:e 8!JR oiVXȜ62b:]Qn47l- c .c.0Y^|ߕl79wja"qTvtAU7[owu;:Wq[q r+>;gN{5j''6|,}y ZSL%X4v:mo?|?-LnOtq1eAͻ6nWO,{xk¼|=B2\\`՞_߻KYѲ ik9fKQ&[я4/:4dћSRz qgm`U|Ӣ..n餙 [nK;6تl Jiz߽iGxNLS3a8&<~]:5@5jzEn;]&e{K&0:yr@x-qޝ 2Mqw>2a~Vˎ{.7vMM_w|udjۛ`Ms=1Z_]vCߞMk.];p8}m**Zہ3-gnQ򆍽:/3\s휊Sa,Tr, JkAťgٸt'zI֥[Y䦡q,UtYm%Sj]ΑU=8.zZ^aO&ٝ4G {ĕŖ!c㏏hQ!BH֘j\d=cGw\Zrw-:`ʉxN){5O@.l*$\*V!̔K ar k] va?J'$&;[%tObpՄz^cutadapt-4.7/tests/data/illumina5.fastq000066400000000000000000000023201457457704700202240ustar00rootroot00000000000000@SEQ:1:1101:9010:3891#0/1 adapter start: 51 ATAACCGGAGTAGTTGAAATGGTAATAAGACGACCAATCTGACCAGCAAGGGCCTAACTTCTTAGACTGCCTTAAGGACGTAAGCCAAGATGGGAAAGGTC + FFFFFEDBE@79@@>@CBCBFDBDFDDDDD<@C>ADD@B;5:978@CBDDFFDB4B?DB21;84?DDBC9DEBAB;=@<@@B@@@@B>CCBBDE98>>0@7 @SEQ:1:1101:9240:3898#0/1 CCAGCAAGGAAGCCAAGATGGGAAAGGTCATGCGGCATACGCTCGGCGCCAGTTTGAATATTAGACATAATTTATCCTCAAGTAAGGGGCCGAAGCCCCTG + GHGHGHHHHGGGDHHGDCGFEEFHHGDFGEHHGFHHHHHGHEAFDHHGFHHEEFHGHFHHFHGEHFBHHFHHHH@GGGDGDFEEFC@=D?GBGFGF:FB6D @SEQ:1:1101:9207:3899#0/1 adapter start: 64 TTAACTTCTCAGTAACAGATACAAACTCATCACGAACGTCAGAAGCAGCCTTATGGCCGTCAACGCCTAACTTCTTAGACTGCCTTAAGGACGTATACATA + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHFHHHHHHCFHHFHHFHFFFFFBHHGHHHFFHHFHGGHHDEBFGs AGGCGCTTGTAGCGTCGATTGAGTCGTGAC >r GTCACGACTCAATCGACGCTACAAGCGCCT cutadapt-4.7/tests/data/interleaved.fastq000066400000000000000000000013331457457704700206320ustar00rootroot00000000000000@read1/1 some text TTATTTGTCTCCAGCTTAGACATATCGCCT + ##HHHHHHHHHHHHHHHHHHHHHHHHHHHH @read1/2 other text GCTGGAGACAAATAACAGTGGAGTAGTTTT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH @read2/1 CAACAGGCCACATTAGACATATCGGATGGT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH @read2/2 TGTGGCCTGTTGCAGTGGAGTAACTCCAGC + ###HHHHHHHHHHHHHHHHHHHHHHHHHHH @read3/1 CCAACTTGATATTAATAACATTAGACA + HHHHHHHHHHHHHHHHHHHHHHHHHHH @read3/2 TGTTATTAATATCAAGTTGGCAGTG + #HHHHHHHHHHHHHHHHHHHHHHHH @read4/1 GACAGGCCGTTTGAATGTTGACGGGATGTT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH @read4/2 CATCCCGTCAACATTCAAACGGCCTGTCCA + HH############################ @read5 TTATTTGTCTCCAGCTTAGACATATCGCCT + #####HHHHHHHHHHHHHHHHHHHHHHHHH @read5 CAACAGGCCACATTAGACATATCGGATGGT + HHHHHHHH##HHHHHHHHHHHHHHHHHHHH cutadapt-4.7/tests/data/issue46.fasta000066400000000000000000000000201457457704700176020ustar00rootroot00000000000000>readname CGTGA cutadapt-4.7/tests/data/lengths.fa000066400000000000000000000007041457457704700172450ustar00rootroot00000000000000>trimmed_length0 TTAGACATATCTCCGTCG >trimmed_length1 
cTTAGACATATCTCCGTCG >trimmed_length2 caTTAGACATATCTCCGTCG >trimmed_length3 caaTTAGACATATCTCCGTCG >trimmed_length4 caaaTTAGACATATCTCCGTCG >trimmed_length5 caaacTTAGACATATCTCCGTCG >trimmed_length6 caaaccTTAGACATATCTCCGTCG >trimmed_length7 caaaccaTTAGACATATCTCCGTCG >trimmed_length8 caaaccagTTAGACATATCTCCGTCG >trimmed_length9 caaaccagtTTAGACATATCTCCGTCG >trimmed_length10 caaaccagttTTAGACATATCTCCGTCG cutadapt-4.7/tests/data/linked.fasta000066400000000000000000000006441457457704700175620ustar00rootroot00000000000000>r1 5' adapter and 3' adapter AAAAAAAAAACCCCCCCCCCTTTTTTTTTTGGGGGGG >r2 without any adapter GGGGGGGGGGGGGGGGGGG >r3 5' adapter, partial 3' adapter AAAAAAAAAACCCGGCCCCCTTTTT >r4 only 3' adapter GGGGGGGGGGCCCCCCCCCCTTTTTTTTTTGGGGGGG >r5 only 5' adapter AAAAAAAAAACCCCCCCCCCGGGGGGG >r6 partial 5' adapter AAAAAACCCCCCCCCCTTTTTTTTTTGGGGGGG >r7 5' adapter plus preceding bases AACCGGTTTTAAAAAAAAAACCCCCCCCCCTTTTTTTTTTGGGGGGG cutadapt-4.7/tests/data/lowqual.fastq000066400000000000000000000001151457457704700200110ustar00rootroot00000000000000@first_sequence SEQUENCE1 + ######### @second_sequence SEQUENCE2 + ######### cutadapt-4.7/tests/data/maxee.fastq000066400000000000000000000001261457457704700174260ustar00rootroot00000000000000@empty + @ee_1 A + ! @ee_0.8 ACGTTGCA + ++++++++ @ee_1.01 TGGACGTTGCA + +5+++++++++ cutadapt-4.7/tests/data/maxn.fasta000066400000000000000000000000621457457704700172510ustar00rootroot00000000000000>r1 >r2 N >r3 AAAA >r4 AAAAN >r5 AAANN >r6 AANNN cutadapt-4.7/tests/data/multi.fasta000066400000000000000000000006751457457704700174520ustar00rootroot00000000000000>r1 ATAACCGGAGTAGTTGAAATGGTAATAAGACGACCAATCTGACCAGCAAGG >r2 ACGACGCAATGGAGAAAGACGGAGAGCG >r3 CCATCCAAAGGATAAACATCATAGGCAGTCGGGAGGGTAGTCGGAA >r4 GTGGCAAGTCTGCCGCTGATAAAGGAAAGGATACTCGTGATTATCTTGC >r5 ATTAGAGCCAATACCATCAGCTTTACCG >r6 ATAAAGGAAAGGATACTCGTGATTATCT >r7 GTACGGATTGTTCAGTAACTTGACTCATGATTTCTTACCTATTAGTGGTTGAAC >r8 ACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAGC >r9 TATTAAGCTCATTCAGGCTTCTGCCGTTTTGGATTTAACCGAAGATGAT >r10 TCTGTTGAACACGACCAGAA cutadapt-4.7/tests/data/multiblock.fastq.bz2000066400000000000000000000004451457457704700211740ustar00rootroot00000000000000BZh91AY&SY=߀h P@ TP 5O)PT5\j~֮"E74<ܑN$"j@BZh91AY&SYݟ߀@# P@0U?CSi<"yh U?d@d =Ee9lN$hY@ptu̔צj.\%(0r{tf֎ 5{7-Mҩ,,)mS%^@yb" œ(*\8i!UTE ܑN$4wgcutadapt-4.7/tests/data/multiblock.fastq.gz000066400000000000000000000004061457457704700211140ustar00rootroot00000000000000/Ns((JMˬ2747577rvqvvwt qtwqtv"w 1KKXUS  4̬ltM5́\8 [/NUͫ@aoQSBJB^ҞXW5 ALq,Pb0c@Q-jX bV<m0 jS"]fZfY}ߺpŅuÏےACXy`|LZ9e _fB:cutadapt-4.7/tests/data/nextseq.fastq000066400000000000000000000013201457457704700200130ustar00rootroot00000000000000@NS500350:251:HLM7JBGXX:1:11101:12075:1120 1:N:0:TACAGC GATCGGAAGAGCACACGTCTGAACTCCAGTCACTACAGCATCTCGTATTCCGTCTTCTGCTTGAAAAAAAAAAAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG + AAAAAEEEEEEAEEEEAEAEEEEEEAEEEEEEEEEEEEEEE///E/EE////AAEE/E//////EEEEEEE6///////E6EEA/AEAEAE6EEEEEEEEEEEEAEAA/E/EEEEA//EEEEEAEAEE/EEEAEEEE3p_orig TGAACATAGCTTAGACATATAACCG >3p_mism TGAACATAGCTTACACATATAACCG >3p_del TGAACATAGCTTAACATATAACCG >3p_ins TGAACATAGCTTAGGACATATAACCG >3p_frontins TAGACATATAACCG >5p_orig TCCTCGAGATTGCCATACTGCTTCTCGAA >5p_mism TCCTCGAGATAGCCATACTGCTTCTCGAA >5p_del TCCTCGAGATGCCATACTGCTTCTCGAA >5p_ins TCCTCGAGATATGCCATACTGCTTCTCGAA cutadapt-4.7/tests/data/onlycomment.fasta000066400000000000000000000000351457457704700206520ustar00rootroot00000000000000# just a comment, zero reads 
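The FASTA/FASTQ files under tests/data above are read by the test modules further below through the datapath() helper. As a minimal sketch (not part of the archive; it assumes only the dnaio library, which the test modules themselves import), one of these inputs can be loaded like this:

import dnaio

# dnaio yields SequenceRecord objects (name/sequence/qualities) and
# transparently handles the gzip-compressed variants of these files too.
with dnaio.open("tests/data/simple.fasta") as reader:
    for record in reader:
        print(record.name, record.sequence)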
cutadapt-4.7/tests/data/overlapb.fa000066400000000000000000000012171457457704700174130ustar00rootroot00000000000000>adaptlen18 TTAGACATATCTCCGTCGATACTTACCCGTA >adaptlen17 TAGACATATCTCCGTCGATACTTACCCGTA >adaptlen16 AGACATATCTCCGTCGATACTTACCCGTA >adaptlen15 GACATATCTCCGTCGATACTTACCCGTA >adaptlen14 ACATATCTCCGTCGATACTTACCCGTA >adaptlen13 CATATCTCCGTCGATACTTACCCGTA >adaptlen12 ATATCTCCGTCGATACTTACCCGTA >adaptlen11 TATCTCCGTCGATACTTACCCGTA >adaptlen10 ATCTCCGTCGATACTTACCCGTA >adaptlen9 TCTCCGTCGATACTTACCCGTA >adaptlen8 CTCCGTCGATACTTACCCGTA >adaptlen7 TCCGTCGATACTTACCCGTA >adaptlen6 CCGTCGATACTTACCCGTA >adaptlen5 CGTCGATACTTACCCGTA >adaptlen4 GTCGATACTTACCCGTA >adaptlen3 TCGATACTTACCCGTA >adaptlen2 CGATACTTACCCGTA >adaptlen1 GATACTTACCCGTA >adaptlen0 ATACTTACCCGTA cutadapt-4.7/tests/data/paired.1.fastq000066400000000000000000000005571457457704700177420ustar00rootroot00000000000000@read1/1 some text TTATTTGTCTCCAGCTTAGACATATCGCCT + ##HHHHHHHHHHHHHHHHHHHHHHHHHHHH @read2/1 CAACAGGCCACATTAGACATATCGGATGGT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH @read3/1 CCAACTTGATATTAATAACATTAGACA + HHHHHHHHHHHHHHHHHHHHHHHHHHH @read4/1 GACAGGCCGTTTGAATGTTGACGGGATGTT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH @read5 TTATTTGTCTCCAGCTTAGACATATCGCCT + #####HHHHHHHHHHHHHHHHHHHHHHHHH cutadapt-4.7/tests/data/paired.2.fastq000066400000000000000000000005541457457704700177400ustar00rootroot00000000000000@read1/2 other text GCTGGAGACAAATAACAGTGGAGTAGTTTT + HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH @read2/2 TGTGGCCTGTTGCAGTGGAGTAACTCCAGC + ###HHHHHHHHHHHHHHHHHHHHHHHHHHH @read3/2 TGTTATTAATATCAAGTTGGCAGTG + #HHHHHHHHHHHHHHHHHHHHHHHH @read4/2 CATCCCGTCAACATTCAAACGGCCTGTCCA + HH############################ @read5 CAACAGGCCACATTAGACATATCGGATGGT + HHHHHHHH##HHHHHHHHHHHHHHHHHHHH cutadapt-4.7/tests/data/polya.1.fasta000066400000000000000000000003501457457704700175710ustar00rootroot00000000000000>polyA AAACTTCAGAACAGAAAAAAAAAAAAAAAAAAAAA >polyAlong CTTAGTTCAATWTTAACCAAACTTCAGAACAGAAAAAAAAAAAAAAAAAAAAAGAAAAAAAAAAAAAAAAAAAA >polyA2 AAACTTAACAAGAACAAGAAAAAAAAAAAAAAAAAAAAA >nopoly GAAGAGTATCTCTCTGTCCTCTTGTCCGGCGTTACAGTAATGATCG cutadapt-4.7/tests/data/polya.2.fasta000066400000000000000000000003251457457704700175740ustar00rootroot00000000000000>polyA TTTTTTTTTTTTTTTTTTTTTCTGTTCTGAAGTTT >polyAlong TTTTTTTTTTTTTTTTTTTTCTTTTTTTTTTTTTTTTTTTTTCTGTTCTGAAGTTTGGTTAAWATTGAACTAAG >polyA2 TTTTTTTTTTTTTTTTTTTTTCTTGTTCTTGTTAAGTTT >nopoly TCTGAAGTTTGGTTAAWATTGAACTAA cutadapt-4.7/tests/data/prefix-adapter.fasta000066400000000000000000000000331457457704700212170ustar00rootroot00000000000000>prefixadapter ^FRONTADAPT cutadapt-4.7/tests/data/rest.fa000066400000000000000000000003421457457704700165540ustar00rootroot00000000000000>read1 TESTINGADAPTERREST1 >read2 TESTINGADAPTERRESTING >read3 TESTINGADAPTER >read4 TESTINGADAPTERRESTLESS >read5 TESTINGADAPTERRESTORE >read6 ADAPTERSOMETHING >read7 DAPTERSOMETHING >read8 RESTADAPTERSOMETHING >read9 NOREST cutadapt-4.7/tests/data/rest.txt000066400000000000000000000001071457457704700170040ustar00rootroot00000000000000REST1 read1 RESTING read2 RESTLESS read4 RESTORE read5 SOMETHING read8 cutadapt-4.7/tests/data/restfront.txt000066400000000000000000000001211457457704700200510ustar00rootroot00000000000000TESTING read1 TESTING read2 TESTING read3 TESTING read4 TESTING read5 REST read8 cutadapt-4.7/tests/data/revcomp.1.fastq000066400000000000000000000007701457457704700201460ustar00rootroot00000000000000@read1/1 ttatttgtctCCAGCTTAGACATATCGCCT + (8J181J(J1G=1C(C1=J1J=C(18C(1( @read2/1 CAACAGGCCACATTAGACATATCGGATGGTagacaaataa + 
8JJJC(1C=88CJ=JJ1(88C=CG((=C81GJG8=J=8=( @read3/1 tccgcactggCCAACTTGATATTAATAACATTagacaXataa + 1GC((GCG=(111C1=8(G(8CJ8118=C8J=88G(J=8(=8 @read4/1 GACAGGCCGTTTGAATGTTGACGGGATGTT + 11=CCJJ(G===1==J8G1=8=J=CCGJ(J @read5 ttatttgtctCCAGCTTAGACATATCGCCT + =GG=1GC(JG(J=1(G(J1GCC==CCG(=G @read6 CAACATTTCGATCATATCTGACTTCTTCCTCATagacaaataa + J(1J=88J(1=(=J1GJ(G=CGJJC(8G=1G118JC8(CC8(( cutadapt-4.7/tests/data/revcomp.2.fastq000066400000000000000000000007421457457704700201460ustar00rootroot00000000000000@read1/2 GCTGGAGACAAATAACAGTGGAGTAGTTTT + 0FIF<Mcutadapt-4.7/tests/data/simple.fasta000066400000000000000000000001201457457704700175720ustar00rootroot00000000000000# a comment # another one >first_sequence SEQUENCE1 >second_sequence SEQUEN CE2 cutadapt-4.7/tests/data/simple.fasta.gz000066400000000000000000000001431457457704700202160ustar00rootroot00000000000000Usimple.fastaSVHTHM+RVH/H-RKK,*./N-,MKN v usv5+NMKArv54m70<,=: cutadapt-4.7/tests/data/small.fastq.bz2000066400000000000000000000003361457457704700201360ustar00rootroot00000000000000BZh91AY&SY`kH߀@# P@0"ii?T&42F ɡT!E r5MAV`jd_v5}#[ (!|R[O.֑+ݶV-l)87drZÅyfEW -wU XD$V* $DQ!DB#{G)„Xcutadapt-4.7/tests/data/small.fastq.gz000066400000000000000000000003321457457704700200550ustar00rootroot00000000000000Osmall.fastq]1n0 @]Pn '#:ngjlE"~|nie)2o*">bfp2Dݔ/0Mo#Bz_pO|qEi撪6|ؤم0".D4RkAAR|SfW\ܖ;~Ȼm\3t@}eͥ,x:@F%cutadapt-4.7/tests/data/small.fastq.xz000066400000000000000000000004041457457704700200760ustar00rootroot000000000000007zXZִF!t/]  FgB!1,h"zag|w\.C 9f9bfWRr,(_MLz)U0Au:1elZ(o2t{|B_ED⽛(a=ysV+QH Ѷ|FhYRWPȵV.q ei}{&9<4Z_d",gYZcutadapt-4.7/tests/data/suffix-adapter.fasta000066400000000000000000000000341457457704700212270ustar00rootroot00000000000000>suffixadapter BACKADAPTER$ cutadapt-4.7/tests/data/suffix.fastq000066400000000000000000000002251457457704700176330ustar00rootroot00000000000000@Read1 extra AAAAAAAAAATTTTTTTTCCC + FFGHHHHIJJJJJIIJJJGGH @Read2 AAAAGGGGGGGGCCC + JIJJGBGGHEGHJII @Read3 AAAAAAAAAAAAAAAAAAA + JGFHCG<trimmed_length6 caaacc >trimmed_length7 caaacca >trimmed_length8 caaaccag >trimmed_length9 caaaccagt >trimmed_length10 caaaccagtt cutadapt-4.7/tests/data/tooshort.fa000066400000000000000000000001441457457704700174600ustar00rootroot00000000000000>trimmed_length0 >trimmed_length1 c >trimmed_length2 ca >trimmed_length3 caa >trimmed_length4 caaa cutadapt-4.7/tests/data/tooshort.noprimer.fa000066400000000000000000000001651457457704700213150ustar00rootroot00000000000000>read_length0a >read_length0b >read_length1 >read_length2 2 >read_length3 02 >read_length4 302 >read_length5 3302 cutadapt-4.7/tests/data/trim-n.fasta000066400000000000000000000000151457457704700175120ustar00rootroot00000000000000>r NNACGTNNN cutadapt-4.7/tests/data/trimN3.fasta000066400000000000000000000000611457457704700174610ustar00rootroot00000000000000>read1 CAGTCGGTCCTGAGAGATGGGCGAGCGCTGGNANNNNNNNG cutadapt-4.7/tests/data/trimN5.fasta000066400000000000000000000000611457457704700174630ustar00rootroot00000000000000>read1 NGGCCTGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCAG cutadapt-4.7/tests/data/twoadapters.fasta000066400000000000000000000004071457457704700206460ustar00rootroot00000000000000>read1 GATCCTCCTGGAGCTGGCTGATACCAGTATACCAGTGCTGATTGTTGAATTTCAGGAATTTCTCAAGCTCGGTAGC >read2 CTCGAGAATTCTGGATCCTCTCTTCTGCTACCTTTGGGATTTGCTTGCTCTTGGTTCTCTAGTTCTTGTAGTGGTG >read3 (no adapter) AATGAAGGTTGTAACCATAACAGGAAGTCATGCGCATTTAGTCGAGCACGTAAGTTCATACGGAAATGGGTAAG cutadapt-4.7/tests/data/underscore_fastq.gz000066400000000000000000000003321457457704700211770ustar00rootroot00000000000000Osmall.fastq]1n0 @]Pn 
'#:ngjlE"~|nie)2o*">bfp2Dݔ/0Mo#Bz_pO|qEi撪6|ؤم0".D4RkAAR|SfW\ܖ;~Ȼm\3t@}eͥ,x:@F%cutadapt-4.7/tests/data/wildcard.fa000066400000000000000000000000501457457704700173640ustar00rootroot00000000000000>1 ANGTACGTTGCATGCA >2 ACGTANGTTGCATGCA cutadapt-4.7/tests/data/wildcardN.fa000066400000000000000000000000701457457704700175040ustar00rootroot00000000000000>perfect TTTGGGGGGG >withN TTTGGNGGGG >1mism TTTGGGGCGG cutadapt-4.7/tests/data/wildcard_adapter.fa000066400000000000000000000001441457457704700210700ustar00rootroot00000000000000>1 ACGTAAAACGTTGCATGCA >2 ACGTGGGACGTTGCATGCA >3b TGGCTGGCCACGTCCCACGTAA >4b TGGCTGGCCACGTTTTACGTCC cutadapt-4.7/tests/data/withplus.fastq000066400000000000000000000001541457457704700202070ustar00rootroot00000000000000@first_sequence SEQUENCE1 +this is different :6;;8<=:< @second_sequence SEQUENCE2 +also different 83r1 CGTCCGAAGTAGCTAtccgaatagaCCACCCTGATTAGACAAAT >r2 tccgaatagaAGCCGCTACGACGGGTTGGCCCTTAGACGTATCT >r3 atagaCAAGATCTACCCTGCCACATTGCCCTAGTTAAAC >r4 CtccgaatagaAAGATCTACCCTGCCACATTGCCCTAGTTAAAC >r5 CAAGATCTACCCTGCCACATTGCCCTAGTTAAACtccgaataga >r6 CAAGATCTACCCTGCCACATTGCCCTAGTTAAACtccga >r7 CAAGATCTACCCTGCCACATTGCCCTAGTTtccgaatagaA cutadapt-4.7/tests/test_adapters.py000066400000000000000000000440231457457704700175760ustar00rootroot00000000000000import pytest from dnaio import SequenceRecord from cutadapt.adapters import ( RemoveAfterMatch, RemoveBeforeMatch, FrontAdapter, BackAdapter, PrefixAdapter, SuffixAdapter, RightmostFrontAdapter, LinkedAdapter, MultipleAdapters, IndexedPrefixAdapters, IndexedSuffixAdapters, NonInternalFrontAdapter, ) def test_back_adapter_absolute_number_of_errors(): adapter = BackAdapter( sequence="GATCGGAAGA", max_errors=1, min_overlap=3, ) assert adapter.max_error_rate == 1 / 10 def test_back_adapter_absolute_number_of_errors_with_wildcards(): adapter = BackAdapter( sequence="NNNNNNNNNNGATCGGAAGA", max_errors=1, ) assert adapter.max_error_rate == 1 / 10 def test_front_adapter_partial_occurrence_in_back(): adapter = FrontAdapter("CTGAATT", max_errors=0, min_overlap=4) assert adapter.match_to("GGGGGCTGAA") is None def test_back_adapter_partial_occurrence_in_front(): adapter = BackAdapter("CTGAATT", max_errors=0, min_overlap=4) assert adapter.match_to("AATTGGGGGGG") is None def test_front_adapter_finds_leftmost_match(): adapter = FrontAdapter("CTGAATT", max_errors=1, min_overlap=3) match = adapter.match_to("GGCTGAATTGGG") assert match.astart == 0 assert match.astop == 7 assert match.rstart == 2 assert match.rstop == 9 assert match.errors == 0 match = adapter.match_to("GGCTGAATTGGGCTGAATTGGG") assert match.astart == 0 assert match.astop == 7 assert match.rstart == 2 assert match.rstop == 9 assert match.errors == 0 match = adapter.match_to("GGCTGAATTGGGCTGTATTGGG") assert match.astart == 0 assert match.astop == 7 assert match.rstart == 2 assert match.rstop == 9 assert match.errors == 0 match = adapter.match_to("GGCTTAATTGGGCTGAATTGGG") assert match.astart == 0 assert match.astop == 7 assert match.rstart == 2 assert match.rstop == 9 assert match.errors == 1 def test_rightmost_front_adapter(): adapter = RightmostFrontAdapter("CTGAATT", max_errors=1, min_overlap=3) match = adapter.match_to("GGCTGAATTGGG") assert match.astart == 0 assert match.astop == 7 assert match.rstart == 2 assert match.rstop == 9 assert match.errors == 0 match = adapter.match_to("GGCTGAATTGGGCTGAATTGGG") assert match.astart == 0 assert match.astop == 7 assert match.rstart == 12 assert match.rstop == 19 assert match.errors == 0 match = 
adapter.match_to("GGCTGAATTGGGCTGTATTGGG") assert match.astart == 0 assert match.astop == 7 assert match.rstart == 12 assert match.rstop == 19 assert match.errors == 1 match = adapter.match_to("GGCTTAATTGGGCTGAATTGGG") assert match.astart == 0 assert match.astop == 7 assert match.rstart == 12 assert match.rstop == 19 assert match.errors == 0 def test_rightmost_front_adapter_partial_occurrence(): adapter = RightmostFrontAdapter("TTTTACGT") match = adapter.match_to("ACGTAAAAAAAA") assert match is not None assert match.astart == 4 assert match.astop == 8 assert match.rstart == 0 assert match.rstop == 4 def test_wildcards(): # issue 52 adapter = BackAdapter( sequence="GAACTCCAGTCACNNNNN", max_errors=0.12, min_overlap=5, read_wildcards=False, adapter_wildcards=True, ) sequence = "CCCCAGAACTACAGTCCCGGC" am = RemoveAfterMatch( astart=0, astop=17, rstart=5, rstop=21, score=15, errors=2, adapter=adapter, sequence=sequence, ) assert am.wildcards() == "GGC" """ The result above should actually be 'CGGC' since the correct alignment is this one: adapter GAACTCCAGTCACNNNNN mismatches X X read CCCCAGAACTACAGTC-CCGGC Since we do not keep the alignment, guessing 'GGC' is the best we can currently do. """ def test_issue_80(): # This issue was at the time not considered to be an actual issue with the alignment # algorithm. The following alignment with three errors was found because it had more # matches than the 'obvious' one: # # TCGTATGCCGTCTTC # =========X==XX= # TCGTATGCCCTC--C # # The alignment algorithm has since been changed so that not the number of matches # is relevant, but a score that penalizes indels. Now, the resulting alignment # should be this one (with only two errors): # # TCGTATGCCGTCTTC # =========X==X # TCGTATGCCCTCC adapter = BackAdapter( sequence="TCGTATGCCGTCTTC", max_errors=0.2, min_overlap=3, read_wildcards=False, adapter_wildcards=False, ) result = adapter.match_to("TCGTATGCCCTCC") assert result.errors == 2, result assert result.astart == 0, result assert result.astop == 13, result def test_back_adapter_indel_and_exact_occurrence(): adapter = BackAdapter( sequence="GATCGGAAGA", max_errors=0.1, min_overlap=3, ) match = adapter.match_to("GATCGTGAAGAGATCGGAAGA") # We want the leftmost match of these two possible ones: # GATCGTGAAGAGATCGGAAGA # GATCG-GAAGA # GATCGGAAGA assert match.astart == 0 assert match.astop == 10 assert match.rstart == 0 assert match.rstop == 11 assert match.errors == 1 assert match.score == 8 def test_back_adapter_indel_and_mismatch_occurrence(): adapter = BackAdapter( sequence="GATCGGAAGA", max_errors=0.1, min_overlap=3, ) match = adapter.match_to("CTGGATCGGAGAGCCGTAGATCGGGAGAGGC") # CTGGATCGGA-GAGCCGTAGATCGGGAGAGGC # ||||||| || ||||||X||| # GATCGGAAGA GATCGGAAGA assert match.astart == 0 assert match.astop == 10 assert match.rstart == 3 assert match.rstop == 12 assert match.score == 7 assert match.errors == 1 def test_str(): a = BackAdapter("ACGT", max_errors=0.1) str(a) str(a.match_to("TTACGT")) def test_prefix_adapter_with_indels_one_mismatch(): a = PrefixAdapter( sequence="GCACATCT", max_errors=0.15, min_overlap=1, read_wildcards=False, adapter_wildcards=False, indels=True, ) # GCACATCGGAA # |||||||X # GCACATCT result = a.match_to("GCACATCGGAA") assert result.astart == 0 assert result.astop == 8 assert result.rstart == 0 assert result.rstop == 8 assert result.score == 6 # 7 matches, 1 mismatch assert result.errors == 1 def test_prefix_adapter_with_indels_two_mismatches(): a = PrefixAdapter( sequence="GCACATTT", max_errors=0.3, min_overlap=1, 
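        # 0.3 * 8 = 2.4, so up to two errors are tolerated on this 8 bp
        # adapter; the alignment asserted below has exactly two mismatches.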
read_wildcards=False, adapter_wildcards=False, indels=True, ) result = a.match_to("GCACATCGGAA") # GCACATCGGAA # ||||||XX # GCACATTT assert result.astart == 0 assert result.astop == 8 assert result.rstart == 0 assert result.rstop == 8 assert result.score == 4 assert result.errors == 2 def test_linked_adapter(): front_adapter = PrefixAdapter("AAAA", min_overlap=4) back_adapter = BackAdapter("TTTT", min_overlap=3) linked_adapter = LinkedAdapter( front_adapter, back_adapter, front_required=True, back_required=False, name="name", ) assert linked_adapter.front_adapter.min_overlap == 4 assert linked_adapter.back_adapter.min_overlap == 3 read = SequenceRecord(name="seq", sequence="AAAACCCCCTTTT") trimmed = linked_adapter.match_to(read.sequence).trimmed(read) assert trimmed.name == "seq" assert trimmed.sequence == "CCCCC" def test_linked_adapter_statistics(): # Issue #615 front_adapter = PrefixAdapter("GGG") back_adapter = BackAdapter("ACGACGACGACG") la = LinkedAdapter( front_adapter, back_adapter, front_required=True, back_required=False, name="name", ) statistics = la.create_statistics() match = la.match_to("GGGTTTTTACGACTACGACG") statistics.add_match(match) front, back = statistics.end_statistics() assert back.errors.get(12) == {1: 1} assert front.errors.get(3) == {0: 1} def test_linked_matches_property(): """Accessing matches property of non-anchored linked adapters""" # Issue #265 front_adapter = FrontAdapter("GGG") back_adapter = BackAdapter("TTT") la = LinkedAdapter( front_adapter, back_adapter, front_required=False, back_required=False, name="name", ) assert la.match_to("AAAATTTT").score == 3 def test_info_record(): adapter = BackAdapter( sequence="GAACTCCAGTCACNNNNN", max_errors=0.12, min_overlap=5, read_wildcards=False, adapter_wildcards=True, name="Foo", ) read = SequenceRecord(name="abc", sequence="CCCCAGAACTACAGTCCCGGC") am = RemoveAfterMatch( astart=0, astop=17, rstart=5, rstop=21, score=15, errors=2, adapter=adapter, sequence=read.sequence, ) assert am.get_info_records(read) == [ [ "", 2, 5, 21, "CCCCA", "GAACTACAGTCCCGGC", "", "Foo", "", "", "", ] ] def test_random_match_probabilities(): a = BackAdapter("A", max_errors=0.1).create_statistics() assert a.end.random_match_probabilities(0.5) == [1, 0.25] assert a.end.random_match_probabilities(0.2) == [1, 0.4] for s in ("ACTG", "XMWH"): a = BackAdapter(s, max_errors=0.1).create_statistics() assert a.end.random_match_probabilities(0.5) == [ 1, 0.25, 0.25**2, 0.25**3, 0.25**4, ] assert a.end.random_match_probabilities(0.2) == [ 1, 0.4, 0.4 * 0.1, 0.4 * 0.1 * 0.4, 0.4 * 0.1 * 0.4 * 0.1, ] a = FrontAdapter("GTCA", max_errors=0.1).create_statistics() assert a.end.random_match_probabilities(0.5) == [ 1, 0.25, 0.25**2, 0.25**3, 0.25**4, ] assert a.end.random_match_probabilities(0.2) == [ 1, 0.4, 0.4 * 0.1, 0.4 * 0.1 * 0.4, 0.4 * 0.1 * 0.4 * 0.1, ] def test_add_adapter_statistics(): stats = BackAdapter("A", name="name", max_errors=0.1).create_statistics() end_stats = stats.end end_stats.adjacent_bases["A"] = 7 end_stats.adjacent_bases["C"] = 19 end_stats.adjacent_bases["G"] = 23 end_stats.adjacent_bases["T"] = 42 end_stats.adjacent_bases[""] = 45 end_stats.errors[10][0] = 100 end_stats.errors[10][1] = 11 end_stats.errors[10][2] = 3 end_stats.errors[20][0] = 600 end_stats.errors[20][1] = 66 end_stats.errors[20][2] = 6 stats2 = BackAdapter("A", name="name", max_errors=0.1).create_statistics() end_stats2 = stats2.end end_stats2.adjacent_bases["A"] = 43 end_stats2.adjacent_bases["C"] = 31 end_stats2.adjacent_bases["G"] = 27 
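    # The errors dicts of stats and stats2 overlap only partially on purpose:
    # errors[10] is set in both objects, errors[15] only in stats2 and
    # errors[20] only in stats, so the += merge below is exercised for
    # shared and disjoint keys alike.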
end_stats2.adjacent_bases["T"] = 8 end_stats2.adjacent_bases[""] = 5 end_stats2.errors[10][0] = 234 end_stats2.errors[10][1] = 14 end_stats2.errors[10][3] = 5 end_stats2.errors[15][0] = 90 end_stats2.errors[15][1] = 17 end_stats2.errors[15][2] = 2 stats += stats2 r = stats.end assert r.adjacent_bases == {"A": 50, "C": 50, "G": 50, "T": 50, "": 50} assert r.errors == { 10: {0: 334, 1: 25, 2: 3, 3: 5}, 15: {0: 90, 1: 17, 2: 2}, 20: {0: 600, 1: 66, 2: 6}, } @pytest.mark.parametrize("adapter_class", [PrefixAdapter, SuffixAdapter]) def test_no_indels_empty_read(adapter_class): # Issue #376 adapter = adapter_class("ACGT", indels=False) adapter.match_to("") def test_prefix_adapter_match_with_n_wildcard_in_read(): adapter = PrefixAdapter("NNNACGT", indels=False) match = adapter.match_to("TTTACGTAAAA") assert match is not None and (0, 7) == (match.rstart, match.rstop) match = adapter.match_to("NTTACGTAAAA") assert match is not None and (0, 7) == (match.rstart, match.rstop) def test_suffix_adapter_match_with_n_wildcard_in_read(): adapter = SuffixAdapter("ACGTNNN", indels=False) match = adapter.match_to("TTTTACGTTTT") assert match is not None and (4, 11) == (match.rstart, match.rstop) match = adapter.match_to("TTTTACGTCNC") assert match is not None and (4, 11) == (match.rstart, match.rstop) def test_multiple_adapters(): a1 = BackAdapter("GTAGTCCCGC") a2 = BackAdapter("GTAGTCCCCC") ma = MultipleAdapters([a1, a2]) match = ma.match_to("ATACCCCTGTAGTCCCC") assert match.adapter is a2 def test_indexed_prefix_adapters(): adapters = [ PrefixAdapter("GAAC", indels=False), PrefixAdapter("TGCT", indels=False), ] ma = IndexedPrefixAdapters(adapters) match = ma.match_to("GAACTT") assert match.adapter is adapters[0] match = ma.match_to("TGCTAA") assert match.adapter is adapters[1] assert ma.match_to("GGGGGGG") is None def test_indexed_prefix_adapters_incorrect_type(): with pytest.raises(ValueError): IndexedPrefixAdapters( [ PrefixAdapter("GAAC", indels=False), SuffixAdapter("TGCT", indels=False), ] ) def test_indexed_very_similar(caplog): IndexedPrefixAdapters( [ PrefixAdapter("GAAC", max_errors=1, indels=False), PrefixAdapter("GAAG", max_errors=1, indels=False), ] ) assert "cannot be assigned uniquely" in caplog.text def test_indexed_too_high_k(): with pytest.raises(ValueError) as e: IndexedPrefixAdapters( [ PrefixAdapter("ACGTACGT", max_errors=3, indels=False), PrefixAdapter("AAGGTTCC", max_errors=2, indels=False), ] ) assert "Error rate too high" in e.value.args[0] def test_indexed_suffix_adapters(): adapters = [ SuffixAdapter("GAAC", indels=False), SuffixAdapter("TGCT", indels=False), ] ma = IndexedSuffixAdapters(adapters) match = ma.match_to("TTGAAC") assert match.adapter is adapters[0] match = ma.match_to("AATGCT") assert match.adapter is adapters[1] def test_indexed_suffix_adapters_incorrect_type(): with pytest.raises(ValueError): IndexedSuffixAdapters( [ SuffixAdapter("GAAC", indels=False), PrefixAdapter("TGCT", indels=False), ] ) def test_indexed_prefix_adapters_with_indels(): adapters = [ PrefixAdapter("GTAC", max_errors=1, indels=True), PrefixAdapter("TGCT", max_errors=1, indels=True), ] ma = IndexedPrefixAdapters(adapters) match = ma.match_to("GATACGGG") assert match.adapter is adapters[0] match = ma.match_to("TAGCTAA") assert match.adapter is adapters[1] def test_indexed_prefix_adapters_with_n_wildcard(): sequence = "GGTCCAGA" ma = IndexedPrefixAdapters([PrefixAdapter(sequence, max_errors=1, indels=False)]) for i in range(len(sequence)): # N in the read should be counted as mismatch t = sequence[:i] + 
"N" + sequence[i + 1 :] + "TGCT" result = ma.match_to(t) assert isinstance(result, RemoveBeforeMatch) assert (result.rstart, result.rstop) == (0, 8) assert result.errors == 1 assert result.score == 6 @pytest.mark.parametrize("sequence", ["ANGCATCATAAAAAAAAAA", "AAGCATCATAAAAAAAAAA"]) def test_indexed_prefix_adapters_with_n_collision(sequence): a1 = PrefixAdapter("AAGCGCCAT", max_errors=2, indels=False) a2 = PrefixAdapter("AGGCATCAT", max_errors=2, indels=False) ipa = IndexedPrefixAdapters([a1, a2]) result = ipa.match_to(sequence) assert isinstance(result, RemoveBeforeMatch) assert result.adapter is a2 def test_inosine_wildcard(): adapter = BackAdapter("CTGIAIT", max_errors=0, min_overlap=3) match = adapter.match_to("GGCTGAATTGGG") assert match.astart == 0 assert match.astop == 7 assert match.rstart == 2 assert match.rstop == 9 assert match.errors == 0 def test_noninternal_front_adapter(): adapter = NonInternalFrontAdapter("CTGTAAT") match = adapter.match_to("CTGTAATAAAAA") assert match.rstart == 0 assert match.rstop == 7 assert match.astart == 0 assert match.astop == 7 assert adapter.match_to("ACTGTAATAAA") is None match = adapter.match_to("AATCCCC") assert match.rstart == 0 assert match.rstop == 3 assert match.astart == 4 assert match.astop == 7 @pytest.mark.parametrize("errors", (0, 1)) def test_noninternal_front_adapter_with_n_wildcards(errors): sequence = "NNNCTG" if errors == 0 else "NNNCAG" adapter = NonInternalFrontAdapter(sequence, max_errors=errors) match = adapter.match_to("CTGAAAA") assert match.rstart == 0 assert match.rstop == 3 assert match.astart == 3 assert match.astop == 6 match = adapter.match_to("ACTGAAAA") assert match.rstart == 0 assert match.rstop == 4 assert match.astart == 2 assert match.astop == 6 match = adapter.match_to("AACTGAAAA") assert match.rstart == 0 assert match.rstop == 5 assert match.astart == 1 assert match.astop == 6 match = adapter.match_to("AAACTGAAAA") assert match.astart == 0, match assert match.astop == 6 assert match.rstart == 0 assert match.rstop == 6 match = adapter.match_to("AAAACTGAAAA") assert match is None def test_noninternal_front_adapter_with_n_wildcards_issue_654(): adapter = NonInternalFrontAdapter("NNNCGC", max_errors=1) match = adapter.match_to("CCCTTT") assert match is not None assert match.rstart == 0 assert match.rstop == 3 assert match.astart == 3 assert match.astop == 6 assert match.errors == 1 def test_very_long_adapter_issue_749(): adapter = BackAdapter("A" * 70, max_errors=0) match = adapter.match_to("GATTAC" + 20 * "A") assert match is not None assert match.rstart == 6 assert match.rstop == 26 assert match.astart == 0 assert match.astop == 20 assert match.errors == 0 cutadapt-4.7/tests/test_align.py000066400000000000000000000344351457457704700170730ustar00rootroot00000000000000from typing import NamedTuple import pytest from cutadapt.align import ( EndSkip, Aligner, PrefixComparer, SuffixComparer, hamming_sphere, edit_environment, edit_distance, naive_edit_environment, slow_edit_environment, py_edit_environment, ) from cutadapt.adapters import Where from utils import binomial class AlignmentResult(NamedTuple): ref_start: int ref_end: int query_start: int query_end: int score: int errors: int # convenience function (to avoid having to instantiate an Aligner manually) def locate( reference, query, max_error_rate, flags=EndSkip.SEMIGLOBAL, wildcard_ref=False, wildcard_query=False, min_overlap=1, ): aligner = Aligner( reference, max_error_rate, flags, wildcard_ref, wildcard_query, min_overlap=min_overlap, ) return 
aligner.locate(query) class TestAligner: def test(self): reference = "CTCCAGCTTAGACATATC" aligner = Aligner(reference, 0.1, flags=Where.BACK.value) aligner.locate("CC") def test_100_percent_error_rate(self): reference = "GCTTAGACATATC" aligner = Aligner(reference, 1.0, flags=Where.BACK.value) aligner.locate("CAA") def test_not_only_n_wildcards(self): reference = "NNNNN" with pytest.raises(ValueError) as info: Aligner(reference, 0.1, wildcard_ref=True) assert "only N wildcards" in info.value.args[0] def test_find_empty_in_empty(self): aligner = Aligner("", 0, flags=0, min_overlap=0) result = aligner.locate("") assert (0, 0, 0, 0, 0, 0) == result def test_indels_penalized(self): # Alignment in older versions: # CCAGTCCTTTCCTGAGAGT # CCAGTCCT---CT # # Should now be: # CCAGTCCTTTCCTGAGAGT # CCAGTCCTCT aligner = Aligner("CCAGTCCTCT", 0.3, flags=Where.PREFIX) result = aligner.locate("CCAGTCCTTTCCTGAGAGT") assert (0, 10, 0, 10, 9 - 1, 1) == result # refstart, refstop, querystart, querystop, score, errors) # Alignment: # TCGATGC # TCGATC aligner = Aligner("TCGATC", 1.5 / 6, flags=Where.PREFIX) result = aligner.locate("TCGATGC") assert (0, 6, 0, 6, 4, 1) == result def test_align_illumina(self): aligner = Aligner("GCCGAACTTCTTAGACTGCCTTAAGGACGT", 0.1, flags=Where.BACK) result = AlignmentResult( *aligner.locate("CAAATCACCAGAAGGCGCCTAACTTCTTAGACTGCC") ) # GCCGAACTTCTTAGACTGCCTTAAGGACGT (ref) # |||X|||||||||||||||| # CAAATCACCAGAAGGCGCCTAACTTCTTAGACTGCC (query) assert result.ref_start == 0 assert result.ref_end == 20 assert result.query_start == 16 assert result.query_end == 36 assert result.score == 18 assert result.errors == 1 def test_poly_t(): aligner = Aligner("TTTT", 0.25, flags=Where.BACK) result = AlignmentResult(*aligner.locate("CCTTTT")) assert result.ref_start == 0 assert result.ref_end == 4 assert result.query_start == 2 assert result.query_end == 6 assert result.score == 4 assert result.errors == 0 def test_poly_t_partial_match(): aligner = Aligner("TTTTTT", 0.25, flags=Where.BACK) result = AlignmentResult(*aligner.locate("CCTTTT")) assert result.ref_start == 0 assert result.ref_end == 4 assert result.query_start == 2 assert result.query_end == 6 assert result.score == 4 assert result.errors == 0 def test_poly_t_2(): aligner = Aligner("TTT", 1 / 3, flags=Where.BACK) result = AlignmentResult(*aligner.locate("CCTTTT")) assert result.ref_start == 0 assert result.ref_end == 3 assert result.query_start == 2 assert result.query_end == 5 def test_poly_a(): s = "AAAAAAAAAAAAAAAAA" t = "ACAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" result = locate(s, t, 0.0, Where.BACK.value) # start_s, stop_s, start_t, stop_t, score, cost = result assert result == (0, len(s), 4, 4 + len(s), len(s), 0) # Sequences with IUPAC wildcards # R=A|G, Y=C|T, S=G|C, W=A|T, K=G|T, M=A|C, B=C|G|T, D=A|G|T, H=A|C|T, V=A|C|G, # N=A|C|G|T, X={} WILDCARD_SEQUENCES = [ "CCCATTGATC", # original sequence without wildcards "CCCRTTRATC", # R=A|G "YCCATYGATC", # Y=C|T "CSSATTSATC", # S=G|C "CCCWWWGATC", # W=A|T "CCCATKKATC", # K=G|T "CCMATTGMTC", # M=A|C "BCCATTBABC", # B=C|G|T "BCCATTBABC", # B "CCCDTTDADC", # D=A|G|T "CHCATHGATC", # H=A|C|T "CVCVTTVATC", # V=A|C|G "CCNATNGATC", # N=A|C|G|T "CCCNTTNATC", # N # 'CCCXTTXATC', # X ] def compare_prefixes(ref, query, wildcard_ref=False, wildcard_query=False): aligner = PrefixComparer( ref, max_error_rate=0.9, wildcard_ref=wildcard_ref, wildcard_query=wildcard_query, ) return aligner.locate(query) def compare_suffixes(ref, query, wildcard_ref=False, wildcard_query=False): aligner = 
SuffixComparer( ref, max_error_rate=0.9, wildcard_ref=wildcard_ref, wildcard_query=wildcard_query, ) return aligner.locate(query) def test_compare_prefixes(): assert compare_prefixes("AAXAA", "AAAAATTTTTTTTT") == (0, 5, 0, 5, 3, 1) assert compare_prefixes("AANAA", "AACAATTTTTTTTT", wildcard_ref=True) == ( 0, 5, 0, 5, 5, 0, ) assert compare_prefixes("AANAA", "AACAATTTTTTTTT", wildcard_ref=True) == ( 0, 5, 0, 5, 5, 0, ) assert compare_prefixes("XAAAAA", "AAAAATTTTTTTTT") == (0, 6, 0, 6, 2, 2) a = WILDCARD_SEQUENCES[0] for s in WILDCARD_SEQUENCES: r = s + "GCCAGGGTTGATTCGGCTGATCTGGCCG" result = compare_prefixes(a, r, wildcard_query=True) assert result == (0, 10, 0, 10, 10, 0), result result = compare_prefixes(r, a, wildcard_ref=True) assert result == (0, 10, 0, 10, 10, 0) for s in WILDCARD_SEQUENCES: for t in WILDCARD_SEQUENCES: # FIXME what is this t doing? r = s + "GCCAGGG" result = compare_prefixes( s, r, ) assert result == (0, 10, 0, 10, 10, 0) result = compare_prefixes(r, s, wildcard_ref=True, wildcard_query=True) assert result == (0, 10, 0, 10, 10, 0) r = WILDCARD_SEQUENCES[0] + "GCCAGG" for wildc_ref in (False, True): for wildc_query in (False, True): result = compare_prefixes( "CCCXTTXATC", r, wildcard_ref=wildc_ref, wildcard_query=wildc_query ) assert result == (0, 10, 0, 10, 6, 2) def test_n_wildcard_in_ref_matches_n_wildcard_in_query_prefix(): # With allowed wildcards in the ref, an N wildcard in the ref should never count as an error, # even if matched against an N wildcard in the query while wildcard_query is False # Issue #453 match = compare_prefixes( "NNACGT", "NTACGTAA", wildcard_ref=True, wildcard_query=False ) assert match == (0, 6, 0, 6, 6, 0) match = compare_prefixes( "NNACGT", "YTACGTAA", wildcard_ref=True, wildcard_query=False ) assert match == (0, 6, 0, 6, 6, 0) def test_n_wildcard_in_ref_matches_n_wildcard_in_query_back(): aligner = Aligner( "NNACGT", max_error_rate=0, wildcard_ref=True, flags=Where.BACK.value ) match = aligner.locate("AAANTACGTAAA") assert match == (0, 6, 3, 9, 6, 0) def test_compare_suffixes(): assert compare_suffixes("AAXAA", "TTTTTTTAAAAA") == (0, 5, 7, 12, 3, 1) assert compare_suffixes("AANAA", "TTTTTTTAACAA", wildcard_ref=True) == ( 0, 5, 7, 12, 5, 0, ) assert compare_suffixes("AANAA", "TTTTTTTAACAA", wildcard_ref=True) == ( 0, 5, 7, 12, 5, 0, ) assert compare_suffixes("AAAAAX", "TTTTTTTAAAAA") == (0, 6, 6, 12, 2, 2) @pytest.mark.parametrize("upper", (True, False)) def test_prefix_comparer(upper): # only need to test whether None is returned on too many errors, the rest is tested above ref = "axcgt" if upper: ref = ref.upper() comparer = PrefixComparer(ref, max_error_rate=0.4) repr(comparer) assert comparer.locate("TTG") is None assert comparer.locate("AGT") is not None assert comparer.locate("agt") is not None assert comparer.locate("CGT") is None assert comparer.locate("TTG") is None @pytest.mark.parametrize("upper", (True, False)) def test_suffix_comparer(upper): # only need to test whether None is returned on too many errors, the rest is tested above ref = "axcgt" if upper: ref = ref.upper() comparer = SuffixComparer(ref, max_error_rate=0.4) repr(comparer) assert comparer.locate("TTG") is None assert comparer.locate("AGT") is not None assert comparer.locate("agt") is not None assert comparer.locate("CGT") is not None assert comparer.locate("TTG") is None @pytest.mark.parametrize("comparer_class", [PrefixComparer, SuffixComparer]) def test_n_wildcards_not_counted_affix(comparer_class): # N bases should not contribute to effective adapter 
length, so only 1 mismatch is allowed ref = "CNNNNNNNNGTT" assert len(ref) == 12 comparer = comparer_class(ref, max_error_rate=0.25, wildcard_ref=True) assert comparer.locate("CAAAAAAAAGTT") is not None assert comparer.locate("CAAAAAAAAGTA") is not None assert comparer.locate("CAAAAAAAAGAA") is None # two mismatches def test_n_wildcards_not_counted_aligner_back(): ref = "AGGNNNNNNNNNNNNNNTTC" assert len(ref) == 20 aligner = Aligner( ref, max_error_rate=0.1, wildcard_ref=True, flags=Where.BACK.value, min_overlap=3, ) assert aligner.effective_length == 6 assert aligner.locate("TTC") is None # adapter start, adapter stop, read start, read stop assert aligner.locate("AGG")[:4] == (0, 3, 0, 3) assert aligner.locate("AGGCCCCCCC")[:4] == (0, 10, 0, 10) assert aligner.locate("ATGCCCCCCC") is None assert aligner.locate("AGGCCCCCCCCCCCCCCATC") is None assert aligner.locate("CCC" + ref.replace("N", "G") + "AAA") == ( 0, 20, 3, 23, 20, 0, ) def test_n_wildcards_not_counted_aligner_front(): ref = "AGGNNNNNNNNNNNNNNTTC" assert len(ref) == 20 aligner = Aligner( ref, max_error_rate=0.1, wildcard_ref=True, flags=Where.FRONT.value, min_overlap=3, ) assert aligner.effective_length == 6 # adapter start, adapter stop, read start, read stop assert aligner.locate("TTC")[:4] == (17, 20, 0, 3) assert aligner.locate("TGC") is None assert aligner.locate("CCCCCCCTTC")[:4] == (10, 20, 0, 10) assert aligner.locate("CCCCCCCGTC") is None assert aligner.locate("CCC" + ref.replace("N", "G") + "AAA") == ( 0, 20, 3, 23, 20, 0, ) def test_wildcards_in_adapter(): r = "CATCTGTCC" + WILDCARD_SEQUENCES[0] + "GCCAGGGTTGATTCGGCTGATCTGGCCG" for a in WILDCARD_SEQUENCES: result = locate(a, r, 0.0, Where.BACK.value, wildcard_ref=True) assert result == (0, 10, 9, 19, 10, 0), result a = "CCCXTTXATC" result = locate(a, r, 0.0, Where.BACK.value, wildcard_ref=True) assert result is None def test_wildcards_in_read(): a = WILDCARD_SEQUENCES[0] for s in WILDCARD_SEQUENCES + ["CCCXTTXATC"]: r = "CATCTGTCC" + s + "GCCAGGGTTGATTCGGCTGATCTGGCCG" result = locate(a, r, 0.0, Where.BACK.value, wildcard_query=True) if "X" in s: assert result is None else: assert result == (0, 10, 9, 19, 10, 0), result def test_wildcards_in_both(): for a in WILDCARD_SEQUENCES: for s in WILDCARD_SEQUENCES: r = "CATCTGTCC" + s + "GCCAGGGTTGATTCGGCTGATCTGGCCG" result = locate( a, r, 0.0, Where.BACK.value, wildcard_ref=True, wildcard_query=True ) assert result == (0, 10, 9, 19, 10, 0), result def test_no_match(): a = locate("CTGATCTGGCCG", "AAAAGGG", 0.1, Where.BACK.value) assert a is None, a def test_hamming_sphere_explicit(): assert list(hamming_sphere("", 0)) == [""] assert list(hamming_sphere("A", 0)) == ["A"] assert list(hamming_sphere("A", 1)) == ["C", "G", "T"] assert list(hamming_sphere("GTC", 0)) == ["GTC"] assert list(hamming_sphere("GTC", 1)) == [ "ATC", "CTC", "TTC", "GAC", "GCC", "GGC", "GTA", "GTG", "GTT", ] def hamming_distance(s, t): return sum(1 if c != d else 0 for c, d in zip(s, t)) @pytest.mark.parametrize( "sk", [ ("", 0), ("A", 0), ("AAA", 1), ("ACC", 2), ("TCATTA", 3), ("AAAAAAAA", 1), ("A" * 15, 2), ], ) def test_hamming_sphere(sk): s, k = sk result = list(hamming_sphere(s, k)) result_set = set(result) assert len(result) == len(result_set) assert len(result) == 3**k * binomial(len(s), k) for t in result: assert hamming_distance(s, t) == k @pytest.mark.parametrize( "k,s", [ (0, ""), (0, "A"), (1, "AAA"), (1, "TCATTAGA"), (2, "ACC"), (2, "A" * 10), (3, "TCATTA"), ], ) @pytest.mark.parametrize( "environment_func", [edit_environment, 
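    # All three implementations must produce the same string set; the test
    # body checks each of them against naive_edit_environment as reference.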
slow_edit_environment, py_edit_environment], ) def test_edit_environment(k, s, environment_func): result = list(environment_func(s, k)) strings, distances, matches = zip(*result) naive = set(naive_edit_environment(s, k)) assert len(set(strings)) == len(strings) assert set(strings) == naive error_rate = k / len(s) if s else 0.0 aligner = Aligner(s, max_error_rate=error_rate, flags=0, min_overlap=len(s)) for t, dist, m in result: result = aligner.locate(t) start1, stop1, start2, stop2, score, errors = result assert errors == dist assert start1 == 0 assert stop1 == len(s) assert start2 == 0 assert stop2 == len(t) assert edit_distance(s, t) == dist if environment_func is py_edit_environment: assert m == score assert m <= len(s), (s, t, dist) assert m <= len(t), (s, t, dist) cutadapt-4.7/tests/test_api.py000066400000000000000000000122251457457704700165430ustar00rootroot00000000000000""" Cutadapt doesn’t have a stable API, yet. This is an attempt to document how one currently needs to use Cutadapt from Python to do certain things, mostly in order to figure out where improvements need to be made. The tests in this module do not check results, they are just here to ensure that the code as shown can be executed. """ import copy import io import json import os from cutadapt.predicates import TooShort, IsUntrimmed from cutadapt.runners import make_runner from cutadapt.steps import ( InfoFileWriter, PairedSingleEndStep, SingleEndSink, SingleEndFilter, PairedEndFilter, PairedEndSink, ) from cutadapt.utils import DummyProgress from utils import datapath def test_main_without_sys_stdout_buffer_available(mocker): """Within e.g. IPython, sys.stdout.buffer does not exist""" from cutadapt.cli import main mocker.patch("sys.stdout", io.StringIO()) main(["-o", os.devnull, datapath("small.fastq")]) def test_command_line(): # Call Cutadapt from Python, but pass parameters as a list of strings # the same way we would in the shell. The difference is that this is # not in a separate process, errors cause a CommandLineError instead # of SystemExit, and we get back a Statistics object. from cutadapt.cli import main stats = main(["-q", "10", "-o", os.devnull, datapath("small.fastq")]) assert stats is not None json.dumps(stats.as_json()) # TODO # - Should not set up logging # - Should not print anything # - still raises SystemExit if parser.error is called # - Should be cutadapt.run(...) # - Should the JSON stats be returned instead? def test_pipeline_single(tmp_path, cores): # The following is roughly equivalent to: # cutadapt -u 5 -a GATCGGAAGA -q 0,15 -m 10 # --discard-untrimmed --info-file=info.txt -o ... 
small.fastq info_path = tmp_path / "info.txt" import json from cutadapt.pipeline import SingleEndPipeline from cutadapt.files import OutputFiles, InputPaths from cutadapt.modifiers import UnconditionalCutter, QualityTrimmer, AdapterCutter from cutadapt.adapters import BackAdapter adapter = BackAdapter( sequence="GATCGGAAGA", max_errors=1, min_overlap=3, ) modifiers = [ UnconditionalCutter(5), QualityTrimmer(cutoff_front=0, cutoff_back=15), AdapterCutter([adapter]), ] inpaths = InputPaths(datapath("small.fastq")) with make_runner(inpaths, cores) as runner: outfiles = OutputFiles( proxied=cores > 1, qualities=runner.input_file_format().has_qualities(), interleaved=False, ) steps = [ InfoFileWriter(outfiles.open_text(info_path)), SingleEndFilter(TooShort(10)), SingleEndFilter(IsUntrimmed()), SingleEndSink(outfiles.open_record_writer(tmp_path / "out.fastq")), ] pipeline = SingleEndPipeline(modifiers, steps) stats = runner.run(pipeline, DummyProgress(), outfiles) assert stats is not None assert info_path.exists() json.dumps(stats.as_json()) outfiles.close() def test_pipeline_paired(tmp_path, cores): # cutadapt -u 5 -U 7 -a GATCGGAAGA -q 0,15 -m 10:0 # --discard-untrimmed --info-file=info.txt # -o ... -p ... # paired.1.fastq paired.2.fastq info_path = tmp_path / "info.txt" from cutadapt.pipeline import PairedEndPipeline from cutadapt.modifiers import UnconditionalCutter, QualityTrimmer, AdapterCutter from cutadapt.adapters import BackAdapter from cutadapt.files import OutputFiles, InputPaths trimmer = QualityTrimmer(cutoff_front=0, cutoff_back=15) adapter = BackAdapter( sequence="GATCGGAAGA", max_errors=1, min_overlap=3, ) modifiers = [ (UnconditionalCutter(5), UnconditionalCutter(7)), (trimmer, copy.copy(trimmer)), (AdapterCutter([adapter]), None), ] inpaths = InputPaths(datapath("paired.1.fastq"), datapath("paired.2.fastq")) with make_runner(inpaths, cores=cores) as runner: outfiles = OutputFiles( proxied=cores > 1, qualities=runner.input_file_format().has_qualities(), interleaved=False, ) steps = [ PairedSingleEndStep(InfoFileWriter(outfiles.open_text(info_path))), PairedEndFilter(TooShort(10), None), PairedEndFilter( IsUntrimmed(), IsUntrimmed(), pair_filter_mode="any", ), PairedEndSink( outfiles.open_record_writer( tmp_path / "out.1.fastq", tmp_path / "out.2.fastq" ) ), ] pipeline = PairedEndPipeline(modifiers, steps) stats = runner.run(pipeline, DummyProgress(), outfiles) assert stats is not None assert info_path.exists() _ = stats.as_json() outfiles.close() # TODO # - could use += for adding modifiers # - allow using adapter specification strings? # - too many submodules (flatter namespace) # - use xopen directly instead of file_opener; # possibly with myxopen = functools.partial(xopen, ...) cutadapt-4.7/tests/test_command.py000066400000000000000000000102641457457704700174110ustar00rootroot00000000000000"""Tests that run the program in a subprocess""" import subprocess import sys import os import pytest from utils import datapath, assert_files_equal, cutpath def test_run_cutadapt_process(): subprocess.check_call(["cutadapt", "--version"]) def test_run_as_module(): """Check that "python3 -m cutadapt ..." 
works""" from cutadapt import __version__ with subprocess.Popen( [sys.executable, "-m", "cutadapt", "--version"], stdout=subprocess.PIPE ) as py: assert py.communicate()[0].decode().strip() == __version__ @pytest.mark.skipif(sys.platform == "win32", reason="Perhaps this can be fixed") def test_standard_input_pipe(tmp_path, cores): """Read FASTQ from standard input""" out_path = os.fspath(tmp_path / "out.fastq") in_path = datapath("small.fastq") # Simulate that no file name is available for stdin with subprocess.Popen(["cat", in_path], stdout=subprocess.PIPE) as cat: with subprocess.Popen( [ sys.executable, "-m", "cutadapt", "--cores", str(cores), "-a", "TTAGACATATCTCCGTCG", "-o", out_path, "-", ], stdin=cat.stdout, ) as py: _ = py.communicate() cat.stdout.close() _ = py.communicate()[0] assert_files_equal(cutpath("small.fastq"), out_path) def test_standard_output(tmp_path, cores): """Write FASTQ to standard output (not using --output/-o option)""" out_path = os.fspath(tmp_path / "out.fastq") with open(out_path, "w") as out_file: py = subprocess.Popen( [ sys.executable, "-m", "cutadapt", "--cores", str(cores), "-a", "TTAGACATATCTCCGTCG", datapath("small.fastq"), ], stdout=out_file, ) _ = py.communicate() assert_files_equal(cutpath("small.fastq"), out_path) def test_errors_are_printed_to_stderr(tmp_path): out_path = os.fspath(tmp_path / "out.fastq") py = subprocess.Popen( [ sys.executable, "-m", "cutadapt", "-o", out_path, tmp_path / "does-not-exist.fastq", ], stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) stdout_bytes, stderr_bytes = py.communicate() assert b"No such file or directory" in stderr_bytes assert b"No such file or directory" not in stdout_bytes def test_explicit_standard_output(tmp_path, cores): """Write FASTQ to standard output (using "-o -")""" out_path = os.fspath(tmp_path / "out.fastq") with open(out_path, "w") as out_file: py = subprocess.Popen( [ sys.executable, "-m", "cutadapt", "-o", "-", "--cores", str(cores), "-a", "TTAGACATATCTCCGTCG", datapath("small.fastq"), ], stdout=out_file, ) _ = py.communicate() assert_files_equal(cutpath("small.fastq"), out_path) def test_force_fasta_output(tmp_path, cores): """Write FASTA to standard output even on FASTQ input""" out_path = os.fspath(tmp_path / "out.fasta") with open(out_path, "w") as out_file: py = subprocess.Popen( [ sys.executable, "-m", "cutadapt", "--fasta", "-o", "-", "--cores", str(cores), "-a", "TTAGACATATCTCCGTCG", datapath("small.fastq"), ], stdout=out_file, ) _ = py.communicate() assert_files_equal(cutpath("small.fasta"), out_path) @pytest.mark.skipif(sys.platform == "win32", reason="Maybe this can be made to work") def test_non_utf8_locale(): subprocess.check_call( [sys.executable, "-m", "cutadapt", "-o", os.devnull, datapath("small.fastq")], env={"LC_CTYPE": "C"}, ) cutadapt-4.7/tests/test_commandline.py000066400000000000000000000727061457457704700202720ustar00rootroot00000000000000import subprocess import sys import os from io import StringIO, BytesIO import dnaio import pytest from cutadapt.cli import main from utils import assert_files_equal, datapath, cutpath # pytest.mark.timeout will not fail even if pytest-timeout is not installed try: import pytest_timeout as _unused except ImportError: # pragma: no cover raise ImportError("pytest_timeout needs to be installed") del _unused def test_does_not_close_stdout(): main([datapath("small.fastq")]) assert not sys.stdout.closed def test_help(): with pytest.raises(SystemExit) as e: main(["--help"]) assert e.value.args[0] == 0 def test_unknown_file_format(tmp_path, 
cores): path = tmp_path / "unknown_format.txt" path.write_text("raw text") with pytest.raises(SystemExit): main([f"--cores={cores}", str(path)]) def test_cores_negative(): with pytest.raises(SystemExit) as e: main(["--cores=-1", datapath("simple.fasta")]) assert e.value.args[0] == 2 # "cannot be negative" def test_quiet_and_report(): with pytest.raises(SystemExit) as e: main(["--quiet", "--report=minimal", datapath("simple.fasta")]) assert e.value.args[0] == 2 # "Options --quiet and --report cannot be used at the same time" @pytest.mark.parametrize( "args", [ ("--discard-trimmed", "--discard-untrimmed"), ("--discard-trimmed", "--untrimmed-output", os.devnull), ("--discard-untrimmed", "--untrimmed-output", os.devnull), ], ) def test_only_one_of_discard_trimmed_discard_untrimmed_untrimmed_output(args): with pytest.raises(SystemExit) as e: main(["-o", os.devnull, *args, datapath("small.fastq")]) assert e.value.args[0] == 2 def test_debug(): main(["--debug", "--", datapath("small.fastq")]) def test_debug_trace(): main(["--debug", "--debug", "-a", "ACGT", datapath("small.fastq")]) def test_example(run): run("-N -b ADAPTER", "example.fa", "example.fa") def test_compressed_fasta(run): run("", "simple.fasta", "simple.fasta.gz") def test_small(run): run("-a TTAGACATATCTCCGTCG", "small.fastq", "small.fastq") def test_empty_fastq(run, cores): run("--cores {} -a TTAGACATATCTCCGTCG".format(cores), "empty.fastq", "empty.fastq") def test_empty_fasta_input(run, cores): run(["--cores", str(cores)], "empty.fasta", "empty.fasta") def test_no_read_only_comment_fasta_input(run, cores): run(["--cores", str(cores)], "empty.fasta", "onlycomment.fasta") def test_newlines(run): """DOS/Windows newlines""" run("-e 0.12 -a TTAGACATATCTCCGTCG", "dos.fastq", "dos.fastq") def test_lowercase(run): """lowercase adapter""" run("-a ttagacatatctccgtcg", "lowercase.fastq", "small.fastq") def test_rest(run, tmp_path, cores): """-r/--rest-file""" rest = tmp_path / "rest.tmp" run( ["--cores", str(cores), "-b", "ADAPTER", "-N", "-r", rest], "rest.fa", "rest.fa" ) assert_files_equal(datapath("rest.txt"), rest) def test_restfront(run, tmp_path): path = tmp_path / "rest.txt" run(["-g", "ADAPTER", "-N", "-r", path], "restfront.fa", "rest.fa") assert_files_equal(datapath("restfront.txt"), path) def test_discard(run): """--discard""" run("-b TTAGACATATCTCCGTCG --discard", "discard.fastq", "small.fastq") def test_discard_untrimmed(run): """--discard-untrimmed""" run("-b CAAGAT --discard-untrimmed", "discard-untrimmed.fastq", "small.fastq") def test_extensiontxtgz(run): """automatic recognition of "_sequence.txt.gz" extension""" run("-b TTAGACATATCTCCGTCG", "s_1_sequence.txt", "s_1_sequence.txt.gz") def test_minimum_length(run): """-m/--minimum-length""" stats = run("-m 5 -a TTAGACATATCTCCGTCG", "minlen.fa", "lengths.fa") assert stats.written_bp[0] == 45 assert stats.written == 6 def test_too_short(run, tmp_path, cores): too_short_path = tmp_path / "tooshort.fa" stats = run( [ "--cores", str(cores), "-m", "5", "-a", "TTAGACATATCTCCGTCG", "--too-short-output", too_short_path, ], "minlen.fa", "lengths.fa", ) assert_files_equal(datapath("tooshort.fa"), too_short_path) assert stats.filtered["too_short"] == 5 @pytest.mark.parametrize("redirect", (False, True)) def test_too_short_statistics(redirect): args = [ "-a", "TTAGACATATCTCCGTCG", "-m", "24", "-o", os.devnull, datapath("small.fastq"), ] if redirect: args[:0] = ["--too-short-output", os.devnull] stats = main(args) assert stats.with_adapters[0] == 2 assert stats.written == 2 assert 
stats.written_bp[0] == 58 assert stats.filtered["too_short"] == 1 def test_maximum_length(run): """-M/--maximum-length""" run("-M 5 -a TTAGACATATCTCCGTCG", "maxlen.fa", "lengths.fa") def test_too_long(run, tmp_path, cores): """--too-long-output""" too_long_path = tmp_path / "toolong.fa" stats = run( [ "--cores", str(cores), "-M", "5", "-a", "TTAGACATATCTCCGTCG", "--too-long-output", too_long_path, ], "maxlen.fa", "lengths.fa", ) assert_files_equal(datapath("toolong.fa"), too_long_path) assert stats.filtered["too_long"] == 5 def test_length_tag(run): """454 data; -n and --length-tag""" run( "-n 3 -e 0.1 --length-tag length= " "-b TGAGACACGCAACAGGGGAAAGGCAAGGCACACAGGGGATAGG " "-b TCCATCTCATCCCTGCGTGTCCCATCTGTTCCCTCCCTGTCTCA", "454.fa", "454.fa", ) @pytest.mark.parametrize("length", list(range(3, 11))) def test_overlap_a(tmp_path, length): """-O/--overlap with -a""" adapter = "catatctccg" record = ">read\nGAGACCATTCCAATG" + adapter[:length] + "\n" input = tmp_path / "overlap.fasta" input.write_text(record) if length < 7: expected = record else: expected = ">read\nGAGACCATTCCAATG\n" output = tmp_path / "overlap-trimmed.fasta" main(["-O", "7", "-e", "0", "-a", adapter, "-o", str(output), str(input)]) assert expected == output.read_text() def test_overlap_b(run): """-O/--overlap with -b""" run("-O 10 -b TTAGACATATCTCCGTCG", "overlapb.fa", "overlapb.fa") def test_trim_n(run): run("--trim-n", "trim-n.fasta", "trim-n.fasta") def test_qualtrim(run): """-q with low qualities""" run("-q 10 -a XXXXXX", "lowqual.fastq", "lowqual.fastq") def test_qualbase(run): """-q with low qualities, using ascii(quality+64) encoding""" run("-q 10 --quality-base 64 -a XXXXXX", "illumina64.fastq", "illumina64.fastq") def test_quality_trim_only(run): """only trim qualities, do not remove adapters""" run("-q 10 --quality-base 64", "illumina64.fastq", "illumina64.fastq") def test_twoadapters(run): """two adapters""" run("-a AATTTCAGGAATT -a GTTCTCTAGTTCT", "twoadapters.fasta", "twoadapters.fasta") def test_poly_a_legacy(run): """poly-A tails""" run( "-O 10 -a AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", "polya.legacy.1.fasta", "polya.1.fasta", ) def test_poly_a_legacy_brace_notation(run): """poly-A tails""" run("-O 10 -a A{35}", "polya.legacy.1.fasta", "polya.1.fasta") def test_poly_a(run): run("--poly-a", "polya.1.fasta", "polya.1.fasta") # the same as --action=none def test_no_trim(run): run("--no-trim --discard-untrimmed -a CCCTAGTTAAAC", "no-trim.fastq", "small.fastq") def test_action_none(run): run( "--action=none --discard-untrimmed -a CCCTAGTTAAAC", "no-trim.fastq", "small.fastq", ) # the same as --action=mask def test_mask_adapter(run): """mask adapter with N (reads maintain the same length)""" run("-b CAAG -n 3 --mask-adapter", "anywhere_repeat.fastq", "anywhere_repeat.fastq") def test_action_mask(run): """mask adapter with N (reads maintain the same length)""" run("-b CAAG -n 3 --action=mask", "anywhere_repeat.fastq", "anywhere_repeat.fastq") def test_action_lowercase(run): run( "-b CAAG -n 3 --action=lowercase", "action_lowercase.fasta", "action_lowercase.fasta", ) def test_action_retain(run): run( "-g GGTTAACC -a CAAG --action=retain", "action_retain.fasta", "action_retain.fasta", ) def test_action_retain_times(): with pytest.raises(SystemExit): main(["-a", "ACGT", "--times=2", "--action=retain", datapath("small.fastq")]) def test_gz_multiblock(run): """compressed gz file with multiple blocks (created by concatenating two .gz files)""" run("-b TTAGACATATCTCCGTCG", "small.fastq", "multiblock.fastq.gz") def 
test_read_wildcard(run): """test wildcards in reads""" run("--match-read-wildcards -b ACGTACGT", "wildcard.fa", "wildcard.fa") @pytest.mark.parametrize( "adapter_type,expected", [ ("-a", "wildcard_adapter.fa"), ("-b", "wildcard_adapter_anywhere.fa"), ], ) def test_adapter_wildcard(adapter_type, expected, run, tmp_path, cores): """wildcards in adapter""" wildcard_path = tmp_path / "wildcards.txt" run( [ "--cores", str(cores), "--wildcard-file", wildcard_path, adapter_type, "ACGTNNNACGT", ], expected, "wildcard_adapter.fa", ) with open(wildcard_path) as wct: lines = wct.readlines() lines = [line.strip() for line in lines] assert lines == ["AAA 1", "GGG 2", "CCC 3b", "TTT 4b"] def test_wildcard_N(run): """test 'N' wildcard matching with no allowed errors""" run("-e 0 -a GGGGGGG --match-read-wildcards", "wildcardN.fa", "wildcardN.fa") def test_illumina_adapter_wildcard(run): run("-a VCCGAMCYUCKHRKDCUBBCNUWNSGHCGU", "illumina.fastq", "illumina.fastq.gz") def test_adapter_front(run): """test adapter in front""" run("--front ADAPTER -N", "examplefront.fa", "example.fa") def test_literal_N(run): """test matching literal 'N's""" run("-N -e 0.2 -a NNNNNNNNNNNNNN", "trimN3.fasta", "trimN3.fasta") def test_literal_N2(run): run("-N -O 1 -g NNNNNNNNNNNNNN", "trimN5.fasta", "trimN5.fasta") def test_literal_N_brace_notation(run): """test matching literal 'N's""" run("-N -e 0.2 -a N{14}", "trimN3.fasta", "trimN3.fasta") def test_literal_N2_brace_notation(run): run("-N -O 1 -g N{14}", "trimN5.fasta", "trimN5.fasta") def test_anchored_front(run): run("-g ^FRONTADAPT -N", "anchored.fasta", "anchored.fasta") def test_anchored_front_ellipsis_notation(run): run("-a ^FRONTADAPT... -N", "anchored.fasta", "anchored.fasta") def test_anchored_back(run): run("-a BACKADAPTER$ -N", "anchored-back.fasta", "anchored-back.fasta") def test_anchored_back_ellipsis_notation(run): run("-a ...BACKADAPTER$ -N", "anchored-back.fasta", "anchored-back.fasta") def test_anchored_back_no_indels(run): run("-a BACKADAPTER$ -N --no-indels", "anchored-back.fasta", "anchored-back.fasta") def test_no_indels(run): run("-a TTAGACATAT -g GAGATTGCCA --no-indels", "no_indels.fasta", "no_indels.fasta") def test_ellipsis_notation(run): run( "-a ...TTAGACATAT -g GAGATTGCCA --no-indels", "no_indels.fasta", "no_indels.fasta", ) def test_issue_46(run, tmp_path): """issue 46 - IndexError with --wildcard-file""" run( "--anywhere=AACGTN --wildcard-file={}".format(tmp_path / "wildcards.txt"), "issue46.fasta", "issue46.fasta", ) def test_strip_suffix(run): run("--strip-suffix _sequence -a XXXXXXX", "stripped.fasta", "simple.fasta") def test_info_file(run, tmp_path, cores): # The true adapter sequence in the illumina.fastq.gz data set is # GCCTAACTTCTTAGACTGCCTTAAGGACGT (fourth base is different from the sequence shown here) info_path = tmp_path / "info.txt" run( [ "--cores", str(cores), "--info-file", info_path, "-a", "adapt=GCCGAACTTCTTAGACTGCCTTAAGGACGT", ], "illumina.fastq", "illumina.fastq.gz", ) assert_files_equal( cutpath("illumina.info.txt"), info_path, ignore_trailing_space=True ) def test_info_file_times(run, tmp_path, cores): info_path = tmp_path / "info.txt" run( [ "--cores", str(cores), "--info-file", info_path, "--times", "2", "-a", "adapt=GCCGAACTTCTTA", "-a", "adapt2=GACTGCCTTAAGGACGT", ], "illumina5.fastq", "illumina5.fastq", ) assert_files_equal( cutpath("illumina5.info.txt"), info_path, ignore_trailing_space=True ) def test_info_file_fasta(run, tmp_path, cores): info_path = tmp_path / "info.txt" # Just make sure that it runs run( [ 
"--cores", str(cores), "--info-file", info_path, "-a", "TTAGACATAT", "-g", "GAGATTGCCA", "--no-indels", ], "no_indels.fasta", "no_indels.fasta", ) def test_info_file_revcomp(run, tmp_path): info_path = tmp_path / "info-rc.txt" main( [ "--info-file", str(info_path), "-a", "adapt=GAGTCG", "--revcomp", "--rename={header}", "-o", str(tmp_path / "out.fasta"), datapath("info-rc.fasta"), ] ) assert_files_equal(cutpath("info-rc.txt"), info_path) def test_named_adapter(run): run( "-a MY_ADAPTER=GCCGAACTTCTTAGACTGCCTTAAGGACGT", "illumina.fastq", "illumina.fastq.gz", ) def test_adapter_with_u(run): run("-a GCCGAACUUCUUAGACUGCCUUAAGGACGU", "illumina.fastq", "illumina.fastq.gz") def test_bzip2_input(run, cores): run( ["--cores", str(cores), "-a", "TTAGACATATCTCCGTCG"], "small.fastq", "small.fastq.bz2", ) @pytest.mark.parametrize("extension", ["bz2", "xz", "gz"]) def test_compressed_output(tmp_path, cores, extension): out_path = str(tmp_path / ("small.fastq." + extension)) params = [ "--cores", str(cores), "-a", "TTAGACATATCTCCGTCG", "-o", out_path, datapath("small.fastq"), ] main(params) def test_bzip2_multiblock(run): run("-b TTAGACATATCTCCGTCG", "small.fastq", "multiblock.fastq.bz2") def test_xz(run): run("-b TTAGACATATCTCCGTCG", "small.fastq", "small.fastq.xz") def test_no_args(): with pytest.raises(SystemExit): main([]) def test_two_fastqs(): with pytest.raises(SystemExit): main([datapath("paired.1.fastq"), datapath("paired.2.fastq")]) def test_anchored_no_indels(run): """anchored 5' adapter, mismatches only (no indels)""" run( "-g ^TTAGACATAT --no-indels -e 0.1", "anchored_no_indels.fasta", "anchored_no_indels.fasta", ) def test_anchored_no_indels_wildcard_read(run): """anchored 5' adapter, mismatches only (no indels), but wildcards in the read count as matches""" run( "-g ^TTAGACATAT --match-read-wildcards --no-indels -e 0.1", "anchored_no_indels_wildcard.fasta", "anchored_no_indels.fasta", ) def test_anchored_no_indels_wildcard_adapt(run): """anchored 5' adapter, mismatches only (no indels), but wildcards in the adapter count as matches""" run( "-g ^TTAGACANAT --no-indels -e 0.12", "anchored_no_indels.fasta", "anchored_no_indels.fasta", ) def test_non_iupac_characters(run): with pytest.raises(SystemExit): main(["-a", "ZACGT", datapath("small.fastq")]) def test_unconditional_cut_front(run): run("-u 5", "unconditional-front.fastq", "small.fastq") def test_unconditional_cut_back(run): run("-u -5", "unconditional-back.fastq", "small.fastq") def test_unconditional_cut_both(run): run("-u -5 -u 5", "unconditional-both.fastq", "small.fastq") def test_unconditional_cut_too_many_commas(): with pytest.raises(SystemExit): main(["-u", "5,7,8", datapath("small.fastq")]) def test_unconditional_cut_invalid_number(): with pytest.raises(SystemExit): main(["-u", "a,b", datapath("small.fastq")]) def test_untrimmed_output(run, cores, tmp_path): path = tmp_path / "untrimmed.fastq" stats = run( ["--cores", str(cores), "-a", "TTAGACATATCTCCGTCG", "--untrimmed-output", path], "small.trimmed.fastq", "small.fastq", ) assert_files_equal(cutpath("small.untrimmed.fastq"), path) assert stats.with_adapters[0] == 2 assert stats.written == 2 assert stats.written_bp[0] == 46 def test_adapter_file(run): run("-a file:" + datapath("adapter.fasta"), "illumina.fastq", "illumina.fastq.gz") def test_adapter_file_5p_anchored(run): run( "-N -g file:" + datapath("prefix-adapter.fasta"), "anchored.fasta", "anchored.fasta", ) def test_adapter_file_3p_anchored(run): run( "-N -a file:" + datapath("suffix-adapter.fasta"), "anchored-back.fasta", 
"anchored-back.fasta", ) def test_adapter_file_5p_anchored_no_indels(run): run( "-N --no-indels -g file:" + datapath("prefix-adapter.fasta"), "anchored.fasta", "anchored.fasta", ) def test_adapter_file_3p_anchored_no_indels(run): run( "-N --no-indels -a file:" + datapath("suffix-adapter.fasta"), "anchored-back.fasta", "anchored-back.fasta", ) def test_adapter_file_empty_name(run): run( "-N -a file:" + datapath("adapter-empty-name.fasta"), "illumina.fastq", "illumina.fastq.gz", ) @pytest.mark.parametrize("ext", ["", ".gz"]) def test_demultiplex(cores, tmp_path, ext): multiout = str(tmp_path / "tmp-demulti.{name}.fasta") + ext params = [ "--cores", str(cores), "-a", "first=AATTTCAGGAATT", "-a", "second=GTTCTCTAGTTCT", "-o", multiout, datapath("twoadapters.fasta"), ] main(params) for name in ("first", "second", "unknown"): actual = multiout.format(name=name) if ext == ".gz": subprocess.run(["gzip", "-d", actual], check=True) actual = actual[:-3] expected = cutpath("twoadapters.{name}.fasta".format(name=name)) assert_files_equal(expected, actual) def test_multiple_fake_anchored_adapters(run): run( "-g ^CGTCCGAAGTAGC -g ^ATTGCCCTAG " "-a TTCCATGCAGCATT$ -a CCAGTCCCCCC$ " "-a GCCGAACTTCTTAGACTGCCTTAAGGACGT", "illumina.fastq", "illumina.fastq.gz", ) def test_multiple_prefix_adapters(run): run("-g ^GTACGGATTGTTCAGTA -g ^TATTAAGCTCATTC", "multiprefix.fasta", "multi.fasta") def test_multiple_prefix_adapters_noindels(run): run( "--no-indels -g ^GTACGGATTGTTCAGTA -g ^TATTAAGCTCATTC", "multiprefix.fasta", "multi.fasta", ) def test_multiple_suffix_adapters_noindels(run): run( "--no-indels -a CGTGATTATCTTGC$ -a CCTATTAGTGGTTGAAC$", "multisuffix.fasta", "multi.fasta", ) def test_max_n(run): assert run("--max-n 0", "maxn0.fasta", "maxn.fasta").filtered["too_many_n"] == 4 assert run("--max-n 1", "maxn1.fasta", "maxn.fasta").filtered["too_many_n"] == 2 assert run("--max-n 2", "maxn2.fasta", "maxn.fasta").filtered["too_many_n"] == 1 assert run("--max-n 0.2", "maxn0.2.fasta", "maxn.fasta").filtered["too_many_n"] == 3 assert run("--max-n 0.4", "maxn0.4.fasta", "maxn.fasta").filtered["too_many_n"] == 2 def test_quiet_is_quiet(): captured_standard_output = StringIO() captured_standard_error = StringIO() setattr(captured_standard_output, "buffer", BytesIO()) setattr(captured_standard_error, "buffer", BytesIO()) old_stdout = sys.stdout old_stderr = sys.stderr try: sys.stdout = captured_standard_output sys.stderr = captured_standard_error main(["-o", os.devnull, "--quiet", datapath("small.fastq")]) finally: sys.stdout = old_stdout sys.stderr = old_stderr assert captured_standard_output.getvalue() == "" assert captured_standard_error.getvalue() == "" assert getattr(captured_standard_output, "buffer").getvalue() == b"" assert getattr(captured_standard_output, "buffer").getvalue() == b"" def test_x_brace_notation(): main(["-o", os.devnull, "--quiet", "-a", "X{5}", datapath("small.fastq")]) def test_nextseq(run): run("--nextseq-trim 22", "nextseq.fastq", "nextseq.fastq") def test_linked_explicitly_anchored(run): run("-a ^AAAAAAAAAA...TTTTTTTTTT", "linked.fasta", "linked.fasta") def test_linked_multiple(run): run( "-a ^AAAAAAAAAA...TTTTTTTTTT -a ^AAAAAAAAAA...GCGCGCGCGC", "linked.fasta", "linked.fasta", ) def test_linked_both_anchored(run): run("-a ^AAAAAAAAAA...TTTTT$", "linked-anchored.fasta", "linked.fasta") def test_linked_5p_not_anchored(run): run("-g AAAAAAAAAA...TTTTTTTTTT", "linked-not-anchored.fasta", "linked.fasta") def test_linked_discard_untrimmed(run): run( "-a ^AAAAAAAAAA...TTTTTTTTTT --discard-untrimmed", 
"linked-discard.fasta", "linked.fasta", ) def test_linked_discard_untrimmed_g(run): run( "-g AAAAAAAAAA...TTTTTTTTTT --discard-untrimmed", "linked-discard-g.fasta", "linked.fasta", ) def test_linked_lowercase(run): run( "-a ^AACCGGTTTT...GGGGGGG$ -a ^AAAA...TTTT$ --times=2 --action=lowercase", "linked-lowercase.fasta", "linked.fasta", ) def test_linked_info_file(tmp_path): info_path = tmp_path / "info.txt" main( [ "-a linkedadapter=^AAAAAAAAAA...TTTTTTTTTT", "--info-file", str(info_path), "-o", str(tmp_path / "out.fasta"), datapath("linked.fasta"), ] ) assert_files_equal( cutpath("linked-info.txt"), info_path, ignore_trailing_space=True ) def test_linked_anywhere(): with pytest.raises(SystemExit): main(["-b", "AAA...TTT", datapath("linked.fasta")]) def test_anywhere_anchored_5p(): with pytest.raises(SystemExit): main(["-b", "^AAA", datapath("small.fastq")]) def test_anywhere_anchored_3p(): with pytest.raises(SystemExit): main(["-b", "TTT$", datapath("small.fastq")]) def test_fasta(run): run("-a TTAGACATATCTCCGTCG", "small.fasta", "small.fastq") def test_fasta_no_trim(run): run([], "small-no-trim.fasta", "small.fastq") def test_length(run): run("--length 5", "shortened.fastq", "small.fastq") def test_negative_length(run): run("--length -5", "shortened-negative.fastq", "small.fastq") @pytest.mark.timeout(0.5) def test_issue_296(tmp_path): # Hang when using both --no-trim and --info-file together info_path = tmp_path / "info.txt" reads_path = tmp_path / "reads.fasta" out_path = tmp_path / "out.fasta" reads_path.write_text(">read\nCACAAA\n") main( [ "--info-file", str(info_path), "--no-trim", "-g", "TTTCAC", "-o", str(out_path), str(reads_path), ] ) # Output should be unchanged because of --no-trim assert_files_equal(reads_path, out_path) def test_xadapter(run): run("-g XTCCGAATAGA", "xadapter.fasta", "xadapterx.fasta") def test_adapterx(run): run("-a TCCGAATAGAX", "adapterx.fasta", "xadapterx.fasta") def test_not_rightmost(tmp_path): path = tmp_path / "reads.fasta" path.write_text(">r\nGGCTGAATTGGACTGAATTGGGT\n") trimmed = tmp_path / "trimmed.fasta" main(["-g", "CTGAATT", "-o", str(trimmed), str(path)]) assert trimmed.read_text() == ">r\nGGACTGAATTGGGT\n" def test_rightmost(tmp_path): path = tmp_path / "reads.fasta" path.write_text(">r\nGGCTGAATTGGACTGAATTGGGT\n") trimmed = tmp_path / "trimmed.fasta" main(["-g", "CTGAATT;rightmost", "-o", str(trimmed), str(path)]) assert trimmed.read_text() == ">r\nGGGT\n" def test_discard_casava(run): stats = run("--discard-casava", "casava.fastq", "casava.fastq") assert stats.filtered["casava_filtered"] == 1 def test_underscore(run): """File name ending in _fastq.gz (issue #275)""" run("-b TTAGACATATCTCCGTCG", "small.fastq", "underscore_fastq.gz") def test_cores_autodetect(run): # Just make sure that it runs; functionality is not tested run("--cores 0 -b TTAGACATATCTCCGTCG", "small.fastq", "underscore_fastq.gz") def test_write_compressed_fastq(cores, tmp_path): main( [ "--cores", str(cores), "-o", str(tmp_path / "out.fastq.gz"), datapath("small.fastq"), ] ) def test_minimal_report(run): run("-b TTAGACATATCTCCGTCG --report=minimal", "small.fastq", "small.fastq") def test_paired_separate(run): """test separate trimming of paired-end reads""" run("-a TTAGACATAT", "paired-separate.1.fastq", "paired.1.fastq") run("-a CAGTGGAGTA", "paired-separate.2.fastq", "paired.2.fastq") def test_empty_read_with_wildcard_in_adapter(run): run("-g CWC", "empty.fastq", "empty.fastq") def test_print_progress_to_tty(tmp_path, mocker): 
mocker.patch("cutadapt.utils.sys.stderr").isatty.return_value = True main(["-o", str(tmp_path / "out.fastq"), datapath("small.fastq")]) def test_adapter_order(run): run("-g ^AAACC -a CCGGG", "adapterorder-ga.fasta", "adapterorder.fasta") run("-a CCGGG -g ^AAACC", "adapterorder-ag.fasta", "adapterorder.fasta") def test_reverse_complement_no_rc_suffix(run, tmp_path): out_path = tmp_path / "out.fastq" main( [ "-o", str(out_path), "--revcomp", "--no-index", "--rename", "{header}", "-g", "^TTATTTGTCT", "-g", "^TCCGCACTGG", datapath("revcomp.1.fastq"), ] ) with dnaio.open(out_path) as f: reads = list(f) assert len(reads) == 6 assert reads[1].name == "read2/1" assert reads[1].sequence == "ACCATCCGATATGTCTAATGTGGCCTGTTG" def test_reverse_complement_normalized(run): stats = run( "--revcomp --no-index -g ^TTATTTGTCT -g ^TCCGCACTGG", "revcomp-single-normalize.fastq", "revcomp.1.fastq", ) assert stats.n == 6 assert stats.reverse_complemented == 2 def test_reverse_complement_and_info_file(run, tmp_path, cores): info_path = str(tmp_path / "info.txt") run( [ "--revcomp", "--no-index", "-g", "^TTATTTGTCT", "-g", "^TCCGCACTGG", "--info-file", info_path, ], "revcomp-single-normalize.fastq", "revcomp.1.fastq", ) with open(info_path) as f: lines = f.readlines() assert len(lines) == 6 assert lines[0].split("\t")[0] == "read1/1" assert lines[1].split("\t")[0] == "read2/1 rc" def test_max_expected_errors(run, cores): stats = run("--max-ee=0.9", "maxee.fastq", "maxee.fastq") assert stats.filtered["too_many_expected_errors"] == 2 def test_max_expected_errors_fasta(tmp_path): path = tmp_path / "input.fasta" path.write_text(">read\nACGTACGT\n") main(["--max-ee=0.001", "-o", os.devnull, str(path)]) def test_warn_if_en_dashes_used(): with pytest.raises(SystemExit): main(["–q", "25", "-o", os.devnull, "in.fastq"]) @pytest.mark.parametrize("opt", ["-y", "--suffix"]) def test_suffix(opt, run): """-y/--suffix parameter""" run( [opt, " {name}", "-e", "0", "-a", "OnlyT=TTTTTTTT", "-a", "OnlyG=GGGGGGGG"], "suffix.fastq", "suffix.fastq", ) @pytest.mark.parametrize("opt", ["--prefix", "--suffix"]) def test_rename_cannot_be_combined_with_other_renaming_options(opt): with pytest.raises(SystemExit): main( [ opt, "something", "--rename='{id} {comment} extrainfo'", "-o", os.devnull, datapath("empty.fastq"), ] ) @pytest.mark.skipif(sys.platform == "win32", reason="Disabled on Windows") def test_duplicate_output_paths(tmp_path): path = str(tmp_path / "discard.fastq") with pytest.raises(SystemExit): main( [ "--untrimmed-output", path, "--too-long-output", path, "-o", os.devnull, datapath("empty.fastq"), ] ) # "specified more than once as an output file" def test_rename(run, cores): run( [ "--rename={id}_{cut_suffix} {header} {adapter_name}", "--cut=-4", "-a", "OnlyT=TTTTTT", "-a", "OnlyG=GGGGGG", "--cores", str(cores), ], "rename.fastq", "suffix.fastq", ) def test_terminates_correctly_on_error_in_subprocess(tmp_path): params = [ "-j", "2", "-o", str(tmp_path / "out.fastq.gz"), datapath("format-error.fastq"), ] with pytest.raises(SystemExit): main(params) def test_json_report_with_demultiplexing_and_discard_untrimmed(tmp_path): stats = main( [ "--json", str(tmp_path / "demux.cutadapt.json"), "--discard-untrimmed", "-a", "name=ACGT", "-o", str(tmp_path / "{name}.fastq"), datapath("illumina.fastq.gz"), ] ) assert stats.n == 100 assert stats.written == 64 cutadapt-4.7/tests/test_files.py000066400000000000000000000073621457457704700171020ustar00rootroot00000000000000import os import pickle from cutadapt.files import ProxyTextFile, 
ProxyRecordWriter, OutputFiles from dnaio import SequenceRecord def test_proxy_text_file(): newline = os.linesep.encode() pf = ProxyTextFile() print("hello", file=pf) assert pf.drain() == [b"hello" + newline] assert pf.drain() == [b""] print("world", file=pf, end="\n") print("foo", file=pf, end="\n") assert pf.drain() == [b"world" + newline + b"foo" + newline] def test_proxy_test_file_pickleable(): pf = ProxyTextFile() pickled = pickle.dumps(pf) unpickled = pickle.loads(pickled) assert isinstance(unpickled, ProxyTextFile) def test_proxy_record_writer(): pw = ProxyRecordWriter(n_files=1, qualities=True) pw.write(SequenceRecord("name", "ACGT", qualities="####")) assert pw.drain() == [ b"@name\nACGT\n+\n####\n", ] pw.write(SequenceRecord("foo", "AA", "HH")) pw.write(SequenceRecord("bar", "CC", ",,")) assert pw.drain() == [ b"@foo\nAA\n+\nHH\n@bar\nCC\n+\n,,\n", ] def test_proxy_record_writer_paired(): pw = ProxyRecordWriter(n_files=2, qualities=True) pw.write( SequenceRecord("name", "ACGT", qualities="####"), SequenceRecord("name", "GGGG", qualities="!!!!"), ) assert pw.drain() == [b"@name\nACGT\n+\n####\n", b"@name\nGGGG\n+\n!!!!\n"] pw.write( SequenceRecord("foo", "AA", "HH"), SequenceRecord("foo", "TT", "33"), ) pw.write( SequenceRecord("bar", "CC", ",,"), SequenceRecord("bar", "GGG", "444"), ) assert pw.drain() == [ b"@foo\nAA\n+\nHH\n@bar\nCC\n+\n,,\n", b"@foo\nTT\n+\n33\n@bar\nGGG\n+\n444\n", ] def test_proxy_record_writer_picklable(): pw = ProxyRecordWriter(n_files=2, qualities=True) pickled = pickle.dumps(pw) unpickled = pickle.loads(pickled) assert isinstance(unpickled, ProxyRecordWriter) assert unpickled._n_files == 2 class TestOutputFiles: def test_open_text(self, tmp_path): o = OutputFiles( proxied=False, qualities=False, interleaved=False, ) path = tmp_path / "out.txt" f = o.open_text(path) print("Hello", file=f) o.close() assert path.read_text() == "Hello\n" def test_open_record_writer(self, tmp_path): o = OutputFiles( proxied=False, qualities=True, interleaved=False, ) path = tmp_path / "out.fastq" f = o.open_record_writer(path) f.write(SequenceRecord("r", "ACGT", "####")) o.close() assert path.read_text() == "@r\nACGT\n+\n####\n" def test_paired_record_writer(self, tmp_path): o = OutputFiles( proxied=False, qualities=True, interleaved=False, ) path1 = tmp_path / "out.1.fastq" path2 = tmp_path / "out.2.fastq" f = o.open_record_writer(path1, path2) f.write( SequenceRecord("r", "AACC", "####"), SequenceRecord("r", "GGTT", "####") ) o.close() assert path1.read_text() == "@r\nAACC\n+\n####\n" assert path2.read_text() == "@r\nGGTT\n+\n####\n" def test_interleaved_record_writer(self, tmp_path): o = OutputFiles( proxied=False, qualities=True, interleaved=True, ) path = tmp_path / "out.1.fastq" f = o.open_record_writer(path, interleaved=True) f.write( SequenceRecord("r", "AACC", "####"), SequenceRecord("r", "GGTT", "####") ) o.close() assert path.read_text() == "@r\nAACC\n+\n####\n@r\nGGTT\n+\n####\n" # - test force fasta # - test qualities # - test proxied # - test complaint about duplicate file names cutadapt-4.7/tests/test_kmer_finder.py000066400000000000000000000101141457457704700202520ustar00rootroot00000000000000import string import pytest from cutadapt._match_tables import matches_lookup from cutadapt.adapters import KmerFinder from cutadapt._kmer_finder import MAXIMUM_WORD_SIZE KMER_FINDER_TESTS = [ # kmer, start, stop, ref_wildcards, query_wildcards, sequence, expected ("ACGT", 0, None, False, False, "ACGTACG", True), ("ACGT", 0, None, False, False, "ACgtACG", True), ("acgt", 
0, None, False, False, "ACgtACG", True), ("ACGT", 0, None, False, False, "acgtacg", True), ("ACGT", 0, None, False, False, "gacgact", False), ("ACGT", 0, None, False, True, "ACGNACG", True), ("ACGT", 0, None, False, False, "ACGNACG", False), ("ACGN", 0, None, True, False, "ACGTACG", True), ("ACGN", 0, None, True, False, "ACGxACG", True), ("ACKN", 0, None, True, False, "ACGTACG", True), ("ACKN", 0, None, True, True, "ACWRACG", True), ("ACKN", 0, None, True, True, "ACWxACG", False), ] @pytest.mark.parametrize( [ "kmer", "start", "stop", "ref_wildcards", "query_wildcards", "sequence", "expected", ], KMER_FINDER_TESTS, ) def test_kmer_finder( kmer: str, start: int, stop: int, ref_wildcards: bool, query_wildcards: bool, sequence: str, expected: bool, ): kmer_finder = KmerFinder([(start, stop, [kmer])], ref_wildcards, query_wildcards) assert kmer_finder.kmers_present(sequence) is expected @pytest.mark.parametrize( ["ref_wildcards", "query_wildcards"], [ (False, False), (True, False), (False, True), (True, True), ], ) def test_kmer_finder_per_char_matching(ref_wildcards, query_wildcards): match_table = matches_lookup(ref_wildcards, query_wildcards) for char in string.ascii_letters: matches = match_table[ord(char)] positions_and_kmers = [(0, None, [char])] kmer_finder = KmerFinder( positions_and_kmers, ref_wildcards=ref_wildcards, query_wildcards=query_wildcards, ) for comp_char in string.ascii_letters: should_match = comp_char.encode("ascii") in matches if kmer_finder.kmers_present(comp_char) is not should_match: raise ValueError( f"{char} should{' ' if should_match else ' not '}match {comp_char}" ) def test_kmer_finder_initialize_bigword(): with pytest.raises(ValueError) as error: KmerFinder([(0, None, ["A" * (MAXIMUM_WORD_SIZE + 1)])]) error.match("A" * (MAXIMUM_WORD_SIZE + 1)) error.match(str(MAXIMUM_WORD_SIZE)) def test_kmer_finder_initialize_total_greater_than_max(): kmer_finder = KmerFinder([(0, None, ["A" * 32, "B" * 32, "C" * 32, "D" * 43])]) assert kmer_finder.kmers_present("X" * 100 + "A" * 32) assert kmer_finder.kmers_present("X" * 100 + "B" * 32) assert kmer_finder.kmers_present("X" * 100 + "C" * 32) assert kmer_finder.kmers_present("X" * 100 + "D" * 43) assert not kmer_finder.kmers_present(string.ascii_letters) def test_kmer_finder_finds_all(): kmer_finder = KmerFinder([(0, None, ["teenage", "mutant", "ninja", "turtles"])]) assert kmer_finder.kmers_present("Smells like teenage spirit") assert kmer_finder.kmers_present("Everyone with a SNP is technically a mutant.") assert kmer_finder.kmers_present("He made a ninja PR that was merged before review") assert kmer_finder.kmers_present( "Turtles are treated as outgroup, for 'more advanced' reptiles but " "molecular evidence suggests they are more close to the dinosaurs than " "previously thought." ) assert not kmer_finder.kmers_present( "A turtle may be slow, but it also lives for a long time." ) def test_kmer_finder_finds_in_region(): kmer_finder = KmerFinder([(-20, None, ["peace"])]) # Finding peace, quotes from Mahatma Gandhi assert kmer_finder.kmers_present("Each one has to find his peace from within") # Peace not found here because outside of the search range. assert not kmer_finder.kmers_present( "And peace to be real must be unaffected by outside circumstances." 
) cutadapt-4.7/tests/test_kmer_heuristic.py000066400000000000000000000073261457457704700210150ustar00rootroot00000000000000import pytest from cutadapt.kmer_heuristic import ( kmer_chunks, minimize_kmer_search_list, create_back_overlap_searchsets, create_positions_and_kmers, ) @pytest.mark.parametrize( ["sequence", "chunks", "expected"], [ ("ABC", 3, {"A", "B", "C"}), ("ABCD", 3, {"AB", "C", "D"}), ], ) def test_kmer_chunks(sequence, chunks, expected): assert kmer_chunks(sequence, chunks) == expected @pytest.mark.parametrize( ["kmer_search_list", "expected"], [ ([("ABC", -33, None), ("ABC", -19, None)], [("ABC", -33, None)]), ( [("ABC", -33, None), ("ABC", -19, None), ("ABC", 0, None)], [("ABC", 0, None)], ), ([("ABC", 0, 10), ("ABC", 0, 20)], [("ABC", 0, 20)]), ([("ABC", 0, 10), ("ABC", 0, 20), ("ABC", 0, None)], [("ABC", 0, None)]), ([("ABC", 0, 10), ("ABC", -19, None), ("ABC", 0, None)], [("ABC", 0, None)]), ([("ABC", 0, 10), ("ABC", -19, None)], [("ABC", 0, 10), ("ABC", -19, None)]), ], ) def test_minimize_kmer_search_list(kmer_search_list, expected): result = minimize_kmer_search_list(kmer_search_list) assert set(result) == set(expected) def test_create_back_overlap_searchsets(): adapter = "ABCDEFGHIJ0123456789" searchsets = create_back_overlap_searchsets(adapter, 3, 0.1) assert len(searchsets) == 5 assert (-3, None, {"ABC"}) in searchsets assert (-4, None, {"ABCD"}) in searchsets assert (-9, None, {"ABCDE"}) in searchsets assert (-19, None, kmer_chunks(adapter[:10], 2)) in searchsets assert (-20, None, kmer_chunks(adapter, 3)) in searchsets @pytest.mark.parametrize( ["kwargs", "expected"], [ ( dict(back_adapter=True, front_adapter=False, internal=True, min_overlap=3), [ (-3, None, ["ABC"]), (-4, None, ["ABCD"]), (-19, None, ["ABCDE", "FGHIJ"]), (0, None, ["ABCDEFG", "HIJ0123", "456789"]), ], ), ( dict(back_adapter=True, front_adapter=False, internal=False, min_overlap=3), [ (-3, None, ["ABC"]), (-4, None, ["ABCD"]), (-19, None, ["ABCDE", "FGHIJ"]), (-20, None, ["ABCDEFG", "HIJ0123", "456789"]), ], ), ( dict(back_adapter=False, front_adapter=True, internal=False, min_overlap=3), [ (0, 3, ["789"]), (0, 4, ["6789"]), (0, 19, ["01234", "56789"]), (0, 20, ["ABCDEF", "GHIJ012", "3456789"]), ], ), ( dict(back_adapter=True, front_adapter=False, internal=True, min_overlap=20), [ (0, None, ["ABCDEFG", "HIJ0123", "456789"]), ], ), ( dict(back_adapter=False, front_adapter=False, internal=True, min_overlap=3), [ (0, None, ["ABCDEFG", "HIJ0123", "456789"]), ], ), ], ) def test_create_kmers_and_positions(kwargs, expected): adapter = "ABCDEFGHIJ0123456789" result = create_positions_and_kmers( adapter, error_rate=0.1, **kwargs, ) assert {(start, stop): frozenset(kmers) for start, stop, kmers in result} == { (start, stop): frozenset(kmers) for start, stop, kmers in expected } @pytest.mark.timeout(0.5) def test_create_positions_and_kmers_slow(): create_positions_and_kmers( # Ridiculous size to check if there aren't any quadratic or exponential # algorithms in the code. 
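        # (The timeout(0.5) marker above turns a slow construction into a
        # test failure.)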
"A" * 1000, min_overlap=3, error_rate=0.1, back_adapter=True, front_adapter=False, internal=True, ) cutadapt-4.7/tests/test_main.py000066400000000000000000000030631457457704700167160ustar00rootroot00000000000000import pytest from cutadapt.cli import ( main, parse_cutoffs, parse_lengths, CommandLineError, setup_logging, ) def test_help(): with pytest.raises(SystemExit) as e: main(["--help"]) assert e.value.args[0] == 0 def test_parse_cutoffs(): assert parse_cutoffs("5") == (0, 5) assert parse_cutoffs("6,7") == (6, 7) with pytest.raises(CommandLineError): parse_cutoffs("a,7") with pytest.raises(CommandLineError): parse_cutoffs("a") with pytest.raises(CommandLineError): parse_cutoffs("a,7") with pytest.raises(CommandLineError): parse_cutoffs("1,2,3") def test_parse_lengths(): assert parse_lengths("25") == (25,) assert parse_lengths("17:25") == (17, 25) assert parse_lengths("25:") == (25, None) assert parse_lengths(":25") == (None, 25) with pytest.raises(CommandLineError): parse_lengths("1:2:3") with pytest.raises(CommandLineError): parse_lengths("a:2") with pytest.raises(CommandLineError): parse_lengths("a") with pytest.raises(CommandLineError): parse_lengths("2:a") with pytest.raises(CommandLineError): parse_lengths(":") def test_setup_logging(): import logging logger = logging.getLogger(__name__) setup_logging(logger, log_to_stderr=False, quiet=False, minimal=False, debug=False) logger.info("Log message") setup_logging(logger, log_to_stderr=False, debug=1) setup_logging(logger, log_to_stderr=False, quiet=True) setup_logging(logger, log_to_stderr=False, minimal=True) cutadapt-4.7/tests/test_modifiers.py000066400000000000000000000401771457457704700177620ustar00rootroot00000000000000from typing import List import pytest from dnaio import SequenceRecord from cutadapt.adapters import ( BackAdapter, PrefixAdapter, IndexedPrefixAdapters, LinkedAdapter, FrontAdapter, Adapter, RemoveBeforeMatch, RemoveAfterMatch, LinkedMatch, ) from cutadapt.modifiers import ( UnconditionalCutter, NEndTrimmer, QualityTrimmer, Shortener, AdapterCutter, PairedAdapterCutter, ModificationInfo, ZeroCapper, Renamer, ReverseComplementer, InvalidTemplate, PairedEndRenamer, PairedReverseComplementer, ) def test_unconditional_cutter(): UnconditionalCutter(length=5) read = SequenceRecord("r1", "abcdefg") info = ModificationInfo(read) assert UnconditionalCutter(length=2)(read, info).sequence == "cdefg" assert info.cut_prefix == "ab" assert info.cut_suffix is None info = ModificationInfo(read) assert UnconditionalCutter(length=-2)(read, info).sequence == "abcde" assert info.cut_suffix == "fg" assert info.cut_prefix is None assert UnconditionalCutter(length=100)(read, info).sequence == "" assert UnconditionalCutter(length=-100)(read, info).sequence == "" def test_reverse_complementer(): adapters = [ PrefixAdapter("TTATTTGTCT"), PrefixAdapter("TCCGCACTGG"), ] adapter_cutter = AdapterCutter(adapters, index=False) reverse_complementer = ReverseComplementer(adapter_cutter) read = SequenceRecord("r", "ttatttgtctCCAGCTTAGACATATCGCCT") info = ModificationInfo(read) trimmed = reverse_complementer(read, info) assert trimmed.sequence == "CCAGCTTAGACATATCGCCT" assert not info.is_rc read = SequenceRecord("r", "CAACAGGCCACATTAGACATATCGGATGGTagacaaataa") info = ModificationInfo(read) trimmed = reverse_complementer(read, info) assert trimmed.sequence == "ACCATCCGATATGTCTAATGTGGCCTGTTG" assert info.is_rc def test_zero_capper(): zc = ZeroCapper() read = SequenceRecord("r1", "ACGT", "# !%") result = zc(read, ModificationInfo(read)) assert 
result.sequence == "ACGT" assert result.qualities == "#!!%" def test_nend_trimmer(): trimmer = NEndTrimmer() seqs = ["NNNNAAACCTTGGNNN", "NNNNAAACNNNCTTGGNNN", "NNNNNN"] trims = ["AAACCTTGG", "AAACNNNCTTGG", ""] for seq, trimmed in zip(seqs, trims): _seq = SequenceRecord("read1", seq, qualities="#" * len(seq)) _trimmed = SequenceRecord("read1", trimmed, qualities="#" * len(trimmed)) assert trimmer(_seq, ModificationInfo(_seq)) == _trimmed def test_quality_trimmer(): read = SequenceRecord("read1", "ACGTTTACGTA", "##456789###") qt = QualityTrimmer(10, 10, 33) assert qt(read, ModificationInfo(read)) == SequenceRecord( "read1", "GTTTAC", "456789" ) qt = QualityTrimmer(0, 10, 33) assert qt(read, ModificationInfo(read)) == SequenceRecord( "read1", "ACGTTTAC", "##456789" ) qt = QualityTrimmer(10, 0, 33) assert qt(read, ModificationInfo(read)) == SequenceRecord( "read1", "GTTTACGTA", "456789###" ) def test_shortener(): read = SequenceRecord("read1", "ACGTTTACGTA", "##456789###") shortener = Shortener(0) assert shortener(read, ModificationInfo(read)) == SequenceRecord("read1", "", "") shortener = Shortener(1) assert shortener(read, ModificationInfo(read)) == SequenceRecord("read1", "A", "#") shortener = Shortener(5) assert shortener(read, ModificationInfo(read)) == SequenceRecord( "read1", "ACGTT", "##456" ) shortener = Shortener(100) assert shortener(read, ModificationInfo(read)) == read def test_adapter_cutter_indexing(): adapters = [ PrefixAdapter(sequence, max_errors=1, indels=False) for sequence in ["ACGAT", "GGAC", "TTTACTTA", "TAACCGGT", "GTTTACGTA", "CGATA"] ] ac = AdapterCutter(adapters) assert len(ac.adapters) == 1 assert isinstance(ac.adapters[0], IndexedPrefixAdapters) ac = AdapterCutter(adapters, index=False) assert len(ac.adapters) == len(adapters) class TestPairedAdapterCutter: @pytest.mark.parametrize( "action,expected_trimmed1,expected_trimmed2", [ (None, "CCCCGGTTAACCCC", "TTTTAACCGGTTTT"), ("trim", "CCCC", "TTTT"), ("lowercase", "CCCCggttaacccc", "TTTTaaccggtttt"), ("mask", "CCCCNNNNNNNNNN", "TTTTNNNNNNNNNN"), ("retain", "CCCCGGTTAA", "TTTTAACCGG"), ], ) def test_actions(self, action, expected_trimmed1, expected_trimmed2): a1 = BackAdapter("GGTTAA") a2 = BackAdapter("AACCGG") s1 = SequenceRecord("name", "CCCCGGTTAACCCC") s2 = SequenceRecord("name", "TTTTAACCGGTTTT") pac = PairedAdapterCutter([a1], [a2], action=action) info1 = ModificationInfo(s1) info2 = ModificationInfo(s2) trimmed1, trimmed2 = pac(s1, s2, info1, info2) assert expected_trimmed1 == trimmed1.sequence assert expected_trimmed2 == trimmed2.sequence def test_multiple_occurrences(self): r1_a1 = BackAdapter("AAAAAA") r1_a2 = BackAdapter("CCCC") r2_a1 = BackAdapter("GGGG") r2_a2 = BackAdapter("TTTT") s1 = SequenceRecord("name", "TTAAAAAATTCCCCTT") s2 = SequenceRecord("name", "ACACTTTTACAC") pac = PairedAdapterCutter([r1_a1, r1_a2], [r2_a1, r2_a2], action="lowercase") info1 = ModificationInfo(s1) info2 = ModificationInfo(s2) trimmed1, trimmed2 = pac(s1, s2, info1, info2) assert len(info1.matches) == 1 and info1.matches[0].adapter is r1_a2 assert len(info2.matches) == 1 and info2.matches[0].adapter is r2_a2 assert "TTAAAAAATTcccctt" == trimmed1.sequence assert "ACACttttacac" == trimmed2.sequence def test_retain_times(): with pytest.raises(ValueError) as e: AdapterCutter([BackAdapter("ACGT")], times=2, action="retain") assert "cannot be combined with times" in e.value.args[0] def test_action_retain(): back = BackAdapter("AACCGG") ac = AdapterCutter([back], action="retain") seq = SequenceRecord("r1", "ATTGCCAACCGGTATATAT") 
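    # With action="retain", only the bases following the AACCGG match are
    # trimmed; the matched adapter sequence itself stays in the read.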
info = ModificationInfo(seq) trimmed = ac(seq, info) assert "ATTGCCAACCGG" == trimmed.sequence @pytest.mark.parametrize( "s,expected", [ ("ATTATTggttaaccAAAAAaaccggTATT", "ggttaaccAAAAAaaccgg"), ("AAAAAaaccggTATT", "AAAAAaaccgg"), ("ATTATTggttaaccAAAAA", "ggttaaccAAAAA"), ("ATTATT", "ATTATT"), ], ) def test_linked_action_retain(s, expected): front = FrontAdapter("GGTTAACC") back = BackAdapter("AACCGG") adapters: List[Adapter] = [ LinkedAdapter( front, back, front_required=False, back_required=False, name="linked" ) ] ac = AdapterCutter(adapters, action="retain") seq = SequenceRecord("r1", s) info = ModificationInfo(seq) trimmed = ac(seq, info) assert expected == trimmed.sequence class TestRenamer: def test_invalid_template_variable(self): with pytest.raises(InvalidTemplate): Renamer("{id} {invalid}") def test_header_template_variable(self): renamer = Renamer("{header} extra") read = SequenceRecord("theid thecomment", "ACGT") info = ModificationInfo(read) assert renamer(read, info).name == "theid thecomment extra" def test_id_template_variable(self): renamer = Renamer("{id} extra") read = SequenceRecord("theid thecomment", "ACGT") info = ModificationInfo(read) assert renamer(read, info).name == "theid extra" def test_tab_escape(self): renamer = Renamer(r"{id} extra\tand a tab") read = SequenceRecord("theid thecomment", "ACGT") info = ModificationInfo(read) assert renamer(read, info).name == "theid extra\tand a tab" def test_comment_template_variable(self): renamer = Renamer("{id}_extra {comment}") read = SequenceRecord("theid thecomment", "ACGT") info = ModificationInfo(read) assert renamer(read, info).name == "theid_extra thecomment" def test_comment_template_variable_missing_comment(self): renamer = Renamer("{id}_extra {comment}") read = SequenceRecord("theid", "ACGT") info = ModificationInfo(read) assert renamer(read, info).name == "theid_extra " def test_cut_prefix_template_variable(self): renamer = Renamer("{id}_{cut_prefix} {comment}") read = SequenceRecord("theid thecomment", "ACGT") info = ModificationInfo(read) info.cut_prefix = "TTAAGG" assert renamer(read, info).name == "theid_TTAAGG thecomment" def test_cut_suffix_template_variable(self): renamer = Renamer("{id}_{cut_suffix} {comment}") read = SequenceRecord("theid thecomment", "ACGT") info = ModificationInfo(read) info.cut_suffix = "TTAAGG" assert renamer(read, info).name == "theid_TTAAGG thecomment" def test_rc_template_variable(self): renamer = Renamer("{id} rc={rc} {comment}") read = SequenceRecord("theid thecomment", "ACGT") info = ModificationInfo(read) assert renamer(read, info).name == "theid rc= thecomment" read = SequenceRecord("theid thecomment", "ACGT") info.is_rc = True assert renamer(read, info).name == "theid rc=rc thecomment" def test_match_sequence(self): sequence = "TTTTCCCCACGTGGGG" read = SequenceRecord("theid thecomment", sequence) adapter = BackAdapter("AGGT") info = ModificationInfo(read) info.matches.append( RemoveBeforeMatch( astart=0, astop=4, rstart=8, rstop=12, score=3, errors=1, adapter=adapter, sequence=sequence, ) ) renamer = Renamer("{header} match={match_sequence}") renamer(read, info) assert read.name == "theid thecomment match=ACGT" def test_match_sequence_linked_match(self): sequence = "TATTCCCCACGTGGGG" read = SequenceRecord("theid thecomment", sequence) adapter1 = PrefixAdapter("TTTT") adapter2 = BackAdapter("AGGT") linked_adapter = LinkedAdapter( adapter1, adapter2, front_required=True, back_required=False, name="name", ) info = ModificationInfo(read) before_match = RemoveBeforeMatch( 
astart=0, astop=4, rstart=0, rstop=4, score=3, errors=1, adapter=adapter1, sequence=sequence, ) after_match = RemoveAfterMatch( astart=0, astop=4, rstart=4, rstop=8, score=3, errors=1, adapter=adapter2, sequence=sequence[4:], ) info.matches.append(LinkedMatch(before_match, after_match, linked_adapter)) renamer = Renamer("{header} match={match_sequence}") renamer(read, info) assert read.name == "theid thecomment match=TATT,ACGT" class TestPairedEndRenamer: def test_invalid_template_variable(self): with pytest.raises(InvalidTemplate): PairedEndRenamer("{id} {invalid}") def test_tab_escape(self): renamer = PairedEndRenamer(r"{id} {comment}\tand a tab") r1 = SequenceRecord("theid comment1", "ACGT") r2 = SequenceRecord("theid comment2", "ACGT") info1 = ModificationInfo(r1) info2 = ModificationInfo(r2) renamed1, renamed2 = renamer(r1, r2, info1, info2) assert renamed1.name == "theid comment1\tand a tab" assert renamed2.name == "theid comment2\tand a tab" def test_ids_not_identical(self): renamer = PairedEndRenamer("{id} abc {comment} xyz") r1 = SequenceRecord("theid_a cmtx", "ACGT") r2 = SequenceRecord("theid_b cmty", "ACGT") info1 = ModificationInfo(r1) info2 = ModificationInfo(r2) with pytest.raises(ValueError) as e: renamer(r1, r2, info1, info2) assert "not identical" in e.value.args[0] def test_comment(self): renamer = PairedEndRenamer("{id} abc {comment} xyz") r1 = SequenceRecord("theid cmtx", "ACGT") r2 = SequenceRecord("theid cmty", "ACGT") info1 = ModificationInfo(r1) info2 = ModificationInfo(r2) renamed1, renamed2 = renamer(r1, r2, info1, info2) assert renamed1.name == "theid abc cmtx xyz" assert renamed2.name == "theid abc cmty xyz" def test_r1_comment(self): renamer = PairedEndRenamer("{id} abc {r1.comment} xyz") r1 = SequenceRecord("theid cmtx", "ACGT") r2 = SequenceRecord("theid cmty", "ACGT") info1 = ModificationInfo(r1) info2 = ModificationInfo(r2) renamed1, renamed2 = renamer(r1, r2, info1, info2) assert renamed1.name == "theid abc cmtx xyz" assert renamed2.name == "theid abc cmtx xyz" def test_r2_comment(self): renamer = PairedEndRenamer("{id} abc {r2.comment} xyz") r1 = SequenceRecord("theid cmtx", "ACGT") r2 = SequenceRecord("theid cmty", "ACGT") info1 = ModificationInfo(r1) info2 = ModificationInfo(r2) renamed1, renamed2 = renamer(r1, r2, info1, info2) assert renamed1.name == "theid abc cmty xyz" assert renamed2.name == "theid abc cmty xyz" def test_read_number(self): renamer = PairedEndRenamer("{id} read no. is: {rn}") r1 = SequenceRecord("theid cmtx", "ACGT") r2 = SequenceRecord("theid cmty", "ACGT") info1 = ModificationInfo(r1) info2 = ModificationInfo(r2) renamed1, renamed2 = renamer(r1, r2, info1, info2) assert renamed1.name == "theid read no. is: 1" assert renamed2.name == "theid read no. 
is: 2" def test_match_sequence(self): r1 = SequenceRecord("theid first", "AACC") info1 = ModificationInfo(r1) info1.matches.append( RemoveBeforeMatch( astart=2, astop=4, rstart=1, rstop=3, score=1, errors=1, adapter=FrontAdapter("AT"), sequence="AACC", ) ) r2 = SequenceRecord("theid second", "GGTT") info2 = ModificationInfo(r2) info2.matches.append( RemoveBeforeMatch( astart=2, astop=4, rstart=1, rstop=3, score=1, errors=1, adapter=FrontAdapter("GA"), sequence="GGTT", ) ) renamer = PairedEndRenamer("{header} s={match_sequence}") renamed1, renamed2 = renamer(r1[:], r2[:], info1, info2) assert renamed1.name == "theid first s=AC" assert renamed2.name == "theid second s=GT" renamer = PairedEndRenamer("{header} s={r1.match_sequence}") renamed1, renamed2 = renamer(r1[:], r2[:], info1, info2) assert renamed1.name == "theid first s=AC" assert renamed2.name == "theid second s=AC" renamer = PairedEndRenamer("{header} s={r2.match_sequence}") renamed1, renamed2 = renamer(r1[:], r2[:], info1, info2) assert renamed1.name == "theid first s=GT" assert renamed2.name == "theid second s=GT" def test_paired_reverse_complementer(): cutter1 = AdapterCutter([PrefixAdapter("AACC")], index=False) cutter2 = AdapterCutter([PrefixAdapter("GGCC")], index=False) revcomper = PairedReverseComplementer(cutter1, cutter2) # Forward (normal) r1 = SequenceRecord("a", "AACCAAAAA") r2 = SequenceRecord("a", "GGCCTTTTT") info1 = ModificationInfo(r1) info2 = ModificationInfo(r2) trimmed1, trimmed2 = revcomper(r1, r2, info1, info2) assert trimmed1.sequence == "AAAAA" assert trimmed2.sequence == "TTTTT" assert trimmed1.name == "a" assert trimmed2.name == "a" # Reversed (R1/R2 swapped) r1, r2 = r2, r1 info1 = ModificationInfo(r1) info2 = ModificationInfo(r2) trimmed1, trimmed2 = revcomper(r1, r2, info1, info2) assert trimmed1.sequence == "AAAAA" assert trimmed2.sequence == "TTTTT" assert trimmed1.name == "a rc" assert trimmed2.name == "a rc" cutadapt-4.7/tests/test_paired.py000066400000000000000000000576601457457704700172520ustar00rootroot00000000000000import os import os.path import shutil from itertools import product import pytest from cutadapt.cli import main from utils import assert_files_equal, datapath, cutpath @pytest.fixture def run_paired(tmp_path): def _run(params, in1, in2, expected1, expected2, cores): if type(params) is str: params = params.split() params += ["--cores", str(cores), "--buffer-size=512"] params += ["--json", os.fspath(tmp_path / "stats.cutadapt.json")] (tmp_path / "r1").mkdir() (tmp_path / "r2").mkdir() path1 = os.fspath(tmp_path / "r1" / expected1) path2 = os.fspath(tmp_path / "r2" / expected2) params += ["-o", path1, "-p", path2] params += [datapath(in1), datapath(in2)] stats = main(params) assert_files_equal(cutpath(expected1), path1) assert_files_equal(cutpath(expected2), path2) return stats return _run @pytest.fixture def run_interleaved(tmp_path): """ Interleaved input or output (or both) """ def _run(params, inpath1, inpath2=None, expected1=None, expected2=None, cores=1): assert not (inpath1 and inpath2 and expected1 and expected2) assert not (expected2 and not expected1) assert not (inpath2 and not inpath1) params = params.split() params += ["--interleaved", "--cores", str(cores), "--buffer-size=512"] params += ["--json", os.fspath(tmp_path / "stats.cutadapt.json")] tmp1 = os.fspath(tmp_path / ("out1-" + expected1)) params += ["-o", tmp1] paths = [datapath(inpath1)] if inpath2: paths += [datapath(inpath2)] if expected2: tmp2 = os.fspath(tmp_path / ("out2-" + expected2)) params += ["-p", tmp2] 
            stats = main(params + paths)
            assert_files_equal(cutpath(expected2), tmp2)
        else:
            stats = main(params + paths)
        assert_files_equal(cutpath(expected1), tmp1)
        return stats

    return _run


def test_paired_end_no_legacy(run_paired, cores):
    """--paired-output, not using -A/-B/-G"""
    # the -m 14 filters out one read, which should then also be removed from the second file
    # Since legacy mode was removed, -q 10 should filter out an additional read which gets
    # quality-trimmed in file 2
    run_paired(
        "-a TTAGACATAT -m 14 -q 10",
        in1="paired.1.fastq",
        in2="paired.2.fastq",
        expected1="paired.m14.1.fastq",
        expected2="paired.m14.2.fastq",
        cores=cores,
    )


def test_untrimmed_paired_output(tmp_path, run_paired):
    untrimmed1 = os.fspath(tmp_path / "untrimmed.1.fastq")
    untrimmed2 = os.fspath(tmp_path / "untrimmed.2.fastq")
    run_paired(
        [
            "-a",
            "TTAGACATAT",
            "--pair-filter=first",
            "--untrimmed-output",
            untrimmed1,
            "--untrimmed-paired-output",
            untrimmed2,
        ],
        in1="paired.1.fastq",
        in2="paired.2.fastq",
        expected1="paired-trimmed.1.fastq",
        expected2="paired-trimmed.2.fastq",
        cores=1,
    )
    assert_files_equal(cutpath("paired-untrimmed.1.fastq"), untrimmed1)
    assert_files_equal(cutpath("paired-untrimmed.2.fastq"), untrimmed2)


def test_untrimmed_paired_output_automatic_pair_filter(tmp_path, run_paired):
    # When no R2 adapters are given, --pair-filter should be ignored for
    # --discard-untrimmed, --untrimmed-output, --untrimmed-paired-output
    # and always be "both" (with --pair-filter=any, all pairs would be
    # considered untrimmed because the R1 read is always untrimmed)
    untrimmed1 = os.fspath(tmp_path / "untrimmed.1.fastq")
    untrimmed2 = os.fspath(tmp_path / "untrimmed.2.fastq")
    run_paired(
        [
            "-a",
            "TTAGACATAT",
            "--untrimmed-output",
            untrimmed1,
            "--untrimmed-paired-output",
            untrimmed2,
        ],
        in1="paired.1.fastq",
        in2="paired.2.fastq",
        expected1="paired-trimmed.1.fastq",
        expected2="paired-trimmed.2.fastq",
        cores=1,
    )
    assert_files_equal(cutpath("paired-untrimmed.1.fastq"), untrimmed1)
    assert_files_equal(cutpath("paired-untrimmed.2.fastq"), untrimmed2)


def test_explicit_format_with_paired(tmp_path, run_paired):
    # Use FASTQ input files whose extension is .txt
    txt1 = os.fspath(tmp_path / "paired.1.txt")
    txt2 = os.fspath(tmp_path / "paired.2.txt")
    shutil.copyfile(datapath("paired.1.fastq"), txt1)
    shutil.copyfile(datapath("paired.2.fastq"), txt2)
    run_paired(
        "-a TTAGACATAT -m 14 -q 10",
        in1=txt1,
        in2=txt2,
        expected1="paired.m14.1.fastq",
        expected2="paired.m14.2.fastq",
        cores=1,
    )


def test_no_trimming_legacy():
    # make sure that this doesn't divide by zero
    main(
        [
            "-a",
            "XXXXX",
            "-o",
            os.devnull,
            "-p",
            os.devnull,
            datapath("paired.1.fastq"),
            datapath("paired.2.fastq"),
        ]
    )


def test_no_trimming():
    # make sure that this doesn't divide by zero
    main(
        [
            "-a",
            "XXXXX",
            "-A",
            "XXXXX",
            "-o",
            os.devnull,
            "-p",
            os.devnull,
            datapath("paired.1.fastq"),
            datapath("paired.2.fastq"),
        ]
    )


def test_missing_file(tmp_path):
    with pytest.raises(SystemExit):
        main(
            [
                "--paired-output",
                os.fspath(tmp_path / "out.fastq"),
                datapath("paired.1.fastq"),
            ]
        )


def test_first_too_short(tmp_path, cores):
    # Create a truncated file in which the last read is missing
    trunc1 = tmp_path / "truncated.1.fastq"
    with open(datapath("paired.1.fastq")) as f:
        lines = f.readlines()
    lines = lines[:-4]
    trunc1.write_text("".join(lines))
    with pytest.raises(SystemExit):
        main(
            [
                "-o",
                os.devnull,
                "--paired-output",
                os.fspath(tmp_path / "out.fastq"),
                "--cores",
                str(cores),
                str(trunc1),
                datapath("paired.2.fastq"),
            ]
        )


def test_second_too_short(tmp_path, cores):
    # Create a truncated file in which the last
read is missing trunc2 = tmp_path / "truncated.2.fastq" with open(datapath("paired.2.fastq")) as f: lines = f.readlines() lines = lines[:-4] trunc2.write_text("".join(lines)) with pytest.raises(SystemExit): main( [ "-o", os.devnull, "--paired-output", os.fspath(tmp_path / "out.fastq"), "--cores", str(cores), datapath("paired.1.fastq"), str(trunc2), ] ) def test_unmatched_read_names(tmp_path, cores): # Create a file in which reads 2 and 1 are swapped with open(datapath("paired.1.fastq")) as f: lines = f.readlines() lines = lines[0:4] + lines[8:12] + lines[4:8] + lines[12:] swapped = tmp_path / "swapped.1.fastq" swapped.write_text("".join(lines)) with pytest.raises(SystemExit): main( [ "-o", os.fspath(tmp_path / "out1.fastq"), "--paired-output", os.fspath(tmp_path / "out2.fastq"), "--cores", str(cores), str(swapped), datapath("paired.2.fastq"), ] ) def test_p_without_o(cores): """Option -p given but -o missing""" with pytest.raises(SystemExit): main( ["-a", "XX", "-p", os.devnull] + ["--cores", str(cores)] + [datapath("paired.1.fastq"), datapath("paired.2.fastq")] ) def test_paired_but_only_one_input_file(cores): """Option -p given but only one input file""" with pytest.raises(SystemExit): main( ["-a", "XX", "-o", os.devnull, "-p", os.devnull] + ["--cores", str(cores)] + [datapath("paired.1.fastq")] ) def test_no_legacy_minlength(run_paired, cores): """Legacy mode was removed: Ensure -m is applied to second read in a pair""" run_paired( "-a XXX -m 27", in1="paired.1.fastq", in2="paired.2.fastq", expected1="paired-m27.1.fastq", expected2="paired-m27.2.fastq", cores=cores, ) def test_paired_end(run_paired, cores): """single-pass paired-end with -m""" run_paired( "-a TTAGACATAT -A CAGTGGAGTA -m 14", in1="paired.1.fastq", in2="paired.2.fastq", expected1="paired.1.fastq", expected2="paired.2.fastq", cores=cores, ) def test_paired_anchored_back_no_indels(run_paired): run_paired( "-a BACKADAPTER$ -A BACKADAPTER$ -N --no-indels", in1="anchored-back.fasta", in2="anchored-back.fasta", expected1="anchored-back.fasta", expected2="anchored-back.fasta", cores=1, ) def test_paired_end_qualtrim(run_paired, cores): """single-pass paired-end with -q and -m""" run_paired( "-q 20 -a TTAGACATAT -A CAGTGGAGTA -m 14 -M 90", in1="paired.1.fastq", in2="paired.2.fastq", expected1="pairedq.1.fastq", expected2="pairedq.2.fastq", cores=cores, ) def test_paired_end_qualtrim_swapped(run_paired, cores): """single-pass paired-end with -q and -m, but files swapped""" run_paired( "-q 20 -a CAGTGGAGTA -A TTAGACATAT -m 14", in1="paired.2.fastq", in2="paired.1.fastq", expected1="pairedq.2.fastq", expected2="pairedq.1.fastq", cores=cores, ) @pytest.mark.parametrize( "args,expected1,expected2", [ ("", "lowqual.unchanged.fastq", "lowqual.unchanged.fastq"), ("-q 10", "lowqual.fastq", "lowqual.fastq"), ("-q 10 -Q 10", "lowqual.fastq", "lowqual.fastq"), ("-Q 10", "lowqual.unchanged.fastq", "lowqual.fastq"), ("-q 0 -Q 10", "lowqual.unchanged.fastq", "lowqual.fastq"), ("-q 10 -Q 0", "lowqual.fastq", "lowqual.unchanged.fastq"), ], ) def test_qualtrim_r2(run_paired, args, expected1, expected2): run_paired( args, in1="lowqual.fastq", in2="lowqual.fastq", expected1=expected1, expected2=expected2, cores=1, ) def test_paired_end_cut(run_paired, cores): run_paired( "-u 3 -u -1 -U 4 -U -2", in1="paired.1.fastq", in2="paired.2.fastq", expected1="pairedu.1.fastq", expected2="pairedu.2.fastq", cores=cores, ) def test_paired_end_upper_a_only(run_paired, cores): run_paired( "-A CAGTGGAGTA", in1="paired.1.fastq", in2="paired.2.fastq", 
expected1="paired-onlyA.1.fastq", expected2="paired-onlyA.2.fastq", cores=cores, ) def test_discard_untrimmed(run_paired, cores): # issue #146 # the first adapter is a sequence cut out from the first read run_paired( "-a CTCCAGCTTAGACATATC -A XXXXXXXX --discard-untrimmed", in1="paired.1.fastq", in2="paired.2.fastq", expected1="empty.fastq", expected2="empty.fastq", cores=cores, ) def test_discard_trimmed(run_paired, cores): run_paired( "-A C -O 1 --discard-trimmed", # applies everywhere in1="paired.1.fastq", in2="paired.2.fastq", expected1="empty.fastq", expected2="empty.fastq", cores=cores, ) def test_interleaved_in_and_out(run_interleaved, cores): """Single-pass interleaved paired-end with -q and -m""" run_interleaved( "-q 20 -a TTAGACATAT -A CAGTGGAGTA -m 14 -M 90", inpath1="interleaved.fastq", expected1="interleaved.fastq", cores=cores, ) def test_interleaved_in(run_interleaved, cores): """Interleaved input, two files output""" run_interleaved( "-q 20 -a TTAGACATAT -A CAGTGGAGTA -m 14 -M 90", inpath1="interleaved.fastq", expected1="pairedq.1.fastq", expected2="pairedq.2.fastq", cores=cores, ) def test_interleaved_out(run_interleaved, cores): """Two files input, interleaved output""" run_interleaved( "-q 20 -a TTAGACATAT -A CAGTGGAGTA -m 14 -M 90", inpath1="paired.1.fastq", inpath2="paired.2.fastq", expected1="interleaved.fastq", cores=cores, ) def test_interleaved_neither_nor(tmp_path): """Option --interleaved used, but pairs of files given for input and output""" p1 = os.fspath(tmp_path / "temp-paired.1.fastq") p2 = os.fspath(tmp_path / "temp-paired.2.fastq") params = "-a XX --interleaved".split() params += ["-o", p1, "-p1", p2, "paired.1.fastq", "paired.2.fastq"] with pytest.raises(SystemExit): main(params) def test_interleaved_untrimmed_output(tmp_path): o1 = os.fspath(tmp_path / "out.1.fastq") o2 = os.fspath(tmp_path / "out.2.fastq") untrimmed = os.fspath(tmp_path / "untrimmed.interleaved.fastq") main( [ "--interleaved", "-a", "XXXX", "-o", o1, "-p", o2, "--untrimmed-output", untrimmed, datapath("interleaved.fastq"), ] ) assert_files_equal(datapath("interleaved.fastq"), untrimmed) def test_pair_filter_both(run_paired, cores): run_paired( "--pair-filter=both -a TTAGACATAT -A GGAGTA -m 14", in1="paired.1.fastq", in2="paired.2.fastq", expected1="paired-filterboth.1.fastq", expected2="paired-filterboth.2.fastq", cores=cores, ) def test_pair_filter_first(run_paired, cores): run_paired( "--pair-filter=first -a TTAGACATAT -A GGAGTA -m 14", in1="paired.1.fastq", in2="paired.2.fastq", expected1="paired-filterfirst.1.fastq", expected2="paired-filterfirst.2.fastq", cores=cores, ) def test_too_short_paired_output(run_paired, tmp_path, cores): p1 = os.fspath(tmp_path / "too-short.1.fastq") p2 = os.fspath(tmp_path / "too-short.2.fastq") run_paired( " -a TTAGACATAT -A CAGTGGAGTA -m 14" " --too-short-output {}" " --too-short-paired-output {}".format(p1, p2), in1="paired.1.fastq", in2="paired.2.fastq", expected1="paired.1.fastq", expected2="paired.2.fastq", cores=cores, ) assert_files_equal(cutpath("paired-too-short.1.fastq"), p1) assert_files_equal(cutpath("paired-too-short.2.fastq"), p2) def test_too_long_output(run_paired, tmp_path, cores): p1 = os.fspath(tmp_path / "too-long.1.fastq") p2 = os.fspath(tmp_path / "too-long.2.fastq") run_paired( " -a TTAGACATAT -A CAGTGGAGTA -M 14" " --too-long-output {}" " --too-long-paired-output {}".format(p1, p2), in1="paired.1.fastq", in2="paired.2.fastq", expected1="paired-too-short.1.fastq", expected2="paired-too-short.2.fastq", cores=cores, ) 
assert_files_equal(cutpath("paired.1.fastq"), p1) assert_files_equal(cutpath("paired.2.fastq"), p2) def test_too_short_output_paired_option_missing(run_paired, tmp_path): p1 = os.fspath(tmp_path / "too-short.1.fastq") with pytest.raises(SystemExit): run_paired( "-a TTAGACATAT -A CAGTGGAGTA -m 14 --too-short-output " "{0}".format(p1), in1="paired.1.fastq", in2="paired.2.fastq", expected1="paired.1.fastq", expected2="paired.2.fastq", cores=1, ) def test_nextseq_paired(run_paired, cores): run_paired( "--nextseq-trim 22", in1="nextseq.fastq", in2="nextseq.fastq", expected1="nextseq.fastq", expected2="nextseq.fastq", cores=cores, ) def test_paired_demultiplex(tmp_path, cores): multiout1 = os.fspath(tmp_path / "demultiplexed.{name}.1.fastq") multiout2 = os.fspath(tmp_path / "demultiplexed.{name}.2.fastq") params = [ "--cores", str(cores), "-a", "first=AACATTAGACA", "-a", "second=CATTAGACATATCGG", "-A", "ignored=CAGTGGAGTA", "-A", "alsoignored=AATAACAGTGGAGTA", "-o", multiout1, "-p", multiout2, datapath("paired.1.fastq"), datapath("paired.2.fastq"), ] main(params) assert_files_equal( cutpath("demultiplexed.first.1.fastq"), multiout1.format(name="first") ) assert_files_equal( cutpath("demultiplexed.second.1.fastq"), multiout1.format(name="second") ) assert_files_equal( cutpath("demultiplexed.unknown.1.fastq"), multiout1.format(name="unknown") ) assert_files_equal( cutpath("demultiplexed.first.2.fastq"), multiout2.format(name="first") ) assert_files_equal( cutpath("demultiplexed.second.2.fastq"), multiout2.format(name="second") ) assert_files_equal( cutpath("demultiplexed.unknown.2.fastq"), multiout2.format(name="unknown") ) @pytest.mark.parametrize( "name_op,l1,l2,m", list( product( (("m", lambda x, y: x >= y), ("M", lambda x, y: x <= y)), range(1, 5), range(1, 5), [(2, 3), (2, None), (None, 3)], ) ), ) def test_separate_minmaxlength(tmp_path, name_op, l1, l2, m): """Separate minimum lengths for R1 and R2""" m1, m2 = m name, func = name_op inpath = os.fspath(tmp_path / "separate_minlength.fasta") expected = os.fspath(tmp_path / "separate_minlength_expected.fasta") outpath = os.fspath(tmp_path / "out.fasta") record = ">r{}:{}\n{}\n".format(l1, l2, "A" * l1) record += ">r{}:{}\n{}".format(l1, l2, "A" * l2) with open(inpath, "w") as f: print(record, file=f) with open(expected, "w") as f: if (m1 is None or func(l1, m1)) and (m2 is None or func(l2, m2)): print(record, file=f) assert os.path.exists(inpath) assert os.path.exists(expected) if m1 is None: m1 = "" if m2 is None: m2 = "" main(["--interleaved", "-o", outpath, "-" + name, "{}:{}".format(m1, m2), inpath]) assert_files_equal(expected, outpath) def test_separate_minlength_single(): """Using separate minlengths for single-end data""" with pytest.raises(SystemExit): main(["-m", "5:7", datapath("small.fastq")]) def test_paired_end_minimal_report(run_paired, cores): run_paired( "-a TTAGACATAT -A CAGTGGAGTA -m 14 --report=minimal", in1="paired.1.fastq", in2="paired.2.fastq", expected1="paired.1.fastq", expected2="paired.2.fastq", cores=cores, ) def test_pair_adapters(run_paired, cores): run_paired( "--pair-adapters -a GTCTCCAGCT -A GACAAATAAC", in1="paired.1.fastq", in2="paired.2.fastq", expected1="pair-adapters.1.fastq", expected2="pair-adapters.2.fastq", cores=cores, ) def test_pair_adapters_unequal_length(tmp_path): with pytest.raises(SystemExit): main( [ "--paired-adapters", "-a", "GTCTCCAGCT", "-a", "ACGTACGT", # Two R1 adapters "-A", "TGCA", # But only one R2 adapter "-o", os.fspath(tmp_path / "out.1.fastq"), "-p", os.fspath(tmp_path / 
"out.2.fastq"), datapath("paired.1.fastq"), datapath("paired.2.fastq"), ] ) def test_pair_adapters_demultiplexing(tmp_path, cores): params = "-g i1=AAAA -G i1=GGGG -g i2=CCCC -G i2=TTTT".split() params += ["--pair-adapters"] params += ["--cores", str(cores)] params += ["-o", os.fspath(tmp_path / "dual-{name}.1.fastq")] params += ["-p", os.fspath(tmp_path / "dual-{name}.2.fastq")] params += [datapath("dual-index.1.fastq"), datapath("dual-index.2.fastq")] main(params) for name in [ "dual-i1.1.fastq", "dual-i1.2.fastq", "dual-i2.1.fastq", "dual-i2.2.fastq", "dual-unknown.1.fastq", "dual-unknown.2.fastq", ]: assert (tmp_path / name).exists() assert_files_equal(cutpath(name), os.fspath(tmp_path / name)) @pytest.mark.parametrize("discarduntrimmed", (False, True)) def test_combinatorial_demultiplexing(tmp_path, discarduntrimmed, cores): params = ( "-g A=^AAAAAAAAAA -g C=^CCCCCCCCCC -G G=^GGGGGGGGGG -G T=^TTTTTTTTTT".split() ) params += ["-o", os.fspath(tmp_path / "combinatorial.{name1}_{name2}.1.fastq")] params += ["-p", os.fspath(tmp_path / "combinatorial.{name1}_{name2}.2.fastq")] params += ["--cores", str(cores)] params += [datapath("combinatorial.1.fastq"), datapath("combinatorial.2.fastq")] # third item in tuple says whether the file must exist combinations = [(a, b, True) for a, b in product("AC", "GT")] optional = [("unknown", "unknown")] optional += [(a, "unknown") for a in "AC"] optional += [("unknown", b) for b in "GT"] if discarduntrimmed: combinations.extend((a, b, False) for a, b in optional) params += ["--discard-untrimmed"] else: combinations.extend((a, b, True) for a, b in optional) main(params) for name1, name2, should_exist in combinations: for i in (1, 2): name = "combinatorial.{name1}_{name2}.{i}.fastq".format( name1=name1, name2=name2, i=i ) path = cutpath(os.path.join("combinatorial", name)) if should_exist: assert (tmp_path / name).exists(), ("Output file missing", name) assert_files_equal(path, os.fspath(tmp_path / name)) else: assert not (tmp_path / name).exists(), ( "Output file should not exist", name, ) def test_info_file(tmp_path): info_path = os.fspath(tmp_path / "info.txt") params = [ "--info-file", info_path, "-o", os.fspath(tmp_path / "out.1.fastq"), "-p", os.fspath(tmp_path / "out.2.fastq"), datapath("paired.1.fastq"), datapath("paired.2.fastq"), ] main(params) def test_rename(run_paired, cores): run_paired( [ "--rename={id} {r1.cut_prefix} {cut_prefix}" " {comment} {adapter_name} {r2.adapter_name}", "--cut=4", "-a", "R1adapter=GTCTCCAGCT", "-A", "R2adapter=GACAAATAAC", ], in1="paired.1.fastq", in2="paired.2.fastq", expected1="rename.1.fastq", expected2="rename.2.fastq", cores=cores, ) def test_poly_a_poly_t(run_paired, cores): run_paired( "--poly-a", in1="polya.1.fasta", in2="polya.2.fasta", expected1="polya.1.fasta", expected2="polya.2.fasta", cores=cores, ) def test_revcomp_only_r1(run_paired): run_paired( [ "--revcomp", "-g", "^TTATTTGTCT", "-g", "^TCCGCACTGGC", ], in1="revcomp.1.fastq", in2="revcomp.2.fastq", expected1="revcomp.1.fastq", expected2="revcomp.2.fastq", cores=1, ) def test_revcomp_only_r2(run_paired): run_paired( [ "--revcomp", "-G", "^TTATTTGTCT", "-G", "^TCCGCACTGGC", ], in1="revcomp.2.fastq", in2="revcomp.1.fastq", expected1="revcomp.2.fastq", expected2="revcomp.1.fastq", cores=1, ) def test_revcomp_r1_and_r2(run_paired): run_paired( [ "--revcomp", "-g", "^TTATTTGTCT", "-G", "^TCCGCACTGGC", ], in1="revcomp.1.fastq", in2="revcomp.2.fastq", expected1="revcomp-r1r2.1.fastq", expected2="revcomp-r1r2.2.fastq", cores=1, ) 
cutadapt-4.7/tests/test_parser.py000066400000000000000000000364551457457704700173010ustar00rootroot00000000000000import os from textwrap import dedent import pytest from dnaio import SequenceRecord from cutadapt.adapters import ( LinkedAdapter, BackAdapter, FrontAdapter, InvalidCharacter, PrefixAdapter, RightmostFrontAdapter, SuffixAdapter, ) from cutadapt.parser import ( AdapterSpecification, parse_search_parameters, expand_braces, make_adapters_from_specifications, make_adapters_from_one_specification, _make_not_linked_adapter, make_adapter, _normalize_ellipsis, ) from cutadapt.modifiers import ModificationInfo def test_expand_braces(): assert expand_braces("") == "" assert expand_braces("A") == "A" assert expand_braces("A{0}") == "" assert expand_braces("A{1}") == "A" assert expand_braces("A{2}") == "AA" assert expand_braces("A{2}C") == "AAC" assert expand_braces("ACGTN{3}TGACCC") == "ACGTNNNTGACCC" assert expand_braces("ACGTN{10}TGACCC") == "ACGTNNNNNNNNNNTGACCC" assert expand_braces("ACGTN{3}TGA{4}CCC") == "ACGTNNNTGAAAACCC" assert expand_braces("ACGTN{0}TGA{4}CCC") == "ACGTTGAAAACCC" def test_expand_braces_fail(): for expression in [ "{", "}", "{}", "{5", "{1}", "A{-7}", "A{", "A{1", "N{7", "AN{7", "A{4{}", "A{4}{3}", "A{b}", "A{6X}", "A{X6}", "A}A", ]: with pytest.raises(ValueError): expand_braces(expression) def test_parse_file_notation(tmp_path): tmp = tmp_path / "adapters.fasta" tmp.write_text( dedent( """>first_name ADAPTER1 >second_name ADAPTER2 """ ) ) search_parameters = dict( max_errors=0.2, min_overlap=4, read_wildcards=False, adapter_wildcards=False, indels=False, ) adapters = list( make_adapters_from_one_specification( "file:" + os.fspath(tmp), adapter_type="back", search_parameters=search_parameters, ) ) assert len(adapters) == 2 assert adapters[0].name == "first_name" assert adapters[0].sequence == "ADAPTER1" assert adapters[1].name == "second_name" assert adapters[1].sequence == "ADAPTER2" for a in adapters: assert a.max_error_rate == 0.2 assert a.min_overlap == 4 assert not a.read_wildcards assert not a.adapter_wildcards assert not a.indels def test_parse_not_linked(): p = AdapterSpecification.parse assert p("A", "front") == AdapterSpecification(None, None, "A", {}, "front", False) assert p("A", "back") == AdapterSpecification(None, None, "A", {}, "back", False) assert p("A", "anywhere") == AdapterSpecification( None, None, "A", {}, "anywhere", False ) assert p("^A", "front") == AdapterSpecification( None, "anchored", "A", {}, "front", False ) assert p("XXXA", "front") == AdapterSpecification( None, "noninternal", "A", {}, "front", False ) assert p("A$", "back") == AdapterSpecification( None, "anchored", "A", {}, "back", False ) assert p("AXXXX", "back") == AdapterSpecification( None, "noninternal", "A", {}, "back", False ) assert p("a_name=ADAPT", "front") == AdapterSpecification( "a_name", None, "ADAPT", {}, "front", False ) @pytest.mark.parametrize("where", ("front", "back")) @pytest.mark.parametrize("reqopt", ("required", "optional")) def test_parse_invalid_adapter_specific_parameter(where, reqopt): with pytest.raises(ValueError) as e: _make_not_linked_adapter("A;{}".format(reqopt), "name", where, dict()) assert "can only be used within linked adapters" in e.value.args[0] def test_parse_invalid_adapter_type(): with pytest.raises(ValueError) as e: AdapterSpecification.parse("A", "invalid_type") assert "adapter_type must be front, back or anywhere" in e.value.args[0] @pytest.mark.parametrize( "spec,adapter_type", [ ("^XA", "front"), ("^AX", "front"), ("XA$", 
"back"), ("AX$", "back"), ], ) def test_parse_double_placement_restrictions(spec, adapter_type): with pytest.raises(ValueError) as e: AdapterSpecification.parse(spec, adapter_type) assert "cannot use multiple placement restrictions" in e.value.args[0] def test_parse_misplaced_placement_restrictions(): with pytest.raises(ValueError) as e: AdapterSpecification.parse("A$", "front") assert "Allowed placement restrictions for a 5' adapter" in e.value.args[0] with pytest.raises(ValueError) as e: AdapterSpecification.parse("^A", "back") assert "Allowed placement restrictions for a 3' adapter" in e.value.args[0] def test_restriction_to_class(): with pytest.raises(ValueError) as e: AdapterSpecification._restriction_to_class("anywhere", "noninternal", False) assert "No placement may be specified" in e.value.args[0] def test_parse_search_parameters(): p = parse_search_parameters assert p("e=0.1") == {"max_errors": 0.1} assert p("error_rate=0.1") == {"max_errors": 0.1} assert p("max_errors=2") == {"max_errors": 2} assert p("o=5") == {"min_overlap": 5} assert p("min_overlap=5") == {"min_overlap": 5} assert p("o=7; e=0.4") == {"min_overlap": 7, "max_errors": 0.4} assert p("anywhere") == {"anywhere": True} assert p("required") == {"required": True} assert p("optional") == {"required": False} assert p("noindels") == {"indels": False} assert p("indels") == {"indels": True} assert p("rightmost") == {"rightmost": True} with pytest.raises(ValueError): p("e=hallo") with pytest.raises(KeyError): p("bla=0.1") with pytest.raises(ValueError): p("e=") with pytest.raises(KeyError) as e: p("e=0.1;e=0.1") assert "specified twice" in e.value.args[0] with pytest.raises(KeyError) as e: p("e=0.1;max_errors=0.1") assert "specified twice" in e.value.args[0] with pytest.raises(ValueError) as e: p("optional; required") assert "cannot be specified at the same time" in e.value.args[0] def test_make_adapter_front(): parameters = dict( max_errors=0.2, min_overlap=4, read_wildcards=False, adapter_wildcards=False, indels=False, ) a = make_adapter("ACGTACGT; e=0.15", "front", parameters) assert isinstance(a, FrontAdapter) assert a.max_error_rate == 0.15 assert a.min_overlap == 4 with pytest.raises(ValueError) as e: make_adapter("A", "invalid-cmdline-type", parameters) assert "adapter_type must be" in e.value.args[0] with pytest.raises(ValueError) as e: make_adapter("^ACGT;min_overlap=3", "front", parameters) assert "not possible" in e.value.args[0] def test_make_adapter_rightmost_front(): a = make_adapter("ACGT; rightmost", "front", dict()) assert isinstance(a, RightmostFrontAdapter) with pytest.raises(ValueError) as e: make_adapter("ACGT; rightmost", "back", dict()) assert "only allowed" in e.value.args[0] def test_make_adapter_back(): parameters = dict( max_errors=0.2, min_overlap=4, read_wildcards=False, adapter_wildcards=False, indels=False, ) a = make_adapter("ACGTAAAA; o=5; e=0.11", "back", parameters) assert isinstance(a, BackAdapter) assert a.max_error_rate == 0.11 assert a.min_overlap == 5 a = make_adapter("ACGTAAAA; noindels", "back", parameters) assert isinstance(a, BackAdapter) assert a.indels is False a = make_adapter("ACGTAAAA; indels", "back", parameters) assert isinstance(a, BackAdapter) assert a.indels is True for spec in ( "thename=ACG;e=0.15 ... 
TGT;e=0.17", "thename=ACG;e=0.15...TGT;e=0.17", ): a = make_adapter(spec, "back", parameters) assert isinstance(a, LinkedAdapter) assert a.front_adapter.max_error_rate == 0.15 assert a.back_adapter.max_error_rate == 0.17 with pytest.raises(ValueError) as e: make_adapter("ACGT$;min_overlap=3", "back", parameters) assert "not possible" in e.value.args[0] with pytest.raises(ValueError) as e: make_adapter("ACGT;min_overlap=5", "back", parameters) assert "exceeds" in e.value.args[0] def test_parse_file_notation_with_parameters(tmp_path): tmp = tmp_path / "adapters.fasta" tmp.write_text( dedent( """>first_name ADAPTER1;min_overlap=2 >second_name ADAPTER2;max_errors=0.4 """ ) ) parameters = dict( max_errors=0.2, min_overlap=4, read_wildcards=False, adapter_wildcards=False, indels=False, ) adapters = list( make_adapters_from_one_specification( "file:" + os.fspath(tmp) + ";max_errors=0.3;min_overlap=5;indels", adapter_type="back", search_parameters=parameters, ) ) assert len(adapters) == 2 a = adapters[0] assert isinstance(a, BackAdapter) assert a.name == "first_name" assert a.max_error_rate == 0.3 assert a.min_overlap == 2 assert a.indels is True a = adapters[1] assert isinstance(a, BackAdapter) assert a.name == "second_name" assert a.max_error_rate == 0.4 assert a.min_overlap == 5 assert a.indels is True def test_parse_file_notation_with_5prime_anchoring(tmp_path): tmp = tmp_path / "adapters.fasta" tmp.write_text( dedent( """>first ACCGGGTTTT >second AAAACCCGGT """ ) ) adapters = list( make_adapters_from_one_specification( "^file:" + os.fspath(tmp) + ";max_errors=0.3", adapter_type="front", search_parameters=dict(), ) ) assert len(adapters) == 2 for a in adapters: assert isinstance(a, PrefixAdapter) assert a.max_error_rate == 0.3 def test_parse_file_notation_with_3prime_anchoring(tmp_path): tmp = tmp_path / "adapters.fasta" tmp.write_text( dedent( """>first ACCGGGTTTT >second AAAACCCGGT """ ) ) adapters = list( make_adapters_from_one_specification( "file$:" + os.fspath(tmp) + ";max_errors=0.3", adapter_type="back", search_parameters=dict(), ) ) assert len(adapters) == 2 for a in adapters: assert isinstance(a, SuffixAdapter) assert a.max_error_rate == 0.3 def test_parse_with_adapter_sequence_as_a_path(tmp_path): with pytest.raises(InvalidCharacter): make_adapter("invalid.character", "back", dict()) # user forgot to write "file:" path = tmp_path / "afile.fasta" path.write_text(">abc\nACGT\n") with pytest.raises(InvalidCharacter) as e: list(make_adapters_from_one_specification(str(path), "back", dict())) assert "A file exists named" in e.value.args[0] def test_make_adapters_from_specifications(): with pytest.raises(ValueError) as e: make_adapters_from_specifications([("invalid-type", "A")], dict()) assert "adapter_type must be" in e.value.args[0] def test_normalize_ellipsis(): ne = _normalize_ellipsis assert ne("ACGT", "", "back") == ("ACGT", "front") # -a ACGT... assert ne("ACGT", "", "front") == ("ACGT", "front") # -g ACGT... 
assert ne("", "ACGT", "back") == ("ACGT", "back") # -a ...ACGT with pytest.raises(ValueError) as e: # -g ...ACGT ne("", "ACGT", "front") assert "Invalid adapter specification" in e.value.args[0] with pytest.raises(ValueError) as e: ne("A", "C", "back") assert "either" in e.value.args[0] with pytest.raises(ValueError) as e: ne("A", "", "anywhere") assert "No ellipsis" in e.value.args[0] @pytest.mark.parametrize( "seq,req1,req2", [ ("ACG...TGT", False, False), ("ACG...TGT$", False, True), ("^ACG...TGT", True, False), ("^ACG...TGT$", True, True), ], ) def test_anchoring_makes_front_linked_adapter_required(seq, req1, req2): # -a X...Y a = make_adapter(seq, "back", dict()) assert isinstance(a, LinkedAdapter) assert a.front_required is req1 assert a.back_required is req2 @pytest.mark.parametrize( "r1,r2,req1,req2", [ ("", "", False, False), ("", ";required", False, True), (";required", "", True, False), (";required", ";required", True, True), ("", ";optional", False, False), (";optional", "", False, False), (";optional", ";optional", False, False), ], ) def test_linked_adapter_back_required_optional(r1, r2, req1, req2): # -a X...Y a = make_adapter("ACG" + r1 + "...TGT" + r2, "back", dict()) assert isinstance(a, LinkedAdapter) assert a.front_required is req1 assert a.back_required is req2 @pytest.mark.parametrize( "r1,r2,exp1,exp2", [ ("", "", True, True), ("", ";required", True, True), (";required", "", True, True), (";required", ";required", True, True), ("", ";optional", True, False), (";optional", "", False, True), (";optional", ";optional", False, False), ], ) def test_linked_adapter_front_required_optional(r1, r2, exp1, exp2): # -g X...Y a = make_adapter("ACG" + r1 + "...TGT" + r2, "front", dict()) assert isinstance(a, LinkedAdapter) assert a.front_required is exp1 assert a.back_required is exp2 def test_linked_adapter_parameters(): # issue #394 a = make_adapter("ACG...TGT", "back", dict(max_errors=0.17, indels=False)) assert isinstance(a, LinkedAdapter) assert a.front_adapter.max_error_rate == 0.17 assert a.back_adapter.max_error_rate == 0.17 assert not a.front_adapter.indels assert not a.back_adapter.indels def test_linked_adapter_name(): # issue #414 a = make_adapter("the_name=^ACG...TGT", "back", dict()) assert isinstance(a, LinkedAdapter) assert a.create_statistics().name == "the_name" def test_anywhere_parameter_back(): adapter = make_adapter("CTGAAGTGAAGTACACGGTT;anywhere", "back", dict()) assert isinstance(adapter, BackAdapter) assert adapter._force_anywhere # TODO move the rest to a separate test read = SequenceRecord("foo1", "TGAAGTACACGGTTAAAAAAAAAA") from cutadapt.modifiers import AdapterCutter cutter = AdapterCutter([adapter]) trimmed_read = cutter(read, ModificationInfo(read)) assert trimmed_read.sequence == "" def test_anywhere_parameter_rightmost_front(): adapter = make_adapter("ACGT; rightmost; anywhere", "front", dict()) assert isinstance(adapter, RightmostFrontAdapter) assert adapter._force_anywhere def test_anywhere_parameter_front(): adapter = make_adapter("CTGAAGTGAAGTACACGGTT;anywhere", "front", dict()) assert isinstance(adapter, FrontAdapter) assert adapter._force_anywhere # TODO move the rest to a separate test read = SequenceRecord("foo1", "AAAAAAAAAACTGAAGTGAA") from cutadapt.modifiers import AdapterCutter cutter = AdapterCutter([adapter]) trimmed_read = cutter(read, ModificationInfo(read)) assert trimmed_read.sequence == "" def test_linked_adapter_rightmost(): a = make_adapter("ACG;rightmost...TGT", "back", dict()) assert isinstance(a, LinkedAdapter) assert 
isinstance(a.front_adapter, RightmostFrontAdapter) cutadapt-4.7/tests/test_predicates.py000066400000000000000000000045311457457704700201160ustar00rootroot00000000000000""" Tests write output (should it return True or False or write) """ import pytest from dnaio import SequenceRecord from cutadapt.predicates import TooManyN, TooHighAverageErrorRate from cutadapt.steps import PairedEndFilter @pytest.mark.parametrize( "seq,count,expected", [ ("AAA", 0, False), ("AAA", 1, False), ("AAACCTTGGN", 1, False), ("AAACNNNCTTGGN", 0.5, False), ("NNNNNN", 1, True), ("ANAAAA", 1 / 6, False), ("ANAAAA", 0, True), ], ) def test_too_many_n(seq, count, expected): predicate = TooManyN(count=count) _seq = SequenceRecord("read1", seq, qualities="#" * len(seq)) assert predicate.test(_seq, []) == expected @pytest.mark.parametrize( "seq1,seq2,count,expected", [ ("AAA", "AAA", 0, False), ("AAAN", "AAA", 0, True), ("AAA", "AANA", 0, True), ("ANAA", "AANA", 1, False), ], ) def test_too_many_n_paired(seq1, seq2, count, expected): predicate = TooManyN(count=count) filter_legacy = PairedEndFilter( predicate, predicate, None, pair_filter_mode="first" ) filter_any = PairedEndFilter(predicate, predicate, None, pair_filter_mode="any") read1 = SequenceRecord("read1", seq1, qualities="#" * len(seq1)) read2 = SequenceRecord("read1", seq2, qualities="#" * len(seq2)) assert (filter_legacy(read1, read2, [], []) is None) == predicate.test(read1, []) # True entire pair if one of the reads fulfills criteria assert (filter_any(read1, read2, [], []) is None) == expected def test_invalid_pair_filter_mode(): with pytest.raises(ValueError) as e: PairedEndFilter(None, None, None, "invalidmode") assert "pair_filter_mode must be" in e.value.args[0] @pytest.mark.parametrize( "quals,rate,expected", [ # 3 * 0.1 is larger than 0.3 due to floating point rounding. 
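        # chr(43) is "+" = Phred 10 (error probability 0.1);
        # chr(33) is "!" = Phred 0 (error probability 1.0).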
(chr(43) * 3, 0.1, True), (chr(43) * 3 + chr(33), 0.1, True), # 3 * 0.1 + 1 (chr(43) * 3 + chr(33), 0.33, False), # 3 * 0.1 + 1 (chr(43) * 3 + chr(33), 0.32, True), # 3 * 0.1 + 1 (chr(126) * 9 + chr(33), 0.1, True), # 9 * 10^-9.3 + 1 ], ) def test_too_high_average_error_rate(quals, rate, expected): predicate = TooHighAverageErrorRate(rate) _seq = SequenceRecord("read1", "A" * len(quals), qualities=quals) assert predicate.test(_seq, []) == expected cutadapt-4.7/tests/test_qualtrim.py000066400000000000000000000057551457457704700176420ustar00rootroot00000000000000import pytest from dnaio import SequenceRecord from cutadapt.qualtrim import nextseq_trim_index, expected_errors, poly_a_trim_index def test_nextseq_trim(): s = SequenceRecord("n", "", "") assert nextseq_trim_index(s, cutoff=22) == 0 s = SequenceRecord( "n", "TCTCGTATGCCGTCTTATGCTTGAAAAAAAAAAGGGGGGGGGGGGGGGGGNNNNNNNNNNNGGNGG", "AA//EAEE//A6///E//A//EA/EEEEEEAEA//EEEEEEEEEEEEEEE###########EE#EA", ) assert nextseq_trim_index(s, cutoff=22) == 33 @pytest.mark.parametrize( "sequence,tail", [ ("", ""), ("GGGGGGGGAAAGAAGAAGAAGAAGAAGAAG", ""), ("TTTAGA", ""), # shorter than three nucleotides ("TTTAGAA", ""), # shorter than three nucleotides ("TTTAG", "AAA"), ("TCAAGAAGTCCTTTACCAGCTTTC", "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), ("TCAAGAAGTCCTTTACCAGCTTTC", "AAATAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), ("GCAGATCACCTT", "AAAAAAAAAAAAAAAAAAAAAAAAAAAATAAA"), ("GCAGATCACCTT", "AAAAAAAAAAAAAAAAAAAAAAAAAAAAT"), ("GCAGATCACCTT", "AAAAAAAAAAAAAAAAAAAAAAAAAAAATCG"), ("GCAGATCACCTAT", "AAAACAAAAAAACAAAAAAAACAAAAAA"), ("TTTT", "AAATAAAA"), ("GGGGGGGGAAAGAAGAAGAAGAAGAAGAAG", "AAA"), ], ) def test_poly_a_trim_index(sequence, tail): assert poly_a_trim_index(sequence + tail) == len(sequence) @pytest.mark.parametrize( "head,sequence", [ ("", ""), ("", "GGGGGGGGAAAGAAGAAGAAGAAGAAGAAG"), ("", "TGTCCC"), ("", "TTGTCCC"), ("TTT", "GTCCC"), ( "TTTTTTTTTTTTTTTTTTTTT", "CAAGAAGTCCCCAGCTTTC", ), ("TTTATTTTTTTTTTTTTTTTTTTTTTTTTTTTT", "CAAGAAGTCCTTTACCAGCTTTC"), ("TTTTTATTTTTTTTTTTTTTTTTTTTTTTTTT", "GCAGATCACCTT"), ("ATTTTTTTTTTTTTTTTTTTTTTTTTTTT", "GCAGATCACCTT"), ("AGCTTTTTTTTTTTTTTTTTTTTTTTTTTTT", "GCAGATCACCTT"), ("TTTTGTTTTTTTGTTTTTTTTGTTTTTT", "GCAGATCACCTAT"), ("TTTATTTT", "AAAA"), ("TTT", "GGGGGGGGAAAGAAGAAGAAGAAGAAGAAG"), ], ) def test_poly_t_trim_index(head, sequence): assert poly_a_trim_index(head + sequence, revcomp=True) == len(head) def test_expected_errors(): def encode_qualities(quals): return "".join(chr(q + 33) for q in quals) assert pytest.approx(0.0) == expected_errors("") assert pytest.approx(0.1) == expected_errors(encode_qualities([10])) assert pytest.approx(0.01) == expected_errors(encode_qualities([20])) assert pytest.approx(0.001) == expected_errors(encode_qualities([30])) assert pytest.approx(0.2) == expected_errors(encode_qualities([10, 10])) assert pytest.approx(0.11) == expected_errors(encode_qualities([10, 20])) assert pytest.approx(0.11) == expected_errors(encode_qualities([20, 10])) assert pytest.approx(0.3) == expected_errors(encode_qualities([10, 10, 10])) assert pytest.approx(0.111) == expected_errors(encode_qualities([10, 20, 30])) assert pytest.approx(0.2111) == expected_errors( encode_qualities([10, 10, 20, 30, 40]) ) cutadapt-4.7/tests/test_report.py000066400000000000000000000004741457457704700173100ustar00rootroot00000000000000from cutadapt.report import ( safe_divide, add_if_not_none, ) def test_safe_divide(): assert safe_divide(1, 0) == 0 assert safe_divide(5, 2) == 2.5 def test_add_if_not_none(): assert add_if_not_none(3, 5) == 8 assert 
add_if_not_none(3, None) == 3 assert add_if_not_none(None, 5) == 5 cutadapt-4.7/tests/test_stats.py000066400000000000000000000027551457457704700171370ustar00rootroot00000000000000from cutadapt.statistics import ReadLengthStatistics class TestReadLengthStatistics: def test_empty_on_init(self): rls = ReadLengthStatistics() assert rls.written_reads() == 0 assert rls.written_bp() == (0, 0) lengths = rls.written_lengths() assert not lengths[0] and not lengths[1] def test_some_reads(self): rls = ReadLengthStatistics() rls.update("THEREAD") # length: 7 rls.update("YETANOTHER") # length: 10 rls.update2("FIRST", "SECOND") # lengths: 5, 6 rls.update("12345") assert rls.written_reads() == 4 assert rls.written_bp() == (7 + 10 + 5 + 5, 6) lengths = rls.written_lengths() assert sorted(lengths[0].items()) == [(5, 2), (7, 1), (10, 1)] assert sorted(lengths[1].items()) == [(6, 1)] def test_iadd(self): rls = ReadLengthStatistics() rls.update("THEREAD") # length: 7 rls.update("YETANOTHER") # length: 10 rls.update2("FIRST", "SECOND") # lengths: 5, 6 rls.update("12345") rls2 = ReadLengthStatistics() rls2.update("TESTING") # length: 7 rls2.update2("LEFT", "RIGHT") # lengths: 4, 5 rls += rls2 assert rls.written_reads() == 6 assert rls.written_bp() == (7 + 10 + 5 + 5 + 7 + 4, 6 + 5) lengths = rls.written_lengths() assert sorted(lengths[0].items()) == [(4, 1), (5, 2), (7, 2), (10, 1)] assert sorted(lengths[1].items()) == [(5, 1), (6, 1)] cutadapt-4.7/tests/test_testutils.py000066400000000000000000000011521457457704700200270ustar00rootroot00000000000000import pytest from utils import assert_files_equal, FilesDifferent, binomial, datapath def test_files_different(): with pytest.raises(FileNotFoundError): assert_files_equal("simple.fasta", "simple.fastq") with pytest.raises(FilesDifferent): assert_files_equal(datapath("simple.fasta"), datapath("simple.fastq")) def test_binomial(): assert binomial(0, 0) == 1 assert binomial(0, 1) == 0 assert binomial(0, -1) == 0 assert binomial(1, 0) == 1 assert binomial(1, 1) == 1 assert binomial(1, 2) == 0 assert binomial(10, 5) == 10 * 9 * 8 * 7 * 6 // (2 * 3 * 4 * 5) cutadapt-4.7/tests/test_tokenizer.py000066400000000000000000000011711457457704700200020ustar00rootroot00000000000000import pytest from cutadapt.tokenizer import tokenize_braces, StringToken, BraceToken, TokenizeError def test_tokenize_braces(): tokenize = tokenize_braces assert list(tokenize("")) == [] assert list(tokenize("text")) == [StringToken("text")] assert list(tokenize("before {variable} after")) == [ StringToken("before "), BraceToken("variable"), StringToken(" after"), ] def test_tokenize_unexpected_braces(): with pytest.raises(TokenizeError): list(tokenize_braces("abc {def{ghi}")) with pytest.raises(TokenizeError): list(tokenize_braces("abc {def} gh} i")) cutadapt-4.7/tests/test_trim.py000066400000000000000000000050411457457704700167430ustar00rootroot00000000000000from typing import Sequence from dnaio import SequenceRecord from cutadapt.adapters import ( BackAdapter, AnywhereAdapter, BackAdapterStatistics, Adapter, ) from cutadapt.modifiers import AdapterCutter, ModificationInfo def test_statistics() -> None: read = SequenceRecord("name", "AAAACCCCAAAA") adapters: Sequence[Adapter] = [BackAdapter("CCCC", max_errors=0.1)] cutter = AdapterCutter(adapters, times=3) cutter(read, ModificationInfo(read)) assert isinstance(cutter.adapter_statistics[adapters[0]], BackAdapterStatistics) lengths = cutter.adapter_statistics[adapters[0]].end.lengths trimmed_bp = sum(seqlen * count for (seqlen, count) in 
lengths.items()) assert trimmed_bp <= len(read), trimmed_bp def test_end_trim_with_mismatch(): """ Test the not-so-obvious case where an adapter of length 13 is trimmed from the end of a sequence with overlap 9 and there is one deletion. In this case the algorithm starts with 10 bases of the adapter to get the hit and so the match is considered good. An insertion or substitution at the same spot is not a match. """ adapter = BackAdapter("TCGATCGATCGAT", max_errors=0.1) read = SequenceRecord("foo1", "AAAAAAAAAAATCGTCGATC") cutter = AdapterCutter([adapter], times=1) trimmed_read = cutter(read, ModificationInfo(read)) assert trimmed_read.sequence == "AAAAAAAAAAA" assert cutter.adapter_statistics[adapter].end.lengths == {9: 1} # We see 1 error at length 9 even though the number of allowed mismatches at # length 9 is 0. assert cutter.adapter_statistics[adapter].end.errors[9][1] == 1 read = SequenceRecord("foo2", "AAAAAAAAAAATCGAACGA") cutter = AdapterCutter([adapter], times=1) trimmed_read = cutter(read, ModificationInfo(read)) assert trimmed_read.sequence == read.sequence assert cutter.adapter_statistics[adapter].end.lengths == {} def test_anywhere_with_errors(): adapter = AnywhereAdapter("CCGCATTTAG", max_errors=0.1) for seq, expected_trimmed in ( ("AACCGGTTccgcatttagGATC", "AACCGGTT"), ("AACCGGTTccgcgtttagGATC", "AACCGGTT"), # one mismatch ("AACCGGTTccgcatttag", "AACCGGTT"), ("ccgcatttagAACCGGTT", "AACCGGTT"), ("ccgtatttagAACCGGTT", "AACCGGTT"), # one mismatch ("ccgatttagAACCGGTT", "AACCGGTT"), # one deletion ): read = SequenceRecord("foo", seq) cutter = AdapterCutter([adapter], times=1) trimmed_read = cutter(read, ModificationInfo(read)) assert trimmed_read.sequence == expected_trimmed cutadapt-4.7/tests/test_utils.py000066400000000000000000000012151457457704700171270ustar00rootroot00000000000000import time from itertools import islice from cutadapt.utils import ( Progress, DummyProgress, ) from cutadapt.files import raise_open_files_limit def test_raise_open_files_limit(): try: raise_open_files_limit(1) except ValueError: pass def test_progress(): p = Progress(every=1e-6) p.update(100) time.sleep(0.001) p.update(0) p.update(900) p.update(10000) p.close() def test_progress_scissors(): sc = Progress.scissors(width=10) for i in islice(sc, 0, 30): next(sc) def test_dummy_progress(): p = DummyProgress() p.update(100) p.update(900) p.close() cutadapt-4.7/tests/utils.py000066400000000000000000000025731457457704700161000ustar00rootroot00000000000000import sys import os.path import subprocess from pathlib import Path def datapath(path): return os.path.join(os.path.dirname(__file__), "data", path) def cutpath(path): return os.path.join(os.path.dirname(__file__), "cut", path) class FilesDifferent(Exception): pass def assert_files_equal(path1, path2, ignore_trailing_space: bool = False): if not Path(path1).exists(): raise FileNotFoundError(path1) if not Path(path2).exists(): raise FileNotFoundError(path2) cmd = ["diff", "-u"] if sys.platform == "win32": cmd.append("--strip-trailing-cr") if ignore_trailing_space: if sys.platform == "darwin": # Ignores too much, but macOS doesn’t have the option below cmd.append("-b") else: cmd.append("--ignore-trailing-space") if sys.platform == "win32": path1, path2 = os.fspath(path1), os.fspath(path2) try: subprocess.check_output(cmd + [path1, path2], stderr=subprocess.STDOUT) except subprocess.CalledProcessError as e: raise FilesDifferent("\n" + e.output.decode()) from None def binomial(n, k): """ Return binomial coefficient ('n choose k'). 
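    For example, binomial(10, 5) == 252.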
This implementation does not use factorials. """ k = min(k, n - k) if k < 0: return 0 r = 1 for j in range(k): r *= n - j r //= j + 1 return r cutadapt-4.7/tox.ini000066400000000000000000000024231457457704700145310ustar00rootroot00000000000000[tox] envlist = flake8,black,py38,py39,py310,py311,py312,mypy,docs isolated_build = True [testenv] deps = coverage pytest pytest-timeout pytest-mock install_command = python -m pip install --only-binary :all: {opts} {packages} setenv = PYTHONDEVMODE = 1 commands = coverage run -m pytest --doctest-modules --pyargs cutadapt tests coverage combine -q coverage report coverage xml [testenv:docs] basepython = python3.10 skip_install = true deps = -r doc/requirements.txt commands = sphinx-build -W -b html -d {envtmpdir}/doctrees doc {envtmpdir}/html [testenv:flake8] basepython = python3.10 deps = flake8 skip_install = true commands = flake8 src/ tests/ setup.py [testenv:mypy] basepython = python3.10 deps = mypy commands = mypy src/ [testenv:black] basepython = python3.10 deps = black==22.3.0 skip_install = true commands = black --check src/ tests/ setup.py [coverage:run] branch = True parallel = True concurrency = multiprocessing source_pkgs = cutadapt source = tests [coverage:paths] source = src/ */site-packages/ [coverage:report] precision = 1 exclude_lines = pragma: no cover def __repr__ raise NotImplementedError [flake8] max-line-length = 120 max-complexity = 16 select = E,F,W,C90,W504 extend_ignore = E128,E131,W503,E203