kraken2-2.0.8-beta/.gitignore

*.o
src/build_db
src/classify
src/estimate_capacity
src/dump_table
src/lookup_accession_numbers

kraken2-2.0.8-beta/CHANGELOG.md

# Changelog

## Unreleased

## [2.0.8] - 2019-04-25 (beta)

### Added
- FTP downloading option for taxonomy/libraries (--use-ftp for kraken2-build)
- Option to skip downloading taxonomy maps

### Changed
- Added lookup table to speed up parsing in MinimizerScanner class
- Default parameters for minimizer lengths and spaces changed (spaces=7 for
  nucleotide search, length=12 for translated search)

### Fixed
- Linked space expansion value for proteins to constant used by MinimizerScanner
- Reporting of taxids in classified-out sequence files
- Confidence scoring bug associated with failure to leave some sequences
  unclassified
- Reverse complement shifting bug, code made backwards-compatible with
  existing databases (newly created DBs will have fix)
- NCBI taxonomy download error due to removal of EST/GSS files

## [2.0.7] - 2018-08-11 (beta)

### Added
- Disk usage info to kraken2-build --clean
- Memory allocation error message for hash table
- Option for --max-db-size and hash downsampling
- Multithreading to kraken2-inspect

### Changed
- Move to /usr/bin/env for perl scripts
- Add DB loading message to keep people from killing processes early
- Add flag files for resuming download of nucleotide accession map data
- Converted lookup_accession_numbers script into C++ program w/ memory mapping
- Clarified in manual that one or more libraries allowed for custom DBs
- Silenced progress messages in C++ programs for non-TTY stderr
- Taxonomy downloads switched to rsync from wget (ftp)
- Removed '%' from reports

### Fixed
- Allow d/l of protozoa library w/ kraken2-build script
- Filenames for SILVA database taxonomy info
- Typo in manual for output format example
- Corrected default space count in manual
- Removed obvious race condition in --add-to-library functionality
- Corrected behavior of --classified-out and --unclassified-out (no longer
  forcing .fq/.fa file extensions, respecting '#' in paired mode)
- Usage message in kraken2-inspect
- Taxonomy creation for 16S databases

## [2.0.6] - 2018-06-13 (beta)

### Changed
- New DB summary info printed out w/ inspect script + --skip-counts option

### Fixed
- Now stripping carriage returns and other trailing whitespace from sequence
  data
- Treating l-mers immediately following ambiguous characters as ambiguous
  until a full k-mer is processed
- Bug in expansion of spaced seed masks that left spaces at end

## [2.0.5] - 2018-05-21 (beta)

### Added
- New kraken2-inspect script to report minimizer counts per taxon

### Changed
- Kraken 2X build now adds terminators to all reference sequences

## [2.0.4] - 2018-05-06 (beta)

### Fixed
- Improved portability to older g++ by removing initialization of
  variable-length string.
## [2.0.3] - 2018-02-12 (alpha)

### Added
- Reporting options to kraken2 script (like Kraken 1's kraken-report and
  kraken-mpa-report)

### Changed
- Made loading to RAM default option, added --memory-mapping option to kraken2

## [2.0.2] - 2018-02-04 (alpha)

### Added
- Low base quality masking option

### Changed
- Moved low-complexity masking to library download/addition, out of build
  process
- Made no masking default for human genome in standard installation

## [2.0.1] - 2018-01-01 (alpha)

### Added
- Low-complexity sequence masking as a default
- UniVec/UniVec_Core databases to supported downloads
- UniVec_Core & human in standard Kraken 2 DB
- 16S DB support (Greengenes, Silva, RDP)
- --use-names flag for kraken2 script
- Priority queue to ensure classifier output order matches input order when
  multi-threading
- Changelog

### Changed
- Reduced amino acid alphabet (requires rebuild of old protein DBs)
- Operating manual

### Fixed
- kraken2 now allows compression & paired processing at same time

## [2.0.0] - 2017-12-04 (alpha, initial release)

kraken2-2.0.8-beta/LICENSE

The MIT License (MIT)

Copyright (c) 2017-2018 Derrick Wood

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

kraken2-2.0.8-beta/README.md

kraken2
=======

The second version of the Kraken taxonomic sequence classification system

Please refer to the Operating Manual (in docs/MANUAL.html) for details on how
to use Kraken 2.

kraken2-2.0.8-beta/docs/MANUAL.html

Kraken 2 Manual

Kraken taxonomic sequence classification system

Version 2.0.8-beta

Operating Manual

Introduction

Kraken is a taxonomic sequence classifier that assigns taxonomic labels to DNA sequences. Kraken examines the k-mers within a query sequence and uses the information within those k-mers to query a database. That database maps k-mers to the lowest common ancestor (LCA) of all genomes known to contain a given k-mer.

The first version of Kraken used a large indexed and sorted list of k-mer/LCA pairs as its database. While fast, the large memory requirements posed some problems for users, and so Kraken 2 was created to provide a solution to those problems.

Kraken 2 differs from Kraken 1 in several important ways:

  1. Only minimizers of the k-mers in the query sequences are used as database queries. Similarly, only minimizers of the k-mers in the reference sequences in the database's genomic library are stored in the database. We will also refer to the minimizers as ℓ-mers, where ℓ ≤ k. All k-mers are considered to have the same LCA as their minimizer's database LCA value.
  2. Kraken 2 uses a compact hash table that is a probabilistic data structure. This means that occasionally, database queries will fail by either returning the wrong LCA, or by not resulting in a search failure when a queried minimizer was never actually stored in the database. By incurring the risk of these false positives in the data structure, Kraken 2 is able to achieve faster speeds and lower memory requirements. Users should be aware that database false positive errors occur in less than 1% of queries, and can be compensated for by use of confidence scoring thresholds.
  3. Kraken 2 has the ability to build a database from amino acid sequences and perform a translated search of the query sequences against that database.
  4. Kraken 2 utilizes spaced seeds in the storage and querying of minimizers to improve classification accuracy.
  5. Kraken 2 provides support for "special" databases that are not based on NCBI's taxonomy. These are currently limited to three popular 16S databases.

Because Kraken 2 only stores minimizers in its hash table, and k can be much larger than ℓ, only a small percentage of the possible ℓ-mers in a genomic library are actually deposited in the database. This creates a situation similar to the Kraken 1 "MiniKraken" databases; however, preliminary testing has shown the accuracy of a reduced Kraken 2 database to be quite similar to the full-sized Kraken 2 database, while Kraken 1's MiniKraken databases often resulted in a substantial loss of per-read sensitivity.

The Kraken 2 paper is currently under preparation. Until it is released, please cite the original Kraken paper if you use Kraken 2 in your research. Thank you!

System Requirements

  - Disk space: Construction of a Kraken 2 standard database requires approximately 100 GB of disk space. A test on 01 Jan 2018 of the default installation showed 42 GB of disk space was used to store the genomic library files, 26 GB was used to store the taxonomy information from NCBI, and 29 GB was used to store the Kraken 2 compact hash table. As with Kraken 1, we strongly suggest against using NFS storage to store the Kraken 2 database if at all possible.

  - Memory: To run efficiently, Kraken 2 requires enough free memory to hold the database (primarily the hash table) in RAM. While this can be accomplished with a ramdisk, Kraken 2 will by default load the database into process-local RAM; the --memory-mapping switch to kraken2 will avoid doing so. The default database size is 29 GB (as of Jan. 2018), and you will need slightly more than that in RAM if you want to build the default database.

  - Dependencies: Kraken 2 currently makes extensive use of Linux utilities such as sed, find, and wget. Many scripts are written using the Bash shell, and the main scripts are written using Perl. Core programs needed to build the database and run the classifier are written in C++11, and need to be compiled using a somewhat recent version of g++ that supports C++11. Multithreading is handled using OpenMP. Downloads of NCBI data are performed by wget and rsync. Most Linux systems will have all of the above listed programs and development libraries available either by default or via package download.

    Unlike Kraken 1, Kraken 2 does not use an external k-mer counter. However, by default, Kraken 2 will attempt to use the dustmasker or segmasker programs provided as part of NCBI's BLAST suite to mask low-complexity regions (see Masking of Low-complexity Sequences).

    MacOS NOTE: MacOS and other non-Linux operating systems are not explicitly supported by the developers, and MacOS users should refer to the Kraken-users group for support in installing the appropriate utilities to allow for full operation of Kraken 2. We will attempt to use MacOS-compliant code when possible, but development and testing time is at a premium and we cannot guarantee that Kraken 2 will install and work to its full potential on a default installation of MacOS. In particular, we note that the default MacOS X installation of GCC does not have support for OpenMP. Without OpenMP, Kraken 2 is limited to single-threaded operation, resulting in slower build and classification runtimes.

  - Network connectivity: Kraken 2's standard database build and download commands expect unfettered FTP and rsync access to the NCBI FTP server. If you're working behind a proxy, you may need to set certain environment variables (such as ftp_proxy or RSYNC_PROXY) in order to get these commands to work properly.

    Kraken 2's scripts default to using rsync for most downloads; however, you may find that your network situation prevents use of rsync. In such cases, you can try the --use-ftp option to kraken2-build to force the downloads to occur via FTP.

  - MiniKraken: At present, users with low-memory computing environments can replicate the "MiniKraken" functionality of Kraken 1 in two ways: first, by increasing the value of k with respect to ℓ (using the --kmer-len and --minimizer-len options to kraken2-build); and secondly, through downsampling of minimizers (from both the database and query sequences) using a hash function. This second option is performed if the --max-db-size option to kraken2-build is used; however, the two options are not mutually exclusive. In a difference from Kraken 1, Kraken 2 does not require building a full database and then shrinking it to obtain a reduced database.

Installation

To begin using Kraken 2, you will first need to install it, and then either download or create a database.

Kraken 2 consists of two main scripts (kraken2 and kraken2-build), along with several programs and smaller scripts. As part of the installation process, all scripts and programs are installed in the same directory. After installation, you can move the main scripts elsewhere, but moving the other scripts and programs requires editing the scripts and changing the $KRAKEN2_DIR variables in the main scripts.

Once an install directory is selected, you need to run the following command in the directory where you extracted the Kraken 2 source:

./install_kraken2.sh $KRAKEN2_DIR

(Replace $KRAKEN2_DIR above with the directory where you want to install Kraken 2's programs/scripts.)

The install_kraken2.sh script should compile all of Kraken 2's code and set up your Kraken 2 program directory. Installation is successful if you see the message "Kraken 2 installation complete."

Once installation is complete, you may want to copy the main Kraken 2 scripts into a directory found in your PATH variable (e.g., "$HOME/bin"):

cp $KRAKEN2_DIR/bin/kraken2{,-build,-inspect} $HOME/bin

After installation, you're ready to either create or download a database.

Kraken 2 Databases

A Kraken 2 database is a directory containing at least 3 files:

  - hash.k2d: Contains the minimizer to taxon mappings
  - opts.k2d: Contains information about the options used to build the database
  - taxo.k2d: Contains taxonomy information used to build the database

None of these three files are in a human-readable format. Other files may also be present as part of the database build process, and can, if desired, be removed after a successful build of the database.

In interacting with Kraken 2, you should not have to directly reference any of these files, but rather simply provide the name of the directory in which they are stored. Kraken 2 allows both the use of a standard database as well as custom databases; these are described in the sections Standard Kraken 2 Database and Custom Databases below, respectively.

Standard Kraken 2 Database

To create the standard Kraken 2 database, you can use the following command:

kraken2-build --standard --db $DBNAME

(Replace "$DBNAME" above with your preferred database name/location. Please note that the database will use approximately 100 GB of disk space during creation, with the majority of that being reference sequences or taxonomy mapping information that can be removed after the build.)

This will download NCBI taxonomic information, as well as the complete genomes in RefSeq for the bacterial, archaeal, and viral domains, along with the human genome and a collection of known vectors (UniVec_Core). After downloading all this data, the build process begins; this can be the most time-consuming step. If you have multiple processing cores, you can run this process with multiple threads, e.g.:

kraken2-build --standard --threads 24 --db $DBNAME

Using 32 threads on an AWS EC2 r4.8xlarge instance with 16 dual-core hyperthreaded 2.30 GHz CPUs and 244 GB of RAM, the build process took approximately 35 minutes in Jan. 2018.

The build process itself has two main steps, each of which requires passing over the contents of the reference library:

  1. Estimation of the capacity needed in the Kraken 2 compact hash table. This uses a low-memory method to reliably estimate the number of minimizers present in the reference library given the selected parameters k and ℓ.
  2. Population of the hash table (and conversion of the taxonomy to an internal format). This step is a second pass over the reference library to find minimizers and then place them in the database.

(There is one other preliminary step where sequence IDs are mapped to taxonomy IDs, but this is usually a rather quick process and is mostly handled during library downloading.)

Unlike Kraken 1's build process, Kraken 2 does not perform checkpointing after the estimation step. This is because the estimation step is dependent on the selected k and ℓ values, and if the population step fails, it is likely because k needs to be increased (reducing the overall memory requirements).

Classification

To classify a set of sequences, use the kraken2 command:

kraken2 --db $DBNAME seqs.fa

Output will be sent to standard output by default. The files containing the sequences to be classified should be specified on the command line. Sequences can also be provided through standard input using the special filename /dev/fd/0.
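
For example, a minimal sketch of classifying reads arriving on standard input (the file names here are illustrative; because format auto-detection does not work on standard input, the format flags described below may need to be given explicitly):

zcat seqs.fa.gz | kraken2 --db $DBNAME --fasta-input /dev/fd/0 > output.txt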

The kraken2 program allows several different options:

  - Multithreading: Use the --threads NUM switch to use multiple threads.

  - Quick operation: Rather than searching all ℓ-mers in a sequence, stop classification after the first database hit; use --quick to enable this mode. Note that --min-hits will allow you to require multiple hits before declaring a sequence classified, which can be especially useful with custom databases when testing to see if sequences either do or do not belong to a particular genome.

  - Sequence filtering: Classified or unclassified sequences can be sent to a file for later processing, using the --classified-out and --unclassified-out switches, respectively.

  - Output redirection: Output can be directed using standard shell redirection (| or >), or using the --output switch.

  - FASTQ input: Input is normally expected to be in FASTA format, but you can classify FASTQ data using the --fastq-input switch.

  - Compressed input: Kraken 2 can handle gzip and bzip2 compressed files as input by specifying the proper switch of --gzip-compressed or --bzip2-compressed.

  - Input format auto-detection: If regular files are specified on the command line as input, Kraken 2 will attempt to determine the format of your input prior to classification. You can disable this by explicitly specifying --fasta-input, --fastq-input, --gzip-compressed, and/or --bzip2-compressed as appropriate. Note that use of the character device file /dev/fd/0 to read from standard input (aka stdin) will not allow auto-detection.

  - Paired reads: Kraken 2 provides an enhancement over Kraken 1 in its handling of paired read data. Rather than needing to concatenate the pairs together with an N character between the reads, Kraken 2 is able to process the mates individually while still recognizing the pairing information. Using the --paired option to kraken2 will indicate to kraken2 that the input files provided are paired read data, and data will be read from the pairs of files concurrently. Usage of --paired also affects the --classified-out and --unclassified-out options; users should provide a # character in the filenames provided to those options, which will be replaced by kraken2 with "_1" and "_2" with mates spread across the two files appropriately. For example:

    kraken2 --paired --classified-out cseqs#.fq seqs_1.fq seqs_2.fq

    will put the first reads from classified pairs in cseqs_1.fq, and the second reads from those pairs in cseqs_2.fq.
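
Combining several of these options, a purely illustrative paired-end run might look like:

kraken2 --db $DBNAME --threads 8 --paired --gzip-compressed \
    --output sample.kraken --report sample.k2report \
    seqs_1.fq.gz seqs_2.fq.gz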

To get a full list of options, use kraken2 --help.

Output Formats

Standard Kraken Output Format

Each sequence (or sequence pair, in the case of paired reads) classified by Kraken 2 results in a single line of output. Kraken 2's output lines contain five tab-delimited fields; from left to right, they are:

  1. "C"/"U": a one letter code indicating that the sequence was either classified or unclassified.
  2. The sequence ID, obtained from the FASTA/FASTQ header.
  3. The taxonomy ID Kraken 2 used to label the sequence; this is 0 if the sequence is unclassified.
  4. The length of the sequence in bp. In the case of paired read data, this will be a string containing the lengths of the two sequences in bp, separated by a pipe character, e.g. "98|94".
  5. A space-delimited list indicating the LCA mapping of each k-mer in the sequence(s). For example, "562:13 561:4 A:31 0:1 562:3" would indicate that:
     - the first 13 k-mers mapped to taxonomy ID #562
     - the next 4 k-mers mapped to taxonomy ID #561
     - the next 31 k-mers contained an ambiguous nucleotide
     - the next k-mer was not in the database
     - the last 3 k-mers mapped to taxonomy ID #562

Note that paired read data will contain a "|:|" token in this list to indicate the end of one read and the beginning of another.
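
Putting the five fields together, an output line for a classified single-end read might look like the following (the read ID and numbers are illustrative, and the fields are separated by tabs, shown here as wider gaps):

C    read1    562    150    562:13 561:4 A:31 0:1 562:3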

When Kraken 2 is run against a protein database (see Translated Search), the LCA hitlist will contain the results of querying all six frames of each sequence. Reading frame data is separated by a "-:-" token.

Kraken 1 offered kraken-translate and kraken-report scripts to change the output into different formats. Through the use of kraken2 --use-names, Kraken 2 will replace the taxonomy ID column with the scientific name and the taxonomy ID in parentheses (e.g., "Bacteria (taxid 2)" instead of "2"), yielding similar functionality to Kraken 1's kraken-translate script. The sample report functionality now exists as part of the kraken2 script, with the use of the --report option; the sample report formats are described below.

Sample Report Output Format

Like Kraken 1, Kraken 2 offers two formats of sample-wide results. Kraken 2's standard sample report format is tab-delimited with one line per taxon. The fields of the output, from left-to-right, are as follows:

1. Percentage of fragments covered by the clade rooted at this taxon
2. Number of fragments covered by the clade rooted at this taxon
3. Number of fragments assigned directly to this taxon
4. A rank code, indicating (U)nclassified, (R)oot, (D)omain, (K)ingdom,
   (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies.
   Taxa that are not at any of these 10 ranks have a rank code that is
   formed by using the rank code of the closest ancestor rank with
   a number indicating the distance from that rank.  E.g., "G2" is a
   rank code indicating a taxon is between genus and species and the
   grandparent taxon is at the genus rank.
5. NCBI taxonomic ID number
6. Indented scientific name

The scientific names are indented using spaces, according to the tree structure specified by the taxonomy.

By default, taxa with no reads assigned to (or under) them will not have any output produced. However, if you wish to have all taxa displayed, you can use the --report-zero-counts switch to do so. This can be useful if you are looking to do further downstream analysis of the reports, and want to compare samples. Sorting by the taxonomy ID (using sort -k5,5n) can provide a consistent line ordering between reports.
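
For example, to write a sample report alongside the per-read output and then give it a consistent ordering (the file names are illustrative):

kraken2 --db $DBNAME --report sample.report --output sample.kraken seqs.fa
sort -k5,5n sample.report > sample.sorted.report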

In addition, we also provide the option --use-mpa-style that can be used in conjunction with --report. This option provides output in a format similar to MetaPhlAn's output. The output with this option provides one taxon per line, with a lowercase version of the rank codes in Kraken 2's standard sample report format (except for 'U' and 'R'), two underscores, and the scientific name of the taxon (e.g., "d__Viruses"). The full taxonomy of each taxon (at the eight ranks considered) is given, with each rank's name separated by a pipe character (e.g., "d__Viruses|o__Caudovirales"). Following this version of the taxon's scientific name is a tab and the number of fragments assigned to the clade rooted at that taxon.
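
For example, a MetaPhlAn-style report could be produced with something like (the file names are illustrative):

kraken2 --db $DBNAME --report sample.mpa --use-mpa-style seqs.fa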

Translated Search

Kraken 2 allows users to perform a six-frame translated search, similar to the well-known BLASTX program. To do this, Kraken 2 uses a reduced 15 amino acid alphabet and stores amino acid minimizers in its database. LCA results from all 6 frames are combined to yield a set of LCA hits, which is then resolved in the same manner as in Kraken's normal operation.

To build a protein database, the --protein option should be given to kraken2-build (either along with --standard, or with all steps if building a custom database).
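
For example, a standard protein database could be built with:

kraken2-build --standard --protein --db $DBNAME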

Custom Databases

We realize the standard database may not suit everyone's needs. Kraken 2 also allows creation of customized databases.

To build a custom database:

  1. Install a taxonomy. Usually, you will just use the NCBI taxonomy, which you can easily download using:

    kraken2-build --download-taxonomy --db $DBNAME

    This will download the accession number to taxon maps, as well as the taxonomic name and tree information from NCBI. These files can be found in $DBNAME/taxonomy/ . If you need to modify the taxonomy, edits can be made to the names.dmp and nodes.dmp files in this directory; you may also need to modify the *.accession2taxid files appropriately.

    Some of the standard sets of genomic libraries have taxonomic information associated with them, and don't need the accession number to taxon maps to build the database successfully. These libraries include all those available through the --download-library option (see next point), except for the plasmid and non-redundant databases. If you are not using custom sequences (see the --add-to-library option) and are not using one of the plasmid or non-redundant database libraries, you may want to skip downloading of the accession number to taxon maps. This can be done by passing --skip-maps to the kraken2-build --download-taxonomy command.
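
    For example, to download only the taxonomy tree and name information, skipping the accession number maps:

    kraken2-build --download-taxonomy --skip-maps --db $DBNAME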

  2. Install one or more reference libraries. Several sets of standard genomes/proteins are made easily available through kraken2-build:

    - archaea: RefSeq complete archaeal genomes/proteins
    - bacteria: RefSeq complete bacterial genomes/proteins
    - plasmid: RefSeq plasmid nucleotide/protein sequences
    - viral: RefSeq complete viral genomes/proteins
    - human: GRCh38 human genome/proteins
    - fungi: RefSeq complete fungal genomes/proteins
    - plant: RefSeq complete plant genomes/proteins
    - protozoa: RefSeq complete protozoan genomes/proteins
    - nr: NCBI non-redundant protein database
    - nt: NCBI non-redundant nucleotide database
    - env_nr: NCBI non-redundant protein database with sequences from large environmental sequencing projects
    - env_nt: NCBI non-redundant nucleotide database with sequences from large environmental sequencing projects
    - UniVec: NCBI-supplied database of vector, adapter, linker, and primer sequences that may be contaminating sequencing projects and/or assemblies
    - UniVec_Core: A subset of UniVec chosen to minimize false positive hits to the vector database

    To download and install any one of these, use the --download-library switch, e.g.:

    kraken2-build --download-library bacteria --db $DBNAME

    Multiple libraries can be downloaded into a database prior to building by issuing multiple kraken2-build --download-library commands, e.g.:

    kraken2-build --download-library archaea --db $DBNAME
    kraken2-build --download-library viral --db $DBNAME

    The above commands would prepare a database that would contain archaeal and viral genomes; the --build option (see below) will still need to be used after downloading these libraries to actually build the database, however.

    (Note that downloading nr or env_nr requires use of the --protein option, and that UniVec and UniVec_Core are incompatible with the --protein option.)

    Other genomes can also be added, but such genomes must meet certain requirements:

    - Sequences must be in a FASTA file (multi-FASTA is allowed)
    - Each sequence's ID (the string between the > and the first whitespace character on the header line) must contain either an NCBI accession number, to allow Kraken 2 to look up the correct taxa, or an explicit assignment of the taxonomy ID using kraken:taxid (see below)

    Sequences not downloaded from NCBI may need their taxonomy information assigned explicitly. This can be done using the string kraken:taxid|XXX in the sequence ID, with XXX replaced by the desired taxon ID. For example, to put a known adapter sequence in taxon 32630 ("synthetic construct"), you could use the following:

    >sequence16|kraken:taxid|32630  Adapter sequence
    CAAGCAGAAGACGGCATACGAGATCTTCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA

    The kraken:taxid string must begin the sequence ID or be immediately preceded by a pipe character (|). Explicit assignment of taxonomy IDs in this manner will override the accession number mapping provided by NCBI.
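
    As a minimal sketch of labeling sequences in bulk (the taxon ID 32630 and the file names are placeholders), you could append a kraken:taxid tag to every header in a FASTA file before adding it to the library:

    sed -E '/^>/ s/^>([^ ]+)/>\1|kraken:taxid|32630/' adapters.fa > adapters.labeled.fa

    This places the tag immediately after the sequence ID, preceded by a pipe character, which satisfies the placement rule above.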

    If your genomes meet the requirements above, then you can add each sequence to your database's genomic library using the --add-to-library switch, e.g.:

    kraken2-build --add-to-library chr1.fa --db $DBNAME
    kraken2-build --add-to-library chr2.fa --db $DBNAME

    Note that if you have a list of files to add, you can do something like this in bash:

    for file in chr*.fa
    do
        kraken2-build --add-to-library $file --db $DBNAME
    done

    Or even add all *.fa files found in the directory genomes:

    find genomes/ -name '*.fa' -print0 | xargs -0 -I{} -n1 kraken2-build --add-to-library {} --db $DBNAME

    (You may also find the -P option to xargs useful to add many files in parallel if you have multiple processors.)
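
    For instance (a sketch; choose a process count suited to your machine):

    find genomes/ -name '*.fa' -print0 | xargs -0 -P 4 -I{} -n1 kraken2-build --add-to-library {} --db $DBNAME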

  3. Once your library is finalized, you need to build the database. This can be done with the command:

    kraken2-build --build --db $DBNAME

The --threads option is also helpful here to reduce build time.

By default, the values of k and ℓ are 35 and 31, respectively (or 15 and 15 for protein databases). These values can be explicitly set with the --kmer-len and --minimizer-len options, however. Note that the minimizer length must be no more than 31 for nucleotide databases, and 15 for protein databases. Additionally, the minimizer length ℓ must be no more than the k-mer length. There is no upper bound on the value of k, but sequences less than k bp in length cannot be classified.

Kraken 2 also utilizes a simple spaced seed approach to increase accuracy. A number s < ℓ/4 can be chosen, and s positions in the minimizer will be masked out during all comparisons. Masked positions are chosen to alternate from the second-to-last position in the minimizer; e.g., s = 5 and ℓ = 31 will result in masking out the 0 positions shown here:

   111 1111 1111 1111 1111 1101 0101 0101

By default, s = 6 for nucleotide databases, and s = 0 for protein databases. This can be changed using the --minimizer-spaces option along with the --build task of kraken2-build.
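
For example, a build command that makes the default nucleotide parameters explicit might look like this (a sketch; the values shown are simply the documented defaults):

kraken2-build --build --db $DBNAME --kmer-len 35 --minimizer-len 31 --minimizer-spaces 6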

A full list of options for kraken2-build can be obtained using kraken2-build --help.

After building a database, if you want to reduce the disk usage of the database, you can use the --clean option for kraken2-build to remove intermediate files from the database directory.
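
For example, after a successful build:

kraken2-build --clean --db $DBNAME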

Masking of Low-complexity Sequences

Low-complexity sequences, e.g. "ACACACACACACACACACACACACAC", are known to occur in many different organisms and are typically less informative for use in alignments; the BLAST programs often mask these sequences by default. Using this masking can help prevent false positives in Kraken 2's results, and so we have added this functionality as a default option to Kraken 2's library download/addition process.

Kraken 2 uses two programs to perform low-complexity sequence masking, both available from NCBI: dustmasker, for nucleotide sequences, and segmasker, for amino acid sequences. These programs are available as part of the NCBI BLAST+ suite. If these programs are not installed on the local system and in the user's PATH when trying to use kraken2-build, the database build will fail. Users who do not wish to install these programs can use the --no-masking option to kraken2-build in conjunction with any of the --download-library, --add-to-library, or --standard options; use of the --no-masking option will skip masking of low-complexity sequences during the build of the Kraken 2 database.
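
For example, to add a library while skipping the masking step (assuming you accept the increased risk of low-complexity false positives):

kraken2-build --download-library bacteria --no-masking --db $DBNAME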

Special Databases

To support some common use cases, we provide the ability to build Kraken 2 databases using data from various external databases. These external databases may not follow the NCBI taxonomy, and so we've provided mechanisms to automatically create a taxonomy that will work with Kraken 2 (although such taxonomies may not be identical to NCBI's).

To build one of these "special" Kraken 2 databases, use the following command:

kraken2-build --db $DBNAME --special TYPE

where the TYPE string is one of the database names listed below.

At present, the "special" Kraken 2 database support we provide is limited to pre-packaged solutions for some public 16S sequence databases, but this may grow in the future.

16S Databases

For targeted 16S sequencing projects, a normal Kraken 2 database using whole genome data may use more resources than necessary. A Kraken 2 database created from a well-curated genomic library of just 16S data can provide both a more efficient solution as well as a more accurate set of predictions for such projects. We provide support for building Kraken 2 databases from three publicly available 16S databases:

  - Greengenes (Kraken 2 database name: greengenes), using all available 16S data.
  - RDP (Kraken 2 database name: rdp), using the bacterial and archaeal 16S data.
  - SILVA (Kraken 2 database name: silva), using the Small subunit NR99 sequence set.

Note that these databases may have licensing restrictions regarding their data, and it is your responsibility to ensure you are in compliance with those restrictions; please visit the databases' websites for further details. The kraken2-build script only uses publicly available URLs to download data and then converts that data into a form compatible for use with Kraken 2.

Furthermore, if you use one of these databases in your research, please visit the corresponding database's website to determine the appropriate and up-to-date citation.
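
For example, a SILVA-based 16S database could be built with (the database name is illustrative):

kraken2-build --db 16S_SILVA_DB --special silva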

Confidence Scoring

At present, we have not yet developed a confidence score with a probabilistic interpretation for Kraken 2. However, we have developed a simple scoring scheme that has yielded good results for us, and we've made that available in Kraken 2 through use of the --confidence option to kraken2. The approach we use allows a user to specify a threshold score in the [0,1] interval; the classifier then will adjust labels up the tree until the label's score (described below) meets or exceeds that threshold. If a label at the root of the taxonomic tree would not have a score exceeding the threshold, the sequence is called unclassified by Kraken 2 when this threshold is applied.

A sequence label's score is a fraction C/Q, where C is the number of k-mers mapped to LCA values in the clade rooted at the label, and Q is the number of k-mers in the sequence that lack an ambiguous nucleotide (i.e., they were queried against the database). Consider the example of the LCA mappings in Kraken 2's output given earlier:

"562:13 561:4 A:31 0:1 562:3" would indicate that:

In this case, ID #561 is the parent node of #562. Here, a label of #562 for this sequence would have a score of C/Q = (13+3)/(13+4+1+3) = 16/21. A label of #561 would have a score of C/Q = (13+4+3)/(13+4+1+3) = 20/21. If a user specified a --confidence threshold over 16/21, the classifier would adjust the original label from #562 to #561; if the threshold was greater than 20/21, the sequence would become unclassified.
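
For example, to require that at least 10% of the queried k-mers support a label (0.1 is an arbitrary illustrative threshold):

kraken2 --db $DBNAME --confidence 0.1 seqs.fa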

Inspecting a Kraken 2 Database's Contents

The kraken2-inspect script allows users to gain information about the content of a Kraken 2 database. The output format of kraken2-inspect is identical to the reports generated with the --report option to kraken2. Instead of reporting how many reads in input data classified to a given taxon or clade, as kraken2's --report option would, the kraken2-inspect script will report the number of minimizers in the database that are mapped to the various taxa/clades. For example, the first five lines of kraken2-inspect's output on an example database might look like this:

$ kraken2-inspect --db EXAMPLE_DB | head -5
100.00% 1770368409      1581179 R       1       root
 96.50% 1708407622      58003   R1      131567    cellular organisms
 91.28% 1615910070      985309  D       2           Bacteria
 43.89% 777062062       1312736 P       1224          Proteobacteria
 18.62% 329590216       555667  C       1236            Gammaproteobacteria

This output indicates that 555667 of the minimizers in the database map directly to the Gammaproteobacteria class (taxid #1236), and 329590216 (18.62%) of the database's minimizers map to a taxon in the clade rooted at Gammaproteobacteria. For more information on kraken2-inspect's options, use its --help option.

Kraken 2 Environment Variables

The kraken2 and kraken2-inspect scripts support the use of some environment variables to help in reducing command line lengths:

  - KRAKEN2_NUM_THREADS: if the --threads option is not supplied to kraken2, then the value of this variable (if it is set) will be used as the number of threads to run kraken2. (This variable does not affect kraken2-inspect.)

  - KRAKEN2_DB_PATH: much like the PATH variable is used for executables by your shell, KRAKEN2_DB_PATH is a colon-separated list of directories that will be searched for the database you name if the named database does not have a slash (/) character. By default, Kraken 2 assumes the value of this variable is "." (i.e., the current working directory). This variable can be used to create one (or more) central repositories of Kraken databases in a multi-user system. Example usage in bash:

    export KRAKEN2_DB_PATH="/home/user/my_kraken2_dbs:/data/kraken2_dbs:"

    This will cause three directories to be searched, in this order:

    1) /home/user/my_kraken2_dbs
    2) /data/kraken2_dbs
    3) the current working directory (caused by the empty string as the third colon-separated field in the KRAKEN2_DB_PATH string)

    The search for a database will stop when a name match is found; if two directories in the KRAKEN2_DB_PATH have databases with the same name, the directory of the two that is searched first will have its database selected.

    If the above variable and value are used, and the databases /data/kraken2_dbs/mainDB and ./mainDB are present, then

    kraken2 --db mainDB sequences.fa

    will classify sequences.fa using /data/kraken2_dbs/mainDB; if instead you wanted to use the mainDB present in the current directory, you would need to specify a directory path to that database in order to circumvent searching, e.g.:

    kraken2 --db ./mainDB sequences.fa

    Note that the KRAKEN2_DB_PATH directory list can be skipped by the use of any absolute (beginning with /) or relative pathname (including at least one /) as the database name.

  - KRAKEN2_DEFAULT_DB: if no database is supplied with the --db option, the database named in this variable will be used instead. Using this variable, you can avoid using --db if you only have a single database that you usually use, e.g. in bash:

    export KRAKEN2_DEFAULT_DB="/home/user/kraken2db"
    kraken2 sequences.fa > kraken2.output

    This will classify sequences.fa using the /home/user/kraken2db database. Note that the value of KRAKEN2_DEFAULT_DB will also be interpreted in the context of the value of KRAKEN2_DB_PATH if you don't set KRAKEN2_DEFAULT_DB to an absolute or relative pathname. Given the earlier example in this section, the following:

    export KRAKEN2_DEFAULT_DB="mainDB"
    kraken2 sequences.fa

    will use /data/kraken2_dbs/mainDB to classify sequences.fa.

kraken2-2.0.8-beta/docs/MANUAL.markdown000066400000000000000000001064201346023071100174400ustar00rootroot00000000000000Introduction ============ [Kraken] is a taxonomic sequence classifier that assigns taxonomic labels to DNA sequences. Kraken examines the $k$-mers within a query sequence and uses the information within those $k$-mers to query a database. That database maps $k$-mers to the lowest common ancestor (LCA) of all genomes known to contain a given $k$-mer. The first version of Kraken used a large indexed and sorted list of $k$-mer/LCA pairs as its database. While fast, the large memory requirements posed some problems for users, and so Kraken 2 was created to provide a solution to those problems. Kraken 2 differs from Kraken 1 in several important ways: 1. Only minimizers of the $k$-mers in the query sequences are used as database queries. Similarly, only minimizers of the $k$-mers in the reference sequences in the database's genomic library are stored in the database. We will also refer to the minimizers as $\ell$-mers, where $\ell \leq k$. All $k$-mers are considered to have the same LCA as their minimizer's database LCA value. 2. Kraken 2 uses a compact hash table that is a probabilistic data structure. This means that occasionally, database queries will fail by either returning the wrong LCA, or by not resulting in a search failure when a queried minimizer was never actually stored in the database. By incurring the risk of these false positives in the data structure, Kraken 2 is able to achieve faster speeds and lower memory requirements. Users should be aware that database false positive errors occur in less than 1% of queries, and can be compensated for by use of confidence scoring thresholds. 3. Kraken 2 has the ability to build a database from amino acid sequences and perform a translated search of the query sequences against that database. 4. Kraken 2 utilizes spaced seeds in the storage and querying of minimizers to improve classification accuracy. 5. Kraken 2 provides support for "special" databases that are not based on NCBI's taxonomy. These are currently limited to three popular 16S databases. Because Kraken 2 only stores minimizers in its hash table, and $k$ can be much larger than $\ell$, only a small percentage of the possible $\ell$-mers in a genomic library are actually deposited in the database. This creates a situation similar to the Kraken 1 "MiniKraken" databases; however, preliminary testing has shown the accuracy of a reduced Kraken 2 database to be quite similar to the full-sized Kraken 2 database, while Kraken 1's MiniKraken databases often resulted in a substantial loss of per-read sensitivity. The Kraken 2 paper is currently under preparation. Until it is released, please cite the original [Kraken paper] if you use Kraken 2 in your research. Thank you! [Kraken]: http://ccb.jhu.edu/software/kraken/ [Kraken paper]: http://genomebiology.com/2014/15/3/R46 System Requirements =================== * **Disk space**: Construction of a Kraken 2 standard database requires approximately 100 GB of disk space. A test on 01 Jan 2018 of the default installation showed 42 GB of disk space was used to store the genomic library files, 26 GB was used to store the taxonomy information from NCBI, and 29 GB was used to store the Kraken 2 compact hash table. Like in Kraken 1, we strongly suggest against using NFS storage to store the Kraken 2 database if at all possible. 
* **Memory**: To run efficiently, Kraken 2 requires enough free memory to hold the database (primarily the hash table) in RAM. While this can be accomplished with a ramdisk, Kraken 2 will by default load the database into process-local RAM; the `--memory-mapping` switch to `kraken2` will avoid doing so. The default database size is 29 GB (as of Jan. 2018), and you will need slightly more than that in RAM if you want to build the default database. * **Dependencies**: Kraken 2 currently makes extensive use of Linux utilities such as sed, find, and wget. Many scripts are written using the Bash shell, and the main scripts are written using Perl. Core programs needed to build the database and run the classifier are written in C++11, and need to be compiled using a somewhat recent version of g++ that will support C++11. Multithreading is handled using OpenMP. Downloads of NCBI data are performed by wget and rsync. Most Linux systems will have all of the above listed programs and development libraries available either by default or via package download. Unlike Kraken 1, Kraken 2 does not use an external $k$-mer counter. However, by default, Kraken 2 will attempt to use the `dustmasker` or `segmasker` programs provided as part of NCBI's BLAST suite to mask low-complexity regions (see [Masking of Low-complexity Sequences]). **MacOS NOTE:** MacOS and other non-Linux operating systems are *not* explicitly supported by the developers, and MacOS users should refer to the Kraken-users group for support in installing the appropriate utilities to allow for full operation of Kraken 2. We will attempt to use MacOS-compliant code when possible, but development and testing time is at a premium and we cannot guarantee that Kraken 2 will install and work to its full potential on a default installation of MacOS. In particular, we note that the default MacOS X installation of GCC does not have support for OpenMP. Without OpenMP, Kraken 2 is limited to single-threaded operation, resulting in slower build and classification runtimes. * **Network connectivity**: Kraken 2's standard database build and download commands expect unfettered FTP and rsync access to the NCBI FTP server. If you're working behind a proxy, you may need to set certain environment variables (such as `ftp_proxy` or `RSYNC_PROXY`) in order to get these commands to work properly. Kraken 2's scripts default to using rsync for most downloads; however, you may find that your network situation prevents use of rsync. In such cases, you can try the `--use-ftp` option to `kraken2-build` to force the downloads to occur via FTP. * **MiniKraken**: At present, users with low-memory computing environments can replicate the "MiniKraken" functionality of Kraken 1 in two ways: first, by increasing the value of $k$ with respect to $\ell$ (using the `--kmer-len` and `--minimizer-len` options to `kraken2-build`); and secondly, through downsampling of minimizers (from both the database and query sequences) using a hash function. This second option is performed if the `--max-db-size` option to `kraken2-build` is used; however, the two options are not mutually exclusive. In a difference from Kraken 1, Kraken 2 does not require building a full database and then shrinking it to obtain a reduced database. Installation ============ To begin using Kraken 2, you will first need to install it, and then either download or create a database. Kraken 2 consists of two main scripts (`kraken2` and `kraken2-build`), along with several programs and smaller scripts. 
As part of the installation process, all scripts and programs are installed in the same directory. After installation, you can move the main scripts elsewhere, but moving the other scripts and programs requires editing the scripts and changing the `$KRAKEN2_DIR` variables in the main scripts. Once an install directory is selected, you need to run the following command in the directory where you extracted the Kraken 2 source: ./install_kraken2.sh $KRAKEN2_DIR (Replace `$KRAKEN2_DIR` above with the directory where you want to install Kraken 2's programs/scripts.) The `install_kraken2.sh` script should compile all of Kraken 2's code and setup your Kraken 2 program directory. Installation is successful if you see the message "`Kraken 2 installation complete.`" Once installation is complete, you may want to copy the main Kraken 2 scripts into a directory found in your `PATH` variable (e.g., "`$HOME/bin`"): cp $KRAKEN2_DIR/bin/kraken2{,-build,-inspect} $HOME/bin After installation, you're ready to either create or download a database. Kraken 2 Databases ================== A Kraken 2 database is a directory containing at least 3 files: * `hash.k2d`: Contains the minimizer to taxon mappings * `opts.k2d`: Contains information about the options used to build the database * `taxo.k2d`: Contains taxonomy information used to build the database None of these three files are in a human-readable format. Other files may also be present as part of the database build process, and can, if desired, be removed after a successful build of the database. In interacting with Kraken 2, you should not have to directly reference any of these files, but rather simply provide the name of the directory in which they are stored. Kraken 2 allows both the use of a standard database as well as custom databases; these are described in the sections [Standard Kraken 2 Database] and [Custom Databases] below, respectively. Standard Kraken 2 Database ========================== To create the standard Kraken 2 database, you can use the following command: kraken2-build --standard --db $DBNAME (Replace "`$DBNAME`" above with your preferred database name/location. Please note that the database will use approximately 100 GB of disk space during creation, with the majority of that being reference sequences or taxonomy mapping information that can be removed after the build.) This will download NCBI taxonomic information, as well as the complete genomes in RefSeq for the bacterial, archaeal, and viral domains, along with the human genome and a collection of known vectors (UniVec_Core). After downloading all this data, the build process begins; this can be the most time-consuming step. If you have multiple processing cores, you can run this process with multiple threads, e.g.: kraken2-build --standard --threads 24 --db $DBNAME Using 32 threads on an AWS EC2 r4.8xlarge instance with 16 dual-core hyperthreaded 2.30 GHz CPUs and 244 GB of RAM, the build process took approximately 35 minutes in Jan. 2018. The build process itself has two main steps, each of which requires passing over the contents of the reference library: 1. **Estimation** of the capacity needed in the Kraken 2 compact hash table. This uses a low-memory method to reliably estimate the number of minimizers present in the reference library given the selected parameters $k$ and $\ell$. 2. **Population** of the hash table (and conversion of the taxonomy to an internal format). 
This step is a second pass over the reference library to find minimizers and then place them in the database. (There is one other preliminary step where sequence IDs are mapped to taxonomy IDs, but this is usually a rather quick process and is mostly handled during library downloading.) Unlike Kraken 1's build process, Kraken 2 does not perform checkpointing after the estimation step. This is because the estimation step is dependent on the selected $k$ and $\ell$ values, and if the population step fails, it is likely because $k$ needs to be increased (reducing the overall memory requirements). Classification ============== To classify a set of sequences, use the `kraken2` command: kraken2 --db $DBNAME seqs.fa Output will be sent to standard output by default. The files containing the sequences to be classified should be specified on the command line. Sequences can also be provided through standard input using the special filename `/dev/fd/0`. The `kraken2` program allows several different options: * **Multithreading**: Use the `--threads NUM` switch to use multiple threads. * **Quick operation**: Rather than searching all $\ell$-mers in a sequence, stop classification after the first database hit; use `--quick` to enable this mode. Note that `--min-hits` will allow you to require multiple hits before declaring a sequence classified, which can be especially useful with custom databases when testing to see if sequences either do or do not belong to a particular genome. * **Sequence filtering**: Classified or unclassified sequences can be sent to a file for later processing, using the `--classified-out` and `--unclassified-out` switches, respectively. * **Output redirection**: Output can be directed using standard shell redirection (`|` or `>`), or using the `--output` switch. * **FASTQ input**: Input is normally expected to be in FASTA format, but you can classify FASTQ data using the `--fastq-input` switch. * **Compressed input**: Kraken 2 can handle gzip and bzip2 compressed files as input by specifying the proper switch of `--gzip-compressed` or `--bzip2-compressed`. * **Input format auto-detection**: If regular files are specified on the command line as input, Kraken 2 will attempt to determine the format of your input prior to classification. You can disable this by explicitly specifying `--fasta-input`, `--fastq-input`, `--gzip-compressed`, and/or `--bzip2-compressed` as appropriate. Note that use of the character device file `/dev/fd/0` to read from standard input (aka `stdin`) will **not** allow auto-detection. * **Paired reads**: Kraken 2 provides an enhancement over Kraken 1 in its handling of paired read data. Rather than needing to concatenate the pairs together with an `N` character between the reads, Kraken 2 is able to process the mates individually while still recognizing the pairing information. Using the `--paired` option to `kraken2` will indicate to `kraken2` that the input files provided are paired read data, and data will be read from the pairs of files concurrently. Usage of `--paired` also affects the `--classified-out` and `--unclassified-out` options; users should provide a `#` character in the filenames provided to those options, which will be replaced by `kraken2` with "`_1`" and "`_2`" with mates spread across the two files appropriately. For example: kraken2 --paired --classified-out cseqs#.fq seqs_1.fq seqs_2.fq will put the first reads from classified pairs in `cseqs_1.fq`, and the second reads from those pairs in `cseqs_2.fq`. 
To get a full list of options, use `kraken2 --help`. Output Formats ============== Standard Kraken Output Format ----------------------------- Each sequence (or sequence pair, in the case of paired reads) classified by Kraken 2 results in a single line of output. Kraken 2's output lines contain five tab-delimited fields; from left to right, they are: 1. "C"/"U": a one letter code indicating that the sequence was either classified or unclassified. 2. The sequence ID, obtained from the FASTA/FASTQ header. 3. The taxonomy ID Kraken 2 used to label the sequence; this is 0 if the sequence is unclassified. 4. The length of the sequence in bp. In the case of paired read data, this will be a string containing the lengths of the two sequences in bp, separated by a pipe character, e.g. "98|94". 5. A space-delimited list indicating the LCA mapping of each $k$-mer in the sequence(s). For example, "562:13 561:4 A:31 0:1 562:3" would indicate that: - the first 13 $k$-mers mapped to taxonomy ID #562 - the next 4 $k$-mers mapped to taxonomy ID #561 - the next 31 $k$-mers contained an ambiguous nucleotide - the next $k$-mer was not in the database - the last 3 $k$-mers mapped to taxonomy ID #562 Note that paired read data will contain a "`|:|`" token in this list to indicate the end of one read and the beginning of another. When Kraken 2 is run against a protein database (see [Translated Search]), the LCA hitlist will contain the results of querying all six frames of each sequence. Reading frame data is separated by a "`-:-`" token. Kraken 1 offered a `kraken-translate` and `kraken-report` script to change the output into different formats. Through the use of `kraken2 --use-names`, Kraken 2 will replace the taxonomy ID column with the scientific name and the taxonomy ID in parenthesis (e.g., "Bacteria (taxid 2)" instead of "2"), yielding similar functionality to Kraken 1's `kraken-translate` script. The sample report functionality now exists as part of the `kraken2` script, with the use of the `--report` option; the sample report formats are described below. Sample Report Output Format --------------------------- Like Kraken 1, Kraken 2 offers two formats of sample-wide results. Kraken 2's standard sample report format is tab-delimited with one line per taxon. The fields of the output, from left-to-right, are as follows: 1. Percentage of fragments covered by the clade rooted at this taxon 2. Number of fragments covered by the clade rooted at this taxon 3. Number of fragments assigned directly to this taxon 4. A rank code, indicating (U)nclassified, (R)oot, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies. Taxa that are not at any of these 10 ranks have a rank code that is formed by using the rank code of the closest ancestor rank with a number indicating the distance from that rank. E.g., "G2" is a rank code indicating a taxon is between genus and species and the grandparent taxon is at the genus rank. 5. NCBI taxonomic ID number 6. Indented scientific name The scientific names are indented using space, according to the tree structure specified by the taxonomy. By default, taxa with no reads assigned to (or under) them will not have any output produced. However, if you wish to have all taxa displayed, you can use the `--report-zero-counts` switch to do so. This can be useful if you are looking to do further downstream analysis of the reports, and want to compare samples. Sorting by the taxonomy ID (using `sort -k5,5n`) can provide a consistent line ordering between reports. 
In addition, we also provide the option `--use-mpa-style` that can be used in conjunction with `--report`. This option provides output in a format similar to MetaPhlAn's output. The output with this option provides one taxon per line, with a lowercase version of the rank codes in Kraken 2's standard sample report format (except for 'U' and 'R'), two underscores, and the scientific name of the taxon (e.g., "d__Viruses"). The full taxonomy of each taxon (at the eight ranks considered) is given, with each rank's name separated by a pipe character (e.g., "d__Viruses|o_Caudovirales"). Following this version of the taxon's scientific name is a tab and the number of fragments assigned to the clade rooted at that taxon. Translated Search ================= Kraken 2 allows users to perform a six-frame translated search, similar to the well-known BLASTX program. To do this, Kraken 2 uses a reduced 15 amino acid alphabet and stores amino acid minimizers in its database. LCA results from all 6 frames are combined to yield a set of LCA hits, which is then resolved in the same manner as in Kraken's normal operation. To build a protein database, the `--protein` option should be given to `kraken2-build` (either along with `--standard`, or with all steps if building a custom database). Custom Databases ================ We realize the standard database may not suit everyone's needs. Kraken 2 also allows creation of customized databases. To build a custom database: 1. Install a taxonomy. Usually, you will just use the NCBI taxonomy, which you can easily download using: kraken2-build --download-taxonomy --db $DBNAME This will download the accession number to taxon maps, as well as the taxonomic name and tree information from NCBI. These files can be found in `$DBNAME/taxonomy/` . If you need to modify the taxonomy, edits can be made to the `names.dmp` and `nodes.dmp` files in this directory; you may also need to modify the `*.accession2taxid` files appropriately. Some of the standard sets of genomic libraries have taxonomic information associated with them, and don't need the accession number to taxon maps to build the database successfully. These libraries include all those available through the `--download-library` option (see next point), except for the `plasmid` and non-redundant databases. If you are not using custom sequences (see the `--add-to-library` option) and are not using one of the `plasmid` or non-redundant database libraries, you may want to skip downloading of the accession number to taxon maps. This can be done by passing `--skip-maps` to the `kraken2-build --download-taxonomy` command. 2. Install one or more reference libraries. 
Several sets of standard genomes/proteins are made easily available through `kraken2-build`: - `archaea`: RefSeq complete archaeal genomes/proteins - `bacteria`: RefSeq complete bacterial genomes/proteins - `plasmid`: RefSeq plasmid nucleotide/protein sequences - `viral`: RefSeq complete viral genomes/proteins - `human`: GRCh38 human genome/proteins - `fungi`: RefSeq complete fungal genomes/proteins - `plant`: RefSeq complete plant genomes/proteins - `protozoa`: RefSeq complete protozoan genomes/proteins - `nr`: NCBI non-redundant protein database - `nt`: NCBI non-redundant nucleotide database - `env_nr`: NCBI non-redundant protein database with sequences from large environmental sequencing projects - `env_nt`: NCBI non-redundant nucleotide database with sequences from large environmental sequencing projects - `UniVec`: NCBI-supplied database of vector, adapter, linker, and primer sequences that may be contaminating sequencing projects and/or assemblies - `UniVec_Core`: A subset of UniVec chosen to minimize false positive hits to the vector database To download and install any one of these, use the `--download-library` switch, e.g.: kraken2-build --download-library bacteria --db $DBNAME Multiple libraries can be downloaded into a database prior to building by issuing multiple `kraken2-build --download-library` commands, e.g.: kraken2-build --download-library archaea --db $DBNAME kraken2-build --download-library viral --db $DBNAME The above commands would prepare a database that would contain archaeal and viral genomes; the `--build` option (see below) will still need to be used after downloading these libraries to actually build the database, however. (Note that downloading `nr` or `env_nr` require use of the `--protein` option, and that `UniVec` and `UniVec_Core` are incompatible with the `--protein` option.) Other genomes can also be added, but such genomes must meet certain requirements: - Sequences must be in a FASTA file (multi-FASTA is allowed) - Each sequence's ID (the string between the `>` and the first whitespace character on the header line) must contain either an NCBI accession number to allow Kraken 2 to lookup the correct taxa, or an explicit assignment of the taxonomy ID using `kraken:taxid` (see below). Sequences not downloaded from NCBI may need their taxonomy information assigned explicitly. This can be done using the string `kraken:taxid|XXX` in the sequence ID, with `XXX` replaced by the desired taxon ID. For example, to put a known adapter sequence in taxon 32630 ("synthetic construct"), you could use the following: >sequence16|kraken:taxid|32630 Adapter sequence CAAGCAGAAGACGGCATACGAGATCTTCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA The `kraken:taxid` string must begin the sequence ID or be immediately preceded by a pipe character (`|`). Explicit assignment of taxonomy IDs in this manner will override the accession number mapping provided by NCBI. 
If your genomes meet the requirements above, then you can add each sequence to your database's genomic library using the `--add-to-library` switch, e.g.: kraken2-build --add-to-library chr1.fa --db $DBNAME kraken2-build --add-to-library chr2.fa --db $DBNAME Note that if you have a list of files to add, you can do something like this in `bash`: for file in chr*.fa do kraken2-build --add-to-library $file --db $DBNAME done Or even add all `*.fa` files found in the directory `genomes`: `find genomes/ -name '*.fa' -print0 | xargs -0 -I{} -n1 kraken2-build --add-to-library {} --db $DBNAME` (You may also find the `-P` option to `xargs` useful to add many files in parallel if you have multiple processors.) 3. Once your library is finalized, you need to build the database. This can be done with the command: kraken2-build --build --db $DBNAME The `--threads` option is also helpful here to reduce build time. By default, the values of $k$ and $\ell$ are 35 and 31, respectively (or 15 and 15 for protein databases). These values can be explicitly set with the `--kmer-len` and `minimizer-len` options, however. Note that the minimizer length must be no more than 31 for nucleotide databases, and 15 for protein databases. Additionally, the minimizer length $\ell$ must be no more than the $k$-mer length. There is no upper bound on the value of $k$, but sequences less than $k$ bp in length cannot be classified. Kraken 2 also utilizes a simple spaced seed approach to increase accuracy. A number $s$ < $\ell$/4 can be chosen, and $s$ positions in the minimizer will be masked out during all comparisons. Masked positions are chosen to alternate from the second-to-last position in the minimizer; e.g., $s$ = 5 and $\ell$ = 31 will result in masking out the 0 positions shown here: 111 1111 1111 1111 1111 1101 0101 0101 By default, $s$ = 6 for nucleotide databases, and $s$ = 0 for protein databases. This can be changed using the `--minimizer-spaces` option along with the `--build` task of `kraken2-build`. A full list of options for `kraken2-build` can be obtained using `kraken2-build --help`. After building a database, if you want to reduce the disk usage of the database, you can use the `--clean` option for `kraken2-build` to remove intermediate files from the database directory. Masking of Low-complexity Sequences =================================== Low-complexity sequences, e.g. "ACACACACACACACACACACACACAC", are known to occur in many different organisms and are typically less informative for use in alignments; the BLAST programs often mask these sequences by default. Using this masking can help prevent false positives in Kraken 2's results, and so we have added this functionality as a default option to Kraken 2's library download/addition process. Kraken 2 uses two programs to perform low-complexity sequence masking, both available from NCBI: `dustmasker`, for nucleotide sequences, and `segmasker`, for amino acid sequences. These programs are available as part of the NCBI BLAST+ suite. If these programs are not installed on the local system and in the user's PATH when trying to use `kraken2-build`, the database build will fail. Users who do not wish to install these programs can use the `--no-masking` option to `kraken2-build` in conjunction with any of the `--download-library`, `--add-to-library`, or `--standard` options; use of the `--no-masking` option will skip masking of low-complexity sequences during the build of the Kraken 2 database. 
Special Databases ================= To support some common use cases, we provide the ability to build Kraken 2 databases using data from various external databases. These external databases may not follow the NCBI taxonomy, and so we've provided mechanisms to automatically create a taxonomy that will work with Kraken 2 (although such taxonomies may not be identical to NCBI's). To build one of these "special" Kraken 2 databases, use the following command: kraken2-build --db $DBNAME --special TYPE where the `TYPE` string is one of the database names listed below. At present, the "special" Kraken 2 database support we provide is limited to pre-packaged solutions for some public 16S sequence databases, but this may grow in the future. 16S Databases ------------- For targeted 16S sequencing projects, a normal Kraken 2 database using whole genome data may use more resources than necessary. A Kraken 2 database created from a well-curated genomic library of just 16S data can provide both a more efficient solution as well as a more accurate set of predictions for such projects. We provide support for building Kraken 2 databases from three publicly available 16S databases: * [Greengenes] (Kraken 2 database name: `greengenes`), using all available 16S data. * [RDP] (Kraken 2 database name: `rdp`), using the bacterial and archaeal 16S data. * [SILVA] (Kraken 2 database name: `silva`), using the Small subunit NR99 sequence set. Note that these databases may have licensing restrictions regarding their data, and it is your responsibility to ensure you are in compliance with those restrictions; please visit the databases' websites for further details. The `kraken2-build` script only uses publicly available URLs to download data and then converts that data into a form compatible for use with Kraken 2. Furthermore, if you use one of these databases in your research, please visit the corresponding database's website to determine the appropriate and up-to-date citation. [Greengenes]: http://greengenes.lbl.gov/ [RDP]: http://rdp.cme.msu.edu/ [SILVA]: http://www.arb-silva.de/ Confidence Scoring ================== At present, we have not yet developed a confidence score with a probabilistic interpretation for Kraken 2. However, we have developed a simple scoring scheme that has yielded good results for us, and we've made that available in Kraken 2 through use of the `--confidence` option to `kraken2`. The approach we use allows a user to specify a threshold score in the [0,1] interval; the classifier then will adjust labels up the tree until the label's score (described below) meets or exceeds that threshold. If a label at the root of the taxonomic tree would not have a score exceeding the threshold, the sequence is called unclassified by Kraken 2 when this threshold is applied. A sequence label's score is a fraction $C$/$Q$, where $C$ is the number of $k$-mers mapped to LCA values in the clade rooted at the label, and $Q$ is the number of $k$-mers in the sequence that lack an ambiguous nucleotide (i.e., they were queried against the database). Consider the example of the LCA mappings in Kraken 2's output given earlier: "562:13 561:4 A:31 0:1 562:3" would indicate that: * the first 13 $k$-mers mapped to taxonomy ID #562 * the next 4 $k$-mers mapped to taxonomy ID #561 * the next 31 $k$-mers contained an ambiguous nucleotide * the next $k$-mer was not in the database * the last 3 $k$-mers mapped to taxonomy ID #562 In this case, ID #561 is the parent node of #562. 
Here, a label of #562 for this sequence would have a score of $C$/$Q$ = (13+3)/(13+4+1+3) = 16/21. A label of #561 would have a score of $C$/$Q$ = (13+4+3)/(13+4+1+3) = 20/21. If a user specified a `--confidence` threshold over 16/21, the classifier would adjust the original label from #562 to #561; if the threshold was greater than 20/21, the sequence would become unclassified. Inspecting a Kraken 2 Database's Contents ========================================= The `kraken2-inspect` script allows users to gain information about the content of a Kraken 2 database. The output format of `kraken2-inspect` is identical to the reports generated with the `--report` option to `kraken2`. Instead of reporting how many reads in the input data classified to a given taxon or clade, as `kraken2`'s `--report` option would, the `kraken2-inspect` script will report the number of minimizers in the database that are mapped to the various taxa/clades. For example, the first five lines of `kraken2-inspect`'s output on an example database might look like this: $ kraken2-inspect --db EXAMPLE_DB | head -5 100.00% 1770368409 1581179 R 1 root 96.50% 1708407622 58003 R1 131567 cellular organisms 91.28% 1615910070 985309 D 2 Bacteria 43.89% 777062062 1312736 P 1224 Proteobacteria 18.62% 329590216 555667 C 1236 Gammaproteobacteria This output indicates that 555667 of the minimizers in the database map directly to the Gammaproteobacteria class (taxid #1236), and 329590216 (18.62%) of the database's minimizers map to a taxon in the clade rooted at Gammaproteobacteria. For more information on `kraken2-inspect`'s options, use its `--help` option. Kraken 2 Environment Variables ============================== The `kraken2` and `kraken2-inspect` scripts support the use of some environment variables to help in reducing command line lengths: * **`KRAKEN2_NUM_THREADS`**: if the `--threads` option is not supplied to `kraken2`, then the value of this variable (if it is set) will be used as the number of threads to run `kraken2`. (This variable does not affect `kraken2-inspect`.) * **`KRAKEN2_DB_PATH`**: much like the `PATH` variable is used for executables by your shell, `KRAKEN2_DB_PATH` is a colon-separated list of directories that will be searched for the database you name if the named database does not have a slash (`/`) character. By default, Kraken 2 assumes the value of this variable is "`.`" (i.e., the current working directory). This variable can be used to create one (or more) central repositories of Kraken databases in a multi-user system. Example usage in bash: export KRAKEN2_DB_PATH="/home/user/my_kraken2_dbs:/data/kraken2_dbs:" This will cause three directories to be searched, in this order: 1) `/home/user/my_kraken2_dbs` 2) `/data/kraken2_dbs` 3) the current working directory (caused by the empty string as the third colon-separated field in the `KRAKEN2_DB_PATH` string) The search for a database will stop when a name match is found; if two directories in the `KRAKEN2_DB_PATH` have databases with the same name, the directory of the two that is searched first will have its database selected.
If the above variable and value are used, and the databases `/data/kraken2_dbs/mainDB` and `./mainDB` are present, then kraken2 --db mainDB sequences.fa will classify `sequences.fa` using `/data/kraken2_dbs/mainDB`; if instead you wanted to use the `mainDB` present in the current directory, you would need to specify a directory path to that database in order to circumvent searching, e.g.: kraken2 --db ./mainDB sequences.fa Note that the `KRAKEN2_DB_PATH` directory list can be skipped by the use of any absolute (beginning with `/`) or relative pathname (including at least one `/`) as the database name. * **`KRAKEN2_DEFAULT_DB`**: if no database is supplied with the `--db` option, the database named in this variable will be used instead. Using this variable, you can avoid using `--db` if you only have a single database that you usually use, e.g. in bash: export KRAKEN2_DEFAULT_DB="/home/user/kraken2db" kraken2 sequences.fa > kraken2.output This will classify `sequences.fa` using the `/home/user/kraken2db` database. Note that the value of `KRAKEN2_DEFAULT_DB` will also be interpreted in the context of the value of `KRAKEN2_DB_PATH` if you don't set `KRAKEN2_DEFAULT_DB` to an absolute or relative pathname. Given the earlier example in this section, the following: export KRAKEN2_DEFAULT_DB="mainDB" kraken2 sequences.fa will use `/data/kraken2_dbs/mainDB` to classify `sequences.fa`. kraken2-2.0.8-beta/docs/Makefile000066400000000000000000000004271346023071100163570ustar00rootroot00000000000000all: pandoc --title-prefix "Kraken 2 Manual" \ --include-in-header head.html \ --include-before-body top.html \ --from markdown --to html \ --table-of-contents \ --css kraken.css \ --output MANUAL.html \ < MANUAL.markdown kraken2-2.0.8-beta/docs/bar-bg.png000066400000000000000000000002741346023071100165570ustar00rootroot00000000000000PNG  IHDR pHYs  tIME # %ztEXtCommentCreated with GIMPW6IDATHcd???#͸||bQG-xQG-xb % "IENDB`kraken2-2.0.8-beta/docs/head.html000066400000000000000000000001571346023071100165060ustar00rootroot00000000000000 kraken2-2.0.8-beta/docs/kraken.css000066400000000000000000000013271346023071100167040ustar00rootroot00000000000000body { background: #f0f0f0 url("bar-bg.png") top left repeat-y; color: #333; margin: 10px; margin-left: 50px; margin-bottom: 20px; font-family: 'Ubuntu', sans-serif; } a { color: #00b0f0; } a:visited { color: #0090f0; } a:hover { color: #00d0ff; } a:active { color: #00f0ff; } .pretoc { text-align: center; font-size: 1.2em; } .title { font-size: 2em; font-weight: bold; margin-bottom: 0; } .version { font-size: 0.9em; } h1 { color: #0090f0; border-bottom: 1px #0090f0 solid; margin-left: -10px; margin-bottom: 3px; } h1 a { color: #0090f0; text-decoration: none; } div#confidence-score-table table th { width: 7em; } pre { margin-left: 4em; } code { font-size: 1.2em; } kraken2-2.0.8-beta/docs/top.html000066400000000000000000000003071346023071100164040ustar00rootroot00000000000000

Kraken taxonomic sequence classification system

Version 2.0.8-beta

Operating Manual

Table of Contents

kraken2-2.0.8-beta/install_kraken2.sh000077500000000000000000000023661346023071100174150ustar00rootroot00000000000000#!/bin/bash # Copyright 2013-2019, Derrick Wood # # This file is part of the Kraken 2 taxonomic sequence classification system. set -e VERSION="2.0.8-beta" if [ -z "$1" ] || [ -n "$2" ] then echo "Usage: $(basename $0) KRAKEN2_DIR" exit 64 fi if [ "$1" = "KRAKEN2_DIR" ] then echo "Please replace \"KRAKEN2_DIR\" with the name of the directory" echo "that you want to install Kraken 2 in." exit 1 fi # Perl cmd used to canonicalize dirname - "readlink -f" doesn't work # on OS X. export KRAKEN2_DIR=$(perl -MCwd=abs_path -le 'print abs_path(shift)' "$1") mkdir -p "$KRAKEN2_DIR" make -C src install for file in scripts/* do perl -pl -e 'BEGIN { while (@ARGV) { $_ = shift; ($k,$v) = split /=/, $_, 2; $H{$k} = $v } }'\ -e 's/#####=(\w+)=#####/$H{$1}/g' \ "KRAKEN2_DIR=$KRAKEN2_DIR" "VERSION=$VERSION" \ < "$file" > "$KRAKEN2_DIR/$(basename $file)" if [ -x "$file" ] then chmod +x "$KRAKEN2_DIR/$(basename $file)" fi done echo echo "Kraken 2 installation complete." echo echo "To make things easier for you, you may want to copy/symlink the following" echo "files into a directory in your PATH:" for file in $KRAKEN2_DIR/kraken2* do if [ -x "$file" ] then echo " $file" fi done kraken2-2.0.8-beta/scripts/000077500000000000000000000000001346023071100154535ustar00rootroot00000000000000kraken2-2.0.8-beta/scripts/16S_gg_installation.sh000077500000000000000000000016771346023071100216340ustar00rootroot00000000000000#!/bin/bash # Copyright 2013-2019, Derrick Wood # # This file is part of the Kraken 2 taxonomic sequence classification system. # Build a 16S database from Greengenes data set -u # Protect against uninitialized vars. set -e # Stop on error set -o pipefail # Stop on failures in non-final pipeline commands FTP_SERVER="ftp://greengenes.microbio.me/" GG_VERSION="gg_13_5" REMOTE_DIR="$FTP_SERVER/greengenes_release/$GG_VERSION" mkdir -p "$KRAKEN2_DB_NAME" pushd "$KRAKEN2_DB_NAME" mkdir -p data taxonomy library pushd data wget "$REMOTE_DIR/${GG_VERSION}.fasta.gz" gunzip "${GG_VERSION}.fasta.gz" wget "$REMOTE_DIR/${GG_VERSION}_taxonomy.txt.gz" gunzip "${GG_VERSION}_taxonomy.txt.gz" build_gg_taxonomy.pl "${GG_VERSION}_taxonomy.txt" popd mv data/names.dmp data/nodes.dmp taxonomy/ mv data/seqid2taxid.map . mv "data/${GG_VERSION}.fasta" library/gg.fna popd kraken2-build --db $KRAKEN2_DB_NAME --build --threads $KRAKEN2_THREAD_CT kraken2-2.0.8-beta/scripts/16S_rdp_installation.sh000077500000000000000000000016741346023071100220210ustar00rootroot00000000000000#!/bin/bash # Copyright 2013-2019, Derrick Wood # # This file is part of the Kraken 2 taxonomic sequence classification system. # Build a 16S database from RDP data set -u # Protect against uninitialized vars. set -e # Stop on error set -o pipefail # Stop on failures in non-final pipeline commands HTTP_SERVER="http://rdp.cme.msu.edu/" REMOTE_DIR="$HTTP_SERVER/download/" mkdir -p "$KRAKEN2_DB_NAME" pushd "$KRAKEN2_DB_NAME" mkdir -p data taxonomy library pushd data wget "$REMOTE_DIR/current_Bacteria_unaligned.fa.gz" gunzip "current_Bacteria_unaligned.fa.gz" wget "$REMOTE_DIR/current_Archaea_unaligned.fa.gz" gunzip "current_Archaea_unaligned.fa.gz" build_rdp_taxonomy.pl current_*_unaligned.fa popd mv data/names.dmp data/nodes.dmp taxonomy/ mv data/seqid2taxid.map . 
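# Rename the downloaded RDP FASTA files to use the .fna extension under library/, # since the build step only scans library/ for *.fna (nucleotide) and *.faa (protein) files.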
for file in data/*.fa; do mv $file library/$(basename $file .fa).fna done popd kraken2-build --db $KRAKEN2_DB_NAME --build --threads $KRAKEN2_THREAD_CT kraken2-2.0.8-beta/scripts/16S_silva_installation.sh000077500000000000000000000021151346023071100223410ustar00rootroot00000000000000#!/bin/bash # Copyright 2013-2019, Derrick Wood # # This file is part of the Kraken 2 taxonomic sequence classification system. # Build a 16S database from Silva data set -u # Protect against uninitialized vars. set -e # Stop on error set -o pipefail # Stop on failures in non-final pipeline commands FTP_SERVER="ftp://ftp.arb-silva.de/" SILVA_VERSION="132" REMOTE_DIR="$FTP_SERVER/release_$SILVA_VERSION/Exports" FASTA_FILENAME="SILVA_${SILVA_VERSION}_SSURef_Nr99_tax_silva.fasta" TAXO_PREFIX="tax_slv_ssu_$SILVA_VERSION" mkdir -p "$KRAKEN2_DB_NAME" pushd "$KRAKEN2_DB_NAME" mkdir -p data taxonomy library pushd data wget "$REMOTE_DIR/${FASTA_FILENAME}.gz" gunzip "${FASTA_FILENAME}.gz" wget "$REMOTE_DIR/taxonomy/${TAXO_PREFIX}.acc_taxid" wget "$REMOTE_DIR/taxonomy/${TAXO_PREFIX}.txt" build_silva_taxonomy.pl "${TAXO_PREFIX}.txt" popd mv data/names.dmp data/nodes.dmp taxonomy/ mv data/${TAXO_PREFIX}.acc_taxid seqid2taxid.map sed -e '/^>/!y/U/T/' "data/$FASTA_FILENAME" > library/silva.fna popd kraken2-build --db $KRAKEN2_DB_NAME --build --threads $KRAKEN2_THREAD_CT kraken2-2.0.8-beta/scripts/add_to_library.sh000077500000000000000000000016631346023071100207760ustar00rootroot00000000000000#!/bin/bash # Copyright 2013-2019, Derrick Wood # # This file is part of the Kraken 2 taxonomic sequence classification system. # Copy specified file into a Kraken library set -u # Protect against uninitialized vars. set -e # Stop on error LIBRARY_DIR="$KRAKEN2_DB_NAME/library" if [ ! -e "$1" ] then 1>&2 echo "Can't add \"$1\": file does not exist" exit 1 fi if [ ! -f "$1" ] then 1>&2 echo "Can't add \"$1\": not a regular file" exit 1 fi add_dir="$LIBRARY_DIR/added" mkdir -p "$add_dir" prelim_map=$(cp_into_tempfile.pl -t "prelim_map_XXXXXXXXXX" -d "$add_dir" -s txt /dev/null) scan_fasta_file.pl "$1" > "$prelim_map" filename=$(cp_into_tempfile.pl -t "XXXXXXXXXX" -d "$add_dir" -s fna "$1") if [ -n "$KRAKEN2_MASK_LC" ]; then 1>&2 echo -n "Masking low-complexity regions of new file..." mask_low_complexity.sh $filename 1>&2 echo " done." fi 1>&2 echo "Added \"$1\" to library ($KRAKEN2_DB_NAME)" kraken2-2.0.8-beta/scripts/build_gg_taxonomy.pl000077500000000000000000000046061346023071100215330ustar00rootroot00000000000000#!/usr/bin/env perl # Copyright 2013-2019, Derrick Wood # # This file is part of the Kraken 2 taxonomic sequence classification system. 
# Parses Greengenes taxonomy file to create Kraken taxonomy # and sequence ID -> taxonomy ID mapping # Input (as <>): gg_13_5_taxonomy.txt use strict; use warnings; use File::Basename; my $PROG = basename $0; my %RANK_CODES = ( k => "superkingdom", p => "phylum", c => "class", o => "order", f => "family", g => "genus", s => "species" ); my %seqid_map; my %seen_it; my %child_data = ("root" => {}); LINE: while (<>) { chomp; my ($seqid, $taxo_str) = split /\t/; $taxo_str =~ s/(; [a-z]__)+$//; # Remove empty data $seqid_map{$seqid} = $taxo_str; next if $seen_it{$taxo_str}++; while ($taxo_str =~ s/(; [a-z]__[^;]+$)//) { my $level = $1; my $parent = $taxo_str; $child_data{$parent} ||= {}; $child_data{$parent}->{"$taxo_str$level"}++; next LINE if $seen_it{$taxo_str}++; } $child_data{"root"}->{$taxo_str}++; } # Assign IDs through BFS of tree, report names/nodes info in # NCBI format my %id_map; my $next_node_id = 1; open NAMES, ">", "names.dmp" or die "$PROG: can't write names.dmp: $!\n"; open NODES, ">", "nodes.dmp" or die "$PROG: can't write nodes.dmp: $!\n"; my @bfs_queue = (["root", 1]); while (@bfs_queue) { my $arg_ref = shift @bfs_queue; my ($node, $parent_id) = @$arg_ref; my $display_name = $node; my $rank; # special handling for species if ($node =~ /g__([^;]+); s__([^;]+)$/) { my ($genus, $species) = ($1, $2); $rank = "species"; if ($species =~ / endosymbiont /) { $display_name = $species; } else { $display_name = "$genus $species"; } } elsif ($node =~ /([a-z])__([^;]+)$/) { $rank = $RANK_CODES{$1}; $display_name = $2; } $rank ||= "no rank"; my $node_id = $next_node_id++; $id_map{$node} = $node_id; print NAMES "$node_id\t|\t$display_name\t|\t-\t|\tscientific name\t|\n"; print NODES "$node_id\t|\t$parent_id\t|\t$rank\t|\t-\t|\n"; my @children = sort keys %{ $child_data{$node} }; push @bfs_queue, [$_, $node_id] for @children; } close NAMES; close NODES; open SEQID_TAXID_MAP, ">", "seqid2taxid.map" or die "$PROG: can't write seqid2taxid.map: $!\n"; for my $seqid (sort { $a <=> $b } keys %seqid_map) { my $taxid = $id_map{ $seqid_map{$seqid} }; print SEQID_TAXID_MAP "$seqid\t$taxid\n"; } close SEQID_TAXID_MAP; kraken2-2.0.8-beta/scripts/build_kraken2_db.sh000077500000000000000000000103111346023071100211670ustar00rootroot00000000000000#!/bin/bash # Copyright 2013-2019, Derrick Wood # # This file is part of the Kraken 2 taxonomic sequence classification system. # Build a Kraken 2 database # Designed to be called by kraken2-build set -u # Protect against uninitialized vars. set -e # Stop on error set -o pipefail # Stop on failures in non-final pipeline commands function finalize_file() { mv $1.tmp $1 } function get_current_time() { date "+%s.%N" } function report_time_elapsed() { curr_time=$(get_current_time) perl -e '$time = $ARGV[1] - $ARGV[0];' \ -e '$sec = int($time); $nsec = $time - $sec;' \ -e '$min = int($sec/60); $sec %= 60;' \ -e '$hr = int($min/60); $min %= 60;' \ -e 'print "${hr}h" if $hr;' \ -e 'print "${min}m" if $min || $hr;' \ -e 'printf "%.3fs", $sec + $nsec;' \ $1 $curr_time } function list_sequence_files() { find library/ '(' -name '*.fna' -o -name '*.faa' ')' -print0 } start_time=$(get_current_time) DATABASE_DIR="$KRAKEN2_DB_NAME" if [ ! -d "$DATABASE_DIR" ] then echo "Can't find Kraken 2 DB directory \"$KRAKEN2_DB_NAME\"" exit 1 fi cd "$DATABASE_DIR" if [ ! -d "taxonomy/" ] then echo "Can't find taxonomy/ subdirectory in database directory, exiting." exit 1 fi if [ ! -d "library/" ] then echo "Can't find library/ subdirectory in database directory, exiting." 
exit 1 fi KRAKEN2XFLAG="" if [ -n "$KRAKEN2_PROTEIN_DB" ] then KRAKEN2XFLAG="-X" fi echo "Creating sequence ID to taxonomy ID map (step 1)..." if [ -d "library/added" ]; then find library/added/ -name 'prelim_map_*.txt' | xargs cat > library/added/prelim_map.txt fi seqid2taxid_map_file=seqid2taxid.map if [ -e "$seqid2taxid_map_file" ]; then echo "Sequence ID to taxonomy ID map already present, skipping map creation." else step_time=$(get_current_time) find library/ -maxdepth 2 -name prelim_map.txt | xargs cat > taxonomy/prelim_map.txt if [ ! -s "taxonomy/prelim_map.txt" ]; then echo "No preliminary seqid/taxid mapping files found, aborting." exit 1 fi grep "^TAXID" taxonomy/prelim_map.txt | cut -f 2- > $seqid2taxid_map_file.tmp || true if grep "^ACCNUM" taxonomy/prelim_map.txt | cut -f 2- > accmap_file.tmp; then if compgen -G "taxonomy/*.accession2taxid" > /dev/null; then lookup_accession_numbers accmap_file.tmp taxonomy/*.accession2taxid > seqid2taxid_acc.tmp cat seqid2taxid_acc.tmp >> $seqid2taxid_map_file.tmp rm seqid2taxid_acc.tmp else echo "Accession to taxid map files are required to build this DB." echo "Run 'kraken2-build --db $KRAKEN2_DB_NAME --download-taxonomy' again?" exit 1 fi fi rm -f accmap_file.tmp finalize_file $seqid2taxid_map_file echo "Sequence ID to taxonomy ID map complete. [$(report_time_elapsed $step_time)]" fi echo "Estimating required capacity (step 2)..." step_time=$(get_current_time) estimate=$(list_sequence_files | xargs -0 cat | estimate_capacity -k $KRAKEN2_KMER_LEN -l $KRAKEN2_MINIMIZER_LEN -S $KRAKEN2_SEED_TEMPLATE -p $KRAKEN2_THREAD_CT $KRAKEN2XFLAG ) required_capacity=$(perl -le 'print int(shift() / 0.7)' $estimate); echo "Estimated hash table requirement: $(( required_capacity * 4 )) bytes" max_db_flag="" if [ -n "$KRAKEN2_MAX_DB_SIZE" ] then if (( KRAKEN2_MAX_DB_SIZE < (required_capacity * 4) )) then max_db_flag="-M $(perl -le 'print int(shift() / 4)' $KRAKEN2_MAX_DB_SIZE)" echo "Specifying lower maximum hash table size of $KRAKEN2_MAX_DB_SIZE bytes" fi fi echo "Capacity estimation complete. [$(report_time_elapsed $step_time)]" echo "Building database files (step 3)..." if [ -e "hash.k2d" ] then echo "Hash table already present, skipping database file build." else step_time=$(get_current_time) list_sequence_files | xargs -0 cat | \ build_db -k $KRAKEN2_KMER_LEN -l $KRAKEN2_MINIMIZER_LEN -S $KRAKEN2_SEED_TEMPLATE $KRAKEN2XFLAG \ -H hash.k2d.tmp -t taxo.k2d.tmp -o opts.k2d.tmp -n taxonomy/ -m $seqid2taxid_map_file \ -c $required_capacity -p $KRAKEN2_THREAD_CT $max_db_flag finalize_file taxo.k2d finalize_file opts.k2d finalize_file hash.k2d echo "Database files completed. [$(report_time_elapsed $step_time)]" fi echo "Database construction complete. [Total: $(report_time_elapsed $start_time)]" kraken2-2.0.8-beta/scripts/build_rdp_taxonomy.pl000077500000000000000000000042171346023071100217210ustar00rootroot00000000000000#!/usr/bin/env perl # Copyright 2013-2019, Derrick Wood # # This file is part of the Kraken 2 taxonomic sequence classification system. 
# Parses RDP sequence data to create Kraken taxonomy # and sequence ID -> taxonomy ID mapping # Input (as <>): current_{Archaea,Bacteria}_unaligned.fa use strict; use warnings; use File::Basename; my $PROG = basename $0; my %seqid_map; my %seen_it; my %child_data = ("root;no rank" => {}); LINE: while (<>) { next unless s/^>//; chomp; my ($seq_label, $taxo_str) = split /\t/; my ($seqid) = split " ", $seq_label; $taxo_str =~ s/^Lineage=Root;rootrank;/root;no rank;/; $taxo_str =~ s/;$/;no rank/; # adjust for unclassified things $seqid_map{$seqid} = $taxo_str; next if $seen_it{$taxo_str}++; while ($taxo_str =~ s/(;[^;]+;[^;]+)$//) { my $level = $1; my $parent = $taxo_str; $child_data{$parent} ||= {}; $child_data{$parent}->{"$taxo_str$level"}++; next LINE if $seen_it{$taxo_str}++; } } # Assign IDs through BFS of tree, report names/nodes info in # NCBI format my %id_map; my $next_node_id = 1; open NAMES, ">", "names.dmp" or die "$PROG: can't write names.dmp: $!\n"; open NODES, ">", "nodes.dmp" or die "$PROG: can't write nodes.dmp: $!\n"; my @bfs_queue = (["root;no rank", 1]); while (@bfs_queue) { my $arg_ref = shift @bfs_queue; my ($node, $parent_id) = @$arg_ref; if ($node !~ /([^;]+);([^;]+)$/) { die "$PROG: BFS processing encountered formatting error, \"$node\"\n"; } my ($display_name, $rank) = ($1, $2); $rank = "superkingdom" if $rank eq "domain"; # conform w/ NCBI taxonomy my $node_id = $next_node_id++; $id_map{$node} = $node_id; print NAMES "$node_id\t|\t$display_name\t|\t-\t|\tscientific name\t|\n"; print NODES "$node_id\t|\t$parent_id\t|\t$rank\t|\t-\t|\n"; my @children = sort keys %{ $child_data{$node} }; push @bfs_queue, [$_, $node_id] for @children; } close NAMES; close NODES; open SEQID_TAXID_MAP, ">", "seqid2taxid.map" or die "$PROG: can't write seqid2taxid.map: $!\n"; for my $seqid (sort keys %seqid_map) { my $taxid = $id_map{ $seqid_map{$seqid} }; print SEQID_TAXID_MAP "$seqid\t$taxid\n"; } close SEQID_TAXID_MAP; kraken2-2.0.8-beta/scripts/build_silva_taxonomy.pl000077500000000000000000000023651346023071100222540ustar00rootroot00000000000000#!/usr/bin/env perl # Copyright 2013-2019, Derrick Wood # # This file is part of the Kraken 2 taxonomic sequence classification system. # Parses Silva taxonomy file to create Kraken taxonomy # Input (as <>): tax_slv_ssu_nr_119.txt use strict; use warnings; use File::Basename; my $PROG = basename $0; my %id_map = ("root" => 1); open NAMES, ">", "names.dmp" or die "$PROG: can't write names.dmp: $!\n"; open NODES, ">", "nodes.dmp" or die "$PROG: can't write nodes.dmp: $!\n"; print NAMES "1\t|\troot\t|\t-\t|\tscientific name\t|\n"; print NODES "1\t|\t1\t|\tno rank\t|\t-\t|\n"; while (<>) { chomp; my ($taxo_str, $node_id, $rank) = split /\t/; $id_map{$taxo_str} = $node_id; if ($taxo_str =~ /^(.+;|)([^;]+);$/) { my $parent_name = $1; my $display_name = $2; if ($parent_name eq "") { $parent_name = "root"; } my $parent_id = $id_map{$parent_name}; if (! defined $parent_id) { die "$PROG: orphan error, line $.\n"; } $rank = "superkingdom" if $rank eq "domain"; print NAMES "$node_id\t|\t$display_name\t|\t-\t|\tscientific name\t|\n"; print NODES "$node_id\t|\t$parent_id\t|\t$rank\t|\t-\t|\n"; } else { die "$PROG: strange input, line $.\n"; } } close NAMES; close NODES; kraken2-2.0.8-beta/scripts/clean_db.sh000077500000000000000000000011071346023071100175400ustar00rootroot00000000000000#!/bin/bash # Copyright 2013-2019, Derrick Wood # # This file is part of the Kraken 2 taxonomic sequence classification system. 
# Removes intermediate files from a database directory, # such as reference library FASTA files and taxonomy data from NCBI. set -u # Protect against uninitialized vars. set -e # Stop on error cd $KRAKEN2_DB_NAME previous_usage=$(du -sh | cut -f1) 1>&2 echo "Database disk usage: $previous_usage" rm -rf library/ taxonomy/ seqid2taxid.map current_usage=$(du -sh | cut -f1) 1>&2 echo "After cleaning, database uses $current_usage" kraken2-2.0.8-beta/scripts/cp_into_tempfile.pl000077500000000000000000000023751346023071100213420ustar00rootroot00000000000000#!/usr/bin/env perl # Copyright 2013-2019, Derrick Wood # # This file is part of the Kraken 2 taxonomic sequence classification system. # Create a file in a specified directory, then copy an # existing file's contents into the new file. Write name of # new file to standard output. # # This exists because the mktemp program doesn't act consistently across # operating systems/distros/versions. use strict; use warnings; use File::Basename; use File::Temp 'tempfile'; use Getopt::Std; my $PROG = basename $0; getopts('d:t:s:', \my %opts) or usage(); $opts{$_} or usage() for qw/d t s/; # all switches mandatory my ($directory, $template, $suffix) = @opts{qw/d t s/}; die "$PROG: '$directory' not a directory!\n" unless -d $directory; die "$PROG: must specify a single filename\n" unless @ARGV == 1; $suffix =~ s/^\.//; my $old_filename = shift @ARGV; open FILE, "<", $old_filename or die "$PROG: can't read $old_filename: $!\n"; my ($fh, $new_filename) = tempfile($template, DIR => $directory, UNLINK => 0, SUFFIX => ".$suffix"); # copy loop while (<FILE>) { print {$fh} $_; } close FILE; close $fh; print "$new_filename\n"; sub usage { die "$PROG: <-d directory> <-t template> <-s suffix> <filename>\n"; } kraken2-2.0.8-beta/scripts/download_genomic_library.sh000077500000000000000000000107271346023071100230550ustar00rootroot00000000000000#!/bin/bash # Copyright 2013-2019, Derrick Wood # # This file is part of the Kraken 2 taxonomic sequence classification system. # Download specific genomic libraries for use with Kraken 2. # Supported libraries were chosen based on support from NCBI's FTP site # in easily obtaining a good collection of genomic data. Others may # be added upon popular demand. set -u # Protect against uninitialized vars. set -e # Stop on error LIBRARY_DIR="$KRAKEN2_DB_NAME/library" NCBI_SERVER="ftp.ncbi.nlm.nih.gov" FTP_SERVER="ftp://$NCBI_SERVER" RSYNC_SERVER="rsync://$NCBI_SERVER" THIS_DIR=$PWD library_name="$1" ftp_subdir=$library_name library_file="library.fna" if [ -n "$KRAKEN2_PROTEIN_DB" ]; then library_file="library.faa" fi function download_file() { file="$1" if [ -n "$KRAKEN2_USE_FTP" ] then wget -q ${FTP_SERVER}${file} else rsync --no-motd ${RSYNC_SERVER}${file} . fi } case $library_name in "archaea" | "bacteria" | "viral" | "fungi" | "plant" | "human" | "protozoa") mkdir -p $LIBRARY_DIR/$library_name cd $LIBRARY_DIR/$library_name rm -f assembly_summary.txt remote_dir_name=$library_name if [ "$library_name" = "human" ]; then remote_dir_name="vertebrate_mammalian/Homo_sapiens" fi if ! download_file "/genomes/refseq/$remote_dir_name/assembly_summary.txt"; then 1>&2 echo "Error downloading assembly summary file for $library_name, exiting."
exit 1 fi if [ "$library_name" = "human" ]; then grep "Genome Reference Consortium" assembly_summary.txt > x mv x assembly_summary.txt fi rm -rf all/ library.f* manifest.txt rsync.err rsync_from_ncbi.pl assembly_summary.txt scan_fasta_file.pl $library_file >> prelim_map.txt ;; "plasmid") mkdir -p $LIBRARY_DIR/plasmid cd $LIBRARY_DIR/plasmid rm -f library.f* plasmid.* ## This is staying FTP only D/L for now 1>&2 echo -n "Downloading plasmid files from FTP..." wget -q --no-remove-listing --spider $FTP_SERVER/genomes/refseq/plasmid/ if [ -n "$KRAKEN2_PROTEIN_DB" ]; then awk '{ print $NF }' .listing | perl -ple 'tr/\r//d' | grep '\.faa\.gz' > manifest.txt else awk '{ print $NF }' .listing | perl -ple 'tr/\r//d' | grep '\.fna\.gz' > manifest.txt fi cat manifest.txt | xargs -n1 -I{} wget -q $FTP_SERVER/genomes/refseq/plasmid/{} cat manifest.txt | xargs -n1 -I{} gunzip -c {} > $library_file rm -f plasmid.* .listing scan_fasta_file.pl $library_file > prelim_map.txt 1>&2 echo " done." ;; "env_nr" | "nr" | "env_nt" | "nt") protein_lib=0 if [ "$library_name" = "env_nr" ] || [ "$library_name" = "nr" ]; then protein_lib=1 fi if (( protein_lib == 1 )) && [ -z "$KRAKEN2_PROTEIN_DB" ]; then 1>&2 echo "$library_name is a protein database, and the Kraken DB specified is nucleotide" exit 1 fi mkdir -p $LIBRARY_DIR/$library_name cd $LIBRARY_DIR/$library_name rm -f $library_name.gz 1>&2 echo -n "Downloading $library_name database from server... " download_file "/blast/db/FASTA/$library_name.gz" 1>&2 echo "done." 1>&2 echo -n "Uncompressing $library_name database..." gunzip $library_name.gz mv $library_name $library_file 1>&2 echo "done." 1>&2 echo -n "Parsing $library_name FASTA file..." # The nr/nt files tend to have non-standard sequence IDs, so # --lenient is used here. scan_fasta_file.pl --lenient $library_file >> prelim_map.txt 1>&2 echo "done." ;; "UniVec" | "UniVec_Core") if [ -n "$KRAKEN2_PROTEIN_DB" ]; then 1>&2 echo "$library_name is for nucleotide databases only" exit 1 fi mkdir -p $LIBRARY_DIR/$library_name cd $LIBRARY_DIR/$library_name 1>&2 echo -n "Downloading $library_name data from server... " download_file "/pub/UniVec/$library_name" 1>&2 echo "done." # 28384: "other sequences" special_taxid=28384 1>&2 echo -n "Adding taxonomy ID of $special_taxid to all sequences... " sed -e "s/^>/>kraken:taxid|$special_taxid|/" $library_name > library.fna scan_fasta_file.pl library.fna > prelim_map.txt 1>&2 echo "done." ;; *) 1>&2 echo "Unsupported library. Valid options are: " 1>&2 echo " archaea bacteria viral fungi plant protozoa human plasmid" 1>&2 echo " nr nt env_nr env_nt UniVec UniVec_Core" exit 1 ;; esac if [ -n "$KRAKEN2_MASK_LC" ]; then 1>&2 echo -n "Masking low-complexity regions of downloaded library..." mask_low_complexity.sh . 1>&2 echo " done." fi kraken2-2.0.8-beta/scripts/download_taxonomy.sh000077500000000000000000000033401346023071100215570ustar00rootroot00000000000000#!/bin/bash # Copyright 2013-2019, Derrick Wood # # This file is part of the Kraken 2 taxonomic sequence classification system. # Download NCBI taxonomy information for Kraken 2. # Designed to be called by kraken2-build set -u # Protect against uninitialized vars. set -e # Stop on error TAXONOMY_DIR="$KRAKEN2_DB_NAME/taxonomy" NCBI_SERVER="ftp.ncbi.nlm.nih.gov" RSYNC_SERVER="rsync://$NCBI_SERVER" FTP_SERVER="ftp://$NCBI_SERVER" mkdir -p "$TAXONOMY_DIR" cd "$TAXONOMY_DIR" function download_file() { file="$1" if [ -n "$KRAKEN2_USE_FTP" ] then wget -q ${FTP_SERVER}${file} else rsync --no-motd ${RSYNC_SERVER}${file} . 
fi } if [ ! -e "accmap.dlflag" ] && [ -z "$KRAKEN2_SKIP_MAPS" ] then if [ -z "$KRAKEN2_PROTEIN_DB" ] then for subsection in gb wgs do 1>&2 echo -n "Downloading nucleotide ${subsection} accession to taxon map..." download_file "/pub/taxonomy/accession2taxid/nucl_${subsection}.accession2taxid.gz" 1>&2 echo " done." done else 1>&2 echo -n "Downloading protein accession to taxon map..." download_file "/pub/taxonomy/accession2taxid/prot.accession2taxid.gz" 1>&2 echo " done." fi touch accmap.dlflag 1>&2 echo "Downloaded accession to taxon map(s)" fi if [ ! -e "taxdump.dlflag" ] then 1>&2 echo -n "Downloading taxonomy tree data..." download_file "/pub/taxonomy/taxdump.tar.gz" touch taxdump.dlflag 1>&2 echo " done." fi if ls | grep -q 'accession2taxid\.gz$' then 1>&2 echo -n "Uncompressing taxonomy data..." gunzip *accession2taxid.gz 1>&2 echo " done." fi if [ ! -e "taxdump.untarflag" ] then 1>&2 echo -n "Untarring taxonomy tree data..." tar zxf taxdump.tar.gz touch taxdump.untarflag 1>&2 echo " done." fi kraken2-2.0.8-beta/scripts/kraken2000077500000000000000000000163711346023071100167460ustar00rootroot00000000000000#!/usr/bin/env perl # Copyright 2013-2019, Derrick Wood # # This file is part of the Kraken 2 taxonomic sequence classification system. # Wrapper for Kraken's classifier use strict; use warnings; use Fcntl; use File::Basename; use Getopt::Long; my $PROG = basename $0; my $KRAKEN2_DIR = "#####=KRAKEN2_DIR=#####"; # Test to see if the executables got moved, try to recover if we can if (! -e "$KRAKEN2_DIR/classify") { use Cwd 'abs_path'; $KRAKEN2_DIR = dirname abs_path($0); } require "$KRAKEN2_DIR/kraken2lib.pm"; $ENV{"KRAKEN2_DIR"} = $KRAKEN2_DIR; $ENV{"PATH"} = "$KRAKEN2_DIR:$ENV{PATH}"; my $CLASSIFY = "$KRAKEN2_DIR/classify"; my $GZIP_MAGIC = chr(hex "1f") . chr(hex "8b"); my $BZIP2_MAGIC = "BZ"; my $quick = 0; my $min_hits = 1; my $db_prefix; my $threads; my $memory_mapping = 0; my $gunzip = 0; my $bunzip2 = 0; my $paired = 0; my $names_in_output = 0; my $only_classified_output = 0; my $unclassified_out; my $classified_out; my $outfile; my $confidence_threshold = 0.0; my $minimum_base_quality = 0; my $report_filename; my $use_mpa_style = 0; my $report_zero_counts = 0; GetOptions( "help" => \&display_help, "version" => \&display_version, "db=s" => \$db_prefix, "threads=i" => \$threads, "quick" => \$quick, "unclassified-out=s" => \$unclassified_out, "classified-out=s" => \$classified_out, "output=s" => \$outfile, "confidence=f" => \$confidence_threshold, "memory-mapping" => \$memory_mapping, "paired" => \$paired, "use-names" => \$names_in_output, "gzip-compressed" => \$gunzip, "bzip2-compressed" => \$bunzip2, "only-classified-output" => \$only_classified_output, "minimum-base-quality=i" => \$minimum_base_quality, "report=s" => \$report_filename, "use-mpa-style" => \$use_mpa_style, "report-zero-counts" => \$report_zero_counts, ); if (! defined $threads) { $threads = $ENV{"KRAKEN2_NUM_THREADS"} || 1; } if (! @ARGV) { print STDERR "Need to specify input filenames!\n"; usage(); } eval { $db_prefix = kraken2lib::find_db($db_prefix); }; if ($@) { die "$PROG: $@"; } my $taxonomy = "$db_prefix/taxo.k2d"; my $kht_file = "$db_prefix/hash.k2d"; my $opt_file = "$db_prefix/opts.k2d"; for my $file ($taxonomy, $kht_file, $opt_file) { if (! 
-e $file) { die "$PROG: $file does not exist!\n"; } } if ($paired && ((@ARGV % 2) != 0 || @ARGV == 0)) { die "$PROG: --paired requires positive and even number of filenames\n"; } my $compressed = $gunzip || $bunzip2; if ($gunzip && $bunzip2) { die "$PROG: can't use both gzip and bzip2 compression flags\n"; } if ($confidence_threshold < 0) { die "$PROG: confidence threshold must be nonnegative\n"; } if ($confidence_threshold > 1) { die "$PROG: confidence threshold must be no greater than 1\n"; } my $auto_detect = ! $compressed; if ($auto_detect) { auto_detect_file_format(); } # set flags for classifier my @flags; push @flags, "-H", $kht_file; push @flags, "-t", $taxonomy; push @flags, "-o", $opt_file; push @flags, "-p", $threads; push @flags, "-q" if $quick; push @flags, "-P" if $paired; push @flags, "-n" if $names_in_output; push @flags, "-T", $confidence_threshold; push @flags, "-U", $unclassified_out if defined $unclassified_out; push @flags, "-C", $classified_out if defined $classified_out; push @flags, "-O", $outfile if defined $outfile; push @flags, "-Q", $minimum_base_quality; push @flags, "-R", $report_filename if defined $report_filename; push @flags, "-m" if $use_mpa_style; push @flags, "-z" if $report_zero_counts; push @flags, "-M" if $memory_mapping; # Stupid hack to keep filehandles from closing before exec # filehandles opened inside for loop below go out of scope # and are closed at end of loop without this my @persistent_fhs; # handle compressed files by opening pipes from decompression programs if ($compressed) { my @replacement_ARGV; my $compression_program; if ($gunzip) { $compression_program = "gzip"; } elsif ($bunzip2) { $compression_program = "bzip2"; } else { die "$PROG: unrecognized compression program! This is a Kraken bug.\n"; } for my $file (@ARGV) { my $qm_file = quotemeta $file; open my $fh, "$compression_program -dc $qm_file |" or die "$PROG: error opening pipe from $compression_program for $file: $!\n"; # Have to unset close-on-exec flags to make these pipes stay open across # exec call my $flags = fcntl $fh, F_GETFD, 0 or die "$PROG: fcntl GETFD error: $!\n"; fcntl $fh, F_SETFD, ($flags & ~FD_CLOEXEC) or die "$PROG: fcntl SETFD error: $!\n"; push @persistent_fhs, $fh; my $fd = fileno $fh; push @replacement_ARGV, "/dev/fd/$fd"; } @ARGV = @replacement_ARGV; } exec $CLASSIFY, @flags, @ARGV; die "$PROG: exec error: $!\n"; sub usage { my $exit_code = @_ ? shift : 64; my $default_db = "none"; eval { $default_db = '"' . kraken2lib::find_db() . '"'; }; my $def_thread_ct = exists $ENV{"KRAKEN2_NUM_THREADS"} ? (0 + $ENV{"KRAKEN2_NUM_THREADS"}) : 1; print STDERR <<EOF; Usage: $PROG [options] <filename(s)> Options: --db NAME Name for Kraken 2 DB (default: $default_db) --threads NUM Number of threads (default: $def_thread_ct) --quick Quick operation (use first hit or hits) --unclassified-out FILENAME Print unclassified sequences to filename --classified-out FILENAME Print classified sequences to filename --output FILENAME Print output to filename (default: stdout); "-" will suppress normal output --confidence FLOAT Confidence score threshold (default: 0.0); must be in [0, 1]. --minimum-base-quality NUM Minimum base quality used in classification (def: 0, only effective with FASTQ input).
--report FILENAME Print a report with aggregate counts/clade to file --use-mpa-style With --report, format report output like Kraken 1's kraken-mpa-report --report-zero-counts With --report, report counts for ALL taxa, even if counts are zero --memory-mapping Avoids loading database into RAM --paired The filenames provided have paired-end reads --use-names Print scientific names instead of just taxids --gzip-compressed Input files are compressed with gzip --bzip2-compressed Input files are compressed with bzip2 --help Print this message --version Print version information If none of the *-compressed flags are specified, and the filename provided is a regular file, automatic format detection is attempted. EOF exit $exit_code; } sub display_help { usage(0); } sub display_version { print "Kraken version #####=VERSION=#####\n"; print "Copyright 2013-2019, Derrick Wood (dwood\@cs.jhu.edu)\n"; exit 0; } sub auto_detect_file_format { my $magic; my $filename = $ARGV[0]; # Don't try to auto-detect when you can't unread data if (! -f $filename) { return; } # read 2-byte magic number to determine type of compression (if any) open FILE, "<", $filename; read FILE, $magic, 2; close FILE; if ($magic eq $GZIP_MAGIC) { $compressed = 1; $gunzip = 1; } elsif ($magic eq $BZIP2_MAGIC) { $compressed = 1; $bunzip2 = 1; } } kraken2-2.0.8-beta/scripts/kraken2-build000077500000000000000000000212531346023071100200360ustar00rootroot00000000000000#!/usr/bin/env perl # Copyright 2013-2019, Derrick Wood # # This file is part of the Kraken 2 taxonomic sequence classification system. # General build process wrapper for Kraken 2. use strict; use warnings; use File::Basename; use Getopt::Long; my $PROG = basename $0; my $KRAKEN2_DIR = "#####=KRAKEN2_DIR=#####"; # Test to see if the executables got moved, try to recover if we can if (! -e "$KRAKEN2_DIR/classify") { use Cwd 'abs_path'; $KRAKEN2_DIR = dirname abs_path($0); } $ENV{"KRAKEN2_DIR"} = $KRAKEN2_DIR; $ENV{"PATH"} = "$KRAKEN2_DIR:$ENV{PATH}"; my $DEF_AA_MINIMIZER_LEN = 12; my $DEF_AA_KMER_LEN = 15; my $DEF_AA_MINIMIZER_SPACES = 0; my $DEF_NT_MINIMIZER_LEN = 31; my $DEF_NT_KMER_LEN = 35; my $DEF_NT_MINIMIZER_SPACES = 7; my $DEF_THREAD_CT = 1; my @VALID_LIBRARY_TYPES = qw/archaea bacteria plasmid viral plant protozoa fungi human nr nt env_nr env_nt UniVec UniVec_Core/; my @VALID_SPECIAL_DB_TYPES = qw/greengenes silva rdp/; # Option/task option variables my ( $db, $threads, $minimizer_len, $kmer_len, $minimizer_spaces, $is_protein, $no_masking, $max_db_size, $use_ftp, $skip_maps, $dl_taxonomy, $dl_library, $add_to_library, $build, $standard, $clean, $special, ); $threads = $DEF_THREAD_CT; $is_protein = 0; # variables corresponding to task options my @TASK_LIST = ( \$dl_taxonomy, \$dl_library, \$add_to_library, \$build, \$standard, \$clean, \$special, ); GetOptions( "help" => \&display_help, "version" => \&display_version, "db=s" => \$db, "threads=i" => \$threads, "minimizer-len=i" => \$minimizer_len, "kmer-len=i" => \$kmer_len, "minimizer-spaces=i", \$minimizer_spaces, "protein" => \$is_protein, "no-masking" => \$no_masking, "max-db-size=i" => \$max_db_size, "use-ftp" => \$use_ftp, "skip-maps" => \$skip_maps, "download-taxonomy" => \$dl_taxonomy, "download-library=s" => \$dl_library, "add-to-library=s" => \$add_to_library, "build" => \$build, "standard" => \$standard, "clean" => \$clean, "special=s" => \$special, ) or usage(); if ($is_protein) { $kmer_len = $DEF_AA_KMER_LEN if ! defined $kmer_len; $minimizer_len = $DEF_AA_MINIMIZER_LEN if !
defined $minimizer_len; $minimizer_spaces = $DEF_AA_MINIMIZER_SPACES if ! defined $minimizer_spaces; } else { $kmer_len = $DEF_NT_KMER_LEN if ! defined $kmer_len; $minimizer_len = $DEF_NT_MINIMIZER_LEN if ! defined $minimizer_len; $minimizer_spaces = $DEF_NT_MINIMIZER_SPACES if ! defined $minimizer_spaces; } if (@ARGV) { warn "Extra arguments on command line.\n"; usage(); } my $task_options = scalar grep defined $$_, @TASK_LIST; if ($task_options > 1) { warn "More than one task option selected.\n"; usage(); } if ($task_options == 0) { warn "Must select a task option.\n"; usage(); } if (! defined $db) { die "Must specify a database name\n"; } if ($threads <= 0) { die "Can't use nonpositive thread count of $threads\n"; } if ($minimizer_len > $kmer_len) { die "Minimizer length ($minimizer_len) must not be greater than k ($kmer_len)\n"; } if ($minimizer_len <= 0) { die "Can't use nonpositive minimizer length of $minimizer_len\n"; } if ($minimizer_len > 31) { die "Can't use minimizer len of $minimizer_len (must be <= 31)\n"; } $ENV{"KRAKEN2_DB_NAME"} = $db; $ENV{"KRAKEN2_THREAD_CT"} = $threads; $ENV{"KRAKEN2_MINIMIZER_LEN"} = $minimizer_len; $ENV{"KRAKEN2_KMER_LEN"} = $kmer_len; $ENV{"KRAKEN2_MINIMIZER_SPACES"} = $minimizer_spaces; $ENV{"KRAKEN2_SEED_TEMPLATE"} = construct_seed_template(); $ENV{"KRAKEN2_PROTEIN_DB"} = $is_protein ? 1 : ""; $ENV{"KRAKEN2_MASK_LC"} = $no_masking ? "" : 1; $ENV{"KRAKEN2_MAX_DB_SIZE"} = defined($max_db_size) ? $max_db_size : ""; $ENV{"KRAKEN2_USE_FTP"} = $use_ftp ? 1 : ""; $ENV{"KRAKEN2_SKIP_MAPS"} = $skip_maps ? 1 : ""; if ($dl_taxonomy) { download_taxonomy(); } elsif (defined($dl_library)) { download_library($dl_library); } elsif (defined($add_to_library)) { add_to_library($add_to_library); } elsif ($standard) { standard_installation(); } elsif ($build) { build_database(); } elsif ($clean) { clean_database(); } elsif ($special) { build_special_database($special); } else { usage(); } exit -1; # END OF MAIN CODE. sub usage { my $exit_code = @_ ? shift : 64; print STDERR < # # This file is part of the Kraken 2 taxonomic sequence classification system. # Wrapper for the dump_table program to report minimizer counts per taxa use strict; use warnings; use Fcntl; use File::Basename; use Getopt::Long; my $PROG = basename $0; my $KRAKEN2_DIR = "#####=KRAKEN2_DIR=#####"; # Test to see if the executables got moved, try to recover if we can if (! -e "$KRAKEN2_DIR/classify") { use Cwd 'abs_path'; $KRAKEN2_DIR = dirname abs_path($0); } require "$KRAKEN2_DIR/kraken2lib.pm"; $ENV{"KRAKEN2_DIR"} = $KRAKEN2_DIR; $ENV{"PATH"} = "$KRAKEN2_DIR:$ENV{PATH}"; my $DUMP_TABLE = "$KRAKEN2_DIR/dump_table"; my $db_prefix; my $names_in_output = 0; my $use_mpa_style = 0; my $report_zero_counts = 0; my $skip_counts = 0; my $threads; GetOptions( "help" => \&display_help, "version" => \&display_version, "db=s" => \$db_prefix, "skip-counts" => \$skip_counts, "use-mpa-style" => \$use_mpa_style, "report-zero-counts" => \$report_zero_counts, "threads=i" => \$threads, ); if (@ARGV) { usage(); } eval { $db_prefix = kraken2lib::find_db($db_prefix); }; if ($@) { die "$PROG: $@"; } if (! defined $threads) { $threads = $ENV{"KRAKEN2_NUM_THREADS"} || 1; } my $taxonomy = "$db_prefix/taxo.k2d"; my $kht_file = "$db_prefix/hash.k2d"; my $opt_file = "$db_prefix/opts.k2d"; for my $file ($taxonomy, $kht_file, $opt_file) { if (! 
-e $file) { die "$PROG: $file does not exist!\n"; } } # set flags for dump_table my @flags; push @flags, "-H", $kht_file; push @flags, "-t", $taxonomy; push @flags, "-o", $opt_file; push @flags, "-s" if $skip_counts; push @flags, "-m" if $use_mpa_style; push @flags, "-z" if $report_zero_counts; push @flags, "-p", $threads; exec $DUMP_TABLE, @flags; die "$PROG: exec error: $!\n"; sub usage { my $exit_code = @_ ? shift : 64; my $default_db = "none"; eval { $default_db = '"' . kraken2lib::find_db() . '"'; }; print STDERR < # # This file is part of the Kraken 2 taxonomic sequence classification system. # Common subroutines for other Kraken scripts use strict; use warnings; # Input: the argument for a --db option (possibly undefined) # Returns: the DB to use, taking KRAKEN2_DEFAULT_DB and KRAKEN2_DB_PATH # into account. sub find_db { my $supplied_db_prefix = shift; my $db_prefix; if (! defined $supplied_db_prefix) { if (! exists $ENV{"KRAKEN2_DEFAULT_DB"}) { die "Must specify DB with either --db or \$KRAKEN2_DEFAULT_DB\n"; } $supplied_db_prefix = $ENV{"KRAKEN2_DEFAULT_DB"}; } my @db_path = ("."); if (exists $ENV{"KRAKEN2_DB_PATH"}) { my $path_str = $ENV{"KRAKEN2_DB_PATH"}; # Allow zero-length path to be current dir $path_str =~ s/^:/.:/; $path_str =~ s/:$/:./; $path_str =~ s/::/:.:/; @db_path = split /:/, $path_str; } # Use supplied DB if abs. or rel. path is given if ($supplied_db_prefix =~ m|/|) { $db_prefix = $supplied_db_prefix; } else { # Check all dirs in KRAKEN2_DB_PATH for my $dir (@db_path) { my $checked_db = "$dir/$supplied_db_prefix"; if (-e $checked_db && -d _) { $db_prefix = $checked_db; last; } } if (! defined $db_prefix) { my $printed_path = exists $ENV{"KRAKEN2_DB_PATH"} ? qq|"$ENV{'KRAKEN2_DB_PATH'}"| : "undefined"; die "unable to find $supplied_db_prefix in \$KRAKEN2_DB_PATH ($printed_path)\n"; } } for my $file (qw/taxo.k2d hash.k2d opts.k2d/) { if (! -e "$db_prefix/$file") { die "database (\"$db_prefix\") does not contain necessary file $file\n"; } } return $db_prefix; } # Input: a FASTA sequence ID # Output: either (a) a taxonomy ID number found in the sequence ID, # (b) an NCBI accession number found in the sequence ID, or undef sub check_seqid { my $seqid = shift; my $taxid = undef; # Note all regexes here use ?: to avoid capturing the ^ or | character if ($seqid =~ /(?:^|\|)kraken:taxid\|(\d+)/) { $taxid = $1; # OK, has explicit taxid } elsif ($seqid =~ /^(\d+)$/) { $taxid = $1; # OK, has explicit taxid (w/o token) } # Accession number check elsif ($seqid =~ /(?:^|\|) # Begins seqid or immediately follows pipe ([A-Z]+ # Starts with one or more UC alphas _? # Might have an underscore next [A-Z0-9]+) # Ends with UC alphas or digits (?:\||\b|\.)/x # Followed by pipe, word boundary, or period ) { $taxid = $1; # A bit misleading - failure to pass /^\d+$/ means this is # OK, but requires accession -> taxid mapping } return $taxid; } 1; kraken2-2.0.8-beta/scripts/lookup_accession_numbers.pl000077500000000000000000000033641346023071100231140ustar00rootroot00000000000000#!/usr/bin/env perl # Copyright 2013-2019, Derrick Wood # # This file is part of the Kraken 2 taxonomic sequence classification system. # Looks up accession numbers and reports associated taxonomy IDs # # Input is (a) 1 2-column TSV file w/ sequence IDs and accession numbers, # and (b) a list of accession2taxid files from NCBI. # Output is tab-delimited lines, with sequence IDs in first # column and taxonomy IDs in second. 
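# For example (hypothetical values): a lookup-list line "SEQ1\tNC_000913" matched against # an accession2taxid line "NC_000913\tNC_000913.3\t511145\t16127994" would print "SEQ1\t511145".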
use strict; use warnings; use File::Basename; use Getopt::Std; my $PROG = basename $0; my $lookup_list_file = shift @ARGV; my %target_lists; open FILE, "<", $lookup_list_file or die "$PROG: can't open $lookup_list_file: $!\n"; while (<FILE>) { chomp; my ($seqid, $accnum) = split /\t/; $target_lists{$accnum} ||= []; push @{ $target_lists{$accnum} }, $seqid; } close FILE; my $initial_target_count = scalar keys %target_lists; my @accession_map_files = @ARGV; for my $file (@accession_map_files) { open FILE, "<", $file or die "$PROG: can't open $file: $!\n"; scalar(<FILE>); # discard header line while (<FILE>) { chomp; my ($accession, $with_version, $taxid, $gi) = split /\t/; if ($target_lists{$accession}) { my @list = @{ $target_lists{$accession} }; delete $target_lists{$accession}; for my $seqid (@list) { print "$seqid\t$taxid\n"; } last if ! %target_lists; } } close FILE; last if ! %target_lists; } if (%target_lists) { warn "$PROG: @{[scalar keys %target_lists]}/$initial_target_count accession numbers remain unmapped, see unmapped.txt in DB directory\n"; open LIST, ">", "unmapped.txt" or die "$PROG: can't write unmapped.txt: $!\n"; for my $target (keys %target_lists) { print LIST "$target\n"; } close LIST; } kraken2-2.0.8-beta/scripts/make_seqid2taxid_map.pl000077500000000000000000000074531346023071100220750ustar00rootroot00000000000000#!/usr/bin/env perl # Copyright 2013-2019, Derrick Wood # # This file is part of the Kraken 2 taxonomic sequence classification system. # Reads multi-FASTA input and examines each sequence header. In quiet mode # (-q, used as an initial check on file validity), headers are OK if a # taxonomy ID is found (as either the entire sequence ID or as part of a # "kraken:taxid" token), or if something looking like a GI or accession # number is found. In normal mode, the taxonomy ID will be looked up (if # not explicitly specified in the sequence ID) and reported if it can be # found. Output is tab-delimited lines, with sequence IDs in first # column and taxonomy IDs in second. # Sequence IDs with a kraken:taxid token will use that to assign taxonomy # ID, e.g.: # >gi|32499|ref|NC_021949.2|kraken:taxid|562| # # Sequence IDs that are completely numeric are assumed to be the taxonomy # ID for that sequence. # # Otherwise, an accession number is searched for; if not found, a GI # number is searched for. Failure to find any of the above is a fatal error. # Without -q, a comma-separated file list specified by -A (for both accession # numbers and GI numbers) is examined; failure to find a # taxonomy ID that maps to a provided accession/GI number is non-fatal and # will emit a warning. # # With -q, does not print any output, and will die w/ nonzero exit instead # of warning when unable to find a taxid, accession #, or GI #. use strict; use warnings; use File::Basename; use Getopt::Std; my $PROG = basename $0; getopts('qA:L:', \my %opts); if ($opts{'q'} && (defined $opts{"A"} || defined $opts{"L"})) { die "$PROG: -q doesn't allow for -A/-L\n"; } my %target_lists; while (<>) { next unless /^>(\S+)/; my $seq_id = $1; my $output; # Note all regexes here use ?: to avoid capturing the ^ or | character if ($seq_id =~ /(?:^|\|)kraken:taxid\|(\d+)/) { $output = "$seq_id\t$1\n"; } elsif ($seq_id =~ /^\d+$/) { $output = "$seq_id\t$seq_id\n"; } # Accession regex is first, only puts number (not version) into $1 elsif ($seq_id =~ /(?:^|\|)([A-Z]+_?[A-Z0-9]+)(?:\||\b|\.)/ || $seq_id =~ /(?:^|\|)gi\|(\d+)/) { if (!
$opts{"q"}) { $target_lists{$1} ||= []; push @{ $target_lists{$1} }, $seq_id; } } else { die "$PROG: unable to determine taxonomy ID for sequence $seq_id\n"; } if (defined $output && ! $opts{"q"}) { print $output; } } if ($opts{"q"}) { if (keys %target_lists) { print "$PROG: requires external map\n"; } exit 0; } exit 0 if ! keys %target_lists; if (! (defined $opts{"A"} || defined $opts{"L"})) { die "$PROG: found sequence ID without explicit taxonomy ID, but no map used\n"; } # Remove targets where we've already handled the mapping if (defined $opts{"L"}) { my $library_map_file = $opts{"L"}; open FILE, "<", $library_map_file or die "$PROG: can't open $library_map_file: $!\n"; while () { chomp; my ($seqid, $taxid) = split /\t/; if (exists $target_lists{$seqid}) { print "$seqid\t$taxid\n"; delete $target_lists{$seqid}; } } close FILE; } exit 0 if ! keys %target_lists; my @accession_map_files = split /,/, $opts{"A"}; for my $file (@accession_map_files) { open FILE, "<", $file or die "$PROG: can't open $file: $!\n"; scalar(); # discard header line while () { chomp; my ($accession, $with_version, $taxid, $gi) = split /\t/; if ($target_lists{$accession}) { my @list = @{ $target_lists{$accession} }; delete $target_lists{$accession}; for my $seqid (@list) { print "$seqid\t$taxid\n"; } } if ($gi ne "na" && $target_lists{$gi}) { my @list = @{ $target_lists{$gi} }; delete $target_lists{$gi}; for my $seqid (@list) { print "$seqid\t$taxid\n"; } } } close FILE; } kraken2-2.0.8-beta/scripts/mask_low_complexity.sh000077500000000000000000000022751346023071100221110ustar00rootroot00000000000000#!/bin/bash # Copyright 2013-2019, Derrick Wood # # This file is part of the Kraken 2 taxonomic sequence classification system. # Masks low complexity sequences in the database using the dustmasker (nucl.) # or segmasker (prot.) programs from NCBI. set -u # Protect against uninitialized vars. set -e # Stop on error set -o pipefail # Stop on failures in non-final pipeline commands target="$1" MASKER="dustmasker" if [ -n "$KRAKEN2_PROTEIN_DB" ]; then MASKER="segmasker" fi if ! which $MASKER > /dev/null; then echo "Unable to find $MASKER in path, can't mask low-complexity sequences" exit 1 fi if [ -d $target ]; then for file in $(find $target '(' -name '*.fna' -o -name '*.faa' ')'); do if [ ! -e "$file.masked" ]; then $MASKER -in $file -outfmt fasta | sed -e '/^>/!s/[a-z]/x/g' > "$file.tmp" mv "$file.tmp" $file touch "$file.masked" fi done elif [ -f $target ]; then if [ ! -e "$target.masked" ]; then $MASKER -in $target -outfmt fasta | sed -e '/^>/!s/[a-z]/x/g' > "$target.tmp" mv "$target.tmp" $target touch "$target.masked" fi else echo "Target $target must be directory or regular file, aborting masking" exit 1 fi kraken2-2.0.8-beta/scripts/rsync_from_ncbi.pl000077500000000000000000000135751346023071100212020ustar00rootroot00000000000000#!/usr/bin/env perl # Copyright 2013-2019, Derrick Wood # # This file is part of the Kraken 2 taxonomic sequence classification system. # Reads an assembly_summary.txt file, which indicates taxids and FTP paths for # genome/protein data. Performs the download of the complete genomes from # that file, decompresses, and explicitly assigns taxonomy as needed. 
use strict; use warnings; use File::Basename; use Getopt::Std; use Net::FTP; use List::Util qw/max/; my $PROG = basename $0; my $SERVER = "ftp.ncbi.nlm.nih.gov"; my $SERVER_PATH = "/genomes"; my $FTP_USER = "anonymous"; my $FTP_PASS = "kraken2download"; my $qm_server = quotemeta $SERVER; my $qm_server_path = quotemeta $SERVER_PATH; my $is_protein = $ENV{"KRAKEN2_PROTEIN_DB"}; my $use_ftp = $ENV{"KRAKEN2_USE_FTP"}; my $suffix = $is_protein ? "_protein.faa.gz" : "_genomic.fna.gz"; # Manifest hash maps filenames (keys) to taxids (values) my %manifest; while (<>) { next if /^#/; chomp; my @fields = split /\t/; my ($taxid, $asm_level, $ftp_path) = @fields[5, 11, 19]; # Possible TODO - make the list here configurable by user-supplied flags next unless grep {$asm_level eq $_} ("Complete Genome", "Chromosome"); my $full_path = $ftp_path . "/" . basename($ftp_path) . $suffix; # strip off server/leading dir name to allow --files-from= to work w/ rsync # also allows filenames to just start with "all/", which is nice if (! ($full_path =~ s#^ftp://${qm_server}${qm_server_path}/##)) { die "$PROG: unexpected FTP path (new server?) for $ftp_path\n"; } $manifest{$full_path} = $taxid; } open MANIFEST, ">", "manifest.txt" or die "$PROG: can't write manifest: $!\n"; print MANIFEST "$_\n" for keys %manifest; close MANIFEST; if ($is_protein && ! $use_ftp) { print STDERR "Step 0/2: performing rsync dry run (only protein d/l requires this)...\n"; # Protein files aren't always present, so we have to do this two-rsync run hack # First, do a dry run to find non-existent files, then delete them from the # manifest; after this, execution can proceed as usual. system("rsync --dry-run --no-motd --files-from=manifest.txt rsync://${SERVER}${SERVER_PATH} . 2> rsync.err"); open ERR_FILE, "<", "rsync.err" or die "$PROG: can't read rsync.err file: $!\n"; while (<ERR_FILE>) { chomp; # I really doubt this will work across every version of rsync. :( if (/failed: No such file or directory/ && /^rsync: link_stat "\/([^"]+)"/) { delete $manifest{$1}; } } close ERR_FILE; print STDERR "Rsync dry run complete, removing any non-existent files from manifest.\n"; # Rewrite manifest open MANIFEST, ">", "manifest.txt" or die "$PROG: can't write manifest: $!\n"; print MANIFEST "$_\n" for keys %manifest; close MANIFEST; } if ($use_ftp) { print STDERR "Step 1/2: Performing ftp file transfer of requested files\n"; my $ftp = Net::FTP->new($SERVER, Passive => 1) or die "$PROG: FTP connection error: $@\n"; $ftp->login($FTP_USER, $FTP_PASS) or die "$PROG: FTP login error: " . $ftp->message() . "\n"; $ftp->binary() or die "$PROG: FTP binary mode error: " . $ftp->message() . "\n"; $ftp->cwd($SERVER_PATH) or die "$PROG: FTP CD error: " . $ftp->message() . "\n"; open MANIFEST, "<", "manifest.txt" or die "$PROG: can't open manifest: $!\n"; mkdir "all" or die "$PROG: can't create 'all' directory: $!\n"; chdir "all" or die "$PROG: can't chdir into 'all' directory: $!\n"; while (<MANIFEST>) { chomp; $ftp->get($_) or do { my $msg = $ftp->message(); if ($msg !~ /: No such file or directory$/) { warn "$PROG: unable to download $_: $msg\n"; } }; } close MANIFEST; chdir ".."
or die "$PROG: can't return to correct directory: $!\n"; } else { print STDERR "Step 1/2: Performing rsync file transfer of requested files\n"; system("rsync --no-motd --files-from=manifest.txt rsync://${SERVER}${SERVER_PATH}/ .") == 0 or die "$PROG: rsync error, exiting: $?\n"; print STDERR "Rsync file transfer complete.\n"; } print STDERR "Step 2/2: Assigning taxonomic IDs to sequences\n"; my $output_file = $is_protein ? "library.faa" : "library.fna"; open OUT, ">", $output_file or die "$PROG: can't write $output_file: $!\n"; my $projects_added = 0; my $sequences_added = 0; my $ch_added = 0; my $ch = $is_protein ? "aa" : "bp"; my $max_out_chars = 0; for my $in_filename (keys %manifest) { my $taxid = $manifest{$in_filename}; if ($use_ftp) { # FTP downloading doesn't create full path locally $in_filename = "all/" . basename($in_filename); } open IN, "gunzip -c $in_filename |" or die "$PROG: can't read $in_filename: $!\n"; while () { if (/^>/) { s/^>/>kraken:taxid|$taxid|/; $sequences_added++; } else { $ch_added += length($_) - 1; } print OUT; } close IN; unlink $in_filename; $projects_added++; my $out_line = progress_line($projects_added, scalar keys %manifest, $sequences_added, $ch_added) . "..."; $max_out_chars = max(length($out_line), $max_out_chars); my $space_line = " " x $max_out_chars; print STDERR "\r$space_line\r$out_line" if -t STDERR; } close OUT; print STDERR " done.\n" if -t STDERR; print STDERR "All files processed, cleaning up extra sequence files..."; system("rm -rf all/") == 0 or die "$PROG: can't clean up all/ directory: $?\n"; print STDERR " done, library complete.\n"; sub progress_line { my ($projs, $total_projs, $seqs, $chs) = @_; my $line = "Processed "; $line .= ($projs == $total_projs) ? "$projs" : "$projs/$total_projs"; $line .= " project" . ($total_projs > 1 ? 's' : '') . " "; $line .= "($seqs sequence" . ($seqs > 1 ? 's' : '') . ", "; my $prefix; my @prefixes = qw/k M G T P E/; while (@prefixes && $chs >= 1000) { $prefix = shift @prefixes; $chs /= 1000; } if (defined $prefix) { $line .= sprintf '%.2f %s%s)', $chs, $prefix, $ch; } else { $line .= "$chs $ch)"; } return $line; } kraken2-2.0.8-beta/scripts/scan_fasta_file.pl000077500000000000000000000031371346023071100211200ustar00rootroot00000000000000#!/usr/bin/env perl # Copyright 2013-2019, Derrick Wood # # This file is part of the Kraken 2 taxonomic sequence classification system. # Reads multi-FASTA input and examines each sequence header. Headers are # OK if a taxonomy ID is found (as either the entire sequence ID or as part # of a # "kraken:taxid" token), or if something looking like an accession # number is found. Not "OK" headers will are fatal errors unless "--lenient" # is used. # # Each sequence header results in a line with three tab-separated values; # the first indicating whether third column is the taxonomy ID ("TAXID") or # an accession number ("ACCNUM") for the sequence ID listed in the second # column. use strict; use warnings; use File::Basename; use Getopt::Long; require "$ENV{KRAKEN2_DIR}/kraken2lib.pm"; my $PROG = basename $0; my $lenient = 0; GetOptions("lenient" => \$lenient) or die "Usage: $PROG [--lenient] \n"; while (<>) { next unless /^>/; # while (/.../g) needed because non-redundant DBs sometimes have multiple # sequence IDs in the header; extra sequence IDs are prefixed by # '\x01' characters (if downloaded in FASTA format from NCBI FTP directly). while (/(?:^>|\x01)(\S+)/g) { my $seqid = $1; my $taxid = kraken2lib::check_seqid($seqid); if (! 
use strict;
use warnings;
use File::Basename;
use Getopt::Long;

require "$ENV{KRAKEN2_DIR}/kraken2lib.pm";

my $PROG = basename $0;

my $lenient = 0;
GetOptions("lenient" => \$lenient)
  or die "Usage: $PROG [--lenient] <fasta filename(s)>\n";

while (<>) {
  next unless /^>/;
  # while (/.../g) needed because non-redundant DBs sometimes have multiple
  # sequence IDs in the header; extra sequence IDs are prefixed by
  # '\x01' characters (if downloaded in FASTA format from NCBI FTP directly).
  while (/(?:^>|\x01)(\S+)/g) {
    my $seqid = $1;
    my $taxid = kraken2lib::check_seqid($seqid);
    if (! defined $taxid) {
      next if $lenient;
      die "$PROG: unable to determine taxonomy ID for sequence $seqid\n";
    }
    # "$taxid" may actually be an accession number
    if ($taxid =~ /^\d+$/) {
      print "TAXID\t$seqid\t$taxid\n";
    }
    else {
      print "ACCNUM\t$seqid\t$taxid\n";
    }
  }
}
kraken2-2.0.8-beta/scripts/standard_installation.sh000077500000000000000000000027431346023071100224010ustar00rootroot00000000000000#!/bin/bash

# Copyright 2013-2019, Derrick Wood
#
# This file is part of the Kraken 2 taxonomic sequence classification system.

# Build the standard Kraken database
# Designed to be called by kraken2-build

set -u  # Protect against uninitialized vars.
set -e  # Stop on error
set -o pipefail  # Stop on failures in non-final pipeline commands

protein_flag=""
if [ -n "$KRAKEN2_PROTEIN_DB" ]; then
  protein_flag="--protein"
fi

masking_flag=""
if [ -z "$KRAKEN2_MASK_LC" ]; then
  masking_flag="--no-mask"
fi

ftp_flag=""
if [ -n "$KRAKEN2_USE_FTP" ]; then
  ftp_flag="--use-ftp"
fi

kraken2-build --db $KRAKEN2_DB_NAME --download-taxonomy --skip-maps $masking_flag $protein_flag $ftp_flag
kraken2-build --db $KRAKEN2_DB_NAME --download-library archaea $masking_flag $protein_flag $ftp_flag
kraken2-build --db $KRAKEN2_DB_NAME --download-library bacteria $masking_flag $protein_flag $ftp_flag
kraken2-build --db $KRAKEN2_DB_NAME --download-library viral $masking_flag $protein_flag $ftp_flag
kraken2-build --db $KRAKEN2_DB_NAME --download-library human --no-mask $protein_flag $ftp_flag
if [ -z "$KRAKEN2_PROTEIN_DB" ]; then
  kraken2-build --db $KRAKEN2_DB_NAME --download-library UniVec_Core $masking_flag $ftp_flag
fi
kraken2-build --db $KRAKEN2_DB_NAME --build --threads $KRAKEN2_THREAD_CT \
  --minimizer-len $KRAKEN2_MINIMIZER_LEN \
  --kmer-len $KRAKEN2_KMER_LEN \
  --minimizer-spaces $KRAKEN2_MINIMIZER_SPACES \
  $protein_flag
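# Illustrative sketch, not part of the original script: this file is meant to
# be driven by kraken2-build, which exports the variables read above. A manual
# run with hypothetical (but typical) settings would look something like:
#
#   export KRAKEN2_DB_NAME=standard_db KRAKEN2_THREAD_CT=8
#   export KRAKEN2_KMER_LEN=35 KRAKEN2_MINIMIZER_LEN=31 KRAKEN2_MINIMIZER_SPACES=7
#   export KRAKEN2_MASK_LC=1 KRAKEN2_PROTEIN_DB="" KRAKEN2_USE_FTP=""
#   standard_installation.sh
#
# Note that "set -u" above means every variable must be set, even if empty.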
kraken2-2.0.8-beta/src/000077500000000000000000000000001346023071100145535ustar00rootroot00000000000000kraken2-2.0.8-beta/src/Makefile000066400000000000000000000022201346023071100162100ustar00rootroot00000000000000CXX = g++
CXXFLAGS = -fopenmp -Wall -std=c++11 -O3
CXXFLAGS += -DLINEAR_PROBING

.PHONY: all clean install

PROGS = estimate_capacity build_db classify dump_table lookup_accession_numbers

all: $(PROGS)

install: $(PROGS)
	cp $(PROGS) $(KRAKEN2_DIR)/

clean:
	rm -f *.o $(PROGS)

taxonomy.o: taxonomy.cc taxonomy.h mmap_file.h
mmap_file.o: mmap_file.cc mmap_file.h
compact_hash.o: compact_hash.cc compact_hash.h kv_store.h mmap_file.h
mmscanner.o: mmscanner.cc mmscanner.h
seqreader.o: seqreader.cc seqreader.h
omp_hack.o: omp_hack.cc omp_hack.h
reports.o: reports.cc reports.h
aa_translate.o: aa_translate.cc aa_translate.h
utilities.o: utilities.cc utilities.h

build_db: build_db.cc mmap_file.o compact_hash.o taxonomy.o seqreader.o mmscanner.o omp_hack.o utilities.o
classify: classify.cc reports.o mmap_file.o compact_hash.o taxonomy.o seqreader.o mmscanner.o omp_hack.o aa_translate.o utilities.o
estimate_capacity: estimate_capacity.cc seqreader.o mmscanner.o omp_hack.o utilities.o
dump_table: dump_table.cc mmap_file.o compact_hash.o omp_hack.o taxonomy.o reports.o
lookup_accession_numbers: lookup_accession_numbers.cc mmap_file.o omp_hack.o utilities.o
kraken2-2.0.8-beta/src/aa_translate.cc000066400000000000000000000075201346023071100175240ustar00rootroot00000000000000/*
 * Copyright 2013-2019, Derrick Wood
 *
 * This file is part of the Kraken 2 taxonomic sequence classification system.
 */

#include "aa_translate.h"

using std::string;
using std::vector;

namespace kraken2 {

/*
This *was* the map, which was ridiculously slow. Note that ordering is AGCT,
not ACGT.

static map<string, char> old_translation_map = {
  { "AAA", 'K' }, { "AAG", 'K' }, { "AAC", 'N' }, { "AAT", 'N' },
  { "AGA", 'R' }, { "AGG", 'R' }, { "AGC", 'S' }, { "AGT", 'S' },
  { "ACA", 'T' }, { "ACG", 'T' }, { "ACC", 'T' }, { "ACT", 'T' },
  { "ATA", 'I' }, { "ATG", 'M' }, { "ATC", 'I' }, { "ATT", 'I' },
  { "GAA", 'E' }, { "GAG", 'E' }, { "GAC", 'D' }, { "GAT", 'D' },
  { "GGA", 'G' }, { "GGG", 'G' }, { "GGC", 'G' }, { "GGT", 'G' },
  { "GCA", 'A' }, { "GCG", 'A' }, { "GCC", 'A' }, { "GCT", 'A' },
  { "GTA", 'V' }, { "GTG", 'V' }, { "GTC", 'V' }, { "GTT", 'V' },
  { "CAA", 'Q' }, { "CAG", 'Q' }, { "CAC", 'H' }, { "CAT", 'H' },
  { "CGA", 'R' }, { "CGG", 'R' }, { "CGC", 'R' }, { "CGT", 'R' },
  { "CCA", 'P' }, { "CCG", 'P' }, { "CCC", 'P' }, { "CCT", 'P' },
  { "CTA", 'L' }, { "CTG", 'L' }, { "CTC", 'L' }, { "CTT", 'L' },
  { "TAA", '*' }, { "TAG", '*' }, { "TAC", 'Y' }, { "TAT", 'Y' },
  { "TGA", '*' }, { "TGG", 'W' }, { "TGC", 'C' }, { "TGT", 'C' },
  { "TCA", 'S' }, { "TCG", 'S' }, { "TCC", 'S' }, { "TCT", 'S' },
  { "TTA", 'L' }, { "TTG", 'L' }, { "TTC", 'F' }, { "TTT", 'F' }
};
*/

static char translation_map[] = "KKNNRRSSTTTTIMIIEEDDGGGGAAAAVVVVQQHHRRRRPPPPLLLL**YY*WCCSSSSLLFF";

static uint8_t fwd_lookup_table[UINT8_MAX + 1] = {0};
static uint8_t rev_lookup_table[UINT8_MAX + 1] = {0};

void TranslateToAllFrames(string &dna_seq, vector<string> &aa_seqs) {
  auto max_size = (dna_seq.size() / 3) + 1;
  for (auto i = 0; i < 6; i++)
    aa_seqs[i].assign(max_size, ' ');
  if (dna_seq.size() < 3)
    return;

  if (fwd_lookup_table[0] == 0) {
    for (size_t i = 0; i <= UINT8_MAX; i++)
      fwd_lookup_table[i] = rev_lookup_table[i] = UINT8_MAX;
    // Map is based on AGCT coding, not ACGT
    fwd_lookup_table[(int) 'A'] = fwd_lookup_table[(int) 'a'] = 0x00;
    fwd_lookup_table[(int) 'G'] = fwd_lookup_table[(int) 'g'] = 0x01;
    fwd_lookup_table[(int) 'C'] = fwd_lookup_table[(int) 'c'] = 0x02;
    fwd_lookup_table[(int) 'T'] = fwd_lookup_table[(int) 't'] = 0x03;
    rev_lookup_table[(int) 'A'] = rev_lookup_table[(int) 'a'] = 0x30;
    rev_lookup_table[(int) 'G'] = rev_lookup_table[(int) 'g'] = 0x20;
    rev_lookup_table[(int) 'C'] = rev_lookup_table[(int) 'c'] = 0x10;
    rev_lookup_table[(int) 'T'] = rev_lookup_table[(int) 't'] = 0x00;
  }

  uint8_t fwd_codon = 0, rev_codon = 0;
  int ambig_nt_countdown = 0;  // if positive, bases to go until N leaves codon
  size_t frame_len[6] = {0};
  for (auto i = 0u; i < dna_seq.size(); i++) {
    auto frame = i % 3;
    fwd_codon <<= 2;
    fwd_codon &= 0x3f;
    rev_codon >>= 2;
    if (ambig_nt_countdown)
      ambig_nt_countdown--;
    auto fwd_lookup_code = fwd_lookup_table[(int) dna_seq[i]];
    auto rev_lookup_code = rev_lookup_table[(int) dna_seq[i]];
    if (fwd_lookup_code == UINT8_MAX)
      ambig_nt_countdown = 3;
    else {
      fwd_codon |= fwd_lookup_code;
      rev_codon |= rev_lookup_code;
    }
    if (i >= 2) {  // we've got a full codon
      char ch;
      // Translate and append to forward frame
      ch = ambig_nt_countdown == 0 ? translation_map[fwd_codon] : 'X';
      aa_seqs[frame][frame_len[frame]++] = ch;
      // Translate and prepend to reverse frame
      ch = ambig_nt_countdown == 0 ? translation_map[rev_codon] : 'X';
      aa_seqs[frame + 3][max_size - 1 - frame_len[frame + 3]++] = ch;
    }
  }
  for (auto i = 0; i < 3; i++)
    if (aa_seqs[i].size() != frame_len[i])
      aa_seqs[i].resize(frame_len[i]);
  for (auto i = 3; i < 6; i++)
    if (aa_seqs[i].size() != frame_len[i])
      aa_seqs[i].erase(0, aa_seqs[i].size() - frame_len[i]);
}
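// Illustrative sketch, not part of the original source: a minimal caller for
// TranslateToAllFrames(). Frames 0-2 are the three forward reading frames and
// frames 3-5 come from the reverse complement; ambiguous bases yield 'X'.
// The output vector must already hold 6 strings, as the classifier's does.
//
//   std::string dna = "ATGGCCTAA";
//   std::vector<std::string> frames(6);
//   kraken2::TranslateToAllFrames(dna, frames);
//   // frames[0] == "MA*"  (ATG GCC TAA -> Met, Ala, stop)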
}
kraken2-2.0.8-beta/src/aa_translate.h000066400000000000000000000005451346023071100173660ustar00rootroot00000000000000/*
 * Copyright 2013-2019, Derrick Wood
 *
 * This file is part of the Kraken 2 taxonomic sequence classification system.
 */

#ifndef KRAKEN2_AA_TRANSLATE_H_
#define KRAKEN2_AA_TRANSLATE_H_

#include "kraken2_headers.h"

namespace kraken2 {

void TranslateToAllFrames(std::string &dna_seq, std::vector<std::string> &aa_seqs);

}

#endif
kraken2-2.0.8-beta/src/build_db.cc000066400000000000000000000276421346023071100166370ustar00rootroot00000000000000/*
 * Copyright 2013-2019, Derrick Wood
 *
 * This file is part of the Kraken 2 taxonomic sequence classification system.
 */

#include "kraken2_headers.h"
#include "taxonomy.h"
#include "mmscanner.h"
#include "seqreader.h"
#include "compact_hash.h"
#include "kv_store.h"
#include "kraken2_data.h"
#include "utilities.h"

using std::string;
using std::map;
using std::cout;
using std::queue;
using std::cerr;
using std::endl;
using std::vector;
using std::istringstream;
using std::ifstream;
using std::ofstream;
using namespace kraken2;

#define DEFAULT_BLOCK_SIZE (10 * 1024 * 1024)  // 10 MB

struct Options {
  string ID_to_taxon_map_filename;
  string ncbi_taxonomy_directory;
  string hashtable_filename;
  string options_filename;
  string taxonomy_filename;
  size_t block_size;
  int num_threads;
  bool input_is_protein;
  ssize_t k, l;
  size_t capacity;
  size_t maximum_capacity;
  uint64_t spaced_seed_mask;
  uint64_t toggle_mask;
  uint64_t min_clear_hash_value;
};

void ParseCommandLine(int argc, char **argv, Options &opts);
void usage(int exit_code = EX_USAGE);
vector<string> ExtractNCBISequenceIDs(const string &header);
void ProcessSequence(string &seq, uint64_t taxid,
    CompactHashTable &hash, Taxonomy &tax, MinimizerScanner &scanner,
    uint64_t min_clear_hash_value);
void ProcessSequences(Options &opts, map<string, taxid_t> &ID_to_taxon_map,
    CompactHashTable &kraken_index, Taxonomy &taxonomy);
void ReadIDToTaxonMap(map<string, taxid_t> &id_map, string &filename);
void GenerateTaxonomy(Options &opts, map<string, taxid_t> &id_map);

struct TaxonSeqPair {
  uint64_t taxon;
  string seq;
};

int main(int argc, char **argv) {
  Options opts;
  opts.spaced_seed_mask = DEFAULT_SPACED_SEED_MASK;
  opts.toggle_mask = DEFAULT_TOGGLE_MASK;
  opts.input_is_protein = false;
  opts.num_threads = 1;
  opts.block_size = DEFAULT_BLOCK_SIZE;
  opts.min_clear_hash_value = 0;
  opts.maximum_capacity = 0;
  ParseCommandLine(argc, argv, opts);

  omp_set_num_threads( opts.num_threads );

  map<string, taxid_t> ID_to_taxon_map;

  ReadIDToTaxonMap(ID_to_taxon_map, opts.ID_to_taxon_map_filename);
  GenerateTaxonomy(opts, ID_to_taxon_map);

  std::cerr << "Taxonomy parsed and converted." << std::endl;

  Taxonomy taxonomy(opts.taxonomy_filename.c_str());
  taxonomy.GenerateExternalToInternalIDMap();
  size_t bits_needed_for_value = 1;
  while ((1 << bits_needed_for_value) < (ssize_t) taxonomy.node_count())
    bits_needed_for_value++;

  auto actual_capacity = opts.capacity;
  if (opts.maximum_capacity) {
    double frac = opts.maximum_capacity * 1.0 / opts.capacity;
    if (frac > 1)
      errx(EX_DATAERR, "maximum capacity larger than requested capacity");
    opts.min_clear_hash_value = (uint64_t) ((1 - frac) * UINT64_MAX);
    actual_capacity = opts.maximum_capacity;
  }

  CompactHashTable kraken_index(actual_capacity, 32 - bits_needed_for_value,
                                bits_needed_for_value);
  std::cerr << "CHT created with " << bits_needed_for_value
            << " bits reserved for taxid." << std::endl;

  ProcessSequences(opts, ID_to_taxon_map, kraken_index, taxonomy);

  std::cerr << "Writing data to disk... " << std::flush;
  kraken_index.WriteTable(opts.hashtable_filename.c_str());

  IndexOptions index_opts;
  index_opts.k = opts.k;
  index_opts.l = opts.l;
  index_opts.spaced_seed_mask = opts.spaced_seed_mask;
  index_opts.toggle_mask = opts.toggle_mask;
  index_opts.dna_db = ! opts.input_is_protein;
  index_opts.minimum_acceptable_hash_value = opts.min_clear_hash_value;
  index_opts.revcom_version = CURRENT_REVCOM_VERSION;
  ofstream opts_fs(opts.options_filename);
  opts_fs.write((char *) &index_opts, sizeof(index_opts));
  if (! opts_fs.good())
    errx(EX_OSERR, "Unable to write options file %s",
         opts.options_filename.c_str());
  opts_fs.close();
  std::cerr << " complete." << std::endl;

  return 0;
}

void ProcessSequences(Options &opts, map<string, taxid_t> &ID_to_taxon_map,
    CompactHashTable &kraken_index, Taxonomy &taxonomy)
{
  size_t processed_seq_ct = 0;
  size_t processed_ch_ct = 0;

  #pragma omp parallel
  {
    Sequence sequence;
    MinimizerScanner scanner(opts.k, opts.l, opts.spaced_seed_mask,
                             ! opts.input_is_protein, opts.toggle_mask);
    BatchSequenceReader reader;

    while (true) {
      // Declaration of "ok" and break need to be done outside of critical
      // section to conform with OpenMP spec.
      bool ok;
      #pragma omp critical(reader)
      ok = reader.LoadBlock(std::cin, opts.block_size);
      if (! ok)
        break;
      while (reader.NextSequence(sequence)) {
        auto all_sequence_ids = ExtractNCBISequenceIDs(sequence.header);
        uint64_t taxid = 0;
        for (auto &seqid : all_sequence_ids) {
          auto ext_taxid = ID_to_taxon_map[seqid];
          taxid = taxonomy.LowestCommonAncestor(taxid, taxonomy.GetInternalID(ext_taxid));
        }
        if (taxid) {
          // Add terminator for protein sequences if not already there
          if (opts.input_is_protein && sequence.seq.back() != '*')
            sequence.seq.push_back('*');
          ProcessSequence(sequence.seq, taxid, kraken_index, taxonomy, scanner,
                          opts.min_clear_hash_value);
          #pragma omp atomic
          processed_seq_ct++;
          #pragma omp atomic
          processed_ch_ct += sequence.seq.size();
        }
      }
      if (isatty(fileno(stderr))) {
        #pragma omp critical(status_update)
        std::cerr << "\rProcessed " << processed_seq_ct << " sequences ("
                  << processed_ch_ct << " "
                  << (opts.input_is_protein ? "aa" : "bp") << ")...";
      }
    }
  }
  if (isatty(fileno(stderr)))
    std::cerr << "\r";
  std::cerr << "Completed processing of " << processed_seq_ct << " sequences, "
            << processed_ch_ct << " "
            << (opts.input_is_protein ? "aa" : "bp") << std::endl;
}

// This function exists to deal with NCBI's use of \x01 characters to denote
// the start of a new FASTA header in the same line (for non-redundant DBs).
// We return all sequence IDs in a header line, not just the first.
vector<string> ExtractNCBISequenceIDs(const string &header) {
  vector<string> list;
  string current_str;

  bool in_id = true;
  // start loop at first char after '>'
  for (size_t i = 1; i < header.size(); ++i) {
    if (header[i] == 0x01) {
      // 0x01 starts new ID at next char
      if (! current_str.empty())
        list.push_back(current_str);
      current_str.clear();
      in_id = true;
    }
    else if (in_id && isspace(header[i])) {
      // spaces end ID
      if (! current_str.empty())
        list.push_back(current_str);
      current_str.clear();
      in_id = false;
    }
    else if (in_id) {
      // Build ID string char by char
      current_str.push_back(header[i]);
    }
  }
  if (! current_str.empty())
    list.push_back(current_str);
  return list;
}
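// Illustrative example, not part of the original source, using hypothetical
// IDs: a non-redundant NCBI header that fuses two records with a \x01 byte,
// such as
//   ">SEQ1 first description\x01SEQ2 second description"
// yields {"SEQ1", "SEQ2"}; the caller then stores the LCA of every ID's
// taxon for the sequence's minimizers.
//
//   std::string header = ">SEQ1 first description\x01SEQ2 second description";
//   auto ids = ExtractNCBISequenceIDs(header);  // {"SEQ1", "SEQ2"}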
void ParseCommandLine(int argc, char **argv, Options &opts) {
  int opt;
  long long sig;

  while ((opt = getopt(argc, argv, "?hB:c:H:m:n:o:t:k:l:s:S:T:p:M:X")) != -1) {
    switch (opt) {
      case 'h' : case '?' :
        usage(0);
        break;
      case 'B' :
        sig = atoll(optarg);
        if (sig < 1)
          errx(EX_USAGE, "can't have negative block size");
        opts.block_size = sig;
        break;
      case 'p' :
        opts.num_threads = atoll(optarg);
        if (opts.num_threads < 1)
          errx(EX_USAGE, "can't have negative number of threads");
        if (opts.num_threads > omp_get_max_threads())
          errx(EX_USAGE, "OMP only wants you to use %d threads", omp_get_max_threads());
        break;
      case 'H' :
        opts.hashtable_filename = optarg;
        break;
      case 'm' :
        opts.ID_to_taxon_map_filename = optarg;
        break;
      case 'n' :
        opts.ncbi_taxonomy_directory = optarg;
        break;
      case 'o' :
        opts.options_filename = optarg;
        break;
      case 't' :
        opts.taxonomy_filename = optarg;
        break;
      case 'S' :
        opts.spaced_seed_mask = strtol(optarg, nullptr, 2);
        break;
      case 'T' :
        opts.toggle_mask = strtol(optarg, nullptr, 2);
        break;
      case 'k' :
        sig = atoll(optarg);
        if (sig < 1)
          errx(EX_USAGE, "k must be positive integer");
        opts.k = sig;
        break;
      case 'l' :
        sig = atoll(optarg);
        if (sig < 1)
          errx(EX_USAGE, "l must be positive integer");
        if (sig > 31)
          errx(EX_USAGE, "l must be no more than 31");
        opts.l = sig;
        break;
      case 'c' :
        sig = atoll(optarg);
        if (sig < 1)
          errx(EX_USAGE, "capacity must be positive integer");
        opts.capacity = sig;
        break;
      case 'M' :
        sig = atoll(optarg);
        if (sig < 1)
          errx(EX_USAGE, "max capacity must be positive integer");
        opts.maximum_capacity = sig;
        break;
      case 'X' :
        opts.input_is_protein = true;
        break;
    }
  }

  if (opts.spaced_seed_mask != DEFAULT_SPACED_SEED_MASK)
    ExpandSpacedSeedMask(opts.spaced_seed_mask,
        opts.input_is_protein ? BITS_PER_CHAR_PRO : BITS_PER_CHAR_DNA);
  if (opts.hashtable_filename.empty() || opts.ID_to_taxon_map_filename.empty() ||
      opts.ncbi_taxonomy_directory.empty() || opts.options_filename.empty() ||
      opts.taxonomy_filename.empty())
  {
    cerr << "missing mandatory filename parameter" << endl;
    usage();
  }
  if (opts.k == 0 || opts.l == 0 || opts.capacity == 0) {
    cerr << "missing mandatory integer parameter" << endl;
    usage();
  }
  if (opts.k < opts.l) {
    cerr << "k cannot be less than l" << endl;
    usage();
  }
  if (opts.maximum_capacity > opts.capacity) {
    cerr << "maximum capacity option shouldn't specify larger capacity than normal" << endl;
    usage();
  }
}

void usage(int exit_code) {
  cerr << "Usage: build_db <options>\n"
       << "\n"
       << "Options (*mandatory):\n"
       << "* -H FILENAME Kraken 2 hash table filename\n"
       << "* -m FILENAME Sequence ID to taxon map filename\n"
       << "* -t FILENAME Kraken 2 taxonomy filename\n"
       << "* -n DIR NCBI taxonomy directory name\n"
       << "* -o FILENAME Kraken 2 options filename\n"
       << "* -k INT Set length of k-mers\n"
       << "* -l INT Set length of minimizers\n"
       << "* -c INT Set capacity of hash table\n"
       << " -M INT Set maximum capacity of hash table (MiniKraken)\n"
       << " -S BITSTRING Spaced seed mask\n"
       << " -T BITSTRING Minimizer toggle mask\n"
       << " -X Input seqs. are proteins\n"
       << " -p INT Number of threads\n"
       << " -B INT Read block size" << endl;
  exit(exit_code);
}

void ProcessSequence(string &seq, uint64_t taxid,
    CompactHashTable &hash, Taxonomy &tax, MinimizerScanner &scanner,
    uint64_t min_clear_hash_value)
{
  scanner.LoadSequence(seq);
  uint64_t *minimizer_ptr;
  while ((minimizer_ptr = scanner.NextMinimizer())) {
    if (min_clear_hash_value && MurmurHash3(*minimizer_ptr) < min_clear_hash_value)
      continue;
    hvalue_t existing_taxid = 0;
    hvalue_t new_taxid = taxid;
    while (! hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid)) {
      new_taxid = tax.LowestCommonAncestor(new_taxid, existing_taxid);
    }
  }
}
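// Illustrative note, not part of the original source: the CompareAndSet loop
// above implements an atomic "insert or LCA-merge". A single-threaded
// equivalent of one update, written against a hypothetical Set() primitive
// that CompactHashTable does not actually expose, would be:
//
//   hvalue_t existing = hash.Get(minimizer);
//   hash.Set(minimizer, existing ? tax.LowestCommonAncestor(taxid, existing)
//                                : taxid);
//
// The retry loop is needed because another thread may write the cell between
// the read and the write; each failed CAS hands back the latest stored value
// in existing_taxid, and the LCA is recomputed against it before retrying.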
void ReadIDToTaxonMap(map<string, taxid_t> &id_map, string &filename) {
  ifstream map_file(filename.c_str());
  if (! map_file.good())
    err(EX_NOINPUT, "unable to read from '%s'", filename.c_str());
  string line;
  while (getline(map_file, line)) {
    string seq_id;
    uint64_t taxid;
    istringstream iss(line);
    iss >> seq_id;
    iss >> taxid;
    id_map[seq_id] = taxid;
  }
}

void GenerateTaxonomy(Options &opts, map<string, taxid_t> &id_map) {
  NCBITaxonomy ncbi_taxonomy(
    opts.ncbi_taxonomy_directory + "/nodes.dmp",
    opts.ncbi_taxonomy_directory + "/names.dmp"
  );
  for (auto &kv_pair : id_map) {
    if (kv_pair.second != 0) {
      ncbi_taxonomy.MarkNode(kv_pair.second);
    }
  }
  ncbi_taxonomy.ConvertToKrakenTaxonomy(opts.taxonomy_filename.c_str());
}
kraken2-2.0.8-beta/src/classify.cc000066400000000000000000000647121346023071100167070ustar00rootroot00000000000000/*
 * Copyright 2013-2019, Derrick Wood
 *
 * This file is part of the Kraken 2 taxonomic sequence classification system.
 */

#include "kraken2_headers.h"
#include "kv_store.h"
#include "taxonomy.h"
#include "seqreader.h"
#include "mmscanner.h"
#include "compact_hash.h"
#include "kraken2_data.h"
#include "aa_translate.h"
#include "reports.h"
#include "utilities.h"

using std::cout;
using std::cerr;
using std::endl;
using std::ifstream;
using std::map;
using std::ostringstream;
using std::ofstream;
using std::set;
using std::string;
using std::vector;
using namespace kraken2;

static const size_t NUM_FRAGMENTS_PER_THREAD = 10000;
static const taxid_t MATE_PAIR_BORDER_TAXON = TAXID_MAX;
static const taxid_t READING_FRAME_BORDER_TAXON = TAXID_MAX - 1;
static const taxid_t AMBIGUOUS_SPAN_TAXON = TAXID_MAX - 2;

struct Options {
  string index_filename;
  string taxonomy_filename;
  string options_filename;
  string report_filename;
  string classified_output_filename;
  string unclassified_output_filename;
  string kraken_output_filename;
  bool mpa_style_report;
  bool quick_mode;
  bool report_zero_counts;
  bool use_translated_search;
  bool print_scientific_name;
  double confidence_threshold;
  int num_threads;
  bool paired_end_processing;
  bool single_file_pairs;
  int minimum_quality_score;
  bool use_memory_mapping;
};

struct ClassificationStats {
  uint64_t total_sequences;
  uint64_t total_bases;
  uint64_t total_classified;
};

struct OutputStreamData {
  bool initialized;
  bool printing_sequences;
  std::ostream *classified_output1;
  std::ostream *classified_output2;
  std::ostream *unclassified_output1;
  std::ostream *unclassified_output2;
  std::ostream *kraken_output;
};

struct OutputData {
  uint64_t block_id;
  string kraken_str;
  string classified_out1_str;
  string classified_out2_str;
  string unclassified_out1_str;
  string unclassified_out2_str;
};

void ParseCommandLine(int argc, char **argv, Options &opts);
void usage(int exit_code=EX_USAGE);
void ProcessFiles(const char *filename1, const char *filename2,
    KeyValueStore *hash, Taxonomy &tax,
    IndexOptions &idx_opts, Options &opts, ClassificationStats &stats,
    OutputStreamData &outputs, taxon_counts_t &call_counts);
taxid_t ClassifySequence(Sequence &dna, Sequence &dna2, ostringstream &koss,
    KeyValueStore *hash, Taxonomy &tax, IndexOptions &idx_opts,
    Options &opts, ClassificationStats &stats, MinimizerScanner &scanner,
    vector<taxid_t> &taxa, taxon_counts_t &hit_counts,
    vector<string> &tx_frames);
void AddHitlistString(ostringstream &oss, vector<taxid_t> &taxa,
    Taxonomy &taxonomy);
taxid_t ResolveTree(taxon_counts_t &hit_counts,
    Taxonomy &tax, size_t total_minimizers, Options &opts);
void ReportStats(struct timeval time1, struct timeval time2,
    ClassificationStats &stats);
void InitializeOutputs(Options &opts, OutputStreamData &outputs,
    SequenceFormat format);
void MaskLowQualityBases(Sequence &dna, int minimum_quality_score);

int main(int argc, char **argv) {
  Options opts;
  opts.quick_mode = false;
  opts.confidence_threshold = 0;
  opts.paired_end_processing = false;
  opts.single_file_pairs = false;
  opts.num_threads = 1;
  opts.mpa_style_report = false;
  opts.report_zero_counts = false;
  opts.use_translated_search = false;
  opts.print_scientific_name = false;
  opts.minimum_quality_score = 0;
  opts.use_memory_mapping = false;

  ParseCommandLine(argc, argv, opts);
  omp_set_num_threads(opts.num_threads);

  cerr << "Loading database information...";

  IndexOptions idx_opts = {0};
  ifstream idx_opt_fs(opts.options_filename);
  idx_opt_fs.read((char *) &idx_opts, sizeof(idx_opts));
  opts.use_translated_search = !
idx_opts.dna_db; Taxonomy taxonomy(opts.taxonomy_filename, opts.use_memory_mapping); KeyValueStore *hash_ptr = new CompactHashTable(opts.index_filename, opts.use_memory_mapping); cerr << " done." << endl; ClassificationStats stats = {0, 0, 0}; taxon_counts_t call_counts; OutputStreamData outputs = { false, false, nullptr, nullptr, nullptr, nullptr, &std::cout }; struct timeval tv1, tv2; gettimeofday(&tv1, nullptr); if (optind == argc) { if (opts.paired_end_processing && ! opts.single_file_pairs) errx(EX_USAGE, "paired end processing used with no files specified"); ProcessFiles(nullptr, nullptr, hash_ptr, taxonomy, idx_opts, opts, stats, outputs, call_counts); } else { for (int i = optind; i < argc; i++) { if (opts.paired_end_processing && ! opts.single_file_pairs) { if (i + 1 == argc) { errx(EX_USAGE, "paired end processing used with unpaired file"); } ProcessFiles(argv[i], argv[i+1], hash_ptr, taxonomy, idx_opts, opts, stats, outputs, call_counts); i += 1; } else { ProcessFiles(argv[i], nullptr, hash_ptr, taxonomy, idx_opts, opts, stats, outputs, call_counts); } } } gettimeofday(&tv2, nullptr); delete hash_ptr; ReportStats(tv1, tv2, stats); if (! opts.report_filename.empty()) { if (opts.mpa_style_report) ReportMpaStyle(opts.report_filename, opts.report_zero_counts, taxonomy, call_counts); else { auto total_unclassified = stats.total_sequences - stats.total_classified; ReportKrakenStyle(opts.report_filename, opts.report_zero_counts, taxonomy, call_counts, stats.total_sequences, total_unclassified); } } return 0; } void ReportStats(struct timeval time1, struct timeval time2, ClassificationStats &stats) { time2.tv_usec -= time1.tv_usec; time2.tv_sec -= time1.tv_sec; if (time2.tv_usec < 0) { time2.tv_sec--; time2.tv_usec += 1000000; } double seconds = time2.tv_usec; seconds /= 1e6; seconds += time2.tv_sec; uint64_t total_unclassified = stats.total_sequences - stats.total_classified; if (isatty(fileno(stderr))) cerr << "\r"; fprintf(stderr, "%llu sequences (%.2f Mbp) processed in %.3fs (%.1f Kseq/m, %.2f Mbp/m).\n", (unsigned long long) stats.total_sequences, stats.total_bases / 1.0e6, seconds, stats.total_sequences / 1.0e3 / (seconds / 60), stats.total_bases / 1.0e6 / (seconds / 60) ); fprintf(stderr, " %llu sequences classified (%.2f%%)\n", (unsigned long long) stats.total_classified, stats.total_classified * 100.0 / stats.total_sequences); fprintf(stderr, " %llu sequences unclassified (%.2f%%)\n", (unsigned long long) total_unclassified, total_unclassified * 100.0 / stats.total_sequences); } void ProcessFiles(const char *filename1, const char *filename2, KeyValueStore *hash, Taxonomy &tax, IndexOptions &idx_opts, Options &opts, ClassificationStats &stats, OutputStreamData &outputs, taxon_counts_t &call_counts) { std::istream *fptr1 = nullptr, *fptr2 = nullptr; if (filename1 == nullptr) fptr1 = &std::cin; else { fptr1 = new std::ifstream(filename1); } if (opts.paired_end_processing && ! 
opts.single_file_pairs) {
    fptr2 = new std::ifstream(filename2);
  }

  // The priority queue for output is designed to ensure fragment data
  // is output in the same order it was input
  auto comparator = [](const OutputData &a, const OutputData &b) {
    return a.block_id > b.block_id;
  };
  std::priority_queue<OutputData, vector<OutputData>, decltype(comparator)>
      output_queue(comparator);
  uint64_t next_input_block_id = 0;
  uint64_t next_output_block_id = 0;
  omp_lock_t output_lock;
  omp_init_lock(&output_lock);

  #pragma omp parallel
  {
    MinimizerScanner scanner(idx_opts.k, idx_opts.l, idx_opts.spaced_seed_mask,
                             idx_opts.dna_db, idx_opts.toggle_mask,
                             idx_opts.revcom_version);
    vector<taxid_t> taxa;
    taxon_counts_t hit_counts;
    ostringstream kraken_oss, c1_oss, c2_oss, u1_oss, u2_oss;
    ClassificationStats thread_stats = {0, 0, 0};
    vector<taxid_t> calls;
    vector<string> translated_frames(6);
    BatchSequenceReader reader1, reader2;
    Sequence seq1, seq2;
    uint64_t block_id;
    OutputData out_data;

    while (true) {
      thread_stats.total_sequences = 0;
      thread_stats.total_bases = 0;
      thread_stats.total_classified = 0;

      auto ok_read = false;

      #pragma omp critical(seqread)
      {  // Input processing block
        if (! opts.paired_end_processing) {
          // Unpaired data?  Just read in a sized block
          ok_read = reader1.LoadBlock(*fptr1, (size_t)(3 * 1024 * 1024));
        }
        else if (! opts.single_file_pairs) {
          // Paired data in 2 files?  Read a line-counted batch from each file.
          ok_read = reader1.LoadBatch(*fptr1, NUM_FRAGMENTS_PER_THREAD);
          if (ok_read && opts.paired_end_processing)
            ok_read = reader2.LoadBatch(*fptr2, NUM_FRAGMENTS_PER_THREAD);
        }
        else {
          auto frags = NUM_FRAGMENTS_PER_THREAD * 2;
          // Ensure frag count is even - just in case above line is changed
          if (frags % 2 == 1)
            frags++;
          ok_read = reader1.LoadBatch(*fptr1, frags);
        }
        block_id = next_input_block_id++;
      }

      if (! ok_read)
        break;

      // Reset all dynamically-growing things
      calls.clear();
      kraken_oss.str("");
      c1_oss.str("");
      c2_oss.str("");
      u1_oss.str("");
      u2_oss.str("");

      while (true) {
        auto valid_fragment = reader1.NextSequence(seq1);
        if (opts.paired_end_processing && valid_fragment) {
          if (opts.single_file_pairs)
            valid_fragment = reader1.NextSequence(seq2);
          else
            valid_fragment = reader2.NextSequence(seq2);
        }
        if (! valid_fragment)
          break;
        thread_stats.total_sequences++;
        if (opts.minimum_quality_score > 0) {
          MaskLowQualityBases(seq1, opts.minimum_quality_score);
          if (opts.paired_end_processing)
            MaskLowQualityBases(seq2, opts.minimum_quality_score);
        }
        auto call = ClassifySequence(seq1, seq2, kraken_oss, hash, tax,
                                     idx_opts, opts, thread_stats, scanner,
                                     taxa, hit_counts, translated_frames);
        if (call) {
          char buffer[1024] = "";
          sprintf(buffer, " kraken:taxid|%lu", tax.nodes()[call].external_id);
          seq1.header += buffer;
          seq2.header += buffer;
          c1_oss << seq1.to_string();
          if (opts.paired_end_processing)
            c2_oss << seq2.to_string();
        }
        else {
          u1_oss << seq1.to_string();
          if (opts.paired_end_processing)
            u2_oss << seq2.to_string();
        }
        calls.push_back(call);
        thread_stats.total_bases += seq1.seq.size();
        if (opts.paired_end_processing)
          thread_stats.total_bases += seq2.seq.size();
      }

      #pragma omp atomic
      stats.total_sequences += thread_stats.total_sequences;
      #pragma omp atomic
      stats.total_bases += thread_stats.total_bases;
      #pragma omp atomic
      stats.total_classified += thread_stats.total_classified;

      #pragma omp critical(output_stats)
      {
        if (isatty(fileno(stderr)))
          cerr << "\rProcessed " << stats.total_sequences << " sequences ("
               << stats.total_bases << " bp) ...";
      }

      #pragma omp critical(update_calls)
      {
        for (auto &call : calls) {
          call_counts[call]++;
        }
      }

      if (!
outputs.initialized) { InitializeOutputs(opts, outputs, reader1.file_format()); } out_data.block_id = block_id; out_data.kraken_str.assign(kraken_oss.str()); out_data.classified_out1_str.assign(c1_oss.str()); out_data.classified_out2_str.assign(c2_oss.str()); out_data.unclassified_out1_str.assign(u1_oss.str()); out_data.unclassified_out2_str.assign(u2_oss.str()); #pragma omp critical(output_queue) { output_queue.push(out_data); } bool output_loop = true; while (output_loop) { #pragma omp critical(output_queue) { output_loop = ! output_queue.empty(); if (output_loop) { out_data = output_queue.top(); if (out_data.block_id == next_output_block_id) { output_queue.pop(); // Acquiring output lock obligates thread to print out // next output data block, contained in out_data omp_set_lock(&output_lock); next_output_block_id++; } else output_loop = false; } } if (! output_loop) break; // Past this point in loop, we know lock is set if (outputs.kraken_output != nullptr) (*outputs.kraken_output) << out_data.kraken_str; if (outputs.classified_output1 != nullptr) (*outputs.classified_output1) << out_data.classified_out1_str; if (outputs.classified_output2 != nullptr) (*outputs.classified_output2) << out_data.classified_out2_str; if (outputs.unclassified_output1 != nullptr) (*outputs.unclassified_output1) << out_data.unclassified_out1_str; if (outputs.unclassified_output2 != nullptr) (*outputs.unclassified_output2) << out_data.unclassified_out2_str; omp_unset_lock(&output_lock); } // end while output loop } // end while } // end parallel block omp_destroy_lock(&output_lock); if (fptr1 != nullptr) delete fptr1; if (fptr2 != nullptr) delete fptr2; if (outputs.kraken_output != nullptr) (*outputs.kraken_output) << std::flush; if (outputs.classified_output1 != nullptr) (*outputs.classified_output1) << std::flush; if (outputs.classified_output2 != nullptr) (*outputs.classified_output2) << std::flush; if (outputs.unclassified_output1 != nullptr) (*outputs.unclassified_output1) << std::flush; if (outputs.unclassified_output2 != nullptr) (*outputs.unclassified_output2) << std::flush; } taxid_t ResolveTree(taxon_counts_t &hit_counts, Taxonomy &taxonomy, size_t total_minimizers, Options &opts) { taxid_t max_taxon = 0; uint32_t max_score = 0; uint32_t required_score = ceil(opts.confidence_threshold * total_minimizers); // Sum each taxon's LTR path, find taxon with highest LTR score for (auto &kv_pair : hit_counts) { taxid_t taxon = kv_pair.first; uint32_t score = 0; for (auto &kv_pair2 : hit_counts) { taxid_t taxon2 = kv_pair2.first; if (taxonomy.IsAAncestorOfB(taxon2, taxon)) { score += kv_pair2.second; } } if (score > max_score) { max_score = score; max_taxon = taxon; } else if (score == max_score) { max_taxon = taxonomy.LowestCommonAncestor(max_taxon, taxon); } } // Reset max. 
// score to be only hits at the called taxon
  max_score = hit_counts[max_taxon];
  // We probably have a call w/o required support (unless LCA resolved tie)
  while (max_taxon && max_score < required_score) {
    max_score = 0;
    for (auto &kv_pair : hit_counts) {
      taxid_t taxon = kv_pair.first;
      // Add to score if taxon in max_taxon's clade
      if (taxonomy.IsAAncestorOfB(max_taxon, taxon)) {
        max_score += kv_pair.second;
      }
    }
    // Score is now sum of hits at max_taxon and w/in max_taxon clade
    if (max_score >= required_score)
      // Kill loop and return, we've got enough support here
      return max_taxon;
    else
      // Run up tree until confidence threshold is met
      // Run off tree if required score isn't met
      max_taxon = taxonomy.nodes()[max_taxon].parent_id;
  }

  return max_taxon;
}
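// Illustrative worked example, not part of the original source, with made-up
// taxa: suppose confidence_threshold = 0.5 and total_minimizers = 10, so
// required_score = ceil(0.5 * 10) = 5. With hits {species S: 3, S's genus
// G: 1, unrelated taxon U: 1}, S has the best root-to-taxon score (3 + 1 = 4),
// but 4 < 5, so the loop above climbs: the S clade sums to 3 and the G clade
// to 4, and only an ancestor whose clade also contains U reaches 5. The read
// is therefore called at that ancestor, or left unclassified if the climb
// runs off the root before the threshold is met.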
std::string TrimPairInfo(std::string &id) {
  size_t sz = id.size();
  if (sz <= 2)
    return id;
  if ( (id[sz - 2] == '_' || ispunct(id[sz - 2])) && isdigit(id[sz - 1]) )
    return id.substr(0, sz - 2);
  return id;
}

taxid_t ClassifySequence(Sequence &dna, Sequence &dna2, ostringstream &koss,
    KeyValueStore *hash, Taxonomy &taxonomy, IndexOptions &idx_opts,
    Options &opts, ClassificationStats &stats, MinimizerScanner &scanner,
    vector<taxid_t> &taxa, taxon_counts_t &hit_counts,
    vector<string> &tx_frames)
{
  uint64_t *minimizer_ptr;
  taxid_t call = 0;
  taxa.clear();
  hit_counts.clear();
  auto frame_ct = opts.use_translated_search ? 6 : 1;

  for (int mate_num = 0; mate_num < 2; mate_num++) {
    if (mate_num == 1 && ! opts.paired_end_processing)
      break;

    if (opts.use_translated_search) {
      TranslateToAllFrames(mate_num == 0 ? dna.seq : dna2.seq, tx_frames);
    }

    // index of frame is 0 - 5 w/ tx search (or 0 if no tx search)
    for (int frame_idx = 0; frame_idx < frame_ct; frame_idx++) {
      if (opts.use_translated_search) {
        scanner.LoadSequence(tx_frames[frame_idx]);
      }
      else {
        scanner.LoadSequence(mate_num == 0 ? dna.seq : dna2.seq);
      }
      uint64_t last_minimizer = UINT64_MAX;
      taxid_t last_taxon = TAXID_MAX;
      bool ambig_flag = false;
      while ((minimizer_ptr = scanner.NextMinimizer(&ambig_flag)) != nullptr) {
        taxid_t taxon;
        if (ambig_flag) {
          taxon = AMBIGUOUS_SPAN_TAXON;
        }
        else {
          if (*minimizer_ptr != last_minimizer) {
            bool skip_lookup = false;
            if (idx_opts.minimum_acceptable_hash_value) {
              if (MurmurHash3(*minimizer_ptr) < idx_opts.minimum_acceptable_hash_value)
                skip_lookup = true;
            }
            taxon = 0;
            if (! skip_lookup)
              taxon = hash->Get(*minimizer_ptr);
            last_taxon = taxon;
            last_minimizer = *minimizer_ptr;
          }
          else {
            taxon = last_taxon;
          }
          if (taxon) {
            if (opts.quick_mode) {
              call = taxon;
              goto finished_searching;  // need to break 3 loops here
            }
            hit_counts[taxon]++;
          }
        }
        taxa.push_back(taxon);
      }
      if (opts.use_translated_search && frame_idx != 5)
        taxa.push_back(READING_FRAME_BORDER_TAXON);
    }
    if (opts.paired_end_processing && mate_num == 0)
      taxa.push_back(MATE_PAIR_BORDER_TAXON);
  }

  finished_searching:

  auto total_kmers = taxa.size();
  if (opts.paired_end_processing)
    total_kmers--;  // account for the mate pair marker
  if (opts.use_translated_search)  // account for reading frame markers
    total_kmers -= opts.paired_end_processing ? 4 : 2;
  if (! opts.quick_mode)
    call = ResolveTree(hit_counts, taxonomy, total_kmers, opts);
  if (call)
    stats.total_classified++;

  if (call)
    koss << "C\t";
  else
    koss << "U\t";
  if (! opts.paired_end_processing)
    koss << dna.id << "\t";
  else
    koss << TrimPairInfo(dna.id) << "\t";

  auto ext_call = taxonomy.nodes()[call].external_id;
  if (opts.print_scientific_name) {
    const char *name = nullptr;
    if (call) {
      name = taxonomy.name_data() + taxonomy.nodes()[call].name_offset;
    }
    koss << (name ? name : "unclassified") << " (taxid " << ext_call << ")";
  }
  else {
    koss << ext_call;
  }
  koss << "\t";
  if (! opts.paired_end_processing)
    koss << dna.seq.size() << "\t";
  else
    koss << dna.seq.size() << "|" << dna2.seq.size() << "\t";

  if (opts.quick_mode) {
    koss << call << ":Q";
  }
  else {
    if (taxa.empty())
      koss << "0:0";
    else
      AddHitlistString(koss, taxa, taxonomy);
  }
  koss << endl;

  return call;
}

void AddHitlistString(ostringstream &oss, vector<taxid_t> &taxa,
    Taxonomy &taxonomy)
{
  auto last_code = taxa[0];
  auto code_count = 1;

  for (size_t i = 1; i < taxa.size(); i++) {
    auto code = taxa[i];
    if (code == last_code) {
      code_count += 1;
    }
    else {
      if (last_code != MATE_PAIR_BORDER_TAXON && last_code != READING_FRAME_BORDER_TAXON) {
        if (last_code == AMBIGUOUS_SPAN_TAXON) {
          oss << "A:" << code_count << " ";
        }
        else {
          auto ext_code = taxonomy.nodes()[last_code].external_id;
          oss << ext_code << ":" << code_count << " ";
        }
      }
      else {  // mate pair/reading frame marker
        oss << (last_code == MATE_PAIR_BORDER_TAXON ? "|:| " : "-:- ");
      }
      code_count = 1;
      last_code = code;
    }
  }
  if (last_code != MATE_PAIR_BORDER_TAXON && last_code != READING_FRAME_BORDER_TAXON) {
    if (last_code == AMBIGUOUS_SPAN_TAXON) {
      oss << "A:" << code_count << " ";
    }
    else {
      auto ext_code = taxonomy.nodes()[last_code].external_id;
      oss << ext_code << ":" << code_count;
    }
  }
  else {  // mate pair/reading frame marker
    oss << (last_code == MATE_PAIR_BORDER_TAXON ? "|:|" : "-:-");
  }
}

void InitializeOutputs(Options &opts, OutputStreamData &outputs,
    SequenceFormat format)
{
  #pragma omp critical(output_init)
  {
    if (! outputs.initialized) {
      if (! opts.classified_output_filename.empty()) {
        if (opts.paired_end_processing) {
          vector<string> fields = SplitString(opts.classified_output_filename, "#", 3);
          if (fields.size() < 2) {
            errx(EX_DATAERR, "Paired filename format missing # character: %s",
                 opts.classified_output_filename.c_str());
          }
          else if (fields.size() > 2) {
            errx(EX_DATAERR, "Paired filename format has >1 # character: %s",
                 opts.classified_output_filename.c_str());
          }
          outputs.classified_output1 = new ofstream(fields[0] + "_1" + fields[1]);
          outputs.classified_output2 = new ofstream(fields[0] + "_2" + fields[1]);
        }
        else
          outputs.classified_output1 = new ofstream(opts.classified_output_filename);
        outputs.printing_sequences = true;
      }
      if (! opts.unclassified_output_filename.empty()) {
        if (opts.paired_end_processing) {
          vector<string> fields = SplitString(opts.unclassified_output_filename, "#", 3);
          if (fields.size() < 2) {
            errx(EX_DATAERR, "Paired filename format missing # character: %s",
                 opts.unclassified_output_filename.c_str());
          }
          else if (fields.size() > 2) {
            errx(EX_DATAERR, "Paired filename format has >1 # character: %s",
                 opts.unclassified_output_filename.c_str());
          }
          outputs.unclassified_output1 = new ofstream(fields[0] + "_1" + fields[1]);
          outputs.unclassified_output2 = new ofstream(fields[0] + "_2" + fields[1]);
        }
        else
          outputs.unclassified_output1 = new ofstream(opts.unclassified_output_filename);
        outputs.printing_sequences = true;
      }
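      // Illustrative note, not part of the original source: the '#' in a
      // paired filename template marks where the mate number goes, e.g. a
      // hypothetical "-C cseqs#.fq" splits into fields {"cseqs", ".fq"} and
      // produces cseqs_1.fq and cseqs_2.fq via fields[0] + "_1"/"_2" + fields[1].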
      if (! opts.kraken_output_filename.empty()) {
        if (opts.kraken_output_filename == "-")  // Special filename to silence Kraken output
          outputs.kraken_output = nullptr;
        else
          outputs.kraken_output = new ofstream(opts.kraken_output_filename);
      }
      outputs.initialized = true;
    }
  }
}

void MaskLowQualityBases(Sequence &dna, int minimum_quality_score) {
  if (dna.format != FORMAT_FASTQ)
    return;
  if (dna.seq.size() != dna.quals.size())
    errx(EX_DATAERR, "%s: Sequence length (%d) != Quality string length (%d)",
         dna.id.c_str(), (int) dna.seq.size(), (int) dna.quals.size());
  for (size_t i = 0; i < dna.seq.size(); i++) {
    if ((dna.quals[i] - '!') < minimum_quality_score)
      dna.seq[i] = 'x';
  }
}

void ParseCommandLine(int argc, char **argv, Options &opts) {
  int opt;

  while ((opt = getopt(argc, argv, "h?H:t:o:T:p:R:C:U:O:Q:nmzqPSM")) != -1) {
    switch (opt) {
      case 'h' : case '?' :
        usage(0);
        break;
      case 'H' :
        opts.index_filename = optarg;
        break;
      case 't' :
        opts.taxonomy_filename = optarg;
        break;
      case 'T' :
        opts.confidence_threshold = std::stod(optarg);
        if (opts.confidence_threshold < 0 || opts.confidence_threshold > 1) {
          errx(EX_USAGE, "confidence threshold must be in [0, 1]");
        }
        break;
      case 'o' :
        opts.options_filename = optarg;
        break;
      case 'q' :
        opts.quick_mode = true;
        break;
      case 'p' :
        opts.num_threads = atoi(optarg);
        if (opts.num_threads < 1)
          errx(EX_USAGE, "number of threads can't be less than 1");
        break;
      case 'P' :
        opts.paired_end_processing = true;
        break;
      case 'S' :
        opts.paired_end_processing = true;
        opts.single_file_pairs = true;
        break;
      case 'm' :
        opts.mpa_style_report = true;
        break;
      case 'R' :
        opts.report_filename = optarg;
        break;
      case 'z' :
        opts.report_zero_counts = true;
        break;
      case 'C' :
        opts.classified_output_filename = optarg;
        break;
      case 'U' :
        opts.unclassified_output_filename = optarg;
        break;
      case 'O' :
        opts.kraken_output_filename = optarg;
        break;
      case 'n' :
        opts.print_scientific_name = true;
        break;
      case 'Q' :
        opts.minimum_quality_score = atoi(optarg);
        break;
      case 'M' :
        opts.use_memory_mapping = true;
        break;
    }
  }

  if (opts.index_filename.empty() ||
      opts.taxonomy_filename.empty() ||
      opts.options_filename.empty())
  {
    warnx("mandatory filename missing");
    usage();
  }

  if (opts.mpa_style_report && opts.report_filename.empty()) {
    warnx("-m requires -R be used");
    usage();
  }
}

void usage(int exit_code) {
  cerr << "Usage: classify [options] <fasta/fastq file(s)>" << endl
       << endl
       << "Options: (*mandatory)" << endl
       << "* -H filename Kraken 2 index filename" << endl
       << "* -t filename Kraken 2 taxonomy filename" << endl
       << "* -o filename Kraken 2 options filename" << endl
       << " -q Quick mode" << endl
       << " -M Use memory mapping to access hash & taxonomy" << endl
       << " -T NUM Confidence score threshold (def. 0)" << endl
       << " -p NUM Number of threads (def. 1)" << endl
       << " -Q NUM Minimum quality score (FASTQ only, def. 0)" << endl
       << " -P Process pairs of reads" << endl
       << " -S Process pairs with mates in same file" << endl
       << " -R filename Print report to filename" << endl
       << " -m In comb. w/ -R, use mpa-style report" << endl
       << " -z In comb. 
w/ -R, report taxa w/ 0 count" << endl << " -n Print scientific name instead of taxid in Kraken output" << endl << " -C filename Filename/format to have classified sequences" << endl << " -U filename Filename/format to have unclassified sequences" << endl << " -O filename Output file for normal Kraken output" << endl; exit(exit_code); } kraken2-2.0.8-beta/src/compact_hash.cc000066400000000000000000000152731346023071100175230ustar00rootroot00000000000000/* * Copyright 2013-2019, Derrick Wood * * This file is part of the Kraken 2 taxonomic sequence classification system. */ #include "compact_hash.h" using std::string; using std::ofstream; using std::unordered_map; namespace kraken2 { CompactHashTable::CompactHashTable(size_t capacity, size_t key_bits, size_t value_bits) : capacity_(capacity), key_bits_(key_bits), value_bits_(value_bits), file_backed_(false) { if (key_bits + value_bits != sizeof(*table_) * 8) errx(EX_SOFTWARE, "sum of key bits and value bits must equal %u", (unsigned int) (sizeof(*table_) * 8)); if (key_bits == 0) errx(EX_SOFTWARE, "key bits cannot be zero"); if (value_bits == 0) errx(EX_SOFTWARE, "value bits cannot be zero"); for (size_t i = 0; i < LOCK_ZONES; i++) omp_init_lock(&zone_locks_[i]); size_ = 0; try { table_ = new CompactHashCell[capacity_]; } catch (std::bad_alloc ex) { std::cerr << "Failed attempt to allocate " << (sizeof(*table_) * capacity_) << "bytes;\n" << "you may not have enough free memory to build this database.\n" << "Perhaps increasing the k-mer length, or reducing memory usage from\n" << "other programs could help you build this database?" << std::endl; errx(EX_OSERR, "unable to allocate hash table memory"); } memset(table_, 0, capacity_ * sizeof(*table_)); } CompactHashTable::~CompactHashTable() { if (! file_backed_) { delete[] table_; for (size_t i = 0; i < LOCK_ZONES; i++) omp_destroy_lock(&zone_locks_[i]); } } CompactHashTable::CompactHashTable(const string &filename, bool memory_mapping) { LoadTable(filename.c_str(), memory_mapping); } CompactHashTable::CompactHashTable(const char *filename, bool memory_mapping) { LoadTable(filename, memory_mapping); } void CompactHashTable::LoadTable(const char *filename, bool memory_mapping) { if (memory_mapping) { backing_file_.OpenFile(filename); char *ptr = backing_file_.fptr(); memcpy((char *) &capacity_, ptr, sizeof(capacity_)); ptr += sizeof(capacity_); memcpy((char *) &size_, ptr, sizeof(size_)); ptr += sizeof(size_); memcpy((char *) &key_bits_, ptr, sizeof(key_bits_)); ptr += sizeof(key_bits_); memcpy((char *) &value_bits_, ptr, sizeof(value_bits_)); ptr += sizeof(value_bits_); table_ = (CompactHashCell *) ptr; if (backing_file_.filesize() - (ptr - backing_file_.fptr()) != sizeof(*table_) * capacity_) { errx(EX_DATAERR, "Capacity mismatch in %s, aborting", filename); } file_backed_ = true; } else { std::ifstream ifs(filename); ifs.read((char *) &capacity_, sizeof(capacity_)); ifs.read((char *) &size_, sizeof(size_)); ifs.read((char *) &key_bits_, sizeof(key_bits_)); ifs.read((char *) &value_bits_, sizeof(value_bits_)); try { table_ = new CompactHashCell[capacity_]; } catch (std::bad_alloc ex) { std::cerr << "Failed attempt to allocate " << (sizeof(*table_) * capacity_) << "bytes;\n" << "you may not have enough free memory to load this database.\n" << "If your computer has enough RAM, perhaps reducing memory usage from\n" << "other programs could help you load this database?" 
<< std::endl; errx(EX_OSERR, "unable to allocate hash table memory"); } ifs.read((char *) table_, capacity_ * sizeof(*table_)); if (! ifs) errx(EX_OSERR, "Error reading in hash table"); file_backed_ = false; } } void CompactHashTable::WriteTable(const char *filename) { ofstream ofs(filename, ofstream::binary); ofs.write((char *) &capacity_, sizeof(capacity_)); ofs.write((char *) &size_, sizeof(size_)); ofs.write((char *) &key_bits_, sizeof(key_bits_)); ofs.write((char *) &value_bits_, sizeof(value_bits_)); ofs.write((char *) table_, sizeof(*table_) * capacity_); ofs.close(); } hvalue_t CompactHashTable::Get(hkey_t key) { uint64_t hc = MurmurHash3(key); uint64_t compacted_key = hc >> (32 + value_bits_); size_t idx = hc % capacity_; size_t first_idx = idx; size_t step = 0; while (true) { if (! table_[idx].value(value_bits_)) // value of 0 means data is 0, saves work break; // search over, empty cell encountered in probe if (table_[idx].hashed_key(value_bits_) == compacted_key) return table_[idx].value(value_bits_); if (step == 0) step = second_hash(hc); idx += step; idx %= capacity_; if (idx == first_idx) break; // search over, we've exhausted the table } return 0; } bool CompactHashTable::CompareAndSet (hkey_t key, hvalue_t new_value, hvalue_t *old_value) { if (file_backed_) return false; if (new_value == 0) return false; uint64_t hc = MurmurHash3(key); hkey_t compacted_key = hc >> (32 + value_bits_); size_t idx, first_idx; bool set_successful = false; bool search_successful = false; idx = first_idx = hc % capacity_; size_t step = 0; while (! search_successful) { size_t zone = idx % LOCK_ZONES; omp_set_lock(&zone_locks_[zone]); if (! table_[idx].value(value_bits_) || table_[idx].hashed_key(value_bits_) == compacted_key) { search_successful = true; #pragma omp flush if (*old_value == table_[idx].value(value_bits_)) { table_[idx].populate(key, new_value, key_bits_, value_bits_); if (! *old_value) { #pragma omp atomic size_++; } set_successful = true; } else { *old_value = table_[idx].value(value_bits_); } } omp_unset_lock(&zone_locks_[zone]); if (step == 0) step = second_hash(hc); idx += step; idx %= capacity_; if (idx == first_idx) errx(EX_SOFTWARE, "compact hash table capacity exceeded"); } return set_successful; } // Linear probing may be ok for accuracy, as long as occupancy is < 95% // Linear probing leads to more clustering, longer probing paths, and // higher probability of a false answer // Double hashing can have shorter probing paths, but less cache efficiency inline uint64_t CompactHashTable::second_hash(uint64_t first_hash) { #ifdef LINEAR_PROBING return 1; #else // Double hashing return (first_hash >> 8) | 1; #endif } taxon_counts_t CompactHashTable::GetValueCounts() { taxon_counts_t value_counts; int thread_ct = omp_get_max_threads(); taxon_counts_t thread_value_counts[thread_ct]; #pragma omp parallel for for (auto i = 0u; i < capacity_; i++) { auto val = table_[i].value(value_bits_); if (val) thread_value_counts[omp_get_thread_num()][val]++; } for (auto i = 0; i < thread_ct; i++) { for (auto &kv_pair : thread_value_counts[i]) value_counts[kv_pair.first] += kv_pair.second; } return value_counts; } } // end namespace kraken2-2.0.8-beta/src/compact_hash.h000066400000000000000000000061661346023071100173660ustar00rootroot00000000000000/* * Copyright 2013-2019, Derrick Wood * * This file is part of the Kraken 2 taxonomic sequence classification system. 
*/ #ifndef KRAKEN2_COMPACT_HASH_H_ #define KRAKEN2_COMPACT_HASH_H_ #include "kv_store.h" #include "mmap_file.h" #include "kraken2_headers.h" #include "kraken2_data.h" namespace kraken2 { struct CompactHashCell { inline hkey_t hashed_key(size_t value_bits) { return (hkey_t) (data >> value_bits); } inline hvalue_t value(size_t value_bits) { return (hvalue_t) (data & ((1 << value_bits) - 1)); } void populate(hkey_t key, hvalue_t val, size_t key_bits, size_t value_bits) { if (key_bits + value_bits != 32) errx(EX_SOFTWARE, "key len of %u and value len of %u don't sum to 32", (unsigned int) key_bits, (unsigned int) value_bits); if (! key_bits || ! value_bits) errx(EX_SOFTWARE, "key len and value len must be nonzero"); uint64_t max_value = (1llu << value_bits) - 1; if (max_value < val) errx(EX_SOFTWARE, "value len of %u too small for value of %llu", (unsigned int) value_bits, (unsigned long long int) val); data = (uint32_t) (MurmurHash3(key) >> 32); data &= ~((1 << value_bits) - 1); data |= val; } uint32_t data; }; /** Implementation of a compact, fixed-size probabilistic hash table. Operations supported: - search - compare-and-set Does NOT support: - values of 0 Keys must be 64-bit unsigned integers, < 2 ** 63. Values must be 32-bit unsigned integers, > 0. Cells are 32-bit unsigned integers, with truncated hashed keys stored in the most significant bits, and truncated values stored in the least significant bits. Compaction level is set when the table is created by the WriteCompactedTable() method of the HashTable class. **/ class CompactHashTable : public KeyValueStore { public: CompactHashTable(size_t capacity, size_t key_bits, size_t value_bits); CompactHashTable(const std::string &filename, bool memory_mapping=false); CompactHashTable(const char *filename, bool memory_mapping=false); ~CompactHashTable(); hvalue_t Get(hkey_t key); // How CompareAndSet works: // if *old_value == CHT[key] // CHT[key] = new_value // return true // else // *old_value = CHT[key] // return false bool CompareAndSet(hkey_t key, hvalue_t new_value, hvalue_t *old_value); void WriteTable(const char *filename); taxon_counts_t GetValueCounts(); size_t capacity() { return capacity_; } size_t size() { return size_; } size_t key_bits() { return key_bits_; } size_t value_bits() { return value_bits_; } double occupancy() { return size_ * 1.0 / capacity_; } private: static const size_t LOCK_ZONES = 256; size_t capacity_; size_t size_; //size_t index_mask_; size_t key_bits_; size_t value_bits_; CompactHashCell *table_; bool file_backed_; MMapFile backing_file_; omp_lock_t zone_locks_[LOCK_ZONES]; CompactHashTable(const CompactHashTable &rhs); CompactHashTable& operator=(const CompactHashTable &rhs); void LoadTable(const char *filename, bool memory_mapping); uint64_t second_hash(uint64_t first_hash); }; } // end namespace #endif kraken2-2.0.8-beta/src/dump_table.cc000066400000000000000000000103361346023071100172010ustar00rootroot00000000000000/* * Copyright 2013-2019, Derrick Wood * * This file is part of the Kraken 2 taxonomic sequence classification system. 
*/ #include "kraken2_headers.h" #include "compact_hash.h" #include "taxonomy.h" #include "mmscanner.h" #include "kraken2_data.h" #include "reports.h" using std::string; using std::unordered_map; using std::cout; using std::cerr; using std::endl; using namespace kraken2; struct Options { string hashtable_filename; string taxonomy_filename; string options_filename; string output_filename; bool use_mpa_style; bool report_zeros; bool skip_counts; unsigned int num_threads; }; void ParseCommandLine(int argc, char **argv, Options &opts); void usage(int exit_code = EX_USAGE); std::string mask2str(uint64_t mask, int digits) { std::string str; for (int i = digits - 1; i >= 0; i--) str.push_back((mask >> i) & 1 ? '1' : '0'); return str; } int main(int argc, char **argv) { Options opts; opts.report_zeros = false; opts.output_filename = "/dev/fd/1"; opts.use_mpa_style = false; opts.skip_counts = false; opts.num_threads = 1; ParseCommandLine(argc, argv, opts); omp_set_num_threads(opts.num_threads); CompactHashTable kraken_index(opts.hashtable_filename); Taxonomy taxonomy(opts.taxonomy_filename); IndexOptions idx_opts = {0}; std::ifstream idx_opt_fs(opts.options_filename); idx_opt_fs.read((char *) &idx_opts, sizeof(idx_opts)); std::cout << "# Database options: " << (idx_opts.dna_db ? "nucleotide" : "protein") << " db, " << "k = " << idx_opts.k << ", " << "l = " << idx_opts.l << "\n" << "# Spaced mask = " << mask2str(idx_opts.spaced_seed_mask, idx_opts.l * (idx_opts.dna_db ? BITS_PER_CHAR_DNA : BITS_PER_CHAR_PRO)) << "\n" << "# Toggle mask = " << mask2str(idx_opts.toggle_mask, 64) << "\n" << "# Total taxonomy nodes: " << taxonomy.node_count() << "\n" << "# Table size: " << kraken_index.size() << "\n" << "# Table capacity: " << kraken_index.capacity() << "\n" << "# Min clear hash value = " << idx_opts.minimum_acceptable_hash_value << "\n"; if (idx_opts.revcom_version != CURRENT_REVCOM_VERSION) std::cout << "# Built with outdated revcom version\n"; std::cout << std::flush; if (! opts.skip_counts) { taxon_counts_t taxid_counts = kraken_index.GetValueCounts(); uint64_t total_seqs = 0; for (auto &kv_pair : taxid_counts) { total_seqs += kv_pair.second; } if (opts.use_mpa_style) ReportMpaStyle(opts.output_filename, opts.report_zeros, taxonomy, taxid_counts); else ReportKrakenStyle(opts.output_filename, opts.report_zeros, taxonomy, taxid_counts, total_seqs, 0); } return 0; } void ParseCommandLine(int argc, char **argv, Options &opts) { int opt; while ((opt = getopt(argc, argv, "?hH:t:o:O:p:zms")) != -1) { switch (opt) { case 'h' : case '?' 
: usage(0); break;
      case 'H' :
        opts.hashtable_filename = optarg;
        break;
      case 't' :
        opts.taxonomy_filename = optarg;
        break;
      case 'o' :
        opts.options_filename = optarg;
        break;
      case 'z' :
        opts.report_zeros = true;
        break;
      case 'O' :
        opts.output_filename = optarg;
        break;
      case 'm' :
        opts.use_mpa_style = true;
        break;
      case 's' :
        opts.skip_counts = true;
        break;
      case 'p' :
        opts.num_threads = atoi(optarg);
        break;
    }
  }

  if (opts.hashtable_filename.empty() || opts.taxonomy_filename.empty() ||
      opts.options_filename.empty())
  {
    cerr << "missing mandatory filename parameter" << endl;
    usage();
  }
}

void usage(int exit_code) {
  cerr << "Usage: dump_table <options>\n"
       << "\n"
       << "Options (*mandatory):\n"
       << "* -H FILENAME Kraken 2 hash table filename\n"
       << "* -t FILENAME Kraken 2 taxonomy filename\n"
       << "* -o FILENAME Kraken 2 database options filename\n"
       << " -O FILENAME Output filename (def: /dev/fd/1)\n"
       << " -m Use MPA style output instead of Kraken 2 output\n"
       << " -s Skip reporting minimizer counts, just show DB stats\n"
       << " -z Report taxa with zero counts\n";
  exit(exit_code);
}
kraken2-2.0.8-beta/src/estimate_capacity.cc000066400000000000000000000122501346023071100205520ustar00rootroot00000000000000/*
 * Copyright 2013-2019, Derrick Wood
 *
 * This file is part of the Kraken 2 taxonomic sequence classification system.
 */

#include "kraken2_headers.h"
#include "kv_store.h"
#include "mmscanner.h"
#include "seqreader.h"
#include "utilities.h"

using std::string;
using std::cout;
using std::queue;
using std::cerr;
using std::endl;
using std::ifstream;
using std::ofstream;
using std::unordered_set;
using std::vector;
using namespace kraken2;

#define RANGE_SECTIONS 1024  // must be power of 2
#define RANGE_MASK (RANGE_SECTIONS - 1)
#define MAX_N RANGE_SECTIONS
#define DEFAULT_N 4
#define DEFAULT_BLOCK_SIZE (30 * 1024 * 1024)  // yes, 30 MB

struct Options {
  size_t k, l, n;
  bool input_is_protein;
  int threads;
  size_t block_size;
  uint64_t spaced_seed_mask;
  uint64_t toggle_mask;
};

void ParseCommandLine(int argc, char **argv, Options &opts);
void usage(int exit_code = EX_USAGE);
void ProcessSequence(string &seq, Options &opts,
    vector<unordered_set<uint64_t>> &sets);
void ProcessSequences(Options &opts);

int main(int argc, char **argv) {
  Options opts;
  opts.k = 0;
  opts.l = 0;
  opts.n = DEFAULT_N;
  opts.threads = 1;
  opts.input_is_protein = false;
  opts.spaced_seed_mask = DEFAULT_SPACED_SEED_MASK;
  opts.toggle_mask = DEFAULT_TOGGLE_MASK;
  opts.block_size = DEFAULT_BLOCK_SIZE;

  ParseCommandLine(argc, argv, opts);
  omp_set_num_threads(opts.threads);
  ProcessSequences(opts);

  return 0;
}

void ProcessSequences(Options &opts) {
  vector<unordered_set<uint64_t>> sets(opts.n);

  #pragma omp parallel
  {
    bool have_work = true;
    BatchSequenceReader reader;
    Sequence sequence;

    while (have_work) {
      #pragma omp critical(batch_reading)
      have_work = reader.LoadBlock(std::cin, opts.block_size);
      if (have_work)
        while (reader.NextSequence(sequence))
          ProcessSequence(sequence.seq, opts, sets);
    }
  }

  size_t sum_set_sizes = 0;
  for (auto &s : sets) {
    sum_set_sizes += s.size();
  }
  sum_set_sizes++;  // ensure non-zero estimate
  cout << (size_t) (sum_set_sizes * RANGE_SECTIONS * 1.0 / opts.n) << endl;
}
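// Illustrative arithmetic, not part of the original source: with the default
// n = 4 and RANGE_SECTIONS = 1024, only minimizers whose hash lands in the
// lowest 4 of 1024 hash sections are collected, i.e. roughly 4/1024 of all
// distinct minimizers. If the four sets above end up holding 1,000 distinct
// minimizers in total, the printed capacity estimate is
// (1000 + 1) * 1024 / 4 = 256,256 distinct minimizers overall.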
void ParseCommandLine(int argc, char **argv, Options &opts) {
  int opt;
  long long sig;

  while ((opt = getopt(argc, argv, "?hk:l:n:S:T:B:p:X")) != -1) {
    switch (opt) {
      case 'h' : case '?' : usage(0); break;
      case 'p' :
        sig = atoll(optarg);
        if (sig < 1)
          errx(EX_USAGE, "must have at least 1 thread");
        opts.threads = sig;
        break;
      case 'B' :
        sig = atoll(optarg);
        if (sig < 1)
          errx(EX_USAGE, "block size must be positive");
        opts.block_size = sig;
        break;
      case 'n' :
        sig = atoll(optarg);
        if (sig < 1)
          errx(EX_USAGE, "n must be positive integer");
        if (sig > MAX_N)
          errx(EX_USAGE, "n must be no more than %d", MAX_N);
        opts.n = sig;
        break;
      case 'k' :
        sig = atoll(optarg);
        if (sig < 1)
          errx(EX_USAGE, "k must be positive integer");
        opts.k = sig;
        break;
      case 'l' :
        sig = atoll(optarg);
        if (sig < 1)
          errx(EX_USAGE, "l must be positive integer");
        if (sig > 31)
          errx(EX_USAGE, "l must be no more than 31");
        opts.l = sig;
        break;
      case 'X' :
        opts.input_is_protein = true;
        break;
      case 'S' :
        opts.spaced_seed_mask = strtol(optarg, nullptr, 2);
        break;
      case 'T' :
        opts.toggle_mask = strtol(optarg, nullptr, 2);
        break;
    }
  }

  if (opts.spaced_seed_mask != DEFAULT_SPACED_SEED_MASK)
    ExpandSpacedSeedMask(opts.spaced_seed_mask,
        opts.input_is_protein ? BITS_PER_CHAR_PRO : BITS_PER_CHAR_DNA);
  if (opts.k == 0 || opts.l == 0) {
    cerr << "missing mandatory integer parameter" << endl;
    usage();
  }
  if (opts.k < opts.l) {
    cerr << "k cannot be less than l" << endl;
    usage();
  }
}

void usage(int exit_code) {
  cerr << "Usage: estimate_capacity <options>" << endl
       << endl
       << "Options (*mandatory):" << endl
       << "* -k INT Set length of k-mers" << endl
       << "* -l INT Set length of minimizers" << endl
       << " -n INT Set maximum qualifying hash code" << endl
       << " -X Input sequences are proteins" << endl
       << " -S BITSTRING Spaced seed mask" << endl
       << " -T BITSTRING Minimizer ordering toggle mask" << endl
       << " -B INT Read block size" << endl
       << " -p INT Number of threads" << endl;
  exit(exit_code);
}

void ProcessSequence(string &seq, Options &opts,
    vector<unordered_set<uint64_t>> &sets)
{
  MinimizerScanner scanner(opts.k, opts.l, opts.spaced_seed_mask,
                           ! opts.input_is_protein, opts.toggle_mask);

  // Add terminator for protein sequences if not already there
  if (opts.input_is_protein && seq.back() != '*')
    seq.push_back('*');
  scanner.LoadSequence(seq);
  uint64_t *minimizer_ptr;
  while ((minimizer_ptr = scanner.NextMinimizer())) {
    uint64_t hash_code = MurmurHash3(*minimizer_ptr);
    if ((hash_code & RANGE_MASK) < opts.n) {
      #pragma omp critical(set_insert)
      sets[hash_code & RANGE_MASK].insert(*minimizer_ptr);
    }
  }
}
kraken2-2.0.8-beta/src/kraken2_data.h000066400000000000000000000011201346023071100172560ustar00rootroot00000000000000/*
 * Copyright 2013-2019, Derrick Wood
 *
 * This file is part of the Kraken 2 taxonomic sequence classification system.
 */

#ifndef KRAKEN2_KRAKEN2_DATA_H_
#define KRAKEN2_KRAKEN2_DATA_H_

#include "kraken2_headers.h"

struct IndexOptions {
  size_t k;
  size_t l;
  uint64_t spaced_seed_mask;
  uint64_t toggle_mask;
  bool dna_db;
  uint64_t minimum_acceptable_hash_value;
  int revcom_version;  // Fix bug from before K2.0.8
};

typedef uint64_t taxid_t;
const taxid_t TAXID_MAX = (taxid_t) -1;

typedef std::unordered_map<taxid_t, uint64_t> taxon_counts_t;

#endif
kraken2-2.0.8-beta/src/kraken2_headers.h000066400000000000000000000012751346023071100177610ustar00rootroot00000000000000#include <algorithm>
#include <cctype>
#include <cerrno>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <functional>
#include <iostream>
#include <list>
#include <map>
#include <queue>
#include <set>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <err.h>
#include <fcntl.h>
#include <getopt.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sysexits.h>
#include <unistd.h>
#include "omp_hack.h"

#ifndef SIZE_MAX
#define SIZE_MAX ((size_t) -1)
#endif

#ifndef UINT64_MAX
#define UINT64_MAX ((uint64_t) -1)
#endif
kraken2-2.0.8-beta/src/kv_store.h000066400000000000000000000011701346023071100165570ustar00rootroot00000000000000/*
 * Copyright 2013-2019, Derrick Wood
 *
 * This file is part of the Kraken 2 taxonomic sequence classification system.
 */

#ifndef KRAKEN2_KV_STORE_H_
#define KRAKEN2_KV_STORE_H_

#include "kraken2_headers.h"

namespace kraken2 {

typedef uint64_t hkey_t;
typedef uint32_t hvalue_t;

class KeyValueStore {
  public:
  virtual hvalue_t Get(hkey_t key) = 0;
  virtual ~KeyValueStore() { }
};

uint64_t inline MurmurHash3(hkey_t key) {
  uint64_t k = (uint64_t) key;
  k ^= k >> 33;
  k *= 0xff51afd7ed558ccd;
  k ^= k >> 33;
  k *= 0xc4ceb9fe1a85ec53;
  k ^= k >> 33;
  return k;
}
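// Illustrative note, not part of the original source: this is the 64-bit
// finalizer step of MurmurHash3. Besides spreading keys across hash table
// buckets, build_db and classify both use it to downsample minimizers for
// --max-db-size builds: a minimizer is kept only when its hash clears the
// stored threshold, so setting minimum_acceptable_hash_value to
// (1 - f) * UINT64_MAX keeps roughly a fraction f of minimizer hash space.
//
//   uint64_t h = kraken2::MurmurHash3(12345);       // deterministic 64-bit mix
//   bool keep = h >= idx_opts.minimum_acceptable_hash_value;  // downsampling test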
// FILE: kraken2-2.0.8-beta/src/kraken2_data.h

/*
 * Copyright 2013-2019, Derrick Wood
 *
 * This file is part of the Kraken 2 taxonomic sequence classification system.
 */

#ifndef KRAKEN2_KRAKEN2_DATA_H_
#define KRAKEN2_KRAKEN2_DATA_H_

#include "kraken2_headers.h"

struct IndexOptions {
  size_t k;
  size_t l;
  uint64_t spaced_seed_mask;
  uint64_t toggle_mask;
  bool dna_db;
  uint64_t minimum_acceptable_hash_value;
  int revcom_version;  // Fix bug from before K2.0.8
};

typedef uint64_t taxid_t;
const taxid_t TAXID_MAX = (taxid_t) -1;

typedef std::unordered_map<taxid_t, uint64_t> taxon_counts_t;

#endif

// FILE: kraken2-2.0.8-beta/src/kraken2_headers.h

#include <algorithm>
#include <cassert>
#include <cctype>
#include <cinttypes>
#include <climits>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <queue>
#include <set>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <err.h>
#include <fcntl.h>
#include <sysexits.h>
#include <unistd.h>
#include "omp_hack.h"

#ifndef SIZE_MAX
#define SIZE_MAX ((size_t) -1)
#endif

#ifndef UINT64_MAX
#define UINT64_MAX ((uint64_t) -1)
#endif

// FILE: kraken2-2.0.8-beta/src/kv_store.h

/*
 * Copyright 2013-2019, Derrick Wood
 *
 * This file is part of the Kraken 2 taxonomic sequence classification system.
 */

#ifndef KRAKEN2_KV_STORE_H_
#define KRAKEN2_KV_STORE_H_

#include "kraken2_headers.h"

namespace kraken2 {

typedef uint64_t hkey_t;
typedef uint32_t hvalue_t;

class KeyValueStore {
  public:
  virtual hvalue_t Get(hkey_t key) = 0;
  virtual ~KeyValueStore() { }
};

uint64_t inline MurmurHash3(hkey_t key) {
  uint64_t k = (uint64_t) key;
  k ^= k >> 33;
  k *= 0xff51afd7ed558ccd;
  k ^= k >> 33;
  k *= 0xc4ceb9fe1a85ec53;
  k ^= k >> 33;
  return k;
}

}  // end namespace

#endif
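// A quick sanity check of the MurmurHash3 finalizer above: it is a
// deterministic bit-mixer, so adjacent keys map to widely scattered hash
// codes, which is what lets estimate_capacity treat the low bits as a
// uniform sample selector. A minimal standalone sketch (illustrative only,
// not part of the original archive):

#include <cstdio>
#include "kv_store.h"

static void MurmurMixExample() {
  // Consecutive keys; the mixed outputs share no obvious pattern.
  for (uint64_t key = 1; key <= 4; key++)
    std::printf("%llu -> %016llx\n",
                (unsigned long long) key,
                (unsigned long long) kraken2::MurmurHash3(key));
}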
*/ #include "kraken2_headers.h" #include "mmap_file.h" #include "utilities.h" using namespace kraken2; using std::string; using std::unordered_map; using std::vector; int main(int argc, char **argv) { if (argc < 3) errx(EX_USAGE, "Usage: lookup_accession_numbers "); unordered_map> target_lists; std::ifstream lookup_list_file(argv[1]); string line; while (getline(lookup_list_file, line)) { auto fields = SplitString(line, "\t", 2); string seqid = fields[0]; string accnum = fields[1]; target_lists[accnum].push_back(seqid); } lookup_list_file.close(); auto initial_target_count = target_lists.size(); MMapFile accmap_file; if (isatty(fileno(stderr))) std::cerr << "\rFound 0/" << initial_target_count << " targets..."; uint64_t accessions_searched = 0; for (int i = 2; i < argc; i++) { if (target_lists.empty()) // Stop processing files if we've found all we need break; accmap_file.OpenFile(argv[i]); char *ptr = accmap_file.fptr(); // Skip header line char *lf_ptr = (char *) memchr(ptr, '\n', accmap_file.filesize()); if (lf_ptr != nullptr) ptr = lf_ptr + 1; while ((size_t)(ptr - accmap_file.fptr()) < accmap_file.filesize()) { lf_ptr = (char *) memchr(ptr, '\n', accmap_file.filesize() - (ptr - accmap_file.fptr())); if (lf_ptr == nullptr) { warnx("expected EOL not found at EOF in %s", argv[i]); break; } char *tab_ptr = (char *) memchr(ptr, '\t', lf_ptr - ptr); if (tab_ptr == nullptr) { warnx("expected TAB not found in %s", argv[i]); break; } string accnum(ptr, tab_ptr - ptr); accessions_searched++; if (target_lists.count(accnum)) { for (int j = 0; j < 2; j++) { ptr = tab_ptr + 1; tab_ptr = (char *) memchr(ptr, '\t', lf_ptr - ptr); if (tab_ptr == nullptr) { warnx("expected TAB not found in %s", argv[i]); break; } } if (tab_ptr == nullptr) break; string taxid(ptr, tab_ptr - ptr); for (auto &seqid : target_lists[accnum]) std::cout << seqid << "\t" << taxid << std::endl; target_lists.erase(accnum); if (isatty(fileno(stderr))) std::cerr << "\rFound " << (initial_target_count - target_lists.size()) << "/" << initial_target_count << " targets, searched through " << accessions_searched << " accession IDs..."; if (target_lists.empty()) // Stop processing file if we've found all we need break; } if (accessions_searched % 10000000 == 0 && isatty(fileno(stderr))) std::cerr << "\rFound " << (initial_target_count - target_lists.size()) << "/" << initial_target_count << " targets, searched through " << accessions_searched << " accession IDs..."; ptr = lf_ptr + 1; } accmap_file.CloseFile(); } if (isatty(fileno(stderr))) std::cerr << "\r"; std::cerr << "Found " << (initial_target_count - target_lists.size()) << "/" << initial_target_count << " targets, searched through " << accessions_searched << " accession IDs, search complete." << std::endl; if (! target_lists.empty()) { std::cerr << "lookup_accession_numbers: " << target_lists.size() << "/" << initial_target_count << " accession numbers remain unmapped, see " << "unmapped.txt in DB directory" << std::endl; std::ofstream ofs("unmapped.txt"); for (auto &kv_pair : target_lists) ofs << kv_pair.first << std::endl; } return 0; } kraken2-2.0.8-beta/src/mmap_file.cc000066400000000000000000000056021346023071100170160ustar00rootroot00000000000000/* * Copyright 2013-2019, Derrick Wood * * This file is part of the Kraken 2 taxonomic sequence classification system. 
*/ #include "mmap_file.h" using std::string; namespace kraken2 { MMapFile::MMapFile() { valid_ = false; fptr_ = NULL; filesize_ = 0; fd_ = -1; } void MMapFile::OpenFile(const string &filename_str, int mode, int prot_flags, int map_flags, size_t size) { const char *filename = filename_str.c_str(); OpenFile(filename, mode, map_flags, prot_flags, size); } void MMapFile::OpenFile(const char *filename, int mode, int prot_flags, int map_flags, size_t size) { if (mode & O_APPEND || (mode & O_ACCMODE) == O_WRONLY) errx(EX_SOFTWARE, "illegal mode passed to MMapFile"); if (prot_flags < 0) prot_flags = (mode & O_ACCMODE) == O_RDONLY ? PROT_READ : PROT_READ | PROT_WRITE; if (map_flags < 0) map_flags = (mode & O_ACCMODE) == O_RDONLY ? MAP_PRIVATE : MAP_SHARED; fd_ = open(filename, mode, 0666); if (fd_ < 0) err(EX_OSERR, "unable to open %s", filename); if (mode & O_CREAT) { if (lseek(fd_, size - 1, SEEK_SET) < 0) err(EX_OSERR, "unable to lseek (%s)", filename); if (write(fd_, "", 1) < 0) err(EX_OSERR, "write error (%s)", filename); filesize_ = size; } else { struct stat sb; if (fstat(fd_, &sb) < 0) err(EX_OSERR, "unable to fstat %s", filename); filesize_ = sb.st_size; } fptr_ = (char *) mmap(0, filesize_, prot_flags, map_flags, fd_, 0); if (fptr_ == MAP_FAILED) { err(EX_OSERR, "unable to mmap %s", filename); } valid_ = true; } // Basically a cat operation, loads file into OS cache // I don't use MAP_POPULATE to do this because of portability issues void MMapFile::LoadFile() { int thread_ct = 1; int thread = 0; #ifdef _OPENMP int old_thread_ct = omp_get_max_threads(); // Don't use more than 4 threads, don't want to hammer disk if (old_thread_ct > 4) omp_set_num_threads(4); thread_ct = omp_get_max_threads(); #endif size_t page_size = getpagesize(); char buf[thread_ct][page_size]; #pragma omp parallel { #ifdef _OPENMP thread = omp_get_thread_num(); #endif #pragma omp for schedule(dynamic) for (size_t pos = 0; pos < filesize_; pos += page_size) { size_t this_page_size = filesize_ - pos; if (this_page_size > page_size) this_page_size = page_size; memcpy(buf[thread], fptr_ + pos, this_page_size); } } #ifdef _OPENMP omp_set_num_threads(old_thread_ct); #endif } char * MMapFile::fptr() { return valid_ ? fptr_ : NULL; } size_t MMapFile::filesize() { return valid_ ? filesize_ : 0; } MMapFile::~MMapFile() { CloseFile(); } void MMapFile::SyncFile() { msync(fptr_, filesize_, MS_SYNC); } void MMapFile::CloseFile() { if (! valid_) return; SyncFile(); munmap(fptr_, filesize_); close(fd_); valid_ = false; } } // namespace kraken2-2.0.8-beta/src/mmap_file.h000066400000000000000000000015261346023071100166610ustar00rootroot00000000000000/* * Copyright 2013-2019, Derrick Wood * * This file is part of the Kraken 2 taxonomic sequence classification system. 
// FILE: kraken2-2.0.8-beta/src/mmap_file.h

/*
 * Copyright 2013-2019, Derrick Wood
 *
 * This file is part of the Kraken 2 taxonomic sequence classification system.
 */

#ifndef KRAKEN2_MMAP_FILE_H_
#define KRAKEN2_MMAP_FILE_H_

#include "kraken2_headers.h"

namespace kraken2 {

class MMapFile {
  public:
  MMapFile();
  ~MMapFile();

  void OpenFile(const std::string &filename, int mode = O_RDONLY,
                int prot_flags = -1, int map_flags = -1, size_t size = 0);
  void OpenFile(const char *filename, int mode = O_RDONLY,
                int prot_flags = -1, int map_flags = -1, size_t size = 0);
  char *fptr();
  size_t filesize();
  void LoadFile();
  void SyncFile();
  void CloseFile();

  private:
  MMapFile(const MMapFile &rhs);
  MMapFile& operator=(const MMapFile &rhs);

  bool valid_;
  int fd_;
  char *fptr_;
  size_t filesize_;
};

}

#endif

// FILE: kraken2-2.0.8-beta/src/mmscanner.cc

/*
 * Copyright 2013-2019, Derrick Wood
 *
 * This file is part of the Kraken 2 taxonomic sequence classification system.
 */

#include "mmscanner.h"
#include <cassert>

using std::string;
using std::vector;

namespace kraken2 {

void MinimizerScanner::set_lookup_table_character(char c, uint8_t val) {
  lookup_table_[(int) c] = val;
  lookup_table_[tolower(c)] = val;
}
MinimizerScanner::MinimizerScanner(ssize_t k, ssize_t l,
    uint64_t spaced_seed_mask, bool dna_sequence, uint64_t toggle_mask,
    int revcom_version)
  : str_(nullptr), k_(k), l_(l),
    str_pos_(0), start_(0), finish_(0),
    spaced_seed_mask_(spaced_seed_mask), dna_(dna_sequence),
    toggle_mask_(toggle_mask), loaded_ch_(0), last_ambig_(0),
    revcom_version_(revcom_version)
{
  if (l_ > (ssize_t) ((sizeof(uint64_t) * 8 - 1) /
                      (dna_ ? BITS_PER_CHAR_DNA : BITS_PER_CHAR_PRO)))
    errx(EX_SOFTWARE, "l exceeds size limits for minimizer %s scanner",
         dna_ ? "nucleotide" : "protein");
  lmer_mask_ = 1;
  lmer_mask_ <<= (l_ * (dna_ ? BITS_PER_CHAR_DNA : BITS_PER_CHAR_PRO));
  lmer_mask_--;
  toggle_mask_ &= lmer_mask_;
  // (finish_ is 0 here; the scan interval is established in LoadSequence)
  if (finish_ == SIZE_MAX)
    finish_ = str_->size();
  if ((ssize_t) (finish_ - start_) + 1 < l_)  // Invalidate scanner if interval < 1 l-mer
    str_pos_ = finish_;
  for (int i = 0; i < UINT8_MAX + 1; i++)
    lookup_table_[i] = UINT8_MAX;
  if (dna_) {
    set_lookup_table_character('A', 0x00);
    set_lookup_table_character('C', 0x01);
    set_lookup_table_character('G', 0x02);
    set_lookup_table_character('T', 0x03);
  }
  else {
    // Reduced alphabet uses 15-letter alphabet from AD Solis (2015),
    // in Proteins (doi:10.1002/prot.24936)
    // This means that the ambiguous codes B (N/D) and Z (E/Q) aren't
    // usable here because they span multiple groups. Although J (I/L)
    // could be used because I & L are in the same group, we treat it as
    // we do B, Z, and X for consistency and to ease future changes to
    // the code base.

    // stop codons/rare amino acids
    set_lookup_table_character('*', 0x00);
    set_lookup_table_character('U', 0x00);
    set_lookup_table_character('O', 0x00);
    // alanine
    set_lookup_table_character('A', 0x01);
    // asparagine, glutamine, serine
    set_lookup_table_character('N', 0x02);
    set_lookup_table_character('Q', 0x02);
    set_lookup_table_character('S', 0x02);
    // cysteine
    set_lookup_table_character('C', 0x03);
    // aspartic acid, glutamic acid
    set_lookup_table_character('D', 0x04);
    set_lookup_table_character('E', 0x04);
    // phenylalanine
    set_lookup_table_character('F', 0x05);
    // glycine
    set_lookup_table_character('G', 0x06);
    // histidine
    set_lookup_table_character('H', 0x07);
    // isoleucine, leucine
    set_lookup_table_character('I', 0x08);
    set_lookup_table_character('L', 0x08);
    // lysine
    set_lookup_table_character('K', 0x09);
    // proline
    set_lookup_table_character('P', 0x0a);
    // arginine
    set_lookup_table_character('R', 0x0b);
    // methionine, valine
    set_lookup_table_character('M', 0x0c);
    set_lookup_table_character('V', 0x0c);
    // threonine
    set_lookup_table_character('T', 0x0d);
    // tryptophan
    set_lookup_table_character('W', 0x0e);
    // tyrosine
    set_lookup_table_character('Y', 0x0f);
  }
}

void MinimizerScanner::LoadSequence(string &seq, size_t start, size_t finish) {
  str_ = &seq;
  start_ = start;
  finish_ = finish;
  str_pos_ = start_;
  if (finish_ == SIZE_MAX)
    finish_ = str_->size();
  if ((ssize_t) (finish_ - start_) + 1 < l_)  // Invalidate scanner if interval < 1 l-mer
    str_pos_ = finish_;
  queue_.clear();
  queue_pos_ = 0;
  loaded_ch_ = 0;
  last_minimizer_ = ~0;
  last_ambig_ = 0;
}

uint64_t *MinimizerScanner::NextMinimizer(bool *ambig_flag) {
  if (str_pos_ >= finish_)  // Abort if we've exhausted string interval
    return nullptr;
  bool changed_minimizer = false;
  auto bits_per_char = dna_ ? BITS_PER_CHAR_DNA : BITS_PER_CHAR_PRO;
  auto ambig_code = (1u << bits_per_char) - 1;
  while (! changed_minimizer) {
    // Incorporate next character (and more if needed to fill l-mer)
    if (loaded_ch_ == l_)
      loaded_ch_--;
    while (loaded_ch_ < l_ && str_pos_ < finish_) {  // char loading loop
      loaded_ch_++;
      lmer_ <<= bits_per_char;
      last_ambig_ <<= bits_per_char;
      auto lookup_code = lookup_table_[ (int) (*str_)[str_pos_++] ];
      if (lookup_code == UINT8_MAX) {
        queue_.clear();
        queue_pos_ = 0;
        lmer_ = 0;
        loaded_ch_ = 0;
        last_ambig_ |= ambig_code;
      }
      else
        lmer_ |= lookup_code;
      lmer_ &= lmer_mask_;
      last_ambig_ &= lmer_mask_;
      // non-null arg means we need to do some special handling
      if (ambig_flag != nullptr) {
        *ambig_flag = !! last_ambig_;
        // If we haven't filled up first k-mer, don't return
        // Otherwise, if l-mer is incomplete, still return
        // If l-mer is complete, go through queue management
        // Ambig flag being non-null means we're expecting a return for each
        // k-mer, and if we don't have a full l-mer, there's no point in
        // queue management for minimizer calculation
        if (str_pos_ >= (size_t) k_ && loaded_ch_ < l_)
          return &last_minimizer_;
      }
    }  // end character loading loop
    if (loaded_ch_ < l_)  // Abort if we've exhausted string interval w/o
      return nullptr;     //   filling the l-mer
    uint64_t canonical_lmer =
        dna_ ? canonical_representation(lmer_, l_) : lmer_;
    if (spaced_seed_mask_)
      canonical_lmer &= spaced_seed_mask_;
    uint64_t candidate_lmer = canonical_lmer ^ toggle_mask_;
    if (k_ == l_) {  // Short-circuit queue work
      last_minimizer_ = candidate_lmer ^ toggle_mask_;
      return &last_minimizer_;
    }
    // Sliding window minimum calculation
    while (! queue_.empty() && queue_.back().candidate > candidate_lmer)
      queue_.pop_back();
    MinimizerData data = { candidate_lmer, queue_pos_ };
    if (queue_.empty() && queue_pos_ >= k_ - l_) {
      // Empty queue means front will change
      // Minimizer will change iff we've processed enough l-mers
      changed_minimizer = true;
    }
    queue_.push_back(data);
    // expire an l-mer not in the current window
    if (queue_.front().pos < queue_pos_ - k_ + l_) {
      queue_.erase(queue_.begin());
      // Change in front means minimizer changed
      changed_minimizer = true;
    }
    // Change from no minimizer (beginning of sequence/near ambig. char)
    if (queue_pos_ == k_ - l_)
      changed_minimizer = true;
    queue_pos_++;
    // (almost) Always return after each ch. when ambig flag is set
    if (ambig_flag != nullptr) {
      // Return only if we've read in at least one k-mer's worth of chars
      if (str_pos_ >= (size_t) k_) {
        // Force flag to true if we haven't read in a k-mer since last clearing
        // Must use <= due to increment of queue_pos_ just above here
        if (queue_pos_ <= k_ - l_)
          *ambig_flag = true;
        break;
      }
    }
  }  // end while ! changed_minimizer
  assert(! queue_.empty());
  last_minimizer_ = queue_.front().candidate ^ toggle_mask_;
  return &last_minimizer_;
}
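// The queue_ logic above is a classic monotonic-deque sliding-window
// minimum: candidates larger than the new arrival are popped from the back,
// so the front always holds the current window's minimum. A standalone
// sketch of the same technique over plain integers (illustrative only, not
// part of the original file):
static void SlidingWindowMinimumExample() {
  const int vals[] = { 5, 2, 8, 4, 1, 9 };
  const size_t w = 3;            // window size; plays the role of k - l + 1
  vector<size_t> q;              // indices whose values increase front-to-back
  for (size_t i = 0; i < sizeof(vals) / sizeof(*vals); i++) {
    while (! q.empty() && vals[q.back()] >= vals[i])
      q.pop_back();              // drop candidates dominated by vals[i]
    q.push_back(i);
    if (q.front() + w <= i)
      q.erase(q.begin());        // expire the index that left the window
    if (i + 1 >= w)              // once the window is full, front is the min
      printf("min of window ending at %zu = %d\n", i, vals[q.front()]);
  }
}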
// Adapted for 64-bit DNA use from public domain code at:
//   https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
uint64_t MinimizerScanner::reverse_complement(uint64_t kmer, uint8_t n) {
  // Reverse bits (leaving bit pairs - nucleotides - intact)
  // swap consecutive pairs
  kmer = ((kmer & 0xCCCCCCCCCCCCCCCCUL) >> 2)
       | ((kmer & 0x3333333333333333UL) << 2);
  // swap consecutive nibbles
  kmer = ((kmer & 0xF0F0F0F0F0F0F0F0UL) >> 4)
       | ((kmer & 0x0F0F0F0F0F0F0F0FUL) << 4);
  // swap consecutive bytes
  kmer = ((kmer & 0xFF00FF00FF00FF00UL) >> 8)
       | ((kmer & 0x00FF00FF00FF00FFUL) << 8);
  // swap consecutive byte pairs
  kmer = ((kmer & 0xFFFF0000FFFF0000UL) >> 16)
       | ((kmer & 0x0000FFFF0000FFFFUL) << 16);
  // swap halves of 64-bit word
  kmer = ( kmer >> 32 ) | ( kmer << 32);
  // Then complement
  if (revcom_version_ == 0)
    // This branch present to maintain backwards compatibility with old DBs
    return (~kmer) & ((1ull << (n * 2)) - 1);
  else
    return ((~kmer) >> (sizeof(kmer) * CHAR_BIT - n * 2))
           & ((1ull << (n * 2)) - 1);
}

uint64_t MinimizerScanner::canonical_representation(uint64_t kmer, uint8_t n) {
  uint64_t revcom = reverse_complement(kmer, n);
  return kmer < revcom ? kmer : revcom;
}

}  // end namespace
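// A quick worked example for the canonical k-mer logic above, using the
// scanner's 2-bit encoding (A=00, C=01, G=10, T=11, first base in the
// highest bits):
//
//   "AAAA" encodes as 0x00; its reverse complement "TTTT" is 0xFF, so
//   canonical_representation(0x00, 4) == 0x00 (the smaller of the pair).
//   "ACGT" (0x1B) is its own reverse complement, so it is already canonical.
//
// A hypothetical driver using k == l == 4 so NextMinimizer() yields each
// canonical 4-mer directly (illustrative only, not part of the original
// archive):

#include <cstdio>
#include <string>
#include "mmscanner.h"

static void CanonicalKmerExample() {
  std::string seq = "AAAATTTT";
  kraken2::MinimizerScanner scanner(4, 4, 0);  // k = l = 4, no spaced seed
  scanner.LoadSequence(seq);
  uint64_t *mmp;
  while ((mmp = scanner.NextMinimizer()) != nullptr)
    std::printf("%016llx\n", (unsigned long long) *mmp);
  // The AAAA and TTTT k-mers both print 0 - they collapse to one canonical form.
}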
// FILE: kraken2-2.0.8-beta/src/mmscanner.h

/*
 * Copyright 2013-2019, Derrick Wood
 *
 * This file is part of the Kraken 2 taxonomic sequence classification system.
 */

#ifndef KRAKEN2_MMSCANNER_H_
#define KRAKEN2_MMSCANNER_H_

#include "kraken2_headers.h"

namespace kraken2 {

const uint64_t DEFAULT_TOGGLE_MASK = 0xe37e28c4271b5a2dULL;
const uint64_t DEFAULT_SPACED_SEED_MASK = 0;
const int BITS_PER_CHAR_DNA = 2;
const int BITS_PER_CHAR_PRO = 4;
const int CURRENT_REVCOM_VERSION = 1;

struct MinimizerData {
  uint64_t candidate;
  ssize_t pos;
};

class MinimizerScanner {
  public:
  // Create scanner for seq over interval [start, finish)
  // Will report length l minimizers for all k-mers in the interval
  // Minimizers ordered by XORing candidates w/ toggle_mask
  // toggle_mask == 0 implies lexicographical ordering
  MinimizerScanner(ssize_t k, ssize_t l,
                   uint64_t spaced_seed_mask = DEFAULT_SPACED_SEED_MASK,
                   bool dna_sequence = true,
                   uint64_t toggle_mask = DEFAULT_TOGGLE_MASK,
                   int revcom_version = CURRENT_REVCOM_VERSION);

  void LoadSequence(std::string &seq, size_t start = 0,
                    size_t finish = SIZE_MAX);

  uint64_t *NextMinimizer(bool *ambig_flag = nullptr);

  // Return last minimizer, only valid if NextMinimizer last returned non-NULL
  uint64_t last_minimizer() { return last_minimizer_; }

  ssize_t k() { return k_; }
  ssize_t l() { return l_; }
  bool is_dna() { return dna_; }

  private:
  uint64_t reverse_complement(uint64_t kmer, uint8_t n);
  uint64_t canonical_representation(uint64_t kmer, uint8_t n);
  void set_lookup_table_character(char ch, uint8_t val);

  std::string *str_;  // pointer to sequence
  ssize_t k_;
  ssize_t l_;
  size_t str_pos_, start_, finish_;
  uint64_t spaced_seed_mask_;
  bool dna_;
  uint64_t toggle_mask_;
  uint64_t lmer_;
  uint64_t lmer_mask_;
  uint64_t last_minimizer_;
  ssize_t loaded_ch_;
  std::vector<MinimizerData> queue_;
  ssize_t queue_pos_;
  uint64_t last_ambig_;
  uint8_t lookup_table_[UINT8_MAX + 1];
  const int revcom_version_;
};

}

#endif

// FILE: kraken2-2.0.8-beta/src/mmtest.cc

#include "kraken2_headers.h"
#include "mmscanner.h"

using namespace std;
using namespace kraken2;

int main() {
  string seq = "ACGATCGACGACG";
  MinimizerScanner scanner(10, 5, 0);
  scanner.LoadSequence(seq);
  uint64_t *mmp;
  while ((mmp = scanner.NextMinimizer()) != nullptr)
    printf("%016" PRIx64 "\n", *mmp);  // PRIx64: *mmp is a uint64_t
  return 0;
}

// FILE: kraken2-2.0.8-beta/src/omp_hack.cc

/*
 * Copyright 2013-2019, Derrick Wood
 *
 * This file is part of the Kraken 2 taxonomic sequence classification system.
 */

#ifndef _OPENMP

#warning "OpenMP not found, multi-threading will be DISABLED!"
#warning "To fix this, please make sure your GCC installation uses OpenMP."

// This file defines datatypes and declares functions supplied by the
// omp.h file to allow compilation on systems that do not have OpenMP
// supported by the installed GCC. Note that these functions are
// effectively no-op functions that will allow single-threaded operation only.

typedef int omp_lock_t;

int omp_get_thread_num() { return 0; }
int omp_get_max_threads() { return 1; }
void omp_set_num_threads(int num) { }
void omp_init_lock(omp_lock_t *lock) { }
void omp_destroy_lock(omp_lock_t *lock) { }
void omp_set_lock(omp_lock_t *lock) { }
void omp_unset_lock(omp_lock_t *lock) { }
int omp_test_lock(omp_lock_t *lock) { return 1; }

#endif  // defined _OPENMP
// FILE: kraken2-2.0.8-beta/src/omp_hack.h

/*
 * Copyright 2013-2019, Derrick Wood
 *
 * This file is part of the Kraken 2 taxonomic sequence classification system.
 */

#ifndef KRAKEN2_OMP_HACK_H_
#define KRAKEN2_OMP_HACK_H_

#ifdef _OPENMP
#include <omp.h>
#else

// This file defines a datatype and declares functions supplied by the
// omp.h file to allow compilation on systems that do not have OpenMP
// supported by the installed GCC. Note that these functions are
// effectively no-op functions that will allow single-threaded operation only.

typedef int omp_lock_t;

int omp_get_thread_num();
int omp_get_max_threads();
void omp_set_num_threads(int num);
void omp_init_lock(omp_lock_t *lock);
void omp_destroy_lock(omp_lock_t *lock);
void omp_set_lock(omp_lock_t *lock);
void omp_unset_lock(omp_lock_t *lock);
int omp_test_lock(omp_lock_t *lock);

#endif  // defined _OPENMP

#endif

// FILE: kraken2-2.0.8-beta/src/reports.cc

/*
 * Copyright 2013-2019, Derrick Wood
 *
 * This file is part of the Kraken 2 taxonomic sequence classification system.
 */

#include "reports.h"

using std::vector;
using std::string;
using std::ofstream;
using std::map;

namespace kraken2 {

taxon_counts_t GetCladeCounts(Taxonomy &tax, taxon_counts_t &call_counts) {
  taxon_counts_t clade_counts;

  for (auto &kv_pair : call_counts) {
    auto taxid = kv_pair.first;
    auto count = kv_pair.second;

    while (taxid) {
      clade_counts[taxid] += count;
      taxid = tax.nodes()[taxid].parent_id;
    }
  }

  return clade_counts;
}

void PrintMpaStyleReportLine(ofstream &ofs, uint64_t clade_count,
    const string &taxonomy_line)
{
  ofs << taxonomy_line << "\t" << clade_count << std::endl;
}

void MpaReportDFS(taxid_t taxid, ofstream &ofs, bool report_zeros,
    Taxonomy &taxonomy, taxon_counts_t &clade_counts,
    vector<string> &taxonomy_names)
{
  // Clade count of 0 means all subtree nodes have clade count of 0
  if (! report_zeros && clade_counts[taxid] == 0)
    return;
  TaxonomyNode node = taxonomy.nodes()[taxid];
  string rank = taxonomy.rank_data() + node.rank_offset;

  char rank_code = '\0';
  if (rank == "superkingdom") { rank_code = 'd'; }
  else if (rank == "kingdom") { rank_code = 'k'; }
  else if (rank == "phylum")  { rank_code = 'p'; }
  else if (rank == "class")   { rank_code = 'c'; }
  else if (rank == "order")   { rank_code = 'o'; }
  else if (rank == "family")  { rank_code = 'f'; }
  else if (rank == "genus")   { rank_code = 'g'; }
  else if (rank == "species") { rank_code = 's'; }

  if (rank_code) {
    string name = "";
    name += rank_code;
    name += "__";
    name += (taxonomy.name_data() + node.name_offset);
    taxonomy_names.push_back(name);
    string taxonomy_line = "";
    for (auto &str : taxonomy_names)
      taxonomy_line += str + "|";
    taxonomy_line.pop_back();
    PrintMpaStyleReportLine(ofs, clade_counts[taxid], taxonomy_line);
  }

  auto child_count = node.child_count;
  if (child_count != 0) {
    vector<uint64_t> children(child_count);

    for (auto i = 0u; i < child_count; i++) {
      children[i] = node.first_child + i;
    }
    // Sorting child IDs by descending order of clade counts
    sort(children.begin(), children.end(),
      [&](const uint64_t &a, const uint64_t &b) {
        return clade_counts[a] > clade_counts[b];
      }
    );
    for (auto child : children) {
      MpaReportDFS(child, ofs, report_zeros, taxonomy, clade_counts,
                   taxonomy_names);
    }
  }

  if (rank_code)
    taxonomy_names.pop_back();
}

void ReportMpaStyle(string filename, bool report_zeros, Taxonomy &taxonomy,
    taxon_counts_t &call_counts)
{
  taxon_counts_t clade_counts = GetCladeCounts(taxonomy, call_counts);
  ofstream ofs(filename);
  vector<string> taxonomy_names;
  MpaReportDFS(1, ofs, report_zeros, taxonomy, clade_counts, taxonomy_names);
}
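// For reference, an MPA-style line pairs a pipe-delimited taxonomy string
// with the clade count, one clade per line. An illustrative example with a
// made-up count:
//
//   d__Bacteria|p__Firmicutes|c__Bacilli	1423
//
// Each ranked ancestor contributes its one-letter code (d, k, p, c, o, f,
// g, s) plus "__", matching the rank_code mapping in MpaReportDFS above;
// unranked nodes contribute nothing to the taxonomy string.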
void PrintKrakenStyleReportLine(ofstream &ofs, uint64_t total_seqs,
    uint64_t clade_count, uint64_t call_count, const string &rank_str,
    uint32_t taxid, const string &sci_name, int depth)
{
  char pct_buffer[10] = "";
  sprintf(pct_buffer, "%6.2f", 100.0 * clade_count / total_seqs);

  ofs << pct_buffer << "\t"
      << clade_count << "\t"
      << call_count << "\t"
      << rank_str << "\t"
      << taxid << "\t";
  for (auto i = 0; i < depth; i++)
    ofs << "  ";
  ofs << sci_name << std::endl;
}

// Depth-first search of taxonomy tree, reporting info at each node
void KrakenReportDFS(uint32_t taxid, ofstream &ofs, bool report_zeros,
    Taxonomy &taxonomy, taxon_counts_t &clade_counts,
    taxon_counts_t &call_counts, uint64_t total_seqs,
    char rank_code, int rank_depth, int depth)
{
  // Clade count of 0 means all subtree nodes have clade count of 0
  if (! report_zeros && clade_counts[taxid] == 0)
    return;
  TaxonomyNode node = taxonomy.nodes()[taxid];
  string rank = taxonomy.rank_data() + node.rank_offset;

  if (rank == "superkingdom") { rank_code = 'D'; rank_depth = 0; }
  else if (rank == "kingdom") { rank_code = 'K'; rank_depth = 0; }
  else if (rank == "phylum")  { rank_code = 'P'; rank_depth = 0; }
  else if (rank == "class")   { rank_code = 'C'; rank_depth = 0; }
  else if (rank == "order")   { rank_code = 'O'; rank_depth = 0; }
  else if (rank == "family")  { rank_code = 'F'; rank_depth = 0; }
  else if (rank == "genus")   { rank_code = 'G'; rank_depth = 0; }
  else if (rank == "species") { rank_code = 'S'; rank_depth = 0; }
  else { rank_depth++; }

  string rank_str(1, rank_code);  // single-character string, e.g. "G"
  if (rank_depth != 0)
    rank_str += std::to_string(rank_depth);

  string name = taxonomy.name_data() + node.name_offset;

  PrintKrakenStyleReportLine(ofs, total_seqs, clade_counts[taxid],
      call_counts[taxid], rank_str, node.external_id, name, depth);

  auto child_count = node.child_count;
  if (child_count != 0) {
    vector<uint64_t> children(child_count);

    for (auto i = 0u; i < child_count; i++) {
      children[i] = node.first_child + i;
    }
    // Sorting child IDs by descending order of clade counts
    std::sort(children.begin(), children.end(),
      [&](const uint64_t &a, const uint64_t &b) {
        return clade_counts[a] > clade_counts[b];
      }
    );
    for (auto child : children) {
      KrakenReportDFS(child, ofs, report_zeros, taxonomy, clade_counts,
          call_counts, total_seqs, rank_code, rank_depth, depth + 1);
    }
  }
}

void ReportKrakenStyle(string filename, bool report_zeros, Taxonomy &taxonomy,
    taxon_counts_t &call_counts, uint64_t total_seqs,
    uint64_t total_unclassified)
{
  taxon_counts_t clade_counts = GetCladeCounts(taxonomy, call_counts);
  ofstream ofs(filename);

  // Special handling of the unclassified sequences
  if (total_unclassified != 0 || report_zeros)
    PrintKrakenStyleReportLine(ofs, total_seqs, total_unclassified,
        total_unclassified, "U", 0, "unclassified", 0);
  // DFS through the taxonomy, printing nodes as encountered
  KrakenReportDFS(1, ofs, report_zeros, taxonomy, clade_counts, call_counts,
      total_seqs, 'R', -1, 0);
}

}  // end namespace
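// For reference, each Kraken-style report line produced above has six
// tab-separated columns: percentage of sequences in the clade, clade count,
// count assigned directly to the taxon, rank code (with a depth suffix for
// unranked nodes), taxonomy ID, and the scientific name indented two spaces
// per tree level. Illustrative values:
//
//    99.02	1184	1184	U	0	unclassified
//     0.98	12	0	R	1	root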
// FILE: kraken2-2.0.8-beta/src/reports.h

/*
 * Copyright 2013-2019, Derrick Wood
 *
 * This file is part of the Kraken 2 taxonomic sequence classification system.
 */

#ifndef KRAKEN2_REPORTS_H_
#define KRAKEN2_REPORTS_H_

#include "kraken2_headers.h"
#include "taxonomy.h"
#include "kraken2_data.h"

namespace kraken2 {

// Still TODO: Create an MPA-style reporter that can take a std::vector of
// call_counts and a std::vector of sample IDs
void ReportMpaStyle(std::string filename, bool report_zeros, Taxonomy &tax,
    taxon_counts_t &call_counts);
//void ReportKrakenStyle(std::string filename, Taxonomy &tax,
//    taxon_counts_t &call_counts, uint64_t total_unclassified);

taxon_counts_t GetCladeCounts(Taxonomy &tax, taxon_counts_t &call_counts);

void PrintMpaStyleReportLine(std::ofstream &ofs, uint64_t clade_count,
    const std::string &taxonomy_line);
void MpaReportDFS(taxid_t taxid, std::ofstream &ofs, bool report_zeros,
    Taxonomy &taxonomy, taxon_counts_t &clade_counts,
    std::vector<std::string> &taxonomy_names);
void PrintKrakenStyleReportLine(std::ofstream &ofs, uint64_t total_seqs,
    uint64_t clade_count, uint64_t call_count, const std::string &rank_str,
    uint32_t taxid, const std::string &sci_name, int depth);
void KrakenReportDFS(uint32_t taxid, std::ofstream &ofs, bool report_zeros,
    Taxonomy &taxonomy, taxon_counts_t &clade_counts,
    taxon_counts_t &call_counts, uint64_t total_seqs, char rank_code,
    int rank_depth, int depth);
void ReportKrakenStyle(std::string filename, bool report_zeros,
    Taxonomy &taxonomy, taxon_counts_t &call_counts, uint64_t total_seqs,
    uint64_t total_unclassified);

}

#endif

// FILE: kraken2-2.0.8-beta/src/seqreader.cc

/*
 * Copyright 2013-2019, Derrick Wood
 *
 * This file is part of the Kraken 2 taxonomic sequence classification system.
 */

#include "seqreader.h"

using std::string;

namespace kraken2 {

void StripString(string &str) {
  while (isspace(str.back()))
    str.pop_back();
}

string &Sequence::to_string() {
  str_representation.assign(header);
  switch (format) {
    case FORMAT_FASTQ:
      str_representation.append("\n");
      str_representation.append(seq);
      str_representation.append("\n+\n");
      str_representation.append(quals);
      str_representation.append("\n");
      break;
    default:
      str_representation.append("\n");
      str_representation.append(seq);
      str_representation.append("\n");
      break;
  }
  return str_representation;
}

BatchSequenceReader::BatchSequenceReader() {
  file_format_ = FORMAT_AUTO_DETECT;
  str_buffer_.reserve(8192);
  block_buffer_ = new char[8192];
  block_buffer_size_ = 8192;
}

BatchSequenceReader::~BatchSequenceReader() {
  delete[] block_buffer_;
}

bool BatchSequenceReader::LoadBlock(std::istream &ifs, size_t block_size) {
  ss_.clear();
  ss_.str("");
  if (block_buffer_size_ < block_size) {
    delete[] block_buffer_;
    block_buffer_ = new char[block_size];
    block_buffer_size_ = block_size;
  }
  ifs.read(block_buffer_, block_size);
  if (! ifs && ifs.gcount() <= 0)
    return false;
  if (file_format_ == FORMAT_AUTO_DETECT) {
    switch (block_buffer_[0]) {
      case '@' : file_format_ = FORMAT_FASTQ; break;
      case '>' : file_format_ = FORMAT_FASTA; break;
      default:
        errx(EX_DATAERR, "sequence reader - unrecognized file format");
    }
  }
  str_buffer_.assign(block_buffer_, ifs.gcount());
  ss_ << str_buffer_;
  // Read to the end of the current line, then to the end of the current
  // record, so the block always finishes on a record boundary
  if (getline(ifs, str_buffer_))
    ss_ << str_buffer_ << "\n";
  if (file_format_ == FORMAT_FASTQ) {
    while (getline(ifs, str_buffer_)) {
      ss_ << str_buffer_ << "\n";
      if (str_buffer_[0] == '@')
        break;
    }
    int lines_to_read = 0;
    if (getline(ifs, str_buffer_)) {
      ss_ << str_buffer_ << "\n";
      lines_to_read = str_buffer_[0] == '@' ? 3 : 2;
      while (lines_to_read-- > 0 && getline(ifs, str_buffer_))
        ss_ << str_buffer_ << "\n";
    }
  }
  else {
    while (ifs) {
      if (ifs.peek() == '>')
        break;
      if (getline(ifs, str_buffer_))
        ss_ << str_buffer_ << "\n";
    }
  }
  return true;
}
bool BatchSequenceReader::LoadBatch(std::istream &ifs, size_t record_count) {
  ss_.clear();
  ss_.str("");
  auto valid = false;
  if (file_format_ == FORMAT_AUTO_DETECT) {
    if (! ifs)
      return false;
    switch (ifs.peek()) {
      case '@' : file_format_ = FORMAT_FASTQ; break;
      case '>' : file_format_ = FORMAT_FASTA; break;
      case EOF : return false;
      default:
        errx(EX_DATAERR, "sequence reader - unrecognized file format");
    }
    valid = true;
  }
  size_t line_count = 0;
  while (record_count > 0 && ifs) {
    if (getline(ifs, str_buffer_))
      line_count++;
    valid = true;
    if (file_format_ == FORMAT_FASTQ) {
      if (line_count % 4 == 0)
        record_count--;
    }
    else {
      if (ifs.peek() == '>')
        record_count--;
    }
    ss_ << str_buffer_ << "\n";
  }
  return valid;
}

bool BatchSequenceReader::NextSequence(Sequence &seq) {
  return BatchSequenceReader::ReadNextSequence
    (ss_, seq, str_buffer_, file_format_);
}

bool BatchSequenceReader::ReadNextSequence(std::istream &is, Sequence &seq,
    std::string &str_buffer, SequenceFormat file_format)
{
  if (! getline(is, str_buffer))
    return false;
  StripString(str_buffer);
  if (file_format == FORMAT_AUTO_DETECT) {
    switch (str_buffer[0]) {
      case '@' : file_format = FORMAT_FASTQ; break;
      case '>' : file_format = FORMAT_FASTA; break;
      default:
        errx(EX_DATAERR, "sequence reader - unrecognized file format");
    }
  }
  seq.format = file_format;
  if (seq.format == FORMAT_FASTQ) {
    if (str_buffer.empty())  // Allow empty line to end file
      return false;
    if (str_buffer[0] != '@')
      errx(EX_DATAERR, "malformed FASTQ file (exp. '@', saw \"%s\"), aborting",
           str_buffer.c_str());
  }
  else if (seq.format == FORMAT_FASTA) {
    if (str_buffer[0] != '>')
      errx(EX_DATAERR, "malformed FASTA file (exp. '>', saw \"%s\"), aborting",
           str_buffer.c_str());
  }
  else
    errx(EX_SOFTWARE, "illegal sequence format encountered in parsing");
  seq.header.assign(str_buffer);
  auto first_whitespace_ch = str_buffer.find_first_of(" \t\r", 1);
  auto substr_len = first_whitespace_ch;
  if (substr_len != std::string::npos)
    substr_len--;
  if (str_buffer.size() > 1)
    seq.id.assign(str_buffer, 1, substr_len);
  else
    return false;

  if (seq.format == FORMAT_FASTQ) {
    if (! getline(is, str_buffer))
      return false;
    StripString(str_buffer);
    seq.seq.assign(str_buffer);
    if (! getline(is, str_buffer))  // + line, discard
      return false;
    if (! getline(is, str_buffer))
      return false;
    StripString(str_buffer);
    seq.quals.assign(str_buffer);
  }
  else if (seq.format == FORMAT_FASTA) {
    seq.quals.assign("");
    seq.seq.assign("");
    while (is && is.peek() != '>') {
      if (! getline(is, str_buffer))
        return ! seq.seq.empty();
      StripString(str_buffer);
      seq.seq.append(str_buffer);
    }
  }
  return true;
}

}  // end namespace
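// A minimal usage sketch for BatchSequenceReader, reading FASTA/FASTQ
// records from stdin in fixed-size blocks; the loop structure mirrors
// estimate_capacity.cc (illustrative only, not part of the original
// archive):

#include <iostream>
#include "seqreader.h"

static void SequenceReaderExample() {
  kraken2::BatchSequenceReader reader;
  kraken2::Sequence seq;
  while (reader.LoadBlock(std::cin, 8192)) {  // block ends on a record boundary
    while (reader.NextSequence(seq))          // parse records out of the block
      std::cout << seq.id << "\t" << seq.seq.size() << "\n";
  }
}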
// FILE: kraken2-2.0.8-beta/src/seqreader.h

/*
 * Copyright 2013-2019, Derrick Wood
 *
 * This file is part of the Kraken 2 taxonomic sequence classification system.
 */

#ifndef KRAKEN2_SEQREADER_H_
#define KRAKEN2_SEQREADER_H_

#include "kraken2_headers.h"

namespace kraken2 {

enum SequenceFormat { FORMAT_AUTO_DETECT, FORMAT_FASTA, FORMAT_FASTQ };

struct Sequence {
  SequenceFormat format;
  std::string header;  // header line, including @/>, but not newline
  std::string id;      // from first char. after @/> up to first whitespace
  std::string seq;
  std::string quals;   // only meaningful for FASTQ seqs
  std::string &to_string();

  private:
  std::string str_representation;
};

class BatchSequenceReader {
  public:
  BatchSequenceReader();
  ~BatchSequenceReader();
  BatchSequenceReader(const BatchSequenceReader &rhs) = delete;
  BatchSequenceReader& operator=(const BatchSequenceReader &rhs) = delete;

  bool LoadBatch(std::istream &ifs, size_t record_count);
  bool LoadBlock(std::istream &ifs, size_t block_size);
  bool NextSequence(Sequence &seq);
  static bool ReadNextSequence(std::istream &is, Sequence &seq,
      std::string &str_buffer_ptr,
      SequenceFormat format = FORMAT_AUTO_DETECT);

  SequenceFormat file_format() { return file_format_; }

  private:
  std::stringstream ss_;
  std::string str_buffer_;  // used to prevent realloc upon every load/parse
  SequenceFormat file_format_;
  char *block_buffer_;
  size_t block_buffer_size_;
};

}

#endif

// FILE: kraken2-2.0.8-beta/src/taxonomy.cc

/*
 * Copyright 2013-2019, Derrick Wood
 *
 * This file is part of the Kraken 2 taxonomic sequence classification system.
 */

#include "taxonomy.h"

using std::string;
using std::map;
using std::set;
using std::ifstream;
using std::istringstream;
using std::queue;
using std::ofstream;

namespace kraken2 {

NCBITaxonomy::NCBITaxonomy(string nodes_filename, string names_filename) {
  ifstream nodes_file(nodes_filename.c_str());
  if (! nodes_file.good())
    err(EX_NOINPUT, "error opening %s", nodes_filename.c_str());
  ifstream names_file(names_filename.c_str());
  if (! names_file.good())
    err(EX_NOINPUT, "error opening %s", names_filename.c_str());

  string line;
  uint64_t node_id, parent_id;
  string name, rank, junk;

  const string delim = "\t|\t";
  while (getline(nodes_file, line)) {
    line.pop_back();  // discard trailing
    line.pop_back();  //   "\t|"
    size_t pos1, pos2;
    pos1 = 0;
    int field_ct = 0;
    bool finished = false;
    while (field_ct++ < 10 && ! finished) {  // tokenizing loop
      pos2 = line.find(delim, pos1);
      string token;
      if (pos2 == string::npos) {
        token = line.substr(pos1);
        finished = true;
      }
      else {
        token = line.substr(pos1, pos2 - pos1);
        pos1 = pos2 + delim.size();
      }

      switch (field_ct) {  // 1-based counting
        case 1 :  // node ID
          node_id = (uint64_t) stoul(token);
          if (node_id == 0)
            errx(EX_DATAERR, "attempt to create taxonomy w/ node ID == 0");
          break;
        case 2 :  // parent ID
          parent_id = (uint64_t) stoul(token);
          break;
        case 3 :  // rank
          rank = token;
          finished = true;
          break;
      }
    }  // end tokenizing loop
    if (node_id == 1)
      parent_id = 0;
    parent_map_[node_id] = parent_id;
    child_map_[parent_id].insert(node_id);
    rank_map_[node_id] = rank;
    known_ranks_.insert(rank);
  }
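  // For reference, nodes.dmp rows are "\t|\t"-delimited with a trailing
  // "\t|"; only the first three fields are used here. An illustrative line
  // (NCBI taxonomy):
  //
  //   2\t|\t131567\t|\tsuperkingdom\t|\t...\t|
  //
  // i.e. node ID 2 (Bacteria) has parent 131567 and rank "superkingdom".
  // names.dmp, parsed next, uses the same delimiting with the name in field
  // 2 and the name type in field 4.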
  while (getline(names_file, line)) {
    line.pop_back();  // discard trailing
    line.pop_back();  //   "\t|"
    size_t pos1, pos2;
    pos1 = 0;
    int field_ct = 0;
    bool finished = false;
    while (field_ct++ < 10 && ! finished) {  // tokenizing loop
      pos2 = line.find(delim, pos1);
      string token;
      if (pos2 == string::npos) {
        token = line.substr(pos1);
        finished = true;
      }
      else {
        token = line.substr(pos1, pos2 - pos1);
        pos1 = pos2 + delim.size();
      }

      switch (field_ct) {  // 1-based counting
        case 1 :  // node ID
          node_id = (uint64_t) stoul(token);
          if (node_id == 0)
            errx(EX_DATAERR, "attempt to create taxonomy w/ node ID == 0");
          break;
        case 2 :  // name
          name = token;
          break;
        case 4 :  // type of name - determines whether or not to record in map
          if (token == "scientific name")
            name_map_[node_id] = name;
          finished = true;
          break;
      }
    }  // end tokenizing loop
  }

  marked_nodes_.insert(1);  // mark root node
}

// Mark the given taxonomy node and all its unmarked ancestors
void NCBITaxonomy::MarkNode(uint64_t taxid) {
  while (marked_nodes_.count(taxid) == 0) {
    marked_nodes_.insert(taxid);
    taxid = parent_map_[taxid];
  }
}
void NCBITaxonomy::ConvertToKrakenTaxonomy(const char *filename) {
  Taxonomy taxo;
  TaxonomyNode zeroes_node = { 0, 0, 0, 0, 0, 0, 0 };

  taxo.node_count_ = marked_nodes_.size() + 1;  // +1 because 0 is illegal value
  taxo.nodes_ = new TaxonomyNode[taxo.node_count_];
  taxo.nodes_[0] = zeroes_node;

  string name_data;
  string rank_data;

  // Because so many of the node rank names are shared, we only store one copy
  // of each rank
  map<string, uint64_t> rank_offsets;
  for (string rank : known_ranks_) {
    rank_offsets[rank] = rank_data.size();
    rank_data.append(rank.c_str(), rank.size() + 1);
  }

  uint64_t internal_node_id = 0;
  map<uint64_t, uint64_t> external_id_map;  // keys: ext. ID, values: int. ID
  external_id_map[0] = 0;
  external_id_map[1] = 1;  // 1 is root in both NCBI and Kraken taxonomies

  // Breadth-first search through NCBI taxonomy, assigning internal IDs
  // in sequential order as nodes are encountered via BFS.
  queue<uint64_t> bfs_queue;
  bfs_queue.push(1);
  while (! bfs_queue.empty()) {
    ++internal_node_id;
    uint64_t external_node_id = bfs_queue.front();
    bfs_queue.pop();
    external_id_map[external_node_id] = internal_node_id;

    TaxonomyNode node = zeroes_node;  // just to initialize node to zeros
    node.parent_id = external_id_map[ parent_map_[external_node_id] ];
    node.external_id = external_node_id;
    node.rank_offset = rank_offsets[ rank_map_[external_node_id] ];
    node.name_offset = name_data.size();
    node.first_child = internal_node_id + 1 + bfs_queue.size();
    for (uint64_t child_node : child_map_[external_node_id]) {
      // Only add marked nodes to our internal tree
      if (marked_nodes_.count(child_node) > 0) {
        bfs_queue.push(child_node);
        node.child_count++;
      }
    }
    taxo.nodes_[internal_node_id] = node;

    string name = name_map_[external_node_id];
    name_data.append(name.c_str(), name.size() + 1);
  }  // end BFS while loop

  taxo.rank_data_ = new char[ rank_data.size() ];
  memcpy(taxo.rank_data_, rank_data.data(), rank_data.size());
  taxo.rank_data_len_ = rank_data.size();

  taxo.name_data_ = new char[ name_data.size() ];
  memcpy(taxo.name_data_, name_data.data(), name_data.size());
  taxo.name_data_len_ = name_data.size();

  taxo.WriteToDisk(filename);
}

Taxonomy::Taxonomy(const string &filename, bool memory_mapping) {
  Init(filename.c_str(), memory_mapping);
}

Taxonomy::Taxonomy(const char *filename, bool memory_mapping) {
  Init(filename, memory_mapping);
}

void Taxonomy::Init(const char *filename, bool memory_mapping) {
  if (memory_mapping) {
    taxonomy_data_file_.OpenFile(filename);
    file_backed_ = true;
    char *ptr = taxonomy_data_file_.fptr();
    if (strncmp(FILE_MAGIC, ptr, strlen(FILE_MAGIC)) != 0) {
      errx(EX_DATAERR, "attempt to load taxonomy from malformed file %s",
           filename);
    }
    ptr += strlen(FILE_MAGIC);
    memcpy((char *) &node_count_, ptr, sizeof(node_count_));
    ptr += sizeof(node_count_);
    memcpy((char *) &name_data_len_, ptr, sizeof(name_data_len_));
    ptr += sizeof(name_data_len_);
    memcpy((char *) &rank_data_len_, ptr, sizeof(rank_data_len_));
    ptr += sizeof(rank_data_len_);
    nodes_ = (TaxonomyNode *) ptr;
    ptr += sizeof(*nodes_) * node_count_;
    name_data_ = ptr;
    ptr += name_data_len_;
    rank_data_ = ptr;
  }
  else {
    std::ifstream ifs(filename);
    file_backed_ = false;
    char magic[strlen(FILE_MAGIC) + 1];
    memset(magic, 0, strlen(FILE_MAGIC) + 1);
    ifs.read(magic, strlen(FILE_MAGIC));
    if (strcmp(magic, FILE_MAGIC) != 0)
      errx(EX_DATAERR, "malformed taxonomy file %s", filename);
    ifs.read((char *) &node_count_, sizeof(node_count_));
    ifs.read((char *) &name_data_len_, sizeof(name_data_len_));
    ifs.read((char *) &rank_data_len_, sizeof(rank_data_len_));
    nodes_ = new TaxonomyNode[node_count_];
    ifs.read((char *) nodes_, sizeof(*nodes_) * node_count_);
    name_data_ = new char[name_data_len_];
    ifs.read((char *) name_data_, name_data_len_);
    rank_data_ = new char[rank_data_len_];
    ifs.read((char *) rank_data_, rank_data_len_);
    if (! ifs)
      errx(EX_DATAERR, "read exhausted taxonomy information in %s", filename);
  }
}

Taxonomy::~Taxonomy() {
  // If file backed, deleting would be... bad.
  if (! file_backed_) {
    delete[] nodes_;
    delete[] name_data_;
    delete[] rank_data_;
  }
}
// Logic here depends on higher nodes having smaller IDs
// Idea: advance B tracker up tree, A is ancestor iff B tracker hits A
bool Taxonomy::IsAAncestorOfB(uint64_t a, uint64_t b) {
  if (! a || ! b)
    return false;
  while (b > a) {
    b = nodes_[b].parent_id;
  }
  return b == a;
}

// Logic here depends on higher nodes having smaller IDs
// Idea: track two nodes, advance lower tracker up tree, trackers meet @ LCA
uint64_t Taxonomy::LowestCommonAncestor(uint64_t a, uint64_t b) {
  if (! a || ! b)  // LCA(x,0) = LCA(0,x) = x
    return a ? a : b;
  while (a != b) {
    if (a > b)
      a = nodes_[a].parent_id;
    else
      b = nodes_[b].parent_id;
  }
  return a;
}

// Dump binary data to file
void Taxonomy::WriteToDisk(const char *filename) {
  ofstream taxo_file(filename);
  taxo_file.write(FILE_MAGIC, strlen(FILE_MAGIC));
  taxo_file.write((char *) &node_count_, sizeof(node_count_));
  taxo_file.write((char *) &name_data_len_, sizeof(name_data_len_));
  taxo_file.write((char *) &rank_data_len_, sizeof(rank_data_len_));
  taxo_file.write((char *) nodes_, sizeof(*nodes_) * node_count_);
  taxo_file.write(name_data_, name_data_len_);
  taxo_file.write(rank_data_, rank_data_len_);
  if (! taxo_file.good())
    errx(EX_OSERR, "error writing taxonomy to %s", filename);
  taxo_file.close();
}

void Taxonomy::GenerateExternalToInternalIDMap() {
  for (size_t i = 1; i < node_count_; i++) {
    external_to_internal_id_map_[nodes_[i].external_id] = i;
  }
}

}  // end namespace
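// A worked example of the LCA logic above. Internal IDs are assigned by BFS
// in ConvertToKrakenTaxonomy, so every parent has a smaller ID than its
// children; repeatedly advancing the larger ID to its parent therefore
// converges on the LCA. With a toy parent array {0, 0, 1, 1, 2} (node 1 is
// root with children 2 and 3; node 4 is under node 2):
//
//   LowestCommonAncestor(4, 3): 4 > 3, so 4 -> parent 2; 3 > 2, so 3 -> 1;
//   2 > 1, so 2 -> 1; both trackers now equal 1 (root), which is returned.
//
// LCA(x, 0) and LCA(0, x) return x, since taxid 0 means "no assignment yet"
// and must not pull classifications toward the root.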
// FILE: kraken2-2.0.8-beta/src/taxonomy.h

/*
 * Copyright 2013-2019, Derrick Wood
 *
 * This file is part of the Kraken 2 taxonomic sequence classification system.
 */

#ifndef KRAKEN2_TAXONOMY_H_
#define KRAKEN2_TAXONOMY_H_

#include "mmap_file.h"
#include "kraken2_headers.h"

namespace kraken2 {

// ID of a taxonomy node is implicit from its position in the node array
struct TaxonomyNode {
  uint64_t parent_id;     // Must be lower-numbered node
  uint64_t first_child;   // Must be higher-numbered node
  uint64_t child_count;   // Children of a node are in contiguous block
  uint64_t name_offset;   // Location of name in name data super-string
  uint64_t rank_offset;   // Location of rank in rank data super-string
  uint64_t external_id;   // Taxonomy ID for reporting purposes (usually NCBI)
  uint64_t godparent_id;  // Reserved for future use to enable faster traversal
};

class NCBITaxonomy {
  public:
  NCBITaxonomy(std::string nodes_filename, std::string names_filename);

  void MarkNode(uint64_t taxid);
  void ConvertToKrakenTaxonomy(const char *filename);

  private:
  std::map<uint64_t, uint64_t> parent_map_;
  std::map<uint64_t, std::string> name_map_;
  std::map<uint64_t, std::string> rank_map_;
  std::map<uint64_t, std::set<uint64_t>> child_map_;
  std::set<uint64_t> marked_nodes_;
  std::set<std::string> known_ranks_;
};

class Taxonomy {
  public:
  Taxonomy(const std::string &filename, bool memory_mapping=false);
  Taxonomy(const char *filename, bool memory_mapping=false);
  Taxonomy() : file_backed_(false), nodes_(nullptr), node_count_(0),
      name_data_(nullptr), name_data_len_(0), rank_data_(nullptr),
      rank_data_len_(0) { }
  ~Taxonomy();

  inline const TaxonomyNode *nodes() const { return nodes_; }
  inline const char *name_data() const { return name_data_; }
  inline const char *rank_data() const { return rank_data_; }
  inline size_t node_count() const { return node_count_; }

  bool IsAAncestorOfB(uint64_t a, uint64_t b);
  uint64_t LowestCommonAncestor(uint64_t a, uint64_t b);
  void WriteToDisk(const char *filename);
  void MoveToMemory();
  void GenerateExternalToInternalIDMap();
  uint64_t GetInternalID(uint64_t external_id) {
    return external_to_internal_id_map_[external_id];
  }

  private:
  void Init(const char *filename, bool memory_mapping);

  char const * const FILE_MAGIC = "K2TAXDAT";

  MMapFile taxonomy_data_file_;
  bool file_backed_;
  TaxonomyNode *nodes_;
  size_t node_count_;
  char *name_data_;
  size_t name_data_len_;
  char *rank_data_;
  size_t rank_data_len_;
  std::unordered_map<uint64_t, uint64_t> external_to_internal_id_map_;

  friend void NCBITaxonomy::ConvertToKrakenTaxonomy(const char *filename);
};

}

#endif  // KRAKEN2_TAXONOMY_H_

// FILE: kraken2-2.0.8-beta/src/utilities.cc

/*
 * Copyright 2013-2019, Derrick Wood
 *
 * This file is part of the Kraken 2 taxonomic sequence classification system.
 */

#include "utilities.h"

using std::vector;
using std::string;

namespace kraken2 {

void ExpandSpacedSeedMask(uint64_t &spaced_seed_mask,
    const int bit_expansion_factor)
{
  uint64_t new_mask = 0;
  uint64_t bits = 1 << bit_expansion_factor;
  bits--;

  for (int i = 64 / bit_expansion_factor - 1; i >= 0; i--) {
    new_mask <<= bit_expansion_factor;
    if ((spaced_seed_mask >> i) & 1)
      new_mask |= bits;
  }

  spaced_seed_mask = new_mask;
}

vector<string> SplitString(const string &str, const string &delim,
    const size_t max_fields)
{
  vector<string> output;
  size_t pos1, pos2;
  pos1 = 0;
  size_t field_ct = 0;
  bool finished = false;
  while (field_ct++ < max_fields && ! finished) {  // tokenizing loop
    pos2 = str.find(delim, pos1);
    string token;
    if (pos2 == string::npos) {
      token = str.substr(pos1);
      finished = true;
    }
    else {
      token = str.substr(pos1, pos2 - pos1);
      pos1 = pos2 + delim.size();
    }
    output.push_back(token);
  }
  return output;
}

}

// FILE: kraken2-2.0.8-beta/src/utilities.h

/*
 * Copyright 2013-2019, Derrick Wood
 *
 * This file is part of the Kraken 2 taxonomic sequence classification system.
 */

#ifndef KRAKEN2_UTILITIES_H_
#define KRAKEN2_UTILITIES_H_

#include "kraken2_headers.h"

// Functions used by 2+ programs that I couldn't think of a better place for.

namespace kraken2 {

// Turns a simple bitstring of 0s and 1s into one where the 1s and 0s are
// expanded, e.g. 010110 expanded by a factor of 2 would become 001100111100
// Allows specification of the spaced seed to represent positions in the
// sequence rather than actual bits in the internal representation
void ExpandSpacedSeedMask(uint64_t &spaced_seed_mask,
    const int bit_expansion_factor);

std::vector<std::string> SplitString(const std::string &str,
    const std::string &delim = "\t",
    const size_t max_fields = (size_t) -1);

}

#endif
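// A quick check of ExpandSpacedSeedMask, using the example from its own
// documentation above (illustrative only, not part of the original archive):

#include <cassert>
#include "utilities.h"

static void ExpandSpacedSeedMaskExample() {
  uint64_t mask = 0x16;                    // 0b010110: one bit per base position
  kraken2::ExpandSpacedSeedMask(mask, 2);  // 2 bits per nucleotide
  assert(mask == 0x33C);                   // 0b001100111100: each bit doubled in place
}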