pax_global_header 0000666 0000000 0000000 00000000064 13460230711 0014507 g ustar 00root root 0000000 0000000 52 comment=4f73d69e75088e0e39021d1c6ab214a3c7b409ca
kraken2-2.0.8-beta/ 0000775 0000000 0000000 00000000000 13460230711 0013764 5 ustar 00root root 0000000 0000000 kraken2-2.0.8-beta/.gitignore 0000664 0000000 0000000 00000000140 13460230711 0015747 0 ustar 00root root 0000000 0000000 *.o
src/build_db
src/classify
src/estimate_capacity
src/dump_table
src/lookup_accession_numbers
kraken2-2.0.8-beta/CHANGELOG.md 0000664 0000000 0000000 00000007331 13460230711 0015601 0 ustar 00root root 0000000 0000000 # Changelog
## Unreleased
## [2.0.8] - 2019-04-25 (beta)
### Added
- FTP downloading option for taxonomy/libraries (--use-ftp for kraken2-build)
- Option to skip downloading taxonomy maps
### Changed
- Added lookup table to speed up parsing in MinimizerScanner class
- Default parameters for minimizer lengths and spaces changed (spaces=7 for
nucleotide search, length=12 for translated search)
### Fixed
- Linked space expansion value for proteins to constant used by MinimizerScanner
- Reporting of taxids in classified-out sequence files
- Confidence scoring bug associated with failure to leave some sequences
unclassified
- Reverse complement shifting bug, code made backwards-compatible with
existing databases (newly created DBs will have fix)
- NCBI taxonomy download error due to removal of EST/GSS files
## [2.0.7] - 2018-08-11 (beta)
### Added
- Disk usage info to kraken2-build --clean
- Memory allocation error message for hash table
- Option for --max-db-size and hash downsampling
- Multithreading to kraken2-inspect
### Changed
- Move to /usr/bin/env for perl scripts
- Add DB loading message to keep people from killing processes early
- Add flag files for resuming download of nucleotide accession map data
- Converted lookup_accession_numbers script into C++ program w/ memory mapping
- Clarified in manual that one or more libraries allowed for custom DBs
- Silenced progress messages in C++ programs for non-TTY stderr
- Taxonomy downloads switched to rsync from wget (ftp)
- Removed '%' from reports
### Fixed
- Allow d/l of protozoa library w/ kraken2-build script
- Filenames for SILVA database taxonomy info
- Typo in manual for output format example
- Corrected default space count in manual
- Removed obvious race condition in --add-to-library functionality
- Corrected behavior of --classified-out and --unclassified-out (no longer
forcing .fq/.fa file extensions, respecting '#' in paired mode)
- Usage message in kraken2-inspect
- Taxonomy creation for 16S databases
## [2.0.6] - 2018-06-13 (beta)
### Changed
- New DB summary info printed out w/ inspect script + --skip-counts option
### Fixed
- Now stripping carriage returns and other trailing whitespace from sequence
data
- Treating l-mers immediately following ambiguous characters as ambiguous
until a full k-mer is processed
- Bug in expansion of spaced seed masks that left spaces at end
## [2.0.5] - 2018-05-21 (beta)
### Added
- New kraken2-inspect script to report minimizer counts per taxon
### Changed
- Kraken 2X build now adds terminators to all reference sequences
## [2.0.4] - 2018-05-06 (beta)
### Fixed
- Improved portability to older g++ by removing initialization of
variable-length string.
## [2.0.3] - 2018-02-12 (alpha)
### Added
- Reporting options to kraken2 script (like Kraken 1's kraken-report and
kraken-mpa-report)
### Changed
- Made loading to RAM default option, added --memory-mapping option to kraken2
## [2.0.2] - 2018-02-04 (alpha)
### Added
- Low base quality masking option
### Changed
- Moved low-complexity masking to library download/addition, out of build
process
- Made no masking default for human genome in standard installation
## [2.0.1] - 2018-01-01 (alpha)
### Added
- Low-complexity sequence masking as a default
- UniVec/UniVec_Core databases to supported downloads
- UniVec_Core & human in standard Kraken 2 DB
- 16S DB support (Greengenes, Silva, RDP)
- --use-names flag for kraken2 script
- Priority queue to ensure classifier output order matches input order when
multi-threading
- Changelog
### Changed
- Reduced amino acid alphabet (requires rebuild of old protein DBs)
- Operating manual
### Fixed
- kraken2 now allows compression & paired processing at same time
## [2.0.0] - 2017-12-04 (alpha, initial release)
kraken2-2.0.8-beta/LICENSE 0000664 0000000 0000000 00000002074 13460230711 0014774 0 ustar 00root root 0000000 0000000 The MIT License (MIT)
Copyright (c) 2017-2018 Derrick Wood
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
kraken2-2.0.8-beta/README.md 0000664 0000000 0000000 00000000273 13460230711 0015245 0 ustar 00root root 0000000 0000000 kraken2
=======
The second version of the Kraken taxonomic sequence classification system
Please refer to the Operating Manual (in docs/MANUAL.html) for details on
how to use Kraken 2.
kraken2-2.0.8-beta/docs/ 0000775 0000000 0000000 00000000000 13460230711 0014714 5 ustar 00root root 0000000 0000000 kraken2-2.0.8-beta/docs/MANUAL.html 0000664 0000000 0000000 00000125606 13460230711 0016571 0 ustar 00root root 0000000 0000000
Kraken is a taxonomic sequence classifier that assigns taxonomic labels to DNA sequences. Kraken examines the k-mers within a query sequence and uses the information within those k-mers to query a database. That database maps k-mers to the lowest common ancestor (LCA) of all genomes known to contain a given k-mer.
The first version of Kraken used a large indexed and sorted list of k-mer/LCA pairs as its database. While fast, the large memory requirements posed some problems for users, and so Kraken 2 was created to provide a solution to those problems.
Kraken 2 differs from Kraken 1 in several important ways:
Only minimizers of the k-mers in the query sequences are used as database queries. Similarly, only minimizers of the k-mers in the reference sequences in the database's genomic library are stored in the database. We will also refer to the minimizers as ℓ-mers, where ℓ ≤ k. All k-mers are considered to have the same LCA as their minimizer's database LCA value.
Kraken 2 uses a compact hash table that is a probabilistic data structure. This means that, occasionally, a database query will fail either by returning the wrong LCA or by reporting a hit (rather than a search failure) for a minimizer that was never actually stored in the database. By incurring the risk of these false positives in the data structure, Kraken 2 is able to achieve faster speeds and lower memory requirements. Users should be aware that database false positive errors occur in less than 1% of queries, and can be compensated for by use of confidence scoring thresholds.
Kraken 2 has the ability to build a database from amino acid sequences and perform a translated search of the query sequences against that database.
Kraken 2 utilizes spaced seeds in the storage and querying of minimizers to improve classification accuracy.
Kraken 2 provides support for "special" databases that are not based on NCBI's taxonomy. These are currently limited to three popular 16S databases.
Because Kraken 2 only stores minimizers in its hash table, and k can be much larger than ℓ, only a small percentage of the possible ℓ-mers in a genomic library are actually deposited in the database. This creates a situation similar to the Kraken 1 "MiniKraken" databases; however, preliminary testing has shown the accuracy of a reduced Kraken 2 database to be quite similar to the full-sized Kraken 2 database, while Kraken 1's MiniKraken databases often resulted in a substantial loss of per-read sensitivity.
The Kraken 2 paper is currently under preparation. Until it is released, please cite the original Kraken paper if you use Kraken 2 in your research. Thank you!
System Requirements
Disk space: Construction of a Kraken 2 standard database requires approximately 100 GB of disk space. A test on 01 Jan 2018 of the default installation showed 42 GB of disk space was used to store the genomic library files, 26 GB was used to store the taxonomy information from NCBI, and 29 GB was used to store the Kraken 2 compact hash table.
As with Kraken 1, we strongly advise against using NFS storage to store the Kraken 2 database if at all possible.
Memory: To run efficiently, Kraken 2 requires enough free memory to hold the database (primarily the hash table) in RAM. While this can be accomplished with a ramdisk, Kraken 2 will by default load the database into process-local RAM; the --memory-mapping switch to kraken2 will avoid doing so. The default database size is 29 GB (as of Jan. 2018), and you will need slightly more than that in RAM if you want to build the default database.
Dependencies: Kraken 2 currently makes extensive use of Linux utilities such as sed, find, and wget. Many scripts are written using the Bash shell, and the main scripts are written using Perl. Core programs needed to build the database and run the classifier are written in C++11, and need to be compiled using a somewhat recent version of g++ that will support C++11. Multithreading is handled using OpenMP. Downloads of NCBI data are performed by wget and rsync. Most Linux systems will have all of the above listed programs and development libraries available either by default or via package download.
Unlike Kraken 1, Kraken 2 does not use an external k-mer counter. However, by default, Kraken 2 will attempt to use the dustmasker or segmasker programs provided as part of NCBI's BLAST suite to mask low-complexity regions (see Masking of Low-complexity Sequences).
MacOS NOTE: MacOS and other non-Linux operating systems are not explicitly supported by the developers, and MacOS users should refer to the Kraken-users group for support in installing the appropriate utilities to allow for full operation of Kraken 2. We will attempt to use MacOS-compliant code when possible, but development and testing time is at a premium and we cannot guarantee that Kraken 2 will install and work to its full potential on a default installation of MacOS.
In particular, we note that the default MacOS X installation of GCC does not have support for OpenMP. Without OpenMP, Kraken 2 is limited to single-threaded operation, resulting in slower build and classification runtimes.
Network connectivity: Kraken 2's standard database build and download commands expect unfettered FTP and rsync access to the NCBI FTP server. If you're working behind a proxy, you may need to set certain environment variables (such as ftp_proxy or RSYNC_PROXY) in order to get these commands to work properly.
Kraken 2's scripts default to using rsync for most downloads; however, you may find that your network situation prevents use of rsync. In such cases, you can try the --use-ftp option to kraken2-build to force the downloads to occur via FTP.
MiniKraken: At present, users with low-memory computing environments can replicate the "MiniKraken" functionality of Kraken 1 in two ways: first, by increasing the value of k with respect to ℓ (using the --kmer-len and --minimizer-len options to kraken2-build); and secondly, through downsampling of minimizers (from both the database and query sequences) using a hash function. This second option is performed if the --max-db-size option to kraken2-build is used; however, the two options are not mutually exclusive. In a difference from Kraken 1, Kraken 2 does not require building a full database and then shrinking it to obtain a reduced database.
Installation
To begin using Kraken 2, you will first need to install it, and then either download or create a database.
Kraken 2 consists of two main scripts (kraken2 and kraken2-build), along with several programs and smaller scripts. As part of the installation process, all scripts and programs are installed in the same directory. After installation, you can move the main scripts elsewhere, but moving the other scripts and programs requires editing the scripts and changing the $KRAKEN2_DIR variables in the main scripts.
Once an install directory is selected, you need to run the following command in the directory where you extracted the Kraken 2 source:
./install_kraken2.sh $KRAKEN2_DIR
(Replace $KRAKEN2_DIR above with the directory where you want to install Kraken 2's programs/scripts.)
The install_kraken2.sh script should compile all of Kraken 2's code and set up your Kraken 2 program directory. Installation is successful if you see the message "Kraken 2 installation complete."
Once installation is complete, you may want to copy the main Kraken 2 scripts into a directory found in your PATH variable (e.g., "$HOME/bin"):
cp $KRAKEN2_DIR/kraken2{,-build,-inspect} $HOME/bin
After installation, you're ready to either create or download a database.
Kraken 2 Databases
A Kraken 2 database is a directory containing at least 3 files:
hash.k2d: Contains the minimizer to taxon mappings
opts.k2d: Contains information about the options used to build the database
taxo.k2d: Contains taxonomy information used to build the database
None of these three files are in a human-readable format. Other files may also be present as part of the database build process, and can, if desired, be removed after a successful build of the database.
In interacting with Kraken 2, you should not have to directly reference any of these files, but rather simply provide the name of the directory in which they are stored. Kraken 2 allows both the use of a standard database as well as custom databases; these are described in the sections Standard Kraken 2 Database and Custom Databases below, respectively.
Standard Kraken 2 Database
To create the standard Kraken 2 database, you can use the following command:
kraken2-build --standard --db $DBNAME
(Replace "$DBNAME" above with your preferred database name/location. Please note that the database will use approximately 100 GB of disk space during creation, with the majority of that being reference sequences or taxonomy mapping information that can be removed after the build.)
This will download NCBI taxonomic information, as well as the complete genomes in RefSeq for the bacterial, archaeal, and viral domains, along with the human genome and a collection of known vectors (UniVec_Core). After downloading all this data, the build process begins; this can be the most time-consuming step. If you have multiple processing cores, you can run this process with multiple threads, e.g.:
kraken2-build --standard --threads 24 --db $DBNAME
Using 32 threads on an AWS EC2 r4.8xlarge instance with 16 dual-core hyperthreaded 2.30 GHz CPUs and 244 GB of RAM, the build process took approximately 35 minutes in Jan. 2018.
The build process itself has two main steps, each of which requires passing over the contents of the reference library:
Estimation of the capacity needed in the Kraken 2 compact hash table. This uses a low-memory method to reliably estimate the number of minimizers present in the reference library given the selected parameters k and ℓ.
Population of the hash table (and conversion of the taxonomy to an internal format). This step is a second pass over the reference library to find minimizers and then place them in the database.
(There is one other preliminary step where sequence IDs are mapped to taxonomy IDs, but this is usually a rather quick process and is mostly handled during library downloading.)
Unlike Kraken 1's build process, Kraken 2 does not perform checkpointing after the estimation step. This is because the estimation step is dependent on the selected k and ℓ values, and if the population step fails, it is likely because k needs to be increased (reducing the overall memory requirements).
Classification
To classify a set of sequences, use the kraken2 command:
kraken2 --db $DBNAME seqs.fa
Output will be sent to standard output by default. The files containing the sequences to be classified should be specified on the command line. Sequences can also be provided through standard input using the special filename /dev/fd/0.
The kraken2 program allows several different options:
Multithreading: Use the --threads NUM switch to use multiple threads.
Quick operation: Rather than searching all ℓ-mers in a sequence, stop classification after the first database hit; use --quick to enable this mode. Note that --min-hits will allow you to require multiple hits before declaring a sequence classified, which can be especially useful with custom databases when testing to see if sequences either do or do not belong to a particular genome.
Sequence filtering: Classified or unclassified sequences can be sent to a file for later processing, using the --classified-out and --unclassified-out switches, respectively.
Output redirection: Output can be directed using standard shell redirection (| or >), or using the --output switch.
FASTQ input: Input is normally expected to be in FASTA format, but you can classify FASTQ data using the --fastq-input switch.
Compressed input: Kraken 2 can handle gzip and bzip2 compressed files as input by specifying the proper switch of --gzip-compressed or --bzip2-compressed.
Input format auto-detection: If regular files are specified on the command line as input, Kraken 2 will attempt to determine the format of your input prior to classification. You can disable this by explicitly specifying --fasta-input, --fastq-input, --gzip-compressed, and/or --bzip2-compressed as appropriate. Note that use of the character device file /dev/fd/0 to read from standard input (aka stdin) will not allow auto-detection.
Paired reads: Kraken 2 provides an enhancement over Kraken 1 in its handling of paired read data. Rather than needing to concatenate the pairs together with an N character between the reads, Kraken 2 is able to process the mates individually while still recognizing the pairing information. Using the --paired option to kraken2 will indicate to kraken2 that the input files provided are paired read data, and data will be read from the pairs of files concurrently.
Usage of --paired also affects the --classified-out and --unclassified-out options; users should provide a # character in the filenames provided to those options, which will be replaced by kraken2 with "_1" and "_2" with mates spread across the two files appropriately. For example:
kraken2 --paired --classified-out cseqs#.fq seqs_1.fq seqs_2.fq
will put the first reads from classified pairs in cseqs_1.fq, and the second reads from those pairs in cseqs_2.fq.
To get a full list of options, use kraken2 --help.
Output Formats
Standard Kraken Output Format
Each sequence (or sequence pair, in the case of paired reads) classified by Kraken 2 results in a single line of output. Kraken 2's output lines contain five tab-delimited fields; from left to right, they are:
"C"/"U": a one letter code indicating that the sequence was either classified or unclassified.
The sequence ID, obtained from the FASTA/FASTQ header.
The taxonomy ID Kraken 2 used to label the sequence; this is 0 if the sequence is unclassified.
The length of the sequence in bp. In the case of paired read data, this will be a string containing the lengths of the two sequences in bp, separated by a pipe character, e.g. "98|94".
A space-delimited list indicating the LCA mapping of each k-mer in the sequence(s). For example, "562:13 561:4 A:31 0:1 562:3" would indicate that:
the first 13 k-mers mapped to taxonomy ID #562
the next 4 k-mers mapped to taxonomy ID #561
the next 31 k-mers contained an ambiguous nucleotide
the next k-mer was not in the database
the last 3 k-mers mapped to taxonomy ID #562
Note that paired read data will contain a "|:|" token in this list to indicate the end of one read and the beginning of another.
When Kraken 2 is run against a protein database (see Translated Search), the LCA hitlist will contain the results of querying all six frames of each sequence. Reading frame data is separated by a "-:-" token.
Kraken 1 offered a kraken-translate and kraken-report script to change the output into different formats. Through the use of kraken2 --use-names, Kraken 2 will replace the taxonomy ID column with the scientific name and the taxonomy ID in parentheses (e.g., "Bacteria (taxid 2)" instead of "2"), yielding similar functionality to Kraken 1's kraken-translate script. The sample report functionality now exists as part of the kraken2 script, with the use of the --report option; the sample report formats are described below.
Sample Report Output Format
Like Kraken 1, Kraken 2 offers two formats of sample-wide results. Kraken 2's standard sample report format is tab-delimited with one line per taxon. The fields of the output, from left-to-right, are as follows:
1. Percentage of fragments covered by the clade rooted at this taxon
2. Number of fragments covered by the clade rooted at this taxon
3. Number of fragments assigned directly to this taxon
4. A rank code, indicating (U)nclassified, (R)oot, (D)omain, (K)ingdom,
(P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies.
Taxa that are not at any of these 10 ranks have a rank code that is
formed by using the rank code of the closest ancestor rank with
a number indicating the distance from that rank. E.g., "G2" is a
rank code indicating a taxon is between genus and species and the
grandparent taxon is at the genus rank.
5. NCBI taxonomic ID number
6. Indented scientific name
The scientific names are indented using spaces, according to the tree structure specified by the taxonomy.
By default, taxa with no reads assigned to (or under) them will not have any output produced. However, if you wish to have all taxa displayed, you can use the --report-zero-counts switch to do so. This can be useful if you are looking to do further downstream analysis of the reports, and want to compare samples. Sorting by the taxonomy ID (using sort -k5,5n) can provide a consistent line ordering between reports.
In addition, we also provide the option --use-mpa-style that can be used in conjunction with --report. This option provides output in a format similar to MetaPhlAn's output. The output with this option provides one taxon per line, with a lowercase version of the rank codes in Kraken 2's standard sample report format (except for 'U' and 'R'), two underscores, and the scientific name of the taxon (e.g., "d__Viruses"). The full taxonomy of each taxon (at the eight ranks considered) is given, with each rank's name separated by a pipe character (e.g., "d__Viruses|o__Caudovirales"). Following this version of the taxon's scientific name is a tab and the number of fragments assigned to the clade rooted at that taxon.
Translated Search
Kraken 2 allows users to perform a six-frame translated search, similar to the well-known BLASTX program. To do this, Kraken 2 uses a reduced 15 amino acid alphabet and stores amino acid minimizers in its database. LCA results from all 6 frames are combined to yield a set of LCA hits, which is then resolved in the same manner as in Kraken's normal operation.
To build a protein database, the --protein option should be given to kraken2-build (either along with --standard, or with all steps if building a custom database).
Custom Databases
We realize the standard database may not suit everyone's needs. Kraken 2 also allows creation of customized databases.
To build a custom database:
Install a taxonomy. Usually, you will just use the NCBI taxonomy, which you can easily download using:
kraken2-build --download-taxonomy --db $DBNAME
This will download the accession number to taxon maps, as well as the taxonomic name and tree information from NCBI. These files can be found in $DBNAME/taxonomy/ . If you need to modify the taxonomy, edits can be made to the names.dmp and nodes.dmp files in this directory; you may also need to modify the *.accession2taxid files appropriately.
Some of the standard sets of genomic libraries have taxonomic information associated with them, and don't need the accession number to taxon maps to build the database successfully. These libraries include all those available through the --download-library option (see next point), except for the plasmid and non-redundant databases. If you are not using custom sequences (see the --add-to-library option) and are not using one of the plasmid or non-redundant database libraries, you may want to skip downloading of the accession number to taxon maps. This can be done by passing --skip-maps to the kraken2-build --download-taxonomy command.
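Install one or more reference libraries. Several sets of standard genomes/proteins are made easily available through kraken2-build. To download and install any one of these, use the --download-library switch; multiple libraries can be installed into a database prior to building by issuing multiple kraken2-build --download-library commands, e.g.:
kraken2-build --download-library archaea --db $DBNAME
kraken2-build --download-library viral --db $DBNAME
The above commands would prepare a database that would contain archaeal and viral genomes; the --build option (see below) will still need to be used after downloading these libraries to actually build the database, however.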
(Note that downloading nr or env_nr requires use of the --protein option, and that UniVec and UniVec_Core are incompatible with the --protein option.)
Other genomes can also be added, but such genomes must meet certain requirements:
Sequences must be in a FASTA file (multi-FASTA is allowed)
Each sequence's ID (the string between the > and the first whitespace character on the header line) must contain either an NCBI accession number to allow Kraken 2 to look up the correct taxa, or an explicit assignment of the taxonomy ID using kraken:taxid (see below).
Sequences not downloaded from NCBI may need their taxonomy information assigned explicitly. This can be done using the string kraken:taxid|XXX in the sequence ID, with XXX replaced by the desired taxon ID. For example, to put a known adapter sequence in taxon 32630 ("synthetic construct"), you could use the following:
>sequence16|kraken:taxid|32630  Adapter sequence
CAAGCAGAAGACGGCATACGAGATCTTCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
The kraken:taxid string must begin the sequence ID or be immediately preceded by a pipe character (|). Explicit assignment of taxonomy IDs in this manner will override the accession number mapping provided by NCBI.
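If your genomes meet the requirements above, then you can add each sequence to your database's genomic library using the --add-to-library switch, e.g.:
kraken2-build --add-to-library chr1.fa --db $DBNAME
kraken2-build --add-to-library chr2.fa --db $DBNAME
If you have many files to add, you can add every *.fa file found under a directory (here, genomes/) with find and xargs:
find genomes/ -name '*.fa' -print0 | xargs -0 -I{} -n1 kraken2-build --add-to-library {} --db $DBNAME
(You may also find the -P option to xargs useful to add many files in parallel if you have multiple processors.)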
Once your library is finalized, you need to build the database. This can be done with the command:
kraken2-build --build --db $DBNAME
The --threads option is also helpful here to reduce build time.
By default, the values of k and ℓ are 35 and 31, respectively (or 15 and 12 for protein databases). These values can be explicitly set with the --kmer-len and --minimizer-len options, however. Note that the minimizer length must be no more than 31 for nucleotide databases, and 15 for protein databases. Additionally, the minimizer length ℓ must be no more than the k-mer length. There is no upper bound on the value of k, but sequences less than k bp in length cannot be classified.
Kraken 2 also utilizes a simple spaced seed approach to increase accuracy. A number s < ℓ/4 can be chosen, and s positions in the minimizer will be masked out during all comparisons. Masked positions are chosen to alternate from the second-to-last position in the minimizer; e.g., s = 5 and ℓ = 31 will result in masking out the 0 positions shown here:
111 1111 1111 1111 1111 1101 0101 0101
By default, s = 7 for nucleotide databases, and s = 0 for protein databases. This can be changed using the --minimizer-spaces option along with the --build task of kraken2-build.
A full list of options for kraken2-build can be obtained using kraken2-build --help.
After building a database, if you want to reduce the disk usage of the database, you can use the --clean option for kraken2-build to remove intermediate files from the database directory.
Masking of Low-complexity Sequences
Low-complexity sequences, e.g. "ACACACACACACACACACACACACAC", are known to occur in many different organisms and are typically less informative for use in alignments; the BLAST programs often mask these sequences by default. Using this masking can help prevent false positives in Kraken 2's results, and so we have added this functionality as a default option to Kraken 2's library download/addition process.
Kraken 2 uses two programs to perform low-complexity sequence masking, both available from NCBI: dustmasker, for nucleotide sequences, and segmasker, for amino acid sequences. These programs are available as part of the NCBI BLAST+ suite. If these programs are not installed on the local system and in the user's PATH when trying to use kraken2-build, the database build will fail. Users who do not wish to install these programs can use the --no-masking option to kraken2-build in conjunction with any of the --download-library, --add-to-library, or --standard options; use of the --no-masking option will skip masking of low-complexity sequences during the build of the Kraken 2 database.
Special Databases
To support some common use cases, we provide the ability to build Kraken 2 databases using data from various external databases. These external databases may not follow the NCBI taxonomy, and so we've provided mechanisms to automatically create a taxonomy that will work with Kraken 2 (although such taxonomies may not be identical to NCBI's).
To build one of these "special" Kraken 2 databases, use the following command:
kraken2-build --db $DBNAME --special TYPE
where the TYPE string is one of the database names listed below.
At present, the "special" Kraken 2 database support we provide is limited to pre-packaged solutions for some public 16S sequence databases, but this may grow in the future.
16S Databases
For targeted 16S sequencing projects, a normal Kraken 2 database using whole genome data may use more resources than necessary. A Kraken 2 database created from a well-curated genomic library of just 16S data can provide both a more efficient solution as well as a more accurate set of predictions for such projects. We provide support for building Kraken 2 databases from three publicly available 16S databases:
Greengenes (Kraken 2 database name: greengenes), using all available 16S data.
RDP (Kraken 2 database name: rdp), using the bacterial and archaeal 16S data.
SILVA (Kraken 2 database name: silva), using the Small subunit NR99 sequence set.
Note that these databases may have licensing restrictions regarding their data, and it is your responsibility to ensure you are in compliance with those restrictions; please visit the databases' websites for further details. The kraken2-build script only uses publicly available URLs to download data and then converts that data into a form compatible for use with Kraken 2.
Furthermore, if you use one of these databases in your research, please visit the corresponding database's website to determine the appropriate and up-to-date citation.
Confidence Scoring
At present, we have not yet developed a confidence score with a probabilistic interpretation for Kraken 2. However, we have developed a simple scoring scheme that has yielded good results for us, and we've made that available in Kraken 2 through use of the --confidence option to kraken2. The approach we use allows a user to specify a threshold score in the [0,1] interval; the classifier then will adjust labels up the tree until the label's score (described below) meets or exceeds that threshold. If a label at the root of the taxonomic tree would not have a score meeting or exceeding the threshold, the sequence is called unclassified by Kraken 2 when this threshold is applied.
A sequence label's score is a fraction C/Q, where C is the number of k-mers mapped to LCA values in the clade rooted at the label, and Q is the number of k-mers in the sequence that lack an ambiguous nucleotide (i.e., they were queried against the database). Consider the example of the LCA mappings in Kraken 2's output given earlier:
"562:13 561:4 A:31 0:1 562:3" would indicate that:
the first 13 k-mers mapped to taxonomy ID #562
the next 4 k-mers mapped to taxonomy ID #561
the next 31 k-mers contained an ambiguous nucleotide
the next k-mer was not in the database
the last 3 k-mers mapped to taxonomy ID #562
In this case, ID #561 is the parent node of #562. Here, a label of #562 for this sequence would have a score of C/Q = (13+3)/(13+4+1+3) = 16/21. A label of #561 would have a score of C/Q = (13+4+3)/(13+4+1+3) = 20/21. If a user specified a --confidence threshold over 16/21, the classifier would adjust the original label from #562 to #561; if the threshold was greater than 20/21, the sequence would become unclassified.
Inspecting a Kraken 2 Database's Contents
The kraken2-inspect script allows users to gain information about the content of a Kraken 2 database. The output format of kraken2-inspect is identical to the reports generated with the --report option to kraken2. Instead of reporting how many reads in input data classified to a given taxon or clade, as kraken2's --report option would, the kraken2-inspect script will report the number of minimizers in the database that are mapped to the various taxa/clades. For example, the first five lines of kraken2-inspect's output on an example database might look like this:
$ kraken2-inspect --db EXAMPLE_DB | head -5
100.00% 1770368409 1581179 R 1 root
96.50% 1708407622 58003 R1 131567 cellular organisms
91.28% 1615910070 985309 D 2 Bacteria
43.89% 777062062 1312736 P 1224 Proteobacteria
18.62% 329590216 555667 C 1236 Gammaproteobacteria
This output indicates that 555667 of the minimizers in the database map directly to the Gammaproteobacteria class (taxid #1236), and 329590216 (18.62%) of the database's minimizers map to a taxon in the clade rooted at Gammaproteobacteria. For more information on kraken2-inspect's options, use its --help option.
Kraken 2 Environment Variables
The kraken2 and kraken2-inspect scripts support the use of some environment variables to help in reducing command line lengths:
KRAKEN2_NUM_THREADS: if the --threads option is not supplied to kraken2, then the value of this variable (if it is set) will be used as the number of threads to run kraken2. (This variable does not affect kraken2-inspect.)
KRAKEN2_DB_PATH: much like the PATH variable is used for executables by your shell, KRAKEN2_DB_PATH is a colon-separated list of directories that will be searched for the database you name if the named database does not have a slash (/) character. By default, Kraken 2 assumes the value of this variable is "." (i.e., the current working directory). This variable can be used to create one (or more) central repositories of Kraken databases in a multi-user system. Example usage in bash:
export KRAKEN2_DB_PATH="/home/user/my_kraken2_dbs:/data/kraken2_dbs:"
This will cause three directories to be searched, in this order:
/home/user/my_kraken2_dbs
/data/kraken2_dbs
the current working directory (caused by the empty string as the third colon-separated field in the KRAKEN2_DB_PATH string)
The search for a database will stop when a name match is found; if two directories in the KRAKEN2_DB_PATH have databases with the same name, the directory of the two that is searched first will have its database selected.
If the above variable and value are used, and the databases /data/kraken2_dbs/mainDB and ./mainDB are present, then
kraken2 --db mainDB sequences.fa
will classify sequences.fa using /data/kraken2_dbs/mainDB; if instead you wanted to use the mainDB present in the current directory, you would need to specify a directory path to that database in order to circumvent searching, e.g.:
kraken2 --db ./mainDB sequences.fa
Note that the KRAKEN2_DB_PATH directory list can be skipped by the use of any absolute (beginning with /) or relative pathname (including at least one /) as the database name.
KRAKEN2_DEFAULT_DB: if no database is supplied with the --db option, the database named in this variable will be used instead. Using this variable, you can avoid using --db if you only have a single database that you usually use, e.g. in bash:
export KRAKEN2_DEFAULT_DB="/home/user/kraken2db"
kraken2 sequences.fa
This will classify sequences.fa using the /home/user/kraken2db database.
Note that the value of KRAKEN2_DEFAULT_DB will also be interpreted in the context of the value of KRAKEN2_DB_PATH if you don't set KRAKEN2_DEFAULT_DB to an absolute or relative pathname. Given the earlier example in this section, the following:
export KRAKEN2_DEFAULT_DB="mainDB"
kraken2 sequences.fa
will use /data/kraken2_dbs/mainDB to classify sequences.fa.
kraken2-2.0.8-beta/docs/MANUAL.markdown 0000664 0000000 0000000 00000106420 13460230711 0017440 0 ustar 00root root 0000000 0000000 Introduction
============
[Kraken] is a taxonomic sequence classifier that assigns taxonomic
labels to DNA sequences. Kraken examines the $k$-mers within
a query sequence and uses the information within those $k$-mers
to query a database. That database maps $k$-mers to the lowest
common ancestor (LCA) of all genomes known to contain a given $k$-mer.
The first version of Kraken used a large indexed and sorted list of
$k$-mer/LCA pairs as its database. While fast, the large memory
requirements posed some problems for users, and so Kraken 2 was
created to provide a solution to those problems.
Kraken 2 differs from Kraken 1 in several important ways:
1. Only minimizers of the $k$-mers in the query sequences are used
as database queries. Similarly, only minimizers of the $k$-mers in
the reference sequences in the database's genomic library are stored
in the database. We will also refer to the minimizers as $\ell$-mers,
where $\ell \leq k$. All $k$-mers are considered to have the same LCA
as their minimizer's database LCA value.
2. Kraken 2 uses a compact hash table that is a probabilistic data
structure. This means that occasionally, database queries will fail
by either returning the wrong LCA, or by not resulting in a search
failure when a queried minimizer was never actually stored in the
database. By incurring the risk of these false positives in the data
structure, Kraken 2 is able to achieve faster speeds and lower memory
requirements. Users should be aware that database false positive
errors occur in less than 1% of queries, and can be compensated for
by use of confidence scoring thresholds.
3. Kraken 2 has the ability to build a database from amino acid
sequences and perform a translated search of the query sequences
against that database.
4. Kraken 2 utilizes spaced seeds in the storage and querying of
minimizers to improve classification accuracy.
5. Kraken 2 provides support for "special" databases that are
not based on NCBI's taxonomy. These are currently limited to
three popular 16S databases.
Because Kraken 2 only stores minimizers in its hash table, and $k$ can be
much larger than $\ell$, only a small percentage
of the possible $\ell$-mers in a genomic library are actually deposited in
the database. This creates a situation similar to the Kraken 1 "MiniKraken"
databases; however, preliminary testing has shown the accuracy of a reduced
Kraken 2 database to be quite similar to the full-sized Kraken 2 database,
while Kraken 1's MiniKraken databases often resulted in a substantial loss
of per-read sensitivity.
The Kraken 2 paper is currently under preparation. Until it is released,
please cite the original [Kraken paper] if you use Kraken 2 in your research.
Thank you!
[Kraken]: http://ccb.jhu.edu/software/kraken/
[Kraken paper]: http://genomebiology.com/2014/15/3/R46
System Requirements
===================
* **Disk space**: Construction of a Kraken 2 standard database requires
approximately 100 GB of disk space. A test on 01 Jan 2018 of the
default installation showed 42 GB of disk space was used to store
the genomic library files, 26 GB was used to store the taxonomy
information from NCBI, and 29 GB was used to store the Kraken 2
compact hash table.
Like in Kraken 1, we strongly suggest against using NFS storage
to store the Kraken 2 database if at all possible.
* **Memory**: To run efficiently, Kraken 2 requires enough free memory
to hold the database (primarily the hash table) in RAM. While this
can be accomplished with a ramdisk, Kraken 2 will by default load
the database into process-local RAM; the `--memory-mapping` switch
to `kraken2` will avoid doing so. The default database size is 29 GB
(as of Jan. 2018), and you will need slightly more than that in
RAM if you want to build the default database.
* **Dependencies**: Kraken 2 currently makes extensive use of Linux
utilities such as sed, find, and wget. Many scripts are written
using the Bash shell, and the main scripts are written using Perl.
Core programs needed to build the database and run the classifier
are written in C++11, and need to be compiled using a somewhat
recent version of g++ that will support C++11. Multithreading is
handled using OpenMP. Downloads of NCBI data are performed by wget
and rsync. Most Linux systems will have all of the above listed
programs and development libraries available either by default or
via package download.
Unlike Kraken 1, Kraken 2 does not use an external $k$-mer counter.
However, by default, Kraken 2 will attempt to use the `dustmasker` or
`segmasker` programs provided as part of NCBI's BLAST suite to mask
low-complexity regions (see [Masking of Low-complexity Sequences]).
**MacOS NOTE:** MacOS and other non-Linux operating systems are *not*
explicitly supported by the developers, and MacOS users should refer to
the Kraken-users group for support in installing the appropriate utilities
to allow for full operation of Kraken 2. We will attempt to use
MacOS-compliant code when possible, but development and testing time
is at a premium and we cannot guarantee that Kraken 2 will install
and work to its full potential on a default installation of MacOS.
In particular, we note that the default MacOS X installation of GCC
does not have support for OpenMP. Without OpenMP, Kraken 2 is
limited to single-threaded operation, resulting in slower build and
classification runtimes.
* **Network connectivity**: Kraken 2's standard database build and download
commands expect unfettered FTP and rsync access to the NCBI FTP
server. If you're working behind a proxy, you may need to set
certain environment variables (such as `ftp_proxy` or `RSYNC_PROXY`)
in order to get these commands to work properly.
Kraken 2's scripts default to using rsync for most downloads; however, you
may find that your network situation prevents use of rsync. In such cases,
you can try the `--use-ftp` option to `kraken2-build` to force the
downloads to occur via FTP.
* **MiniKraken**: At present, users with low-memory computing environments
can replicate the "MiniKraken" functionality of Kraken 1 in two ways:
first, by increasing
the value of $k$ with respect to $\ell$ (using the `--kmer-len` and
`--minimizer-len` options to `kraken2-build`); and secondly, through
downsampling of minimizers (from both the database and query sequences)
using a hash function. This second option is performed if
the `--max-db-size` option to `kraken2-build` is used; however, the two
options are not mutually exclusive.
In a difference from Kraken 1, Kraken 2 does not require building a full
database and then shrinking it to obtain a reduced database.
Installation
============
To begin using Kraken 2, you will first need to install it, and then
either download or create a database.
Kraken 2 consists of two main scripts (`kraken2` and `kraken2-build`),
along with several programs and smaller scripts. As part of the installation
process, all scripts and programs are installed in the same directory.
After installation, you can move the main scripts elsewhere, but moving
the other scripts and programs requires editing the scripts and changing
the `$KRAKEN2_DIR` variables in the main scripts.
Once an install directory is selected, you need to run the following
command in the directory where you extracted the Kraken 2 source:
./install_kraken2.sh $KRAKEN2_DIR
(Replace `$KRAKEN2_DIR` above with the directory where you want to install
Kraken 2's programs/scripts.)
The `install_kraken2.sh` script should compile all of Kraken 2's code
and set up your Kraken 2 program directory. Installation is successful if
you see the message "`Kraken 2 installation complete.`"
Once installation is complete, you may want to copy the main Kraken 2
scripts into a directory found in your `PATH` variable (e.g., "`$HOME/bin`"):
cp $KRAKEN2_DIR/bin/kraken2{,-build,-inspect} $HOME/bin
After installation, you're ready to either create or download a database.
Kraken 2 Databases
==================
A Kraken 2 database is a directory containing at least 3 files:
* `hash.k2d`: Contains the minimizer to taxon mappings
* `opts.k2d`: Contains information about the options used to build the database
* `taxo.k2d`: Contains taxonomy information used to build the database
None of these three files are in a human-readable format. Other files
may also be present as part of the database build process, and can, if
desired, be removed after a successful build of the database.
In interacting with Kraken 2, you should not have to directly reference
any of these files, but rather simply provide the name of the directory
in which they are stored. Kraken 2 allows both the use of a standard
database as well as custom databases; these are described in the
sections [Standard Kraken 2 Database] and [Custom Databases] below,
respectively.
Standard Kraken 2 Database
==========================
To create the standard Kraken 2 database, you can use the following command:
kraken2-build --standard --db $DBNAME
(Replace "`$DBNAME`" above with your preferred database name/location.
Please note that the database will use approximately 100 GB of
disk space during creation, with the majority of that being reference
sequences or taxonomy mapping information that can be removed after the
build.)
This will download NCBI taxonomic information, as well as the
complete genomes in RefSeq for the bacterial, archaeal, and
viral domains, along with the human genome and a collection of
known vectors (UniVec_Core). After downloading all this data, the build
process begins; this can be the most time-consuming step. If you
have multiple processing cores, you can run this process with
multiple threads, e.g.:
kraken2-build --standard --threads 24 --db $DBNAME
Using 32 threads on an AWS EC2 r4.8xlarge instance with 16 dual-core
hyperthreaded 2.30 GHz CPUs and 244 GB of RAM, the build process took
approximately 35 minutes in Jan. 2018.
The build process itself has two main steps, each of which requires passing
over the contents of the reference library:
1. **Estimation** of the capacity needed in the Kraken 2 compact hash table.
This uses a low-memory method to reliably estimate the number of
minimizers present in the reference library given the selected parameters
$k$ and $\ell$.
2. **Population** of the hash table (and conversion of the taxonomy to an
internal format). This step is a second pass over the reference library
to find minimizers and then place them in the database.
(There is one other preliminary step where sequence IDs are mapped to
taxonomy IDs, but this is usually a rather quick process and is mostly handled
during library downloading.)
Unlike Kraken 1's build process, Kraken 2 does not perform checkpointing
after the estimation step. This is because the estimation step is dependent
on the selected $k$ and $\ell$ values, and if the population step fails, it is
likely because $k$ needs to be increased (reducing the overall memory
requirements).
Classification
==============
To classify a set of sequences, use the `kraken2` command:
kraken2 --db $DBNAME seqs.fa
Output will be sent to standard output by default. The files
containing the sequences to be classified should be specified
on the command line. Sequences can also be provided through
standard input using the special filename `/dev/fd/0`.
The `kraken2` program allows several different options:
* **Multithreading**: Use the `--threads NUM` switch to use multiple
threads.
* **Quick operation**: Rather than searching all $\ell$-mers in a sequence,
stop classification after the first database hit; use `--quick`
to enable this mode. Note that `--min-hits` will allow you to
require multiple hits before declaring a sequence classified,
which can be especially useful with custom databases when testing
to see if sequences either do or do not belong to a particular
genome.
* **Sequence filtering**: Classified or unclassified sequences can be
sent to a file for later processing, using the `--classified-out`
and `--unclassified-out` switches, respectively.
* **Output redirection**: Output can be directed using standard shell
redirection (`|` or `>`), or using the `--output` switch.
* **FASTQ input**: Input is normally expected to be in FASTA format, but
you can classify FASTQ data using the `--fastq-input` switch.
* **Compressed input**: Kraken 2 can handle gzip and bzip2 compressed
files as input by specifying the proper switch of `--gzip-compressed`
or `--bzip2-compressed`.
* **Input format auto-detection**: If regular files are specified on
the command line as input, Kraken 2 will attempt to determine the
format of your input prior to classification. You can disable this
by explicitly specifying `--fasta-input`, `--fastq-input`,
`--gzip-compressed`, and/or `--bzip2-compressed` as appropriate.
Note that use of the character device file `/dev/fd/0` to read
from standard input (aka `stdin`) will **not** allow auto-detection.
* **Paired reads**: Kraken 2 provides an enhancement over Kraken 1 in its
handling of paired read data. Rather than needing to concatenate the
pairs together with an `N` character between the reads, Kraken 2 is
able to process the mates individually while still recognizing the
pairing information. Using the `--paired` option to `kraken2` will
indicate to `kraken2` that the input files provided are paired read
data, and data will be read from the pairs of files concurrently.
Usage of `--paired` also affects the `--classified-out` and
`--unclassified-out` options; users should provide a `#` character
in the filenames provided to those options, which will be replaced
by `kraken2` with "`_1`" and "`_2`" with mates spread across the two
files appropriately. For example:
kraken2 --paired --classified-out cseqs#.fq seqs_1.fq seqs_2.fq
will put the first reads from classified pairs in `cseqs_1.fq`, and
the second reads from those pairs in `cseqs_2.fq`.
To get a full list of options, use `kraken2 --help`.
Output Formats
==============
Standard Kraken Output Format
-----------------------------
Each sequence (or sequence pair, in the case of paired reads) classified
by Kraken 2 results in a single line of output. Kraken 2's output lines
contain five tab-delimited fields; from left to right, they are:
1. "C"/"U": a one letter code indicating that the sequence was either
classified or unclassified.
2. The sequence ID, obtained from the FASTA/FASTQ header.
3. The taxonomy ID Kraken 2 used to label the sequence; this is 0 if
the sequence is unclassified.
4. The length of the sequence in bp. In the case of paired read data,
this will be a string containing the lengths of the two sequences in
bp, separated by a pipe character, e.g. "98|94".
5. A space-delimited list indicating the LCA mapping of each $k$-mer in
the sequence(s). For example, "562:13 561:4 A:31 0:1 562:3" would
indicate that:
- the first 13 $k$-mers mapped to taxonomy ID #562
- the next 4 $k$-mers mapped to taxonomy ID #561
- the next 31 $k$-mers contained an ambiguous nucleotide
- the next $k$-mer was not in the database
- the last 3 $k$-mers mapped to taxonomy ID #562
Note that paired read data will contain a "`|:|`" token in this list
to indicate the end of one read and the beginning of another.
When Kraken 2 is run against a protein database (see [Translated Search]),
the LCA hitlist will contain the results of querying all six frames of
each sequence. Reading frame data is separated by a "`-:-`" token.
Kraken 1 offered a `kraken-translate` and `kraken-report` script to change
the output into different formats. Through the use of `kraken2 --use-names`,
Kraken 2 will replace the taxonomy ID column with the scientific name and
the taxonomy ID in parentheses (e.g., "Bacteria (taxid 2)" instead of "2"),
yielding similar functionality to Kraken 1's `kraken-translate` script.
The sample report functionality now exists as part of the `kraken2` script,
with the use of the `--report` option; the sample report formats are
described below.
Sample Report Output Format
---------------------------
Like Kraken 1, Kraken 2 offers two formats of sample-wide results.
Kraken 2's standard sample report format is tab-delimited with one
line per taxon. The fields of the output, from left-to-right, are
as follows:
1. Percentage of fragments covered by the clade rooted at this taxon
2. Number of fragments covered by the clade rooted at this taxon
3. Number of fragments assigned directly to this taxon
4. A rank code, indicating (U)nclassified, (R)oot, (D)omain, (K)ingdom,
(P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies.
Taxa that are not at any of these 10 ranks have a rank code that is
formed by using the rank code of the closest ancestor rank with
a number indicating the distance from that rank. E.g., "G2" is a
rank code indicating a taxon is between genus and species and the
grandparent taxon is at the genus rank.
5. NCBI taxonomic ID number
6. Indented scientific name
The scientific names are indented using spaces, according to the tree
structure specified by the taxonomy.
By default, taxa with no reads assigned to (or under) them will not have
any output produced. However, if you wish to have all taxa displayed, you
can use the `--report-zero-counts` switch to do so. This can be useful if
you are looking to do further downstream analysis of the reports, and want
to compare samples. Sorting by the taxonomy ID (using `sort -k5,5n`) can
provide a consistent line ordering between reports.
In addition, we also provide the option `--use-mpa-style` that can be used
in conjunction with `--report`. This option provides output in a format
similar to MetaPhlAn's output. The output with this option provides one
taxon per line, with a lowercase version of the rank codes in Kraken 2's
standard sample report format (except for 'U' and 'R'), two underscores,
and the scientific name of the taxon (e.g., "d__Viruses"). The full
taxonomy of each taxon (at the eight ranks considered) is given, with each
rank's name separated by a pipe character (e.g., "d__Viruses|o__Caudovirales").
Following this version of the taxon's scientific name is a tab and the
number of fragments assigned to the clade rooted at that taxon.
Translated Search
=================
Kraken 2 allows users to perform a six-frame translated search, similar
to the well-known BLASTX program. To do this, Kraken 2 uses a reduced
15 amino acid alphabet and stores amino acid minimizers in its database.
LCA results from all 6 frames are combined to yield a set of LCA hits,
which is then resolved in the same manner as in Kraken's normal operation.
To build a protein database, the `--protein` option should be given to
`kraken2-build` (either along with `--standard`, or with all steps if
building a custom database).
Custom Databases
================
We realize the standard database may not suit everyone's needs. Kraken 2
also allows creation of customized databases.
To build a custom database:
1. Install a taxonomy. Usually, you will just use the NCBI taxonomy,
which you can easily download using:
kraken2-build --download-taxonomy --db $DBNAME
This will download the accession number to taxon maps, as well as the
taxonomic name and tree information from NCBI. These files can
be found in `$DBNAME/taxonomy/` . If you need to modify the taxonomy,
edits can be made to the `names.dmp` and `nodes.dmp` files in this
directory; you may also need to modify the `*.accession2taxid` files
appropriately.
Some of the standard sets of genomic libraries have taxonomic information
associated with them, and don't need the accession number to taxon maps
to build the database successfully. These libraries include all those
available through the `--download-library` option (see next point), except
for the `plasmid` and non-redundant databases. If you are not using
custom sequences (see the `--add-to-library` option) and are not using
one of the `plasmid` or non-redundant database libraries, you may want to
skip downloading of the accession number to taxon maps. This can be done
by passing `--skip-maps` to the `kraken2-build --download-taxonomy` command.
2. Install one or more reference libraries. Several sets of standard
genomes/proteins are made easily available through `kraken2-build`:
- `archaea`: RefSeq complete archaeal genomes/proteins
- `bacteria`: RefSeq complete bacterial genomes/proteins
- `plasmid`: RefSeq plasmid nucleotide/protein sequences
- `viral`: RefSeq complete viral genomes/proteins
- `human`: GRCh38 human genome/proteins
- `fungi`: RefSeq complete fungal genomes/proteins
- `plant`: RefSeq complete plant genomes/proteins
- `protozoa`: RefSeq complete protozoan genomes/proteins
- `nr`: NCBI non-redundant protein database
- `nt`: NCBI non-redundant nucleotide database
- `env_nr`: NCBI non-redundant protein database with sequences from
large environmental sequencing projects
- `env_nt`: NCBI non-redundant nucleotide database with sequences from
large environmental sequencing projects
- `UniVec`: NCBI-supplied database of vector, adapter, linker, and
primer sequences that may be contaminating sequencing projects and/or
assemblies
- `UniVec_Core`: A subset of UniVec chosen to minimize false positive
hits to the vector database
To download and install any one of these, use the `--download-library`
switch, e.g.:
kraken2-build --download-library bacteria --db $DBNAME
Multiple libraries can be downloaded into a database prior to building
by issuing multiple `kraken2-build --download-library` commands, e.g.:
kraken2-build --download-library archaea --db $DBNAME
kraken2-build --download-library viral --db $DBNAME
The above commands would prepare a database that would contain archaeal
and viral genomes; the `--build` option (see below) will still need to
be used after downloading these libraries to actually build the database,
however.
(Note that downloading `nr` or `env_nr` requires use of the `--protein`
option, and that `UniVec` and `UniVec_Core` are incompatible with
the `--protein` option.)
Other genomes can also be added, but such genomes must meet certain
requirements:
- Sequences must be in a FASTA file (multi-FASTA is allowed)
- Each sequence's ID (the string between the `>` and the first
whitespace character on the header line) must contain either
an NCBI accession number to allow Kraken 2 to lookup the correct taxa,
or an explicit assignment of the taxonomy ID using `kraken:taxid`
(see below).
Sequences not downloaded from NCBI may need their taxonomy information
assigned explicitly. This can be done using the string `kraken:taxid|XXX`
in the sequence ID, with `XXX` replaced by the desired taxon ID. For
example, to put a known adapter sequence in taxon 32630 ("synthetic
construct"), you could use the following:
>sequence16|kraken:taxid|32630 Adapter sequence
CAAGCAGAAGACGGCATACGAGATCTTCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
The `kraken:taxid` string must begin the sequence ID or be immediately
preceded by a pipe character (`|`). Explicit assignment of taxonomy IDs
in this manner will override the accession number mapping provided by NCBI.
If your genomes meet the requirements above, then you can add each
sequence to your database's genomic library using the `--add-to-library`
switch, e.g.:
kraken2-build --add-to-library chr1.fa --db $DBNAME
kraken2-build --add-to-library chr2.fa --db $DBNAME
Note that if you have a list of files to add, you can do something like
this in `bash`:
for file in chr*.fa
do
kraken2-build --add-to-library $file --db $DBNAME
done
Or even add all `*.fa` files found in the directory `genomes`:
`find genomes/ -name '*.fa' -print0 | xargs -0 -I{} -n1 kraken2-build --add-to-library {} --db $DBNAME`
(You may also find the `-P` option to `xargs` useful to add many files in
parallel if you have multiple processors.)
3. Once your library is finalized, you need to build the database. This
can be done with the command:
kraken2-build --build --db $DBNAME
The `--threads` option is also helpful here to reduce build time.
By default, the values of $k$ and $\ell$ are 35 and 31, respectively (or
15 and 12 for protein databases). These values can be explicitly set
with the `--kmer-len` and `--minimizer-len` options, however. Note that
the minimizer length must be no more than 31 for nucleotide databases,
and 15 for protein databases. Additionally, the minimizer length $\ell$
must be no more than the $k$-mer length. There is no upper bound on
the value of $k$, but sequences less than $k$ bp in length cannot be
classified.
Kraken 2 also utilizes a simple spaced seed approach to increase
accuracy. A number $s$ < $\ell$/4 can be chosen, and $s$ positions
in the minimizer will be masked out during all comparisons.
Masked positions are chosen to alternate from the second-to-last
position in the minimizer; e.g., $s$ = 5 and $\ell$ = 31 will result
in masking out the 0 positions shown here:
111 1111 1111 1111 1111 1101 0101 0101
By default, $s$ = 7 for nucleotide databases, and $s$ = 0 for
protein databases. This can be changed using the `--minimizer-spaces`
option along with the `--build` task of `kraken2-build`.
A full list of options for `kraken2-build` can be obtained using
`kraken2-build --help`.
After building a database, if you want to reduce the disk usage of
the database, you can use the `--clean` option for `kraken2-build`
to remove intermediate files from the database directory.
Masking of Low-complexity Sequences
===================================
Low-complexity sequences, e.g. "ACACACACACACACACACACACACAC", are known
to occur in many different organisms and are typically less informative
for use in alignments; the BLAST programs often mask these sequences by
default. Using this masking can help prevent false positives in Kraken 2's
results, and so we have added this functionality as a default option to
Kraken 2's library download/addition process.
Kraken 2 uses two programs to perform low-complexity sequence masking,
both available from NCBI: `dustmasker`, for nucleotide sequences, and
`segmasker`, for amino acid sequences. These programs are available
as part of the NCBI BLAST+ suite. If these programs are not installed
on the local system and in the user's PATH when trying to use
`kraken2-build`, the database build will fail. Users who do not wish to
install these programs can use the `--no-masking` option to `kraken2-build`
in conjunction with any of the `--download-library`, `--add-to-library`, or
`--standard` options; use of the `--no-masking` option will skip masking of
low-complexity sequences during the build of the Kraken 2 database.
Special Databases
=================
To support some common use cases, we provide the ability to build Kraken 2
databases using data from various external databases. These external
databases may not follow the NCBI taxonomy, and so we've provided
mechanisms to automatically create a taxonomy that will work with Kraken 2
(although such taxonomies may not be identical to NCBI's).
To build one of these "special" Kraken 2 databases, use the following command:
kraken2-build --db $DBNAME --special TYPE
where the `TYPE` string is one of the database names listed below.
At present, the "special" Kraken 2 database support we provide is limited
to pre-packaged solutions for some public 16S sequence databases, but this may
grow in the future.
16S Databases
-------------
For targeted 16S sequencing projects, a normal Kraken 2 database using whole
genome data may use more resources than necessary. A Kraken 2 database created
from a well-curated genomic library of just 16S data can provide both a more
efficient solution as well as a more accurate set of predictions for such
projects. We provide support for building Kraken 2 databases from three
publicly available 16S databases:
* [Greengenes] (Kraken 2 database name: `greengenes`), using all available 16S data.
* [RDP] (Kraken 2 database name: `rdp`), using the bacterial and archaeal 16S data.
* [SILVA] (Kraken 2 database name: `silva`), using the Small subunit NR99 sequence set.
Note that these databases may have licensing restrictions regarding their data,
and it is your responsibility to ensure you are in compliance with those
restrictions; please visit the databases' websites for further details. The
`kraken2-build` script only uses publicly available URLs to download data and
then converts that data into a form compatible for use with Kraken 2.
Furthermore, if you use one of these databases in your research, please
visit the corresponding database's website to determine the appropriate and
up-to-date citation.
[Greengenes]: http://greengenes.lbl.gov/
[RDP]: http://rdp.cme.msu.edu/
[SILVA]: http://www.arb-silva.de/
Confidence Scoring
==================
At present, we have not yet developed a confidence score with a
probabilistic interpretation for Kraken 2. However, we have developed a
simple scoring scheme that has yielded good results for us, and we've
made that available in Kraken 2 through use of the `--confidence` option
to `kraken2`. The approach we use allows a user to specify a threshold
score in the [0,1] interval; the classifier then will adjust labels up
the tree until the label's score (described below) meets or exceeds that
threshold. If a label at the root of the taxonomic tree would not have
a score that meets or exceeds the threshold, the sequence is called
unclassified by Kraken 2 when this threshold is applied.
A sequence label's score is a fraction $C$/$Q$, where $C$ is the number of
$k$-mers mapped to LCA values in the clade rooted at the label, and $Q$ is the
number of $k$-mers in the sequence that lack an ambiguous nucleotide (i.e.,
they were queried against the database). Consider the example of the
LCA mappings in Kraken 2's output given earlier:
"562:13 561:4 A:31 0:1 562:3" would indicate that:
* the first 13 $k$-mers mapped to taxonomy ID #562
* the next 4 $k$-mers mapped to taxonomy ID #561
* the next 31 $k$-mers contained an ambiguous nucleotide
* the next $k$-mer was not in the database
* the last 3 $k$-mers mapped to taxonomy ID #562
In this case, ID #561 is the parent node of #562. Here, a label of #562
for this sequence would have a score of $C$/$Q$ = (13+3)/(13+4+1+3) = 16/21.
A label of #561 would have a score of $C$/$Q$ = (13+4+3)/(13+4+1+3) = 20/21.
If a user specified a `--confidence` threshold over 16/21, the classifier
would adjust the original label from #562 to #561; if the threshold was
greater than 20/21, the sequence would become unclassified.
Inspecting a Kraken 2 Database's Contents
=========================================
The `kraken2-inspect` script allows users to gain information about the content
of a Kraken 2 database. The output format of `kraken2-inspect`
is identical to the reports generated with the `--report` option to `kraken2`.
Instead of reporting how many reads in input data classified to a given taxon
or clade, as `kraken2`'s `--report` option would, the `kraken2-inspect` script
will report the number of minimizers in the database that are mapped to the
various taxa/clades. For example, the first five lines of `kraken2-inspect`'s
output on an example database might look like this:
$ kraken2-inspect --db EXAMPLE_DB | head -5
100.00% 1770368409 1581179 R 1 root
96.50% 1708407622 58003 R1 131567 cellular organisms
91.28% 1615910070 985309 D 2 Bacteria
43.89% 777062062 1312736 P 1224 Proteobacteria
18.62% 329590216 555667 C 1236 Gammaproteobacteria
This output indicates that 555667 of the minimizers in the database map
directly to the Gammaproteobacteria class (taxid #1236), and 329590216 (18.62%)
of the database's minimizers map to a taxon in the clade rooted at
Gammaproteobacteria. For more information on `kraken2-inspect`'s options,
use its `--help` option.
Kraken 2 Environment Variables
==============================
The `kraken2` and `kraken2-inspect` scripts support the use of some
environment variables to help in reducing command line lengths:
* **`KRAKEN2_NUM_THREADS`**: if the
`--threads` option is not supplied to `kraken2`, then the value of this
variable (if it is set) will be used as the number of threads to run
`kraken2`. (This variable does not affect `kraken2-inspect`.)
* **`KRAKEN2_DB_PATH`**: much like the `PATH` variable is used for executables
by your shell, `KRAKEN2_DB_PATH` is a colon-separated list of directories
that will be searched for the database you name if the named database
does not have a slash (`/`) character. By default, Kraken 2 assumes the
value of this variable is "`.`" (i.e., the current working directory).
This variable can be used to create one (or more) central repositories
of Kraken databases in a multi-user system. Example usage in bash:
export KRAKEN2_DB_PATH="/home/user/my_kraken2_dbs:/data/kraken2_dbs:"
This will cause three directories to be searched, in this order:
1) `/home/user/my_kraken2_dbs`
2) `/data/kraken2_dbs`
3) the current working directory (caused by the empty string as
the third colon-separated field in the `KRAKEN2_DB_PATH` string)
The search for a database will stop when a name match is found; if
two directories in the `KRAKEN2_DB_PATH` have databases with the same
name, the directory of the two that is searched first will have its
database selected.
If the above variable and value are used, and the databases
`/data/kraken2_dbs/mainDB` and `./mainDB` are present, then
kraken2 --db mainDB sequences.fa
will classify `sequences.fa` using `/data/kraken2_dbs/mainDB`; if instead
you wanted to use the `mainDB` present in the current directory,
you would need to specify a directory path to that database in order
to circumvent searching, e.g.:
kraken2 --db ./mainDB sequences.fa
Note that the `KRAKEN2_DB_PATH` directory list can be skipped by the use
of any absolute (beginning with `/`) or relative pathname (including
at least one `/`) as the database name.
* **`KRAKEN2_DEFAULT_DB`**: if no database is supplied with the `--db` option,
the database named in this variable will be used instead. Using this
variable, you can avoid using `--db` if you only have a single database
that you usually use, e.g. in bash:
export KRAKEN2_DEFAULT_DB="/home/user/kraken2db"
kraken2 sequences.fa > kraken2.output
This will classify `sequences.fa` using the `/home/user/kraken2db`
database.
Note that the value of `KRAKEN2_DEFAULT_DB` will also be interpreted in
the context of the value of `KRAKEN2_DB_PATH` if you don't set
`KRAKEN2_DEFAULT_DB` to an absolute or relative pathname. Given the earlier
example in this section, the following:
export KRAKEN2_DEFAULT_DB="mainDB"
kraken2 sequences.fa
will use `/data/kraken2_dbs/mainDB` to classify `sequences.fa`.
kraken2-2.0.8-beta/docs/Makefile 0000664 0000000 0000000 00000000427 13460230711 0016357 0 ustar 00root root 0000000 0000000 all:
pandoc --title-prefix "Kraken 2 Manual" \
--include-in-header head.html \
--include-before-body top.html \
--from markdown --to html \
--table-of-contents \
--css kraken.css \
--output MANUAL.html \
< MANUAL.markdown
kraken2-2.0.8-beta/docs/bar-bg.png 0000664 0000000 0000000 00000000274 13460230711 0016557 0 ustar 00root root 0000000 0000000 PNG
IHDR pHYs tIME # %z tEXtComment Created with GIMPW 6IDATHcd???#||b QG-xQG-xb % " IENDB` kraken2-2.0.8-beta/docs/head.html 0000664 0000000 0000000 00000000157 13460230711 0016506 0 ustar 00root root 0000000 0000000
kraken2-2.0.8-beta/docs/kraken.css 0000664 0000000 0000000 00000001327 13460230711 0016704 0 ustar 00root root 0000000 0000000 body {
background: #f0f0f0 url("bar-bg.png") top left repeat-y;
color: #333;
margin: 10px;
margin-left: 50px;
margin-bottom: 20px;
font-family: 'Ubuntu', sans-serif;
}
/* Link colours for each interaction state. */
a { color: #00b0f0; }
a:visited { color: #0090f0; }
a:hover { color: #00d0ff; }
a:active { color: #00f0ff; }
/* Centered block with slightly enlarged text. */
.pretoc {
text-align: center;
font-size: 1.2em;
}
/* Large bold title with no gap below it. */
.title {
font-size: 2em;
font-weight: bold;
margin-bottom: 0;
}
/* Version line, slightly smaller than body text. */
.version {
font-size: 0.9em;
}
/* Top-level headings: coloured, underlined, and pulled 10px to the left. */
h1 {
color: #0090f0;
border-bottom: 1px #0090f0 solid;
margin-left: -10px;
margin-bottom: 3px;
}
/* Links inside headings keep the heading colour and lose their underline. */
h1 a {
color: #0090f0;
text-decoration: none;
}
/* Fixed header-cell width for tables inside the #confidence-score-table div. */
div#confidence-score-table table th {
width: 7em;
}
/* Indent preformatted blocks. */
pre {
margin-left: 4em;
}
/* Render inline code slightly larger than the surrounding text. */
code {
font-size: 1.2em;
}
kraken2-2.0.8-beta/docs/top.html 0000664 0000000 0000000 00000000307 13460230711 0016404 0 ustar 00root root 0000000 0000000
Kraken taxonomic sequence classification system
Version 2.0.8-beta
Operating Manual
Table of Contents
kraken2-2.0.8-beta/install_kraken2.sh 0000775 0000000 0000000 00000002366 13460230711 0017415 0 ustar 00root root 0000000 0000000 #!/bin/bash
# Copyright 2013-2019, Derrick Wood
#
# This file is part of the Kraken 2 taxonomic sequence classification system.
# Installs Kraken 2 into the directory given as the only argument:
# builds and installs the binaries via "make -C src install", then copies
# every file in scripts/ into that directory while substituting the
# #####=KRAKEN2_DIR=##### and #####=VERSION=##### placeholders.

set -e

VERSION="2.0.8-beta"

# Exactly one argument (the install directory) is accepted.
if [ -z "$1" ] || [ -n "$2" ]
then
  echo "Usage: $(basename "$0") KRAKEN2_DIR"
  exit 64
fi

if [ "$1" = "KRAKEN2_DIR" ]
then
  echo "Please replace \"KRAKEN2_DIR\" with the name of the directory"
  echo "that you want to install Kraken 2 in."
  exit 1
fi

# Perl cmd used to canonicalize dirname - "readlink -f" doesn't work
# on OS X.
# Assignment and export are separate statements so that a failure of the
# perl command is not masked by the exit status of "export" under set -e.
KRAKEN2_DIR=$(perl -MCwd=abs_path -le 'print abs_path(shift)' "$1")
export KRAKEN2_DIR

mkdir -p "$KRAKEN2_DIR"
make -C src install

# Copy each script, replacing #####=NAME=##### with the value supplied for
# NAME on the perl command line; preserve the executable bit.
for file in scripts/*
do
  dest="$KRAKEN2_DIR/$(basename "$file")"
  perl -pl -e 'BEGIN { while (@ARGV) { $_ = shift; ($k,$v) = split /=/, $_, 2; $H{$k} = $v } }'\
    -e 's/#####=(\w+)=#####/$H{$1}/g' \
    "KRAKEN2_DIR=$KRAKEN2_DIR" "VERSION=$VERSION" \
    < "$file" > "$dest"
  if [ -x "$file" ]
  then
    chmod +x "$dest"
  fi
done

echo
echo "Kraken 2 installation complete."
echo
echo "To make things easier for you, you may want to copy/symlink the following"
echo "files into a directory in your PATH:"
# The directory is quoted so that install paths containing whitespace are
# globbed as a single path.
for file in "$KRAKEN2_DIR"/kraken2*
do
  if [ -x "$file" ]
  then
    echo "  $file"
  fi
done
kraken2-2.0.8-beta/scripts/ 0000775 0000000 0000000 00000000000 13460230711 0015453 5 ustar 00root root 0000000 0000000 kraken2-2.0.8-beta/scripts/16S_gg_installation.sh 0000775 0000000 0000000 00000001677 13460230711 0021634 0 ustar 00root root 0000000 0000000 #!/bin/bash
# Copyright 2013-2019, Derrick Wood
#
# This file is part of the Kraken 2 taxonomic sequence classification system.
# Build a 16S database from Greengenes data
#
# Reads KRAKEN2_DB_NAME (database directory) and KRAKEN2_THREAD_CT from the
# environment. Downloads the Greengenes FASTA and taxonomy files, converts
# the taxonomy with build_gg_taxonomy.pl, arranges the results into the
# taxonomy/ and library/ layout, then runs "kraken2-build --build".

set -u  # Protect against uninitialized vars.
set -e  # Stop on error
set -o pipefail  # Stop on failures in non-final pipeline commands

FTP_SERVER="ftp://greengenes.microbio.me/"
GG_VERSION="gg_13_5"
REMOTE_DIR="$FTP_SERVER/greengenes_release/$GG_VERSION"

mkdir -p "$KRAKEN2_DB_NAME"
pushd "$KRAKEN2_DB_NAME"
  mkdir -p data taxonomy library
  pushd data
    wget "$REMOTE_DIR/${GG_VERSION}.fasta.gz"
    gunzip "${GG_VERSION}.fasta.gz"
    wget "$REMOTE_DIR/${GG_VERSION}_taxonomy.txt.gz"
    gunzip "${GG_VERSION}_taxonomy.txt.gz"

    # Writes names.dmp, nodes.dmp and seqid2taxid.map into data/.
    build_gg_taxonomy.pl "${GG_VERSION}_taxonomy.txt"
  popd
  mv data/names.dmp data/nodes.dmp taxonomy/
  mv data/seqid2taxid.map .
  mv "data/${GG_VERSION}.fasta" library/gg.fna
popd

# Quoted like every other use above, so database paths containing
# whitespace are passed as a single argument.
kraken2-build --db "$KRAKEN2_DB_NAME" --build --threads "$KRAKEN2_THREAD_CT"
kraken2-2.0.8-beta/scripts/16S_rdp_installation.sh 0000775 0000000 0000000 00000001674 13460230711 0022021 0 ustar 00root root 0000000 0000000 #!/bin/bash
# Copyright 2013-2019, Derrick Wood
#
# This file is part of the Kraken 2 taxonomic sequence classification system.
# Build a 16S database from RDP data
#
# Reads KRAKEN2_DB_NAME (database directory) and KRAKEN2_THREAD_CT from the
# environment. Downloads the bacterial and archaeal unaligned FASTA files,
# derives the taxonomy from their headers with build_rdp_taxonomy.pl,
# arranges the results into the taxonomy/ and library/ layout, then runs
# "kraken2-build --build".

set -u  # Protect against uninitialized vars.
set -e  # Stop on error
set -o pipefail  # Stop on failures in non-final pipeline commands

HTTP_SERVER="http://rdp.cme.msu.edu/"
REMOTE_DIR="$HTTP_SERVER/download/"

mkdir -p "$KRAKEN2_DB_NAME"
pushd "$KRAKEN2_DB_NAME"
  mkdir -p data taxonomy library
  pushd data
    wget "$REMOTE_DIR/current_Bacteria_unaligned.fa.gz"
    gunzip "current_Bacteria_unaligned.fa.gz"
    wget "$REMOTE_DIR/current_Archaea_unaligned.fa.gz"
    gunzip "current_Archaea_unaligned.fa.gz"

    # Writes names.dmp, nodes.dmp and seqid2taxid.map into data/.
    build_rdp_taxonomy.pl current_*_unaligned.fa
  popd
  mv data/names.dmp data/nodes.dmp taxonomy/
  mv data/seqid2taxid.map .
  # Move each FASTA file into library/ with a .fna extension.
  for file in data/*.fa; do
    mv "$file" "library/$(basename "$file" .fa).fna"
  done
popd

# Quoted like the uses above, so database paths containing whitespace are
# passed as a single argument.
kraken2-build --db "$KRAKEN2_DB_NAME" --build --threads "$KRAKEN2_THREAD_CT"
kraken2-2.0.8-beta/scripts/16S_silva_installation.sh 0000775 0000000 0000000 00000002115 13460230711 0022341 0 ustar 00root root 0000000 0000000 #!/bin/bash
# Copyright 2013-2019, Derrick Wood
#
# This file is part of the Kraken 2 taxonomic sequence classification system.
# Build a 16S database from Silva data
#
# Reads KRAKEN2_DB_NAME (database directory) and KRAKEN2_THREAD_CT from the
# environment. Downloads the SSURef Nr99 FASTA file plus the matching
# taxonomy files, converts the taxonomy with build_silva_taxonomy.pl,
# arranges the results into the taxonomy/ and library/ layout, then runs
# "kraken2-build --build".

set -u  # Protect against uninitialized vars.
set -e  # Stop on error
set -o pipefail  # Stop on failures in non-final pipeline commands

FTP_SERVER="ftp://ftp.arb-silva.de/"
SILVA_VERSION="132"
REMOTE_DIR="$FTP_SERVER/release_$SILVA_VERSION/Exports"
FASTA_FILENAME="SILVA_${SILVA_VERSION}_SSURef_Nr99_tax_silva.fasta"
TAXO_PREFIX="tax_slv_ssu_$SILVA_VERSION"

mkdir -p "$KRAKEN2_DB_NAME"
pushd "$KRAKEN2_DB_NAME"
  mkdir -p data taxonomy library
  pushd data
    wget "$REMOTE_DIR/${FASTA_FILENAME}.gz"
    gunzip "${FASTA_FILENAME}.gz"
    wget "$REMOTE_DIR/taxonomy/${TAXO_PREFIX}.acc_taxid"
    wget "$REMOTE_DIR/taxonomy/${TAXO_PREFIX}.txt"

    # Writes names.dmp and nodes.dmp into data/.
    build_silva_taxonomy.pl "${TAXO_PREFIX}.txt"
  popd
  mv data/names.dmp data/nodes.dmp taxonomy/
  # The downloaded accession/taxid table is used directly as the
  # sequence ID to taxonomy ID map.
  mv "data/${TAXO_PREFIX}.acc_taxid" seqid2taxid.map
  # On sequence lines (those not starting with ">"), translate U to T.
  sed -e '/^>/!y/U/T/' "data/$FASTA_FILENAME" > library/silva.fna
popd

# Quoted like the uses above, so database paths containing whitespace are
# passed as a single argument.
kraken2-build --db "$KRAKEN2_DB_NAME" --build --threads "$KRAKEN2_THREAD_CT"
kraken2-2.0.8-beta/scripts/add_to_library.sh 0000775 0000000 0000000 00000001663 13460230711 0020776 0 ustar 00root root 0000000 0000000 #!/bin/bash
# Copyright 2013-2019, Derrick Wood
#
# This file is part of the Kraken 2 taxonomic sequence classification system.
# Copy specified file into a Kraken library
#
# Usage: add_to_library.sh FASTA_FILE
# Reads KRAKEN2_DB_NAME and KRAKEN2_MASK_LC from the environment. The file
# is copied under a unique name into $KRAKEN2_DB_NAME/library/added, next to
# a preliminary map produced by scan_fasta_file.pl. When KRAKEN2_MASK_LC is
# non-empty the copy is passed to mask_low_complexity.sh.

set -u  # Protect against uninitialized vars.
set -e  # Stop on error

LIBRARY_DIR="$KRAKEN2_DB_NAME/library"

if [ ! -e "$1" ]
then
  1>&2 echo "Can't add \"$1\": file does not exist"
  exit 1
fi
if [ ! -f "$1" ]
then
  1>&2 echo "Can't add \"$1\": not a regular file"
  exit 1
fi

add_dir="$LIBRARY_DIR/added"
mkdir -p "$add_dir"

# cp_into_tempfile.pl prints the name of the file it created; copying
# /dev/null yields an empty, uniquely named map file to write into.
prelim_map=$(cp_into_tempfile.pl -t "prelim_map_XXXXXXXXXX" -d "$add_dir" -s txt /dev/null)
scan_fasta_file.pl "$1" > "$prelim_map"

filename=$(cp_into_tempfile.pl -t "XXXXXXXXXX" -d "$add_dir" -s fna "$1")

if [ -n "$KRAKEN2_MASK_LC" ]; then
  1>&2 echo -n "Masking low-complexity regions of new file..."
  # Quoted: the new file lives under the database directory, whose path may
  # contain whitespace.
  mask_low_complexity.sh "$filename"
  1>&2 echo " done."
fi

1>&2 echo "Added \"$1\" to library ($KRAKEN2_DB_NAME)"
kraken2-2.0.8-beta/scripts/build_gg_taxonomy.pl 0000775 0000000 0000000 00000004606 13460230711 0021533 0 ustar 00root root 0000000 0000000 #!/usr/bin/env perl
# Copyright 2013-2019, Derrick Wood
#
# This file is part of the Kraken 2 taxonomic sequence classification system.
# Parses Greengenes taxonomy file to create Kraken taxonomy
# and sequence ID -> taxonomy ID mapping
# Input (as <>): gg_13_5_taxonomy.txt
# Output (in the current directory): names.dmp and nodes.dmp in NCBI
# dump format, and seqid2taxid.map (tab-separated seqid/taxid pairs)

use strict;
use warnings;
use File::Basename;

my $PROG = basename $0;

# Single-letter Greengenes level prefixes and the rank names written
# to nodes.dmp for them
my %RANK_CODES = (
  k => "superkingdom",
  p => "phylum",
  c => "class",
  o => "order",
  f => "family",
  g => "genus",
  s => "species"
);

my %seqid_map;                    # sequence ID -> trimmed lineage string
my %seen_it;                      # lineage strings already recorded
my %child_data = ("root" => {});  # lineage string -> { child lineage => count }

LINE: while (<>) {
  chomp;
  my ($seqid, $taxo_str) = split /\t/;
  # Blank or tab-less lines have no lineage; recording them would put an
  # undefined taxonomy ID into seqid2taxid.map.
  unless (defined $seqid && defined $taxo_str) {
    warn "$PROG: skipping malformed input, line $.\n";
    next LINE;
  }
  $taxo_str =~ s/(; [a-z]__)+$//;  # Remove empty data
  $seqid_map{$seqid} = $taxo_str;
  next LINE if $seen_it{$taxo_str}++;
  # Strip one level at a time from the end of the lineage, registering each
  # longer string as a child of the shorter one; stop as soon as an
  # ancestor has already been recorded.
  while ($taxo_str =~ s/(; [a-z]__[^;]+$)//) {
    my $level = $1;
    my $parent = $taxo_str;
    $child_data{$parent} ||= {};
    $child_data{$parent}->{"$taxo_str$level"}++;
    next LINE if $seen_it{$taxo_str}++;
  }
  # What remains is a single top-level entry; hang it off the root.
  $child_data{"root"}->{$taxo_str}++;
}

# Assign IDs through BFS of tree, report names/nodes info in
# NCBI format
my %id_map;
my $next_node_id = 1;
open my $names_fh, ">", "names.dmp"
  or die "$PROG: can't write names.dmp: $!\n";
open my $nodes_fh, ">", "nodes.dmp"
  or die "$PROG: can't write nodes.dmp: $!\n";
# Queue entries are [lineage string, parent ID]; the root is its own parent.
my @bfs_queue = (["root", 1]);
while (@bfs_queue) {
  my $arg_ref = shift @bfs_queue;
  my ($node, $parent_id) = @$arg_ref;
  my $display_name = $node;
  my $rank;
  # special handling for species
  if ($node =~ /g__([^;]+); s__([^;]+)$/) {
    my ($genus, $species) = ($1, $2);
    $rank = "species";
    if ($species =~ / endosymbiont /) {
      $display_name = $species;
    }
    else {
      $display_name = "$genus $species";
    }
  }
  elsif ($node =~ /([a-z])__([^;]+)$/) {
    $rank = $RANK_CODES{$1};
    $display_name = $2;
  }
  $rank ||= "no rank";
  my $node_id = $next_node_id++;
  $id_map{$node} = $node_id;
  print {$names_fh} "$node_id\t|\t$display_name\t|\t-\t|\tscientific name\t|\n";
  print {$nodes_fh} "$node_id\t|\t$parent_id\t|\t$rank\t|\t-\t|\n";
  # Sorted so that ID assignment is reproducible between runs.
  my @children = sort keys %{ $child_data{$node} };
  push @bfs_queue, [$_, $node_id] for @children;
}
# Buffered write errors (e.g. a full disk) only surface at close.
close $names_fh or die "$PROG: error writing names.dmp: $!\n";
close $nodes_fh or die "$PROG: error writing nodes.dmp: $!\n";

open my $map_fh, ">", "seqid2taxid.map"
  or die "$PROG: can't write seqid2taxid.map: $!\n";
for my $seqid (sort { $a <=> $b } keys %seqid_map) {
  my $taxid = $id_map{ $seqid_map{$seqid} };
  print {$map_fh} "$seqid\t$taxid\n";
}
close $map_fh or die "$PROG: error writing seqid2taxid.map: $!\n";
kraken2-2.0.8-beta/scripts/build_kraken2_db.sh 0000775 0000000 0000000 00000010311 13460230711 0021167 0 ustar 00root root 0000000 0000000 #!/bin/bash
# Copyright 2013-2019, Derrick Wood
#
# This file is part of the Kraken 2 taxonomic sequence classification system.
# Build a Kraken 2 database
# Designed to be called by kraken2-build
set -u # Protect against uninitialized vars.
set -e # Stop on error
set -o pipefail # Stop on failures in non-final pipeline commands
# Promote a completed temporary file to its final name.
#   $1 - final filename; the data is expected in "$1.tmp"
# Arguments are quoted so that names containing whitespace or glob
# characters are moved as a single path.
function finalize_file() {
  mv "$1.tmp" "$1"
}
# Print the current time using date's "%s.%N" format
# (seconds, a dot, then the %N field).
function get_current_time() {
  date '+%s.%N'
}
# Print the time elapsed since a start timestamp, formatted as
# [<h>h][<m>m]<s.mmm>s (hours and minutes are omitted while zero).
#   $1 - start time as printed by get_current_time
# curr_time is declared local so it does not leak into the caller's scope,
# and both timestamps are quoted when handed to perl.
function report_time_elapsed() {
  local curr_time
  curr_time=$(get_current_time)
  perl -e '$time = $ARGV[1] - $ARGV[0];' \
       -e '$sec = int($time); $nsec = $time - $sec;' \
       -e '$min = int($sec/60); $sec %= 60;' \
       -e '$hr = int($min/60); $min %= 60;' \
       -e 'print "${hr}h" if $hr;' \
       -e 'print "${min}m" if $min || $hr;' \
       -e 'printf "%.3fs", $sec + $nsec;' \
       "$1" "$curr_time"
}
# Emit every *.fna and *.faa path found under library/ as a NUL-delimited
# list, ready to be consumed by "xargs -0".
function list_sequence_files() {
  find library/ \( -name '*.fna' -o -name '*.faa' \) -print0
}
# Main build sequence: (1) create the sequence ID -> taxonomy ID map,
# (2) estimate the required hash table capacity, (3) build the database
# files. Steps 1 and 3 are skipped when their output file already exists.

# Start timestamp for the total elapsed time reported on the last line.
start_time=$(get_current_time)
DATABASE_DIR="$KRAKEN2_DB_NAME"
if [ ! -d "$DATABASE_DIR" ]
then
echo "Can't find Kraken 2 DB directory \"$KRAKEN2_DB_NAME\""
exit 1
fi
# Every path below is relative to the database directory.
cd "$DATABASE_DIR"
if [ ! -d "taxonomy/" ]
then
echo "Can't find taxonomy/ subdirectory in database directory, exiting."
exit 1
fi
if [ ! -d "library/" ]
then
echo "Can't find library/ subdirectory in database directory, exiting."
exit 1
fi
# "-X" is passed to estimate_capacity and build_db when KRAKEN2_PROTEIN_DB
# is non-empty.
KRAKEN2XFLAG=""
if [ -n "$KRAKEN2_PROTEIN_DB" ]
then
KRAKEN2XFLAG="-X"
fi
echo "Creating sequence ID to taxonomy ID map (step 1)..."
# Merge the per-file prelim_map_*.txt files under library/added/ into one
# prelim_map.txt so the depth-limited find below picks it up.
if [ -d "library/added" ]; then
find library/added/ -name 'prelim_map_*.txt' | xargs cat > library/added/prelim_map.txt
fi
seqid2taxid_map_file=seqid2taxid.map
if [ -e "$seqid2taxid_map_file" ]; then
echo "Sequence ID to taxonomy ID map already present, skipping map creation."
else
step_time=$(get_current_time)
# Concatenate every library's prelim_map.txt into taxonomy/prelim_map.txt.
find library/ -maxdepth 2 -name prelim_map.txt | xargs cat > taxonomy/prelim_map.txt
if [ ! -s "taxonomy/prelim_map.txt" ]; then
echo "No preliminary seqid/taxid mapping files found, aborting."
exit 1
fi
# Lines starting with TAXID: fields 2 onward go straight into the map.
# "|| true" keeps "set -e" from aborting when grep matches nothing.
grep "^TAXID" taxonomy/prelim_map.txt | cut -f 2- > $seqid2taxid_map_file.tmp || true
# Lines starting with ACCNUM need a lookup against the accession2taxid
# files. With pipefail the condition is false when grep matches nothing,
# so the lookup is skipped in that case.
if grep "^ACCNUM" taxonomy/prelim_map.txt | cut -f 2- > accmap_file.tmp; then
if compgen -G "taxonomy/*.accession2taxid" > /dev/null; then
lookup_accession_numbers accmap_file.tmp taxonomy/*.accession2taxid > seqid2taxid_acc.tmp
cat seqid2taxid_acc.tmp >> $seqid2taxid_map_file.tmp
rm seqid2taxid_acc.tmp
else
echo "Accession to taxid map files are required to build this DB."
echo "Run 'kraken2-build --db $KRAKEN2_DB_NAME --download-taxonomy' again?"
exit 1
fi
fi
rm -f accmap_file.tmp
# The map only appears under its final name once it is complete.
finalize_file $seqid2taxid_map_file
echo "Sequence ID to taxonomy ID map complete. [$(report_time_elapsed $step_time)]"
fi
echo "Estimating required capacity (step 2)..."
step_time=$(get_current_time)
# All library sequence files are streamed through estimate_capacity.
estimate=$(list_sequence_files | xargs -0 cat | estimate_capacity -k $KRAKEN2_KMER_LEN -l $KRAKEN2_MINIMIZER_LEN -S $KRAKEN2_SEED_TEMPLATE -p $KRAKEN2_THREAD_CT $KRAKEN2XFLAG )
# Capacity is the estimate divided by 0.7, truncated to an integer.
required_capacity=$(perl -le 'print int(shift() / 0.7)' $estimate);
# The size in bytes is reported as capacity * 4.
echo "Estimated hash table requirement: $(( required_capacity * 4 )) bytes"
max_db_flag=""
if [ -n "$KRAKEN2_MAX_DB_SIZE" ]
then
# Only cap the table when the requested maximum is below the requirement;
# the byte limit is divided by 4 before being passed as -M.
if (( KRAKEN2_MAX_DB_SIZE < (required_capacity * 4) ))
then
max_db_flag="-M $(perl -le 'print int(shift() / 4)' $KRAKEN2_MAX_DB_SIZE)"
echo "Specifying lower maximum hash table size of $KRAKEN2_MAX_DB_SIZE bytes"
fi
fi
echo "Capacity estimation complete. [$(report_time_elapsed $step_time)]"
echo "Building database files (step 3)..."
if [ -e "hash.k2d" ]
then
echo "Hash table already present, skipping database file build."
else
step_time=$(get_current_time)
# build_db writes to *.tmp names; they are renamed once it has succeeded.
list_sequence_files | xargs -0 cat | \
build_db -k $KRAKEN2_KMER_LEN -l $KRAKEN2_MINIMIZER_LEN -S $KRAKEN2_SEED_TEMPLATE $KRAKEN2XFLAG \
-H hash.k2d.tmp -t taxo.k2d.tmp -o opts.k2d.tmp -n taxonomy/ -m $seqid2taxid_map_file \
-c $required_capacity -p $KRAKEN2_THREAD_CT $max_db_flag
# hash.k2d is renamed last: its presence is what the check above uses to
# decide that this step can be skipped.
finalize_file taxo.k2d
finalize_file opts.k2d
finalize_file hash.k2d
echo "Database files completed. [$(report_time_elapsed $step_time)]"
fi
echo "Database construction complete. [Total: $(report_time_elapsed $start_time)]"
kraken2-2.0.8-beta/scripts/build_rdp_taxonomy.pl 0000775 0000000 0000000 00000004217 13460230711 0021721 0 ustar 00root root 0000000 0000000 #!/usr/bin/env perl
# Copyright 2013-2019, Derrick Wood
#
# This file is part of the Kraken 2 taxonomic sequence classification system.
# Parses RDP sequence data to create Kraken taxonomy
# and sequence ID -> taxonomy ID mapping
# Input (as <>): current_{Archaea,Bacteria}_unaligned.fa
# Output (in the current directory): names.dmp and nodes.dmp in NCBI
# dump format, and seqid2taxid.map (tab-separated seqid/taxid pairs)

use strict;
use warnings;
use File::Basename;

my $PROG = basename $0;

my %seqid_map;   # sequence ID -> normalized lineage string
my %seen_it;     # lineage strings already recorded
# lineage string -> { child lineage => count }; lineages are sequences of
# "name;rank" pairs joined by ";"
my %child_data = ("root;no rank" => {});

LINE: while (<>) {
  next LINE unless s/^>//;   # only FASTA header lines carry a lineage
  chomp;
  my ($seq_label, $taxo_str) = split /\t/;
  # A header without a tab-separated lineage field cannot be placed in the
  # tree; recording it would put an undefined taxonomy ID into the map.
  unless (defined $seq_label && defined $taxo_str) {
    warn "$PROG: skipping header without lineage, line $.\n";
    next LINE;
  }
  my ($seqid) = split " ", $seq_label;
  unless (defined $seqid) {
    warn "$PROG: skipping header without sequence ID, line $.\n";
    next LINE;
  }
  $taxo_str =~ s/^Lineage=Root;rootrank;/root;no rank;/;
  $taxo_str =~ s/;$/;no rank/;  # adjust for unclassified things
  $seqid_map{$seqid} = $taxo_str;
  next LINE if $seen_it{$taxo_str}++;
  # Strip one ";name;rank" pair at a time from the end, registering each
  # longer lineage as a child of the shorter one; stop as soon as an
  # ancestor has already been recorded.
  while ($taxo_str =~ s/(;[^;]+;[^;]+)$//) {
    my $level = $1;
    my $parent = $taxo_str;
    $child_data{$parent} ||= {};
    $child_data{$parent}->{"$taxo_str$level"}++;
    next LINE if $seen_it{$taxo_str}++;
  }
}

# Assign IDs through BFS of tree, report names/nodes info in
# NCBI format
my %id_map;
my $next_node_id = 1;
open my $names_fh, ">", "names.dmp"
  or die "$PROG: can't write names.dmp: $!\n";
open my $nodes_fh, ">", "nodes.dmp"
  or die "$PROG: can't write nodes.dmp: $!\n";
# Queue entries are [lineage string, parent ID]; the root is its own parent.
my @bfs_queue = (["root;no rank", 1]);
while (@bfs_queue) {
  my $arg_ref = shift @bfs_queue;
  my ($node, $parent_id) = @$arg_ref;
  # The last "name;rank" pair of the lineage describes the node itself.
  if ($node !~ /([^;]+);([^;]+)$/) {
    die "$PROG: BFS processing encountered formatting error, \"$node\"\n";
  }
  my ($display_name, $rank) = ($1, $2);
  $rank = "superkingdom" if $rank eq "domain";  # conform w/ NCBI taxonomy
  my $node_id = $next_node_id++;
  $id_map{$node} = $node_id;
  print {$names_fh} "$node_id\t|\t$display_name\t|\t-\t|\tscientific name\t|\n";
  print {$nodes_fh} "$node_id\t|\t$parent_id\t|\t$rank\t|\t-\t|\n";
  # Sorted so that ID assignment is reproducible between runs.
  my @children = sort keys %{ $child_data{$node} };
  push @bfs_queue, [$_, $node_id] for @children;
}
# Buffered write errors (e.g. a full disk) only surface at close.
close $names_fh or die "$PROG: error writing names.dmp: $!\n";
close $nodes_fh or die "$PROG: error writing nodes.dmp: $!\n";

open my $map_fh, ">", "seqid2taxid.map"
  or die "$PROG: can't write seqid2taxid.map: $!\n";
for my $seqid (sort keys %seqid_map) {
  my $taxid = $id_map{ $seqid_map{$seqid} };
  print {$map_fh} "$seqid\t$taxid\n";
}
close $map_fh or die "$PROG: error writing seqid2taxid.map: $!\n";
kraken2-2.0.8-beta/scripts/build_silva_taxonomy.pl 0000775 0000000 0000000 00000002365 13460230711 0022254 0 ustar 00root root 0000000 0000000 #!/usr/bin/env perl
# Copyright 2013-2019, Derrick Wood
#
# This file is part of the Kraken 2 taxonomic sequence classification system.
# Parses Silva taxonomy file to create Kraken taxonomy
# Input (as <>): tax_slv_ssu_nr_119.txt
# Each input line supplies, tab-separated: a ";"-terminated lineage path,
# the node ID to use for it, and its rank.
# Output (in the current directory): names.dmp and nodes.dmp in NCBI
# dump format

use strict;
use warnings;
use File::Basename;

my $PROG = basename $0;

# Lineage path -> node ID; parents are looked up here, so a parent must
# appear in the input before any of its children.
my %id_map = ("root" => 1);
open my $names_fh, ">", "names.dmp"
  or die "$PROG: can't write names.dmp: $!\n";
open my $nodes_fh, ">", "nodes.dmp"
  or die "$PROG: can't write nodes.dmp: $!\n";
# The root node is written explicitly and is its own parent.
print {$names_fh} "1\t|\troot\t|\t-\t|\tscientific name\t|\n";
print {$nodes_fh} "1\t|\t1\t|\tno rank\t|\t-\t|\n";
while (<>) {
  chomp;
  my ($taxo_str, $node_id, $rank) = split /\t/;
  $id_map{$taxo_str} = $node_id;
  # Split the path into everything up to the last component (the parent's
  # path, possibly empty) and the last component (this node's name).
  if ($taxo_str =~ /^(.+;|)([^;]+);$/) {
    my $parent_name = $1;
    my $display_name = $2;
    if ($parent_name eq "") {
      $parent_name = "root";
    }
    my $parent_id = $id_map{$parent_name};
    if (! defined $parent_id) {
      die "$PROG: orphan error, line $.\n";
    }
    $rank = "superkingdom" if $rank eq "domain";
    print {$names_fh} "$node_id\t|\t$display_name\t|\t-\t|\tscientific name\t|\n";
    print {$nodes_fh} "$node_id\t|\t$parent_id\t|\t$rank\t|\t-\t|\n";
  }
  else {
    die "$PROG: strange input, line $.\n";
  }
}
# Buffered write errors (e.g. a full disk) only surface at close.
close $names_fh or die "$PROG: error writing names.dmp: $!\n";
close $nodes_fh or die "$PROG: error writing nodes.dmp: $!\n";
kraken2-2.0.8-beta/scripts/clean_db.sh 0000775 0000000 0000000 00000001107 13460230711 0017540 0 ustar 00root root 0000000 0000000 #!/bin/bash
# Copyright 2013-2019, Derrick Wood
#
# This file is part of the Kraken 2 taxonomic sequence classification system.
# Removes intermediate files from a database directory,
# such as reference library FASTA files and taxonomy data from NCBI.
# Reads KRAKEN2_DB_NAME from the environment and reports disk usage
# before and after on standard error.

set -u  # Protect against uninitialized vars.
set -e  # Stop on error

# Quoted so a database path containing whitespace is entered as one
# directory; the rm -rf below acts on whatever directory we end up in.
cd "$KRAKEN2_DB_NAME"
previous_usage=$(du -sh | cut -f1)
1>&2 echo "Database disk usage: $previous_usage"
rm -rf library/ taxonomy/ seqid2taxid.map
current_usage=$(du -sh | cut -f1)
1>&2 echo "After cleaning, database uses $current_usage"
kraken2-2.0.8-beta/scripts/cp_into_tempfile.pl 0000775 0000000 0000000 00000002375 13460230711 0021342 0 ustar 00root root 0000000 0000000 #!/usr/bin/env perl
# Copyright 2013-2019, Derrick Wood
#
# This file is part of the Kraken 2 taxonomic sequence classification system.
# Create a file in a specified directory, then copy an
# existing file's contents into the new file.  Write name of
# new file to standard output.
#
# This exists because the mktemp program doesn't act consistently across
# operating systems/distros/versions.
#
# Usage: cp_into_tempfile.pl -d directory -t template -s suffix filename
#   -d  existing directory in which the new file is created
#   -t  File::Temp name template
#   -s  suffix for the new file (a leading "." is optional)

use strict;
use warnings;
use File::Basename;
use File::Temp 'tempfile';
use Getopt::Std;

my $PROG = basename $0;
getopts('d:t:s:', \my %opts) or usage();
# All switches are mandatory. Test for presence rather than truth so that
# a value such as "0" is accepted.
for my $switch (qw/d t s/) {
  usage() unless defined $opts{$switch} && length $opts{$switch};
}
my ($directory, $template, $suffix) = @opts{qw/d t s/};
die "$PROG: '$directory' not a directory!\n" unless -d $directory;
die "$PROG: must specify a single filename\n" unless @ARGV == 1;

$suffix =~ s/^\.//;
my $old_filename = shift @ARGV;

open my $in_fh, "<", $old_filename
  or die "$PROG: can't read $old_filename: $!\n";
my ($out_fh, $new_filename) = tempfile($template, DIR => $directory,
                                       UNLINK => 0, SUFFIX => ".$suffix");
# Copy line by line from the source handle. The loop condition must read
# from $in_fh: an empty while() condition is always true and never reads.
while (my $line = <$in_fh>) {
  print {$out_fh} $line;
}
close $in_fh;
# Buffered write errors (e.g. a full disk) only surface at close.
close $out_fh or die "$PROG: error writing $new_filename: $!\n";

print "$new_filename\n";

# Print the expected command line and exit with an error.
sub usage {
  die "$PROG: <-d directory> <-t template> <-s suffix> <filename>\n";
}
kraken2-2.0.8-beta/scripts/download_genomic_library.sh 0000775 0000000 0000000 00000010727 13460230711 0023055 0 ustar 00root root 0000000 0000000 #!/bin/bash
# Copyright 2013-2019, Derrick Wood
#
# This file is part of the Kraken 2 taxonomic sequence classification system.
# Download specific genomic libraries for use with Kraken 2.
# Supported libraries were chosen based on support from NCBI's FTP site
# in easily obtaining a good collection of genomic data. Others may
# be added upon popular demand.
set -u # Protect against uninitialized vars.
set -e # Stop on error
# Script-wide settings. Reads KRAKEN2_DB_NAME and KRAKEN2_PROTEIN_DB from
# the environment; $1 is the name of the library to download.
LIBRARY_DIR="$KRAKEN2_DB_NAME/library"
NCBI_SERVER="ftp.ncbi.nlm.nih.gov"
# The same NCBI host, addressed over FTP and over rsync.
FTP_SERVER="ftp://$NCBI_SERVER"
RSYNC_SERVER="rsync://$NCBI_SERVER"
# NOTE(review): THIS_DIR is not referenced again in this script - confirm
# nothing depends on it before removing.
THIS_DIR=$PWD
library_name="$1"
ftp_subdir=$library_name
# Sequence data is stored as library.fna, or library.faa when
# KRAKEN2_PROTEIN_DB is non-empty.
library_file="library.fna"
if [ -n "$KRAKEN2_PROTEIN_DB" ]; then
library_file="library.faa"
fi
# Download one file from the NCBI server into the current directory.
#   $1 - absolute path of the file on the server (starts with "/")
# Uses wget over FTP when KRAKEN2_USE_FTP is non-empty, rsync otherwise.
# Returns the exit status of the transfer command.
# The path is kept in a local variable and the URL is quoted so that it is
# always passed to the transfer tool as a single argument.
function download_file() {
  local file="$1"
  if [ -n "$KRAKEN2_USE_FTP" ]
  then
    wget -q "${FTP_SERVER}${file}"
  else
    rsync --no-motd "${RSYNC_SERVER}${file}" .
  fi
}
# Dispatch on the requested library name. Every branch changes into
# $LIBRARY_DIR/<name>, downloads the sequence data and records sequence IDs
# in prelim_map.txt via scan_fasta_file.pl.
case $library_name in
# RefSeq groups, located through the group's assembly_summary.txt.
"archaea" | "bacteria" | "viral" | "fungi" | "plant" | "human" | "protozoa")
mkdir -p $LIBRARY_DIR/$library_name
cd $LIBRARY_DIR/$library_name
rm -f assembly_summary.txt
remote_dir_name=$library_name
# Human data sits under a different remote directory than its library name.
if [ "$library_name" = "human" ]; then
remote_dir_name="vertebrate_mammalian/Homo_sapiens"
fi
if ! download_file "/genomes/refseq/$remote_dir_name/assembly_summary.txt"; then
1>&2 echo "Error downloading assembly summary file for $library_name, exiting."
exit 1
fi
# For human, keep only the summary rows that mention
# "Genome Reference Consortium".
if [ "$library_name" = "human" ]; then
grep "Genome Reference Consortium" assembly_summary.txt > x
mv x assembly_summary.txt
fi
# Remove the results of any earlier download before fetching again.
rm -rf all/ library.f* manifest.txt rsync.err
# NOTE(review): rsync_from_ncbi.pl is presumably what produces
# $library_file from the summary - confirm against that script.
rsync_from_ncbi.pl assembly_summary.txt
scan_fasta_file.pl $library_file >> prelim_map.txt
;;
"plasmid")
mkdir -p $LIBRARY_DIR/plasmid
cd $LIBRARY_DIR/plasmid
rm -f library.f* plasmid.*
## This is staying FTP only D/L for now
1>&2 echo -n "Downloading plasmid files from FTP..."
# Spider the remote directory, keeping wget's .listing file so the file
# names can be taken from it.
wget -q --no-remove-listing --spider $FTP_SERVER/genomes/refseq/plasmid/
# The manifest is the last field of each listing line (carriage returns
# removed), restricted to protein or nucleotide archives.
if [ -n "$KRAKEN2_PROTEIN_DB" ]; then
awk '{ print $NF }' .listing | perl -ple 'tr/\r//d' | grep '\.faa\.gz' > manifest.txt
else
awk '{ print $NF }' .listing | perl -ple 'tr/\r//d' | grep '\.fna\.gz' > manifest.txt
fi
# Fetch every listed archive, then decompress them all into one file.
cat manifest.txt | xargs -n1 -I{} wget -q $FTP_SERVER/genomes/refseq/plasmid/{}
cat manifest.txt | xargs -n1 -I{} gunzip -c {} > $library_file
rm -f plasmid.* .listing
scan_fasta_file.pl $library_file > prelim_map.txt
1>&2 echo " done."
;;
# BLAST FASTA databases; nr and env_nr are only allowed for protein DBs.
"env_nr" | "nr" | "env_nt" | "nt")
protein_lib=0
if [ "$library_name" = "env_nr" ] || [ "$library_name" = "nr" ]; then
protein_lib=1
fi
if (( protein_lib == 1 )) && [ -z "$KRAKEN2_PROTEIN_DB" ]; then
1>&2 echo "$library_name is a protein database, and the Kraken DB specified is nucleotide"
exit 1
fi
mkdir -p $LIBRARY_DIR/$library_name
cd $LIBRARY_DIR/$library_name
rm -f $library_name.gz
1>&2 echo -n "Downloading $library_name database from server... "
download_file "/blast/db/FASTA/$library_name.gz"
1>&2 echo "done."
1>&2 echo -n "Uncompressing $library_name database..."
gunzip $library_name.gz
mv $library_name $library_file
1>&2 echo "done."
1>&2 echo -n "Parsing $library_name FASTA file..."
# The nr/nt files tend to have non-standard sequence IDs, so
# --lenient is used here.
scan_fasta_file.pl --lenient $library_file >> prelim_map.txt
1>&2 echo "done."
;;
# Nucleotide-only; every sequence receives the same fixed taxonomy ID.
"UniVec" | "UniVec_Core")
if [ -n "$KRAKEN2_PROTEIN_DB" ]; then
1>&2 echo "$library_name is for nucleotide databases only"
exit 1
fi
mkdir -p $LIBRARY_DIR/$library_name
cd $LIBRARY_DIR/$library_name
1>&2 echo -n "Downloading $library_name data from server... "
download_file "/pub/UniVec/$library_name"
1>&2 echo "done."
# 28384: "other sequences"
special_taxid=28384
1>&2 echo -n "Adding taxonomy ID of $special_taxid to all sequences... "
# Prefix every FASTA header with "kraken:taxid|<id>|".
sed -e "s/^>/>kraken:taxid|$special_taxid|/" $library_name > library.fna
scan_fasta_file.pl library.fna > prelim_map.txt
1>&2 echo "done."
;;
*)
1>&2 echo "Unsupported library. Valid options are: "
1>&2 echo " archaea bacteria viral fungi plant protozoa human plasmid"
1>&2 echo " nr nt env_nr env_nt UniVec UniVec_Core"
exit 1
;;
esac
# Each successful branch leaves the shell inside the library's directory,
# so "." is the directory handed to the masking script.
if [ -n "$KRAKEN2_MASK_LC" ]; then
1>&2 echo -n "Masking low-complexity regions of downloaded library..."
mask_low_complexity.sh .
1>&2 echo " done."
fi
kraken2-2.0.8-beta/scripts/download_taxonomy.sh 0000775 0000000 0000000 00000003340 13460230711 0021557 0 ustar 00root root 0000000 0000000 #!/bin/bash
# Copyright 2013-2019, Derrick Wood
#
# This file is part of the Kraken 2 taxonomic sequence classification system.
# Download NCBI taxonomy information for Kraken 2.
# Designed to be called by kraken2-build
set -u # Protect against uninitialized vars.
set -e # Stop on error
# Script-wide settings. Reads KRAKEN2_DB_NAME from the environment.
TAXONOMY_DIR="$KRAKEN2_DB_NAME/taxonomy"
NCBI_SERVER="ftp.ncbi.nlm.nih.gov"
# The same NCBI host, addressed over rsync and over FTP.
RSYNC_SERVER="rsync://$NCBI_SERVER"
FTP_SERVER="ftp://$NCBI_SERVER"
# Everything below runs inside the taxonomy directory, so downloads and
# flag files are created there.
mkdir -p "$TAXONOMY_DIR"
cd "$TAXONOMY_DIR"
# download_file PATH
#
# Fetch one file from the NCBI server into the current directory.
#   PATH  server-side path beginning with "/", appended to the server URL.
# Uses wget over FTP when KRAKEN2_USE_FTP is non-empty, rsync otherwise.
# A failing transfer returns non-zero, which stops the script under "set -e".
function download_file() {
  # "local" keeps the argument out of the script's global namespace.
  local file="$1"
  if [ -n "$KRAKEN2_USE_FTP" ]
  then
    # Quoted so the URL is always passed as a single, unglobbed word.
    wget -q "${FTP_SERVER}${file}"
  else
    rsync --no-motd "${RSYNC_SERVER}${file}" .
  fi
}
# Step 1: accession-to-taxid maps. Skipped when the accmap.dlflag marker
# already exists or when KRAKEN2_SKIP_MAPS is non-empty.
if [ ! -e "accmap.dlflag" ] && [ -z "$KRAKEN2_SKIP_MAPS" ]; then
  if [ -n "$KRAKEN2_PROTEIN_DB" ]; then
    echo -n "Downloading protein accession to taxon map..." >&2
    download_file "/pub/taxonomy/accession2taxid/prot.accession2taxid.gz"
    echo " done." >&2
  else
    for section in gb wgs; do
      echo -n "Downloading nucleotide ${section} accession to taxon map..." >&2
      download_file "/pub/taxonomy/accession2taxid/nucl_${section}.accession2taxid.gz"
      echo " done." >&2
    done
  fi
  # Marker file: lets a later run skip the map downloads.
  touch accmap.dlflag
  echo "Downloaded accession to taxon map(s)" >&2
fi

# Step 2: taxonomy tree archive, downloaded once (guarded by taxdump.dlflag).
if [ ! -e "taxdump.dlflag" ]; then
  echo -n "Downloading taxonomy tree data..." >&2
  download_file "/pub/taxonomy/taxdump.tar.gz"
  touch taxdump.dlflag
  echo " done." >&2
fi

# Step 3: decompress any accession maps that are still gzip-compressed.
if ls | grep -q 'accession2taxid\.gz$'; then
  echo -n "Uncompressing taxonomy data..." >&2
  gunzip *accession2taxid.gz
  echo " done." >&2
fi

# Step 4: unpack the taxonomy tree archive once (guarded by
# taxdump.untarflag).
if [ ! -e "taxdump.untarflag" ]; then
  echo -n "Untarring taxonomy tree data..." >&2
  tar zxf taxdump.tar.gz
  touch taxdump.untarflag
  echo " done." >&2
fi
kraken2-2.0.8-beta/scripts/kraken2 0000775 0000000 0000000 00000016371 13460230711 0016746 0 ustar 00root root 0000000 0000000 #!/usr/bin/env perl
# Copyright 2013-2019, Derrick Wood
#
# This file is part of the Kraken 2 taxonomic sequence classification system.
# Wrapper for Kraken's classifier
use strict;
use warnings;
use Fcntl;
use File::Basename;
use Getopt::Long;
# Short program name, used as the prefix of diagnostics.
my $PROG = basename($0);

# Placeholder string; presumably substituted with the real installation
# directory when Kraken 2 is installed.
my $KRAKEN2_DIR = "#####=KRAKEN2_DIR=#####";

# If the classifier executable is not under the recorded directory, assume the
# executables were moved and fall back to the directory holding this script.
# ("use" is processed at compile time, so abs_path is imported whether or not
# this branch runs.)
unless (-e "$KRAKEN2_DIR/classify") {
  use Cwd 'abs_path';
  $KRAKEN2_DIR = dirname(abs_path($0));
}

# Load the shared helper library from the installation directory.
require "$KRAKEN2_DIR/kraken2lib.pm";

# Publish the directory to child processes and put it first on PATH.
$ENV{KRAKEN2_DIR} = $KRAKEN2_DIR;
$ENV{PATH} = $KRAKEN2_DIR . ":" . $ENV{PATH};

# Full path of the classifier executable that this wrapper execs.
my $CLASSIFY = "$KRAKEN2_DIR/classify";
# Leading two bytes that identify gzip and bzip2 files; compared against the
# start of the first input file by auto_detect_file_format.
my $GZIP_MAGIC = chr(hex "1f") . chr(hex "8b");
my $BZIP2_MAGIC = "BZ";

# Command-line option values and their defaults. Variables left undefined
# mean "option not given". (The unused $min_hits variable, which no option
# set and no code read, has been dropped.)
my $quick = 0;
my $db_prefix;
my $threads;
my $memory_mapping = 0;
my $gunzip = 0;
my $bunzip2 = 0;
my $paired = 0;
my $names_in_output = 0;
my $only_classified_output = 0;
my $unclassified_out;
my $classified_out;
my $outfile;
my $confidence_threshold = 0.0;
my $minimum_base_quality = 0;
my $report_filename;
my $use_mpa_style = 0;
my $report_zero_counts = 0;
# Parse the command line into the option variables declared above.
# GetOptions returns false when it meets an unknown or malformed option; in
# that case stop with the usage message rather than continuing with a
# partially parsed command line (a mistyped option would otherwise leave its
# value in @ARGV, where it is treated as an input filename).
GetOptions(
  "help" => \&display_help,
  "version" => \&display_version,
  "db=s" => \$db_prefix,
  "threads=i" => \$threads,
  "quick" => \$quick,
  "unclassified-out=s" => \$unclassified_out,
  "classified-out=s" => \$classified_out,
  "output=s" => \$outfile,
  "confidence=f" => \$confidence_threshold,
  "memory-mapping" => \$memory_mapping,
  "paired" => \$paired,
  "use-names" => \$names_in_output,
  "gzip-compressed" => \$gunzip,
  "bzip2-compressed" => \$bunzip2,
  "only-classified-output" => \$only_classified_output,
  "minimum-base-quality=i" => \$minimum_base_quality,
  "report=s" => \$report_filename,
  "use-mpa-style" => \$use_mpa_style,
  "report-zero-counts" => \$report_zero_counts,
) or usage();
# Thread count: an explicit --threads wins, then a true value of
# KRAKEN2_NUM_THREADS, then 1.
unless (defined $threads) {
  $threads = $ENV{"KRAKEN2_NUM_THREADS"} || 1;
}

# At least one input filename is mandatory.
unless (@ARGV) {
  print STDERR "Need to specify input filenames!\n";
  usage();
}

# Resolve the database location. Any exception raised by find_db is re-raised
# with the program name prepended.
eval { $db_prefix = kraken2lib::find_db($db_prefix); };
die "$PROG: $@" if $@;

# The three database files handed to the classifier must all exist.
my $taxonomy = "$db_prefix/taxo.k2d";
my $kht_file = "$db_prefix/hash.k2d";
my $opt_file = "$db_prefix/opts.k2d";
for my $required ($taxonomy, $kht_file, $opt_file) {
  -e $required or die "$PROG: $required does not exist!\n";
}

# Paired mode needs the input files to come in pairs.
if ($paired && (@ARGV == 0 || @ARGV % 2 != 0)) {
  die "$PROG: --paired requires positive and even number filenames\n";
}

# At most one compression flag may be given.
die "$PROG: can't use both gzip and bzip2 compression flags\n"
  if $gunzip && $bunzip2;
my $compressed = $gunzip || $bunzip2;

# The confidence threshold must lie in [0, 1].
die "$PROG: confidence threshold must be nonnegative\n"
  if $confidence_threshold < 0;
die "$PROG: confidence threshold must be no greater than 1\n"
  if $confidence_threshold > 1;

# With no explicit compression flag, inspect the first input file instead.
my $auto_detect = ! $compressed;
auto_detect_file_format() if $auto_detect;
# Assemble the classifier's argument list in one expression. Switches that
# always apply are listed directly; optional ones contribute an empty list
# when their option is off or undefined.
my @flags = (
  "-H", $kht_file,
  "-t", $taxonomy,
  "-o", $opt_file,
  "-p", $threads,
  ($quick           ? ("-q") : ()),
  ($paired          ? ("-P") : ()),
  ($names_in_output ? ("-n") : ()),
  "-T", $confidence_threshold,
  (defined $unclassified_out ? ("-U", $unclassified_out) : ()),
  (defined $classified_out   ? ("-C", $classified_out)   : ()),
  (defined $outfile          ? ("-O", $outfile)          : ()),
  "-Q", $minimum_base_quality,
  (defined $report_filename  ? ("-R", $report_filename)  : ()),
  ($use_mpa_style      ? ("-m") : ()),
  ($report_zero_counts ? ("-z") : ()),
  ($memory_mapping     ? ("-M") : ()),
);
# Pipe handles opened inside the loop below would go out of scope, and be
# closed, at the end of each iteration. Storing them here keeps them open
# until the exec.
my @persistent_fhs;

# Handle compressed input by replacing each filename with the /dev/fd path of
# a pipe reading from a decompression program.
if ($compressed) {
  my $compression_program;
  if ($gunzip) {
    $compression_program = "gzip";
  }
  elsif ($bunzip2) {
    $compression_program = "bzip2";
  }
  else {
    die "$PROG: unrecognized compression program! This is a Kraken bug.\n";
  }

  my @replacement_ARGV;
  for my $file (@ARGV) {
    # List-form pipe open: the filename is handed to the decompressor as a
    # single argument without going through a shell, so no quoting is needed
    # and shell metacharacters in the name cannot be misinterpreted.
    open my $fh, "-|", $compression_program, "-dc", $file
      or die "$PROG: error opening pipe from $compression_program for $file: $!\n";

    # Have to unset the close-on-exec flag to make this pipe stay open across
    # the exec call. fcntl returns "0 but true" for a zero result, so "or die"
    # only fires on a real failure.
    my $fd_flags = fcntl $fh, F_GETFD, 0
      or die "$PROG: fcntl GETFD error: $!\n";
    fcntl $fh, F_SETFD, ($fd_flags & ~FD_CLOEXEC)
      or die "$PROG: fcntl SETFD error: $!\n";

    push @persistent_fhs, $fh;
    push @replacement_ARGV, "/dev/fd/" . fileno($fh);
  }
  @ARGV = @replacement_ARGV;
}

# Replace this process with the classifier; exec only returns on failure.
exec $CLASSIFY, @flags, @ARGV;
die "$PROG: exec error: $!\n";
# usage([EXIT_CODE])
#
# Print the command-line help to STDERR and exit.
#   EXIT_CODE  process exit status; defaults to 64 when omitted.
# Never returns.
sub usage {
  my $exit_code = @_ ? shift : 64;

  # Show the database that would be used by default. If find_db throws, the
  # eval swallows the error and "none" is shown instead.
  my $default_db = "none";
  eval { $default_db = '"' . kraken2lib::find_db() . '"'; };

  # Same rule the main program applies when --threads is not given, so the
  # help text reports the value that would actually be used.
  my $def_thread_ct = $ENV{"KRAKEN2_NUM_THREADS"} || 1;

  print STDERR <<EOF;
Usage: $PROG [options] <filename(s)>

Options:
  --db NAME               Name for Kraken 2 DB
                          (default: $default_db)
  --threads NUM           Number of threads (default: $def_thread_ct)
  --quick                 Quick operation (use first hit or hits)
  --unclassified-out FILENAME
                          Print unclassified sequences to filename
  --classified-out FILENAME
                          Print classified sequences to filename
  --output FILENAME       Print output to filename (default: stdout); "-" will
                          suppress normal output
  --confidence FLOAT      Confidence score threshold (default: 0.0); must be
                          in [0, 1].
  --minimum-base-quality NUM
                          Minimum base quality used in classification (def: 0,
                          only effective with FASTQ input).
  --report FILENAME       Print a report with aggregate counts/clade to file
  --use-mpa-style         With --report, format report output like Kraken 1's
                          kraken-mpa-report
  --report-zero-counts    With --report, report counts for ALL taxa, even if
                          counts are zero
  --memory-mapping        Avoids loading database into RAM
  --paired                The filenames provided have paired-end reads
  --use-names             Print scientific names instead of just taxids
  --gzip-compressed       Input files are compressed with gzip
  --bzip2-compressed      Input files are compressed with bzip2
  --help                  Print this message
  --version               Print version information

If none of the *-compressed flags are specified, and the filename provided
is a regular file, automatic format detection is attempted.
EOF
  exit $exit_code;
}
# Handler for --help: print the usage text and exit with status 0.
sub display_help {
  return usage(0);
}
# Handler for --version: print the version banner and copyright line to
# STDOUT, then exit with status 0.
sub display_version {
  print
    "Kraken version #####=VERSION=#####\n",
    "Copyright 2013-2019, Derrick Wood (dwood\@cs.jhu.edu)\n";
  exit 0;
}
# auto_detect_file_format()
#
# Inspect the first input file ($ARGV[0]) and, if it starts with the gzip or
# bzip2 magic number, set $compressed together with $gunzip or $bunzip2.
# Only the first file is examined. Takes no arguments and returns nothing
# useful; dies if a regular file cannot be opened or read.
sub auto_detect_file_format {
  my $filename = $ARGV[0];

  # Don't try to auto-detect when you can't unread data (pipes, devices,
  # /dev/fd entries and so on), or when there is no filename at all.
  return if !defined($filename) || !-f $filename;

  # Read the 2-byte magic number to determine the type of compression (if
  # any). A lexical handle in binary mode is used so the bytes are compared
  # exactly as stored on disk.
  open my $fh, "<", $filename
    or die "$PROG: can't open $filename to detect its format: $!\n";
  binmode $fh;
  my $magic = "";
  my $bytes_read = read $fh, $magic, 2;
  if (!defined $bytes_read) {
    die "$PROG: error reading $filename to detect its format: $!\n";
  }
  close $fh;

  # A file shorter than two bytes leaves $magic too short to match either
  # signature, so it is treated as uncompressed.
  if ($magic eq $GZIP_MAGIC) {
    $compressed = 1;
    $gunzip = 1;
  }
  elsif ($magic eq $BZIP2_MAGIC) {
    $compressed = 1;
    $bunzip2 = 1;
  }
  return;
}
kraken2-2.0.8-beta/scripts/kraken2-build 0000775 0000000 0000000 00000021253 13460230711 0020036 0 ustar 00root root 0000000 0000000 #!/usr/bin/env perl
# Copyright 2013-2019, Derrick Wood
#
# This file is part of the Kraken 2 taxonomic sequence classification system.
# General build process wrapper for Kraken 2.
use strict;
use warnings;
use File::Basename;
use Getopt::Long;
# Short program name.
my $PROG = basename($0);

# Placeholder string; presumably substituted with the real installation
# directory when Kraken 2 is installed.
my $KRAKEN2_DIR = "#####=KRAKEN2_DIR=#####";

# If the classifier executable is not under the recorded directory, assume the
# executables were moved and fall back to the directory holding this script.
# ("use" is processed at compile time, so abs_path is imported whether or not
# this branch runs.)
unless (-e "$KRAKEN2_DIR/classify") {
  use Cwd 'abs_path';
  $KRAKEN2_DIR = dirname(abs_path($0));
}

# Publish the directory to child processes and put it first on PATH.
$ENV{KRAKEN2_DIR} = $KRAKEN2_DIR;
$ENV{PATH} = $KRAKEN2_DIR . ":" . $ENV{PATH};
# Defaults applied when --protein is given (AA) ...
my $DEF_AA_MINIMIZER_LEN    = 12;
my $DEF_AA_KMER_LEN         = 15;
my $DEF_AA_MINIMIZER_SPACES = 0;

# ... and when it is not (NT).
my $DEF_NT_MINIMIZER_LEN    = 31;
my $DEF_NT_KMER_LEN         = 35;
my $DEF_NT_MINIMIZER_SPACES = 7;

# Thread count used when --threads is not given.
my $DEF_THREAD_CT = 1;

# Names accepted as library types and as special database types.
my @VALID_LIBRARY_TYPES = qw(
  archaea bacteria plasmid viral plant
  protozoa fungi human nr nt env_nr env_nt
  UniVec UniVec_Core
);
my @VALID_SPECIAL_DB_TYPES = qw(greengenes silva rdp);
# General option variables. Only the thread count and the protein switch
# start with a value; the rest stay undefined until set on the command line.
my $db;
my $threads = $DEF_THREAD_CT;
my ($minimizer_len, $kmer_len, $minimizer_spaces);
my $is_protein = 0;
my ($no_masking, $max_db_size, $use_ftp, $skip_maps);

# Task option variables: each stays undefined unless its task is requested.
my ($dl_taxonomy, $dl_library, $add_to_library, $build,
    $standard, $clean, $special);

# References to every task variable, used to count how many tasks were
# selected. \(...) yields one reference per listed variable.
my @TASK_LIST = \(
  $dl_taxonomy,
  $dl_library,
  $add_to_library,
  $build,
  $standard,
  $clean,
  $special,
);
# Parse the command line; an unknown or malformed option prints the usage
# message via usage().
GetOptions(
  "help"               => \&display_help,
  "version"            => \&display_version,

  # General options
  "db=s"               => \$db,
  "threads=i"          => \$threads,
  "minimizer-len=i"    => \$minimizer_len,
  "kmer-len=i"         => \$kmer_len,
  "minimizer-spaces=i" => \$minimizer_spaces,
  "protein"            => \$is_protein,
  "no-masking"         => \$no_masking,
  "max-db-size=i"      => \$max_db_size,
  "use-ftp"            => \$use_ftp,
  "skip-maps"          => \$skip_maps,

  # Task options
  "download-taxonomy"  => \$dl_taxonomy,
  "download-library=s" => \$dl_library,
  "add-to-library=s"   => \$add_to_library,
  "build"              => \$build,
  "standard"           => \$standard,
  "clean"              => \$clean,
  "special=s"          => \$special,
) or usage();
# Fill in any length/spacing parameter the user did not supply, using the
# protein or nucleotide defaults as appropriate.
if ($is_protein) {
  defined $kmer_len         or $kmer_len         = $DEF_AA_KMER_LEN;
  defined $minimizer_len    or $minimizer_len    = $DEF_AA_MINIMIZER_LEN;
  defined $minimizer_spaces or $minimizer_spaces = $DEF_AA_MINIMIZER_SPACES;
}
else {
  defined $kmer_len         or $kmer_len         = $DEF_NT_KMER_LEN;
  defined $minimizer_len    or $minimizer_len    = $DEF_NT_MINIMIZER_LEN;
  defined $minimizer_spaces or $minimizer_spaces = $DEF_NT_MINIMIZER_SPACES;
}

# Everything on the command line must have been consumed as an option.
if (@ARGV) {
  warn "Extra arguments on command line.\n";
  usage();
}

# Exactly one task option must have been selected.
my $task_options = grep { defined ${$_} } @TASK_LIST;
if ($task_options > 1) {
  warn "More than one task option selected.\n";
  usage();
}
if ($task_options == 0) {
  warn "Must select a task option.\n";
  usage();
}

# Sanity checks on the remaining parameters, in the original order.
die "Must specify a database name\n"
  unless defined $db;
die "Can't use nonpositive thread count of $threads\n"
  if $threads <= 0;
die "Minimizer length ($minimizer_len) must not be greater than k ($kmer_len)\n"
  if $minimizer_len > $kmer_len;
die "Can't use nonpositive minimizer length of $minimizer_len\n"
  if $minimizer_len <= 0;
die "Can't use minimizer len of $minimizer_len (must be <= 31)\n"
  if $minimizer_len > 31;
# Hand the settings to child processes through the environment. Boolean
# settings are exported as 1 or the empty string. The assignments are kept
# in their original order.
$ENV{KRAKEN2_DB_NAME}          = $db;
$ENV{KRAKEN2_THREAD_CT}        = $threads;
$ENV{KRAKEN2_MINIMIZER_LEN}    = $minimizer_len;
$ENV{KRAKEN2_KMER_LEN}         = $kmer_len;
$ENV{KRAKEN2_MINIMIZER_SPACES} = $minimizer_spaces;
$ENV{KRAKEN2_SEED_TEMPLATE}    = construct_seed_template();
$ENV{KRAKEN2_PROTEIN_DB}       = ($is_protein ? 1 : "");
# Masking is on unless --no-masking was given.
$ENV{KRAKEN2_MASK_LC}          = ($no_masking ? "" : 1);
$ENV{KRAKEN2_MAX_DB_SIZE}      = (defined $max_db_size ? $max_db_size : "");
$ENV{KRAKEN2_USE_FTP}          = ($use_ftp ? 1 : "");
$ENV{KRAKEN2_SKIP_MAPS}        = ($skip_maps ? 1 : "");
# Run the single selected task. Tasks that take a value are tested with
# defined(); the flag-style tasks are tested for truth.
if    ($dl_taxonomy)            { download_taxonomy(); }
elsif (defined $dl_library)     { download_library($dl_library); }
elsif (defined $add_to_library) { add_to_library($add_to_library); }
elsif ($standard)               { standard_installation(); }
elsif ($build)                  { build_database(); }
elsif ($clean)                  { clean_database(); }
elsif ($special)                { build_special_database($special); }
else                            { usage(); }

# Reached only if the chosen task routine returns instead of exiting.
exit -1;
# END OF MAIN CODE.
sub usage {
my $exit_code = @_ ? shift : 64;
print STDERR <