kraken-1.1/CHANGELOG
v1.1:
* added --out-fmt paired and --out-fmt interleaved to allow paired reads
to be separated when using --[un]classified-out options
* updated MANUAL.html to reflect additional options.
v1.0:
* removed dependence on GI numbers
* using taxonomy nucl_*.accession2taxid maps to create seqid2taxid maps
* changed kraken-build --download method to use rsync
v0.10.6-beta:
* fixed overflow bug in command line parsing
* fixed GRCh38.p2 bug in human genome downloads
* fixed installation exit code bug
v0.10.5-beta:
* fix bug in GRCh38 download to handle multi-fasta files
* add --header-line and --intermediate-ranks options to kraken-mpa-report
* improved support for adding multi-FASTA files with --add-to-library
* allow assigning taxon IDs in reference sequences w/o GI numbers
using "kraken:taxid" code
* included full sequence descriptions when using "--[un]classified-out"
* reduced memory usage of db_shrink (Build step 2 / kraken-build --shrink)
* reduced memory usage of db_sort (Build step 3)
* reduced memory usage of set_lcas (Build step 6)
* support added for KRAKEN_NUM_THREADS, KRAKEN_DB_PATH, and KRAKEN_DEFAULT_DB
env. variables
* added kraken-translate for getting taxonomic names for each sequence
* added a --rebuild option to kraken-build
* turned off default name checking for PE reads; added --check-names option
* added plasmids to --download-library options
* added HTML manual, redirecting README to that
v0.10.4-beta:
* use GRCh38 for human genome library
* enable input via stdin (via /dev/fd/0)
* enable compressed (gzip/bzip2) input
* enable auto-detection of fasta/fastq/gz/bz2
* simplified add_to_library.sh code to speed up large additions
* use RNA genomes for viral genome library
* scan .ffn (RNA) files for genomic data when building databases
* handle paired-end reads with --paired option
* provide MetaPhlAn-compatible output with kraken-mpa-report
* added domain/kingdom codes to kraken-report
* added kraken-filter script for simple confidence scoring
* added support for multi-FASTA files in custom DBs
* fixed build_kraken_db.sh bug for k-mers w/ k < 31
* updates to README file
v0.10.3-beta:
* remove Fatal.pm use in kraken-report
* fixed false success message on make failure in installer
* explicitly require g++ as C++ compiler in Makefile
* change to quickfile.cpp to do proper syncing on close
* fixed kraken-build bug w/ --work-on-disk (cause of some major build stalls)
* changed hash size calculation to use Perl
* close input files explicitly in db_sort/db_shrink to reduce reported memory
* allow db_shrink to work in RAM
* updates to README file
v0.10.2-beta:
* fixed kraken-report bug w/ --show-zeros
* fixed kraken-report installation bug
* updates to README file
v0.10.1-beta:
* fixed 2nd bug in build_kraken.sh in calculating hash size (thanks T. Antao)
* fixed bug in add_to_library.sh for some bash versions (thanks T. Antao)
* fixed issue where search window wasn't cached until a failure (query speedup)
* added $KRAKEN_DIR fallback for kraken/kraken-build (thanks S. Koren)
v0.10.0-beta:
* added CHANGELOG
* fixed quick mode hit list output
* updated README citation
* changed minimizer sort order (query speedup), changes database structure
* use linear search with small windows (query speedup)
* changed query procedure (query speedup); search w/o 1st calculating minimizer
* changed readlink in installer to perl Cwd::abs_path (portability)
* removed MAP_POPULATE for preloading, uses read loop instead (bugfix/port.)
* added --work-on-disk switch to kraken-build
* added kraken-report script
* fixed bug in build_kraken.sh in calculating hash size (thanks T. Antao)
v0.9.1b:
* fixed bug to allow kraken-build --shrink
v0.9.0b:
* full rewrite
* minimizers used to speed queries, prefix index removed
v0.3:
* DB build parallelized, Jellyfish removed from LCA assignment
v0.2:
* full rewrite, most progs. changed to C++
* Jellyfish removed from classification step
* prefix index used to speed queries
v0.1:
* initial version, mostly Perl
kraken-1.1/LICENSE
GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The GNU General Public License is a free, copyleft license for
software and other kinds of works.
The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users. We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors. You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.
To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights. Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received. You must make sure that they, too, receive
or can get the source code. And you must show them these terms so they
know their rights.
Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.
For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software. For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.
Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so. This is fundamentally incompatible with the aim of
protecting users' freedom to change the software. The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable. Therefore, we
have designed this version of the GPL to prohibit the practice for those
products. If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.
Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary. To prevent this, the GPL assures that
patents cannot be used to render the program non-free.
The precise terms and conditions for copying, distribution and
modification follow.
TERMS AND CONDITIONS
0. Definitions.
"This License" refers to version 3 of the GNU General Public License.
"Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.
"The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.
To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.
A "covered work" means either the unmodified Program or a work based
on the Program.
To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.
To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.
An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.
1. Source Code.
The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.
A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.
The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.
The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.
The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.
The Corresponding Source for a work in source code form is that
same work.
2. Basic Permissions.
All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.
You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.
Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.
When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.
4. Conveying Verbatim Copies.
You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.
You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.
5. Conveying Modified Source Versions.
You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:
a) The work must carry prominent notices stating that you modified
it, and giving a relevant date.
b) The work must carry prominent notices stating that it is
released under this License and any conditions added under section
7. This requirement modifies the requirement in section 4 to
"keep intact all notices".
c) You must license the entire work, as a whole, under this
License to anyone who comes into possession of a copy. This
License will therefore apply, along with any applicable section 7
additional terms, to the whole of the work, and all its parts,
regardless of how they are packaged. This License gives no
permission to license the work in any other way, but it does not
invalidate such permission if you have separately received it.
d) If the work has interactive user interfaces, each must display
Appropriate Legal Notices; however, if the Program has interactive
interfaces that do not display Appropriate Legal Notices, your
work need not make them do so.
A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.
6. Conveying Non-Source Forms.
You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:
a) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by the
Corresponding Source fixed on a durable physical medium
customarily used for software interchange.
b) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by a
written offer, valid for at least three years and valid for as
long as you offer spare parts or customer support for that product
model, to give anyone who possesses the object code either (1) a
copy of the Corresponding Source for all the software in the
product that is covered by this License, on a durable physical
medium customarily used for software interchange, for a price no
more than your reasonable cost of physically performing this
conveying of source, or (2) access to copy the
Corresponding Source from a network server at no charge.
c) Convey individual copies of the object code with a copy of the
written offer to provide the Corresponding Source. This
alternative is allowed only occasionally and noncommercially, and
only if you received the object code with such an offer, in accord
with subsection 6b.
d) Convey the object code by offering access from a designated
place (gratis or for a charge), and offer equivalent access to the
Corresponding Source in the same way through the same place at no
further charge. You need not require recipients to copy the
Corresponding Source along with the object code. If the place to
copy the object code is a network server, the Corresponding Source
may be on a different server (operated by you or a third party)
that supports equivalent copying facilities, provided you maintain
clear directions next to the object code saying where to find the
Corresponding Source. Regardless of what server hosts the
Corresponding Source, you remain obligated to ensure that it is
available for as long as needed to satisfy these requirements.
e) Convey the object code using peer-to-peer transmission, provided
you inform other peers where the object code and Corresponding
Source of the work are being offered to the general public at no
charge under subsection 6d.
A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.
A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.
"Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.
If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).
The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.
Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.
7. Additional Terms.
"Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.
When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.
Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:
a) Disclaiming warranty or limiting liability differently from the
terms of sections 15 and 16 of this License; or
b) Requiring preservation of specified reasonable legal notices or
author attributions in that material or in the Appropriate Legal
Notices displayed by works containing it; or
c) Prohibiting misrepresentation of the origin of that material, or
requiring that modified versions of such material be marked in
reasonable ways as different from the original version; or
d) Limiting the use for publicity purposes of names of licensors or
authors of the material; or
e) Declining to grant rights under trademark law for use of some
trade names, trademarks, or service marks; or
f) Requiring indemnification of licensors and authors of that
material by anyone who conveys the material (or modified versions of
it) with contractual assumptions of liability to the recipient, for
any liability that these contractual assumptions directly impose on
those licensors and authors.
All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.
If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.
Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.
8. Termination.
You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).
However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.
Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.
Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.
9. Acceptance Not Required for Having Copies.
You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.
10. Automatic Licensing of Downstream Recipients.
Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.
An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.
You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.
11. Patents.
A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".
A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.
Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.
In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.
If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.
If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.
A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.
Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.
12. No Surrender of Others' Freedom.
If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all. For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.
13. Use with the GNU Affero General Public License.
Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.
14. Revised Versions of this License.
The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.
If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.
Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.
15. Disclaimer of Warranty.
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. Limitation of Liability.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.
17. Interpretation of Sections 15 and 16.
If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year>  <name of author>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:
<program>  Copyright (C) <year>  <name of author>
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".
You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<http://www.gnu.org/licenses/>.
The GNU General Public License does not permit incorporating your program
into proprietary programs. If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License. But first, please read
<http://www.gnu.org/licenses/why-not-lgpl.html>.
kraken-1.1/README.md
Kraken taxonomic sequence classification system
===============================================
Please see the [Kraken webpage] or the [Kraken manual]
for information on installing and operating Kraken.
A local copy of the [Kraken manual] is also present here
in the `docs/` directory (`MANUAL.html` and `MANUAL.markdown`).
[Kraken webpage]: http://ccb.jhu.edu/software/kraken/
[Kraken manual]: http://ccb.jhu.edu/software/kraken/MANUAL.html
kraken-1.1/docs/MANUAL.html
Kraken Manual
Kraken is a taxonomic sequence classifier that assigns taxonomic labels to short DNA reads. It does this by examining the k-mers within a read and querying a database with those k-mers. This database contains a mapping of every k-mer in Kraken's genomic library to the lowest common ancestor (LCA) in a taxonomic tree of all genomes that contain that k-mer. The set of LCA taxa that correspond to the k-mers in a read are then analyzed to create a single taxonomic label for the read; this label can be any of the nodes in the taxonomic tree. Kraken is designed to be rapid, sensitive, and highly precise. Our tests on various real and simulated data have shown Kraken to have sensitivity slightly lower than Megablast with precision being slightly higher. On a set of simulated 100 bp reads, Kraken processed over 1.3 million reads per minute on a single core in normal operation, and over 4.1 million reads per minute in quick operation.
The latest released version of Kraken will be available at the Kraken website, and the latest updates to the Kraken source code are available at the Kraken GitHub repository.
If you use Kraken in your research, please cite the Kraken paper. Thank you!
Note: Users concerned about the disk or memory requirements should read the paragraph about MiniKraken, below.
Disk space: Construction of Kraken's standard database will require at least 500 GB of disk space as of Oct. 2017. Customized databases may require more or less space. After construction, the minimum required database files require approximately 200 GB of disk space. Disk space used is linearly proportional to the number of distinct k-mers; as of Oct. 2017, Kraken's default database contains approximately 14 billion (1.4e10) distinct k-mers.
In addition, the disk used to store the database should be locally-attached storage. Storing the database on a network filesystem (NFS) partition can cause Kraken's operation to be very slow, or to be stopped completely. As NFS accesses are much slower than local disk accesses, both preloading and database building will be slowed by use of NFS.
Memory: To run efficiently, Kraken requires enough free memory to hold the database in RAM. While this can be accomplished using a ramdisk, Kraken supplies a utility for loading the database into RAM via the OS cache. The default database size is 174 GB (as of Oct. 2017), and so you will need at least that much RAM if you want to build or run with the default database.
Dependencies: Kraken currently makes extensive use of Linux utilities such as sed, find, and wget. Many scripts are written using the Bash shell, and the main scripts are written using Perl. Core programs needed to build the database and run the classifier are written in C++, and need to be compiled using g++. Multithreading is handled using OpenMP. Downloads of NCBI data are performed by wget and in some cases, by rsync. Most Linux systems that have any sort of development package installed will have all of the above listed programs and libraries available.
Finally, if you want to build your own database, you will need to install the Jellyfish k-mer counter. Note that Kraken only supports use of Jellyfish version 1. Jellyfish version 2 is not compatible with Kraken.
Network connectivity: Kraken's standard database build and download commands expect unfettered FTP and rsync access to the NCBI FTP server. If you're working behind a proxy, you may need to set certain environment variables (such as ftp_proxy or RSYNC_PROXY) in order to get these commands to work properly.
MiniKraken: To allow users with low-memory computing environments to use Kraken, we supply a reduced standard database that can be downloaded from the Kraken web site. When Kraken is run with a reduced database, we call it MiniKraken.
The databases we make available are only 4 GB and 8 GB in size, and should run well on computers with as little as 8 GB and 16 GB of RAM (respectively). Disk space required for each MiniKraken database is also only 4 GB or 8 GB.
To begin using Kraken, you will first need to install it, and then either download or create a database.
Kraken consists of two main scripts ("kraken" and "kraken-build"), along with several programs and smaller scripts. As part of the installation process, all scripts and programs are installed in the same directory. After installation, you can move the main scripts elsewhere, but moving the other scripts and programs requires editing the scripts and changing the "$KRAKEN_DIR" variables.
Once a directory is selected, you need to run the following command in the directory where you extracted the Kraken source:
./install_kraken.sh $KRAKEN_DIR
(Replace "$KRAKEN_DIR" above with the directory where you want to install Kraken's programs/directories.)
The install_kraken.sh script should compile all of Kraken's code and set up your Kraken data directory. Installation is successful if you see the message "Kraken installation complete."
Once installation is complete, you may want to copy the two main Kraken scripts into a directory found in your PATH variable (e.g., "$HOME/bin"):
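The copy commands themselves were stripped from this copy of the manual; assuming the $KRAKEN_DIR install directory and the $HOME/bin example above, they would be along these lines:
cp $KRAKEN_DIR/kraken $HOME/bin
cp $KRAKEN_DIR/kraken-build $HOME/bin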
A Kraken database is a directory containing at least 4 files:
database.kdb: Contains the k-mer to taxon mappings
database.idx: Contains minimizer offset locations in database.kdb
taxonomy/nodes.dmp: Taxonomy tree structure + ranks
taxonomy/names.dmp: Taxonomy names
Other files may be present as part of the database build process.
In interacting with Kraken, you should not have to directly reference any of these files, but rather simply provide the name of the directory in which they are stored. Kraken allows both the use of a standard database as well as custom databases; these are described in the sections Standard Kraken Database and Custom Databases below, respectively.
NOTE: Building the standard Kraken database downloads and uses all complete bacterial, archaeal, and viral genomes in RefSeq at the time of the build.
As of October 2017, this includes ~25,000 genomes, requiring 33 GB of disk space.
The build process will then require approximately 450 GB of additional disk space.
After building the standard database, usage of the database requires users to keep only the database.idx, database.kdb, and taxonomy/ files, which together require approximately 200 GB of disk space. When running a sample against this database, users will need 175 GB of RAM.
If you do not have these computational resources, or do not require classification against the full RefSeq database of ~25,000 genomes, we recommend building a custom database containing only the genomes needed for your application.
To create the standard Kraken database, you can use the following command:
kraken-build --standard --db $DBNAME
(Replace "$DBNAME" above with your preferred database name/location.)
This will download NCBI taxonomic information, as well as the complete genomes in RefSeq for the bacterial, archaeal, and viral domains. After downloading all this data, the build process begins; this is the most time-consuming step. If you have multiple processing cores, you can run this process with multiple threads, e.g.:
kraken-build --standard --threads 24 --db $DBNAME
Using 24 threads on a computer with 244 GB of RAM, the build process took approximately 5 hours in October 2017 (steps marked with an asterisk below have some multi-threading enabled). Please note that the time required for building the database depends on the number of genomic sequences:
24m50s *Step 1 (create kmer set)
n/a Step 2 (reduce database, optional and skipped)
2h34m53s *Step 3 (sort set)
n/a Step 4 (GI number to sequence ID map)
0.17s Step 5 (Sequence ID to taxon map)
2h7m28s *Step 6 (set LCA values)
--------
5h7m11s Total build time
Note that if any step (including the initial downloads) fails, the build process will abort. However, kraken-build will produce checkpoints throughout the installation process, and will restart the build at the last incomplete step if you attempt to run the same command again on a partially-built database.
After building the database, to remove any unnecessary files (including the library files no longer needed), run the following:
kraken-build --db $DBNAME --clean
To create a custom database, or to use a database from another source, see Custom Databases.
Notes for users with lower amounts of RAM:
If you encounter problems with Jellyfish not being able to allocate enough memory on your system to run the build process, you can supply a smaller hash size to Jellyfish using kraken-build's --jellyfish-hash-size switch. Each space in the hash table uses approximately 6.9 bytes, so using "--jellyfish-hash-size 6400M" will use a hash table size of 6.4 billion spaces and require 44.3 GB of RAM.
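Concretely, a standard build with a reduced hash size might be invoked as follows (the thread count is arbitrary; the hash size is the example value discussed above):
kraken-build --standard --threads 16 --db $DBNAME --jellyfish-hash-size 6400M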
Kraken's build process will normally attempt to minimize disk writing by allocating large blocks of RAM and operating within them until data needs to be written to disk. However, this extra RAM usage may exceed your capacity. In such cases, you may want to use kraken-build's --work-on-disk switch. This will minimize the amount of RAM usage and cause Kraken's build programs to perform most operations off of disk files. This switch can also be useful for people building on a ramdisk or solid state drive. Please note that working off of disk files can be quite slow on some computers, causing builds to take several days if not weeks.
We realize the standard database may not suit everyone's needs. Kraken also allows creation of customized databases.
To build a custom database:
Install a taxonomy. Usually, you will just use the NCBI taxonomy, which you can easily download using:
kraken-build --download-taxonomy --db $DBNAME
This will download the sequence ID to taxon map, as well as the taxonomic name and tree information from NCBI. These files can be found in $DBNAME/taxonomy/ . If you need to modify the taxonomy, edits can be made to the names.dmp and nodes.dmp files in this directory; the gi_taxid_nucl.dmp file will also need to be updated appropriately.
Install a genomic library. Four sets of standard genomes are made easily available through kraken-build:
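The list of library names was stripped from this copy of the manual. As an illustration only (the library name shown is an assumption based on the standard-database description above), a set of genomes can be installed with the --download-library switch:
kraken-build --download-library bacteria --db $DBNAME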
(You may also find the -P option to xargs useful to add many files in parallel if you have multiple processors.)
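Additional FASTA files can be added with kraken-build's --add-to-library switch (mentioned in the changelog above). A sketch of the kind of find/xargs pipeline the parenthetical refers to, with a placeholder directory and file pattern:
find my_genomes/ -name '*.fa' -print0 | xargs -0 -I{} -P4 kraken-build --add-to-library {} --db $DBNAME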
Replicons not downloaded from NCBI may need their taxonomy information assigned explicitly. This can be done using the string kraken:taxid|XXX in the sequence ID, with XXX replaced by the desired taxon ID. For example, to put a known adapter sequence in taxon 32630 ("synthetic construct"), you could use the following:
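The example record was stripped from this copy of the manual; a sketch of such a FASTA header, with an arbitrary sequence name and the taxon ID from the paragraph above, might look like:
>adapter_1|kraken:taxid|32630  known adapter sequence
(adapter sequence data here)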
The kraken:taxid string must begin the sequence ID or be immediately preceded by a pipe character (|).
Explicit assignment of taxonomy IDs in this manner will override the sequence ID mapping provided by NCBI.
Once your library is finalized, you need to build the database. Depending on your size requirements, you may want to adjust the k-mer and/or minimizer lengths from the defaults. Except for some small bookkeeping fields, a Kraken database will use sD + 8(4^M) bytes, where s is the number of bytes used to store the k-mer/taxon pair (usually 12, but lower for smaller k-mers), D is the number of distinct k-mers in your library and M is the length (in bp) of the minimizers. Although D does increase as k increases, it is impossible to know exactly how many distinct k-mers will exist in a library for a given k without actually performing the count. By default, k = 31 and M = 15.
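As a rough sanity check using figures quoted elsewhere in this manual (s = 12, D ≈ 1.4e10 distinct k-mers, M = 15):
12 * 1.4e10 bytes  ≈ 168 GB   (k-mer/taxon pairs)
8 * 4^15 bytes     ≈ 8.6 GB   (minimizer offset index)
total              ≈ 177 GB
which is in line with the ~174 GB default database size quoted above.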
The minimizers serve to keep k-mers that are adjacent in query sequences close to each other in the database, which allows Kraken to exploit the CPU cache. Changing the value of M can significantly affect the speed of Kraken, and neither increasing nor decreasing M will guarantee faster or slower speed.
To build the database, you'll use the --build switch:
kraken-build --build --db $DBNAME
As noted above, you may want to also use any of --threads, --kmer-len, or --minimizer-len to adjust the database build time and/or final size.
Shrinking the database: The "--shrink" task allows you to take an existing Kraken database and create a smaller MiniKraken database from it. The use of this option removes all but a specified number of k-mer/taxon pairs to create a new, smaller database. For example:
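The example command was stripped from this copy of the manual; based on the surrounding text it presumably resembled the following (the --new-db switch naming the output database is an assumption):
kraken-build --shrink 10000 --db $DBNAME --new-db minikraken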
This will create a new database named minikraken that contains 10000 k-mers selected from across the original database ($DBNAME).
The --shrink task is only meant to be run on a completed database. However, if you know before you create a database that you will only be able to use a certain amount of memory, you can use the --max-db-size switch for the --build task to provide a maximum size (in GB) for the database. This allows you to create a MiniKraken database without having to create a full Kraken database first.
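For example, to cap a new database at roughly the size of the 4 GB MiniKraken discussed earlier (the value is illustrative):
kraken-build --build --db $DBNAME --max-db-size 4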
A full list of options for kraken-build can be obtained using kraken-build --help.
After building a database, if you want to reduce the disk usage of the database you can use kraken-build's --clean switch to remove all intermediate files from the database directory.
To classify a set of sequences (reads), use the kraken command:
kraken --db $DBNAME seqs.fa
Output will be sent to standard output by default. The files containing the sequences to be classified should be specified on the command line. Sequences can also be provided through standard input using the special filename /dev/fd/0.
Note that to obtain optimum speeds, Kraken's database should be loaded into RAM first. This can be done through use of a ramdisk, if you have superuser permissions. Failing that, you can use the --preload switch to kraken, e.g.:
kraken --preload --db $DBNAME seqs.fa
The database files will be loaded before classification using this switch. See Memory Usage and Efficiency for more information.
The kraken program allows several different options; a combined example follows this list:
Multithreading: Use the --threads NUM switch to use multiple threads.
Quick operation: Rather than searching all k-mers in a sequence, stop classification after the first database hit; use --quick to enable this mode. Note that --min-hits will allow you to require multiple hits before declaring a sequence classified, which can be especially useful with custom databases when testing to see if sequences either do or do not belong to a particular genome.
Sequence filtering: Classified or unclassified sequences can be sent to a file for later processing, using the --classified-out and --unclassified-out switches, respectively.
Output redirection: Output can be directed using standard shell redirection (| or >), or using the --output switch.
FASTQ input: Input is normally expected to be in FASTA format, but you can classify FASTQ data using the --fastq-input switch.
Compressed input: Kraken can handle gzip and bzip2 compressed files as input by specifying the proper switch of --gzip-compressed or --bzip2-compressed.
Input format auto-detection: If regular files are specified on the command line as input, Kraken will attempt to determine the format of your input prior to classification. You can disable this by explicitly specifying --fasta-input, --fastq-input, --gzip-compressed, and/or --bzip2-compressed as appropriate. Note that use of the character device file /dev/fd/0 to read from standard input (aka stdin) will not allow auto-detection.
Paired reads: Kraken does not query k-mers containing ambiguous nucleotides (non-ACGT). If you have paired reads, you can use this fact to your advantage and increase Kraken's accuracy by concatenating the pairs together with a single N between the sequences. Using the --paired option when running kraken will automatically do this for you; simply specify the two mate pair files on the command line. We have found this to raise sensitivity by about 3 percentage points over classifying the sequences as single-end reads. For more information about paired reads input/output, see Paired Reads.
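As an illustration, several of the switches above could be combined in a single command (file names are placeholders; the flags are the ones described in this list):
kraken --db $DBNAME --threads 16 --paired --fastq-input --gzip-compressed \
       --classified-out classified.fa --unclassified-out unclassified.fa \
       --output results.kraken reads_1.fq.gz reads_2.fq.gz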
Each sequence classified by Kraken results in a single line of output. Output lines contain five tab-delimited fields; from left to right, they are:
"C"/"U": one letter code indicating that the sequence was either classified or unclassified.
The sequence ID, obtained from the FASTA/FASTQ header.
The taxonomy ID Kraken used to label the sequence; this is 0 if the sequence is unclassified.
The length of the sequence in bp.
A space-delimited list indicating the LCA mapping of each k-mer in the sequence. For example, "562:13 561:4 A:31 0:1 562:3" would indicate that:
the first 13 k-mers mapped to taxonomy ID #562
the next 4 k-mers mapped to taxonomy ID #561
the next 31 k-mers contained an ambiguous nucleotide
the next k-mer was not in the database
the last 3 k-mers mapped to taxonomy ID #562
For users who want the full taxonomic name associated with each input sequence, we provide a script named kraken-translate that produces two different output formats for classified sequences. The script operates on the output of kraken, like so:
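The example invocation was stripped from this copy of the manual; it presumably looked something like the following (the intermediate file name sequences.kraken is illustrative; sequences.fa and sequences.labels are the names used in the next paragraph):
kraken --db $DBNAME sequences.fa > sequences.kraken
kraken-translate --db $DBNAME sequences.kraken > sequences.labels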
(The same database used to run kraken should be used to translate the output; see Kraken Environment Variables below for ways to reduce redundancy on the command line.)
The file sequences.labels generated by the above example is a text file with two tab-delimited columns, and one line for each classified sequence in sequences.fa; unclassified sequences are not reported by kraken-translate. The first column of kraken-translate's output contains the sequence IDs of the classified sequences, and the second column contains the taxonomy of the sequence. For example, an output line from kraken of:
C SEQ1 562 36 562:6
Would result in a corresponding output line from kraken-translate of:
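The translated line was stripped from this copy of the manual. Taxonomy ID 562 is Escherichia coli, so the output would be the sequence ID followed (tab-delimited) by the full E. coli lineage; depending on the NCBI taxonomy version it would look roughly like:
SEQ1	root;cellular organisms;Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacterales;Enterobacteriaceae;Escherichia;Escherichia coli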
Alternatively, kraken-translate accepts the option --mpa-format which will report only levels of the taxonomy with standard rank assignments (superkingdom, kingdom, phylum, class, order, family, genus, species), and uses pipes to delimit the various levels of the taxonomy. For example, kraken-translate --mpa-format --db $DBNAME with the above example output from kraken would result in the following line of output:
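The --mpa-format example line was also stripped; it would report only the standard ranks, pipe-delimited, roughly as follows (the rank prefixes and order name shown are assumptions that depend on the taxonomy version):
d__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacterales|f__Enterobacteriaceae|g__Escherichia|s__Escherichia_coli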
Kraken will classify paired reads when the user specifies the --paired option by first concatenating the reads using | before classifying the combined reads against the Kraken database.
A number of other options are included in Kraken v1.0 that simplify analysis of paired reads. The following describes these options and lists the possible combinations of these options and their behavior when applied; a combined example follows the list. Note that all of these options require that the --paired option is specified and that two input FASTA/FASTQ files are provided.
--out-fmt legacy: [default] uses N as the sequence delimiter if classified/unclassified reads are printed using the --classified-out or --unclassified-out tags. --out-fmt legacy does not currently support FASTQ output.
--out-fmt legacy --classified-out C_reads.fa: prints classified paired reads with N concatenating the two paired reads.
--out-fmt paired: separates paired sequences into two separate FASTA files when using --classified-out or --unclassified-out tags.
--out-fmt paired --fastq-output: separates paired sequences into two separate FASTQ files when using --classified-out or --unclassified-out tags. FASTQ headers will include everything up to the second whitespace character in the original FASTQ header.
--out-fmt paired --classified-out C_reads: prints classified paired reads to FASTA files C_reads_R1.fa and C_reads_R2.fa
--out-fmt interleaved: prints paired sequences to a single FASTA file without concatenating the paired reads; paired reads are instead printed one after another.
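As a sketch of how these switches combine (file names are placeholders, and the unclassified output files are assumed to be named analogously to the classified ones), the following run would write classified mates to C_reads_R1.fa and C_reads_R2.fa:
kraken --db $DBNAME --paired --out-fmt paired --classified-out C_reads --unclassified-out U_reads reads_1.fa reads_2.fa > reads.kraken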
Kraken's execution requires many random accesses to a very large file. To obtain maximal speed, these accesses need to be made as quickly as possible. This means that the database must be in physical memory during execution. Although we provide the --preload option to Kraken for users who cannot use a ramdisk, the ramdisk is likely the simplest option, and is well-suited for installations on computers where Kraken is to be run a majority of the time. In addition, using a ramdisk allows the initial start-up of Kraken to be accomplished much more quickly. If a ramdisk is used, the --preload switch should not be used.
We also note that in some cases, --preload may not be needed (or even advisable). If you know that your database is already in memory (for example, if it has been recently read or unzipped, then it should be in your operating system cache, which resides in physical memory), then there is no need to perform this step. We have noticed that in low-memory (~8 GB) situations, preloading a MiniKraken DB is actually much slower than simply using cat minikraken/database.* > /dev/null. The selection of the best way to get the database into memory is dependent on several factors, including your total amount of RAM, operating system, and current free memory. For this reason, you may need to experiment with your own setup to find a good solution for you.
To create a ramdisk, you will need to have superuser (root) permission. As root, you can use the following commands to create a ramdisk:
mkdir /ramdisk
mount -t ramfs none /ramdisk
Optionally, you may have a trusted user who you want to be able to copy databases into this directory. In that case, you'll need to make that user the owner of the directory via chown.
To put the database on the ramdisk, simply copy the database directory to the ramdisk directory:
cp -a $DBNAME /ramdisk
And then you can use it with Kraken by specifying the database copy on the ramdisk, e.g.:
kraken --db /ramdisk/$DBNAME seqs.fa
Note that anything copied into a ramdisk will be deleted if the ramdisk is unmounted or the computer is restarted, so make sure that you have a copy of the database on a hard disk (or other non-volatile storage).
Note that when using the --paired option, Kraken will not (by default) make any attempt to ensure that the two files you specify are indeed matching sets of paired-end reads. To verify that the names of each read do indeed match, you can use the --check-names option in combination with the --paired option.
To get an idea as to Kraken's results across an entire sample, we provide the kraken-report script. It is used like this:
kraken-report --db $DBNAME kraken.output
Note that the database used must be the same as the one used to generate the output file, or the report script may encounter problems. Output is sent to standard output.
The output of kraken-report is tab-delimited, with one line per taxon. The fields of the output, from left-to-right, are as follows:
Percentage of reads covered by the clade rooted at this taxon
Number of reads covered by the clade rooted at this taxon
Number of reads assigned directly to this taxon
A rank code, indicating (U)nclassified, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies. All other ranks are simply '-'.
NCBI taxonomy ID
indented scientific name
The scientific names are indented using spaces, according to the tree structure specified by the taxonomy.
By default, taxa with no reads assigned to (or under) them will not have any output produced. However, if you wish to have all taxa displayed, you can use the --show-zeros switch to do so. This can be useful if you are looking to do further downstream analysis of the reports, and want to compare samples. Sorting by the taxonomy ID (using sort -nk5) can provide a consistent line ordering between reports.
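For example, to produce reports for two samples with a consistent set of lines and ordering (sample names are placeholders):
kraken-report --db $DBNAME --show-zeros sample1.kraken | sort -nk5 > sample1.kreport
kraken-report --db $DBNAME --show-zeros sample2.kraken | sort -nk5 > sample2.kreport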
In addition, we also provide the program kraken-mpa-report; this program provides output in a format similar to MetaPhlAn's tab-delimited output. For kraken-mpa-report, multiple Kraken output files can be specified on the command line and each will be treated as a separate sample. For each taxon at the standard ranks (from domain to species), the count of reads in each sample assigned to any node in the clade rooted at that taxon is displayed. kraken-mpa-report is run in the same manner as kraken-report, and its output is also sent to standard output.
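For example, to summarize three samples (file names here are placeholders) into a single MetaPhlAn-style table:
kraken-mpa-report --db $DBNAME sample1.kraken sample2.kraken sample3.kraken > samples.mpa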
At present, we have not yet developed a confidence score with a solid probabilistic interpretation for Kraken. However, we have developed a simple scoring scheme that has yielded good results for us, and we've made that available in the kraken-filter script. The approach we use allows a user to specify a threshold score in the [0,1] interval; the kraken-filter script then will adjust labels up the tree until the label's score (described below) meets or exceeds that threshold. If a label at the root of the taxonomic tree would not have a score exceeding the threshold, the sequence is called unclassified by kraken-filter.
A sequence label's score is a fraction C/Q, where C is the number of k-mers mapped to LCA values in the clade rooted at the label, and Q is the number of k-mers in the sequence that lack an ambiguous nucleotide (i.e., they were queried against the database). Consider the example of the LCA mappings in Kraken's output given earlier:
"562:13 561:4 A:31 0:1 562:3" would indicate that:
the first 13 k-mers mapped to taxonomy ID #562
the next 4 k-mers mapped to taxonomy ID #561
the next 31 k-mers contained an ambiguous nucleotide
the next k-mer was not in the database
the last 3 k-mers mapped to taxonomy ID #562
In this case, ID #561 is the parent node of #562. Here, a label of #562 for this sequence would have a score of C/Q = (13+3)/(13+4+1+3) = 16/21. A label of #561 would have a score of C/Q = (13+4+3)/(13+4+1+3) = 20/21. If a user specified a threshold over 16/21, kraken-filter would adjust the original label from #562 to #561; if the threshold was greater than 20/21, the sequence would become unclassified.
kraken-filter is used like this:
kraken-filter --db $DBNAME [--threshold NUM] kraken.output
If not specified, the threshold will be 0. kraken-filter's output is similar to kraken's, but a new field between the length and LCA mapping list is present, indicating the new label's score (or the root label's score if the sequence has become unclassified).
To give some guidance toward selecting an appropriate threshold, we show here the results of different thresholds on the MiSeq metagenome from the Kraken paper (see the paper for more details; note that the database used here is more recent than that used in the paper). Precision, sensitivity, and F-score are measured at the genus rank:
Thres   Prec    Sens    F-score
0       95.43   77.32   85.43
0.05    97.28   76.31   85.53
0.10    98.25   75.13   85.15
0.15    98.81   73.87   84.54
0.20    99.13   72.82   83.96
0.25    99.38   71.74   83.33
0.30    99.55   70.75   82.71
0.35    99.61   69.53   81.90
0.40    99.66   68.35   81.09
0.45    99.70   66.93   80.09
0.50    99.71   65.49   79.06
As can be seen, with no threshold (i.e., Kraken's original labels), Kraken's precision is fairly high, but it does increase with the threshold. Diminishing returns apply, however, and there is a loss in sensitivity that must be taken into account when deciding on the threshold to use for your own project.
The Kraken programs (with the exception of kraken-build) support the use of some environment variables to help in reducing command line lengths:
KRAKEN_NUM_THREADS: this variable is only used by kraken; if the --threads option is not supplied to kraken, then the value of this variable (if it is set) will be used as the number of threads to run kraken.
KRAKEN_DB_PATH: much like the PATH variable is used for executables by your shell, KRAKEN_DB_PATH is a colon-separated list of directories that will be searched for the database you name if the named database does not have a slash (/) character. By default, Kraken assumes the value of this variable is "." (i.e., the current working directory). This variable can be used to create one (or more) central repositories of Kraken databases in a multi-user system. Example usage in bash:
export KRAKEN_DB_PATH="/home/user/my_kraken_dbs:/data/kraken_dbs:"
This will cause three directories to be searched, in this order:
/home/user/my_kraken_dbs
/data/kraken_dbs
the current working directory (caused by the empty string as the third colon-separated field in the KRAKEN_DB_PATH string)
The search for a database will stop when a name match is found; if two directories in the KRAKEN_DB_PATH have databases with the same name, the directory of the two that is searched first will have its database selected.
If the above variable and value are used, and the databases /data/kraken_dbs/mainDB and ./mainDB are present, then
kraken --db mainDB sequences.fa
will classify sequences.fa using /data/kraken_dbs/mainDB; if instead you wanted to use the mainDB present in the current directory, you would need to specify a directory path to that database in order to circumvent searching, e.g.:
kraken --db ./mainDB sequences.fa
Note that the KRAKEN_DB_PATH directory list can be skipped by the use of any absolute (beginning with /) or relative pathname (including at least one /) as the database name.
KRAKEN_DEFAULT_DB: if no database is supplied with the --db option, the database named in this variable will be used instead. Using this variable, you can avoid using --db if you only have a single database that you usually use, e.g. in bash:
export KRAKEN_DEFAULT_DB="/home/user/krakendb"
kraken sequences.fa | kraken-report > sequences.kreport
This will classify sequences.fa using the /home/user/krakendb directory.
Note that the value of KRAKEN_DEFAULT_DB will also be interpreted in the context of the value of KRAKEN_DB_PATH if you don't set KRAKEN_DEFAULT_DB to an absolute or relative pathname. Given the earlier example in this section, the following:
export KRAKEN_DEFAULT_DB="mainDB"
kraken sequences.fa
will use /data/kraken_dbs/mainDB to classify sequences.fa.
The minimizer ordering in Kraken versions prior to v0.10.0-beta was a simple lexicographical ordering that provided a suboptimal distribution of k-mers within the bins. Ideally, the bin sizes would be uniform, but simple lexicographical ordering creates a bias toward low-complexity minimizers. To resolve this, the ordering is now "scrambled" by XORing all minimizers with a predefined constant to toggle half of each minimizer's bits before sorting. The more evenly distributed bins provide better caching performance, but databases created in this way are not compatible with earlier versions of Kraken. Kraken versions from v0.10.0-beta up to (but not including) v1.0 will support the use of the older databases, but we nonetheless recommend one of the two following options:
Build a new database. This is the preferred option, as a newly-created database will have the latest genomes and NCBI taxonomy information.
Re-sort an existing database. If you have a custom database, you may want to simply reformat the database to provide you with Kraken's increased speed. To do so, you'll need to do the following:
kraken-build --upgrade --db $DBNAME
(Note: the --threads switch is both valid and encouraged with this operation.)
This command will not delete your existing $DBNAME/database.* files, but will simply rename them. If you're satisfied with the new database's performance, then you can use kraken-build's --clean option to remove the old files and save space.
Sorting the database is step 3 of the build process, so you should expect a database upgrade to take about as long as step 3 took when building the original database.
Note that the rest of Kraken v0.10.0-beta's speed improvements are available without upgrading or changing your database.
Upgrading to Kraken version 1.0 does not require rebuilding any existing Kraken databases. The main updates for this version are within the build process itself. Due to the phasing out of NCBI GI numbers, Kraken version 1.0 no longer relies on GI numbers and instead uses the sequence ID to taxon ID maps provided with the NCBI taxonomy. The new version of Kraken uses these maps when building the database, but the format of the final database files has not changed. Other changes include the use of rsync to download RefSeq genomes and updated build runtimes.
kraken-1.1/docs/MANUAL.markdown 0000664 0000000 0000000 00000102305 13175666030 0016307 0 ustar 00root root 0000000 0000000 Introduction
============
[Kraken] is a taxonomic sequence classifier that assigns taxonomic
labels to short DNA reads. It does this by examining the $k$-mers
within a read and querying a database with those $k$-mers. This database
contains a mapping of every $k$-mer in [Kraken]'s genomic library to the
lowest common ancestor (LCA) in a taxonomic tree of all genomes that
contain that $k$-mer. The set of LCA taxa that correspond to the $k$-mers
in a read are then analyzed to create a single taxonomic label for the
read; this label can be any of the nodes in the taxonomic tree.
[Kraken] is designed to be rapid, sensitive, and highly precise. Our
tests on various real and simulated data have shown [Kraken] to have
sensitivity slightly lower than Megablast with precision being slightly
higher. On a set of simulated 100 bp reads, [Kraken] processed over 1.3
million reads per minute on a single core in normal operation, and over
4.1 million reads per minute in quick operation.
The latest released version of Kraken will be available at the
[Kraken website], and the latest updates to the Kraken source code
are available at the [Kraken GitHub repository].
If you use [Kraken] in your research, please cite the [Kraken paper].
Thank you!
[Kraken]: http://ccb.jhu.edu/software/kraken/
[Kraken website]: http://ccb.jhu.edu/software/kraken/
[Kraken paper]: http://genomebiology.com/2014/15/3/R46
[Kraken GitHub repository]: https://github.com/DerrickWood/kraken
System Requirements
===================
Note: Users concerned about the disk or memory requirements should
read the paragraph about MiniKraken, below.
* **Disk space**: Construction of Kraken's standard database will require at
least 500 GB of disk space. Customized databases may require
more or less space. Disk space used is linearly proportional
to the number of distinct $k$-mers; as of Oct. 2017, Kraken's
default database contains just over 14.4 billion (1.44e10)
distinct $k$-mers.
In addition, the disk used to store the database should be
locally-attached storage. Storing the database on a network
filesystem (NFS) partition can cause Kraken's operation to be
very slow, or to be stopped completely. As NFS accesses are
much slower than local disk accesses, both preloading and database
building will be slowed by use of NFS.
* **Memory**: To run efficiently, Kraken requires enough free memory to
hold the database in RAM. While this can be accomplished using a
ramdisk, Kraken supplies a utility for loading the database into
RAM via the OS cache. The default database size is 170 GB (as of
Oct. 2017), and so you will need at least that much RAM if you want
to build or run with the default database.
* **Dependencies**: Kraken currently makes extensive use of Linux utilities
such as sed, find, and wget. Many scripts are written using the
Bash shell, and the main scripts are written using Perl. Core
programs needed to build the database and run the classifier are
written in C++, and need to be compiled using g++. Multithreading
is handled using OpenMP. Downloads of NCBI data are performed by
wget and in some cases, by rsync. Most Linux systems that have any
sort of development package installed will have all of the above
listed programs and libraries available.
Finally, if you want to build your own database, you will need to
install the [Jellyfish] $k$-mer counter. Note that Kraken only
supports use of Jellyfish version 1. Jellyfish version 2 is not
yet compatible with Kraken.
* **Network connectivity**: Kraken's standard database build and download
commands expect unfettered FTP and rsync access to the NCBI FTP
server. If you're working behind a proxy, you may need to set
certain environment variables (such as `ftp_proxy` or `RSYNC_PROXY`)
in order to get these commands to work properly.
* **MiniKraken**: To allow users with low-memory computing environments to
use Kraken, we supply a reduced standard database that can be
downloaded from the Kraken web site. When Kraken is run with a
reduced database, we call it MiniKraken.
The database we make available is only 4 GB in size, and should
run well on computers with as little as 8 GB of RAM. Disk space
required for this database is also only 4 GB.
[Jellyfish]: http://www.cbcb.umd.edu/software/jellyfish/
Installation
============
To begin using Kraken, you will first need to install it, and then
either download or create a database.
Kraken consists of two main scripts ("`kraken`" and "`kraken-build`"),
along with several programs and smaller scripts. As part of the
installation process, all scripts and programs are installed in
the same directory. After installation, you can move the main
scripts elsewhere, but moving the other scripts and programs
requires editing the scripts and changing the "`$KRAKEN_DIR`" variables.
Once a directory is selected, you need to run the following
command in the directory where you extracted the Kraken
source:
./install_kraken.sh $KRAKEN_DIR
(Replace "`$KRAKEN_DIR`" above with the directory where you want to
install Kraken's programs/directories.)
The `install_kraken.sh` script should compile all of Kraken's code
and setup your Kraken data directory. Installation is successful
if you see the message "`Kraken installation complete.`"
Once installation is complete, you may want to copy the two main
Kraken scripts into a directory found in your `PATH` variable
(e.g., "`$HOME/bin`"):
cp $KRAKEN_DIR/bin/kraken $HOME/bin
cp $KRAKEN_DIR/bin/kraken-build $HOME/bin
After installation, you're ready to either create or download a
database.
Kraken Databases
================
A Kraken database is a directory containing at least 4 files:
* `database.kdb`: Contains the $k$-mer to taxon mappings
* `database.idx`: Contains minimizer offset locations in database.kdb
* `taxonomy/nodes.dmp`: Taxonomy tree structure + ranks
* `taxonomy/names.dmp`: Taxonomy names
Other files may be present as part of the database build process.
In interacting with Kraken, you should not have to directly reference
any of these files, but rather simply provide the name of the directory
in which they are stored. Kraken allows both the use of a standard
database as well as custom databases; these are described in the sections
[Standard Kraken Database] and [Custom Databases] below, respectively.
Standard Kraken Database
========================
To create the standard Kraken database, you can use the following command:
kraken-build --standard --db $DBNAME
(Replace "`$DBNAME`" above with your preferred database name/location.
Please note that the database will use approximately 500 GB of
disk space during creation.)
This will download NCBI taxonomic information, as well as the
complete genomes in RefSeq for the bacterial, archaeal, and
viral domains. After downloading all this data, the build
process begins; this is the most time-consuming step. If you
have multiple processing cores, you can run this process with
multiple threads, e.g.:
kraken-build --standard --threads 24 --db $DBNAME
Using 24 threads on a computer (an AWS r4.8xlarge instance)
with 244 GB of RAM, the build process took approximately 5 hours
(steps with an asterisk have some multi-threading enabled) in
October 2017:
24m50s *Step 1 (create set)
n/a Step 2 (reduce database, optional and skipped)
154m53s *Step 3 (sort set)
n/a Step 4 (GI number to sequence ID map - now obsolete)
<1s Step 5 (Sequence ID to taxon map)
127m28s *Step 6 (set LCA values)
-------
5h7m11s Total build time
This process used the automatically estimated jellyfish hash size
of 20170976000.
Note that if any step (including the initial downloads) fails,
the build process will abort. However, `kraken-build` will
produce checkpoints throughout the installation process, and
will restart the build at the last incomplete step if you
attempt to run the same command again on a partially-built
database.
To create a custom database, or to use a database from another
source, see [Custom Databases].
Notes for users with lower amounts of RAM (an example combining these switches follows the notes):
1) If you encounter problems with Jellyfish not being able
to allocate enough memory on your system to run the build
process, you can supply a smaller hash size to Jellyfish
using `kraken-build`'s `--jellyfish-hash-size` switch. Each space
in the hash table uses approximately 6.9 bytes, so using
"`--jellyfish-hash-size 6400M`" will use a hash table size of
6.4 billion spaces and require 44.3 GB of RAM.
2) Kraken's build process will normally attempt to minimize
disk writing by allocating large blocks of RAM and operating
within them until data needs to be written to disk. However,
this extra RAM usage may exceed your capacity. In such cases,
you may want to use `kraken-build`'s `--work-on-disk` switch. This
will minimize the amount of RAM usage and cause Kraken's build
programs to perform most operations off of disk files. This
switch can also be useful for people building on a ramdisk or
solid state drive. Please note that working off of disk files
can be quite slow on some computers, causing builds to take
several days if not weeks.
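For example, a memory-constrained build might combine the two switches described in the notes above; the thread count and hash size below are purely illustrative:

    kraken-build --standard --threads 4 --work-on-disk \
        --jellyfish-hash-size 6400M --db $DBNAME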
Classification
==============
To classify a set of sequences (reads), use the `kraken` command:
kraken --db $DBNAME seqs.fa
Output will be sent to standard output by default. The files
containing the sequences to be classified should be specified
on the command line. Sequences can also be provided through
standard input using the special filename `/dev/fd/0`.
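For example, to stream reads from another program into Kraken via standard input (as noted below, format auto-detection is not available on stdin, so the format switch is given explicitly; file names are placeholders):

    gzip -dc reads.fq.gz | kraken --db $DBNAME --fastq-input /dev/fd/0 > reads.kraken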
Note that to obtain optimum speeds, Kraken's database should be
loaded into RAM first. This can be done through use of a ramdisk,
if you have superuser permissions. Failing that, you can use
the `--preload` switch to `kraken`, e.g.:
kraken --preload --db $DBNAME seqs.fa
The database files will be loaded before classification using this
switch. See [Memory Usage and Efficiency] for more information.
The `kraken` program allows several different options:
* **Multithreading**: Use the `--threads NUM` switch to use multiple
threads.
* **Quick operation**: Rather than searching all $k$-mers in a sequence,
stop classification after the first database hit; use `--quick`
to enable this mode. Note that `--min-hits` will allow you to
require multiple hits before declaring a sequence classified,
which can be especially useful with custom databases when testing
to see if sequences either do or do not belong to a particular
genome.
* **Sequence filtering**: Classified or unclassified sequences can be
sent to a file for later processing, using the `--classified-out`
and `--unclassified-out` switches, respectively.
* **Output redirection**: Output can be directed using standard shell
redirection (`|` or `>`), or using the `--output` switch.
* **FASTQ input**: Input is normally expected to be in FASTA format, but
you can classify FASTQ data using the `--fastq-input` switch.
* **Compressed input**: Kraken can handle gzip and bzip2 compressed
files as input by specifying the proper switch of `--gzip-compressed`
or `--bzip2-compressed`.
* **Input format auto-detection**: If regular files are specified on
the command line as input, Kraken will attempt to determine the
format of your input prior to classification. You can disable this
by explicitly specifying `--fasta-input`, `--fastq-input`,
`--gzip-compressed`, and/or `--bzip2-compressed` as appropriate.
Note that use of the character device file `/dev/fd/0` to read
from standard input (aka `stdin`) will **not** allow auto-detection.
* **Paired reads**: Kraken does not query $k$-mers containing ambiguous
nucleotides (non-ACGT). If you have paired reads, you can use this
fact to your advantage and increase Kraken's accuracy by concatenating
the pairs together with a single `N` between the sequences. Using the
`--paired` option when running `kraken` will automatically do this for
you; simply specify the two mate pair files on the command line. We
have found this to raise sensitivity by about 3 percentage points over
classifying the sequences as single-end reads.
To get a full list of options, use `kraken --help`.
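For example, a multithreaded quick-mode run over gzipped FASTQ reads, requiring at least two hits and writing results to a file, might look like the following (all values and file names are merely illustrative):

    kraken --db $DBNAME --threads 8 --quick --min-hits 2 \
        --fastq-input --gzip-compressed --output seqs.kraken seqs.fq.gz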
Output Format
=============
Each sequence classified by Kraken results in a single line of
output. Output lines contain five tab-delimited fields; from
left to right, they are:
1) "C"/"U": one letter code indicating that the sequence was
either classified or unclassified.
2) The sequence ID, obtained from the FASTA/FASTQ header.
3) The taxonomy ID Kraken used to label the sequence; this is
0 if the sequence is unclassified.
4) The length of the sequence in bp.
5) A space-delimited list indicating the LCA mapping of each $k$-mer
in the sequence. For example, "562:13 561:4 A:31 0:1 562:3"
would indicate that:
- the first 13 $k$-mers mapped to taxonomy ID #562
- the next 4 $k$-mers mapped to taxonomy ID #561
- the next 31 $k$-mers contained an ambiguous nucleotide
- the next $k$-mer was not in the database
- the last 3 $k$-mers mapped to taxonomy ID #562
For users who want the full taxonomic name associated with each input
sequence, we provide a script named `kraken-translate` that produces two
different output formats for classified sequences. The script operates
on the output of `kraken`, like so:
kraken --db $DBNAME sequences.fa > sequences.kraken
kraken-translate --db $DBNAME sequences.kraken > sequences.labels
(The same database used to run `kraken` should be used to translate the
output; see [Kraken Environment Variables] below for ways to reduce
redundancy on the command line.)
The file `sequences.labels` generated by the above example is a text file
with two tab-delimited columns, and one line for each classified sequence
in `sequences.fa`; unclassified sequences are not reported by
`kraken-translate`. The first column of `kraken-translate`'s output are the
sequence IDs of the classified sequences, and the second column contains
the taxonomy of the sequence. For example, an output line from `kraken` of:
C SEQ1 562 36 562:6
Would result in a corresponding output line from `kraken-translate` of:
SEQ1 root;cellular organisms;Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Escherichia;Escherichia coli
Alternatively, `kraken-translate` accepts the option `--mpa-format` which
will report only levels of the taxonomy with standard rank assignments
(superkingdom, kingdom, phylum, class, order, family, genus, species),
and uses pipes to delimit the various levels of the taxonomy. For example,
`kraken-translate --mpa-format --db $DBNAME` with the above example output
from `kraken` would result in the following line of output:
SEQ1 d__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Escherichia|s__Escherichia_coli
Taxonomy assignments above the superkingdom (`d__`) rank are represented as
just "root" when using the `--mpa-report` option with `kraken-translate`.
Custom Databases
================
We realize the standard database may not suit everyone's needs. Kraken
also allows creation of customized databases.
To build a custom database:
1) Install a taxonomy. Usually, you will just use the NCBI taxonomy,
which you can easily download using:
kraken-build --download-taxonomy --db $DBNAME
This will download the accession number to taxon map, as well as the
taxonomic name and tree information from NCBI. These files can
be found in `$DBNAME/taxonomy/` . If you need to modify the taxonomy,
edits can be made to the `names.dmp` and `nodes.dmp` files in this directory;
the `gi_taxid_nucl.dmp` file will also need to be updated appropriately.
2) Install a genomic library. Four sets of standard genomes are
made easily available through `kraken-build`:
- archaea: RefSeq complete archaeal genomes
- bacteria: RefSeq complete bacterial genomes
- plasmid: RefSeq plasmid sequences
- viral: RefSeq complete viral genomes
- human: GRCh38 human genome
To download and install any one of these, use the `--download-library`
switch, e.g.:
kraken-build --download-library bacteria --db $DBNAME
Other genomes can also be added, but such genomes must meet certain
requirements:
- Sequences must be in a FASTA file (multi-FASTA is allowed)
- Each sequence's ID (the string between the `>` and the first
whitespace character on the header line) must contain either
an NCBI accession number to allow Kraken to lookup the correct taxa, or an
explicit assignment of the taxonomy ID using `kraken:taxid` (see below).
Replicons not downloaded from NCBI may need their taxonomy information
assigned explicitly. This can be done using the string `kraken:taxid|XXX`
in the sequence ID, with `XXX` replaced by the desired taxon ID. For
example, to put a known adapter sequence in taxon 32630 ("synthetic
construct"), you could use the following:
>sequence16|kraken:taxid|32630 Adapter sequence
CAAGCAGAAGACGGCATACGAGATCTTCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
The `kraken:taxid` string must begin the sequence ID or be immediately
preceded by a pipe character (`|`). Explicit assignment of taxonomy IDs
in this manner will override the accession number mapping provided by NCBI.
If your genomes meet the requirements above, then you can add each
replicon to your database's genomic library using the `--add-to-library`
switch, e.g.:
kraken-build --add-to-library chr1.fa --db $DBNAME
kraken-build --add-to-library chr2.fa --db $DBNAME
Note that if you have a list of files to add, you can do something like
this in `bash`:
for file in chr*.fa
do
kraken-build --add-to-library $file --db $DBNAME
done
Or even add all `*.fa` files found in the directory `genomes`:
find genomes/ -name '*.fa' -print0 | \
xargs -0 -I{} -n1 kraken-build --add-to-library {} --db $DBNAME
(You may also find the `-P` option to `xargs` useful to add many files in
parallel if you have multiple processors.)
3) Once your library is finalized, you need to build the database.
Depending on your size requirements, you may want to adjust the
$k$-mer and/or minimizer lengths from the defaults. Except for some
small bookkeeping fields, a Kraken database will use
$sD$ + $8(4^{M})$
bytes, where $s$ is the number of bytes used to store the $k$-mer/taxon
pair (usually 12, but lower for smaller $k$-mers), $D$ is the number of
distinct $k$-mers in your library and
$M$ is the length (in bp) of the minimizers. Although $D$ does increase
as $k$ increases, it is impossible to know exactly how many distinct
$k$-mers will exist in a library for a given $k$ without actually
performing the count. By default, $k$ = 31 and $M$ = 15.
The minimizers serve to keep $k$-mers that are adjacent in query
sequences close to each other in the database, which allows
Kraken to exploit the CPU cache. Changing the value of $M$ can
significantly affect the speed of Kraken, and neither increasing
nor decreasing $M$ is guaranteed to make Kraken faster or slower.
To build the database, you'll use the `--build` switch:
kraken-build --build --db $DBNAME
As noted above, you may want to also use any of `--threads`,
`--kmer-len`, or `--minimizer-len` to adjust the database build
time and/or final size.
4) Shrinking the database: The "--shrink" task allows you to take
an existing Kraken database and create a smaller MiniKraken database
from it. The use of this option removes all but a specified number of
$k$-mer/taxon pairs to create a new, smaller database. For example:
kraken-build --shrink 10000 --db $DBNAME --new-db minikraken
This will create a new database named `minikraken` that contains
10000 $k$-mers selected from across the original database (`$DBNAME`).
The `--shrink` task is only meant to be run on a completed database.
However, if you know before you create a database that you will
only be able to use a certain amount of memory, you can use the
`--max-db-size` switch for the `--build` task to provide a maximum
size (in GB) for the database. This allows you to create a MiniKraken
database without having to create a full Kraken database first.
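For example, to cap a database at roughly 4 GB while building it (the size here is only illustrative):

    kraken-build --build --max-db-size 4 --db $DBNAME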
A full list of options for `kraken-build` can be obtained using
`kraken-build --help`.
After building a database, if you want to reduce the disk usage of
the database you can use `kraken-build`'s `--clean` switch to remove
all intermediate files from the database directory.
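For example (assuming the same database directory name as above):

    kraken-build --clean --db $DBNAME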
Memory Usage and Efficiency
===========================
Kraken's execution requires many random accesses to a very large file.
To obtain maximal speed, these accesses need to be made as quickly as
possible. This means that the database must be in physical memory
during execution. Although we provide the `--preload` option to Kraken
for users who cannot use a ramdisk, the ramdisk is likely the simplest
option, and is well-suited for installations on computers where Kraken
is to be run a majority of the time. In addition, using a ramdisk
allows the initial start-up of Kraken to be accomplished much more quickly.
If a ramdisk is used, the `--preload` switch should not be used.
We also note that in some cases, `--preload` may not be needed (or even
advisable). If you know that your database is already in memory (for
example, if it has been recently read or unzipped, then it should be in
your operating system cache, which resides in physical memory), then there
is no need to perform this step. We have noticed that in low-memory (~8 GB)
situations, preloading a MiniKraken DB is actually much slower than simply
using `cat minikraken/database.* > /dev/null`. The selection of the best way
to get the database into memory is dependent on several factors, including
your total amount of RAM, operating system, and current free memory. For this
reason, you may need to experiment with your own setup to find a good solution
for you.
To create a ramdisk, you will need to have superuser (root) permission.
As root, you can use the following commands to create a ramdisk:
mkdir /ramdisk
mount -t ramfs none /ramdisk
Optionally, you may have a trusted user who you want to be able to copy
databases into this directory. In that case, you'll need to make that
user the owner of the directory via chown.
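For example, to allow a (hypothetical) trusted user named dbadmin to copy databases into the ramdisk:

    chown dbadmin /ramdisk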
To put the database on the ramdisk, simply copy the database directory
to the ramdisk directory:
cp -a $DBNAME /ramdisk
And then you can use it with Kraken by specifying the database copy on
the ramdisk, e.g.:
kraken --db /ramdisk/$DBNAME seqs.fa
Note that anything copied into a ramdisk will be deleted if the ramdisk
is unmounted or the computer is restarted, so make sure that you have a
copy of the database on a hard disk (or other non-volatile storage).
Note that when using the `--paired` option, Kraken will not (by default)
make any attempt to ensure that the two files you specify are indeed
matching sets of paired-end reads. To verify that the names of each
read do indeed match, you can use the `--check-names` option in
combination with the `--paired` option.
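For example (file names are placeholders):

    kraken --db $DBNAME --paired --check-names reads_1.fq reads_2.fq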
Sample Reports
==============
To get an idea as to Kraken's results across an entire sample, we provide
the `kraken-report` script. It is used like this:
kraken-report --db $DBNAME kraken.output
Note that the database used must be the same as the one used to generate
the output file, or the report script may encounter problems. Output is
sent to standard output.
The output of `kraken-report` is tab-delimited, with one line per taxon.
The fields of the output, from left-to-right, are as follows:
1) Percentage of reads covered by the clade rooted at this taxon
2) Number of reads covered by the clade rooted at this taxon
3) Number of reads assigned directly to this taxon
4) A rank code, indicating (U)nclassified, (D)omain, (K)ingdom,
(P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies.
All other ranks are simply '-'.
5) NCBI taxonomy ID
6) indented scientific name
The scientific names are indented using spaces, according to the tree
structure specified by the taxonomy.
By default, taxa with no reads assigned to (or under) them will not have
any output produced. However, if you wish to have all taxa displayed,
you can use the `--show-zeros` switch to do so. This can be useful if
you are looking to do further downstream analysis of the reports, and
want to compare samples. Sorting by the taxonomy ID (using `sort -nk5`)
can provide a consistent line ordering between reports.
In addition, we also provide the program `kraken-mpa-report`; this program
provides output in a format similar to MetaPhlAn's tab-delimited output.
For `kraken-mpa-report`, multiple Kraken output files can be specified on
the command line and each will be treated as a separate sample. For each
taxon at the standard ranks (from domain to species), the count of reads
in each sample assigned to any node in the clade rooted at that taxon is
displayed. `kraken-mpa-report` is run in the same manner as `kraken-report`,
and its output is also sent to standard output.
Confidence Scoring
==================
At present, we have not yet developed a confidence score with a solid
probabilistic interpretation for Kraken. However, we have developed a
simple scoring scheme that has yielded good results for us, and we've
made that available in the `kraken-filter` script. The approach we use
allows a user to specify a threshold score in the [0,1] interval; the
`kraken-filter` script then will adjust labels up the tree until the
label's score (described below) meets or exceeds that threshold. If
a label at the root of the taxonomic tree would not have a score exceeding
the threshold, the sequence is called unclassified by kraken-filter.
A sequence label's score is a fraction $C$/$Q$, where $C$ is the number of
$k$-mers mapped to LCA values in the clade rooted at the label, and $Q$ is the
number of $k$-mers in the sequence that lack an ambiguous nucleotide (i.e.,
they were queried against the database). Consider the example of the
LCA mappings in Kraken's output given earlier:
"562:13 561:4 A:31 0:1 562:3" would indicate that:
* the first 13 $k$-mers mapped to taxonomy ID #562
* the next 4 $k$-mers mapped to taxonomy ID #561
* the next 31 $k$-mers contained an ambiguous nucleotide
* the next $k$-mer was not in the database
* the last 3 $k$-mers mapped to taxonomy ID #562
In this case, ID #561 is the parent node of #562. Here, a label of #562
for this sequence would have a score of $C$/$Q$ = (13+3)/(13+4+1+3) = 16/21.
A label of #561 would have a score of $C$/$Q$ = (13+4+3)/(13+4+1+3) = 20/21.
If a user specified a threshold over 16/21, kraken-filter would adjust the
original label from #562 to #561; if the threshold was greater than 20/21,
the sequence would become unclassified.
`kraken-filter` is used like this:
kraken-filter --db $DBNAME [--threshold NUM] kraken.output
If not specified, the threshold will be 0. `kraken-filter`'s output is
similar to `kraken`'s, but a new field between the length and LCA mapping
list is present, indicating the new label's score (or the root label's
score if the sequence has become unclassified).
To give some guidance toward selecting an appropriate threshold, we
show here the results of different thresholds on the MiSeq metagenome
from the [Kraken paper] \(see the paper for more details; note that the
database used here is more recent than that used in the paper).
Precision, sensitivity, and F-score are measured at the genus rank:

Thres   Prec    Sens    F-score
------  ------  ------  --------
0       95.43   77.32   85.43
0.05    97.28   76.31   85.53
0.10    98.25   75.13   85.15
0.15    98.81   73.87   84.54
0.20    99.13   72.82   83.96
0.25    99.38   71.74   83.33
0.30    99.55   70.75   82.71
0.35    99.61   69.53   81.90
0.40    99.66   68.35   81.09
0.45    99.70   66.93   80.09
0.50    99.71   65.49   79.06

As can be seen, with no threshold (i.e., Kraken's original labels),
Kraken's precision is fairly high, but it does increase with the
threshold. Diminishing returns apply, however, and there is a loss
in sensitivity that must be taken into account when deciding on the
threshold to use for your own project.
Kraken Environment Variables
============================
The Kraken programs (with the exception of `kraken-build`) support the
use of some environment variables to help in reducing command line
lengths:
* **`KRAKEN_NUM_THREADS`**: this variable is only used by `kraken`; if the
`--threads` option is not supplied to `kraken`, then the value of this
variable (if it is set) will be used as the number of threads to run
`kraken`.
* **`KRAKEN_DB_PATH`**: much like the `PATH` variable is used for executables
by your shell, `KRAKEN_DB_PATH` is a colon-separated list of directories
that will be searched for the database you name if the named database
does not have a slash (`/`) character. By default, Kraken assumes the
value of this variable is "`.`" (i.e., the current working directory).
This variable can be used to create one (or more) central repositories
of Kraken databases in a multi-user system. Example usage in bash:
export KRAKEN_DB_PATH="/home/user/my_kraken_dbs:/data/kraken_dbs:"
This will cause three directories to be searched, in this order:
1) `/home/user/my_kraken_dbs`
2) `/data/kraken_dbs`
3) the current working directory (caused by the empty string as
the third colon-separated field in the `KRAKEN_DB_PATH` string)
The search for a database will stop when a name match is found; if
two directories in the `KRAKEN_DB_PATH` have databases with the same
name, the directory of the two that is searched first will have its
database selected.
If the above variable and value are used, and the databases
`/data/kraken_dbs/mainDB` and `./mainDB` are present, then
kraken --db mainDB sequences.fa
will classify `sequences.fa` using `/data/kraken_dbs/mainDB`; if instead
you wanted to use the `mainDB` present in the current directory,
you would need to specify a directory path to that database in order
to circumvent searching, e.g.:
kraken --db ./mainDB sequences.fa
Note that the `KRAKEN_DB_PATH` directory list can be skipped by the use
of any absolute (beginning with `/`) or relative pathname (including
at least one `/`) as the database name.
* **`KRAKEN_DEFAULT_DB`**: if no database is supplied with the `--db` option,
the database named in this variable will be used instead. Using this
variable, you can avoid using `--db` if you only have a single database
that you usually use, e.g. in bash:
export KRAKEN_DEFAULT_DB="/home/user/krakendb"
kraken sequences.fa | kraken-report > sequences.kreport
This will classify `sequences.fa` using the `/home/user/krakendb` directory.
Note that the value of `KRAKEN_DEFAULT_DB` will also be interpreted in
the context of the value of `KRAKEN_DB_PATH` if you don't set
`KRAKEN_DEFAULT_DB` to an absolute or relative pathname. Given the earlier
example in this section, the following:
export KRAKEN_DEFAULT_DB="mainDB"
kraken sequences.fa
will use `/data/kraken_dbs/mainDB` to classify `sequences.fa`.
Upgrading Databases to v0.10+
=============================
The minimizer ordering in Kraken versions prior to v0.10.0-beta was a
simple lexicographical ordering that provided a suboptimal distribution
of k-mers within the bins. Ideally, the bin sizes would be uniform,
but simple lexicographical ordering creates a bias toward low-complexity
minimizers. To resolve this, the ordering is now "scrambled" by XORing all
minimizers with a predefined constant to toggle half of each minimizer's
bits before sorting. The more evenly distributed bins provide better
caching performance, but databases created in this way are not compatible
with earlier versions of Kraken. Kraken versions from v0.10.0-beta up to
(and including) v1.0 will support the use of the older databases, but
we nonetheless recommend one of the two following options:
1) Build a new database. This is the preferred option, as a newly-created
database will have the latest genomes and NCBI taxonomy information.
2) Re-sort an existing database. If you have a custom database, you may
want to simply reformat the database to provide you with Kraken's
increased speed. To do so, you'll need to do the following:
kraken-build --upgrade --db $DBNAME
(**Note**: the `--threads` switch is both valid and encouraged with this
operation.)
This command will **not** delete your existing `$DBNAME/database.*`
files, but will simply rename them. If you're satisfied with the new
database's performance, then you can use `kraken-build`'s `--clean`
option to remove the old files and save space.
Sorting the database is step 3 of the build process, so you should
expect a database upgrade to take about as long as step 3 took when
building the original database.
Note that the rest of Kraken v0.10.0-beta's speed improvements are available
without upgrading or changing your database.
kraken-1.1/docs/Makefile 0000664 0000000 0000000 00000000443 13175666030 0015226 0 ustar 00root root 0000000 0000000 all:
pandoc --title-prefix "Kraken Manual" \
--include-in-header head.html \
--include-before-body top.html \
--from markdown --to html \
--table-of-contents \
--css kraken.css \
--output MANUAL.html \
< MANUAL.markdown
kraken-1.1/docs/bar-bg.png 0000664 0000000 0000000 00000000274 13175666030 0015430 0 ustar 00root root 0000000 0000000 kraken-1.1/docs/head.html 0000664 0000000 0000000 00000000157 13175666030 0015357 0 ustar 00root root 0000000 0000000
kraken-1.1/docs/kraken.css 0000664 0000000 0000000 00000001327 13175666030 0015555 0 ustar 00root root 0000000 0000000 body {
background: #f0f0f0 url("bar-bg.png") top left repeat-y;
color: #333;
margin: 10px;
margin-left: 50px;
margin-bottom: 20px;
font-family: 'Ubuntu', sans-serif;
}
a { color: #00b0f0; }
a:visited { color: #0090f0; }
a:hover { color: #00d0ff; }
a:active { color: #00f0ff; }
.pretoc {
text-align: center;
font-size: 1.2em;
}
.title {
font-size: 2em;
font-weight: bold;
margin-bottom: 0;
}
.version {
font-size: 0.9em;
}
h1 {
color: #0090f0;
border-bottom: 1px #0090f0 solid;
margin-left: -10px;
margin-bottom: 3px;
}
h1 a {
color: #0090f0;
text-decoration: none;
}
div#confidence-score-table table th {
width: 7em;
}
pre {
margin-left: 4em;
}
code {
font-size: 1.2em;
}
kraken-1.1/docs/top.html 0000664 0000000 0000000 00000000300 13175666030 0015246 0 ustar 00root root 0000000 0000000
Kraken taxonomic sequence classification system
Version 1.0
Operating Manual
Table of Contents
kraken-1.1/install_kraken.sh 0000775 0000000 0000000 00000003477 13175666030 0016210 0 ustar 00root root 0000000 0000000 #!/bin/bash
# Copyright 2013-2015, Derrick Wood
#
# This file is part of the Kraken taxonomic classification system.
#
# Kraken is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Kraken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
set -e
VERSION="1.0"
if [ -z "$1" ] || [ -n "$2" ]
then
echo "Usage: $(basename $0) KRAKEN_DIR"
exit 64
fi
if [ "$1" = "KRAKEN_DIR" ]
then
echo "Please replace \"KRAKEN_DIR\" with the name of the directory"
echo "that you want to install Kraken in."
exit 1
fi
# Perl cmd used to canonicalize dirname - "readlink -f" doesn't work
# on OS X.
export KRAKEN_DIR=$(perl -MCwd=abs_path -le 'print abs_path(shift)' "$1")
mkdir -p "$KRAKEN_DIR"
make -C src install
for file in scripts/*
do
perl -pl -e 'BEGIN { while (@ARGV) { $_ = shift; ($k,$v) = split /=/, $_, 2; $H{$k} = $v } }'\
-e 's/#####=(\w+)=#####/$H{$1}/g' \
"KRAKEN_DIR=$KRAKEN_DIR" "VERSION=$VERSION" \
< "$file" > "$KRAKEN_DIR/$(basename $file)"
if [ -x "$file" ]
then
chmod +x "$KRAKEN_DIR/$(basename $file)"
fi
done
echo
echo "Kraken installation complete."
echo
echo "To make things easier for you, you may want to copy/symlink the following"
echo "files into a directory in your PATH:"
for file in $KRAKEN_DIR/kraken*
do
if [ -x "$file" ]
then
echo " $file"
fi
done
kraken-1.1/scripts/ 0000775 0000000 0000000 00000000000 13175666030 0014324 5 ustar 00root root 0000000 0000000 kraken-1.1/scripts/add_to_library.sh 0000775 0000000 0000000 00000003316 13175666030 0017644 0 ustar 00root root 0000000 0000000 #!/bin/bash
# Copyright 2013-2017, Derrick Wood
#
# This file is part of the Kraken taxonomic sequence classification system.
#
# Kraken is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Kraken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
# Copy specified file into a Kraken library
set -u # Protect against uninitialized vars.
set -e # Stop on error
LIBRARY_DIR="$KRAKEN_DB_NAME/library"
input_file=$1
if [ ! -e "$input_file" ]
then
echo "Can't add \"$input_file\": file does not exist"
exit 1
fi
if [ ! -f "$input_file" ]
then
echo "Can't add \"$input_file\": not a regular file"
exit 1
fi
add_dir="$LIBRARY_DIR/added"
mkdir -p "$add_dir"
if [[ $input_file == *.gbff || $input_file == *.gbff.gz || $input_file == *.gbk || $input_file == *.gbk.gz ]]
then
convert_gb_to_fa.pl $input_file > "$add_dir/temp.fna"
input_file="$add_dir/temp.fna"
fi
scan_fasta_file.pl "$input_file" > "$add_dir/temp_map.txt"
filename=$(cp_into_tempfile.pl -t "XXXXXXXXXX" -d "$add_dir" -s fna "$input_file")
cat "$add_dir/temp_map.txt" >> "$add_dir/prelim_map.txt"
rm "$add_dir/temp_map.txt"
if [ -e "$add_dir/temp.fna" ]
then
rm "$add_dir/temp.fna"
fi
echo "Added \"$1\" to library ($KRAKEN_DB_NAME)"
kraken-1.1/scripts/build_kraken_db.sh 0000775 0000000 0000000 00000015672 13175666030 0017775 0 ustar 00root root 0000000 0000000 #!/bin/bash
# Copyright 2013-2017, Derrick Wood
#
# This file is part of the Kraken taxonomic sequence classification system.
#
# Kraken is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Kraken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
# Build a Kraken database
# Designed to be called by kraken_build
set -u # Protect against uninitialized vars.
set -e # Stop on error
set -o pipefail # Stop on failures in non-final pipeline commands
function report_time_elapsed() {
curr_time=$(date "+%s.%N")
perl -e '$time = $ARGV[1] - $ARGV[0];' \
-e '$sec = int($time); $nsec = $time - $sec;' \
-e '$min = int($sec/60); $sec %= 60;' \
-e '$hr = int($min/60); $min %= 60;' \
-e 'print "${hr}h" if $hr;' \
-e 'print "${min}m" if $min || $hr;' \
-e 'printf "%.3fs", $sec + $nsec;' \
$1 $curr_time
}
start_time=$(date "+%s.%N")
DATABASE_DIR="$KRAKEN_DB_NAME"
if [ ! -d "$DATABASE_DIR" ]
then
echo "Can't find Kraken DB directory \"$KRAKEN_DB_NAME\""
exit 1
fi
cd "$DATABASE_DIR"
MEMFLAG=""
if [ -z "$KRAKEN_WORK_ON_DISK" ]
then
MEMFLAG="-M"
echo "Kraken build set to minimize disk writes."
else
echo "Kraken build set to minimize RAM usage."
fi
if [ -n "$KRAKEN_REBUILD_DATABASE" ]
then
rm -f database.* *.map lca.complete
fi
if [ -e "database.jdb" ]
then
echo "Skipping step 1, k-mer set already exists."
else
echo "Creating k-mer set (step 1 of 6)..."
start_time1=$(date "+%s.%N")
check_for_jellyfish.sh
# Estimate hash size as 1.25 * estimated k-mer count
if [ -z "$KRAKEN_HASH_SIZE" ]
then
KRAKEN_HASH_SIZE=$(find library/ -name '*.fna' -print0 | xargs -0 cat | kmer_estimator -m 1.25 -t $KRAKEN_THREAD_CT -k $KRAKEN_KMER_LEN)
echo "Hash size not specified, using '$KRAKEN_HASH_SIZE'"
fi
find library/ -name '*.fna' -print0 | \
xargs -0 cat | \
jellyfish count -m $KRAKEN_KMER_LEN -s $KRAKEN_HASH_SIZE -C -t $KRAKEN_THREAD_CT \
-o database /dev/fd/0
# Merge only if necessary
if [ -e "database_1" ]
then
jellyfish merge -o database.jdb.tmp database_*
else
mv database_0 database.jdb.tmp
fi
# Once here, DB is finalized, can put file in place.
mv database.jdb.tmp database.jdb
echo "K-mer set created. [$(report_time_elapsed $start_time1)]"
fi
if [ -z "$KRAKEN_MAX_DB_SIZE" ]
then
echo "Skipping step 2, no database reduction requested."
else
if [ -e "database.jdb.big" ]
then
echo "Skipping step 2, database reduction already done."
else
start_time1=$(date "+%s.%N")
kdb_size=$(stat -c '%s' database.jdb)
idx_size=$(echo "8 * (4 ^ $KRAKEN_MINIMIZER_LEN + 2)" | bc)
resize_needed=$(echo "scale = 10; ($kdb_size+$idx_size)/(2^30) > $KRAKEN_MAX_DB_SIZE" | bc)
if (( resize_needed == 0 ))
then
echo "Skipping step 2, database reduction unnecessary."
else
echo "Reducing database size (step 2 of 6)..."
max_kdb_size=$(echo "$KRAKEN_MAX_DB_SIZE*2^30 - $idx_size" | bc)
if (( $(echo "$max_kdb_size < 0" | bc) == 1 ))
then
echo "Maximum database size too small, aborting reduction."
exit 1
fi
# Key ct is 8 byte int stored 48 bytes from start of file
key_ct=$(perl -MFcntl -le 'open F, "database.jdb"; seek F, 48, SEEK_SET; read F, $b, 8; $a = unpack("Q", $b); print $a')
# key_bits is 8 bytes from start
key_bits=$(perl -MFcntl -le 'open F, "database.jdb"; seek F, 8, SEEK_SET; read F, $b, 8; $a = unpack("Q", $b); print $a')
# this is basically ceil(key_bits / 8) - why no ceiling function, bc?
key_len=$(echo "($key_bits + 7) / 8" | bc)
# val_len is 16 bytes from start
val_len=$(perl -MFcntl -le 'open F, "database.jdb"; seek F, 16, SEEK_SET; read F, $b, 8; $a = unpack("Q", $b); print $a')
record_len=$(( key_len + val_len ))
new_ct=$(echo "$max_kdb_size / $record_len" | bc)
echo "Shrinking DB to use only $new_ct of the $key_ct k-mers"
db_shrink -d database.jdb -o database.jdb.small -n $new_ct
mv database.jdb database.jdb.big.tmp
mv database.jdb.small database.jdb
mv database.jdb.big.tmp database.jdb.big
echo "Database reduced. [$(report_time_elapsed $start_time1)]"
fi
fi
fi
if [ -e "database.kdb" ]
then
echo "Skipping step 3, k-mer set already sorted."
else
echo "Sorting k-mer set (step 3 of 6)..."
start_time1=$(date "+%s.%N")
db_sort -z $MEMFLAG -t $KRAKEN_THREAD_CT -n $KRAKEN_MINIMIZER_LEN \
-d database.jdb -o database.kdb.tmp \
-i database.idx
# Once here, DB is sorted, can put file in proper place.
mv database.kdb.tmp database.kdb
echo "K-mer set sorted. [$(report_time_elapsed $start_time1)]"
fi
echo "Skipping step 4, GI number to seqID map now obsolete."
seqid2taxid_map_file="seqid2taxid.map"
if [ -e "$seqid2taxid_map_file" ]
then
echo "Skipping step 5, seqID to taxID map already complete."
else
echo "Creating seqID to taxID map (step 5 of 6)..."
start_time1=$(date "+%s.%N")
find library/ -maxdepth 2 -name prelim_map.txt | xargs cat > taxonomy/prelim_map.txt
if [ ! -s "taxonomy/prelim_map.txt" ]; then
echo "No preliminary seqid/taxid mapping files found, aborting."
exit 1
fi
grep "^TAXID" taxonomy/prelim_map.txt | cut -f 2- > $seqid2taxid_map_file.tmp || true
if grep "^ACCNUM" taxonomy/prelim_map.txt | cut -f 2- > accmap_file.tmp; then
if compgen -G "taxonomy/*.accession2taxid" > /dev/null; then
lookup_accession_numbers.pl accmap_file.tmp taxonomy/*.accession2taxid > seqid2taxid_acc.tmp
cat seqid2taxid_acc.tmp >> $seqid2taxid_map_file.tmp
rm seqid2taxid_acc.tmp
else
echo "Accession to taxid map files are required to build this DB."
echo "Run 'kraken-build --db $KRAKEN_DB_NAME --download-taxonomy' again?"
exit 1
fi
fi
mv $seqid2taxid_map_file.tmp $seqid2taxid_map_file
line_ct=$(wc -l $seqid2taxid_map_file | awk '{print $1}')
echo "$line_ct sequences mapped to taxa. [$(report_time_elapsed $start_time1)]"
fi
if [ -e "lca.complete" ]
then
echo "Skipping step 6, LCAs already set."
else
echo "Setting LCAs in database (step 6 of 6)..."
start_time1=$(date "+%s.%N")
find library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -print0 | \
xargs -0 cat | \
set_lcas $MEMFLAG -x -d database.kdb -i database.idx \
-n taxonomy/nodes.dmp -t $KRAKEN_THREAD_CT -m seqid2taxid.map -F /dev/fd/0
touch "lca.complete"
echo "Database LCAs set. [$(report_time_elapsed $start_time1)]"
fi
echo "Database construction complete. [Total: $(report_time_elapsed $start_time)]"
kraken-1.1/scripts/check_for_jellyfish.sh 0000775 0000000 0000000 00000002350 13175666030 0020657 0 ustar 00root root 0000000 0000000 #!/bin/bash
# Copyright 2013-2015, Derrick Wood
#
# This file is part of the Kraken taxonomic sequence classification system.
#
# Kraken is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Kraken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
# Check that jellyfish is executable and is proper version
# Designed to be called by kraken-build
set -u # Protect against uninitialized vars.
set -e # Stop on error
set -o pipefail # Stop on failures in non-final pipeline commands
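# Kraken's build tools read Jellyfish v1 hash output directly; Jellyfish 2.x
# uses a different on-disk format, so only a 1.x version is accepted here.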
JELLYFISH_VERSION=$(jellyfish --version | awk '{print $2}')
if [[ $JELLYFISH_VERSION =~ ^1\. ]]
then
echo "Found jellyfish v$JELLYFISH_VERSION"
else
echo "Found jellyfish v$JELLYFISH_VERSION"
echo "Kraken requires jellyfish version 1"
exit 1
fi
kraken-1.1/scripts/clean_db.sh 0000775 0000000 0000000 00000002541 13175666030 0016414 0 ustar 00root root 0000000 0000000 #!/bin/bash
# Copyright 2013-2015, Derrick Wood
#
# This file is part of the Kraken taxonomic sequence classification system.
#
# Kraken is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Kraken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
# Clean unneeded files from a database
set -u # Protect against uninitialized vars.
set -e # Stop on error
cd "$KRAKEN_DB_NAME"
[ -e "database.kdb" ] || (echo "Incomplete database, clean aborted."; exit 1)
[ -e "database.idx" ] || (echo "Incomplete database, clean aborted."; exit 1)
[ -e "taxonomy/nodes.dmp" ] || (echo "Incomplete database, clean aborted."; exit 1)
[ -e "taxonomy/names.dmp" ] || (echo "Incomplete database, clean aborted."; exit 1)
rm -rf library
rm -f database.jdb* database_* *.map lca.complete
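# Keep only nodes.dmp and names.dmp; the rest of taxonomy/ (accession maps,
# other dump files) is no longer needed once the database is built.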
mkdir newtaxo
mv taxonomy/{nodes,names}.dmp newtaxo
rm -rf taxonomy
mv newtaxo taxonomy
kraken-1.1/scripts/convert_gb_to_fa.pl 0000775 0000000 0000000 00000003055 13175666030 0020167 0 ustar 00root root 0000000 0000000 #!/usr/bin/env perl
# Copyright 2013-2017, Derrick Wood
#
# This file is part of the Kraken taxonomic sequence classification system.
#
# Kraken is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Kraken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
# Pull sequence data with accession and taxid from Genbank file and output in fasta format
# Adapted from @tseemann https://github.com/MDU-PHL/mdu-tools/blob/master/bin/genbank-to-kraken_fasta.pl
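# Parsing is line oriented: VERSION gives the accession used as the seqID, a
# "taxon:NNN" db_xref gives the taxID, ORIGIN starts the sequence lines, and
# "//" closes the record.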
use strict;
use warnings;
@ARGV or die "Usage: $0 <genbank_file[.gz]> ...";
my $wrote=0;
my($seqid, $in_seq, $taxid);
my $input_file = $ARGV[0];
open(IN, "gunzip -c -f \Q$input_file\E |") or die "can’t open pipe to $input_file";
while () {
if (m/^VERSION\s+(\S+)/) {
$seqid = $1;
}
elsif (m/taxon:(\d+)/) {
$taxid = $1;
}
elsif (m/^ORIGIN/) {
$in_seq = 1;
print ">$seqid|kraken:taxid|$taxid\n";
}
elsif (m{^//}) {
$in_seq = $taxid = $seqid = undef;
$wrote++;
}
elsif ($in_seq) {
substr $_, 0, 10, '';
s/\s//g;
print uc($_), "\n";
}
}
close IN;
kraken-1.1/scripts/cp_into_tempfile.pl 0000775 0000000 0000000 00000003540 13175666030 0020206 0 ustar 00root root 0000000 0000000 #!/usr/bin/env perl
# Copyright 2013-2015, Derrick Wood
#
# This file is part of the Kraken taxonomic sequence classification system.
#
# Kraken is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Kraken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
# Create a file in a specified directory, then copy an
# existing file's contents into the new file. Write name of
# new file to standard output.
#
# Thanks to everyone who wrote the mktemp program and couldn't be
# bothered to standardize the behavior.
use strict;
use warnings;
use File::Basename;
use File::Temp 'tempfile';
use Getopt::Std;
my $PROG = basename $0;
getopts('d:t:s:', \my %opts) or usage();
$opts{$_} or usage() for qw/d t s/; # all switches mandatory
my ($directory, $template, $suffix) = @opts{qw/d t s/};
die "$PROG: '$directory' not a directory!\n" unless -d $directory;
die "$PROG: must specify a single filename\n" unless @ARGV == 1;
$suffix =~ s/^\.//;
my $old_filename = shift @ARGV;
open FILE, "<", $old_filename
or die "$PROG: can't read $old_filename: $!\n";
my ($fh, $new_filename) = tempfile($template, DIR => $directory,
UNLINK => 0, SUFFIX => ".$suffix");
# copy loop
while (<FILE>) {
print {$fh} $_;
}
close FILE;
close $fh;
print "$new_filename\n";
sub usage {
die "$PROG: <-d directory> <-t template> <-s suffix> \n";
}
kraken-1.1/scripts/download_genomic_library.sh 0000775 0000000 0000000 00000006006 13175666030 0021721 0 ustar 00root root 0000000 0000000 #!/bin/bash
# Copyright 2013-2017, Derrick Wood
#
# This file is part of the Kraken taxonomic sequence classification system.
#
# Kraken is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Kraken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
# Download specific genomic libraries for use with Kraken.
# Supported choices are:
# archaea - NCBI RefSeq complete archaeal genomes
# bacteria - NCBI RefSeq complete bacterial genomes
# plasmids - NCBI RefSeq plasmid sequences
# viral - NCBI RefSeq complete viral DNA and RNA genomes
# human - NCBI RefSeq GRCh38 human reference genome
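# Designed to be called by kraken-build, which exports KRAKEN_DB_NAME; a manual
# run would look like this (hypothetical path):
#   KRAKEN_DB_NAME=./mydb download_genomic_library.sh bacteria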
set -u # Protect against uninitialized vars.
set -e # Stop on error
LIBRARY_DIR="$KRAKEN_DB_NAME/library"
NCBI_SERVER="ftp.ncbi.nlm.nih.gov"
FTP_SERVER="ftp://$NCBI_SERVER"
RSYNC_SERVER="rsync://$NCBI_SERVER"
THIS_DIR=$PWD
library_name="$1"
library_file="library.fna"
if [ -e "$LIBRARY_DIR/$library_name/.completed" ]; then
echo "Skipping $library_name, already completed library download"
exit 0
fi
case "$1" in
"archaea" | "bacteria" | "viral" | "human" )
mkdir -p $LIBRARY_DIR/$library_name
cd $LIBRARY_DIR/$library_name
rm -f assembly_summary.txt
remote_dir_name=$library_name
if [ "$library_name" = "human" ]; then
remote_dir_name="vertebrate_mammalian/Homo_sapiens"
fi
if ! wget -q $FTP_SERVER/genomes/refseq/$remote_dir_name/assembly_summary.txt; then
echo "Error downloading assembly summary file for $library_name, exiting." >/dev/fd/2
exit 1
fi
if [ "$library_name" = "human" ]; then
grep "Genome Reference Consortium" assembly_summary.txt > x
mv x assembly_summary.txt
fi
rm -rf all/ library.f* manifest.txt rsync.err
rsync_from_ncbi.pl assembly_summary.txt
scan_fasta_file.pl $library_file > prelim_map.txt
touch .completed
;;
"plasmid")
mkdir -p $LIBRARY_DIR/plasmid
cd $LIBRARY_DIR/plasmid
rm -f library.f* plasmid.*
echo -n "Downloading plasmid files from FTP..."
wget -q --no-remove-listing --spider $FTP_SERVER/genomes/refseq/plasmid/
awk '{ print $NF }' .listing | perl -ple 'tr/\r//d' | grep '\.fna\.gz' > manifest.txt
cat manifest.txt | xargs -n1 -I{} wget -q $FTP_SERVER/genomes/refseq/plasmid/{}
cat manifest.txt | xargs -n1 -I{} gunzip -c {} > $library_file
rm -f plasmid.* .listing
scan_fasta_file.pl $library_file > prelim_map.txt
touch .completed
echo " done."
;;
*)
echo "Unsupported library. Valid options are: "
echo " archaea bacteria plasmid viral human"
;;
esac
kraken-1.1/scripts/download_taxonomy.sh 0000775 0000000 0000000 00000002427 13175666030 0020435 0 ustar 00root root 0000000 0000000 #!/bin/bash
# Copyright 2013-2017, Derrick Wood
#
# This file is part of the Kraken taxonomic sequence classification system.
# Download NCBI taxonomy information for Kraken.
# Designed to be called by kraken-build
set -u # Protect against uninitialized vars.
set -e # Stop on error
TAXONOMY_DIR="$KRAKEN_DB_NAME/taxonomy"
NCBI_SERVER="ftp.ncbi.nlm.nih.gov"
FTP_SERVER="ftp://$NCBI_SERVER"
mkdir -p "$TAXONOMY_DIR"
cd "$TAXONOMY_DIR"
if [ ! -e "accmap.dlflag" ]
then
wget $FTP_SERVER/pub/taxonomy/accession2taxid/nucl_est.accession2taxid.gz
wget $FTP_SERVER/pub/taxonomy/accession2taxid/nucl_gb.accession2taxid.gz
wget $FTP_SERVER/pub/taxonomy/accession2taxid/nucl_gss.accession2taxid.gz
wget $FTP_SERVER/pub/taxonomy/accession2taxid/nucl_wgs.accession2taxid.gz
touch accmap.dlflag
echo "Downloaded accession to taxon map(s)"
fi
if [ ! -e "taxdump.dlflag" ]
then
wget $FTP_SERVER/pub/taxonomy/taxdump.tar.gz
touch taxdump.dlflag
echo "Downloaded taxonomy tree data"
fi
if ls | grep -q 'accession2taxid\.gz$'
then
echo -n "Uncompressing taxonomy data... "
gunzip *accession2taxid.gz
echo "done."
fi
if [ ! -e "taxdump.untarflag" ]
then
echo -n "Untarring taxonomy tree data... "
tar zxf taxdump.tar.gz
touch taxdump.untarflag
echo "done."
fi
kraken-1.1/scripts/kraken 0000775 0000000 0000000 00000021162 13175666030 0015527 0 ustar 00root root 0000000 0000000 #!/usr/bin/env perl
# Copyright 2013-2015, Derrick Wood
#
# This file is part of the Kraken taxonomic sequence classification system.
#
# Kraken is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Kraken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
# Wrapper for Kraken's classifier
use strict;
use warnings;
use File::Basename;
use Getopt::Long;
my $PROG = basename $0;
my $KRAKEN_DIR = "#####=KRAKEN_DIR=#####";
# Test to see if the executables got moved, try to recover if we can
if (! -e "$KRAKEN_DIR/classify") {
use Cwd 'abs_path';
$KRAKEN_DIR = dirname abs_path($0);
}
require "$KRAKEN_DIR/krakenlib.pm";
$ENV{"KRAKEN_DIR"} = $KRAKEN_DIR;
$ENV{"PATH"} = "$KRAKEN_DIR:$ENV{PATH}";
my $CLASSIFY = "$KRAKEN_DIR/classify";
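# Two-byte magic numbers used for compression auto-detection:
# gzip streams begin with 0x1f 0x8b, bzip2 streams with "BZ".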
my $GZIP_MAGIC = chr(hex "1f") . chr(hex "8b");
my $BZIP2_MAGIC = "BZ";
my $quick = 0;
my $min_hits = 1;
my $fasta_input = 0;
my $fastq_input = 0;
my $fastq_output = 0;
my $db_prefix;
my $threads;
my $preload = 0;
my $gunzip = 0;
my $bunzip2 = 0;
my $paired = 0;
my $check_names = 0;
my $only_classified_output = 0;
my $unclassified_out;
my $classified_out;
my $output_format = "legacy";
my $outfile;
GetOptions(
"help" => \&display_help,
"version" => \&display_version,
"db=s" => \$db_prefix,
"threads=i" => \$threads,
"fasta-input" => \$fasta_input,
"fastq-input" => \$fastq_input,
"fastq-output" => \$fastq_output,
"quick" => \$quick,
"min-hits=i" => \$min_hits,
"unclassified-out=s" => \$unclassified_out,
"classified-out=s" => \$classified_out,
"out-fmt=s" => \$output_format,
"output=s" => \$outfile,
"preload" => \$preload,
"paired" => \$paired,
"check-names" => \$check_names,
"gzip-compressed" => \$gunzip,
"bzip2-compressed" => \$bunzip2,
"only-classified-output" => \$only_classified_output,
);
if (! defined $threads) {
$threads = $ENV{"KRAKEN_NUM_THREADS"} || 1;
}
if (! @ARGV) {
print STDERR "Need to specify input filenames!\n";
usage();
}
eval { $db_prefix = krakenlib::find_db($db_prefix); };
if ($@) {
die "$PROG: $@";
}
my $taxonomy = "$db_prefix/taxonomy/nodes.dmp";
if ($quick) {
undef $taxonomy; # Skip loading nodes file, not needed in quick mode
}
my $kdb_file = "$db_prefix/database.kdb";
my $idx_file = "$db_prefix/database.idx";
if (! -e $kdb_file) {
die "$PROG: $kdb_file does not exist!\n";
}
if (! -e $idx_file) {
die "$PROG: $idx_file does not exist!\n";
}
if ($min_hits > 1 && ! $quick) {
die "$PROG: --min_hits requires --quick to be specified\n";
}
if ($paired && @ARGV != 2) {
die "$PROG: --paired requires exactly two filenames\n";
}
my $compressed = $gunzip || $bunzip2;
if ($gunzip && $bunzip2) {
die "$PROG: can't use both gzip and bzip2 compression flags\n";
}
if ($fasta_input && $fastq_input) {
die "$PROG: can't use both FASTA and FASTQ input flags\n";
}
my $auto_detect = 1;
if ($fasta_input || $fastq_input || $compressed) {
$auto_detect = 0;
}
if (! -f $ARGV[0]) {
$auto_detect = 0;
}
if ($auto_detect) {
auto_detect_file_format();
}
# set flags for classifier
my @flags;
push @flags, "-d", $kdb_file;
push @flags, "-i", $idx_file;
push @flags, "-t", $threads if $threads > 1;
push @flags, "-n", $taxonomy if defined $taxonomy;
push @flags, "-q", if $quick;
push @flags, "-m", $min_hits if $min_hits > 1;
push @flags, "-f", if $fastq_input;
push @flags, "-F", if $fastq_output;
push @flags, "-U", $unclassified_out if defined $unclassified_out;
push @flags, "-C", $classified_out if defined $classified_out;
push @flags, "-O", $output_format if defined $output_format;
push @flags, "-o", $outfile if defined $outfile;
push @flags, "-c", if $only_classified_output;
push @flags, "-M", if $preload;
push @flags, "-P", if $paired;
# handle piping for decompression/merging
my @pipe_argv;
if ($paired) {
my @merge_flags;
push @merge_flags, "--fa" if $fasta_input;
push @merge_flags, "--fq" if $fastq_input;
push @merge_flags, "--gz" if $gunzip;
push @merge_flags, "--bz2" if $bunzip2;
push @merge_flags, "--check-names" if $check_names;
push @merge_flags, "--output-format", $output_format if defined $output_format;
@pipe_argv = ("read_merger.pl", @merge_flags, @ARGV);
}
elsif ($compressed) {
if ($gunzip) {
@pipe_argv = ("gzip", "-dc", @ARGV);
}
elsif ($bunzip2) {
@pipe_argv = ("bzip2", "-dc", @ARGV);
}
else {
die "$PROG: unrecognized compression program! This is a Kraken bug.\n";
}
}
# if args exist, set up the pipe/fork/exec
if (@pipe_argv) {
pipe RD, WR;
my $pid = fork();
if ($pid < 0) {
die "$PROG: fork error: $!\n";
}
if ($pid) {
open STDIN, "<&RD"
or die "$PROG: can't dup stdin to read end of pipe: $!\n";
close RD;
close WR;
@ARGV = ("/dev/fd/0"); # make classifier read from pipe
}
else {
open STDOUT, ">&WR"
or die "$PROG: can't dup stdout to write end of pipe: $!\n";
close RD;
close WR;
exec @pipe_argv
or die "$PROG: can't exec $pipe_argv[0]: $!\n";
}
}
exec $CLASSIFY, @flags, @ARGV;
die "$PROG: exec error: $!\n";
sub usage {
my $exit_code = @_ ? shift : 64;
my $default_db = "none";
eval { $default_db = '"' . krakenlib::find_db() . '"'; };
my $def_thread_ct = exists $ENV{"KRAKEN_NUM_THREADS"} ? (0 + $ENV{"KRAKEN_NUM_THREADS"}) : 1;
print STDERR <<EOF;
Usage: $PROG [options] <filename(s)>

Options:
--db NAME Name for Kraken DB
(default: $default_db)
--threads NUM Number of threads (default: $def_thread_ct)
--fasta-input Input is FASTA format
--fastq-input Input is FASTQ format
--fastq-output Output in FASTQ format
--gzip-compressed Input is gzip compressed
--bzip2-compressed Input is bzip2 compressed
--quick Quick operation (use first hit or hits)
--min-hits NUM In quick op., number of hits req'd for classification
NOTE: this is ignored if --quick is not specified
--unclassified-out FILENAME
Print unclassified sequences to filename
--classified-out FILENAME
Print classified sequences to filename
--out-fmt FORMAT Format for [un]classified sequence output. Supported
options are: {legacy, paired, interleaved}
--output FILENAME Print output to filename (default: stdout); "-" will
suppress normal output
--only-classified-output
Print no Kraken output for unclassified sequences
--preload Loads DB into memory before classification
--paired The two filenames provided are paired-end reads
--check-names Ensure each pair of reads have names that agree
with each other; ignored if --paired is not specified
--help Print this message
--version Print version information
If none of the *-input or *-compressed flags are specified, and the
file is a regular file, automatic format detection is attempted.
EOF
exit $exit_code;
}
sub display_help {
usage(0);
}
sub display_version {
print "Kraken version #####=VERSION=#####\n";
print "Copyright 2013-2015, Derrick Wood (dwood\@cs.jhu.edu)\n";
exit 0;
}
sub auto_detect_file_format {
my $magic;
my $filename = $ARGV[0];
# read 2-byte magic number to determine type of compression (if any)
open FILE, "<", $filename;
read FILE, $magic, 2;
close FILE;
if ($magic eq $GZIP_MAGIC) {
$compressed = 1;
$gunzip = 1;
}
elsif ($magic eq $BZIP2_MAGIC) {
$compressed = 1;
$bunzip2 = 1;
}
else {
# if no compression, just look at first char
chop $magic;
}
# uncompress to stream and read first char
if ($gunzip) {
open FILE, "-|", "gzip", "-dc", $filename
or die "$PROG: can't determine format of $filename (gzip error): $!\n";
read FILE, $magic, 1;
close FILE;
}
elsif ($bunzip2) {
open FILE, "-|", "bzip2", "-dc", $ARGV[0]
or die "$PROG: can't determine format of $filename (bzip2 error): $!\n";
read FILE, $magic, 1;
close FILE;
}
if ($magic eq ">") {
$fasta_input = 1;
}
elsif ($magic eq "@") {
$fastq_input = 1;
}
else {
die "$PROG: can't determine what format $filename is!\n";
}
}
kraken-1.1/scripts/kraken-build 0000775 0000000 0000000 00000017467 13175666030 0016641 0 ustar 00root root 0000000 0000000 #!/usr/bin/env perl
# Copyright 2013-2017, Derrick Wood
#
# This file is part of the Kraken taxonomic sequence classification system.
#
# Kraken is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Kraken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
# General build process wrapper for Kraken.
use strict;
use warnings;
use File::Basename;
use Getopt::Long;
my $PROG = basename $0;
my $KRAKEN_DIR = "#####=KRAKEN_DIR=#####";
# Test to see if the executables got moved, try to recover if we can
if (! -e "$KRAKEN_DIR/classify") {
use Cwd 'abs_path';
$KRAKEN_DIR = dirname abs_path($0);
}
$ENV{"KRAKEN_DIR"} = $KRAKEN_DIR;
$ENV{"PATH"} = "$KRAKEN_DIR:$ENV{PATH}";
my $DEF_MINIMIZER_LEN = 15;
my $DEF_KMER_LEN = 31;
my $DEF_THREAD_CT = 1;
my @VALID_LIBRARY_TYPES = qw/archaea bacteria plasmid viral human/;
# Option/task option variables
my (
$db,
$threads,
$minimizer_len,
$kmer_len,
$new_db,
$hash_size,
$max_db_size,
$work_on_disk,
$shrink_block_offset,
$dl_taxonomy,
$dl_library,
$add_to_library,
$build,
$rebuild,
$shrink,
$standard,
$upgrade,
$clean,
);
$threads = $DEF_THREAD_CT;
$minimizer_len = $DEF_MINIMIZER_LEN;
$kmer_len = $DEF_KMER_LEN;
$work_on_disk = "";
$hash_size = "";
$max_db_size = "";
# variables corresponding to task options
my @TASK_LIST = (
\$dl_taxonomy,
\$dl_library,
\$add_to_library,
\$build,
\$rebuild,
\$shrink,
\$standard,
\$upgrade,
\$clean,
);
GetOptions(
"help" => \&display_help,
"version" => \&display_version,
"db=s" => \$db,
"threads=i" => \$threads,
"minimizer-len=i", \$minimizer_len,
"kmer-len=i", \$kmer_len,
"new-db=s", \$new_db,
"jellyfish-hash-size=s", \$hash_size,
"max-db-size=s", \$max_db_size,
"work-on-disk", \$work_on_disk,
"shrink-block-offset=i", \$shrink_block_offset,
"download-taxonomy" => \$dl_taxonomy,
"download-library=s" => \$dl_library,
"add-to-library=s" => \$add_to_library,
"build" => \$build,
"rebuild" => \$rebuild,
"shrink=i" => \$shrink,
"upgrade" => \$upgrade,
"standard" => \$standard,
"clean" => \$clean,
) or usage();
if (@ARGV) {
warn "Extra arguments on command line.\n";
usage();
}
my $task_options = 0;
for my $flag_ref (@TASK_LIST) {
defined($$flag_ref) and $task_options++;
}
if ($task_options > 1) {
warn "More than one task option selected.\n";
usage();
}
if ($task_options == 0) {
warn "Must select a task option.\n";
usage();
}
if (! defined $db) {
die "Must specify a database name\n";
}
if ($threads <= 0) {
die "Can't use nonpositive thread count of $threads\n";
}
if ($minimizer_len >= $kmer_len) {
die "Minimizer length ($minimizer_len) must be less than k ($kmer_len)\n";
}
if ($minimizer_len <= 0) {
die "Can't use nonpositive minimizer length of $minimizer_len\n";
}
if ($kmer_len < 2) {
die "Can't use k of $kmer_len (must be >= 2)\n";
}
if ($kmer_len > 31) {
die "Can't use k of $kmer_len (must be <= 31)\n";
}
if ($hash_size !~ /^(\d+[kKmMgG]?)?$/) {
die "Illegal hash size string\n";
}
if ($max_db_size !~ /^$/ && $max_db_size <= 0) {
die "Can't have negative max database size.\n";
}
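# Pass all build settings to the helper shell scripts via the environment.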
$ENV{"KRAKEN_DB_NAME"} = $db;
$ENV{"KRAKEN_THREAD_CT"} = $threads;
$ENV{"KRAKEN_MINIMIZER_LEN"} = $minimizer_len;
$ENV{"KRAKEN_KMER_LEN"} = $kmer_len;
$ENV{"KRAKEN_HASH_SIZE"} = $hash_size;
$ENV{"KRAKEN_MAX_DB_SIZE"} = $max_db_size;
$ENV{"KRAKEN_WORK_ON_DISK"} = $work_on_disk;
if ($dl_taxonomy) {
download_taxonomy();
}
elsif (defined($dl_library)) {
download_library($dl_library);
}
elsif (defined($add_to_library)) {
add_to_library($add_to_library);
}
elsif (defined($shrink)) {
shrink_db($shrink);
}
elsif ($standard) {
standard_installation();
}
elsif ($build || $rebuild) {
build_database();
}
elsif ($clean) {
clean_database();
}
elsif ($upgrade) {
upgrade_database();
}
else {
usage();
}
exit -1;
# END OF MAIN CODE.
sub usage {
my $exit_code = @_ ? shift : 64;
print STDERR <<EOF;
Usage: $PROG [task option] [options]