pax_global_header00006660000000000000000000000064131734211710014512gustar00rootroot0000000000000052 comment=9f26bfcc7780753129b60717ecab0ebba6f04b7c bwa-0.7.17/000077500000000000000000000000001317342117100123575ustar00rootroot00000000000000bwa-0.7.17/.gitignore000066400000000000000000000000671317342117100143520ustar00rootroot00000000000000*.[oa] bwa test test64 .*.swp Makefile.bak bwamem-lite bwa-0.7.17/.travis.yml000066400000000000000000000000651317342117100144710ustar00rootroot00000000000000language: c compiler: - gcc - clang script: make bwa-0.7.17/COPYING000066400000000000000000001045131317342117100134160ustar00rootroot00000000000000 GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. 
To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. 
States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. 
An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. 
However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. 
Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. 
b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 
b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 
A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. 
But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 
If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. 
If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. <one line to give the program's name and a brief idea of what it does.> Copyright (C) <year> <name of author> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: <program> Copyright (C) <year> <name of author> This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. 
The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see . The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read . bwa-0.7.17/ChangeLog000066400000000000000000003653311317342117100141440ustar00rootroot00000000000000------------------------------------------------------------------------ r1605 | lh3 | 2010-12-29 20:20:20 -0500 (Wed, 29 Dec 2010) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/main.c * bwa-0.5.9rc1-2 (r1605) * fixed a typo/bug in bwasw ------------------------------------------------------------------------ r1587 | lh3 | 2010-12-21 18:48:30 -0500 (Tue, 21 Dec 2010) | 2 lines Changed paths: M /branches/prog/bwa/bwa.1 a typo in the manual ------------------------------------------------------------------------ r1586 | lh3 | 2010-12-21 18:47:48 -0500 (Tue, 21 Dec 2010) | 3 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/main.c M /branches/prog/bwa/utils.c M /branches/prog/bwa/utils.h * bwa-0.5.9rc1-1 (r1586) * a few patches by John ------------------------------------------------------------------------ r1562 | lh3 | 2010-12-10 01:02:06 -0500 (Fri, 10 Dec 2010) | 2 lines Changed paths: M /branches/prog/bwa/bwa.1 M 
/branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c documentation on specifying @RG ------------------------------------------------------------------------ r1561 | lh3 | 2010-12-10 00:45:40 -0500 (Fri, 10 Dec 2010) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/NEWS M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/main.c Release bwa-0.5.9rc1 (r1561) ------------------------------------------------------------------------ r1560 | lh3 | 2010-12-10 00:29:08 -0500 (Fri, 10 Dec 2010) | 3 lines Changed paths: M /branches/prog/bwa/bwaseqio.c M /branches/prog/bwa/main.c * fixed a small memory leak caused by the BAM reader * fixed a memory violation, also in the BAM reader ------------------------------------------------------------------------ r1559 | lh3 | 2010-12-10 00:10:48 -0500 (Fri, 10 Dec 2010) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/Makefile change Makefile gcc options ------------------------------------------------------------------------ r1558 | lh3 | 2010-12-10 00:09:22 -0500 (Fri, 10 Dec 2010) | 4 lines Changed paths: M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/main.c * bwa-0.5.8-6 (r1557) * added a little more comments to BWA-SW * randomly choosing a mapping if there are more than one ------------------------------------------------------------------------ r1557 | lh3 | 2010-12-09 21:58:00 -0500 (Thu, 09 Dec 2010) | 2 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwtsw2_aux.c sometimes unmapped reads may not be printed... 
------------------------------------------------------------------------ r1556 | lh3 | 2010-12-09 21:50:26 -0500 (Thu, 09 Dec 2010) | 2 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwtsw2_aux.c print unmapped reads ------------------------------------------------------------------------ r1555 | lh3 | 2010-12-09 21:17:20 -0500 (Thu, 09 Dec 2010) | 3 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/main.c * bwa-0.5.8-5 (r1555) * BAM input documentation ------------------------------------------------------------------------ r1544 | lh3 | 2010-11-23 11:01:41 -0500 (Tue, 23 Nov 2010) | 3 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/main.c * bwa-0.5.8-4 (r1544) * supporting adding RG tags and RG lines ------------------------------------------------------------------------ r1543 | lh3 | 2010-11-23 00:16:40 -0500 (Tue, 23 Nov 2010) | 3 lines Changed paths: M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/main.c * bwa-0.5.8-3 (r1543) * fixed a memory leak ------------------------------------------------------------------------ r1542 | lh3 | 2010-11-22 23:50:56 -0500 (Mon, 22 Nov 2010) | 3 lines Changed paths: M /branches/prog/bwa/bwase.c M /branches/prog/bwa/main.c * bwa-0.5.8-2 (r1542) * fixed a long existing bug in random placement of reads ------------------------------------------------------------------------ r1541 | lh3 | 2010-11-22 23:27:29 -0500 (Mon, 22 Nov 2010) | 2 lines Changed paths: M /branches/prog/bwa/Makefile A /branches/prog/bwa/bamlite.c A /branches/prog/bwa/bamlite.h M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/bwaseqio.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/main.c preliminary BAM input support ------------------------------------------------------------------------ r1537 | lh3 | 2010-10-16 23:46:20 -0400 (Sat, 16 
Oct 2010) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/bwa.1 change version number and ChangeLog ------------------------------------------------------------------------ r1536 | lh3 | 2010-10-16 23:35:10 -0400 (Sat, 16 Oct 2010) | 3 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/main.c M /branches/prog/bwa/stdaln.c * fixed a bug in the scoring matrix * release bwa-0.5.8c (r1536) ------------------------------------------------------------------------ r1451 | lh3 | 2010-06-15 09:43:52 -0400 (Tue, 15 Jun 2010) | 2 lines Changed paths: M /branches/prog/bwa/bwa.1 version change ------------------------------------------------------------------------ r1450 | lh3 | 2010-06-15 09:42:21 -0400 (Tue, 15 Jun 2010) | 3 lines Changed paths: M /branches/prog/bwa/main.c M /branches/prog/bwa/stdaln.c * bwa-0.5.8b (r1450) * fixed a bug in scoring matrix ------------------------------------------------------------------------ r1445 | lh3 | 2010-06-11 08:58:33 -0400 (Fri, 11 Jun 2010) | 2 lines Changed paths: M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/bwape.c fixed a serious bug ------------------------------------------------------------------------ r1442 | lh3 | 2010-06-08 10:22:14 -0400 (Tue, 08 Jun 2010) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/NEWS M /branches/prog/bwa/main.c Release bwa-0.5.8 (r1442) ------------------------------------------------------------------------ r1440 | lh3 | 2010-05-19 13:43:50 -0400 (Wed, 19 May 2010) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/main.c * bwa-r1440 * sorry, forget to remove a debugging line ------------------------------------------------------------------------ r1439 | lh3 | 2010-05-19 13:43:08 -0400 (Wed, 19 May 2010) | 4 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/main.c * bwa-r1439 * fixed a bug in bwasw caused by a recent modification * 
throwing insane insert size when estimating isize ------------------------------------------------------------------------ r1425 | lh3 | 2010-04-29 15:15:23 -0400 (Thu, 29 Apr 2010) | 10 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/main.c * bwa-0.5.7-7 (r1425) * fixed a minor bug in bwasw command-line parsing * When band-width is not large enough, bwasw may find two highly overlapping but not completely overlapping alignments. The old version will filter out one of them, which leads to false negatives. The current outputs both. This solution is obviously not ideal. The ideal one would be to increase the band-width and redo the alignment. ------------------------------------------------------------------------ r1399 | lh3 | 2010-04-16 09:20:49 -0400 (Fri, 16 Apr 2010) | 3 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/bwase.c M /branches/prog/bwa/main.c * bwa-0.5.7-6 (r1399) * fixed a typo/bug (by Vaughn Iverson) ------------------------------------------------------------------------ r1329 | lh3 | 2010-03-19 23:32:46 -0400 (Fri, 19 Mar 2010) | 2 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/main.c small correction ------------------------------------------------------------------------ r1328 | lh3 | 2010-03-19 23:28:44 -0400 (Fri, 19 Mar 2010) | 3 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/main.c * bwa-0.5.7-4 (r1328) * automatically adjust ap_prior based on alignment ------------------------------------------------------------------------ r1327 | lh3 | 2010-03-19 23:02:40 -0400 (Fri, 19 Mar 2010) | 3 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/main.c M /branches/prog/bwa/stdaln.c M /branches/prog/bwa/stdaln.h * bwa-0.5.7-3 (r1327) * evaluate hits obtained from SW alignment in a more proper way. 
------------------------------------------------------------------------ r1320 | lh3 | 2010-03-17 15:13:22 -0400 (Wed, 17 Mar 2010) | 2 lines Changed paths: M /branches/prog/bwa/bwape.c fixed a potential out-of-boundary error. Need more testing. ------------------------------------------------------------------------ r1319 | lh3 | 2010-03-14 22:44:46 -0400 (Sun, 14 Mar 2010) | 2 lines Changed paths: M /branches/prog/bwa/bwape.c insert size is `weird' if the 3rd quatile larger than 100,000bp ------------------------------------------------------------------------ r1318 | lh3 | 2010-03-14 22:37:35 -0400 (Sun, 14 Mar 2010) | 3 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/main.c * bwa-0.5.7-2 (r1318) * in sampe, allow to disable insert size estimate ------------------------------------------------------------------------ r1317 | lh3 | 2010-03-14 22:14:14 -0400 (Sun, 14 Mar 2010) | 5 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/main.c M /branches/prog/bwa/solid2fastq.pl * bwa-0.5.7-1 (r1317) * fixed a potential bug in solid2fastq.pl * fixed a bug in calculating mapping quality (by Rodrigo Goya) * fixed a very rare bug (if ever occur) about pairing ------------------------------------------------------------------------ r1310 | lh3 | 2010-03-01 10:35:45 -0500 (Mon, 01 Mar 2010) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/NEWS M /branches/prog/bwa/main.c Release bwa-0.5.7 ------------------------------------------------------------------------ r1309 | lh3 | 2010-02-26 21:42:22 -0500 (Fri, 26 Feb 2010) | 4 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/main.c * bwa-0.5.6-2 (r1309) * fixed an unfixed bug (by Carol Scott) * fixed some tiny formatting ------------------------------------------------------------------------ r1305 | lh3 | 2010-02-25 13:47:58 -0500 (Thu, 25 Feb 2010) 
| 3 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/main.c * bwa-0.5.6-1 (r1304) * optionally write output to a file (by Tim Fennel) ------------------------------------------------------------------------ r1303 | lh3 | 2010-02-10 23:43:48 -0500 (Wed, 10 Feb 2010) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/NEWS M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/main.c Release bwa-0.5.6 ------------------------------------------------------------------------ r1302 | lh3 | 2010-02-10 11:11:49 -0500 (Wed, 10 Feb 2010) | 3 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/NEWS M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/main.c * bwa-0.5.5-10 (r1302) * improve max insert size estimate (method suggested by Gerton Lunter) ------------------------------------------------------------------------ r1301 | lh3 | 2010-02-09 16:15:28 -0500 (Tue, 09 Feb 2010) | 5 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/main.c * bwa-0.5.5-9 (r1301) * improve mapping quality calculation for abnomalous pairs * fixed a bug in multiple hits * SOLiD multiple hits should work now ------------------------------------------------------------------------ r1300 | lh3 | 2010-02-09 12:50:02 -0500 (Tue, 09 Feb 2010) | 3 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/main.c * bwa-0.5.5-8 (r1300) * output kurtosis ------------------------------------------------------------------------ r1299 | lh3 | 2010-02-09 12:33:34 -0500 (Tue, 09 Feb 2010) | 5 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/main.c * bwa-0.5.5-7 (r1299) * calculate skewness in sampe * increase min_len in SW to 20 * perform more SW to fix discordant pairs 
------------------------------------------------------------------------ r1298 | lh3 | 2010-02-08 12:40:31 -0500 (Mon, 08 Feb 2010) | 3 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/cs2nt.c M /branches/prog/bwa/main.c M /branches/prog/bwa/stdaln.h * bwa-0.5.5-6 (r1297) * prepare to replace all 16-bit CIGAR (patches by Rodrigo Goya) ------------------------------------------------------------------------ r1297 | lh3 | 2010-02-05 22:26:11 -0500 (Fri, 05 Feb 2010) | 2 lines Changed paths: M /branches/prog/bwa/solid2fastq.pl the old fix seems not working! ------------------------------------------------------------------------ r1296 | lh3 | 2010-02-05 21:51:03 -0500 (Fri, 05 Feb 2010) | 3 lines Changed paths: M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/bwape.c M /branches/prog/bwa/main.c * bwa-0.5.5-5 (r1296) * fixed a minor issue that the lower bound of insert size is not correctly set. 
------------------------------------------------------------------------ r1295 | lh3 | 2010-02-05 21:01:10 -0500 (Fri, 05 Feb 2010) | 5 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/bwaseqio.c M /branches/prog/bwa/main.c * bwa-0.5.5-4 (r1295) * fixed a memory leak * change the behaviour of -n (samse and sampe) * change the default of -n ------------------------------------------------------------------------ r1294 | lh3 | 2010-02-05 17:24:06 -0500 (Fri, 05 Feb 2010) | 3 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/bwaseqio.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/main.c * bwa-0.5.5-3 (r1294) * improved multi-hit report ------------------------------------------------------------------------ r1293 | lh3 | 2010-02-05 12:57:38 -0500 (Fri, 05 Feb 2010) | 5 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/cs2nt.c M /branches/prog/bwa/main.c M /branches/prog/bwa/solid2fastq.pl * bwa-0.5.5-2 (r1293) * bugfix: truncated quality string * bugfix: quality -1 in solid->fastq conversion * bugfix: color reads on the reverse strand is not complemented ------------------------------------------------------------------------ r1279 | lh3 | 2009-11-23 22:42:34 -0500 (Mon, 23 Nov 2009) | 3 lines Changed paths: M /branches/prog/bwa/bntseq.c M /branches/prog/bwa/bntseq.h M /branches/prog/bwa/bwase.c A /branches/prog/bwa/bwase.h M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/main.c * bwa-0.5.5-1 (r1279) * incorporate changes from Matt Hanna for Java bindings. 
------------------------------------------------------------------------ r1275 | lh3 | 2009-11-10 22:13:10 -0500 (Tue, 10 Nov 2009) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog update ChangeLog ------------------------------------------------------------------------ r1273 | lh3 | 2009-11-10 22:08:16 -0500 (Tue, 10 Nov 2009) | 2 lines Changed paths: M /branches/prog/bwa/NEWS M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/main.c A /branches/prog/bwa/qualfa2fq.pl Release bwa-0.5.5 (r1273) ------------------------------------------------------------------------ r1272 | lh3 | 2009-11-10 22:02:50 -0500 (Tue, 10 Nov 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/main.c * bwa-0.5.4-3 (r1272) * fixed another typo which may lead to incorrect single-end mapping quality ------------------------------------------------------------------------ r1271 | lh3 | 2009-11-10 21:59:47 -0500 (Tue, 10 Nov 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/main.c * bwa-0.5.4-2 (r1271) * fixed a serious typo/bug which does not hurt if we allow one gap open and work with <200bp reads, but causes segfault for long reads. 
------------------------------------------------------------------------ r1270 | lh3 | 2009-11-09 23:12:42 -0500 (Mon, 09 Nov 2009) | 3 lines Changed paths: M /branches/prog/bwa/cs2nt.c M /branches/prog/bwa/main.c * bwa-0.5.4-1 (r1270) * fixed a bug in color alignment ------------------------------------------------------------------------ r1245 | lh3 | 2009-10-09 07:42:52 -0400 (Fri, 09 Oct 2009) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/NEWS M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/bwaseqio.c M /branches/prog/bwa/main.c Release bwa-0.5.4 ------------------------------------------------------------------------ r1244 | lh3 | 2009-10-09 05:53:52 -0400 (Fri, 09 Oct 2009) | 5 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/bwaseqio.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/main.c M /branches/prog/bwa/stdaln.c * bwa-0.5.3-4 (r1244) * output the clipped length in XC:i: tag * skip mate alignment when stdaln is buggy * fixed a bug in NM:i: tag ------------------------------------------------------------------------ r1243 | lh3 | 2009-10-07 08:15:04 -0400 (Wed, 07 Oct 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/main.c * bwa-0.5.3-3 (r1243) * sampe: fixed a bug when a read sequence is identical its reverse complement. 
------------------------------------------------------------------------ r1242 | lh3 | 2009-10-07 07:49:13 -0400 (Wed, 07 Oct 2009) | 4 lines Changed paths: M /branches/prog/bwa/bntseq.c M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/main.c * bwa-0.5.3-2 (r1242) * sampe: optionall preload the full index into memory * aln: change the default seed length to 32bp ------------------------------------------------------------------------ r1238 | lh3 | 2009-09-26 18:38:15 -0400 (Sat, 26 Sep 2009) | 2 lines Changed paths: M /branches/prog/bwa/khash.h Improve portability of khash.h ------------------------------------------------------------------------ r1228 | lh3 | 2009-09-15 09:20:22 -0400 (Tue, 15 Sep 2009) | 2 lines Changed paths: M /branches/prog/bwa/main.c fixed a typo ------------------------------------------------------------------------ r1227 | lh3 | 2009-09-15 09:19:35 -0400 (Tue, 15 Sep 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2.h M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/main.c * bwa-0.5.3-1 (r1226) * in dBWT-SW, optionall use hard clipping instead of soft clipping ------------------------------------------------------------------------ r1225 | lh3 | 2009-09-15 08:32:30 -0400 (Tue, 15 Sep 2009) | 2 lines Changed paths: M /branches/prog/bwa/NEWS M /branches/prog/bwa/bwase.c M /branches/prog/bwa/main.c Release bwa-0.5.3 (r1225) ------------------------------------------------------------------------ r1223 | lh3 | 2009-09-13 07:30:41 -0400 (Sun, 13 Sep 2009) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/NEWS M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/main.c Release bwa-0.5.2 ------------------------------------------------------------------------ r1222 | lh3 | 2009-09-11 09:11:39 -0400 (Fri, 11 Sep 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/main.c * bwa-0.5.1-5 
(r1222) * fixed a typo. No real change ------------------------------------------------------------------------ r1221 | lh3 | 2009-09-11 09:09:44 -0400 (Fri, 11 Sep 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/bwaseqio.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/main.c * bwa-0.5.1-4 (r1221) * trim reads before alignment ------------------------------------------------------------------------ r1216 | lh3 | 2009-09-08 17:50:15 -0400 (Tue, 08 Sep 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/main.c * bwa-0.5.1-3 (r1216) * fixed a bug about NM tags for gapped alignment * print SAM header ------------------------------------------------------------------------ r1215 | lh3 | 2009-09-08 17:14:42 -0400 (Tue, 08 Sep 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/main.c * bwa-0.5.1-2 (r1215) * fixed a bug when read lengths vary (by John Marshall) ------------------------------------------------------------------------ r1213 | lh3 | 2009-09-06 18:58:15 -0400 (Sun, 06 Sep 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/main.c * bwa-0.5.1-1 (r1213) * change default -T to 30 ------------------------------------------------------------------------ r1209 | lh3 | 2009-09-02 06:06:02 -0400 (Wed, 02 Sep 2009) | 2 lines Changed paths: M /branches/prog/bwa/NEWS M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/main.c Release bwa-0.5.1 ------------------------------------------------------------------------ r1208 | lh3 | 2009-09-02 05:56:33 -0400 (Wed, 02 Sep 2009) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog * ChangeLog ------------------------------------------------------------------------ r1206 | lh3 | 2009-08-30 18:27:30 -0400 (Sun, 30 Aug 2009) | 3 lines Changed paths: M 
/branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/main.c * bwa-0.5.0-6 (r1206) * fixed two bugs caused by previous modification ------------------------------------------------------------------------ r1205 | lh3 | 2009-08-30 17:28:36 -0400 (Sun, 30 Aug 2009) | 4 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/main.c * bwa-0.5.0-4 (r1205) * reduce false coordinates and CIGAR when a query bridges two reference sequences, although some very rare cases may fail bwa. ------------------------------------------------------------------------ r1204 | lh3 | 2009-08-30 06:06:16 -0400 (Sun, 30 Aug 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/main.c * bwa-0.5.0-3 (r1204) * choose one repetitive hit to extend ------------------------------------------------------------------------ r1203 | lh3 | 2009-08-29 18:11:51 -0400 (Sat, 29 Aug 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwase.c M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/main.c * bwa-0.5.0-2 (r1203) * dBWT-SW: change a parameter in calculating mapping quality * fixed a bug in samse ------------------------------------------------------------------------ r1202 | lh3 | 2009-08-28 19:48:41 -0400 (Fri, 28 Aug 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/main.c * bwa-0.5.0-1 (r1202) * change default band width to 50 * improve mapping quality a bit ------------------------------------------------------------------------ r1200 | lh3 | 2009-08-20 06:21:24 -0400 (Thu, 20 Aug 2009) | 2 lines Changed paths: M /branches/prog/bwa/NEWS M /branches/prog/bwa/main.c Release bwa-0.5.0 (r1200) ------------------------------------------------------------------------ r1199 | lh3 | 2009-08-20 04:49:15 -0400 (Thu, 20 Aug 2009) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/bwa.1 Updated ChangeLog and the manual 
------------------------------------------------------------------------ r1198 | lh3 | 2009-08-19 11:09:15 -0400 (Wed, 19 Aug 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/main.c * bwa-0.4.9-36 (r1198) * simplify duphits removal. The accuracy is changed a tiny bit, sometimes better, sometimes worse. ------------------------------------------------------------------------ r1197 | lh3 | 2009-08-19 08:15:05 -0400 (Wed, 19 Aug 2009) | 3 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwtsw2_aux.c A /branches/prog/bwa/bwtsw2_chain.c M /branches/prog/bwa/main.c * bwa-0.4.9-35 (r1197) * further heuristic acceleration for long queries ------------------------------------------------------------------------ r1196 | lh3 | 2009-08-18 06:54:03 -0400 (Tue, 18 Aug 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/main.c * bwa-0.4.9-34 (r1196) * updated the manual page * output base quality if the input is fastq ------------------------------------------------------------------------ r1195 | lh3 | 2009-08-18 06:23:00 -0400 (Tue, 18 Aug 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwase.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/main.c M /branches/prog/bwa/simple_dp.c * bwa-0.4.9-33 (r1191) * fixed a bug in sampe/samse when gaps occur to the 5'-end in SW alignment * in dbwtsw adjust -T and -c according to -a ------------------------------------------------------------------------ r1192 | lh3 | 2009-08-13 05:37:28 -0400 (Thu, 13 Aug 2009) | 2 lines Changed paths: M /branches/prog/bwa/bwa.1 update manual ------------------------------------------------------------------------ r1191 | lh3 | 2009-08-12 19:40:51 -0400 (Wed, 12 Aug 2009) | 2 lines Changed paths: M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/bwtsw2_main.c update documentation 
------------------------------------------------------------------------ r1190 | lh3 | 2009-08-12 08:56:10 -0400 (Wed, 12 Aug 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/main.c * bwa-0.4.9-32 (r1190) * only help messages are changed ------------------------------------------------------------------------ r1189 | lh3 | 2009-08-11 09:28:55 -0400 (Tue, 11 Aug 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwase.c M /branches/prog/bwa/bwtsw2.h M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/main.c * bwa-0.4.9-31 (r1189) * in bwape/bwase, print CIGAR "*" if the read is unmapped * improved the calculation of mapping quality ------------------------------------------------------------------------ r1181 | lh3 | 2009-08-03 12:09:41 -0400 (Mon, 03 Aug 2009) | 2 lines Changed paths: M /branches/prog/bwa/bwtsw2_aux.c fflush() ------------------------------------------------------------------------ r1180 | lh3 | 2009-08-03 12:08:46 -0400 (Mon, 03 Aug 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/main.c * bwa-0.4.9-30 (r1180) * fixed a memory problem * multi-threading sometimes does not work... 
------------------------------------------------------------------------ r1179 | lh3 | 2009-08-03 11:04:39 -0400 (Mon, 03 Aug 2009) | 3 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwtsw2.h M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/main.c * bwa-0.4.9-29 (r1179) * preliminary mutli-threading support in dbwtsw ------------------------------------------------------------------------ r1178 | lh3 | 2009-08-03 09:14:54 -0400 (Mon, 03 Aug 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/main.c * bwa-0.4.9-28 (r1178) * fixed a bug in printing repetitive hits ------------------------------------------------------------------------ r1177 | lh3 | 2009-08-03 05:03:42 -0400 (Mon, 03 Aug 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/main.c * bwa-0.4.9-27 (r1177) * bwtsw2: fixed a hidden memory leak ------------------------------------------------------------------------ r1176 | lh3 | 2009-07-31 10:58:24 -0400 (Fri, 31 Jul 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/main.c * bwa-0.4.9-26 * change the way mapping quality is calculated ------------------------------------------------------------------------ r1175 | lh3 | 2009-07-31 09:15:54 -0400 (Fri, 31 Jul 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwtsw2.h M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/main.c * bwa-0.4.9-25 * code clean up * automatically adjust ->t and ->is_rev based on input ------------------------------------------------------------------------ r1174 | lh3 | 2009-07-30 08:50:25 -0400 (Thu, 30 Jul 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/main.c * bwa-0.4.9-24 * fixed a bug in printing the hits 
------------------------------------------------------------------------ r1173 | lh3 | 2009-07-29 18:32:43 -0400 (Wed, 29 Jul 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwtsw2.h M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/main.c * bwa-0.4.9-23 * allow to skip reverse alignment * increase opt->t to 37 ------------------------------------------------------------------------ r1172 | lh3 | 2009-07-29 17:22:39 -0400 (Wed, 29 Jul 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/main.c * bwa-0.4.9-22 * report if the hit is found in both directions ------------------------------------------------------------------------ r1171 | lh3 | 2009-07-29 17:12:02 -0400 (Wed, 29 Jul 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2.h M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/main.c * bwa-0.4.9-21 * dbwtsw: map to both forward and reverse BWT to reduce false alignment ------------------------------------------------------------------------ r1170 | lh3 | 2009-07-29 15:25:14 -0400 (Wed, 29 Jul 2009) | 2 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/main.c save hits before cut_tail() ------------------------------------------------------------------------ r1169 | lh3 | 2009-07-29 08:06:01 -0400 (Wed, 29 Jul 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2.h M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/main.c M /branches/prog/bwa/stdaln.c M /branches/prog/bwa/stdaln.h * bwa-0.4.9-19 * use a global memory pool to reduce the CPU time spent on malloc/free(). 
------------------------------------------------------------------------ r1168 | lh3 | 2009-07-29 06:13:29 -0400 (Wed, 29 Jul 2009) | 5 lines Changed paths: M /branches/prog/bwa/bwtsw2.h M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/main.c * bwa-0.4.9-18 * reduce unnecessary extension to the 5'-end * allow to use different interval size for the 2 rounds * change default parameters ------------------------------------------------------------------------ r1167 | lh3 | 2009-07-28 19:06:17 -0400 (Tue, 28 Jul 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/main.c * bwa-0.4.9-17 * dbwtsw: fixed THE memory leak. ------------------------------------------------------------------------ r1166 | lh3 | 2009-07-28 16:31:41 -0400 (Tue, 28 Jul 2009) | 5 lines Changed paths: M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/main.c M /branches/prog/bwa/stdaln.c * bwa-0.4.9-16 * fixed a memory leak * a small memory leak still occurs to bwtsw2_core(). I will work on that later. * changed the default parameters ------------------------------------------------------------------------ r1165 | lh3 | 2009-07-28 10:15:40 -0400 (Tue, 28 Jul 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/main.c M /branches/prog/bwa/stdaln.c * bwa-0.4.9-15 * generate CIGAR right before output. This saves unnecessary computation. * this version may be buggy as I have not tested it. 
------------------------------------------------------------------------ r1164 | lh3 | 2009-07-28 09:04:14 -0400 (Tue, 28 Jul 2009) | 11 lines Changed paths: M /branches/prog/bwa/bwtsw2.h M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/main.c M /branches/prog/bwa/stdaln.c M /branches/prog/bwa/stdaln.h * bwa-0.4.9-14 * deplete unique hits in dbwtsw and postprocess them with standard sw * in principle, this strategy should be faster and more accurate, but I have not tested this point. I may switch back to the old method if this does not work. * the code looks quite nasty now. it needs clean up... ------------------------------------------------------------------------ r1163 | lh3 | 2009-07-27 17:41:10 -0400 (Mon, 27 Jul 2009) | 2 lines Changed paths: M /branches/prog/bwa/bwtsw2_aux.c change a default parameter ------------------------------------------------------------------------ r1162 | lh3 | 2009-07-27 17:04:35 -0400 (Mon, 27 Jul 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2.h M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/main.c * bwa-0.4.9-13 * dbwtsw: switch between small and large Z-best ------------------------------------------------------------------------ r1161 | lh3 | 2009-07-27 12:17:41 -0400 (Mon, 27 Jul 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/main.c * bwa-0.4.9-12 * changed the default -z to 100 * heuristically speed up alignments for polyA reads ------------------------------------------------------------------------ r1160 | lh3 | 2009-07-27 07:50:57 -0400 (Mon, 27 Jul 2009) | 6 lines Changed paths: M /branches/prog/bwa/bwtsw2.h M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/main.c * bwa-0.4.9-11 * dbwtsw potentially generates less false alignments, although in practice, the modification brings no 
improvement. ------------------------------------------------------------------------ r1159 | lh3 | 2009-07-27 04:37:02 -0400 (Mon, 27 Jul 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwase.c M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/main.c * bwa-0.4.9-10 * disabled debugging code * add "BAM_FMU" if both ends are unmapped ------------------------------------------------------------------------ r1158 | lh3 | 2009-07-24 09:36:52 -0400 (Fri, 24 Jul 2009) | 2 lines Changed paths: M /branches/prog/bwa/main.c nothing, really ------------------------------------------------------------------------ r1157 | lh3 | 2009-07-24 09:05:44 -0400 (Fri, 24 Jul 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/main.c * bwa-0.4.9-9 * bwtsw2: generate SAM output ------------------------------------------------------------------------ r1156 | lh3 | 2009-07-24 05:42:47 -0400 (Fri, 24 Jul 2009) | 6 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/main.c * bwa-0.4.9-8 * fixed a weird deadloop which only happens to icc -O3. Thanks John Marshall for the fix. ------------------------------------------------------------------------ r1155 | lh3 | 2009-07-24 05:28:40 -0400 (Fri, 24 Jul 2009) | 8 lines Changed paths: M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/main.c * bwa-0.4.9-7 * fixed a typo in bwtsw2 alignment. Now score from the standard SW seems to agree with score from bwtsw2, except that in reporting alignments, bwtsw2 may report non-optimal segments. This is expected, though. I will improve in future. ------------------------------------------------------------------------ r1154 | lh3 | 2009-07-23 17:40:20 -0400 (Thu, 23 Jul 2009) | 3 lines Changed paths: M /branches/prog/bwa/stdaln.c M /branches/prog/bwa/stdaln.h * aln_left_core() seems to work properly * aln_local_core() has a bug... 
AN EVER EXISTING BUG!!!!!!!!!!! ------------------------------------------------------------------------ r1153 | lh3 | 2009-07-23 17:06:09 -0400 (Thu, 23 Jul 2009) | 2 lines Changed paths: M /branches/prog/bwa/stdaln.c removed debugging code... ------------------------------------------------------------------------ r1152 | lh3 | 2009-07-23 17:01:00 -0400 (Thu, 23 Jul 2009) | 3 lines Changed paths: M /branches/prog/bwa/stdaln.c * radical changes failed... * fixed a bug ------------------------------------------------------------------------ r1151 | lh3 | 2009-07-23 14:46:35 -0400 (Thu, 23 Jul 2009) | 2 lines Changed paths: M /branches/prog/bwa/stdaln.c temporary changes. Will apply some radical changes to this file... ------------------------------------------------------------------------ r1150 | lh3 | 2009-07-23 10:09:56 -0400 (Thu, 23 Jul 2009) | 2 lines Changed paths: M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/stdaln.c fixed a long-existing bug in Smith-Waterman alignment ------------------------------------------------------------------------ r1149 | lh3 | 2009-07-23 08:50:52 -0400 (Thu, 23 Jul 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2.h M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/main.c M /branches/prog/bwa/simple_dp.c M /branches/prog/bwa/stdaln.c M /branches/prog/bwa/stdaln.h * bwa-0.4.9-6 * unexplained inconsistency still occurs, but the results largely look reasonable. 
------------------------------------------------------------------------ r1148 | lh3 | 2009-07-23 08:07:29 -0400 (Thu, 23 Jul 2009) | 2 lines Changed paths: M /branches/prog/bwa/stdaln.c half DP ------------------------------------------------------------------------ r1147 | lh3 | 2009-07-22 08:03:06 -0400 (Wed, 22 Jul 2009) | 2 lines Changed paths: M /branches/prog/bwa/bwtsw2.h M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/bwtsw2_main.c a bit code clean up ------------------------------------------------------------------------ r1145 | lh3 | 2009-07-21 15:52:05 -0400 (Tue, 21 Jul 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/main.c * bwa-0.4.9-5 * fixed a bug in determining sub-optimal hits * removed some debugging codes ------------------------------------------------------------------------ r1144 | lh3 | 2009-07-21 10:17:29 -0400 (Tue, 21 Jul 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwtsw2.h M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/main.c * bwa-0.4.9-4 * better cmd interface * faster speed ------------------------------------------------------------------------ r1143 | lh3 | 2009-07-20 16:38:18 -0400 (Mon, 20 Jul 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2.h M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/main.c bwtsw2 (dBWT-SW) is working apparently... 
------------------------------------------------------------------------ r1139 | lh3 | 2009-07-15 05:52:18 -0400 (Wed, 15 Jul 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/main.c * bwa-0.4.9-2 * bwtsw2: change cut_tail() such that it is faster but more likely to miss true hits ------------------------------------------------------------------------ r1138 | lh3 | 2009-07-15 05:18:42 -0400 (Wed, 15 Jul 2009) | 3 lines Changed paths: M /branches/prog/bwa/Makefile A /branches/prog/bwa/bwt_lite.c A /branches/prog/bwa/bwt_lite.h A /branches/prog/bwa/bwtsw2.h A /branches/prog/bwa/bwtsw2_aux.c A /branches/prog/bwa/bwtsw2_core.c A /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/main.c M /branches/prog/bwa/main.h * bwa-0.4.9-1 * added back bwtsw2 ------------------------------------------------------------------------ r1075 | lh3 | 2009-05-19 05:14:50 -0400 (Tue, 19 May 2009) | 2 lines Changed paths: M /branches/prog/bwa/NEWS M /branches/prog/bwa/bwase.c M /branches/prog/bwa/main.c Release bwa-0.4.9 ------------------------------------------------------------------------ r1073 | lh3 | 2009-05-18 17:13:19 -0400 (Mon, 18 May 2009) | 2 lines Changed paths: M /branches/prog/bwa/NEWS M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/main.c Release bwa-0.4.8 ------------------------------------------------------------------------ r1069 | lh3 | 2009-05-14 09:54:54 -0400 (Thu, 14 May 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/main.c * bwa-0.4.7-2 * change the default of "aln -R" to 30 ------------------------------------------------------------------------ r1068 | lh3 | 2009-05-14 09:27:55 -0400 (Thu, 14 May 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/bwtgap.c M /branches/prog/bwa/main.c * bwa-0.4.7-1 * search for suboptimal hits if the top hit is not so repetitive 
------------------------------------------------------------------------ r1066 | lh3 | 2009-05-12 15:31:31 -0400 (Tue, 12 May 2009) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/NEWS M /branches/prog/bwa/bwase.c M /branches/prog/bwa/main.c Release bwa-0.4.7 ------------------------------------------------------------------------ r1065 | lh3 | 2009-05-12 15:20:40 -0400 (Tue, 12 May 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/main.c * bwa-0.4.6-9 * fixed compiling errors on some Linux machines ------------------------------------------------------------------------ r1064 | lh3 | 2009-05-12 07:30:46 -0400 (Tue, 12 May 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/main.c * bwa-0.4.6-8 * avoid compilation error on some systems. ------------------------------------------------------------------------ r1035 | lh3 | 2009-05-09 05:41:33 -0400 (Sat, 09 May 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/main.c * bwa-0.4.6-7 * fixed an integer overflow caused by previous modifications * made insert size estimation more robust ------------------------------------------------------------------------ r1008 | lh3 | 2009-04-29 05:41:58 -0400 (Wed, 29 Apr 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwase.c M /branches/prog/bwa/main.c * bwa-0.4.6-5 * fixed an integer overflow problem which may cause seg fault in very rare cases * made XN tags more accurate ------------------------------------------------------------------------ r1005 | lh3 | 2009-04-27 07:37:23 -0400 (Mon, 27 Apr 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/main.c M /branches/prog/bwa/simple_dp.c M /branches/prog/bwa/stdaln.c M /branches/prog/bwa/stdaln.h * bwa-0.4.6-4 * heuristic rules to detect suboptimal alignment * stdsw: support double-strand and protein alignment 
------------------------------------------------------------------------ r1003 | lh3 | 2009-04-26 12:48:19 -0400 (Sun, 26 Apr 2009) | 4 lines Changed paths: M /branches/prog/bwa/main.c M /branches/prog/bwa/simple_dp.c M /branches/prog/bwa/stdaln.c M /branches/prog/bwa/stdaln.h * bwa-0.4.6-2 * improve the functionality of stdsw * allow to add a threshold on SW alignment. Hope this does not incur new bugs... ------------------------------------------------------------------------ r1002 | lh3 | 2009-04-22 03:56:15 -0400 (Wed, 22 Apr 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/main.c * bwa-0.4.6-1 * output SM and AM tag ------------------------------------------------------------------------ r914 | lh3 | 2009-03-09 17:53:50 -0400 (Mon, 09 Mar 2009) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/NEWS M /branches/prog/bwa/main.c Release bwa-0.4.6 ------------------------------------------------------------------------ r913 | lh3 | 2009-03-09 17:23:24 -0400 (Mon, 09 Mar 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/bwape.c A /branches/prog/bwa/solid2fastq.pl * added notes to bwa * added a script to convert SOLiD reads * updated documentations ------------------------------------------------------------------------ r912 | lh3 | 2009-03-09 16:57:05 -0400 (Mon, 09 Mar 2009) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/kstring.c M /branches/prog/bwa/main.c fixed a bug in kstring ------------------------------------------------------------------------ r881 | lh3 | 2009-03-02 15:36:06 -0500 (Mon, 02 Mar 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwtmisc.c M /branches/prog/bwa/main.c * bwa-0.4.5-7 * fixed a bug in pac2cspac ------------------------------------------------------------------------ r880 | lh3 | 2009-03-01 16:34:08 -0500 (Sun, 01 Mar 2009) | 2 
lines Changed paths: M /branches/prog/bwa/Makefile disable debugging ------------------------------------------------------------------------ r879 | lh3 | 2009-03-01 16:28:04 -0500 (Sun, 01 Mar 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwase.c M /branches/prog/bwa/cs2nt.c M /branches/prog/bwa/main.c * bwa-0.4.5-6 * fixed problems with coordinates for color gapped alignment ------------------------------------------------------------------------ r878 | lh3 | 2009-03-01 13:43:09 -0500 (Sun, 01 Mar 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwase.c M /branches/prog/bwa/cs2nt.c M /branches/prog/bwa/main.c * bwa-0.4.5-5 * added support for gapped color alignment ------------------------------------------------------------------------ r877 | lh3 | 2009-03-01 10:27:52 -0500 (Sun, 01 Mar 2009) | 2 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/cs2nt.c M /branches/prog/bwa/main.c * convert cs read to nt read (for ungapped alignment only) ------------------------------------------------------------------------ r860 | lh3 | 2009-02-27 08:58:39 -0500 (Fri, 27 Feb 2009) | 2 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwase.c A /branches/prog/bwa/cs2nt.c prepare to implement cs->nt conversion (have not yet...) 
------------------------------------------------------------------------ r859 | lh3 | 2009-02-27 07:00:03 -0500 (Fri, 27 Feb 2009) | 3 lines Changed paths: M /branches/prog/bwa/bntseq.c M /branches/prog/bwa/bntseq.h M /branches/prog/bwa/bwtindex.c M /branches/prog/bwa/bwtmisc.c M /branches/prog/bwa/main.c M /branches/prog/bwa/main.h * bwa-0.4.5-3 * generate color index from nucleotide fasta reference ------------------------------------------------------------------------ r857 | lh3 | 2009-02-26 10:22:58 -0500 (Thu, 26 Feb 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/main.c * bwa-0.4.5-2 * improved mapping quality a bit if one end falls in a tandem repeat but the mate is unique. ------------------------------------------------------------------------ r856 | lh3 | 2009-02-26 10:02:29 -0500 (Thu, 26 Feb 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/main.c * bwa-0.4.5-1 * make bwa work for SOLiD reads ------------------------------------------------------------------------ r828 | lh3 | 2009-02-18 17:36:41 -0500 (Wed, 18 Feb 2009) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/NEWS M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/main.c Release bwa-0.4.5 ------------------------------------------------------------------------ r827 | lh3 | 2009-02-18 16:48:48 -0500 (Wed, 18 Feb 2009) | 3 lines Changed paths: M /branches/prog/bwa/main.c M /branches/prog/bwa/stdaln.c M /branches/prog/bwa/stdaln.h * bwa-0.4.4-6 * fixed a bug in SW alignment when no residue matches ------------------------------------------------------------------------ r824 | lh3 | 2009-02-17 05:33:07 -0500 (Tue, 17 Feb 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/main.c * bwa-0.4.4-5 * fixed that boundary bug ------------------------------------------------------------------------ r823 | lh3 | 2009-02-17 04:54:18 -0500 (Tue, 17 Feb 2009) | 2 
lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/bwape.c just change some logging information ------------------------------------------------------------------------ r822 | lh3 | 2009-02-17 04:20:39 -0500 (Tue, 17 Feb 2009) | 2 lines Changed paths: M /branches/prog/bwa/bwa.1 update manual ------------------------------------------------------------------------ r821 | lh3 | 2009-02-17 04:11:14 -0500 (Tue, 17 Feb 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/main.c * bwa-0.4.4-4 * fixed a bug on boundary check in pair_sw ------------------------------------------------------------------------ r820 | lh3 | 2009-02-16 17:43:37 -0500 (Mon, 16 Feb 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/main.c * bwa-0.4.4-3 * allow to change mismatch penalty ------------------------------------------------------------------------ r819 | lh3 | 2009-02-16 17:40:28 -0500 (Mon, 16 Feb 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/main.c * bwa-0.4.4-2 * remove timer * allow to change default gapo and gape penalty at the command line ------------------------------------------------------------------------ r818 | lh3 | 2009-02-16 09:30:51 -0500 (Mon, 16 Feb 2009) | 2 lines Changed paths: M /branches/prog/bwa/bwa.1 update benchmark ------------------------------------------------------------------------ r817 | lh3 | 2009-02-16 08:44:40 -0500 (Mon, 16 Feb 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/kvec.h M /branches/prog/bwa/main.c * bwa-0.4.4-1 * automatically detect insert size * use insert size in pairing. This may potentially improve accuracy (untested!) 
------------------------------------------------------------------------ r814 | lh3 | 2009-02-15 11:10:23 -0500 (Sun, 15 Feb 2009) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/NEWS M /branches/prog/bwa/main.c Release bwa-0.4.4 ------------------------------------------------------------------------ r813 | lh3 | 2009-02-15 10:22:50 -0500 (Sun, 15 Feb 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/bwase.c M /branches/prog/bwa/main.c * bwa-0.4.3-5 * impose boundary check in refine_gapped ------------------------------------------------------------------------ r811 | lh3 | 2009-02-14 09:46:13 -0500 (Sat, 14 Feb 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwase.c M /branches/prog/bwa/main.c * bwa-0.4.3-4 * change MD tag to match the latest SAM specification ------------------------------------------------------------------------ r810 | lh3 | 2009-02-13 04:46:04 -0500 (Fri, 13 Feb 2009) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog update ChangeLog ------------------------------------------------------------------------ r799 | lh3 | 2009-02-05 12:01:17 -0500 (Thu, 05 Feb 2009) | 2 lines Changed paths: M /branches/prog/bwa/bwase.c M /branches/prog/bwa/main.c change MD tag to meet the latest SAM specification ------------------------------------------------------------------------ r796 | lh3 | 2009-02-05 08:35:13 -0500 (Thu, 05 Feb 2009) | 3 lines Changed paths: M /branches/prog/bwa/bntseq.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/main.c * bwa-0.4.3-2 * fixed a bug on counting 'N' ------------------------------------------------------------------------ r795 | lh3 | 2009-02-05 07:41:27 -0500 (Thu, 05 Feb 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/bwape.c M /branches/prog/bwa/main.c * bwa-0.4.3-1 * fixed potential boundary problems * update benchmark result ------------------------------------------------------------------------ r791 | lh3 | 
2009-01-25 05:20:47 -0500 (Sun, 25 Jan 2009) | 2 lines Changed paths: M /branches/prog/bwa/bwa.1 update some numbers ------------------------------------------------------------------------ r790 | lh3 | 2009-01-24 15:13:03 -0500 (Sat, 24 Jan 2009) | 2 lines Changed paths: M /branches/prog/bwa/bwa.1 update benchmark ------------------------------------------------------------------------ r789 | lh3 | 2009-01-22 10:18:44 -0500 (Thu, 22 Jan 2009) | 2 lines Changed paths: M /branches/prog/bwa/bwtindex.c a warning message for index ------------------------------------------------------------------------ r788 | lh3 | 2009-01-22 09:54:06 -0500 (Thu, 22 Jan 2009) | 2 lines Changed paths: M /branches/prog/bwa/main.c forget to change release number ------------------------------------------------------------------------ r786 | lh3 | 2009-01-22 06:27:39 -0500 (Thu, 22 Jan 2009) | 2 lines Changed paths: M /branches/prog/bwa/NEWS Release bwa-0.4.3 ------------------------------------------------------------------------ r785 | lh3 | 2009-01-22 06:27:16 -0500 (Thu, 22 Jan 2009) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/NEWS Release bwa-0.4.3 ------------------------------------------------------------------------ r784 | lh3 | 2009-01-22 06:19:59 -0500 (Thu, 22 Jan 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/bwase.c M /branches/prog/bwa/main.c * bwa-0.4.2-10 * update documentation * fixed a bug on generating MD tags for SW alignment ------------------------------------------------------------------------ r782 | lh3 | 2009-01-19 12:08:38 -0500 (Mon, 19 Jan 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwase.c M /branches/prog/bwa/main.c * bwa-0.4.2-9 * fixed a bug in samse -n... 
------------------------------------------------------------------------ r781 | lh3 | 2009-01-19 11:26:37 -0500 (Mon, 19 Jan 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/main.c * bwa-0.4.2-8 * given -N, the previous version would stop if the top hit is a repeat. Now changed. ------------------------------------------------------------------------ r780 | lh3 | 2009-01-19 11:20:18 -0500 (Mon, 19 Jan 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/bwtgap.c M /branches/prog/bwa/main.c * bwa-0.4.2-7 * use a bit-wise flag to replace some member variables in the option struct * allow to switch off the iterative strategy ------------------------------------------------------------------------ r779 | lh3 | 2009-01-19 10:45:57 -0500 (Mon, 19 Jan 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/bwase.c M /branches/prog/bwa/main.c * bwa-0.4.2-6 * allow to dump multiple hits from samse, in another format, though ------------------------------------------------------------------------ r778 | lh3 | 2009-01-19 06:24:29 -0500 (Mon, 19 Jan 2009) | 5 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/bwaseqio.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/kseq.h A /branches/prog/bwa/kstring.c A /branches/prog/bwa/kstring.h M /branches/prog/bwa/main.c M /branches/prog/bwa/simple_dp.c * bwa-0.4.2-5 * update kseq.h to the latest version * generate MD tag * print mate coordinate if only one end is unmapped ------------------------------------------------------------------------ r775 | lh3 | 2009-01-18 05:40:35 -0500 (Sun, 18 Jan 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwase.c M /branches/prog/bwa/main.c * bwa-0.4.2-4 * fixed a bug for SAM format 
------------------------------------------------------------------------ r774 | lh3 | 2009-01-17 13:48:52 -0500 (Sat, 17 Jan 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/main.c * bwa-0.4.2-3 * change default fnr to 0.04 * print max_diff for valid fnr ------------------------------------------------------------------------ r773 | lh3 | 2009-01-17 05:54:37 -0500 (Sat, 17 Jan 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/main.c * bwa-0.4.2-2 * automatically choose max_diff ------------------------------------------------------------------------ r772 | lh3 | 2009-01-16 18:16:14 -0500 (Fri, 16 Jan 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwaseqio.c M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/bwtgap.c M /branches/prog/bwa/main.c * bwa-0.4.2-1 * take N as a mismatch ------------------------------------------------------------------------ r768 | lh3 | 2009-01-09 11:57:23 -0500 (Fri, 09 Jan 2009) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/NEWS M /branches/prog/bwa/bntseq.c M /branches/prog/bwa/main.c Release bwa-0.4.2 ------------------------------------------------------------------------ r759 | lh3 | 2009-01-07 09:55:43 -0500 (Wed, 07 Jan 2009) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/NEWS M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/main.c Release bwa-0.4.1 ------------------------------------------------------------------------ r758 | lh3 | 2009-01-07 05:36:06 -0500 (Wed, 07 Jan 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/main.c * bwa-0.4.0-2 * make mate_sw fully working 
------------------------------------------------------------------------ r757 | lh3 | 2009-01-06 18:04:29 -0500 (Tue, 06 Jan 2009) | 5 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/bwaseqio.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/main.c * bwa-0.4.0-1 * do SW alignment for unmapped mate. It is working. * I still need to do some extra work for SW alignment, but it is too late and I am getting tired... I will do tomorrow. ------------------------------------------------------------------------ r755 | lh3 | 2009-01-06 10:23:29 -0500 (Tue, 06 Jan 2009) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/NEWS M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/main.c Release bwa-0.4.0 ------------------------------------------------------------------------ r754 | lh3 | 2009-01-06 07:45:02 -0500 (Tue, 06 Jan 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtgap.c M /branches/prog/bwa/bwtgap.h M /branches/prog/bwa/main.c * bwa-0.3.0-12 * better lock ------------------------------------------------------------------------ r753 | lh3 | 2009-01-06 06:17:21 -0500 (Tue, 06 Jan 2009) | 5 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwaseqio.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/bwtgap.c M /branches/prog/bwa/main.c * bwa-0.3.0-11 * fixed a small memory leak in bwa_seq_close() * fixed "uninitialized memory" from bwt_aln1_t * multithreading for "aln" command ------------------------------------------------------------------------ r752 | lh3 | 2009-01-05 17:34:13 -0500 (Mon, 05 Jan 2009) | 3 lines Changed paths: M /branches/prog/bwa/Makefile D /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwt_gen/bwt_gen.c A /branches/prog/bwa/bwtmisc.c (from /branches/prog/bwa/pac2bwt.c:748) M /branches/prog/bwa/main.c M /branches/prog/bwa/main.h D /branches/prog/bwa/pac2bwt.c * bwa-0.3.0-10 * a little 
bit code clean up ------------------------------------------------------------------------ r751 | lh3 | 2009-01-05 17:19:04 -0500 (Mon, 05 Jan 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwt.c M /branches/prog/bwa/main.c * bwa-0.3.0-9 * use 64-bit integer to speed up Occ calculate, although just a little bit ------------------------------------------------------------------------ r750 | lh3 | 2009-01-05 16:44:26 -0500 (Mon, 05 Jan 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwt.c M /branches/prog/bwa/main.c * bwa-0.3.0-8 * a little bit code cleanup ------------------------------------------------------------------------ r749 | lh3 | 2009-01-05 16:37:28 -0500 (Mon, 05 Jan 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwt.c M /branches/prog/bwa/main.c * bwa-0.1.0-7 * accelerate Occ calculation ------------------------------------------------------------------------ r748 | lh3 | 2009-01-05 16:12:28 -0500 (Mon, 05 Jan 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwt.h M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtindex.c M /branches/prog/bwa/bwtio.c M /branches/prog/bwa/main.c M /branches/prog/bwa/main.h M /branches/prog/bwa/pac2bwt.c * bwa-0.3.0-6 * put occ table along with bwt to save another cache miss * this version is already faster than the previous and I can still improve it... ------------------------------------------------------------------------ r747 | lh3 | 2009-01-05 10:16:18 -0500 (Mon, 05 Jan 2009) | 5 lines Changed paths: M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwt.h M /branches/prog/bwa/bwtio.c M /branches/prog/bwa/main.c * bwa-0.3.0-5 * remove occ_major to save a cache miss; however, OCC_INTERVAL has to be increased to keep the same memory. As a result, the speed is a little slower in fact. 
------------------------------------------------------------------------ r746 | lh3 | 2009-01-05 09:50:53 -0500 (Mon, 05 Jan 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwt.c M /branches/prog/bwa/main.c * bwa-0.3.0-4 * added back optimization codes (it is a pain...) ------------------------------------------------------------------------ r745 | lh3 | 2009-01-05 08:23:00 -0500 (Mon, 05 Jan 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/main.c * bwa-0.3.0-3 * faster bit operations ------------------------------------------------------------------------ r744 | lh3 | 2009-01-05 05:58:46 -0500 (Mon, 05 Jan 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwt.c M /branches/prog/bwa/main.c * bwa-0.3.0-2 * removed optimization codes again... * use a new method to count the bits ------------------------------------------------------------------------ r743 | lh3 | 2009-01-04 17:18:38 -0500 (Sun, 04 Jan 2009) | 5 lines Changed paths: M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/bwtgap.c M /branches/prog/bwa/main.c * bwa-0.3.0-1 * added back the optimization codes * added a new option to aln: max_entries, although this is disabled by default * updated benchmark ------------------------------------------------------------------------ r742 | lh3 | 2009-01-04 07:56:12 -0500 (Sun, 04 Jan 2009) | 2 lines Changed paths: M /branches/prog/bwa/bwa.1 add URL ------------------------------------------------------------------------ r740 | lh3 | 2009-01-04 07:39:43 -0500 (Sun, 04 Jan 2009) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/NEWS M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/main.c Release bwa-0.3.0 ------------------------------------------------------------------------ r739 | lh3 | 2009-01-04 06:55:06 -0500 (Sun, 04 Jan 2009) | 2 lines Changed paths: A /branches/prog/bwa/COPYING M 
/branches/prog/bwa/ChangeLog M /branches/prog/bwa/bntseq.c M /branches/prog/bwa/bntseq.h M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwt.h M /branches/prog/bwa/bwtindex.c M /branches/prog/bwa/utils.c M /branches/prog/bwa/utils.h added licensing information ------------------------------------------------------------------------ r738 | lh3 | 2009-01-04 06:18:25 -0500 (Sun, 04 Jan 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/bwape.c M /branches/prog/bwa/main.c * bwa-0.2.0-31 * better mapping quality * update benchmark ------------------------------------------------------------------------ r737 | lh3 | 2009-01-03 16:00:58 -0500 (Sat, 03 Jan 2009) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/bwa.1 update documentation ------------------------------------------------------------------------ r736 | lh3 | 2009-01-02 10:26:38 -0500 (Fri, 02 Jan 2009) | 2 lines Changed paths: M /branches/prog/bwa/bwa.1 update documentation ------------------------------------------------------------------------ r735 | lh3 | 2009-01-02 07:10:20 -0500 (Fri, 02 Jan 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/main.c * bwa-0.2.0-30 * reduce memory a little bit * update documentation ------------------------------------------------------------------------ r734 | lh3 | 2009-01-01 13:45:45 -0500 (Thu, 01 Jan 2009) | 8 lines Changed paths: M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/bwtgap.c M /branches/prog/bwa/main.c * bwa-0.2.0-29 * sampe: removed -O option; changed default -o to 100000 * sampe: fixed a bug in calculating paired mapping quality * aln: added an option to search for suboptimal hits even if the best is a repeat. This option will make sampe MUCH SLOWER. 
* sampe: set isize as zero if mapped to two different chr * update manual (unfinished) ------------------------------------------------------------------------ r733 | lh3 | 2009-01-01 11:01:20 -0500 (Thu, 01 Jan 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/main.c * bwa-0.2.0-28 * fixed a bug in calculating paired mapping quality ------------------------------------------------------------------------ r732 | lh3 | 2009-01-01 09:27:46 -0500 (Thu, 01 Jan 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwape.c A /branches/prog/bwa/khash.h (from /branches/prog/sclib/khash/khash.h:675) M /branches/prog/bwa/main.c * bwa-0.2.0-27 * accelerate sampe by storing visited large intervals ------------------------------------------------------------------------ r731 | lh3 | 2009-01-01 06:51:21 -0500 (Thu, 01 Jan 2009) | 3 lines Changed paths: M /branches/prog/bwa/bwt.c M /branches/prog/bwa/main.c * bwa-0.2.0-26 * remove the optimization codes ------------------------------------------------------------------------ r730 | lh3 | 2009-01-01 06:48:59 -0500 (Thu, 01 Jan 2009) | 4 lines Changed paths: M /branches/prog/bwa/bwt.c M /branches/prog/bwa/main.c * bwa-0.2.0-25 * accelerate OCC calculation by ~7%. However, it does not seem worth complicating the codes for this. I will change back later. 
------------------------------------------------------------------------ r729 | lh3 | 2008-12-31 16:43:56 -0500 (Wed, 31 Dec 2008) | 6 lines Changed paths: M /branches/prog/bwa/bntseq.c M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/main.c * bwa-0.2.0-24 * change command "sai2sam_pe" to "sampe" * print usage for sampe command * in sampe: change default max_occ to 1000 * fixed a few compiling warnings in bntseq.c ------------------------------------------------------------------------ r728 | lh3 | 2008-12-27 07:14:59 -0500 (Sat, 27 Dec 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/main.c * bwa-0.2.0-22 * mating information can be printed to SAM ------------------------------------------------------------------------ r727 | lh3 | 2008-12-26 18:10:59 -0500 (Fri, 26 Dec 2008) | 4 lines Changed paths: M /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/bwaseqio.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/main.c * bwa-0.2.0-21 * implement pairing (still UNFINISHED) * output all reads even if full of N ------------------------------------------------------------------------ r726 | lh3 | 2008-12-26 13:31:27 -0500 (Fri, 26 Dec 2008) | 5 lines Changed paths: M /branches/prog/bwa/Makefile A /branches/prog/bwa/bwape.c M /branches/prog/bwa/bwase.c M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/bwtgap.c M /branches/prog/bwa/main.c M /branches/prog/bwa/main.h * bwa-0.2.0-20 * remove "-t" from aln cmd * code clean up: move some functions in bwt2fmv.c to other source files * added sai2sam_pe cmd: *UNFINISHED* ------------------------------------------------------------------------ r725 | lh3 | 2008-12-26 07:04:11 -0500 (Fri, 26 Dec 2008) | 3 lines Changed paths: M /branches/prog/bwa/Makefile A /branches/prog/bwa/bwase.c A /branches/prog/bwa/bwaseqio.c M 
/branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/bwtgap.c M /branches/prog/bwa/kseq.h A /branches/prog/bwa/ksort.h (from /branches/prog/sclib/ksort/ksort.h:712) A /branches/prog/bwa/kvec.h (from /branches/prog/sclib/kvec/kvec.h:537) M /branches/prog/bwa/main.c * bwa-0.2.0-19 * considerable code cleanup; no actual changes ------------------------------------------------------------------------ r724 | lh3 | 2008-12-25 11:32:11 -0500 (Thu, 25 Dec 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/main.c * bwa-0.2.0-18 * generate SAM output ------------------------------------------------------------------------ r723 | lh3 | 2008-12-25 10:48:31 -0500 (Thu, 25 Dec 2008) | 4 lines Changed paths: M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/main.c M /branches/prog/bwa/main.h * bwa-0.2.0-17 * remove bwtsw2 related codes * separate searching for SA interval from generating alignments ------------------------------------------------------------------------ r722 | lh3 | 2008-12-25 08:57:13 -0500 (Thu, 25 Dec 2008) | 3 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwt2fmv.c D /branches/prog/bwa/bwt_lite.c D /branches/prog/bwa/bwt_lite.h M /branches/prog/bwa/bwtgap.c D /branches/prog/bwa/bwtsw2.h D /branches/prog/bwa/bwtsw2_aux.c D /branches/prog/bwa/bwtsw2_core.c D /branches/prog/bwa/bwtsw2_main.c D /branches/prog/bwa/khash.h D /branches/prog/bwa/ksort.h D /branches/prog/bwa/kvec.h M /branches/prog/bwa/main.c * added interface to "aln -t" * remove bwtsw2 related codes ------------------------------------------------------------------------ r666 | lh3 | 2008-11-18 18:34:29 -0500 (Tue, 18 Nov 2008) | 4 lines Changed paths: M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/main.c * bwa-0.2.0-16 * allow to set max mismatches based on read 
length, but I do not know whether this really works ------------------------------------------------------------------------ r665 | lh3 | 2008-11-18 08:34:03 -0500 (Tue, 18 Nov 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/main.c * bwa-0.2.0-15 * fixed a bug in sequence parser. ------------------------------------------------------------------------ r612 | lh3 | 2008-10-28 06:50:53 -0400 (Tue, 28 Oct 2008) | 3 lines Changed paths: M /branches/prog/bwa/bntseq.c M /branches/prog/bwa/bwtindex.c M /branches/prog/bwa/main.c M /branches/prog/bwa/utils.c * bwa-0.2.0-14 * fixed a bug caused by the change of the FASTA/Q parser ------------------------------------------------------------------------ r611 | lh3 | 2008-10-28 06:24:56 -0400 (Tue, 28 Oct 2008) | 2 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bntseq.c M /branches/prog/bwa/bntseq.h M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtsw2_core.c A /branches/prog/bwa/kseq.h D /branches/prog/bwa/seq.c D /branches/prog/bwa/seq.h M /branches/prog/bwa/simple_dp.c M /branches/prog/bwa/utils.c M /branches/prog/bwa/utils.h replace seq.* with kseq.h ------------------------------------------------------------------------ r610 | lh3 | 2008-10-27 13:00:04 -0400 (Mon, 27 Oct 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2.h M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/main.c * bwa-0.2.0-13 * make bwtsw2 output sub-optimal hits. not completed ------------------------------------------------------------------------ r609 | lh3 | 2008-10-24 16:52:00 -0400 (Fri, 24 Oct 2008) | 2 lines Changed paths: M /branches/prog/bwa/kvec.h little... 
------------------------------------------------------------------------ r532 | lh3 | 2008-09-19 05:28:45 -0400 (Fri, 19 Sep 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/khash.h improve interface of khash ------------------------------------------------------------------------ r531 | lh3 | 2008-09-18 06:52:59 -0400 (Thu, 18 Sep 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c improve minor things, which make bwtsw2 slower, but should miss less true hits ------------------------------------------------------------------------ r530 | lh3 | 2008-09-17 18:19:26 -0400 (Wed, 17 Sep 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c * fixed a bug in calculating ->D * enforce band-width checking ------------------------------------------------------------------------ r529 | lh3 | 2008-09-17 18:06:49 -0400 (Wed, 17 Sep 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c delete a line of code that is never visited ------------------------------------------------------------------------ r528 | lh3 | 2008-09-17 17:58:51 -0400 (Wed, 17 Sep 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c a bit code clean up ------------------------------------------------------------------------ r527 | lh3 | 2008-09-17 10:55:45 -0400 (Wed, 17 Sep 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/main.c * bwa-0.2.0-12 * max-depth can be set, although it does not help the speed at all ------------------------------------------------------------------------ r526 | lh3 | 2008-09-16 17:59:36 -0400 (Tue, 16 Sep 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c cut_tail after remove duplicate ------------------------------------------------------------------------ r525 | lh3 | 2008-09-16 17:56:11 -0400 (Tue, 16 Sep 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2.h M /branches/prog/bwa/bwtsw2_aux.c M 
/branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/khash.h M /branches/prog/bwa/main.c * bwa-0.2.0-11 * improved cut_tail() ------------------------------------------------------------------------ r524 | lh3 | 2008-09-15 16:53:22 -0400 (Mon, 15 Sep 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/main.c * bwa-0.2.0-10 * fixed a bug in cut_tail() ------------------------------------------------------------------------ r518 | lh3 | 2008-09-15 04:35:59 -0400 (Mon, 15 Sep 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c a bit code clean up ------------------------------------------------------------------------ r517 | lh3 | 2008-09-14 18:18:11 -0400 (Sun, 14 Sep 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c improve speed (<1%) ------------------------------------------------------------------------ r516 | lh3 | 2008-09-14 18:08:55 -0400 (Sun, 14 Sep 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c * fixed two potential bugs, although I have not seen their effects * improve speed a bit (<2%) ------------------------------------------------------------------------ r515 | lh3 | 2008-09-14 17:26:49 -0400 (Sun, 14 Sep 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/bwtsw2_main.c nothing, really ------------------------------------------------------------------------ r514 | lh3 | 2008-09-14 17:10:13 -0400 (Sun, 14 Sep 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c disable X-drop, which has to be reimplemented in the current algorithm ------------------------------------------------------------------------ r513 | lh3 | 2008-09-14 16:49:42 -0400 (Sun, 14 Sep 2008) | 4 lines Changed paths: M /branches/prog/bwa/bwt_lite.c M /branches/prog/bwa/bwt_lite.h M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/bwtsw2_main.c * 
temporarily disable cut_tail() * calculate SA in bwt_lite.c * fixed a bug in reversing the sequence ------------------------------------------------------------------------ r512 | lh3 | 2008-09-13 17:35:40 -0400 (Sat, 13 Sep 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwtsw2.h M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/bwtsw2_main.c A /branches/prog/bwa/ksort.h n-best method ------------------------------------------------------------------------ r507 | lh3 | 2008-09-13 09:06:54 -0400 (Sat, 13 Sep 2008) | 2 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwtsw2_core.c give correct result again ------------------------------------------------------------------------ r506 | lh3 | 2008-09-13 08:12:07 -0400 (Sat, 13 Sep 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c I think I know the reason. It needs more work... ------------------------------------------------------------------------ r505 | lh3 | 2008-09-13 06:20:43 -0400 (Sat, 13 Sep 2008) | 2 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwtsw2_core.c fixed another bug, but still have ------------------------------------------------------------------------ r504 | lh3 | 2008-09-12 18:13:37 -0400 (Fri, 12 Sep 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c fixed another bug ------------------------------------------------------------------------ r503 | lh3 | 2008-09-12 17:15:56 -0400 (Fri, 12 Sep 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/khash.h * do not segfault, but the result is WRONG! 
* prepare to remove bsw2_connectivity_check() ------------------------------------------------------------------------ r502 | lh3 | 2008-09-12 15:52:41 -0400 (Fri, 12 Sep 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/kvec.h more revisions ------------------------------------------------------------------------ r501 | lh3 | 2008-09-11 18:06:15 -0400 (Thu, 11 Sep 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c further simplify codes with kvec.h ------------------------------------------------------------------------ r500 | lh3 | 2008-09-11 17:42:15 -0400 (Thu, 11 Sep 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c part of revisions... have not finished ------------------------------------------------------------------------ r499 | lh3 | 2008-09-11 17:24:15 -0400 (Thu, 11 Sep 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwtsw2.h M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/khash.h A /branches/prog/bwa/kvec.h prepare for abrupt change ------------------------------------------------------------------------ r496 | lh3 | 2008-09-11 10:34:38 -0400 (Thu, 11 Sep 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c fixed a bug; now "bwtsw2 -d" is useless ------------------------------------------------------------------------ r495 | lh3 | 2008-09-11 09:22:03 -0400 (Thu, 11 Sep 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/simple_dp.c M /branches/prog/bwa/stdaln.c M /branches/prog/bwa/stdaln.h improve speed a little bit ------------------------------------------------------------------------ r494 | lh3 | 2008-09-11 08:28:08 -0400 (Thu, 11 Sep 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c remove debug codes ------------------------------------------------------------------------ r493 | lh3 | 2008-09-11 07:49:53 -0400 (Thu, 11 Sep 2008) | 3 lines Changed paths: M 
/branches/prog/bwa/bwtsw2_core.c * improve the speed a little bit (<5%) * prepare to remove BSW_DEBUG ------------------------------------------------------------------------ r492 | lh3 | 2008-09-11 06:15:56 -0400 (Thu, 11 Sep 2008) | 4 lines Changed paths: M /branches/prog/bwa/bwtsw2.h M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/main.c * bwa-0.2.0-9 * support reverse strand * fixed a bug that causes missing hits ------------------------------------------------------------------------ r491 | lh3 | 2008-09-11 05:46:16 -0400 (Thu, 11 Sep 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/main.c * bwa-0.2.0-8 * better progress report ------------------------------------------------------------------------ r490 | lh3 | 2008-09-10 17:04:49 -0400 (Wed, 10 Sep 2008) | 4 lines Changed paths: M /branches/prog/bwa/bwtsw2.h M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/main.c * bwa-0.2.0-7 * avoid some missing hits * add maximum depth ------------------------------------------------------------------------ r489 | lh3 | 2008-09-10 11:51:13 -0400 (Wed, 10 Sep 2008) | 4 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/main.c * bwa-0.2.0-6 * bwtsw2 works although on the forward strand only for now * better progress information ------------------------------------------------------------------------ r488 | lh3 | 2008-09-10 10:21:53 -0400 (Wed, 10 Sep 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c * implement memory pool * avoid some rehashing ------------------------------------------------------------------------ r487 | lh3 | 2008-09-10 09:23:38 -0400 (Wed, 10 Sep 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2.h M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_main.c * fixed a memory leak * prepare to implement 
mempool ------------------------------------------------------------------------ r486 | lh3 | 2008-09-10 09:10:09 -0400 (Wed, 10 Sep 2008) | 4 lines Changed paths: M /branches/prog/bwa/bwtsw2.h M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/khash.h * add X-dropoff * remove duplicated results * switch to simple stack ------------------------------------------------------------------------ r485 | lh3 | 2008-09-10 06:31:20 -0400 (Wed, 10 Sep 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwtsw2.h M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/bwtsw2_main.c * check whether t-node has been visited * prepare to remove two-level stack ------------------------------------------------------------------------ r484 | lh3 | 2008-09-10 05:00:57 -0400 (Wed, 10 Sep 2008) | 2 lines Changed paths: A /branches/prog/bwa/khash.h khash library ------------------------------------------------------------------------ r483 | lh3 | 2008-09-10 04:22:53 -0400 (Wed, 10 Sep 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c add inline ------------------------------------------------------------------------ r482 | lh3 | 2008-09-09 16:34:57 -0400 (Tue, 09 Sep 2008) | 2 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c improve speed ------------------------------------------------------------------------ r481 | lh3 | 2008-09-09 13:13:00 -0400 (Tue, 09 Sep 2008) | 4 lines Changed paths: M /branches/prog/bwa/bwtsw2_core.c Use a 128bit hash table to keep all (tk,tl,qk,ql). This is slow. Just keep a copy in case I may need this in future. 
------------------------------------------------------------------------ r480 | lh3 | 2008-09-09 12:53:32 -0400 (Tue, 09 Sep 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwtsw2.h M /branches/prog/bwa/bwtsw2_core.c * no principal modification ------------------------------------------------------------------------ r479 | lh3 | 2008-09-09 11:01:45 -0400 (Tue, 09 Sep 2008) | 4 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwtsw2_core.c * fixed a bug which may cause duplicated matching * accelerate the speed a bit, although using hash in avoiding duplications slows the speed down in the end ------------------------------------------------------------------------ r474 | lh3 | 2008-09-03 17:22:57 -0400 (Wed, 03 Sep 2008) | 4 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwtsw2.h M /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/main.c * bwa-0.2.0-5 * indel seems to work on toy example * add band ------------------------------------------------------------------------ r469 | lh3 | 2008-09-01 09:18:45 -0400 (Mon, 01 Sep 2008) | 3 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwt_lite.c M /branches/prog/bwa/bwt_lite.h M /branches/prog/bwa/bwtgap.c M /branches/prog/bwa/bwtsw2.h A /branches/prog/bwa/bwtsw2_aux.c M /branches/prog/bwa/bwtsw2_core.c M /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/is.c M /branches/prog/bwa/main.c M /branches/prog/bwa/main.h M /branches/prog/bwa/simple_dp.c * bwa-0.2.0-4 * updated bwtsw2, which seems to work properly on toy examples ------------------------------------------------------------------------ r447 | lh3 | 2008-08-27 10:05:09 -0400 (Wed, 27 Aug 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/bwtgap.c M /branches/prog/bwa/main.c * bwa-0.2.0-3 * tune 
for longer gaps, but it does not really work with kilo-bp gaps... ------------------------------------------------------------------------ r446 | lh3 | 2008-08-26 13:30:41 -0400 (Tue, 26 Aug 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/bwtgap.c M /branches/prog/bwa/main.c * bwa-0.2.0-2 * changed the way to extend long deletions. Now use max_del_occ. ------------------------------------------------------------------------ r445 | lh3 | 2008-08-26 13:05:58 -0400 (Tue, 26 Aug 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwt_lite.c M /branches/prog/bwa/bwt_lite.h updated from bwtsw2_lite ------------------------------------------------------------------------ r436 | lh3 | 2008-08-23 12:28:44 -0400 (Sat, 23 Aug 2008) | 4 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwt.h A /branches/prog/bwa/bwt_lite.c A /branches/prog/bwa/bwt_lite.h A /branches/prog/bwa/bwtsw2.h A /branches/prog/bwa/bwtsw2_core.c A /branches/prog/bwa/bwtsw2_main.c M /branches/prog/bwa/main.c * bwa-0.2.0-1 * add bwt_lite: a light-weighted version of bwt (NOT TESTED!) * add core codes for bwtsw2: NOT TESTED!!! ------------------------------------------------------------------------ r427 | lh3 | 2008-08-15 05:38:12 -0400 (Fri, 15 Aug 2008) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/NEWS M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/main.c Release bwa-0.2.0 ------------------------------------------------------------------------ r426 | lh3 | 2008-08-14 11:26:19 -0400 (Thu, 14 Aug 2008) | 4 lines Changed paths: M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/main.c * bwa-0.1.6-7 * change default seed length to 31 * add incomplete support to color sequences (not tested yet!) 
------------------------------------------------------------------------ r425 | lh3 | 2008-08-14 06:23:11 -0400 (Thu, 14 Aug 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/main.c * bwa-0.1.6-6 * change default seed length to 33bp ------------------------------------------------------------------------ r424 | lh3 | 2008-08-14 05:55:33 -0400 (Thu, 14 Aug 2008) | 6 lines Changed paths: M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtgap.c M /branches/prog/bwa/main.c * bwa-0.1.6-5 * fixed a bug that may miss true alignments. this bug exists in most early versions. * fixed a bug that yields wrong coordinates for reads mapped on the forward strands with gaps. ------------------------------------------------------------------------ r423 | lh3 | 2008-08-14 04:07:28 -0400 (Thu, 14 Aug 2008) | 2 lines Changed paths: D /branches/prog/bwa/Makefile.div useless ------------------------------------------------------------------------ r422 | lh3 | 2008-08-13 19:21:14 -0400 (Wed, 13 Aug 2008) | 4 lines Changed paths: M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/main.c * bwa-0.1.6-4 * fixed one bug * there is another one... ------------------------------------------------------------------------ r421 | lh3 | 2008-08-13 18:23:33 -0400 (Wed, 13 Aug 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/bwtgap.c M /branches/prog/bwa/bwtgap.h M /branches/prog/bwa/bwtindex.c M /branches/prog/bwa/main.c * bwa-0.1.6-3 * almost there, but not quite right ------------------------------------------------------------------------ r419 | lh3 | 2008-08-13 17:27:02 -0400 (Wed, 13 Aug 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/bwtgap.c M /branches/prog/bwa/bwtgap.h M /branches/prog/bwa/main.c * improve the seeding method * prepare to load two BWTs into memory. A BIG change! 
------------------------------------------------------------------------ r418 | lh3 | 2008-08-13 10:56:54 -0400 (Wed, 13 Aug 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/bwtgap.c M /branches/prog/bwa/bwtgap.h M /branches/prog/bwa/main.c * added seeding * unfinished yet ------------------------------------------------------------------------ r413 | lh3 | 2008-08-08 11:48:35 -0400 (Fri, 08 Aug 2008) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/NEWS M /branches/prog/bwa/main.c Release bwa-0.1.6 ------------------------------------------------------------------------ r410 | lh3 | 2008-08-06 15:48:22 -0400 (Wed, 06 Aug 2008) | 2 lines Changed paths: M /branches/prog/bwa/simple_dp.c sw: output alignment score ------------------------------------------------------------------------ r407 | lh3 | 2008-08-04 10:01:20 -0400 (Mon, 04 Aug 2008) | 4 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/main.c M /branches/prog/bwa/main.h A /branches/prog/bwa/simple_dp.c M /branches/prog/bwa/stdaln.c M /branches/prog/bwa/stdaln.h * bwa-0.1.5-3 * added a simple interface to SW/NW alignment * stdaln-0.9.8 (see header for more details) ------------------------------------------------------------------------ r406 | lh3 | 2008-08-01 19:21:59 -0400 (Fri, 01 Aug 2008) | 3 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/main.c A /branches/prog/bwa/stdaln.c A /branches/prog/bwa/stdaln.h * bwa-0.1.5-2 * give accurate gap positions ------------------------------------------------------------------------ r405 | lh3 | 2008-08-01 19:06:19 -0400 (Fri, 01 Aug 2008) | 2 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h unfinished, but I am tired... 
------------------------------------------------------------------------ r401 | lh3 | 2008-07-30 05:59:24 -0400 (Wed, 30 Jul 2008) | 4 lines Changed paths: M /branches/prog/bwa/bntseq.c M /branches/prog/bwa/main.c * bwa-0.1.5-1 * fixed a potential bug which may produce an alignment in N regions, although extremely rare. ------------------------------------------------------------------------ r399 | lh3 | 2008-07-27 11:41:52 -0400 (Sun, 27 Jul 2008) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/NEWS M /branches/prog/bwa/main.c Release bwa-0.1.5 ------------------------------------------------------------------------ r398 | lh3 | 2008-07-25 12:14:47 -0400 (Fri, 25 Jul 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwa.1 update documentation ------------------------------------------------------------------------ r397 | lh3 | 2008-07-25 09:58:56 -0400 (Fri, 25 Jul 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/main.c * ------------------------------------------------------------------------ r396 | lh3 | 2008-07-25 06:42:01 -0400 (Fri, 25 Jul 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/main.c * bwa-0.1.4-4 * add timer for debugging ------------------------------------------------------------------------ r395 | lh3 | 2008-07-24 05:46:21 -0400 (Thu, 24 Jul 2008) | 4 lines Changed paths: M /branches/prog/bwa/bwtgap.c M /branches/prog/bwa/main.c * bwa-0.1.4-3 * fixed a bug in the previous code * this version gives identical result to bwa-0.1.4, just 10% faster ------------------------------------------------------------------------ r394 | lh3 | 2008-07-24 05:18:53 -0400 (Thu, 24 Jul 2008) | 4 lines Changed paths: M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtgap.c M /branches/prog/bwa/bwtgap.h M /branches/prog/bwa/main.c * bwa-0.1.4-2 * further improve the speed * The result is slightly different from bwa-0.1.4 now. 
I need to check... ------------------------------------------------------------------------ r393 | lh3 | 2008-07-23 12:04:16 -0400 (Wed, 23 Jul 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwt.c comments only ------------------------------------------------------------------------ r392 | lh3 | 2008-07-23 10:34:03 -0400 (Wed, 23 Jul 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwt.c M /branches/prog/bwa/main.c further improve the speed in Occ functions ------------------------------------------------------------------------ r386 | lh3 | 2008-07-22 10:03:54 -0400 (Tue, 22 Jul 2008) | 2 lines Changed paths: M /branches/prog/bwa/NEWS M /branches/prog/bwa/main.c Release bwa-0.1.4 ------------------------------------------------------------------------ r385 | lh3 | 2008-07-22 09:44:50 -0400 (Tue, 22 Jul 2008) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/bwa.1 update documentation and ChangeLog ------------------------------------------------------------------------ r384 | lh3 | 2008-07-22 08:50:03 -0400 (Tue, 22 Jul 2008) | 4 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/bwtgap.c M /branches/prog/bwa/main.c * bwa-0.1.3-2 * fixed the bug in the last modification * now the alignment should be more clearly defined ------------------------------------------------------------------------ r383 | lh3 | 2008-07-21 18:32:21 -0400 (Mon, 21 Jul 2008) | 4 lines Changed paths: M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtgap.c M /branches/prog/bwa/main.c * bwa-0.1.3-1 * this is a buggy version! * I will fix the bug tomorrow. It is late... 
------------------------------------------------------------------------ r381 | lh3 | 2008-07-21 06:45:32 -0400 (Mon, 21 Jul 2008) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/NEWS M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/main.c Release bwa-0.1.3 ------------------------------------------------------------------------ r380 | lh3 | 2008-07-21 06:07:43 -0400 (Mon, 21 Jul 2008) | 4 lines Changed paths: M /branches/prog/bwa/ChangeLog M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/main.c * bwa-0.1.2-3 * improve the speed for gcc on Intel Mac OS X, but not really on icc on Linux * aln: more command-line options ------------------------------------------------------------------------ r373 | lh3 | 2008-07-17 09:09:46 -0400 (Thu, 17 Jul 2008) | 4 lines Changed paths: M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwt.h M /branches/prog/bwa/bwtio.c M /branches/prog/bwa/main.c * bwa-0.1.2-2 * further improve the speed * this version gives exactly the same result as bwa-0.1.2 ------------------------------------------------------------------------ r372 | lh3 | 2008-07-17 07:51:08 -0400 (Thu, 17 Jul 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwt.c M /branches/prog/bwa/main.c * bwa-0.1.2-1 * speed up by about 5% ------------------------------------------------------------------------ r370 | lh3 | 2008-07-17 05:12:00 -0400 (Thu, 17 Jul 2008) | 2 lines Changed paths: M /branches/prog/bwa/NEWS M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/main.c Release bwa-0.1.2 ------------------------------------------------------------------------ r368 | lh3 | 2008-07-16 08:51:25 -0400 (Wed, 16 Jul 2008) | 4 lines Changed paths: M /branches/prog/bwa/Makefile D /branches/prog/bwa/bwt1away.c M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/bwtgap.c M /branches/prog/bwa/bwtgap.h D /branches/prog/bwa/bwttop2.c M 
/branches/prog/bwa/main.c * bwa-0.1.1-9 * some code cleanup * remove 1away and top2 ------------------------------------------------------------------------ r367 | lh3 | 2008-07-16 08:24:34 -0400 (Wed, 16 Jul 2008) | 2 lines Changed paths: M /branches/prog/bwa/is.c Yuta Mori's implementation of IS algorithm. ------------------------------------------------------------------------ r365 | lh3 | 2008-07-16 06:58:04 -0400 (Wed, 16 Jul 2008) | 6 lines Changed paths: M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtgap.c M /branches/prog/bwa/bwtgap.h M /branches/prog/bwa/main.c * bwa-0.1.1-8 * improve gapped alignment * this version will miss more gapped alignments, but the speed is much faster * prepare to remove top2 and 1away algorithms * prepare to add SAIS algorithm for bwt construction ------------------------------------------------------------------------ r358 | lh3 | 2008-06-09 06:03:04 -0400 (Mon, 09 Jun 2008) | 4 lines Changed paths: M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwtgap.c M /branches/prog/bwa/main.c * bwa-0.1.1-7 * change END_SKIP from 3 to 5, but still gaps may be wrongly added * change default '-g' from 5 to 3 ------------------------------------------------------------------------ r357 | lh3 | 2008-06-09 05:18:36 -0400 (Mon, 09 Jun 2008) | 3 lines Changed paths: M /branches/prog/bwa/bntseq.c M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwtgap.c M /branches/prog/bwa/main.c * bwa-0.1.1-6 * fix a bug in nested stack ------------------------------------------------------------------------ r356 | lh3 | 2008-06-08 18:43:13 -0400 (Sun, 08 Jun 2008) | 4 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/bwtgap.c A /branches/prog/bwa/bwtgap.h M /branches/prog/bwa/main.c * bwa-0.1.1-5 * replace heap with nested stacks * there are still obvious bugs... 
------------------------------------------------------------------------ r355 | lh3 | 2008-06-08 17:13:44 -0400 (Sun, 08 Jun 2008) | 4 lines Changed paths: M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/bwtgap.c M /branches/prog/bwa/main.c M /branches/prog/bwa/main.h * bwa-0.1.1-4 * add interface to affine gap alignment * there are obvious bugs and I will fix them later ------------------------------------------------------------------------ r354 | lh3 | 2008-06-08 15:39:05 -0400 (Sun, 08 Jun 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/bwtgap.c M /branches/prog/bwa/main.c * bwa-0.1.1-3 * affine gap seems to work, at least partially ------------------------------------------------------------------------ r353 | lh3 | 2008-06-08 09:27:18 -0400 (Sun, 08 Jun 2008) | 3 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h A /branches/prog/bwa/bwtgap.c M /branches/prog/bwa/bwttop2.c M /branches/prog/bwa/main.c * bwa-0.1.1-2 * initial gapped alignment. 
not work at the moment ------------------------------------------------------------------------ r352 | lh3 | 2008-06-06 04:37:34 -0400 (Fri, 06 Jun 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwttop2.c M /branches/prog/bwa/main.c * bwa-0.1.1-1 * ungap: remove a useless varible in top2_entry_t ------------------------------------------------------------------------ r348 | lh3 | 2008-06-03 09:04:12 -0400 (Tue, 03 Jun 2008) | 2 lines Changed paths: M /branches/prog/bwa/ChangeLog A /branches/prog/bwa/NEWS M /branches/prog/bwa/bwa.1 M /branches/prog/bwa/main.c Release bwa-0.1.1 ------------------------------------------------------------------------ r347 | lh3 | 2008-06-03 05:45:08 -0400 (Tue, 03 Jun 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwa.1 update documentation ------------------------------------------------------------------------ r346 | lh3 | 2008-06-02 18:59:50 -0400 (Mon, 02 Jun 2008) | 5 lines Changed paths: A /branches/prog/bwa/ChangeLog A /branches/prog/bwa/bwa.1 M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/main.c * bwa-0.1.0-11 * improve approximating mapping qualities * add documentation * add ChangeLog ------------------------------------------------------------------------ r345 | lh3 | 2008-06-02 16:04:39 -0400 (Mon, 02 Jun 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwttop2.c M /branches/prog/bwa/main.c * bwa-0.1.0-10 * output a random position for repetitive reads ------------------------------------------------------------------------ r344 | lh3 | 2008-06-02 15:03:54 -0400 (Mon, 02 Jun 2008) | 4 lines Changed paths: M /branches/prog/bwa/bntseq.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/main.c M /branches/prog/bwa/pac2bwt.c * bwa-0.1.0-9 * fix memory leaks * fix a potential bug in coverting to the real coordinate ------------------------------------------------------------------------ r343 | lh3 | 2008-06-02 13:44:51 -0400 (Mon, 02 Jun 2008) | 5 lines Changed paths: M 
/branches/prog/bwa/Makefile.div M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwt.h M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwttop2.c M /branches/prog/bwa/main.c * bwa-0.1.0-8 * fix a bug about strand * update Makefile.div * change top2b as the default method ------------------------------------------------------------------------ r342 | lh3 | 2008-06-02 11:23:26 -0400 (Mon, 02 Jun 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwt1away.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/main.c * bwa-0.1.0-7 * use bwt_2occ() and bwt_2occ4() in other functions ------------------------------------------------------------------------ r341 | lh3 | 2008-06-02 09:31:39 -0400 (Mon, 02 Jun 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwttop2.c M /branches/prog/bwa/main.c * bwa-0.1.0-6 * fix a bug for missing hits ------------------------------------------------------------------------ r340 | lh3 | 2008-06-02 09:10:18 -0400 (Mon, 02 Jun 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwttop2.c M /branches/prog/bwa/main.c * bwa-0.1.0-5 * accelerate comparisons in heap, a bit ------------------------------------------------------------------------ r339 | lh3 | 2008-06-02 08:41:31 -0400 (Mon, 02 Jun 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwt.h M /branches/prog/bwa/bwttop2.c M /branches/prog/bwa/main.c * bwa-0.1.0-4 * avoid marginal repeated calculation in occ ------------------------------------------------------------------------ r338 | lh3 | 2008-06-02 06:46:51 -0400 (Mon, 02 Jun 2008) | 5 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwttop2.c M /branches/prog/bwa/main.c * bwa-0.1.0-3 * fix a bug caused by previours change * fix a bug in heap * order the heap by more criteria ------------------------------------------------------------------------ r337 | lh3 | 2008-06-01 19:11:15 -0400 (Sun, 01 
Jun 2008) | 4 lines Changed paths: M /branches/prog/bwa/bwttop2.c M /branches/prog/bwa/main.c * bwa-0.1.0-2 * also sort sa range in heapsort, in attempt to improve cache performance. Unfortunately, it does not work well at all. ------------------------------------------------------------------------ r336 | lh3 | 2008-06-01 17:45:23 -0400 (Sun, 01 Jun 2008) | 3 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/Makefile.div M /branches/prog/bwa/bntseq.c M /branches/prog/bwa/main.c * 0.1.0-1 * fix a bug in calculating the real coordinate ------------------------------------------------------------------------ r335 | lh3 | 2008-06-01 16:03:09 -0400 (Sun, 01 Jun 2008) | 2 lines Changed paths: M /branches/prog/bwa/Makefile nothing, really ------------------------------------------------------------------------ r334 | lh3 | 2008-06-01 15:59:13 -0400 (Sun, 01 Jun 2008) | 2 lines Changed paths: M /branches/prog/bwa/Makefile A /branches/prog/bwa/Makefile.div M /branches/prog/bwa/bwtindex.c M /branches/prog/bwa/pac2bwt.c use IS algorithm by default ------------------------------------------------------------------------ r333 | lh3 | 2008-06-01 15:05:15 -0400 (Sun, 01 Jun 2008) | 3 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwtindex.c M /branches/prog/bwa/is.c M /branches/prog/bwa/pac2bwt.c * a bit code clean up in is.c * add IS algorithm for constructing BWT, albeit slower ------------------------------------------------------------------------ r332 | lh3 | 2008-06-01 13:23:08 -0400 (Sun, 01 Jun 2008) | 2 lines Changed paths: A /branches/prog/bwa/is.c IS linear-time algorithm for constructing SA/BWT ------------------------------------------------------------------------ r331 | lh3 | 2008-06-01 10:35:26 -0400 (Sun, 01 Jun 2008) | 3 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bntseq.c A /branches/prog/bwa/bwtindex.c M /branches/prog/bwa/main.c M /branches/prog/bwa/main.h * fix a bug in generating 
.pac * index in one go ------------------------------------------------------------------------ r330 | lh3 | 2008-06-01 09:17:05 -0400 (Sun, 01 Jun 2008) | 2 lines Changed paths: M /branches/prog/bwa/bntseq.c M /branches/prog/bwa/bntseq.h M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwttop2.c real coordinates can be ouput ------------------------------------------------------------------------ r329 | lh3 | 2008-05-31 19:21:02 -0400 (Sat, 31 May 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/bwttop2.c add top2e which is similar to 1away ------------------------------------------------------------------------ r328 | lh3 | 2008-05-31 18:46:12 -0400 (Sat, 31 May 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/bwttop2.c M /branches/prog/bwa/main.c M /branches/prog/bwa/main.h * unified cmd-line interface for ungapped alignment * add two alternatives to top2 algorithm ------------------------------------------------------------------------ r327 | lh3 | 2008-05-31 18:14:46 -0400 (Sat, 31 May 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/main.c M /branches/prog/bwa/main.h add cmd-line interface to alntop2 ------------------------------------------------------------------------ r326 | lh3 | 2008-05-31 17:59:31 -0400 (Sat, 31 May 2008) | 2 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwt1away.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h A /branches/prog/bwa/bwttop2.c top2 algorithm seems to work. 
I need to change interface, though ------------------------------------------------------------------------ r325 | lh3 | 2008-05-31 15:11:49 -0400 (Sat, 31 May 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwt1away.c change the variable in the structure ------------------------------------------------------------------------ r324 | lh3 | 2008-05-31 14:52:13 -0400 (Sat, 31 May 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwt1away.c set a slightly better bound on the maximum allowed mismatches ------------------------------------------------------------------------ r323 | lh3 | 2008-05-30 18:40:21 -0400 (Fri, 30 May 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwtaln.c * output time statistics ------------------------------------------------------------------------ r322 | lh3 | 2008-05-30 17:58:25 -0400 (Fri, 30 May 2008) | 4 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwt.h A /branches/prog/bwa/bwt1away.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h * presumably better way to make use of prefix. But for the moment I do not know whether it is correct or not. 
* a bit code clean up: separate alignment part ------------------------------------------------------------------------ r321 | lh3 | 2008-05-30 13:57:43 -0400 (Fri, 30 May 2008) | 3 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwt.h M /branches/prog/bwa/bwt_gen/Makefile M /branches/prog/bwa/bwt_gen/bwt_gen.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/main.c M /branches/prog/bwa/main.h M /branches/prog/bwa/pac2bwt.c * a bit code clean up * put bwt_gen in bwa ------------------------------------------------------------------------ r320 | lh3 | 2008-05-30 11:40:11 -0400 (Fri, 30 May 2008) | 4 lines Changed paths: M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/bwtio.c * improve cmd-line interface * fix a bug in loading .sa * change default sa interval to 32 ------------------------------------------------------------------------ r319 | lh3 | 2008-05-30 10:31:37 -0400 (Fri, 30 May 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwtaln.c * fix memory leak (I know that. 
Just a bit lazy) * change to another method to do 1-away alignment ------------------------------------------------------------------------ r318 | lh3 | 2008-05-30 09:21:49 -0400 (Fri, 30 May 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwt.h M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/main.c M /branches/prog/bwa/main.h best unique match is partially finished ------------------------------------------------------------------------ r317 | lh3 | 2008-05-30 06:33:28 -0400 (Fri, 30 May 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/main.c M /branches/prog/bwa/main.h remove "ungapped" command and related codes ------------------------------------------------------------------------ r316 | lh3 | 2008-05-30 06:05:20 -0400 (Fri, 30 May 2008) | 2 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h change variable name thick to width ------------------------------------------------------------------------ r315 | lh3 | 2008-05-29 19:06:13 -0400 (Thu, 29 May 2008) | 2 lines Changed paths: M /branches/prog/bwa/bntseq.c M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwt.h M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/bwtio.c M /branches/prog/bwa/main.c M /branches/prog/bwa/main.h M /branches/prog/bwa/pac2bwt.c revised algorithm for ungapped alignment. the old one can still be used. 
------------------------------------------------------------------------ r314 | lh3 | 2008-05-29 16:36:11 -0400 (Thu, 29 May 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwt_gen/bwt_gen.c M /branches/prog/bwa/bwtio.c M /branches/prog/bwa/pac2bwt.c * make commands more independent, but ungapped does not work at the moment ------------------------------------------------------------------------ r313 | lh3 | 2008-05-29 15:56:14 -0400 (Thu, 29 May 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwt_gen/bwt_gen.c little... ------------------------------------------------------------------------ r312 | lh3 | 2008-05-29 15:54:01 -0400 (Thu, 29 May 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwt_gen/bwt_gen.c M /branches/prog/bwa/bwt_gen/bwt_gen.h * add CopyRight information from the original codes * do not dump .fmv files ------------------------------------------------------------------------ r311 | lh3 | 2008-05-29 15:44:36 -0400 (Thu, 29 May 2008) | 2 lines Changed paths: A /branches/prog/bwa/bwt_gen A /branches/prog/bwa/bwt_gen/Makefile A /branches/prog/bwa/bwt_gen/QSufSort.c A /branches/prog/bwa/bwt_gen/QSufSort.h A /branches/prog/bwa/bwt_gen/bwt_gen.c A /branches/prog/bwa/bwt_gen/bwt_gen.h codes from BWT-SW, for building BWT from packed file ------------------------------------------------------------------------ r310 | lh3 | 2008-05-28 17:03:35 -0400 (Wed, 28 May 2008) | 4 lines Changed paths: M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwt.h M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwtio.c M /branches/prog/bwa/main.c M /branches/prog/bwa/main.h * change OCC_INTERVAL to 0x40, which makes bwa twice as fast. 
* write Occ file as ".occ" as it is using a different interval from .fmv, the BWT-SW correspondance of .occ ------------------------------------------------------------------------ r309 | lh3 | 2008-05-28 11:39:37 -0400 (Wed, 28 May 2008) | 2 lines Changed paths: M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwt2fmv.c fix a bug ------------------------------------------------------------------------ r308 | lh3 | 2008-05-28 09:56:16 -0400 (Wed, 28 May 2008) | 4 lines Changed paths: M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwt2fmv.c add heuristics to improve the speed, but I have not tested whether the results are correct or not. ------------------------------------------------------------------------ r307 | lh3 | 2008-05-28 06:31:34 -0400 (Wed, 28 May 2008) | 5 lines Changed paths: M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/bwtaln.c M /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/main.c M /branches/prog/bwa/main.h * make ungapped alignment basically works... * but it is very slow in comparison to others... * also I need to improve the interface... * a lot of things to keep me busy today... ------------------------------------------------------------------------ r306 | lh3 | 2008-05-27 18:41:27 -0400 (Tue, 27 May 2008) | 3 lines Changed paths: M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwt.h M /branches/prog/bwa/bwtaln.c * remove recursion * fixed a bug in bwt_occ() ------------------------------------------------------------------------ r305 | lh3 | 2008-05-27 16:59:44 -0400 (Tue, 27 May 2008) | 5 lines Changed paths: M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwt.h M /branches/prog/bwa/bwtaln.c * bwa now tells whether a sequenced can be mapped with maximum allowed mismatches. ONLY ungapped. * this is a recursive version. I will remove recursion later. 
------------------------------------------------------------------------ r304 | lh3 | 2008-05-27 09:12:17 -0400 (Tue, 27 May 2008) | 3 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwt.h M /branches/prog/bwa/bwt2fmv.c A /branches/prog/bwa/bwtaln.c A /branches/prog/bwa/bwtaln.h M /branches/prog/bwa/bwtio.c M /branches/prog/bwa/main.c M /branches/prog/bwa/main.h M /branches/prog/bwa/utils.c * load .sa and .fmv files * exact alignment now works ------------------------------------------------------------------------ r303 | lh3 | 2008-05-27 06:33:38 -0400 (Tue, 27 May 2008) | 2 lines Changed paths: M /branches/prog/bwa/bntseq.c M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwtio.c M /branches/prog/bwa/utils.c M /branches/prog/bwa/utils.h add xassert and fix a bug ------------------------------------------------------------------------ r302 | lh3 | 2008-05-27 06:23:20 -0400 (Tue, 27 May 2008) | 2 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bntseq.c M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwtio.c A /branches/prog/bwa/utils.c A /branches/prog/bwa/utils.h improve error message and error handling ------------------------------------------------------------------------ r301 | lh3 | 2008-05-27 05:37:51 -0400 (Tue, 27 May 2008) | 4 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwt.h M /branches/prog/bwa/bwt2fmv.c A /branches/prog/bwa/bwtio.c M /branches/prog/bwa/main.c M /branches/prog/bwa/main.h * move I/O codes to bwtio.c * SA can be dumped and interestingly, it is identical to BWTSW * now, .fmv is still different from BWTSW ------------------------------------------------------------------------ r299 | lh3 | 2008-05-26 18:07:44 -0400 (Mon, 26 May 2008) | 2 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwt.h M /branches/prog/bwa/bwt2fmv.c generate/retrieve SA and Occ 
------------------------------------------------------------------------ r298 | lh3 | 2008-05-26 13:16:49 -0400 (Mon, 26 May 2008) | 3 lines Changed paths: M /branches/prog/bwa/bntseq.h M /branches/prog/bwa/bwt.c M /branches/prog/bwa/bwt.h M /branches/prog/bwa/bwt2fmv.c * retrieve occ value at any position * move bwt_cal_occ() to bwt.c ------------------------------------------------------------------------ r297 | lh3 | 2008-05-25 17:43:58 -0400 (Sun, 25 May 2008) | 6 lines Changed paths: M /branches/prog/bwa/Makefile A /branches/prog/bwa/bwt.c A /branches/prog/bwa/bwt.h A /branches/prog/bwa/bwt2fmv.c M /branches/prog/bwa/main.c M /branches/prog/bwa/main.h M /branches/prog/bwa/pac2bwt.c * add bwt2fmv. It works to some extend. However, I do not understand the purpose of some weird codes in BWT-SW. As a consequence, bwt2fmv could generate a file almost identical, but not exactly identical, to the .fmv file from BWT-SW. ------------------------------------------------------------------------ r296 | lh3 | 2008-05-24 18:35:02 -0400 (Sat, 24 May 2008) | 5 lines Changed paths: M /branches/prog/bwa/Makefile M /branches/prog/bwa/bntseq.c M /branches/prog/bwa/bntseq.h M /branches/prog/bwa/main.c M /branches/prog/bwa/main.h A /branches/prog/bwa/pac2bwt.c Burrows-Wheeler Transform now works. At least on one example, the current code generates the same BWT as BWT-SW. Kind of magical, I would say. 
:) ------------------------------------------------------------------------ r295 | lh3 | 2008-05-24 11:25:31 -0400 (Sat, 24 May 2008) | 3 lines Changed paths: A /branches/prog/bwa/Makefile M /branches/prog/bwa/bntseq.c A /branches/prog/bwa/main.c A /branches/prog/bwa/main.h * add Makefile and main.* * improve interface to fa2bns, a bit ------------------------------------------------------------------------ r293 | lh3 | 2008-05-24 10:57:03 -0400 (Sat, 24 May 2008) | 3 lines Changed paths: A /branches/prog/bwa A /branches/prog/bwa/bntseq.c A /branches/prog/bwa/bntseq.h A /branches/prog/bwa/seq.c A /branches/prog/bwa/seq.h * Burrow-Wheeler Alignment * initial codes ------------------------------------------------------------------------ bwa-0.7.17/Makefile000066400000000000000000000062441317342117100140250ustar00rootroot00000000000000CC= gcc #CC= clang --analyze CFLAGS= -g -Wall -Wno-unused-function -O2 WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS AR= ar DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o \ QSufSort.o bwt_gen.o rope.o rle.o is.o bwtindex.o AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ bwape.o kopen.o pemerge.o maxk.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o PROG= bwa INCLUDES= LIBS= -lm -lz -lpthread SUBDIRS= . ifeq ($(shell uname -s),Linux) LIBS += -lrt endif .SUFFIXES:.c .o .cc .c.o: $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ all:$(PROG) bwa:libbwa.a $(AOBJS) main.o $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS) bwamem-lite:libbwa.a example.o $(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ -L. -lbwa $(LIBS) libbwa.a:$(LOBJS) $(AR) -csru $@ $(LOBJS) clean: rm -f gmon.out *.o a.out $(PROG) *~ *.a depend: ( LC_ALL=C ; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) -- *.c ) # DO NOT DELETE THIS LINE -- make depend depends on it. 
QSufSort.o: QSufSort.h bamlite.o: bamlite.h malloc_wrap.h bntseq.o: bntseq.h utils.h kseq.h malloc_wrap.h khash.h bwa.o: bntseq.h bwa.h bwt.h ksw.h utils.h kstring.h malloc_wrap.h kvec.h bwa.o: kseq.h bwamem.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h ksw.h kvec.h bwamem.o: ksort.h utils.h kbtree.h bwamem_extra.o: bwa.h bntseq.h bwt.h bwamem.h kstring.h malloc_wrap.h bwamem_pair.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h kvec.h bwamem_pair.o: utils.h ksw.h bwape.o: bwtaln.h bwt.h kvec.h malloc_wrap.h bntseq.h utils.h bwase.h bwa.h bwape.o: ksw.h khash.h bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h malloc_wrap.h bwase.o: bwa.h ksw.h bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h malloc_wrap.h kseq.h bwashm.o: bwa.h bntseq.h bwt.h bwt.o: utils.h bwt.h kvec.h malloc_wrap.h bwt_gen.o: QSufSort.h malloc_wrap.h bwt_lite.o: bwt_lite.h malloc_wrap.h bwtaln.o: bwtaln.h bwt.h bwtgap.h utils.h bwa.h bntseq.h malloc_wrap.h bwtgap.o: bwtgap.h bwt.h bwtaln.h malloc_wrap.h bwtindex.o: bntseq.h bwa.h bwt.h utils.h rle.h rope.h malloc_wrap.h bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h kstring.h bwtsw2_aux.o: malloc_wrap.h bwa.h ksw.h kseq.h ksort.h bwtsw2_chain.o: bwtsw2.h bntseq.h bwt_lite.h bwt.h malloc_wrap.h ksort.h bwtsw2_core.o: bwt_lite.h bwtsw2.h bntseq.h bwt.h kvec.h malloc_wrap.h bwtsw2_core.o: khash.h ksort.h bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h bwtsw2_pair.o: malloc_wrap.h ksw.h example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h malloc_wrap.h fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h is.o: malloc_wrap.h kopen.o: malloc_wrap.h kstring.o: kstring.h malloc_wrap.h ksw.o: ksw.h malloc_wrap.h main.o: kstring.h malloc_wrap.h utils.h malloc_wrap.o: malloc_wrap.h maxk.o: bwa.h bntseq.h bwt.h bwamem.h kseq.h malloc_wrap.h pemerge.o: ksw.h kseq.h malloc_wrap.h kstring.h bwa.h bntseq.h bwt.h utils.h rle.o: 
rle.h rope.o: rle.h rope.h utils.o: utils.h ksort.h malloc_wrap.h kseq.h bwa-0.7.17/NEWS.md000066400000000000000000001166271317342117100134720ustar00rootroot00000000000000Release 0.7.17 (23 October 2017) -------------------------------- This release adds option -q to preserve the mapping quality of split alignment with a lower alignment score than the primary alignment. Option -5 automatically applies -q as well. (0.7.17: 23 October 2017, r1188) Release 0.7.16 (30 July 2017) ----------------------------- This release added a couple of minor features and incorporated multiple pull requests, including: * Added option -5, which is useful to some Hi-C pipelines. * Fixed an error with samtools sorting (#129). Updated download link for GRCh38 (#123). Fixed README MarkDown formatting (#70). Addressed multiple issues via a collected pull request #139 by @jmarshall. Avoid malformed SAM header when -R is used with TAB (#84). Output mate CIGAR (#138). (0.7.16: 30 July 2017, r1180) Release 0.7.15 (31 May 2016) ---------------------------- Fixed a long existing bug which potentially leads to underestimated insert size upper bound. This bug should have little effect in practice. (0.7.15: 31 May 2016, r1140) Release 0.7.14 (4 May 2016) --------------------------- In the ALT mapping mode, this release adds the "AH:*" header tag to SQ lines corresponding to alternate haplotypes. (0.7.14: 4 May 2016, r1136) Release 0.7.13 (23 February 2016) --------------------------------- This release fixes a few minor bugs in the previous version and adds a few minor features. All BWA algorithms should produce identical output to 0.7.12 when there are no ALT contigs. Detailed changes: * Fixed a bug in "bwa-postalt.js". The old version may produce 0.5% of wrong bases for reads mapped to the ALT contigs. * Fixed a potential bug in the multithreading mode. It may occur when mapping is much faster than file reading, which should almost never happen in practice. 
* Changed the download URL of GRCh38. * Removed the read overlap mode. It is not working well. * Added the ropebwt2 algorithm as an alternative to index large genomes. Ropebwt2 is slower than the "bwtsw" algorithm, but it has a permissive license. This allows us to create an Apache2-licensed BWA (in the "Apache2" branch) for commercial users who are concerned with GPL. (0.7.13: 23 February 2016, r1126) Release 0.7.12 (28 December 2014) --------------------------------- This release fixed a bug in the pair-end mode when ALT contigs are present. It leads to undercalling in regions overlapping ALT contigs. (0.7.12: 28 December 2014, r1039) Release 0.7.11 (23 December, 2014) ---------------------------------- A major change to BWA-MEM is the support of mapping to ALT contigs in addition to the primary assembly. Part of the ALT mapping strategy is implemented in BWA-MEM and the rest in a postprocessing script for now. Due to the extra layer of complexity on generating the reference genome and on the two-step mapping, we start to provide a wrapper script and precompiled binaries since this release. The package may be more convenient to some specific use cases. For general uses, the single BWA binary still works like the old way. Another major addition to BWA-MEM is HLA typing, which is made possible with the new ALT mapping strategy. Necessary data and programs are included in the binary release. The wrapper script also optionally performs HLA typing when HLA genes are included in the reference genome as additional ALT contigs. Other notable changes to BWA-MEM: * Added option `-b` to `bwa index`. This option tunes the batch size used in the construction of BWT. It is advised to use large `-b` for huge reference sequences such as the BLAST *nt* database. * Optimized for PacBio data. This includes a change to scoring based on a study done by Aaron Quinlan and a heuristic speedup. Further speedup is possible, but needs more careful investigation. 
* Dropped PacBio read-to-read alignment for now. BWA-MEM is good for finding the best hit, but is not very sensitive to suboptimal hits. Option `-x pbread` is still available, but hidden on the command line. This may be removed in future releases. * Added a new pre-setting for Oxford Nanopore 2D reads. LAST is still a little more sensitive on older bacterial data, but bwa-mem is as good on more recent data and is times faster for mapping against mammalian genomes. * Added LAST-like seeding. This improves the accuracy for longer reads. * Added option `-H` to insert arbitrary header lines. * Smarter option `-p`. Given an interleaved FASTQ stream, old bwa-mem identifies the 2i-th and (2i+1)-th reads as a read pair. The new version identifies adjacent reads with the same read name as a read pair. It is possible to mix single-end and paired-end reads in one FASTQ. * Improved parallelization. Old bwa-mem waits for I/O. The new version puts I/O on a separate thread. It performs mapping while reading FASTQ and writing SAM. This saves significant wall-clock time when reading from or writing to a slow Unix pipe. With the new release, the recommended way to map Illumina reads to GRCh38 is to use the bwakit binary package: bwa.kit/run-gen-ref hs38DH bwa.kit/bwa index hs38DH.fa bwa.kit/run-bwamem -t8 -H -o out-prefix hs38DH.fa read1.fq.gz read2.fq.gz | sh Please check bwa.kit/README.md for details and command line options. (0.7.11: 23 December 2014, r1034) Release 0.7.10 (13 July, 2014) ------------------------------ Notable changes to BWA-MEM: * Fixed a segmentation fault due to an alignment bridging the forward-reverse boundary. This is a bug. * Use the PacBio heuristic to map contigs to the reference genome. The old heuristic evaluates the necessity of full extension for each chain. This may not work in long low-complexity regions. The PacBio heuristic performs SSE2-SW around each short seed. It works better. Note that the heuristic is only applied to long query sequences. 
For Illumina reads, the output is identical to the previous version. (0.7.10: 13 July 2014, r789) Release 0.7.9 (19 May, 2014) ---------------------------- This release brings several major changes to BWA-MEM. Notably, BWA-MEM now formally supports PacBio read-to-reference alignment and experimentally supports PacBio read-to-read alignment. BWA-MEM also runs faster at a minor cost of accuracy. The speedup is more significant when GRCh38 is in use. More specifically: * Support PacBio subread-to-reference alignment. Although older BWA-MEM works with PacBio data in principle, the resultant alignments are frequently fragmented. In this release, we fine tuned existing methods and introduced new heuristics to improve PacBio alignment. These changes are not used by default. Users need to add option "-x pacbio" to enable the feature. * Support PacBio subread-to-subread alignment (EXPERIMENTAL). This feature is enabled with option "-x pbread". In this mode, the output only gives the overlapping region between a pair of reads without detailed alignment. * Output alternative hits in the XA tag if there are not so many of them. This is a BWA-backtrack feature. * Support mapping to ALT contigs in GRCh38 (EXPERIMENTAL). We provide a script to postprocess hits in the XA tag to adjust the mapping quality and generate new primary alignments to all overlapping ALT contigs. We would *NOT* recommend this feature for production uses. * Improved alignments to many short reference sequences. Older BWA-MEM may generate an alignment bridging two or more adjacent reference sequences. Such alignments are split at a later step as postprocessing. This approach is complex and does not always work. This release forbids these alignments from the very beginning. BWA-MEM should not produce an alignment bridging two or more reference sequences any more. * Reduced the maximum seed occurrence from 10000 to 500. Reduced the maximum rounds of Smith-Waterman mate rescue from 100 to 50. 
Added a heuristic to lower the mapping quality if a read contains seeds with excessive occurrences. These changes make BWA-MEM faster at a minor cost of accuracy in highly repetitive regions. * Added an option "-Y" to use soft clipping for supplementary alignments. * Bugfix: incomplete alignment extension in corner cases. * Bugfix: integer overflow when aligning long query sequences. * Bugfix: chain score is not computed correctly (almost no practical effect) * General code cleanup * Added FAQs to README Changes in BWA-backtrack: * Bugfix: a segmentation fault when an alignment stands out of the end of the last chromosome. (0.7.9: 19 May 2014, r783) Release 0.7.8 (31 March, 2014) ------------------------------ Changes in BWA-MEM: * Bugfix: off-diagonal X-dropoff (option -d) not working as intended. Short-read alignment is not affected. * Bugfix: unnecessarily large bandwidth used during global alignment, which reduces the mapping speed by about 5% for short reads. Results are not affected. * Bugfix: when the matching score is not one, paired-end mapping quality is inaccurate. * When the matching score (option -A) is changed, scale all score-related options accordingly unless overridden by users. * Allow to specify different gap open (or extension) penalties for deletions and insertions separately. * Allow to specify the insert size distribution. * Better and more detailed debugging information. With the default setting, 0.7.8 and 0.7.7 gave identical output on one million 100bp read pairs. (0.7.8: 31 March 2014, r455) Release 0.7.7 (25 February, 2014) --------------------------------- This release fixes incorrect MD tags in the BWA-MEM output. A note about short-read mapping to GRCh38. The new human reference genome GRCh38 contains 60Mbp program generated alpha repeat arrays, some of which are hard masked as they cannot be localized. These highly repetitive arrays make BWA-MEM about 50% slower. 
If you are concerned with the performance of BWA-MEM, you may consider to use option "-c2000 -m50". On simulated data, this setting helps the performance at a very minor cost on accuracy. I may consider to change the default in future releases. (0.7.7: 25 February 2014, r441) Release 0.7.6 (31 January, 2014) --------------------------------- Changes in BWA-MEM: * Changed the way mapping quality is estimated. The new method tends to give the same alignment a higher mapping quality. On paired-end reads, the change is minor as with pairing, the mapping quality is usually high. For short single-end reads, the difference is considerable. * Improved load balance when many threads are spawned. However, bwa-mem is still not very thread efficient, probably due to the frequent heap memory allocation. Further improvement is a little difficult and may affect the code stability. * Allow to use different clipping penalties for 5'- and 3'-ends. This helps when we do not want to clip one end. * Print the @PG line, including the command line options. * Improved the band width estimate: a) fixed a bug causing the band width estimated from extension not used in the final global alignment; b) try doubled band width if the global alignment score is smaller. Insufficient band width leads to wrong CIGAR and spurious mismatches/indels. * Added a new option -D to fine tune a heuristic on dropping suboptimal hits. Reducing -D increases accuracy but decreases the mapping speed. If unsure, leave it to the default. * Bugfix: for a repetitive single-end read, the reported hit is not randomly distributed among equally best hits. * Bugfix: missing paired-end hits due to unsorted list of SE hits. * Bugfix: incorrect CIGAR caused by a defect in the global alignment. * Bugfix: incorrect CIGAR caused by failed SW rescue. * Bugfix: alignments largely mapped to the same position are regarded to be distinct from each other, which leads to underestimated mapping quality. * Added the MD tag. 
There are no changes to BWA-backtrack in this release. However, it has a few known issues yet to be fixed. If you prefer BWA-backtrack, it is still advised to use bwa-0.6.x. While I developed BWA-MEM, I also found a few issues with BWA-SW. It is now possible to improve BWA-SW with the lessons learned from BWA-MEM. However, as BWA-MEM is usually better, I will not improve BWA-SW until I find applications where BWA-SW may excel. (0.7.6: 31 January 2014, r432) Release 0.7.5a (30 May, 2013) ----------------------------- Fixed a bug in BWA-backtrack which leads to off-by-one mapping errors in rare cases. (0.7.5a: 30 May 2013, r405) Release 0.7.5 (29 May, 2013) ---------------------------- Changes in all components: * Improved error checking on memory allocation and file I/O. Patches provided by Rob Davies. * Updated README. * Bugfix: return code is zero upon errors. Changes in BWA-MEM: * Changed the way a chimeric alignment is reported (conforming to the upcoming SAM spec v1.5). With 0.7.5, if the read has a chimeric alignment, the paired or the top hit uses soft clipping and is marked with neither 0x800 nor 0x100 bits. All the other hits part of the chimeric alignment will use hard clipping and be marked with 0x800 if option "-M" is not in use, or marked with 0x100 otherwise. * Other hits part of a chimeric alignment are now reported in the SA tag, conforming to the SAM spec v1.5. * Better method for resolving an alignment bridging two or more short reference sequences. The current strategy maps the query to the reference sequence that covers the middle point of the alignment. For most applications, this change has no effects. Changes in BWA-backtrack: * Added a magic number to .sai files. This prevents samse/sampe from reading corrupted .sai (e.g. a .sai file containing LSF log) or incompatible .sai generated by a different version of bwa. * Bugfix: alignments in the XA:Z: tag were wrong. * Keep track of #ins and #del during backtracking. 
This simplifies the code and reduces errors in rare corner cases. I should have done this in the early days of bwa. In addition, if you use BWA-MEM or the fastmap command of BWA, please cite: - Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. arXiv:1303.3997v2 [q-bio.GN]. Thank you. (0.7.5: 29 May 2013, r404) Release 0.7.4 (23 April, 2013) ------------------------------ This is a bugfix release. Most of bugs are considered to be minor which only occur very rarely. * Bugfix: wrong CIGAR when a query sequence bridges three or more target sequences. This only happens when aligning reads to short assembly contigs. * Bugfix: leading "D" operator in CIGAR. * Extend more seeds for better alignment around tandem repeats. This is also a cause of the leading "D" operator in CIGAR. * Bugfix: SSE2-SSW may occasionally find incorrect query starting position around tandem repeat. This will lead to a suboptimal CIGAR in BWA-MEM and a wrong CIGAR in BWA. * Bugfix: clipping penalty does not work as is intended when there is a gap towards the end of a read. * Fixed an issue caused by a bug in the libc from Mac/Darwin. In Darwin, fread() is unable to read a data block longer than 2GB due to an integer overflow bug in its implementation. Since version 0.7.4, BWA-MEM is considered to reach similar stability to BWA-backtrack for short-read mapping. (0.7.4: 23 April, r385) Release 0.7.3a (15 March, 2013) ------------------------------- In 0.7.3, the wrong CIGAR bug was only fixed in one scenario, but not fixed in another corner case. (0.7.3a: 15 March 2013, r367) Release 0.7.3 (15 March, 2013) ------------------------------ Changes to BWA-MEM: * Bugfix: pairing score is inaccurate when option -A does not take the default value. This is a very minor issue even if it happens. * Bugfix: occasionally wrong CIGAR. 
This happens when in the alignment there is a 1bp deletion and a 1bp insertion which are close to the end of the reads, and there are no other substitutions or indels. BWA-MEM would not do a gapped alignment due to the bug. * New feature: output other non-overlapping alignments in the XP tag such that we can see the entire picture of alignment from one SAM line. XP gives the position, CIGAR, NM and mapQ of each aligned subsequence of the query. BWA-MEM has been used to align -300Gbp 100-700bp SE/PE reads. SNP/indel calling has also been evaluated on part of these data. BWA-MEM generally gives better pre-filtered SNP calls than BWA. No significant issues have been observed since 0.7.2, though minor improvements or bugs (e.g. the bug fixed in this release) are still possible. If you find potential issues, please send bug reports to (free registration required). In addition, more detailed description of the BWA-MEM algorithm can be found at . (0.7.3: 15 March 2013, r366) Release 0.7.2 (9 March, 2013) ----------------------------- Emergent bug fix: 0.7.0 and 0.7.1 give a wrong sign to TLEN. In addition, flagging 'properly paired' also gets improved a little. (0.7.2: 9 March 2013, r351) Release 0.7.1 (8 March, 2013) ----------------------------- Changes to BWA-MEM: * Bugfix: rare segmentation fault caused by a partial hit to the end of the last sequence. * Bugfix: occasional mis-pairing given an interleaved fastq. * Bugfix: wrong mate information when the mate is unmapped. SAM generated by BWA-MEM can now be validated with Picard. * Improved the performance and accuracy for ultra-long query sequences. Short-read alignment is not affected. Changes to other components: * In BWA-backtrack and BWA-SW, replaced the code for global alignment, Smith-Waterman and SW extension. The performance and accuracy of the two algorithms stay the same. * Added an experimental subcommand to merge overlapping paired ends. 
The algorithm is very conservative: it may miss true overlaps but rarely makes mistakes. An important note is that like BWA-SW, BWA-MEM may output multiple primary alignments for a read, which may cause problems to some tools. For aligning sequence reads, it is advised to use '-M' to flag extra hits as secondary. This option is not the default because multiple primary alignments are theoretically possible in sequence alignment. (0.7.1: 8 March 2013, r347) Beta Release 0.7.0 (28 Feburary, 2013) -------------------------------------- This release comes with a new alignment algorithm, BWA-MEM, for 70bp-1Mbp query sequences. BWA-MEM essentially seeds alignments with a variant of the fastmap algorithm and extends seeds with banded affine-gap-penalty dynamic programming (i.e. the Smith-Waterman-Gotoh algorithm). For typical Illumina 100bp reads or longer low-divergence query sequences, BWA-MEM is about twice as fast as BWA and BWA-SW and is more accurate. It also supports split alignments like BWA-SW and may optionally output multiple hits like BWA. BWA-MEM does not guarantee to find hits within a certain edit distance, but BWA is not efficient for such task given longer reads anyway, and the edit-distance criterion is arguably not as important in long-read alignment. In addition to the algorithmic improvements, BWA-MEM also implements a few handy features in practical aspects: 1. BWA-MEM automatically switches between local and glocal (global wrt reads; local wrt reference) alignment. It reports the end-to-end glocal alignment if the glocal alignment is not much worse than the optimal local alignment. Glocal alignment reduces reference bias. 2. BWA-MEM automatically infers pair orientation from a batch of single-end alignments. It allows more than one orientations if there are sufficient supporting reads. This feature has not been tested on reads from Illumina jumping library yet. (EXPERIMENTAL) 3. BWA-MEM optionally takes one interleaved fastq for paired-end mapping. 
It is possible to convert a name-sorted BAM to an interleaved fastq on the fly and feed the data stream to BWA-MEM for mapping. 4. BWA-MEM optionally copies FASTA/Q comments to the final SAM output, which helps to transfer individual read annotations to the output. 5. BWA-MEM supports more advanced piping. Users can now run: (bwa mem ref.fa '20) CPU cores. * Check I/O error. * Increased the maximum barcode length to 63bp. * Automatically choose the indexing algorithm. * Bugfix: very rare segfault due to an uninitialized variable. The bug also affects the placement of suboptimal alignments. The effect is very minor. This release involves quite a lot of tricky changes. Although it has been tested on a few data sets, subtle bugs may be still hidden. It is *NOT* recommended to use this release in a production pipeline. In future, however, BWA-SW may be better when reads continue to go longer. I would encourage users to try the 0.6 release. I would also like to hear the users' experience. Thank you. (0.6.0: 12 November 2011, r85) Beta Release 0.5.9 (24 January, 2011) ------------------------------------- Notable changes: * Feature: barcode support via the '-B' option. * Feature: Illumina 1.3+ read format support via the '-I' option. * Bugfix: RG tags are not attached to unmapped reads. * Bugfix: very rare bwasw mismappings * Recommend options for PacBio reads in bwasw help message. Also, since January 13, the BWA master repository has been moved to github: https://github.com/lh3/bwa The revision number has been reset. All recent changes will be first committed to this repository. (0.5.9: 24 January 2011, r16) Beta Release Candidate 0.5.9rc1 (10 December, 2010) --------------------------------------------------- Notable changes in bwasw: * Output unmapped reads. * For a repetitive read, choose a random hit instead of a fixed one. This is not well tested. 
Notable changes in bwa-short: * Fixed a bug in the SW scoring system, which may lead to unexpected gaps towards the end of a read. * Fixed a bug which invalidates the randomness of repetitive reads. * Fixed a rare memory leak. * Allowed to specify the read group at the command line. * Take name-grouped BAM files as input. Changes to this release are usually safe in that they do not interfere with the key functionality. However, the release has only been tested on small samples instead of on large-scale real data. If anything weird happens, please report the bugs to the bio-bwa-help mailing list. (0.5.9rc1: 10 December 2010, r1561) Beta Release 0.5.8 (8 June, 2010) --------------------------------- Notable changes in bwasw: * Fixed an issue of missing alignments. This should happen rarely and only when the contig/read alignment is multi-part. Very rarely, bwasw may still miss a segment in a multi-part alignment. This is difficult to fix, although possible. Notable changes in bwa-short: * Discard the SW alignment when the best single-end alignment is much better. Such a SW alignment may caused by structural variations and forcing it to be aligned leads to false alignment. This fix has not been tested thoroughly. It would be great to receive more users feedbacks on this issue. * Fixed a typo/bug in sampe which leads to unnecessarily large memory usage in some cases. * Further reduced the chance of reporting 'weird pairing'. (0.5.8: 8 June 2010, r1442) Beta Release 0.5.7 (1 March, 2010) ---------------------------------- This release only has an effect on paired-end data with fat insert-size distribution. Users are still recommended to update as the new release improves the robustness to poor data. * The fix for 'weird pairing' was not working in version 0.5.6, pointed out by Carol Scott. It should work now. * Optionally output to a normal file rather than to stdout (by Tim Fennel). 
(0.5.7: 1 March 2010, r1310) Beta Release 0.5.6 (10 February, 2010) -------------------------------------- Notable changes in bwa-short: * Report multiple hits in the SAM format at a new tag XA encoded as: (chr,pos,CIGAR,NM;)*. By default, if a paired or single-end read has 4 or fewer hits, they will all be reported; if a read in an anomalous pair has 11 or fewer hits, all of them will be reported. * Perform Smith-Waterman alignment also for anomalous read pairs when both ends have quality higher than 17. This reduces false positives for some SV discovery algorithms. * Do not report "weird pairing" when the insert size distribution is too fat or has a mean close to zero. * If a read is bridging two adjacent chromosomes, flag it as unmapped. * Fixed a small but long existing memory leak in paired-end mapping. * Multiple bug fixes in SOLiD mapping: a) quality "-1" can be correctly parsed by solid2fastq.pl; b) truncated quality string is resolved; c) SOLiD read mapped to the reverse strand is complemented. * Bwa now calculates skewness and kurtosis of the insert size distribution. * Deploy a Bayesian method to estimate the maximum distance for a read pair considered to be paired properly. The method is proposed by Gerton Lunter, but bwa only implements a simplified version. * Export more functions for Java bindings, by Matt Hanna (See: http://www.broadinstitute.org/gsa/wiki/index.php/Sting_BWA/C_bindings) * Abstract bwa CIGAR for further extension, by Rodrigo Goya. (0.5.6: 10 February 2010, r1303) Beta Release 0.5.5 (10 November, 2009) -------------------------------------- This is a bug fix release: * Fixed a serious bug/typo in aln which does not occur given short reads, but will lead to segfault for >500bp reads. Of course, the aln command is not recommended for reads longer than 200bp, but this is a bug anyway. * Fixed a minor bug/typo which leads to incorrect single-end mapping quality when one end is moved to meet the mate-pair requirement. 
* Fixed a bug in samse for mapping in the color space. This bug is caused by quality filtration added since 0.5.1. (0.5.5: 10 November 2009, r1273) Beta Release 0.5.4 (9 October, 2009) ------------------------------------ Since this version, the default seed length used in the "aln" command is changed to 32. Notable changes in bwa-short: * Added a new tag "XC:i" which gives the length of clipped reads. * In sampe, skip alignments in case of a bug in the Smith-Waterman alignment module. * In sampe, fixed a bug in pairing when the read sequence is identical to its reverse complement. * In sampe, optionally preload the entire FM-index into memory to reduce disk operations. Notable changes in dBWT-SW/BWA-SW: * Changed name dBWT-SW to BWA-SW. * Optionally use "hard clipping" in the SAM output. (0.5.4: 9 October 2009, r1245) Beta Release 0.5.3 (15 September, 2009) --------------------------------------- Fixed a critical bug in bwa-short: reads mapped to the reverse strand are not complemented. (0.5.3: 15 September 2009, r1225) Beta Release 0.5.2 (13 September, 2009) --------------------------------------- Notable changes in bwa-short: * Optionally trim reads before alignment. See the manual page on 'aln -q' for detailed description. * Fixed a bug in calculating the NM tag for a gapped alignment. * Fixed a bug given a mixture of reads with some longer than the seed length and some shorter. * Print SAM header. Notable changes in dBWT-SW: * Changed the default value of -T to 30. As a result, the accuracy is a little higher for short reads at the cost of speed. (0.5.2: 13 September 2009, r1223) Beta Release 0.5.1 (2 September, 2009) -------------------------------------- Notable changes in the short read alignment component: * Fixed a bug in samse: do not write mate coordinates. Notable changes in dBWT-SW: * Randomly choose one alignment if the read is a repetitive. * Fixed a flaw when a read is mapped across two adjacent reference sequences. 
However, wrong alignment reports may still occur rarely in this case. * Changed the default band width to 50. The speed is slower due to this change. * Improved the mapping quality a little given long query sequences. (0.5.1: 2 September 2009, r1209) Beta Release 0.5.0 (20 August, 2009) ------------------------------------ This release implements a novel algorithm, dBWT-SW, specifically designed for long reads. It is 10-50 times faster than SSAHA2, depending on the characteristics of the input data, and achieves comparable alignment accuracy while allowing chimera detection. In comparison to BLAT, dBWT-SW is several times faster and much more accurate especially when the error rate is high. Please read the manual page for more information. The dBWT-SW algorithm is kind of developed for future sequencing technologies which produce much longer reads with a little higher error rate. It is still at its early development stage. Some features are missing and it may be buggy although I have evaluated on several simulated and real data sets. But following the "release early" paradigm, I would like the users to try it first. Other notable changes in BWA are: * Fixed a rare bug in the Smith-Waterman alignment module. * Fixed a rare bug about the wrong alignment coordinate when a read is poorly aligned. * Fixed a bug in generating the "mate-unmap" SAM tag when both ends in a pair are unmapped. (0.5.0: 20 August 2009, r1200) Beta Release 0.4.9 (19 May, 2009) --------------------------------- Interestingly, the integer overflow bug claimed to be fixed in 0.4.7 has not in fact. Now I have fixed the bug. Sorry for this and thank Quan Long for pointing out the bug (again). (0.4.9: 19 May 2009, r1075) Beta Release 0.4.8 (18 May, 2009) --------------------------------- One change to "aln -R". Now by default, if there are no more than '-R' equally best hits, bwa will search for suboptimal hits. This change affects the ability in finding SNPs in segmental duplications. 
I have not tested this option thoroughly, but this simple change is less likely to cause new bugs. Hope I am right. (0.4.8: 18 May 2009, r1073) Beta Release 0.4.7 (12 May, 2009) --------------------------------- Notable changes: * Output SM (single-end mapping quality) and AM (smaller mapping quality among the two ends) tag from sam output. * Improved the functionality of stdsw. * Made the XN tag more accurate. * Fixed a very rare segfault caused by integer overflow. * Improve the insert size estimation. * Fixed compiling errors for some Linux systems. (0.4.7: 12 May 2009, r1066) Beta Release 0.4.6 (9 March, 2009) ---------------------------------- This release improves the SOLiD support. First, a script for converting SOLiD raw data is provided. This script is adapted from solid2fastq.pl in the MAQ package. Second, a nucleotide reference file can be directly used with 'bwa index'. Third, SOLiD paired-end support is completed. Fourth, color-space reads will be converted to nucleotides when SAM output is generated. Color errors are corrected in this process. Please note that like MAQ, BWA cannot make use of the primer base and the first color. In addition, the calculation of mapping quality is also improved a little bit, although end-users may barely observe the difference. (0.4.6: 9 March 2009, r915) Beta Release 0.4.5 (18 February, 2009) -------------------------------------- Not much happened, but I think it would be good to let the users use the latest version. Notable changes (Thank Bob Handsaker for catching the two bugs): * Improved boundary check. Previous version may still give incorrect alignment coordinates in rare cases. * Fixed a bug in SW alignment when no residue matches. This only affects the 'sampe' command. * Robustly estimate insert size without setting the maximum on the command line. Since this release 'sampe -a' only has an effect if there are not enough good pairs to infer the insert size distribution. 
* Reduced false PE alignments a little bit by using the inferred insert size distribution. This fix may be more important for long insert size libraries. (0.4.5: 18 February 2009, r829) Beta Release 0.4.4 (15 February, 2009) -------------------------------------- This is mainly a bug fix release. Notable changes are: * Imposed boundary check for extracting subsequence from the genome. Previously this causes memory problem in rare cases. * Fixed a bug in failing to find whether an alignment overlapping with N on the genome. * Changed MD tag to meet the latest SAM specification. (0.4.4: 15 February 2009, r815) Beta Release 0.4.3 (22 January, 2009) ------------------------------------ Notable changes: * Treat an ambiguous base N as a mismatch. Previous versions will not map reads containing any N. * Automatically choose the maximum allowed number of differences. This is important when reads of different lengths are mixed together. * Print mate coordinate if only one end is unmapped. * Generate MD tag. This tag encodes the mismatching positions and the reference bases at these positions. Deletions from the reference will also be printed. * Optionally dump multiple hits from samse, in another concise format rather than SAM. * Optionally disable iterative search. This is VERY SLOOOOW, though. * Fixed a bug in generating SAM. (0.4.3: 22 January 2009, r787) Beta Release 0.4.2 (9 January, 2009) ------------------------------------ Aaron Quinlan found a bug in the indexer: the bwa indexer segfaults if there are no comment texts in the FASTA header. This is a critical bug. Nothing else was changed. (0.4.2: 9 January 2009, r769) Beta Release 0.4.1 (7 January, 2009) ------------------------------------ I am sorry for the quick updates these days. I like to set a milestone for BWA and this release seems to be. For paired end reads, BWA also does Smith-Waterman alignment for an unmapped read whose mate can be mapped confidently. 
With this strategy BWA achieves similar accuracy to maq. Benchmark is also updated accordingly. (0.4.1: 7 January 2009, r760) Beta Release 0.4.0 (6 January, 2009) ------------------------------------ In comparison to the release two days ago, this release is mainly tuned for performance with some tricks I learnt from Bowtie. However, as the indexing format has also been changed, I have to increase the version number to 0.4.0 to emphasize that *DATABASE MUST BE RE-INDEXED* with 'bwa index'. * Improved the speed by about 20%. * Added multi-threading to 'bwa aln'. (0.4.0: 6 January 2009, r756) Beta Release 0.3.0 (4 January, 2009) ------------------------------------ * Added paired-end support by separating SA calculation and alignment output. * Added SAM output. * Added evaluation to the documentation. (0.3.0: 4 January 2009, r741) Beta Release 0.2.0 (15 August, 2008) ------------------------------------- * Take the subsequence at the 5'-end as seed. Seeding strategy greatly improves the speed for long reads, at the cost of missing a few true hits that contain many differences in the seed. Seeding also increases the memory by 800MB. * Fixed a bug which may miss some gapped alignments. Fixing the bug also slows the speed a little. (0.2.0: 15 August 2008, r428) Beta Release 0.1.6 (08 August, 2008) ------------------------------------- * Give accurate CIGAR string. * Add a simple interface to SW/NW alignment (0.1.6: 08 August 2008, r414) Beta Release 0.1.5 (27 July, 2008) ---------------------------------- * Improve the speed. This version is expected to give the same results. (0.1.5: 27 July 2008, r400) Beta Release 0.1.4 (22 July, 2008) ---------------------------------- * Fixed a bug which may cause missing gapped alignments. * More clearly define what alignments can be found by BWA (See manual). Now BWA runs a little slower because it will visit more potential gapped alignments. * A bit code clean up. 
(0.1.4: 22 July 2008, r387) Beta Release 0.1.3 (21 July, 2008) ---------------------------------- Improve the speed with some tricks on retrieving occurrences. The results should be exactly the same as that of 0.1.2. (0.1.3: 21 July 2008, r382) Beta Release 0.1.2 (17 July, 2008) ---------------------------------- Support gapped alignment. Codes for ungapped alignment has been removed. (0.1.2: 17 July 2008, r371) Beta Release 0.1.1 (03 June, 2008) ----------------------------------- This is the first release of BWA, Burrows-Wheeler Alignment tool. Please read man page for more information about this software. (0.1.1: 03 June 2008, r349) bwa-0.7.17/QSufSort.c000066400000000000000000000305361317342117100142600ustar00rootroot00000000000000/* QSufSort.c Original source from qsufsort.c Copyright 1999, N. Jesper Larsson, all rights reserved. This file contains an implementation of the algorithm presented in "Faster Suffix Sorting" by N. Jesper Larsson (jesper@cs.lth.se) and Kunihiko Sadakane (sada@is.s.u-tokyo.ac.jp). This software may be used freely for any purpose. However, when distributed, the original source must be clearly stated, and, when the source code is distributed, the copyright notice must be retained and any alterations in the code must be clearly marked. No warranty is given regarding the quality of this software. Modified by Wong Chi-Kwong, 2004 Changes summary: - Used long variable and function names - Removed global variables - Replace pointer references with array references - Used insertion sort in place of selection sort and increased insertion sort threshold - Reconstructing suffix array from inverse becomes an option - Add handling where end-of-text symbol is not necessary < all characters - Removed codes for supporting alphabet size > number of characters No warranty is given regarding the quality of the modifications. */ #include #include #include "QSufSort.h" #define min(value1, value2) ( ((value1) < (value2)) ? 
(value1) : (value2) ) #define med3(a, b, c) ( ac ? b : a>c ? c : a)) #define swap(a, b, t); t = a; a = b; b = t; // Static functions static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, const qsint_t highestPos, const qsint_t numSortedChar); static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, const qsint_t highestPos, const qsint_t numSortedChar); static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, const qsint_t highestPos, const qsint_t numSortedChar); static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize); static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated); /* Makes suffix array p of x. x becomes inverse of p. p and x are both of size n+1. Contents of x[0...n-1] are integers in the range l...k-1. 
Original contents of x[n] is disregarded, the n-th symbol being regarded as end-of-string smaller than all other symbols.*/ void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, const qsint_t smallestInputSymbol, const int skipTransform) { qsint_t i, j; qsint_t s, negatedSortedGroupLength; qsint_t numSymbolAggregated; qsint_t numSortedPos = 1; qsint_t newAlphabetSize; if (!skipTransform) { /* bucketing possible*/ newAlphabetSize = QSufSortTransform(V, I, numChar, largestInputSymbol, smallestInputSymbol, numChar, &numSymbolAggregated); QSufSortBucketSort(V, I, numChar, newAlphabetSize); I[0] = -1; V[numChar] = 0; numSortedPos = numSymbolAggregated; } while ((qsint_t)(I[0]) >= -(qsint_t)numChar) { i = 0; negatedSortedGroupLength = 0; do { s = I[i]; if (s < 0) { i -= s; /* skip over sorted group.*/ negatedSortedGroupLength += s; } else { if (negatedSortedGroupLength) { I[i+negatedSortedGroupLength] = negatedSortedGroupLength; /* combine preceding sorted groups */ negatedSortedGroupLength = 0; } j = V[s] + 1; QSufSortSortSplit(V, I, i, j - 1, numSortedPos); i = j; } } while (i <= numChar); if (negatedSortedGroupLength) { /* array ends with a sorted group.*/ I[i+negatedSortedGroupLength] = negatedSortedGroupLength; /* combine sorted groups at end of I.*/ } numSortedPos *= 2; /* double sorted-depth.*/ } } void QSufSortGenerateSaFromInverse(const qsint_t* V, qsint_t* __restrict I, const qsint_t numChar) { qsint_t i; for (i=0; i<=numChar; i++) I[V[i]] = i + 1; } /* Sorting routine called for each unsorted group. Sorts the array of integers (suffix numbers) of length n starting at p. The algorithm is a ternary-split quicksort taken from Bentley & McIlroy, "Engineering a Sort Function", Software -- Practice and Experience 23(11), 1249-1265 (November 1993). 
This function is based on Program 7.*/ static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, const qsint_t highestPos, const qsint_t numSortedChar) { qsint_t a, b, c, d; qsint_t l, m; qsint_t f, v, s, t; qsint_t tmp; qsint_t numItem; numItem = highestPos - lowestPos + 1; if (numItem <= INSERT_SORT_NUM_ITEM) { QSufSortInsertSortSplit(V, I, lowestPos, highestPos, numSortedChar); return; } v = QSufSortChoosePivot(V, I, lowestPos, highestPos, numSortedChar); a = b = lowestPos; c = d = highestPos; while (1) { while (c >= b && (f = KEY(V, I, b, numSortedChar)) <= v) { if (f == v) { swap(I[a], I[b], tmp); a++; } b++; } while (c >= b && (f = KEY(V, I, c, numSortedChar)) >= v) { if (f == v) { swap(I[c], I[d], tmp); d--; } c--; } if (b > c) break; swap(I[b], I[c], tmp); b++; c--; } s = a - lowestPos; t = b - a; s = min(s, t); for (l = lowestPos, m = b - s; m < b; l++, m++) { swap(I[l], I[m], tmp); } s = d - c; t = highestPos - d; s = min(s, t); for (l = b, m = highestPos - s + 1; m <= highestPos; l++, m++) { swap(I[l], I[m], tmp); } s = b - a; t = d - c; if (s > 0) QSufSortSortSplit(V, I, lowestPos, lowestPos + s - 1, numSortedChar); // Update group number for equal portion a = lowestPos + s; b = highestPos - t; if (a == b) { // Sorted group V[I[a]] = a; I[a] = -1; } else { // Unsorted group for (c=a; c<=b; c++) V[I[c]] = b; } if (t > 0) QSufSortSortSplit(V, I, highestPos - t + 1, highestPos, numSortedChar); } /* Algorithm by Bentley & McIlroy.*/ static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, const qsint_t highestPos, const qsint_t numSortedChar) { qsint_t m; qsint_t keyl, keym, keyn; qsint_t key1, key2, key3; qsint_t s; qsint_t numItem; numItem = highestPos - lowestPos + 1; m = lowestPos + numItem / 2; s = numItem / 8; key1 = KEY(V, I, lowestPos, numSortedChar); key2 = KEY(V, I, lowestPos+s, numSortedChar); key3 = KEY(V, I, lowestPos+2*s, numSortedChar); keyl = 
med3(key1, key2, key3); key1 = KEY(V, I, m-s, numSortedChar); key2 = KEY(V, I, m, numSortedChar); key3 = KEY(V, I, m+s, numSortedChar); keym = med3(key1, key2, key3); key1 = KEY(V, I, highestPos-2*s, numSortedChar); key2 = KEY(V, I, highestPos-s, numSortedChar); key3 = KEY(V, I, highestPos, numSortedChar); keyn = med3(key1, key2, key3); return med3(keyl, keym, keyn); } /* Quadratic sorting method to use for small subarrays. */ static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, const qsint_t highestPos, const qsint_t numSortedChar) { qsint_t i, j; qsint_t tmpKey, tmpPos; qsint_t numItem; qsint_t key[INSERT_SORT_NUM_ITEM], pos[INSERT_SORT_NUM_ITEM]; qsint_t negativeSortedLength; qsint_t groupNum; numItem = highestPos - lowestPos + 1; for (i=0; i0 && key[j-1] > tmpKey; j--) { key[j] = key[j-1]; pos[j] = pos[j-1]; } key[j] = tmpKey; pos[j] = tmpPos; } negativeSortedLength = -1; i = numItem - 1; groupNum = highestPos; while (i > 0) { I[i+lowestPos] = pos[i]; V[I[i+lowestPos]] = groupNum; if (key[i-1] == key[i]) { negativeSortedLength = 0; } else { if (negativeSortedLength < 0) I[i+lowestPos] = negativeSortedLength; groupNum = i + lowestPos - 1; negativeSortedLength--; } i--; } I[lowestPos] = pos[0]; V[I[lowestPos]] = groupNum; if (negativeSortedLength < 0) I[lowestPos] = negativeSortedLength; } /* Bucketsort for first iteration. Input: x[0...n-1] holds integers in the range 1...k-1, all of which appear at least once. x[n] is 0. (This is the corresponding output of transform.) k must be at most n+1. p is array of size n+1 whose contents are disregarded. 
Output: x is V and p is I after the initial sorting stage of the refined suffix sorting algorithm.*/ static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize) { qsint_t i, c; qsint_t d; qsint_t groupNum; qsint_t currentIndex; // mark linked list empty for (i=0; i0; i--) { c = I[i-1]; d = (qsint_t)(V[c]); groupNum = currentIndex; V[c] = groupNum; if (d >= 0) { I[currentIndex] = c; while (d >= 0) { c = d; d = V[c]; V[c] = groupNum; currentIndex--; I[currentIndex] = c; } } else { // sorted group I[currentIndex] = -1; } currentIndex--; } } /* Transforms the alphabet of x by attempting to aggregate several symbols into one, while preserving the suffix order of x. The alphabet may also be compacted, so that x on output comprises all integers of the new alphabet with no skipped numbers. Input: x is an array of size n+1 whose first n elements are positive integers in the range l...k-1. p is array of size n+1, used for temporary storage. q controls aggregation and compaction by defining the maximum intue for any symbol during transformation: q must be at least k-l; if q<=n, compaction is guaranteed; if k-l>n, compaction is never done; if q is INT_MAX, the maximum number of symbols are aggregated into one. Output: Returns an integer j in the range 1...q representing the size of the new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is set to the number of old symbols grouped into one. 
Only x[n] is 0.*/ static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated) { qsint_t c, i, j; qsint_t a; // numSymbolAggregated qsint_t mask; qsint_t minSymbolInChunk = 0, maxSymbolInChunk = 0; qsint_t newAlphabetSize; qsint_t maxNumInputSymbol, maxNumBit, maxSymbol; maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1; for (maxNumBit = 0, i = maxNumInputSymbol; i; i >>= 1) ++maxNumBit; maxSymbol = QSINT_MAX >> maxNumBit; c = maxNumInputSymbol; for (a = 0; a < numChar && maxSymbolInChunk <= maxSymbol && c <= maxNewAlphabetSize; a++) { minSymbolInChunk = (minSymbolInChunk << maxNumBit) | (V[a] - smallestInputSymbol + 1); maxSymbolInChunk = c; c = (maxSymbolInChunk << maxNumBit) | maxNumInputSymbol; } mask = (1 << (a-1) * maxNumBit) - 1; /* mask masks off top old symbol from chunk.*/ V[numChar] = smallestInputSymbol - 1; /* emulate zero terminator.*/ /* bucketing possible, compact alphabet.*/ for (i=0; i<=maxSymbolInChunk; i++) I[i] = 0; /* zero transformation table.*/ c = minSymbolInChunk; for (i=a; i<=numChar; i++) { I[c] = 1; /* mark used chunk symbol.*/ c = ((c & mask) << maxNumBit) | (V[i] - smallestInputSymbol + 1); /* shift in next old symbol in chunk.*/ } for (i=1; i number of characters No warrenty is given regarding the quality of the modifications. 
*/ #ifndef __QSUFSORT_H__ #define __QSUFSORT_H__ #include #define KEY(V, I, p, h) ( V[ I[p] + h ] ) #define INSERT_SORT_NUM_ITEM 16 typedef int64_t qsint_t; #define QSINT_MAX INT64_MAX void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, const qsint_t smallestInputSymbol, const int skipTransform); void QSufSortGenerateSaFromInverse(const qsint_t *V, qsint_t* __restrict I, const qsint_t numChar); #endif bwa-0.7.17/README-alt.md000066400000000000000000000222611317342117100144170ustar00rootroot00000000000000## For the Impatient ```sh # Download bwakit (or from manually) wget -O- http://sourceforge.net/projects/bio-bwa/files/bwakit/bwakit-0.7.12_x64-linux.tar.bz2/download \ | gzip -dc | tar xf - # Generate the GRCh38+ALT+decoy+HLA and create the BWA index bwa.kit/run-gen-ref hs38DH # download GRCh38 and write hs38DH.fa bwa.kit/bwa index hs38DH.fa # create BWA index # mapping bwa.kit/run-bwamem -o out -H hs38DH.fa read1.fq read2.fq | sh # skip "|sh" to show command lines ``` This generates `out.aln.bam` as the final alignment, `out.hla.top` for best HLA genotypes on each gene and `out.hla.all` for other possible HLA genotypes. Please check out [bwa/bwakit/README.md][kithelp] for details. ## Background GRCh38 consists of several components: chromosomal assembly, unlocalized contigs (chromosome known but location unknown), unplaced contigs (chromosome unknown) and ALT contigs (long clustered variations). The combination of the first three components is called the *primary assembly*. It is recommended to use the complete primary assembly for all analyses. Using ALT contigs in read mapping is tricky. GRCh38 ALT contigs are totaled 109Mb in length, spanning 60Mbp of the primary assembly. However, sequences that are highly diverged from the primary assembly only contribute a few million bp. Most subsequences of ALT contigs are nearly identical to the primary assembly. 
If we align sequence reads to GRCh38+ALT blindly, we will get many additional reads with zero mapping quality and miss variants on them. It is crucial to make mappers aware of ALTs. BWA-MEM is ALT-aware. It essentially computes mapping quality across the non-redundant content of the primary assembly plus the ALT contigs and is free of the problem above. ## Methods ### Sequence alignment As of now, ALT mapping is done in two separate steps: BWA-MEM mapping and postprocessing. The `bwa.kit/run-bwamem` script performs the two steps when ALT contigs are present. The following picture shows an example of how BWA-MEM infers mapping quality and reports alignment after step 2: ![](http://lh3lh3.users.sourceforge.net/images/alt-demo.png) #### Step 1: BWA-MEM mapping At this step, BWA-MEM reads the ALT contig names from "*idxbase*.alt", ignoring the ALT-to-ref alignment, and labels a potential hit as *ALT* or *non-ALT*, depending on whether the hit lands on an ALT contig or not. BWA-MEM then reports alignments and assigns mapQ following these two rules: 1. The mapQ of a non-ALT hit is computed across non-ALT hits only. The mapQ of an ALT hit is computed across all hits. 2. If there are no non-ALT hits, the best ALT hit is output as the primary alignment. If there are both ALT and non-ALT hits, non-ALT hits will be primary and ALT hits will be supplementary (SAM flag 0x800). In theory, non-ALT alignments from step 1 should be identical to alignments against the reference genome without ALT contigs. In practice, the two types of alignments may differ in rare cases due to seeding heuristics. When an ALT hit is significantly better than non-ALT hits, BWA-MEM may miss seeds on the non-ALT hits. If we don't care about ALT hits, we may skip postprocessing (step 2). Nonetheless, postprocessing is recommended as it improves mapQ and gives more information about ALT hits. #### Step 2: Postprocessing Postprocessing is done with a separate script `bwa-postalt.js`.
It reads all potential hits reported in the XA tag, lifts ALT hits to the chromosomal positions using the ALT-to-ref alignment, groups them based on overlaps between their lifted positions, and then re-estimates mapQ across the best scoring hit in each group. Being aware of the ALT-to-ref alignment, this script can greatly improve mapQ of ALT hits and occasionally improve mapQ of non-ALT hits. It also writes each hit overlapping the reported hit into a separate SAM line. This enables variant calling on each ALT contig independently of the others. ### On the completeness of GRCh38+ALT While GRCh38 is much more complete than GRCh37, it is still missing some true human sequences. To make sure every piece of sequence in the reference assembly is correct, the [Genome Reference Consortium][grc] (GRC) requires each ALT contig to have enough support from multiple sources before considering adding it to the reference assembly. This careful and sophisticated procedure has left out some sequences, one of which is [this example][novel], a 10kb contig assembled from CHM1 short reads and present also in NA12878. You can try [BLAT][blat] or [BLAST][blast] to see where it maps. For a more complete reference genome, we compiled a new set of decoy sequences from GenBank clones and the de novo assembly of 254 public [SGDP][sgdp] samples. The sequences are included in `hs38DH-extra.fa` from the [BWA binary package][res]. In addition to the decoy sequences, we also put multiple alleles of HLA genes in `hs38DH-extra.fa`. These genomic sequences were acquired from [IMGT/HLA][hladb], version 3.18.0 and are used to collect reads sequenced from these genes. ### HLA typing HLA genes are known to be associated with many autoimmune diseases, infectious diseases and drug responses. They are among the most important genes but are rarely studied by WGS projects due to the high sequence divergence between HLA genes and the reference genome in these regions.
By including the HLA gene regions in the reference assembly as ALT contigs, we are able to effectively identify reads coming from these genes. We also provide a pipeline, which is included in the [BWA binary package][res], to type several classic HLA genes. The pipeline is conceptually simple. It de novo assembles sequence reads mapped to each gene, aligns exon sequences of each allele to the assembled contigs and then finds the pairs of alleles that best explain the contigs. In practice, however, the completeness of IMGT/HLA and copy-number changes related to these genes are not so straightforward to resolve. HLA typing may not always be successful. Users may also consider using other programs for typing, such as [Warren et al (2012)][hla4], [Liu et al (2013)][hla2], [Bai et al (2014)][hla3] and [Dilthey et al (2014)][hla1], though most of them are distributed under restrictive licenses. ## Preliminary Evaluation To check whether GRCh38 is better than GRCh37, we mapped the CHM1 and NA12878 unitigs to GRCh37 primary (hs37), GRCh38 primary (hs38) and GRCh38+ALT+decoy (hs38DH), and called small variants from the alignment. CHM1 is haploid. Ideally, heterozygous calls are false positives (FP). NA12878 is diploid. The true positive (TP) heterozygous calls from NA12878 are approximately equal to the difference between NA12878 and CHM1 heterozygous calls. A better assembly should yield higher TP and lower FP. The following table shows the numbers for these assemblies: |Assembly|hs37 |hs38 |hs38DH|CHM1_1.1| huref| |:------:|------:|------:|------:|------:|------:| |FP | 255706| 168068| 142516|307172 | 575634| |TP |2142260|2163113|2150844|2167235|2137053| With this measurement, hs38 is clearly better than hs37. Genome hs38DH reduces FP by ~25k but also reduces TP by ~12k. We manually inspected variants called from hs38 only and found the majority of them are associated with excessive read depth, clustered variants or weak alignment.
We believe most hs38-only calls are problematic. In addition, if we compare two NA12878 replicates from HiSeq X10 with nearly identical library construction, the difference is ~140k, an order of magnitude higher than the difference between hs38 and hs38DH. ALT contigs, decoy and HLA genes in hs38DH improve variant calling and enable the analyses of ALT contigs and HLA typing at little cost. ## Problems and Future Development There are some uncertainties about ALT mappings - we are not sure whether they help biological discovery and don't know the best way to analyze them. Without clear demand from downstream analyses, it is very difficult to design the optimal mapping strategy. The current BWA-MEM method is just a start. If it turns out to be useful in research, we will probably rewrite bwa-postalt.js in C for performance; if not, we may make changes. It is also possible that we might make a breakthrough in the representation of multiple genomes, in which case, we can even get rid of ALT contigs for good.
[res]: https://sourceforge.net/projects/bio-bwa/files/bwakit [sb]: https://github.com/GregoryFaust/samblaster [grc]: http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/ [novel]: https://gist.github.com/lh3/9935148b71f04ba1a8cc [blat]: https://genome.ucsc.edu/cgi-bin/hgBlat [blast]: http://blast.st-va.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastn&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome [sgdp]: http://www.simonsfoundation.org/life-sciences/simons-genome-diversity-project/ [hladb]: http://www.ebi.ac.uk/ipd/imgt/hla/ [grcdef]: http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/info/definitions.shtml [hla1]: http://biorxiv.org/content/early/2014/07/08/006973 [hlalink]: http://www.hladiseaseassociations.com [hlatools]: https://www.biostars.org/p/93245/ [hla2]: http://nar.oxfordjournals.org/content/41/14/e142.full.pdf+html [hla3]: http://www.biomedcentral.com/1471-2164/15/325 [hla4]: http://genomemedicine.com/content/4/12/95 [kithelp]: https://github.com/lh3/bwa/tree/master/bwakit bwa-0.7.17/README.md000066400000000000000000000202571317342117100136440ustar00rootroot00000000000000[![Build Status](https://travis-ci.org/lh3/bwa.svg?branch=dev)](https://travis-ci.org/lh3/bwa) ## Getting started git clone https://github.com/lh3/bwa.git cd bwa; make ./bwa index ref.fa ./bwa mem ref.fa read-se.fq.gz | gzip -3 > aln-se.sam.gz ./bwa mem ref.fa read1.fq read2.fq | gzip -3 > aln-pe.sam.gz ## Introduction BWA is a software package for mapping DNA sequences against a large reference genome, such as the human genome. It consists of three algorithms: BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina sequence reads up to 100bp, while the other two are for longer sequences ranging from 70bp to a few megabases. BWA-MEM and BWA-SW share similar features such as support for long reads and chimeric alignment, but BWA-MEM, which is the latest, is generally recommended as it is faster and more accurate.
BWA-MEM also has better performance than BWA-backtrack for 70-100bp Illumina reads. For all the algorithms, BWA first needs to construct the FM-index for the reference genome (the **index** command). Alignment algorithms are invoked with different sub-commands: **aln/samse/sampe** for BWA-backtrack, **bwasw** for BWA-SW and **mem** for the BWA-MEM algorithm. ## Availability BWA is released under [GPLv3][1]. The latest source code is [freely available at github][2]. Released packages can [be downloaded][3] at SourceForge. After you acquire the source code, simply use `make` to compile and copy the single executable `bwa` to the destination you want. The only dependency required to build BWA is [zlib][14]. Since 0.7.11, a precompiled binary for x86\_64-linux is available in [bwakit][17]. In addition to BWA, this self-contained package also comes with bwa-associated and 3rd-party tools for proper BAM-to-FASTQ conversion, mapping to ALT contigs, adapter trimming, duplicate marking, HLA typing and associated data files. ## Seeking help The detailed usage is described in the man page available together with the source code. You can use `man ./bwa.1` to view the man page in a terminal. The [HTML version][4] of the man page can be found at the [BWA website][5]. If you have questions about BWA, you may [sign up for the mailing list][6] and then send the questions to [bio-bwa-help@sourceforge.net][7]. You may also ask questions in forums such as [BioStar][8] and [SEQanswers][9]. ## Citing BWA * Li H. and Durbin R. (2009) Fast and accurate short read alignment with Burrows-Wheeler transform. *Bioinformatics*, **25**, 1754-1760. [PMID: [19451168][10]]. (if you use the BWA-backtrack algorithm) * Li H. and Durbin R. (2010) Fast and accurate long-read alignment with Burrows-Wheeler transform. *Bioinformatics*, **26**, 589-595. [PMID: [20080505][11]]. (if you use the BWA-SW algorithm) * Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM.
[arXiv:1303.3997v2][12] [q-bio.GN]. (if you use the BWA-MEM algorithm or the **fastmap** command, or want to cite the whole BWA package) Please note that the last reference is a preprint hosted at [arXiv.org][13]. I do not plan to submit it to a peer-reviewed journal in the near future. ## Frequently asked questions (FAQs) 1. [What types of data does BWA work with?](#type) 2. [Why does a read appear multiple times in the output SAM?](#multihit) 3. [Does BWA work on reference sequences longer than 4GB in total?](#4gb) 4. [Why can one read in a pair have a high mapping quality but the other has zero?](#pe0) 5. [How can a BWA-backtrack alignment stand out of the end of a chromosome?](#endref) 6. [Does BWA work with ALT contigs in the GRCh38 release?](#altctg) 7. [Can I just run BWA-MEM against GRCh38+ALT without post-processing?](#postalt) #### 1. What types of data does BWA work with? BWA works with a variety of types of DNA sequence data, though the optimal algorithm and setting may vary. The following list gives the recommended settings: * Illumina/454/IonTorrent single-end reads longer than ~70bp or assembly contigs up to a few megabases mapped to a closely related reference genome: bwa mem ref.fa reads.fq > aln.sam * Illumina single-end reads shorter than ~70bp: bwa aln ref.fa reads.fq > reads.sai; bwa samse ref.fa reads.sai reads.fq > aln-se.sam * Illumina/454/IonTorrent paired-end reads longer than ~70bp: bwa mem ref.fa read1.fq read2.fq > aln-pe.sam * Illumina paired-end reads shorter than ~70bp: bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai bwa sampe ref.fa read1.sai read2.sai read1.fq read2.fq > aln-pe.sam * PacBio subreads or Oxford Nanopore reads to a reference genome: bwa mem -x pacbio ref.fa reads.fq > aln.sam bwa mem -x ont2d ref.fa reads.fq > aln.sam BWA-MEM is recommended for query sequences longer than ~70bp for a variety of error rates (or sequence divergence).
Generally, BWA-MEM is more tolerant of errors given longer query sequences as the chance of missing all seeds is small. As is shown above, with non-default settings, BWA-MEM works with Oxford Nanopore reads with a sequencing error rate over 20%. #### 2. Why does a read appear multiple times in the output SAM? BWA-SW and BWA-MEM perform local alignments. If there is a translocation, a gene fusion or a long deletion, a read bridging the break point may have two hits, occupying two lines in the SAM output. With the default setting of BWA-MEM, one and only one line is primary and is soft clipped; other lines are tagged with the 0x800 SAM flag (supplementary alignment) and are hard clipped. #### 3. Does BWA work on reference sequences longer than 4GB in total? Yes. Since 0.6.x, all BWA algorithms work with a genome with total length over 4GB. However, an individual chromosome should not be longer than 2GB. #### 4. Why can one read in a pair have a high mapping quality but the other has zero? This is correct. Mapping quality is assigned to each individual read, not to a read pair. It is possible that one read can be mapped unambiguously, but its mate falls in a tandem repeat and thus its accurate position cannot be determined. #### 5. How can a BWA-backtrack alignment stand out of the end of a chromosome? Internally BWA concatenates all reference sequences into one long sequence. A read may be mapped to the junction of two adjacent reference sequences. In this case, BWA-backtrack will flag the read as unmapped (0x4), but you will see position, CIGAR and all the tags. A similar issue may occur with BWA-SW alignments as well. BWA-MEM does not have this problem. #### 6. Does BWA work with ALT contigs in the GRCh38 release? Yes, since 0.7.11, BWA-MEM officially supports mapping to GRCh38+ALT. BWA-backtrack and BWA-SW don't properly support ALT mapping as of now. Please see [README-alt.md][18] for details.
Briefly, it is recommended to use [bwakit][17], the binary release of BWA, for generating the reference genome and for mapping. #### 7. Can I just run BWA-MEM against GRCh38+ALT without post-processing? If you are not interested in hits to ALT contigs, it is okay to run BWA-MEM without post-processing. The alignments produced this way are very close to alignments against GRCh38 without ALT contigs. Nonetheless, applying post-processing helps to reduce false mappings caused by reads from the diverged part of ALT contigs and also enables HLA typing. It is recommended to run the post-processing script. [1]: http://en.wikipedia.org/wiki/GNU_General_Public_License [2]: https://github.com/lh3/bwa [3]: http://sourceforge.net/projects/bio-bwa/files/ [4]: http://bio-bwa.sourceforge.net/bwa.shtml [5]: http://bio-bwa.sourceforge.net/ [6]: https://lists.sourceforge.net/lists/listinfo/bio-bwa-help [7]: mailto:bio-bwa-help@sourceforge.net [8]: http://biostars.org [9]: http://seqanswers.com/ [10]: http://www.ncbi.nlm.nih.gov/pubmed/19451168 [11]: http://www.ncbi.nlm.nih.gov/pubmed/20080505 [12]: http://arxiv.org/abs/1303.3997 [13]: http://arxiv.org/ [14]: http://zlib.net/ [15]: https://github.com/lh3/bwa/tree/mem [16]: ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/ [17]: http://sourceforge.net/projects/bio-bwa/files/bwakit/ [18]: https://github.com/lh3/bwa/blob/master/README-alt.md bwa-0.7.17/bamlite.c000066400000000000000000000137221317342117100141450ustar00rootroot00000000000000#include #include #include #include #include #include "bamlite.h" #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif /********************* * from bam_endian.c * *********************/ static inline int bam_is_big_endian() { long one= 1; return !(*((char *)(&one))); } static inline uint16_t bam_swap_endian_2(uint16_t v) { return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); } static inline void 
*bam_swap_endian_2p(void *x) { *(uint16_t*)x = bam_swap_endian_2(*(uint16_t*)x); return x; } static inline uint32_t bam_swap_endian_4(uint32_t v) { v = ((v & 0x0000FFFFU) << 16) | (v >> 16); return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); } static inline void *bam_swap_endian_4p(void *x) { *(uint32_t*)x = bam_swap_endian_4(*(uint32_t*)x); return x; } static inline uint64_t bam_swap_endian_8(uint64_t v) { v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32); v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16); return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8); } static inline void *bam_swap_endian_8p(void *x) { *(uint64_t*)x = bam_swap_endian_8(*(uint64_t*)x); return x; } /************** * from bam.c * **************/ int bam_is_be; bam_header_t *bam_header_init() { bam_is_be = bam_is_big_endian(); return (bam_header_t*)calloc(1, sizeof(bam_header_t)); } void bam_header_destroy(bam_header_t *header) { int32_t i; if (header == 0) return; if (header->target_name) { for (i = 0; i < header->n_targets; ++i) if (header->target_name[i]) free(header->target_name[i]); if (header->target_len) free(header->target_len); free(header->target_name); } if (header->text) free(header->text); free(header); } bam_header_t *bam_header_read(bamFile fp) { bam_header_t *header; char buf[4]; int magic_len; int32_t i = 1, name_len; // read "BAM1" magic_len = bam_read(fp, buf, 4); if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) { fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n"); return NULL; } header = bam_header_init(); // read plain text and the number of reference sequences if (bam_read(fp, &header->l_text, 4) != 4) goto fail; if (bam_is_be) bam_swap_endian_4p(&header->l_text); header->text = (char*)calloc(header->l_text + 1, 1); if (bam_read(fp, header->text, header->l_text) != header->l_text) goto fail; if (bam_read(fp, &header->n_targets, 4) != 4) goto fail; if (bam_is_be) 
bam_swap_endian_4p(&header->n_targets); // read reference sequence names and lengths header->target_name = (char**)calloc(header->n_targets, sizeof(char*)); header->target_len = (uint32_t*)calloc(header->n_targets, 4); for (i = 0; i != header->n_targets; ++i) { if (bam_read(fp, &name_len, 4) != 4) goto fail; if (bam_is_be) bam_swap_endian_4p(&name_len); header->target_name[i] = (char*)calloc(name_len, 1); if (bam_read(fp, header->target_name[i], name_len) != name_len) { goto fail; } if (bam_read(fp, &header->target_len[i], 4) != 4) goto fail; if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]); } return header; fail: bam_header_destroy(header); return NULL; } static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data) { uint8_t *s; uint32_t i, *cigar = (uint32_t*)(data + c->l_qname); s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2; for (i = 0; i < c->n_cigar; ++i) bam_swap_endian_4p(&cigar[i]); while (s < data + data_len) { uint8_t type; s += 2; // skip key type = toupper(*s); ++s; // skip type if (type == 'C' || type == 'A') ++s; else if (type == 'S') { bam_swap_endian_2p(s); s += 2; } else if (type == 'I' || type == 'F') { bam_swap_endian_4p(s); s += 4; } else if (type == 'D') { bam_swap_endian_8p(s); s += 8; } else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; } } } int bam_read1(bamFile fp, bam1_t *b) { bam1_core_t *c = &b->core; int32_t block_len, ret, i; uint32_t x[8]; if ((ret = bam_read(fp, &block_len, 4)) != 4) { if (ret == 0) return -1; // normal end-of-file else return -2; // truncated } if (bam_read(fp, x, sizeof(bam1_core_t)) != sizeof(bam1_core_t)) return -3; if (bam_is_be) { bam_swap_endian_4p(&block_len); for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i); } c->tid = x[0]; c->pos = x[1]; c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff; c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff; c->l_qseq = x[4]; c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7]; b->data_len = block_len - 
sizeof(bam1_core_t); if (b->m_data < b->data_len) { b->m_data = b->data_len; kroundup32(b->m_data); b->data = (uint8_t*)realloc(b->data, b->m_data); } if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4; b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2; if (bam_is_be) swap_endian_data(c, b->data_len, b->data); return 4 + block_len; } #ifdef USE_VERBOSE_ZLIB_WRAPPERS // Versions of gzopen, gzread and gzclose that print up error messages gzFile bamlite_gzopen(const char *fn, const char *mode) { gzFile fp; if (strcmp(fn, "-") == 0) { fp = gzdopen(fileno((strstr(mode, "r"))? stdin : stdout), mode); if (!fp) { fprintf(stderr, "Couldn't open %s : %s", (strstr(mode, "r"))? "stdin" : "stdout", strerror(errno)); } return fp; } if ((fp = gzopen(fn, mode)) == 0) { fprintf(stderr, "Couldn't open %s : %s\n", fn, errno ? strerror(errno) : "Out of memory"); } return fp; } int bamlite_gzread(gzFile file, void *ptr, unsigned int len) { int ret = gzread(file, ptr, len); if (ret < 0) { int errnum = 0; const char *msg = gzerror(file, &errnum); fprintf(stderr, "gzread error: %s\n", Z_ERRNO == errnum ? strerror(errno) : msg); } return ret; } int bamlite_gzclose(gzFile file) { int ret = gzclose(file); if (Z_OK != ret) { fprintf(stderr, "gzclose error: %s\n", Z_ERRNO == ret ? 
strerror(errno) : zError(ret)); } return ret; } #endif /* USE_VERBOSE_ZLIB_WRAPPERS */ bwa-0.7.17/bamlite.h000066400000000000000000000060641317342117100141530ustar00rootroot00000000000000#ifndef BAMLITE_H_ #define BAMLITE_H_ #include #include #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif #define USE_VERBOSE_ZLIB_WRAPPERS typedef gzFile bamFile; #ifdef USE_VERBOSE_ZLIB_WRAPPERS /* These print error messages on failure */ # define bam_open(fn, mode) bamlite_gzopen(fn, mode) # define bam_dopen(fd, mode) gzdopen(fd, mode) # define bam_close(fp) bamlite_gzclose(fp) # define bam_read(fp, buf, size) bamlite_gzread(fp, buf, size) #else # define bam_open(fn, mode) gzopen(fn, mode) # define bam_dopen(fd, mode) gzdopen(fd, mode) # define bam_close(fp) gzclose(fp) # define bam_read(fp, buf, size) gzread(fp, buf, size) #endif /* USE_VERBOSE_ZLIB_WRAPPERS */ typedef struct { int32_t n_targets; char **target_name; uint32_t *target_len; size_t l_text, n_text; char *text; } bam_header_t; #define BAM_FPAIRED 1 #define BAM_FPROPER_PAIR 2 #define BAM_FUNMAP 4 #define BAM_FMUNMAP 8 #define BAM_FREVERSE 16 #define BAM_FMREVERSE 32 #define BAM_FREAD1 64 #define BAM_FREAD2 128 #define BAM_FSECONDARY 256 #define BAM_FQCFAIL 512 #define BAM_FDUP 1024 #define BAM_CIGAR_SHIFT 4 #define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1) #define BAM_CMATCH 0 #define BAM_CINS 1 #define BAM_CDEL 2 #define BAM_CREF_SKIP 3 #define BAM_CSOFT_CLIP 4 #define BAM_CHARD_CLIP 5 #define BAM_CPAD 6 typedef struct { int32_t tid; int32_t pos; uint32_t bin:16, qual:8, l_qname:8; uint32_t flag:16, n_cigar:16; int32_t l_qseq; int32_t mtid; int32_t mpos; int32_t isize; } bam1_core_t; typedef struct { bam1_core_t core; int l_aux, data_len, m_data; uint8_t *data; } bam1_t; #ifndef kroundup32 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif #define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0) #define bam1_mstrand(b) 
(((b)->core.flag&BAM_FMREVERSE) != 0) #define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname)) #define bam1_qname(b) ((char*)((b)->data)) #define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname) #define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1)) #define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf) #define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2) #define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t))) #define bam_destroy1(b) do { \ if (b) { free((b)->data); free(b); } \ } while (0) extern int bam_is_be; #ifdef __cplusplus extern "C" { #endif bam_header_t *bam_header_init(void); void bam_header_destroy(bam_header_t *header); bam_header_t *bam_header_read(bamFile fp); int bam_read1(bamFile fp, bam1_t *b); #ifdef USE_VERBOSE_ZLIB_WRAPPERS gzFile bamlite_gzopen(const char *fn, const char *mode); int bamlite_gzread(gzFile file, void *ptr, unsigned int len); int bamlite_gzclose(gzFile file); #endif /* USE_VERBOSE_ZLIB_WRAPPERS */ #ifdef __cplusplus } #endif #endif bwa-0.7.17/bntseq.c000066400000000000000000000327611317342117100140300ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008 Genome Research Ltd (GRL). Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* Contact: Heng Li */ #include #include #include #include #include #include #include "bntseq.h" #include "utils.h" #include "kseq.h" KSEQ_DECLARE(gzFile) #include "khash.h" KHASH_MAP_INIT_STR(str, int) #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif unsigned char nst_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }; void bns_dump(const bntseq_t *bns, const char *prefix) { char str[1024]; FILE *fp; int i; { // dump .ann strcpy(str, prefix); strcat(str, ".ann"); fp = xopen(str, "w"); err_fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed); for (i = 0; i != bns->n_seqs; ++i) { bntann1_t *p = bns->anns + i; err_fprintf(fp, "%d %s", p->gi, p->name); if (p->anno[0]) err_fprintf(fp, " %s\n", p->anno); else err_fprintf(fp, "\n"); err_fprintf(fp, 
"%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs); } err_fflush(fp); err_fclose(fp); } { // dump .amb strcpy(str, prefix); strcat(str, ".amb"); fp = xopen(str, "w"); err_fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes); for (i = 0; i != bns->n_holes; ++i) { bntamb1_t *p = bns->ambs + i; err_fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb); } err_fflush(fp); err_fclose(fp); } } bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename) { char str[8192]; FILE *fp; const char *fname; bntseq_t *bns; long long xx; int i; int scanres; bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); { // read .ann fp = xopen(fname = ann_filename, "r"); scanres = fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed); if (scanres != 3) goto badread; bns->l_pac = xx; bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t)); for (i = 0; i < bns->n_seqs; ++i) { bntann1_t *p = bns->anns + i; char *q = str; int c; // read gi and sequence name scanres = fscanf(fp, "%u%s", &p->gi, str); if (scanres != 2) goto badread; p->name = strdup(str); // read fasta comments while (q - str < sizeof(str) - 1 && (c = fgetc(fp)) != '\n' && c != EOF) *q++ = c; while (c != '\n' && c != EOF) c = fgetc(fp); if (c == EOF) { scanres = EOF; goto badread; } *q = 0; if (q - str > 1 && strcmp(str, " (null)") != 0) p->anno = strdup(str + 1); // skip leading space else p->anno = strdup(""); // read the rest scanres = fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs); if (scanres != 3) goto badread; p->offset = xx; } err_fclose(fp); } { // read .amb int64_t l_pac; int32_t n_seqs; fp = xopen(fname = amb_filename, "r"); scanres = fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes); if (scanres != 3) goto badread; l_pac = xx; xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files."); bns->ambs = bns->n_holes? 
(bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t)) : 0; for (i = 0; i < bns->n_holes; ++i) { bntamb1_t *p = bns->ambs + i; scanres = fscanf(fp, "%lld%d%s", &xx, &p->len, str); if (scanres != 3) goto badread; p->offset = xx; p->amb = str[0]; } err_fclose(fp); } { // open .pac bns->fp_pac = xopen(pac_filename, "rb"); } return bns; badread: if (EOF == scanres) { err_fatal(__func__, "Error reading %s : %s\n", fname, ferror(fp) ? strerror(errno) : "Unexpected end of file"); } err_fatal(__func__, "Parse error reading %s\n", fname); } bntseq_t *bns_restore(const char *prefix) { char ann_filename[1024], amb_filename[1024], pac_filename[1024], alt_filename[1024]; FILE *fp; bntseq_t *bns; strcat(strcpy(ann_filename, prefix), ".ann"); strcat(strcpy(amb_filename, prefix), ".amb"); strcat(strcpy(pac_filename, prefix), ".pac"); bns = bns_restore_core(ann_filename, amb_filename, pac_filename); if (bns == 0) return 0; if ((fp = fopen(strcat(strcpy(alt_filename, prefix), ".alt"), "r")) != 0) { // read .alt file if present char str[1024]; khash_t(str) *h; int c, i, absent; khint_t k; h = kh_init(str); for (i = 0; i < bns->n_seqs; ++i) { k = kh_put(str, h, bns->anns[i].name, &absent); kh_val(h, k) = i; } i = 0; while ((c = fgetc(fp)) != EOF) { if (c == '\t' || c == '\n' || c == '\r') { str[i] = 0; if (str[0] != '@') { k = kh_get(str, h, str); if (k != kh_end(h)) bns->anns[kh_val(h, k)].is_alt = 1; } while (c != '\n' && c != EOF) c = fgetc(fp); i = 0; } else str[i++] = c; // FIXME: potential segfault here } kh_destroy(str, h); fclose(fp); } return bns; } void bns_destroy(bntseq_t *bns) { if (bns == 0) return; else { int i; if (bns->fp_pac) err_fclose(bns->fp_pac); free(bns->ambs); for (i = 0; i < bns->n_seqs; ++i) { free(bns->anns[i].name); free(bns->anns[i].anno); } free(bns->anns); free(bns); } } #define _set_pac(pac, l, c) ((pac)[(l)>>2] |= (c)<<((~(l)&3)<<1)) #define _get_pac(pac, l) ((pac)[(l)>>2]>>((~(l)&3)<<1)&3) static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t 
*pac, int64_t *m_pac, int *m_seqs, int *m_holes, bntamb1_t **q) { bntann1_t *p; int i, lasts; if (bns->n_seqs == *m_seqs) { *m_seqs <<= 1; bns->anns = (bntann1_t*)realloc(bns->anns, *m_seqs * sizeof(bntann1_t)); } p = bns->anns + bns->n_seqs; p->name = strdup((char*)seq->name.s); p->anno = seq->comment.l > 0? strdup((char*)seq->comment.s) : strdup("(null)"); p->gi = 0; p->len = seq->seq.l; p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len; p->n_ambs = 0; for (i = lasts = 0; i < seq->seq.l; ++i) { int c = nst_nt4_table[(int)seq->seq.s[i]]; if (c >= 4) { // N if (lasts == seq->seq.s[i]) { // contiguous N ++(*q)->len; } else { if (bns->n_holes == *m_holes) { (*m_holes) <<= 1; bns->ambs = (bntamb1_t*)realloc(bns->ambs, (*m_holes) * sizeof(bntamb1_t)); } *q = bns->ambs + bns->n_holes; (*q)->len = 1; (*q)->offset = p->offset + i; (*q)->amb = seq->seq.s[i]; ++p->n_ambs; ++bns->n_holes; } } lasts = seq->seq.s[i]; { // fill buffer if (c >= 4) c = lrand48()&3; if (bns->l_pac == *m_pac) { // double the pac size *m_pac <<= 1; pac = realloc(pac, *m_pac/4); memset(pac + bns->l_pac/4, 0, (*m_pac - bns->l_pac)/4); } _set_pac(pac, bns->l_pac, c); ++bns->l_pac; } } ++bns->n_seqs; return pac; } int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only) { extern void seq_reverse(int len, ubyte_t *seq, int is_comp); // in bwaseqio.c kseq_t *seq; char name[1024]; bntseq_t *bns; uint8_t *pac = 0; int32_t m_seqs, m_holes; int64_t ret = -1, m_pac, l; bntamb1_t *q; FILE *fp; // initialization seq = kseq_init(fp_fa); bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); bns->seed = 11; // fixed seed for random generator srand48(bns->seed); m_seqs = m_holes = 8; m_pac = 0x10000; bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t)); bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t)); pac = calloc(m_pac/4, 1); q = bns->ambs; strcpy(name, prefix); strcat(name, ".pac"); fp = xopen(name, "wb"); // read sequences while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, 
&m_pac, &m_seqs, &m_holes, &q); if (!for_only) { // add the reverse complemented sequence int64_t ll_pac = (bns->l_pac * 2 + 3) / 4 * 4; if (ll_pac > m_pac) pac = realloc(pac, ll_pac/4); memset(pac + (bns->l_pac+3)/4, 0, (ll_pac - (bns->l_pac+3)/4*4) / 4); for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac) _set_pac(pac, bns->l_pac, 3-_get_pac(pac, l)); } ret = bns->l_pac; { // finalize .pac file ubyte_t ct; err_fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp); // the following codes make the pac file size always (l_pac/4+1+1) if (bns->l_pac % 4 == 0) { ct = 0; err_fwrite(&ct, 1, 1, fp); } ct = bns->l_pac % 4; err_fwrite(&ct, 1, 1, fp); // close .pac file err_fflush(fp); err_fclose(fp); } bns_dump(bns, prefix); bns_destroy(bns); kseq_destroy(seq); free(pac); return ret; } int bwa_fa2pac(int argc, char *argv[]) { int c, for_only = 0; gzFile fp; while ((c = getopt(argc, argv, "f")) >= 0) { switch (c) { case 'f': for_only = 1; break; } } if (argc == optind) { fprintf(stderr, "Usage: bwa fa2pac [-f] []\n"); return 1; } fp = xzopen(argv[optind], "r"); bns_fasta2bntseq(fp, (optind+1 < argc)? argv[optind+1] : argv[optind], for_only); err_gzclose(fp); return 0; } int bns_pos2rid(const bntseq_t *bns, int64_t pos_f) { int left, mid, right; if (pos_f >= bns->l_pac) return -1; left = 0; mid = 0; right = bns->n_seqs; while (left < right) { // binary search mid = (left + right) >> 1; if (pos_f >= bns->anns[mid].offset) { if (mid == bns->n_seqs - 1) break; if (pos_f < bns->anns[mid+1].offset) break; // bracketed left = mid + 1; } else right = mid; } return mid; } int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re) { int is_rev, rid_b, rid_e; if (rb < bns->l_pac && re > bns->l_pac) return -2; assert(rb <= re); rid_b = bns_pos2rid(bns, bns_depos(bns, rb, &is_rev)); rid_e = rb < re? bns_pos2rid(bns, bns_depos(bns, re - 1, &is_rev)) : rid_b; return rid_b == rid_e? 
/*
 * Return the 2-bit base code stored at position `pos` of a packed sequence:
 * four bases per byte, the first base of each byte in its two most
 * significant bits.
 */
static inline uint8_t bns_pac_base(const uint8_t *pac, int64_t pos)
{
	return (uint8_t)((pac[pos >> 2] >> ((~pos & 3) << 1)) & 3);
}

/*
 * Extract the bases of the half-open interval [beg, end) as one code (0-3)
 * per byte.
 *
 * Coordinates in [0, l_pac) address the packed forward sequence; coordinates
 * in [l_pac, 2*l_pac) address its reverse complement (base b at forward
 * position 2*l_pac-1-p is returned as 3-b).
 *
 * l_pac    number of bases in the packed forward sequence
 * pac      packed forward sequence
 * beg,end  interval bounds; swapped if given in the wrong order and clamped
 *          to [0, 2*l_pac]
 * len      receives the number of bases returned
 * returns  a malloc()ed buffer of *len bytes owned by the caller, or 0 with
 *          *len == 0 when the interval bridges the forward/reverse boundary,
 *          lies entirely outside [0, 2*l_pac), or memory cannot be
 *          allocated.  For an empty in-range interval the pointer is
 *          whatever malloc(0) yields and *len is 0.
 *
 * An interval entirely outside the valid range used to leave beg > end after
 * clamping, which produced a negative *len and a malloc() of a huge size;
 * it is now reported as an empty result.
 */
uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len)
{
	uint8_t *seq;
	int64_t k, l = 0;

	*len = 0;
	if (end < beg) { // if end is smaller, swap
		int64_t tmp = beg;
		beg = end;
		end = tmp;
	}
	if (end > l_pac<<1) end = l_pac<<1;
	if (beg < 0) beg = 0;
	if (beg > end) return 0; // nothing of the interval is inside [0, 2*l_pac)
	if (beg < l_pac && end > l_pac) return 0; // bridging the forward-reverse boundary

	seq = malloc(end - beg);
	if (seq == 0) return 0;
	*len = end - beg;
	if (beg >= l_pac) { // reverse strand
		int64_t beg_f = (l_pac<<1) - 1 - end;
		int64_t end_f = (l_pac<<1) - 1 - beg;
		// walk the forward strand backwards, complementing each base
		for (k = end_f; k > beg_f; --k)
			seq[l++] = 3 - bns_pac_base(pac, k);
	} else { // forward strand
		for (k = beg; k < end; ++k)
			seq[l++] = bns_pac_base(pac, k);
	}
	return seq;
}
*beg > far_beg? *beg : far_beg; *end = *end < far_end? *end : far_end; seq = bns_get_seq(bns->l_pac, pac, *beg, *end, &len); if (seq == 0 || *end - *beg != len) { fprintf(stderr, "[E::%s] begin=%ld, mid=%ld, end=%ld, len=%ld, seq=%p, rid=%d, far_beg=%ld, far_end=%ld\n", __func__, (long)*beg, (long)mid, (long)*end, (long)len, seq, *rid, (long)far_beg, (long)far_end); } assert(seq && *end - *beg == len); // assertion failure should never happen return seq; } bwa-0.7.17/bntseq.h000066400000000000000000000053271317342117100140330ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008 Genome Research Ltd (GRL). Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ /* Contact: Heng Li */ #ifndef BWT_BNTSEQ_H #define BWT_BNTSEQ_H #include #include #include #include #ifndef BWA_UBYTE #define BWA_UBYTE typedef uint8_t ubyte_t; #endif typedef struct { int64_t offset; int32_t len; int32_t n_ambs; uint32_t gi; int32_t is_alt; char *name, *anno; } bntann1_t; typedef struct { int64_t offset; int32_t len; char amb; } bntamb1_t; typedef struct { int64_t l_pac; int32_t n_seqs; uint32_t seed; bntann1_t *anns; // n_seqs elements int32_t n_holes; bntamb1_t *ambs; // n_holes elements FILE *fp_pac; } bntseq_t; extern unsigned char nst_nt4_table[256]; #ifdef __cplusplus extern "C" { #endif void bns_dump(const bntseq_t *bns, const char *prefix); bntseq_t *bns_restore(const char *prefix); bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename); void bns_destroy(bntseq_t *bns); int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only); int bns_pos2rid(const bntseq_t *bns, int64_t pos_f); int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id); uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len); uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid); int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re); #ifdef __cplusplus } #endif static inline int64_t bns_depos(const bntseq_t *bns, int64_t pos, int *is_rev) { return (*is_rev = (pos >= bns->l_pac))? 
(bns->l_pac<<1) - 1 - pos : pos; } #endif bwa-0.7.17/bwa.1000066400000000000000000000654161317342117100132260ustar00rootroot00000000000000.TH bwa 1 "23 October 2017" "bwa-0.7.17-r1188" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool .SH SYNOPSIS .PP bwa index ref.fa .PP bwa mem ref.fa reads.fq > aln-se.sam .PP bwa mem ref.fa read1.fq read2.fq > aln-pe.sam .PP bwa aln ref.fa short_read.fq > aln_sa.sai .PP bwa samse ref.fa aln_sa.sai short_read.fq > aln-se.sam .PP bwa sampe ref.fa aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln-pe.sam .PP bwa bwasw ref.fa long_read.fq > aln.sam .SH DESCRIPTION .PP BWA is a software package for mapping low-divergent sequences against a large reference genome, such as the human genome. It consists of three algorithms: BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina sequence reads up to 100bp, while the rest two for longer sequences ranged from 70bp to 1Mbp. BWA-MEM and BWA-SW share similar features such as long-read support and split alignment, but BWA-MEM, which is the latest, is generally recommended for high-quality queries as it is faster and more accurate. BWA-MEM also has better performance than BWA-backtrack for 70-100bp Illumina reads. For all the algorithms, BWA first needs to construct the FM-index for the reference genome (the .B index command). Alignment algorithms are invoked with different sub-commands: .BR aln / samse / sampe for BWA-backtrack, .B bwasw for BWA-SW and .B mem for the BWA-MEM algorithm. .SH COMMANDS AND OPTIONS .TP .B index .B bwa index .RB [ -p .IR prefix ] .RB [ -a .IR algoType ] .I db.fa Index database sequences in the FASTA format. .B OPTIONS: .RS .TP 10 .BI -p \ STR Prefix of the output database [same as db filename] .TP .BI -a \ STR Algorithm for constructing BWT index. BWA implements three algorithms for BWT construction: .BR is , .B bwtsw and .BR rb2 . 
The first algorithm is a little faster for small database but requires large RAM and does not work for databases with total length longer than 2GB. The second algorithm is adapted from the BWT-SW source code. It in theory works with database with trillions of bases. When this option is not specified, the appropriate algorithm will be chosen automatically. .RE .TP .B mem .B bwa mem .RB [ -aCHjMpP ] .RB [ -t .IR nThreads ] .RB [ -k .IR minSeedLen ] .RB [ -w .IR bandWidth ] .RB [ -d .IR zDropoff ] .RB [ -r .IR seedSplitRatio ] .RB [ -c .IR maxOcc ] .RB [ -D .IR chainShadow ] .RB [ -m .IR maxMateSW ] .RB [ -W .IR minSeedMatch ] .RB [ -A .IR matchScore ] .RB [ -B .IR mmPenalty ] .RB [ -O .IR gapOpenPen ] .RB [ -E .IR gapExtPen ] .RB [ -L .IR clipPen ] .RB [ -U .IR unpairPen ] .RB [ -x .IR readType ] .RB [ -R .IR RGline ] .RB [ -H .IR HDlines ] .RB [ -v .IR verboseLevel ] .I db.prefix .I reads.fq .RI [ mates.fq ] Align 70bp-1Mbp query sequences with the BWA-MEM algorithm. Briefly, the algorithm works by seeding alignments with maximal exact matches (MEMs) and then extending seeds with the affine-gap Smith-Waterman algorithm (SW). If .I mates.fq file is absent and option .B -p is not set, this command regards input reads are single-end. If .I mates.fq is present, this command assumes the .IR i -th read in .I reads.fq and the .IR i -th read in .I mates.fq constitute a read pair. If .B -p is used, the command assumes the .RI 2 i -th and the .RI (2 i +1)-th read in .I reads.fq constitute a read pair (such input file is said to be interleaved). In this case, .I mates.fq is ignored. In the paired-end mode, the .B mem command will infer the read orientation and the insert size distribution from a batch of reads. The BWA-MEM algorithm performs local alignment. It may produce multiple primary alignments for different part of a query sequence. This is a crucial feature for long sequences. However, some tools such as Picard's markDuplicates does not work with split alignments. 
One may consider to use option .B -M to flag shorter split hits as secondary. .RS .TP 10 .B ALGORITHM OPTIONS: .TP .BI -t \ INT Number of threads [1] .TP .BI -k \ INT Minimum seed length. Matches shorter than .I INT will be missed. The alignment speed is usually insensitive to this value unless it significantly deviates from 20. [19] .TP .BI -w \ INT Band width. Essentially, gaps longer than .I INT will not be found. Note that the maximum gap length is also affected by the scoring matrix and the hit length, not solely determined by this option. [100] .TP .BI -d \ INT Off-diagonal X-dropoff (Z-dropoff). Stop extension when the difference between the best and the current extension score is above .RI | i - j |* A + INT , where .I i and .I j are the current positions of the query and reference, respectively, and .I A is the matching score. Z-dropoff is similar to BLAST's X-dropoff except that it doesn't penalize gaps in one of the sequences in the alignment. Z-dropoff not only avoids unnecessary extension, but also reduces poor alignments inside a long good alignment. [100] .TP .BI -r \ FLOAT Trigger re-seeding for a MEM longer than .IR minSeedLen * FLOAT . This is a key heuristic parameter for tuning the performance. Larger value yields fewer seeds, which leads to faster alignment speed but lower accuracy. [1.5] .TP .BI -c \ INT Discard a MEM if it has more than .I INT occurence in the genome. This is an insensitive parameter. [500] .TP .BI -D \ FLOAT Drop chains shorter than .I FLOAT fraction of the longest overlapping chain [0.5] .TP .BI -m \ INT Perform at most .I INT rounds of mate-SW [50] .TP .BI -W \ INT Drop a chain if the number of bases in seeds is smaller than .IR INT . This option is primarily used for longer contigs/reads. When positive, it also affects seed filtering. [0] .TP .B -P In the paired-end mode, perform SW to rescue missing hits only but do not try to find hits that fit a proper pair. .TP .B SCORING OPTIONS: .TP .BI -A \ INT Matching score. 
[1] .TP .BI -B \ INT Mismatch penalty. The sequence error rate is approximately: {.75 * exp[-log(4) * B/A]}. [4] .TP .BI -O \ INT[,INT] Gap open penalty. If two numbers are specified, the first is the penalty of opening a deletion and the second for opening an insertion. [6] .TP .BI -E \ INT[,INT] Gap extension penalty. If two numbers are specified, the first is the penalty of extending a deletion and second for extending an insertion. A gap of length k costs O + k*E (i.e. .B -O is for opening a zero-length gap). [1] .TP .BI -L \ INT[,INT] Clipping penalty. When performing SW extension, BWA-MEM keeps track of the best score reaching the end of query. If this score is larger than the best SW score minus the clipping penalty, clipping will not be applied. Note that in this case, the SAM AS tag reports the best SW score; clipping penalty is not deducted. If two numbers are provided, the first is for 5'-end clipping and second for 3'-end clipping. [5] .TP .BI -U \ INT Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as .RI scoreRead1+scoreRead2- INT and scores a paired as scoreRead1+scoreRead2-insertPenalty. It compares these two scores to determine whether we should force pairing. A larger value leads to more aggressive read pair. [17] .TP .BI -x \ STR Read type. Changes multiple parameters unless overridden [null] .RS .TP 10 .BR pacbio : .B -k17 -W40 -r10 -A1 -B1 -O1 -E1 -L0 (PacBio reads to ref) .TP .BR ont2d : .B -k14 -W20 -r10 -A1 -B1 -O1 -E1 -L0 (Oxford Nanopore 2D-reads to ref) .TP .BR intractg : .B -B9 -O16 -L5 (intra-species contigs to ref) .RE .TP .B INPUT/OUTPUT OPTIONS: .TP .B -p Smart pairing. If two adjacent reads have the same name, they are considered to form a read pair. This way, paired-end and single-end reads can be mixed in a single FASTA/Q stream. .TP .BI -R \ STR Complete read group header line. '\\t' can be used in .I STR and will be converted to a TAB in the output SAM.
The read group ID will be attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'. [null] .TP .BI -H \ ARG If ARG starts with @, it is interpreted as a string and gets inserted into the output SAM header; otherwise, ARG is interpreted as a file with all lines starting with @ in the file inserted into the SAM header. [null] .TP .BI -o \ FILE Write the output SAM file to .IR FILE . For compatibility with other BWA commands, this option may also be given as .B -f .IR FILE . [standard ouptut] .TP .B -q Don't reduce the mapping quality of split alignment of lower alignment score. .TP .B -5 For split alignment, mark the segment with the smallest coordinate as the primary. It automatically applies option .B -q as well. This option may help some Hi-C pipelines. By default, BWA-MEM marks highest scoring segment as primary. .TP .B -K \ INT Process .I INT input bases in each batch regardless of the number of threads in use .RI [10000000* nThreads ]. By default, the batch size is proportional to the number of threads in use. Because the inferred insert size distribution slightly depends on the batch size, using different number of threads may produce different output. Specifying this option helps reproducibility. .TP .BI -T \ INT Don't output alignment with score lower than .IR INT . This option affects output and occasionally SAM flag 2. [30] .TP .BI -j Treat ALT contigs as part of the primary assembly (i.e. ignore the .I db.prefix.alt file). .TP .BI -h \ INT[,INT2] If a query has not more than .I INT hits with score higher than 80% of the best hit, output them all in the XA tag. If .I INT2 is specified, BWA-MEM outputs up to .I INT2 hits if the list contains a hit to an ALT contig. [5,200] .TP .B -a Output all found alignments for single-end or unpaired paired-end reads. These alignments will be flagged as secondary alignments. .TP .B -C Append FASTA/Q comment to SAM output. This option can be used to transfer read meta information (e.g. 
barcode) to the SAM output. Note that the FASTA/Q comment (the string after a space in the header line) must conform the SAM spec (e.g. BC:Z:CGTAC). Malformated comments lead to incorrect SAM output. .TP .B -Y Use soft clipping CIGAR operation for supplementary alignments. By default, BWA-MEM uses soft clipping for the primary alignment and hard clipping for supplementary alignments. .TP .B -M Mark shorter split hits as secondary (for Picard compatibility). .TP .BI -v \ INT Control the verbosity level of the output. This option has not been fully supported throughout BWA. Ideally, a value 0 for disabling all the output to stderr; 1 for outputting errors only; 2 for warnings and errors; 3 for all normal messages; 4 or higher for debugging. When this option takes value 4, the output is not SAM. [3] .TP .BI -I \ FLOAT[,FLOAT[,INT[,INT]]] Specify the mean, standard deviation (10% of the mean if absent), max (4 sigma from the mean if absent) and min (4 sigma if absent) of the insert size distribution. Only applicable to the FR orientation. By default, BWA-MEM infers these numbers and the pair orientations given enough reads. [inferred] .RE .TP .B aln bwa aln [-n maxDiff] [-o maxGapO] [-e maxGapE] [-d nDelTail] [-i nIndelEnd] [-k maxSeedDiff] [-l seedLen] [-t nThrds] [-cRN] [-M misMsc] [-O gapOsc] [-E gapEsc] [-q trimQual] > Find the SA coordinates of the input reads. Maximum .I maxSeedDiff differences are allowed in the first .I seedLen subsequence and maximum .I maxDiff differences are allowed in the whole sequence. .B OPTIONS: .RS .TP 10 .BI -n \ NUM Maximum edit distance if the value is INT, or the fraction of missing alignments given 2% uniform base error rate if FLOAT. In the latter case, the maximum edit distance is automatically chosen for different read lengths. 
[0.04] .TP .BI -o \ INT Maximum number of gap opens [1] .TP .BI -e \ INT Maximum number of gap extensions, -1 for k-difference mode (disallowing long gaps) [-1] .TP .BI -d \ INT Disallow a long deletion within INT bp towards the 3'-end [16] .TP .BI -i \ INT Disallow an indel within INT bp towards the ends [5] .TP .BI -l \ INT Take the first INT subsequence as seed. If INT is larger than the query sequence, seeding will be disabled. For long reads, this option is typically ranged from 25 to 35 for `-k 2'. [inf] .TP .BI -k \ INT Maximum edit distance in the seed [2] .TP .BI -t \ INT Number of threads (multi-threading mode) [1] .TP .BI -M \ INT Mismatch penalty. BWA will not search for suboptimal hits with a score lower than (bestScore-misMsc). [3] .TP .BI -O \ INT Gap open penalty [11] .TP .BI -E \ INT Gap extension penalty [4] .TP .BI -R \ INT Proceed with suboptimal alignments if there are no more than INT equally best hits. This option only affects paired-end mapping. Increasing this threshold helps to improve the pairing accuracy at the cost of speed, especially for short reads (~32bp). .TP .B -c Reverse query but not complement it, which is required for alignment in the color space. (Disabled since 0.6.x) .TP .B -N Disable iterative search. All hits with no more than .I maxDiff differences will be found. This mode is much slower than the default. .TP .BI -q \ INT Parameter for read trimming. BWA trims a read down to argmax_x{\\sum_{i=x+1}^l(INT-q_i)} if q_l 1.sai bwa aln ref.fa -b2 reads.bam > 2.sai bwa sampe ref.fa 1.sai 2.sai reads.bam reads.bam > aln.sam .TP .B -0 When .B -b is specified, only use single-end reads in mapping. .TP .B -1 When .B -b is specified, only use the first read in a read pair in mapping (skip single-end reads and the second reads). .TP .B -2 When .B -b is specified, only use the second read in a read pair in mapping. .B .RE .TP .B samse bwa samse [-n maxOcc] > Generate alignments in the SAM format given single-end reads. 
Repetitive hits will be randomly chosen. .B OPTIONS: .RS .TP 10 .BI -n \ INT Maximum number of alignments to output in the XA tag for reads paired properly. If a read has more than INT hits, the XA tag will not be written. [3] .TP .BI -r \ STR Specify the read group in a format like `@RG\\tID:foo\\tSM:bar'. [null] .RE .TP .B sampe bwa sampe [-a maxInsSize] [-o maxOcc] [-n maxHitPaired] [-N maxHitDis] [-P] > Generate alignments in the SAM format given paired-end reads. Repetitive read pairs will be placed randomly. .B OPTIONS: .RS .TP 8 .BI -a \ INT Maximum insert size for a read pair to be considered being mapped properly. Since 0.4.5, this option is only used when there are not enough good alignments to infer the distribution of insert sizes. [500] .TP .BI -o \ INT Maximum occurrences of a read for pairing. A read with more occurrences will be treated as a single-end read. Reducing this parameter helps faster pairing. [100000] .TP .B -P Load the entire FM-index into memory to reduce disk operations (base-space reads only). With this option, at least 1.25N bytes of memory are required, where N is the length of the genome. .TP .BI -n \ INT Maximum number of alignments to output in the XA tag for reads paired properly. If a read has more than INT hits, the XA tag will not be written. [3] .TP .BI -N \ INT Maximum number of alignments to output in the XA tag for discordant read pairs (excluding singletons). If a read has more than INT hits, the XA tag will not be written. [10] .TP .BI -r \ STR Specify the read group in a format like `@RG\\tID:foo\\tSM:bar'. [null] .RE .TP .B bwasw bwa bwasw [-a matchScore] [-b mmPen] [-q gapOpenPen] [-r gapExtPen] [-t nThreads] [-w bandWidth] [-T thres] [-s hspIntv] [-z zBest] [-N nHspRev] [-c thresCoef] [mate.fq] Align query sequences in the .I in.fq file. When .I mate.fq is present, perform paired-end alignment. The paired-end mode only works for reads from Illumina short-insert libraries.
In the paired-end mode, BWA-SW may still output split alignments but they are all marked as not properly paired; the mate positions will not be written if the mate has multiple local hits. .B OPTIONS: .RS .TP 10 .BI -a \ INT Score of a match [1] .TP .BI -b \ INT Mismatch penalty [3] .TP .BI -q \ INT Gap open penalty [5] .TP .BI -r \ INT Gap extension penalty. The penalty for a contiguous gap of size k is q+k*r. [2] .TP .BI -t \ INT Number of threads in the multi-threading mode [1] .TP .BI -w \ INT Band width in the banded alignment [33] .TP .BI -T \ INT Minimum score threshold divided by a [37] .TP .BI -c \ FLOAT Coefficient for threshold adjustment according to query length. Given an l-long query, the threshold for a hit to be retained is a*max{T,c*log(l)}. [5.5] .TP .BI -z \ INT Z-best heuristics. Higher -z increases accuracy at the cost of speed. [1] .TP .BI -s \ INT Maximum SA interval size for initiating a seed. Higher -s increases accuracy at the cost of speed. [3] .TP .BI -N \ INT Minimum number of seeds supporting the resultant alignment to skip reverse alignment. [5] .RE .SH SAM ALIGNMENT FORMAT .PP The output of the .B `aln' command is binary and designed for BWA use only. BWA outputs the final alignment in the SAM (Sequence Alignment/Map) format. Each line consists of: .TS center box; cb | cb | cb n | l | l . Col Field Description _ 1 QNAME Query (pair) NAME 2 FLAG bitwise FLAG 3 RNAME Reference sequence NAME 4 POS 1-based leftmost POSition/coordinate of clipped sequence 5 MAPQ MAPping Quality (Phred-scaled) 6 CIGAR extended CIGAR string 7 MRNM Mate Reference sequence NaMe (`=' if same as RNAME) 8 MPOS 1-based Mate POSition 9 ISIZE Inferred insert SIZE 10 SEQ query SEQuence on the same strand as the reference 11 QUAL query QUALity (ASCII-33 gives the Phred base quality) 12 OPT variable OPTional fields in the format TAG:VTYPE:VALUE .TE .PP Each bit in the FLAG field is defined as: .TS center box; cb | cb | cb c | l | l .
Chr Flag Description _ p 0x0001 the read is paired in sequencing P 0x0002 the read is mapped in a proper pair u 0x0004 the query sequence itself is unmapped U 0x0008 the mate is unmapped r 0x0010 strand of the query (1 for reverse) R 0x0020 strand of the mate 1 0x0040 the read is the first read in a pair 2 0x0080 the read is the second read in a pair s 0x0100 the alignment is not primary f 0x0200 QC failure d 0x0400 optical or PCR duplicate S 0x0800 supplementary alignment .TE .PP Please check <http://samtools.sourceforge.net> for the format specification and the tools for post-processing the alignment. BWA generates the following optional fields. Tags starting with `X' are specific to BWA. .TS center box; cb | cb cB | l . Tag Meaning _ NM Edit distance MD Mismatching positions/bases AS Alignment score BC Barcode sequence SA Supplementary alignments _ X0 Number of best hits X1 Number of suboptimal hits found by BWA XN Number of ambiguous bases in the reference XM Number of mismatches in the alignment XO Number of gap opens XG Number of gap extensions XT Type: Unique/Repeat/N/Mate-sw XA Alternative hits; format: /(chr,pos,CIGAR,NM;)*/ _ XS Suboptimal alignment score XF Support from forward/reverse alignment XE Number of supporting seeds .TE .PP Note that XO and XG are generated by BWT search while the CIGAR string is generated by Smith-Waterman alignment. These two tags may be inconsistent with the CIGAR string. This is not a bug. .SH NOTES ON SHORT-READ ALIGNMENT .SS Alignment Accuracy .PP When seeding is disabled, BWA guarantees to find an alignment containing at most .I maxDiff differences including .I maxGapO gap opens which do not occur within .I nIndelEnd bp towards either end of the query. Longer gaps may be found if .I maxGapE is positive, but it is not guaranteed to find all hits. When seeding is enabled, BWA further requires that the first .I seedLen subsequence contains no more than .I maxSeedDiff differences.
.PP When gapped alignment is disabled, BWA is expected to generate the same alignment as Eland version 1, the Illumina alignment program. However, as BWA changes `N' in the database sequence to random nucleotides, hits to these random sequences will also be counted. As a consequence, BWA may mark a unique hit as a repeat, if the random sequences happen to be identical to the sequences which should be unique in the database. .PP By default, if the best hit is not highly repetitive (controlled by -R), BWA also finds all hits containing one more mismatch; otherwise, BWA finds all equally best hits only. Base quality is NOT considered in evaluating hits. In the paired-end mode, BWA pairs all hits it found. It further performs Smith-Waterman alignment for unmapped reads to rescue reads with a high error rate, and for high-quality anomalous pairs to fix potential alignment errors. .SS Estimating Insert Size Distribution .PP BWA estimates the insert size distribution per 256*1024 read pairs. It first collects pairs of reads with both ends mapped with a single-end quality 20 or higher and then calculates median (Q2), lower and higher quartile (Q1 and Q3). It estimates the mean and the variance of the insert size distribution from pairs whose insert sizes are within interval [Q1-2(Q3-Q1), Q3+2(Q3-Q1)]. The maximum distance x for a pair considered to be properly paired (SAM flag 0x2) is calculated by solving equation Phi((x-mu)/sigma)=x/L*p0, where mu is the mean, sigma is the standard error of the insert size distribution, L is the length of the genome, p0 is the prior of an anomalous pair and Phi() is the standard cumulative distribution function. For mapping Illumina short-insert reads to the human genome, x is about 6-7 sigma away from the mean. Quartiles, mean, variance and x will be printed to the standard error output. .SS Memory Requirement .PP With the bwtsw algorithm, 5GB memory is required for indexing the complete human genome sequences.
For short reads, the .B aln command uses ~3.2GB memory and the .B sampe command uses ~5.4GB. .SS Speed .PP Indexing the human genome sequences takes 3 hours with the bwtsw algorithm. Indexing smaller genomes with the IS algorithm is faster, but requires more memory. .PP The speed of alignment is largely determined by the error rate of the query sequences (r). Firstly, BWA runs much faster for near perfect hits than for hits with many differences, and it stops searching for a hit with l+2 differences if an l-difference hit is found. This means BWA will be very slow if r is high because in this case BWA has to visit hits with many differences and looking for these hits is expensive. Secondly, the underlying alignment algorithm makes the speed sensitive to [k log(N)/m], where k is the maximum allowed differences, N the size of database and m the length of a query. In practice, we choose k w.r.t. r and therefore r is the leading factor. I would not recommend using BWA on data with r>0.02. .PP Pairing is slower for shorter reads. This is mainly because shorter reads have more spurious hits and converting SA coordinates to chromosomal coordinates is very costly. .SH CHANGES IN BWA-0.6 .PP Since version 0.6, BWA has been able to work with a reference genome longer than 4GB. This feature makes it possible to integrate the forward and reverse complemented genome in one FM-index, which speeds up both BWA-short and BWA-SW. As a tradeoff, BWA uses more memory because it has to keep all positions and ranks in 64-bit integers, twice as large as the 32-bit integers used in the previous versions. The latest BWA-SW also works for paired-end reads longer than 100bp. In comparison to BWA-short, BWA-SW tends to be more accurate for highly unique reads and more robust to relatively long INDELs and structural variants. Nonetheless, BWA-short usually has higher power to distinguish the optimal hit from many suboptimal hits. The choice of the mapping algorithm may depend on the application.
.SH SEE ALSO BWA website , Samtools website .SH AUTHOR Heng Li at the Sanger Institute wrote the key source codes and integrated the following codes for BWT construction: bwtsw , implemented by Chi-Kwong Wong at the University of Hong Kong and IS originally proposed by Nong Ge at the Sun Yat-Sen University and implemented by Yuta Mori. .SH LICENSE AND CITATION .PP The full BWA package is distributed under GPLv3 as it uses source codes from BWT-SW which is covered by GPL. Sorting, hash table, BWT and IS libraries are distributed under the MIT license. .PP If you use the BWA-backtrack algorithm, please cite the following paper: .PP Li H. and Durbin R. (2009) Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics, 25, 1754-1760. [PMID: 19451168] .PP If you use the BWA-SW algorithm, please cite: .PP Li H. and Durbin R. (2010) Fast and accurate long-read alignment with Burrows-Wheeler transform. Bioinformatics, 26, 589-595. [PMID: 20080505] .PP If you use BWA-MEM or the fastmap component of BWA, please cite: .PP Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. arXiv:1303.3997v1 [q-bio.GN]. .PP It is likely that the BWA-MEM manuscript will not appear in a peer-reviewed journal. .SH HISTORY BWA is largely influenced by BWT-SW. It uses source codes from BWT-SW and mimics its binary file formats; BWA-SW resembles BWT-SW in several ways. The initial idea about BWT-based alignment also came from the group who developed BWT-SW. At the same time, BWA is different enough from BWT-SW. The short-read alignment algorithm bears no similarity to Smith-Waterman algorithm any more. While BWA-SW learns from BWT-SW, it introduces heuristics that can hardly be applied to the original algorithm. In all, BWA does not guarantee to find all local hits as what BWT-SW is designed to do, but it is much faster than BWT-SW on both short and long query sequences. 
I started to write the first piece of codes on 24 May 2008 and got the initial stable version on 02 June 2008. During this period, I was acquainted that Professor Tak-Wah Lam, the first author of BWT-SW paper, was collaborating with Beijing Genomics Institute on SOAP2, the successor to SOAP (Short Oligonucleotide Analysis Package). SOAP2 has come out in November 2008. According to the SourceForge download page, the third BWT-based short read aligner, bowtie, was first released in August 2008. At the time of writing this manual, at least three more BWT-based short-read aligners are being implemented. The BWA-SW algorithm is a new component of BWA. It was conceived in November 2008 and implemented ten months later. The BWA-MEM algorithm is based on an algorithm finding super-maximal exact matches (SMEMs), which was first published with the fermi assembler paper in 2012. I first implemented the basic SMEM algorithm in the .B fastmap command for an experiment and then extended the basic algorithm and added the extension part in Feburary 2013 to make BWA-MEM a fully featured mapper. bwa-0.7.17/bwa.c000066400000000000000000000334321317342117100133010ustar00rootroot00000000000000#include #include #include #include #include "bntseq.h" #include "bwa.h" #include "ksw.h" #include "utils.h" #include "kstring.h" #include "kvec.h" #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif int bwa_verbose = 3; char bwa_rg_id[256]; char *bwa_pg; /************************ * Batch FASTA/Q reader * ************************/ #include "kseq.h" KSEQ_DECLARE(gzFile) static inline void trim_readno(kstring_t *s) { if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1])) s->l -= 2, s->s[s->l] = 0; } static inline char *dupkstring(const kstring_t *str, int dupempty) { char *s = (str->l > 0 || dupempty)? 
malloc(str->l + 1) : NULL; if (!s) return NULL; memcpy(s, str->s, str->l); s[str->l] = '\0'; return s; } static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s) { // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice s->name = dupkstring(&ks->name, 1); s->comment = dupkstring(&ks->comment, 0); s->seq = dupkstring(&ks->seq, 1); s->qual = dupkstring(&ks->qual, 0); s->l_seq = ks->seq.l; } bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) { kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_; int size = 0, m, n; bseq1_t *seqs; m = n = 0; seqs = 0; while (kseq_read(ks) >= 0) { if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__); break; } if (n >= m) { m = m? m<<1 : 256; seqs = realloc(seqs, m * sizeof(bseq1_t)); } trim_readno(&ks->name); kseq2bseq1(ks, &seqs[n]); seqs[n].id = n; size += seqs[n++].l_seq; if (ks2) { trim_readno(&ks2->name); kseq2bseq1(ks2, &seqs[n]); seqs[n].id = n; size += seqs[n++].l_seq; } if (size >= chunk_size && (n&1) == 0) break; } if (size == 0) { // test if the 2nd file is finished if (ks2 && kseq_read(ks2) >= 0) fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); } *n_ = n; return seqs; } void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2]) { int i, has_last; kvec_t(bseq1_t) a[2] = {{0,0,0}, {0,0,0}}; for (i = 1, has_last = 1; i < n; ++i) { if (has_last) { if (strcmp(seqs[i].name, seqs[i-1].name) == 0) { kv_push(bseq1_t, a[1], seqs[i-1]); kv_push(bseq1_t, a[1], seqs[i]); has_last = 0; } else kv_push(bseq1_t, a[0], seqs[i-1]); } else has_last = 1; } if (has_last) kv_push(bseq1_t, a[0], seqs[i-1]); sep[0] = a[0].a, m[0] = a[0].n; sep[1] = a[1].a, m[1] = a[1].n; } /***************** * CIGAR related * *****************/ void bwa_fill_scmat(int a, int b, int8_t mat[25]) { int i, j, k; for (i = k = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) mat[k++] = i == j? 
a : -b; mat[k++] = -1; // ambiguous base } for (j = 0; j < 5; ++j) mat[k++] = -1; } // Generate CIGAR when the alignment end points are known uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM) { uint32_t *cigar = 0; uint8_t tmp, *rseq; int i; int64_t rlen; kstring_t str; const char *int2base; if (n_cigar) *n_cigar = 0; if (NM) *NM = -1; if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range if (rb >= l_pac) { // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position for (i = 0; i < l_query>>1; ++i) tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; for (i = 0; i < rlen>>1; ++i) tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp; } if (l_query == re - rb && w_ == 0) { // no gap; no need to do DP // UPDATE: we come to this block now... FIXME: due to an issue in mem_reg2aln(), we never come to this block. This does not affect accuracy, but it hurts performance. if (n_cigar) { cigar = malloc(4); cigar[0] = l_query<<4 | 0; *n_cigar = 1; } for (i = 0, *score = 0; i < l_query; ++i) *score += mat[rseq[i]*5 + query[i]]; } else { int w, max_gap, max_ins, max_del, min_w; // set the band-width max_ins = (int)((double)(((l_query+1)>>1) * mat[0] - o_ins) / e_ins + 1.); max_del = (int)((double)(((l_query+1)>>1) * mat[0] - o_del) / e_del + 1.); max_gap = max_ins > max_del? max_ins : max_del; max_gap = max_gap > 1? max_gap : 1; w = (max_gap + abs((int)rlen - l_query) + 1) >> 1; w = w < w_? w : w_; min_w = abs((int)rlen - l_query) + 3; w = w > min_w? 
w : min_w; // NW alignment if (bwa_verbose >= 4) { printf("* Global bandwidth: %d\n", w); printf("* Global ref: "); for (i = 0; i < rlen; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); printf("* Global query: "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); } *score = ksw_global2(l_query, query, rlen, rseq, 5, mat, o_del, e_del, o_ins, e_ins, w, n_cigar, &cigar); } if (NM && n_cigar) {// compute NM and MD int k, x, y, u, n_mm = 0, n_gap = 0; str.l = str.m = *n_cigar * 4; str.s = (char*)cigar; // append MD to CIGAR int2base = rb < l_pac? "ACGTN" : "TGCAN"; for (k = 0, x = y = u = 0; k < *n_cigar; ++k) { int op, len; cigar = (uint32_t*)str.s; op = cigar[k]&0xf, len = cigar[k]>>4; if (op == 0) { // match for (i = 0; i < len; ++i) { if (query[x + i] != rseq[y + i]) { kputw(u, &str); kputc(int2base[rseq[y+i]], &str); ++n_mm; u = 0; } else ++u; } x += len; y += len; } else if (op == 2) { // deletion if (k > 0 && k < *n_cigar - 1) { // don't do the following if D is the first or the last CIGAR kputw(u, &str); kputc('^', &str); for (i = 0; i < len; ++i) kputc(int2base[rseq[y+i]], &str); u = 0; n_gap += len; } y += len; } else if (op == 1) x += len, n_gap += len; // insertion } kputw(u, &str); kputc(0, &str); *NM = n_mm + n_gap; cigar = (uint32_t*)str.s; } if (rb >= l_pac) // reverse back query for (i = 0; i < l_query>>1; ++i) tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; ret_gen_cigar: free(rseq); return cigar; } uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM) { return bwa_gen_cigar2(mat, q, r, q, r, w_, l_pac, pac, l_query, query, rb, re, score, n_cigar, NM); } /********************* * Full index reader * *********************/ char *bwa_idx_infer_prefix(const char *hint) { char *prefix; int l_hint; FILE *fp; l_hint = strlen(hint); prefix = malloc(l_hint + 3 + 
4 + 1); strcpy(prefix, hint); strcpy(prefix + l_hint, ".64.bwt"); if ((fp = fopen(prefix, "rb")) != 0) { fclose(fp); prefix[l_hint + 3] = 0; return prefix; } else { strcpy(prefix + l_hint, ".bwt"); if ((fp = fopen(prefix, "rb")) == 0) { free(prefix); return 0; } else { fclose(fp); prefix[l_hint] = 0; return prefix; } } } bwt_t *bwa_idx_load_bwt(const char *hint) { char *tmp, *prefix; bwt_t *bwt; prefix = bwa_idx_infer_prefix(hint); if (prefix == 0) { if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__); return 0; } tmp = calloc(strlen(prefix) + 5, 1); strcat(strcpy(tmp, prefix), ".bwt"); // FM-index bwt = bwt_restore_bwt(tmp); strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA) bwt_restore_sa(tmp, bwt); free(tmp); free(prefix); return bwt; } bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which) { bwaidx_t *idx; char *prefix; prefix = bwa_idx_infer_prefix(hint); if (prefix == 0) { if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__); return 0; } idx = calloc(1, sizeof(bwaidx_t)); if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint); if (which & BWA_IDX_BNS) { int i, c; idx->bns = bns_restore(prefix); for (i = c = 0; i < idx->bns->n_seqs; ++i) if (idx->bns->anns[i].is_alt) ++c; if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] read %d ALT contigs\n", __func__, c); if (which & BWA_IDX_PAC) { idx->pac = calloc(idx->bns->l_pac/4+1, 1); err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence err_fclose(idx->bns->fp_pac); idx->bns->fp_pac = 0; } } free(prefix); return idx; } bwaidx_t *bwa_idx_load(const char *hint, int which) { return bwa_idx_load_from_disk(hint, which); } void bwa_idx_destroy(bwaidx_t *idx) { if (idx == 0) return; if (idx->mem == 0) { if (idx->bwt) bwt_destroy(idx->bwt); if (idx->bns) bns_destroy(idx->bns); if (idx->pac) free(idx->pac); } else { free(idx->bwt); free(idx->bns->anns); free(idx->bns); if 
(!idx->is_shm) free(idx->mem); } free(idx); } int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx) { int64_t k = 0, x; int i; // generate idx->bwt x = sizeof(bwt_t); idx->bwt = malloc(x); memcpy(idx->bwt, mem + k, x); k += x; x = idx->bwt->bwt_size * 4; idx->bwt->bwt = (uint32_t*)(mem + k); k += x; x = idx->bwt->n_sa * sizeof(bwtint_t); idx->bwt->sa = (bwtint_t*)(mem + k); k += x; // generate idx->bns and idx->pac x = sizeof(bntseq_t); idx->bns = malloc(x); memcpy(idx->bns, mem + k, x); k += x; x = idx->bns->n_holes * sizeof(bntamb1_t); idx->bns->ambs = (bntamb1_t*)(mem + k); k += x; x = idx->bns->n_seqs * sizeof(bntann1_t); idx->bns->anns = malloc(x); memcpy(idx->bns->anns, mem + k, x); k += x; for (i = 0; i < idx->bns->n_seqs; ++i) { idx->bns->anns[i].name = (char*)(mem + k); k += strlen(idx->bns->anns[i].name) + 1; idx->bns->anns[i].anno = (char*)(mem + k); k += strlen(idx->bns->anns[i].anno) + 1; } idx->pac = (uint8_t*)(mem + k); k += idx->bns->l_pac/4+1; assert(k == l_mem); idx->l_mem = k; idx->mem = mem; return 0; } int bwa_idx2mem(bwaidx_t *idx) { int i; int64_t k, x, tmp; uint8_t *mem; // copy idx->bwt x = idx->bwt->bwt_size * 4; mem = realloc(idx->bwt->bwt, sizeof(bwt_t) + x); idx->bwt->bwt = 0; memmove(mem + sizeof(bwt_t), mem, x); memcpy(mem, idx->bwt, sizeof(bwt_t)); k = sizeof(bwt_t) + x; x = idx->bwt->n_sa * sizeof(bwtint_t); mem = realloc(mem, k + x); memcpy(mem + k, idx->bwt->sa, x); k += x; free(idx->bwt->sa); free(idx->bwt); idx->bwt = 0; // copy idx->bns tmp = idx->bns->n_seqs * sizeof(bntann1_t) + idx->bns->n_holes * sizeof(bntamb1_t); for (i = 0; i < idx->bns->n_seqs; ++i) // compute the size of heap-allocated memory tmp += strlen(idx->bns->anns[i].name) + strlen(idx->bns->anns[i].anno) + 2; mem = realloc(mem, k + sizeof(bntseq_t) + tmp); x = sizeof(bntseq_t); memcpy(mem + k, idx->bns, x); k += x; x = idx->bns->n_holes * sizeof(bntamb1_t); memcpy(mem + k, idx->bns->ambs, x); k += x; free(idx->bns->ambs); x = idx->bns->n_seqs * 
sizeof(bntann1_t); memcpy(mem + k, idx->bns->anns, x); k += x; for (i = 0; i < idx->bns->n_seqs; ++i) { x = strlen(idx->bns->anns[i].name) + 1; memcpy(mem + k, idx->bns->anns[i].name, x); k += x; x = strlen(idx->bns->anns[i].anno) + 1; memcpy(mem + k, idx->bns->anns[i].anno, x); k += x; free(idx->bns->anns[i].name); free(idx->bns->anns[i].anno); } free(idx->bns->anns); // copy idx->pac x = idx->bns->l_pac/4+1; mem = realloc(mem, k + x); memcpy(mem + k, idx->pac, x); k += x; free(idx->bns); idx->bns = 0; free(idx->pac); idx->pac = 0; return bwa_mem2idx(k, mem, idx); } /*********************** * SAM header routines * ***********************/ void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line) { int i, n_SQ = 0; extern char *bwa_pg; if (hdr_line) { const char *p = hdr_line; while ((p = strstr(p, "@SQ\t")) != 0) { if (p == hdr_line || *(p-1) == '\n') ++n_SQ; p += 4; } } if (n_SQ == 0) { for (i = 0; i < bns->n_seqs; ++i) { err_printf("@SQ\tSN:%s\tLN:%d", bns->anns[i].name, bns->anns[i].len); if (bns->anns[i].is_alt) err_printf("\tAH:*\n"); else err_fputc('\n', stdout); } } else if (n_SQ != bns->n_seqs && bwa_verbose >= 2) fprintf(stderr, "[W::%s] %d @SQ lines provided with -H; %d sequences in the index. 
Continue anyway.\n", __func__, n_SQ, bns->n_seqs); if (hdr_line) err_printf("%s\n", hdr_line); if (bwa_pg) err_printf("%s\n", bwa_pg); } static char *bwa_escape(char *s) { char *p, *q; for (p = q = s; *p; ++p) { if (*p == '\\') { ++p; if (*p == 't') *q++ = '\t'; else if (*p == 'n') *q++ = '\n'; else if (*p == 'r') *q++ = '\r'; else if (*p == '\\') *q++ = '\\'; } else *q++ = *p; } *q = '\0'; return s; } char *bwa_set_rg(const char *s) { char *p, *q, *r, *rg_line = 0; memset(bwa_rg_id, 0, 256); if (strstr(s, "@RG") != s) { if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] the read group line is not started with @RG\n", __func__); goto err_set_rg; } if (strstr(s, "\t") != NULL) { if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] the read group line contained literal characters -- replace with escaped tabs: \\t\n", __func__); goto err_set_rg; } rg_line = strdup(s); bwa_escape(rg_line); if ((p = strstr(rg_line, "\tID:")) == 0) { if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] no ID within the read group line\n", __func__); goto err_set_rg; } p += 4; for (q = p; *q && *q != '\t' && *q != '\n'; ++q); if (q - p + 1 > 256) { if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] @RG:ID is longer than 255 characters\n", __func__); goto err_set_rg; } for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q) *r++ = *q; return rg_line; err_set_rg: free(rg_line); return 0; } char *bwa_insert_header(const char *s, char *hdr) { int len = 0; if (s == 0 || s[0] != '@') return hdr; if (hdr) { len = strlen(hdr); hdr = realloc(hdr, len + strlen(s) + 2); hdr[len++] = '\n'; strcpy(hdr + len, s); } else hdr = strdup(s); bwa_escape(hdr + len); return hdr; } bwa-0.7.17/bwa.h000066400000000000000000000037741317342117100133140ustar00rootroot00000000000000#ifndef BWA_H_ #define BWA_H_ #include #include "bntseq.h" #include "bwt.h" #define BWA_IDX_BWT 0x1 #define BWA_IDX_BNS 0x2 #define BWA_IDX_PAC 0x4 #define BWA_IDX_ALL 0x7 #define BWA_CTL_SIZE 0x10000 #define BWTALGO_AUTO 0 #define BWTALGO_RB2 1 
#define BWTALGO_BWTSW 2 #define BWTALGO_IS 3 typedef struct { bwt_t *bwt; // FM-index bntseq_t *bns; // information on the reference sequences uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base int is_shm; int64_t l_mem; uint8_t *mem; } bwaidx_t; typedef struct { int l_seq, id; char *name, *comment, *seq, *qual, *sam; } bseq1_t; extern int bwa_verbose; extern char bwa_rg_id[256]; #ifdef __cplusplus extern "C" { #endif bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2]); void bwa_fill_scmat(int a, int b, int8_t mat[25]); uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM); uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM); int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_size); char *bwa_idx_infer_prefix(const char *hint); bwt_t *bwa_idx_load_bwt(const char *hint); bwaidx_t *bwa_idx_load_from_shm(const char *hint); bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which); bwaidx_t *bwa_idx_load(const char *hint, int which); void bwa_idx_destroy(bwaidx_t *idx); int bwa_idx2mem(bwaidx_t *idx); int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx); void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line); char *bwa_set_rg(const char *s); char *bwa_insert_header(const char *s, char *hdr); #ifdef __cplusplus } #endif #endif bwa-0.7.17/bwakit/000077500000000000000000000000001317342117100136405ustar00rootroot00000000000000bwa-0.7.17/bwakit/README.md000066400000000000000000000135011317342117100151170ustar00rootroot00000000000000## Introduction Bwakit is a self-consistent installation-free 
package of scripts and precompiled binaries, providing an end-to-end solution to read mapping. In addition to the basic mapping functionality implemented in bwa, bwakit is able to generate a proper human reference genome and to take advantage of ALT contigs, if present, to improve read mapping and to perform HLA typing for high-coverage human data. It can remap name- or coordinate-sorted BAM with read group and barcode information retained. Bwakit also *optionally* trims adapters (via [trimadap][ta]), marks duplicates (via [samblaster][sb]) and sorts the final alignment (via [samtools][smtl]). Bwakit has two entry scripts: `run-gen-ref` which downloads and generates human reference genomes, and `run-bwamem` which prints mapping command lines on the standard output that can be piped to `sh` to execute. The two scripts will call other programs or use data in `bwa.kit`. The following shows an example of how to use bwakit: ```sh # Download the bwakit-0.7.12 binary package (download link may change) wget -O- http://sourceforge.net/projects/bio-bwa/files/bwakit/bwakit-0.7.12_x64-linux.tar.bz2/download \ | gzip -dc | tar xf - # Generate the GRCh38+ALT+decoy+HLA and create the BWA index bwa.kit/run-gen-ref hs38DH # download GRCh38 and write hs38DH.fa bwa.kit/bwa index hs38DH.fa # create BWA index # mapping bwa.kit/run-bwamem -o out -H hs38DH.fa read1.fq read2.fq | sh ``` The last mapping command line will generate the following files: * `out.aln.bam`: unsorted alignments with ALT-aware mapping quality. In this file, one read may be placed on multiple overlapping ALT contigs at the same time even if the read is mapped better to some contigs than others. This makes it possible to analyze each contig independently of others. * `out.hla.top`: best genotypes for HLA-A, -B, -C, -DQA1, -DQB1 and -DRB1 genes. * `out.hla.all`: other possible genotypes on the six HLA genes. * `out.log.*`: bwa-mem, samblaster and HLA typing log files. Bwakit can be [downloaded here][res].
It is only available for x86_64-linux. The scripts in the package are available in the [bwa/bwakit][kit] directory. Packaging is done manually for now. ## Limitations * HLA typing only works for high-coverage human data. The typing accuracy can still be improved. We encourage researchers to develop better HLA typing tools based on the intermediate output of bwakit (for each HLA gene included in the index, bwakit writes all reads matching it in a separate file). * Duplicate marking only works when all reads from a single paired-end library are provided as the input. This limitation is the necessary tradeoff of fast MarkDuplicate provided by samblaster. * The adapter trimmer is chosen as it is fast, pipe friendly and does not discard reads. However, it is conservative and suboptimal. If this is a concern, it is recommended to preprocess input reads with a more sophisticated adapter trimmer. We also hope existing trimmers can be modified to operate on an interleaved FASTQ stream. We will replace trimadap once a better trimmer meets our needs. * Bwakit can be memory demanding depending on the functionality invoked. For 30X human data, bwa-mem takes about 11GB RAM with 32 threads, samblaster uses close to 10GB and BAM shuffling (if the input is sorted BAM) uses several GB. In the current setting, sorting uses about 10GB. ## Package Contents ``` bwa.kit |-- README.md This README file. |-- run-bwamem *Entry script* for the entire mapping pipeline. |-- bwa *BWA binary* |-- k8 Interpreter for *.js scripts. |-- bwa-postalt.js Post-process alignments to ALT contigs/decoys/HLA genes. |-- htsbox Used by run-bwamem for shuffling BAMs and BAM=>FASTQ. |-- samblaster MarkDuplicates for reads from the same library. v0.1.20 |-- samtools SAMtools for sorting and SAM=>BAM conversion. v1.1 |-- seqtk For FASTQ manipulation. |-- trimadap Trim Illumina PE sequencing adapters. | |-- run-gen-ref *Entry script* for generating human reference genomes.
|-- resource-GRCh38 Resources for generating GRCh38 | |-- hs38DH-extra.fa Decoy and HLA gene sequences. Used by run-gen-ref. | `-- hs38DH.fa.alt ALT-to-GRCh38 alignment. Used by run-gen-ref. | |-- run-HLA HLA typing for sequences extracted by bwa-postalt.js. |-- typeHLA.sh Type one HLA-gene. Called by run-HLA. |-- typeHLA.js HLA typing from exon-to-contig alignment. Used by typeHLA.sh. |-- typeHLA-selctg.js Select contigs overlapping HLA exons. Used by typeHLA.sh. |-- fermi2.pl Fermi2 wrapper. Used by typeHLA.sh for de novo assembly. |-- fermi2 Fermi2 binary. Used by fermi2.pl. |-- ropebwt2 RopeBWT2 binary. Used by fermi2.pl. |-- resource-human-HLA Resources for HLA typing | |-- HLA-ALT-exons.bed Exonic regions of HLA ALT contigs. Used by typeHLA.sh. | |-- HLA-CDS.fa CDS of HLA-{A,B,C,DQA1,DQB1,DRB1} genes from IMGT/HLA-3.18.0. | |-- HLA-ALT-type.txt HLA types for each HLA ALT contig. Not used. | `-- HLA-ALT-idx BWA indices of each HLA ALT contig. Used by typeHLA.sh | `-- (...) | `-- doc BWA documentations |-- bwa.1 Manpage |-- NEWS.md Release Notes |-- README.md GitHub README page `-- README-alt.md Documentation for ALT mapping ``` [res]: https://sourceforge.net/projects/bio-bwa/files/bwakit [sb]: https://github.com/GregoryFaust/samblaster [ta]: https://github.com/lh3/seqtk/blob/master/trimadap.c [smtl]: http://www.htslib.org [kit]: https://github.com/lh3/bwa/tree/master/bwakit bwa-0.7.17/bwakit/bwa-postalt.js000066400000000000000000000430711317342117100164400ustar00rootroot00000000000000/***************************************************************** * The K8 Javascript interpreter is required to run this script. 
* * * * Source code: https://github.com/attractivechaos/k8 * * Binary: http://sourceforge.net/projects/lh3/files/k8/ * * * * Data file used for generating GRCh38 ALT alignments: * * * * http://sourceforge.net/projects/bio-bwa/files/ * *****************************************************************/ /****************** *** From k8.js *** ******************/ // Parse command-line options. A BSD getopt() clone in javascript. var getopt = function(args, ostr) { var oli; // option letter list index if (typeof(getopt.place) == 'undefined') getopt.ind = 0, getopt.arg = null, getopt.place = -1; if (getopt.place == -1) { // update scanning pointer if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') { getopt.place = -1; return null; } if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--" ++getopt.ind; getopt.place = -1; return null; } } var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) { if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null. 
if (getopt.place < 0) ++getopt.ind; return '?'; } if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument getopt.arg = null; if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1; } else { // need an argument if (getopt.place >= 0 && getopt.place < args[getopt.ind].length) getopt.arg = args[getopt.ind].substr(getopt.place); else if (args.length <= ++getopt.ind) { // no arg getopt.place = -1; if (ostr.length > 0 && ostr.charAt(0) == ':') return ':'; return '?'; } else getopt.arg = args[getopt.ind]; // white space getopt.place = -1; ++getopt.ind; } return optopt; } // reverse a string Bytes.prototype.reverse = function() { for (var i = 0; i < this.length>>1; ++i) { var tmp = this[i]; this[i] = this[this.length - i - 1]; this[this.length - i - 1] = tmp; } } // reverse complement a DNA string Bytes.prototype.revcomp = function() { if (Bytes.rctab == null) { var s1 = 'WSATUGCYRKMBDHVNwsatugcyrkmbdhvn'; var s2 = 'WSTAACGRYMKVHDBNwstaacgrymkvhdbn'; Bytes.rctab = []; for (var i = 0; i < 256; ++i) Bytes.rctab[i] = 0; for (var i = 0; i < s1.length; ++i) Bytes.rctab[s1.charCodeAt(i)] = s2.charCodeAt(i); } for (var i = 0; i < this.length>>1; ++i) { var tmp = this[this.length - i - 1]; this[this.length - i - 1] = Bytes.rctab[this[i]]; this[i] = Bytes.rctab[tmp]; } if (this.length&1) this[this.length>>1] = Bytes.rctab[this[this.length>>1]]; } // create index for a list of intervals for fast interval queries; ported from bedidx.c in samtools function intv_ovlp(intv, bits) { if (typeof bits == "undefined") bits = 13; intv.sort(function(a,b) {return a[0]-b[0];}); // create the index var idx = [], max = 0; for (var i = 0; i < intv.length; ++i) { var b = intv[i][0]>>bits; var e = (intv[i][1]-1)>>bits; if (b != e) { for (var j = b; j <= e; ++j) if (idx[j] == null) idx[j] = i; } else if (idx[b] == null) idx[b] = i; max = max > e? 
max : e; } // closure return function(_b, _e) { var x = _b >> bits; if (x > max) return []; var off = idx[x]; if (off == null) { var i; for (i = ((_e - 1) >> bits) - 1; i >= 0; --i) if (idx[i] != null) break; off = i < 0? 0 : idx[i]; } var ovlp = []; for (var i = off; i < intv.length && intv[i][0] < _e; ++i) if (intv[i][1] > _b) ovlp.push(intv[i]); return ovlp; } } var re_cigar = /(\d+)([MIDSHN])/g; /****************************** *** Generate ALT alignment *** ******************************/ // given a pos on ALT and the ALT-to-REF CIGAR, find the pos on REF function cigar2pos(cigar, pos) { var x = 0, y = 0; for (var i = 0; i < cigar.length; ++i) { var op = cigar[i][0], len = cigar[i][1]; if (op == 'M') { if (y <= pos && pos < y + len) return x + (pos - y); x += len, y += len; } else if (op == 'D') { x += len; } else if (op == 'I') { if (y <= pos && pos < y + len) return x; y += len; } else if (op == 'S' || op == 'H') { if (y <= pos && pos < y + len) return -1; y += len; } } return -1; } // Parse a hit. $s is an array that looks something like ["chr1", "+12345", "100M", 5] // Return an object keeping various information about the alignment. function parse_hit(s, opt) { var h = {}; h.ctg = s[0]; h.start = parseInt(s[1].substr(1)) - 1; h.rev = (s[1].charAt(0) == '-'); h.cigar = s[2]; h.NM = parseInt(s[3]); h.hard = false; var m, l_ins, n_ins, l_del, n_del, l_match, l_skip, l_clip; l_ins = l_del = n_ins = n_del = l_match = l_skip = l_clip = 0; while ((m = re_cigar.exec(h.cigar)) != null) { var l = parseInt(m[1]); if (m[2] == 'M') l_match += l; else if (m[2] == 'D') ++n_del, l_del += l; else if (m[2] == 'I') ++n_ins, l_ins += l; else if (m[2] == 'N') l_skip += l; else if (m[2] == 'H' || m[2] == 'S') { l_clip += l; if (m[2] == 'H') h.hard = true; } } h.end = h.start + l_match + l_del + l_skip; h.NM = h.NM > l_del + l_ins? 
h.NM : l_del + l_ins; h.score = Math.floor((opt.a * l_match - (opt.a + opt.b) * (h.NM - l_del - l_ins) - opt.o * (n_del + n_ins) - opt.e * (l_del + l_ins)) / opt.a + .499); h.l_query = l_match + l_ins + l_clip; return h; } function print_buffer(buf2, fp_hla, hla) // output alignments { if (buf2.length == 0) return; for (var i = 0; i < buf2.length; ++i) print(buf2[i].join("\t")); if (fp_hla != null) { var name = buf2[0][0] + '/' + (buf2[0][1]>>6&3) + ((buf2[0][1]&16)? '-' : '+'); for (var x in hla) { if (fp_hla[x] != null); fp_hla[x].write('@' + name + '\n' + buf2[0][9] + '\n+\n' + buf2[0][10] + '\n'); } } } function collect_hla_hits(idx, ctg, start, end, hla) // collect reads hit to HLA genes { var m, ofunc = idx[ctg]; if (ofunc == null) return; var ovlp_alt = ofunc(start, end); for (var i = 0; i < ovlp_alt.length; ++i) if ((m = /^(HLA-[^\s\*]+)\*\d+/.exec(ovlp_alt[i][2])) != null) hla[m[1]] = true; } function bwa_postalt(args) { var version = "r985"; var c, opt = { a:1, b:4, o:6, e:1, min_mapq:10, min_sc:90, max_nm_sc:10, min_pa_ratio:1 }; while ((c = getopt(args, 'vp:r:')) != null) { if (c == 'p') opt.pre = getopt.arg; else if (c == 'r') opt.min_pa_ratio = parseFloat(getopt.arg); else if (c == 'v') { print(version); exit(0); } } if (opt.min_pa_ratio > 1.) 
opt.min_pa_ratio = 1.; if (args.length == getopt.ind) { print(""); print("Usage: k8 bwa-postalt.js [options] [aln.sam]\n"); print("Options: -p STR prefix of output files containting sequences matching HLA genes [null]"); print(" -r FLOAT reduce mapQ to 0 if not overlapping lifted best and pa= 0) { var line = buf.toString(); if (line.charAt(0) == '@') continue; var t = line.split("\t"); if (t.length < 11) continue; // incomplete lines is_alt[t[0]] = true; var pos = parseInt(t[3]) - 1; var flag = parseInt(t[1]); if ((flag&4) || t[2] == '*') continue; var m, cigar = [], l_qaln = 0, l_tlen = 0, l_qclip = 0; if ((m = /^(HLA-[^\s\*]+)\*\d+/.exec(t[0])) != null) { // read HLA contigs if (hla_ctg[m[1]] == null) hla_ctg[m[1]] = 0; ++hla_ctg[m[1]]; hla_chr = t[2]; } while ((m = re_cigar.exec(t[5])) != null) { var l = parseInt(m[1]); cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip if (m[2] == 'M') l_qaln += l, l_tlen += l; else if (m[2] == 'I') l_qaln += l; else if (m[2] == 'S' || m[2] == 'H') l_qclip += l; else if (m[2] == 'D' || m[2] == 'N') l_tlen += l; } var j = flag&16? cigar.length-1 : 0; var start = cigar[j][0] == 'S'? cigar[j][1] : 0; if (intv_alt[t[0]] == null) intv_alt[t[0]] = []; intv_alt[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, pos - 1, cigar, pos + l_tlen]); if (intv_pri[t[2]] == null) intv_pri[t[2]] = []; intv_pri[t[2]].push([pos, pos + l_tlen, t[0]]); } file.close(); var idx_alt = {}, idx_pri = {}; for (var ctg in intv_alt) idx_alt[ctg] = intv_ovlp(intv_alt[ctg]); for (var ctg in intv_pri) idx_pri[ctg] = intv_ovlp(intv_pri[ctg]); // initialize the list of HLA contigs var fp_hla = null; if (opt.pre) { fp_hla = {}; for (var h in hla_ctg) fp_hla[h] = new File(opt.pre + '.' + h + '.fq', "w"); } // process SAM var buf2 = [], hla = {}; file = args.length - getopt.ind >= 2? 
new File(args[getopt.ind+1]) : new File(); while (file.readline(buf) >= 0) { var m, line = buf.toString(); if (line.charAt(0) == '@') { // print and then skip the header line print(line); continue; } var t = line.split("\t"); t[1] = parseInt(t[1]); t[3] = parseInt(t[3]); t[4] = parseInt(t[4]); // print bufferred reads if (buf2.length && (buf2[0][0] != t[0] || (buf2[0][1]&0xc0) != (t[1]&0xc0))) { print_buffer(buf2, fp_hla, hla); buf2 = [], hla = {}; } // skip unmapped lines if (t[1]&4) { buf2.push(t); continue; } // parse the reported hit var NM = (m = /\tNM:i:(\d+)/.exec(line)) == null? '0' : m[1]; var flag = t[1]; var h = parse_hit([t[2], ((flag&16)?'-':'+') + t[3], t[5], NM], opt); if (t[2] == hla_chr) collect_hla_hits(idx_pri, h.ctg, h.start, h.end, hla); if (h.hard) { // the following does not work with hard clipped alignments buf2.push(t); continue; } var hits = [h]; // parse hits in the XA tag if ((m = /\tXA:Z:(\S+)/.exec(line)) != null) { var XA_strs = m[1].split(";"); for (var i = 0; i < XA_strs.length; ++i) if (XA_strs[i] != '') // as the last symbol in an XA tag is ";", the last split is an empty string hits.push(parse_hit(XA_strs[i].split(","), opt)); } // check if there are ALT hits var has_alt = false; for (var i = 0; i < hits.length; ++i) if (is_alt[hits[i].ctg] != null) { has_alt = true; break; } if (!has_alt) { buf2.push(t); continue; } // lift mapping positions to the primary assembly var n_rpt_lifted = 0, rpt_lifted = null; for (var i = 0; i < hits.length; ++i) { var a, h = hits[i]; if (idx_alt[h.ctg] == null || (a = idx_alt[h.ctg](h.start, h.end)) == null || a.length == 0) continue; // find the approximate position on the primary assembly var lifted = []; for (var j = 0; j < a.length; ++j) { var s, e; if (!a[j][4]) { // ALT is mapped to the forward strand of the primary assembly s = cigar2pos(a[j][6], h.start); e = cigar2pos(a[j][6], h.end - 1) + 1; } else { s = cigar2pos(a[j][6], a[j][2] - h.end); e = cigar2pos(a[j][6], a[j][2] - h.start - 1) + 
1; } if (s < 0 || e < 0) continue; // read is mapped to clippings in the ALT-to-chr alignment s += a[j][5]; e += a[j][5]; lifted.push([a[j][3], (h.rev!=a[j][4]), s, e]); if (i == 0) ++n_rpt_lifted; } if (i == 0 && n_rpt_lifted == 1) rpt_lifted = lifted[0].slice(0); if (lifted.length) hits[i].lifted = lifted; } // prepare for hits grouping for (var i = 0; i < hits.length; ++i) { // set keys for sorting if (hits[i].lifted != null) // TODO: only the first element in lifted[] is used hits[i].pctg = hits[i].lifted[0][0], hits[i].pstart = hits[i].lifted[0][2], hits[i].pend = hits[i].lifted[0][3]; else hits[i].pctg = hits[i].ctg, hits[i].pstart = hits[i].start, hits[i].pend = hits[i].end; hits[i].i = i; // keep the original index } // group hits based on the lifted positions on non-ALT sequences if (hits.length > 1) { hits.sort(function(a,b) { return a.pctg != b.pctg? (a.pctg < b.pctg? -1 : 1) : a.pstart - b.pstart }); var last_chr = null, end = 0, g = -1; for (var i = 0; i < hits.length; ++i) { if (last_chr != hits[i].pctg) ++g, last_chr = hits[i].pctg, end = 0; else if (hits[i].pstart >= end) ++g; hits[i].g = g; end = end > hits[i].pend? 
end : hits[i].pend; } } else hits[0].g = 0; // find the index and group id of the reported hit; find the size of the reported group var reported_g = null, reported_i = null, n_group0 = 0; if (hits.length > 1) { for (var i = 0; i < hits.length; ++i) if (hits[i].i == 0) reported_g = hits[i].g, reported_i = i; for (var i = 0; i < hits.length; ++i) if (hits[i].g == reported_g) ++n_group0; } else { if (is_alt[hits[0].ctg] == null) { // no need to go through the following if the single hit is non-ALT buf2.push(t); continue; } reported_g = reported_i = 0, n_group0 = 1; } // re-estimate mapping quality if necessary var mapQ, ori_mapQ = t[4]; if (n_group0 > 1) { var group_max = []; for (var i = 0; i < hits.length; ++i) { var g = hits[i].g; if (group_max[g] == null || group_max[g][0] < hits[i].score) group_max[g] = [hits[i].score, g]; } if (group_max.length > 1) group_max.sort(function(x,y) {return y[0]-x[0]}); if (group_max[0][1] == reported_g) { // the best hit is the hit reported in SAM mapQ = group_max.length == 1? 60 : 6 * (group_max[0][0] - group_max[1][0]); } else mapQ = 0; mapQ = mapQ < 60? mapQ : 60; if (idx_alt[t[2]] == null) mapQ = mapQ < ori_mapQ? mapQ : ori_mapQ; else mapQ = mapQ > ori_mapQ? mapQ : ori_mapQ; } else mapQ = t[4]; // find out whether the read is overlapping HLA genes if (hits[reported_i].pctg == hla_chr) { var rpt_start = 1<<30, rpt_end = 0; for (var i = 0; i < hits.length; ++i) { var h = hits[i]; if (h.g == reported_g) { rpt_start = rpt_start < h.pstart? rpt_start : h.pstart; rpt_end = rpt_end > h.pend ? rpt_end : h.pend; } } collect_hla_hits(idx_pri, hla_chr, rpt_start, rpt_end, hla); } // adjust the mapQ of the primary hits if (n_rpt_lifted <= 1) { var l = n_rpt_lifted == 1? 
rpt_lifted : null; for (var i = 0; i < buf2.length; ++i) { var s = buf2[i], is_ovlp = true; if (l != null) { if (l[0] != s[2]) is_ovlp = false; // different chr else if (((s[1]&16) != 0) != l[1]) is_ovlp = false; // different strand else { var start = s[3] - 1, end = start; while ((m = re_cigar.exec(t[5])) != null) if (m[2] == 'M' || m[2] == 'D' || m[2] == 'N') end += parseInt(m[1]); if (!(start < l[3] && l[2] < end)) is_ovlp = false; // no overlap } } else is_ovlp = false; // get the "pa" tag if present var om = -1, pa = 10.; for (var j = 11; j < s.length; ++j) if ((m = /^om:i:(\d+)/.exec(s[j])) != null) om = parseInt(m[1]); else if ((m = /^pa:f:(\S+)/.exec(s[j])) != null) pa = parseFloat(m[1]); if (is_ovlp) { // overlapping the lifted hit if (om > 0) s[4] = om; s[4] = s[4] < mapQ? s[4] : mapQ; } else if (pa < opt.min_pa_ratio) { // not overlapping; has a small pa if (om < 0) s.push("om:i:" + s[4]); s[4] = 0; } } } // generate lifted_str for (var i = 0; i < hits.length; ++i) { if (hits[i].lifted && hits[i].lifted.length) { var u = '', lifted = hits[i].lifted; for (var j = 0; j < lifted.length; ++j) u += lifted[j][0] + "," + lifted[j][2] + "," + lifted[j][3] + "," + (lifted[j][1]?'-':'+') + ";"; hits[i].lifted_str = u; } } // stage the reported hit t[4] = mapQ; if (n_group0 > 1) t.push("om:i:"+ori_mapQ); if (hits[reported_i].lifted_str) t.push("lt:Z:" + hits[reported_i].lifted_str); buf2.push(t); // stage the hits generated from the XA tag var cnt = 0, rs = null, rq = null; // rq: reverse quality; rs: reverse complement sequence var rg = (m = /\t(RG:Z:\S+)/.exec(line)) != null? 
m[1] : null; for (var i = 0; i < hits.length; ++i) { if (hits[i].g != reported_g || i == reported_i) continue; if (idx_alt[hits[i].ctg] == null) continue; var s = [t[0], 0, hits[i].ctg, hits[i].start+1, mapQ, hits[i].cigar, t[6], t[7], t[8]]; if (t[6] == '=' && s[2] != t[2]) s[6] = t[2]; // print sequence/quality and set the rev flag if (hits[i].rev == hits[reported_i].rev) { s.push(t[9], t[10]); s[1] = flag | 0x800; } else { // we need to write the reverse sequence if (rs == null || rq == null) { aux.length = 0; aux.set(t[9], 0); aux.revcomp(); rs = aux.toString(); aux.set(t[10],0); aux.reverse(); rq = aux.toString(); } s.push(rs, rq); s[1] = (flag ^ 0x10) | 0x800; } s.push("NM:i:" + hits[i].NM); if (hits[i].lifted_str) s.push("lt:Z:" + hits[i].lifted_str); if (rg != null) s.push(rg); buf2.push(s); } } print_buffer(buf2, fp_hla, hla); file.close(); if (fp_hla != null) for (var h in fp_hla) fp_hla[h].close(); buf.destroy(); aux.destroy(); } bwa_postalt(arguments); bwa-0.7.17/bwakit/run-HLA000077500000000000000000000006331317342117100147760ustar00rootroot00000000000000#!/bin/bash ctg_opt="" if [ $# -gt 1 ] && [ $1 == '-A' ]; then ctg_opt="-A" shift fi if [ $# -eq 0 ]; then echo "Usage: $0 " exit 1 fi for f in $1.HLA-*.fq; do gene=`echo $f | perl -pe 's/^.*(HLA-[A-Z]+[0-9]*).*fq$/$1/'` echo -e "\n*** Processing gene $gene...\n" >&2 `dirname $0`/typeHLA.sh $ctg_opt $1 $gene done ls $1.HLA-*.gt | xargs -i echo grep ^GT {} \| head -1 | sh | sed "s,^GT,$1," bwa-0.7.17/bwakit/run-bwamem000077500000000000000000000150561317342117100156470ustar00rootroot00000000000000#!/usr/bin/env perl use strict; use warnings; use Getopt::Std; my %opts = (t=>1); getopts("MPSadskHo:R:x:t:", \%opts); die(' Usage: run-bwamem [options] [file2] Options: -o STR prefix for output files [inferred from input] -R STR read group header line such as \'@RG\tID:foo\tSM:bar\' [null] -x STR read type: pacbio, ont2d or intractg [default] intractg: intra-species contig (kb query, highly similar) pacbio: 
pacbio subreads (~10kb query, high error rate) ont2d: Oxford Nanopore reads (~10kb query, higher error rate) -t INT number of threads [1] -H apply HLA typing -a trim HiSeq2000/2500 PE resequencing adapters (via trimadap) -d mark duplicate (via samblaster) -S for BAM input, don\'t shuffle -s sort the output alignment (via samtools; requring more RAM) -k keep temporary files generated by typeHLA -M mark shorter split hits as secondary Examples: * Map paired-end reads to GRCh38+ALT+decoy+HLA and perform HLA typing: run-bwamem -o prefix -t8 -HR"@RG\tID:foo\tSM:bar" hs38DH.fa read1.fq.gz read2.fq.gz Note: HLA typing is only effective for high-coverage data. The typing accuracy varies with the quality of input. It is only intended for research purpose, not for diagnostic. * Remap coordinate-sorted BAM, transfer read groups tags, trim Illumina PE adapters and sort the output. The BAM may contain single-end or paired-end reads, or a mixture of the two types. Specifying -R stops read group transfer. run-bwamem -sao prefix hs38DH.fa old-srt.bam Note: the adaptor trimmer included in bwa.kit is chosen because it fits the current mapping pipeline better. It is conservative and suboptimal. A more sophisticated trimmer is recommended if this becomes a concern. * Remap name-grouped BAM and mark duplicates: run-bwamem -Sdo prefix hs38DH.fa old-unsrt.bam Note: streamed duplicate marking requires all reads from a single paired-end library to be aligned at the same time. Output files: {-o}.aln.bam - final alignment {-o}.hla.top - best genotypes for the 6 classical HLA genes (if there are HLA-* contigs) {-o}.hla.all - additional HLA genotypes consistent with data {-o}.log.* - log files ') if @ARGV < 2; my $idx = $ARGV[0]; my $exepath = $0 =~/^\S+\/[^\/\s]+/? $0 : &which($0); my $root = $0 =~/^(\S+)\/[^\/\s]+/? $1 : undef; $root = $exepath =~/^(\S+)\/[^\/\s]+/? 
$1 : undef if !defined($root);
die "ERROR: failed to locate the 'bwa.kit' directory\n" if !defined($root);

# all five BWA index files must be present
die("ERROR: failed to locate the BWA index. Please run '$root/bwa index -p $idx ref.fa'.\n")
  unless (-f "$idx.bwt" && -f "$idx.pac" && -f "$idx.sa" && -f "$idx.ann" && -f "$idx.amb");

if (@ARGV >= 3 && $ARGV[1] =~ /\.(bam|sam|sam\.gz)$/) {
	warn("WARNING: for SAM/BAM input, only the first sequence file is used.\n");
	# FIX: the original "@ARGV = 2;" replaced the whole argument list with the single
	# value 2, losing both the index ($ARGV[0]) and the input file ($ARGV[1]) that are
	# used below. Truncate the list to its first two elements instead.
	$#ARGV = 1;
}

# NOTE(review): getopts() above only declares an upper-case -P, so $opts{p} never appears
# to be set and the $opts{p} tests in this section look like dead code -- confirm the
# intended option before changing them.
if (defined($opts{p}) && @ARGV >= 3) {
	warn("WARNING: option -P is ignored as there are two input sequence files.\n");
	delete $opts{p};
}

# output prefix: -o if given, else the common prefix of the two input names,
# else the single input name with a recognised extension removed
my $prefix;
if (defined $opts{o}) {
	$prefix = $opts{o};
} elsif (@ARGV >= 3) {
	my $len = length($ARGV[1]) < length($ARGV[2])? length($ARGV[1]) : length($ARGV[2]);
	my $i;
	for ($i = 0; $i < $len; ++$i) {
		last if substr($ARGV[1], $i, 1) ne substr($ARGV[2], $i, 1)
	}
	$prefix = substr($ARGV[1], 0, $i) if $i > 0;
} elsif ($ARGV[1] =~ /^(\S+)\.(fastq|fq|fasta|fa|mag|mag\.gz|fasta\.gz|fa\.gz|fastq\.gz|fq\.gz|bam)$/) {
	$prefix = $1;
}
die("ERROR: failed to identify the prefix for output. Please specify -o.\n") unless defined($prefix);

# total input size in bytes; .gz/.bam files are scaled by $comp_ratio
my $size = 0;
my $comp_ratio = 3.;
for my $f (@ARGV[1..$#ARGV]) {
	my @a = stat($f);
	my $s = $a[7]; # element 7 of stat() is the file size
	die("ERROR: failed to read file $f\n") if !defined($s);
	$s *= $comp_ratio if $f =~ /\.(gz|bam)$/;
	$size += int($s) + 1;
}

my $is_pe = (defined($opts{p}) || @ARGV >= 3)? 1 : 0;
my $is_bam = $ARGV[1] =~ /\.bam$/? 1 : 0;

# with -x, the duplicate-marking (-d) and adapter-trimming (-a) options are dropped
if (defined($opts{x})) {
	delete($opts{d}); delete($opts{a}); delete $opts{p};
}

# for BAM input, find @RG header lines
my @RG_lines = ();
if ($is_bam && !defined($opts{R})) {
	my $fh;
	open($fh, "$root/samtools view -H $ARGV[1] |") || die;
	while (<$fh>) {
		chomp;
		if (/^\@RG\t/) {
			s/\t/\\t/g; # turn real tabs into the two-character sequence \t
			push(@RG_lines, "-H'$_'");
		}
	}
	close($fh);
}

warn("WARNING: many programs require read groups.
Please specify with -R if you can.\n") if !defined($opts{R}) && @RG_lines == 0; my $cmd = ''; if ($is_bam) { my $cmd_sam2bam = "cat $ARGV[1] \\\n"; my $ntmps = int($size / 4e9) + 1; my $cmd_shuf = !defined($opts{S})? " | $root/htsbox bamshuf -uOn$ntmps - $prefix.shuf \\\n" : ""; my $bam2fq_opt = @RG_lines > 0? " -t" : ""; my $cmd_bam2fq = " | $root/htsbox bam2fq -O$bam2fq_opt - \\\n"; $cmd = $cmd_sam2bam . $cmd_shuf . $cmd_bam2fq; } elsif (@ARGV >= 3) { $cmd = "$root/seqtk mergepe $ARGV[1] $ARGV[2] \\\n"; } else { $cmd = "cat $ARGV[1] \\\n"; } my $bwa_opts = "-p " . ($opts{t} > 1? "-t$opts{t} " : "") . (defined($opts{x})? "-x $opts{x} " : "") . (defined($opts{R})? "-R'$opts{R}' " : "") . (defined($opts{M})? "-M " : ""); $bwa_opts .= join(" ", @RG_lines) . " -C " if @RG_lines > 0; $cmd .= " | $root/trimadap 2> $prefix.log.trim \\\n" if defined($opts{a}); $cmd .= " | $root/bwa mem $bwa_opts$ARGV[0] - 2> $prefix.log.bwamem \\\n"; $cmd .= " | $root/samblaster 2> $prefix.log.dedup \\\n" if defined($opts{d}); my $has_hla = 0; if (-f "$ARGV[0].alt" && !defined($opts{P})) { my $fh; open($fh, "$ARGV[0].alt") || die; while (<$fh>) { $has_hla = 1 if /^HLA-[^\s\*]+\*\d+/; } close($fh); my $hla_pre = $has_hla? "-p $prefix.hla " : ""; $cmd .= " | $root/k8 $root/bwa-postalt.js $hla_pre$ARGV[0].alt \\\n"; } my $t_sort = $opts{t} < 4? $opts{t} : 4; $cmd .= defined($opts{s})? " | $root/samtools sort -@ $t_sort -m1G - -o $prefix.aln.bam;\n" : " | $root/samtools view -1 - > $prefix.aln.bam;\n"; if ($has_hla && defined($opts{H}) && (!defined($opts{x}) || $opts{x} eq 'intractg')) { $cmd .= "$root/run-HLA ". (defined($opts{x}) && $opts{x} eq 'intractg'? "-A " : "") . "$prefix.hla > $prefix.hla.top 2> $prefix.log.hla;\n"; $cmd .= "touch $prefix.hla.HLA-dummy.gt; cat $prefix.hla.HLA*.gt | grep ^GT | cut -f2- > $prefix.hla.all;\n"; $cmd .= "rm -f $prefix.hla.HLA*;\n" unless defined($opts{k}); } print $cmd; sub which { my $file = shift; my $path = (@_)? 
shift : $ENV{PATH}; return if (!defined($path)); foreach my $x (split(":", $path)) { $x =~ s/\/$//; return "$x/$file" if (-x "$x/$file"); } return; } bwa-0.7.17/bwakit/run-gen-ref000077500000000000000000000033231317342117100157140ustar00rootroot00000000000000#!/bin/bash root=`dirname $0` url38="ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz" url37d5="ftp://ftp.ncbi.nlm.nih.gov/1000genomes/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz" if [ $# -eq 0 ]; then echo "Usage: $0 " echo "Analysis sets:" echo " hs38 primary assembly of GRCh38 (incl. chromosomes, unplaced and unlocalized contigs) and EBV" echo " hs38a hs38 plus ALT contigs" echo " hs38DH hs38a plus decoy contigs and HLA genes (recommended for GRCh38 mapping)" echo " hs37 primary assembly of GRCh37 (used by 1000g phase 1) plus the EBV genome" echo " hs37d5 hs37 plus decoy contigs (used by 1000g phase 3)" echo "" echo "Note: This script downloads human reference genomes. For hs38a and hs38DH, it needs additional" echo " sequences and ALT-to-REF mapping included in the bwa.kit package." exit 1; fi if [ $1 == "hs38DH" ]; then (wget -O- $url38 | gzip -dc; cat $root/resource-GRCh38/hs38DH-extra.fa) > $1.fa [ ! -f $1.fa.alt ] && cp $root/resource-GRCh38/hs38DH.fa.alt $1.fa.alt elif [ $1 == "hs38a" ]; then wget -O- $url38 | gzip -dc > $1.fa [ ! -f $1.fa.alt ] && grep _alt $root/resource-GRCh38/hs38DH.fa.alt > $1.fa.alt elif [ $1 == "hs38" ]; then wget -O- $url38 | gzip -dc | awk '/^>/{f=/_alt/?0:1}f' > $1.fa elif [ $1 == "hs37d5" ]; then wget -O- $url37d5 | gzip -dc > $1.fa 2>/dev/null elif [ $1 == "hs37" ]; then wget -O- $url37d5 | gzip -dc 2>/dev/null | awk '/^>/{f=/>hs37d5/?0:1}f' > $1.fa else echo "ERROR: unknown genome build" fi [ ! 
-f $1.fa.bwt ] && echo -e "\nPlease run 'bwa index $1.fa'...\n" bwa-0.7.17/bwakit/typeHLA-selctg.js000066400000000000000000000032211317342117100167610ustar00rootroot00000000000000var min_ovlp = 30; if (arguments.length < 3) { print("Usage: k8 selctg.js [min_ovlp="+min_ovlp+"]"); exit(1); } if (arguments.length >= 4) min_ovlp = parseInt(arguments[3]); var gene = arguments[0]; var buf = new Bytes(); var h = {}; var file = new File(arguments[1]); while (file.readline(buf) >= 0) { var t = buf.toString().split("\t"); if (t[3] != gene) continue; if (h[t[0]] == null) h[t[0]] = []; h[t[0]].push([parseInt(t[1]), parseInt(t[2])]); } file.close(); var s = {}, re = /(\d+)([MIDSHN])/g; file = new File(arguments[2]); while (file.readline(buf) >= 0) { var line = buf.toString(); var m, t = line.split("\t"); var x = h[t[2]]; if (x == null) continue; var start = parseInt(t[3]) - 1, end = start; while ((m = re.exec(t[5])) != null) // parse CIGAR to get the end position if (m[2] == 'M' || m[2] == 'D') end += parseInt(m[1]); var max_ovlp = 0; for (var i = 0; i < x.length; ++i) { var max_left = x[i][0] > start? x[i][0] : start; var min_rght = x[i][1] < end ? x[i][1] : end; max_ovlp = max_ovlp > min_rght - max_left? max_ovlp : min_rght - max_left; } var AS = null, XS = null; if ((m = /AS:i:(\d+)/.exec(line)) != null) AS = parseInt(m[1]); if ((m = /XS:i:(\d+)/.exec(line)) != null) XS = parseInt(m[1]); if (s[t[0]] == null) s[t[0]] = []; s[t[0]].push([AS, XS, max_ovlp]); } file.close(); buf.destroy(); for (var x in s) { var is_rejected = false, y = s[x]; y.sort(function(a,b) {return b[0]-a[0]}); for (var i = 0; i < y.length && y[i][0] == y[0][0]; ++i) if (y[0][2] < min_ovlp || y[i][0] == y[i][1]) is_rejected = true; if (is_rejected) continue; print(x); } bwa-0.7.17/bwakit/typeHLA.js000066400000000000000000000407301317342117100155100ustar00rootroot00000000000000/***************************************************************** * The K8 Javascript interpreter is required to run this script. 
* * * * Source code: https://github.com/attractivechaos/k8 * * Binary: http://sourceforge.net/projects/lh3/files/k8/ * *****************************************************************/ var getopt = function(args, ostr) { var oli; // option letter list index if (typeof(getopt.place) == 'undefined') getopt.ind = 0, getopt.arg = null, getopt.place = -1; if (getopt.place == -1) { // update scanning pointer if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') { getopt.place = -1; return null; } if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--" ++getopt.ind; getopt.place = -1; return null; } } var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) { if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null. if (getopt.place < 0) ++getopt.ind; return '?'; } if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument getopt.arg = null; if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1; } else { // need an argument if (getopt.place >= 0 && getopt.place < args[getopt.ind].length) getopt.arg = args[getopt.ind].substr(getopt.place); else if (args.length <= ++getopt.ind) { // no arg getopt.place = -1; if (ostr.length > 0 && ostr.charAt(0) == ':') return ':'; return '?'; } else getopt.arg = args[getopt.ind]; // white space getopt.place = -1; ++getopt.ind; } return optopt; } /************************ * Command line parsing * ************************/ var ver = "r19"; var c, thres_len = 50, thres_ratio = .8, thres_nm = 5, thres_frac = .33, dbg = false; // parse command line options while ((c = getopt(arguments, "vdl:n:f:")) != null) { if (c == 'l') thres_len = parseInt(getopt.arg); else if (c == 'n') thres_nm = parseInt(getopt.arg); else if (c == 'd') dbg = true; else if (c == 'f') thres_frac 
= parseFloat(getopt.arg); else if (c == 'v') { print(ver); exit(0); } } if (arguments.length == getopt.ind) { print(""); print("Usage: k8 typeHLA.js [options] \n"); print("Options: -n INT drop a contig if the edit distance to the closest gene is >INT ["+thres_nm+"]"); print(" -l INT drop a contig if its match too short ["+thres_len+"]"); print(" -f FLOAT drop inconsistent contigs if their length = 0) { var m, mm, line = buf.toString(); var t = line.split("\t"); var flag = parseInt(t[1]); // SAM header if (t[0].charAt(0) == '@') { if (t[0] == '@SQ' && (m = /LN:(\d+)/.exec(line)) != null && (mm = /SN:(\S+)/.exec(line)) != null) len[mm[1]] = parseInt(m[1]); continue; } // parse gene name and exon number var gene = null, exon = null; if ((m = /^(HLA-[^\s_]+)_(\d+)/.exec(t[0])) != null) { gene = m[1], exon = parseInt(m[2]) - 1; if (gcnt[exon] == null) gcnt[exon] = {}; gcnt[exon][gene] = true; } if (gene == null || exon == null || t[2] == '*') continue; // parse clipping and aligned length var x = 0, ts = parseInt(t[3]) - 1, te = ts, clip = [0, 0]; while ((m = re_cigar.exec(t[5])) != null) { var l = parseInt(m[1]); if (m[2] == 'M') x += l, te += l; else if (m[2] == 'I') x += l; else if (m[2] == 'D') te += l; else if (m[2] == 'S' || m[2] == 'H') clip[x==0?0:1] = l; } var tl = len[t[2]]; var left = ts < clip[0]? ts : clip[0]; var right = tl - te < clip[1]? tl - te : clip[1]; var qs, qe, ql = clip[0] + x + clip[1]; if (flag & 16) qs = clip[1], qe = ql - clip[0]; else qs = clip[0], qe = ql - clip[1]; var nm = (m = /\tNM:i:(\d+)/.exec(line)) != null? 
parseInt(m[1]) : 0; list.push([t[2], gene, exon, ts, te, nm, left + right, qs, qe, ql]); // left+right should be 0 given a prefix-suffix alignment } buf.destroy(); file.close(); /************************************** * Prepare data structures for typing * **************************************/ // identify the primary exons, the exons associated with most genes var pri_exon = [], n_pri_exons; { var cnt = [], max = 0; // count the number of genes per exon and track the max for (var e = 0; e < gcnt.length; ++e) { if (gcnt[e] != null) { var c = 0, h = gcnt[e]; for (var x in h) ++c; cnt[e] = c; max = max > c? max : c; } else cnt[e] = 0; } warn("- Number of genes for each exon: [" +cnt.join(",") + "]"); // find primary exons var pri_list = []; for (var e = 0; e < cnt.length; ++e) { if (cnt[e] == max) pri_list.push(e + 1); pri_exon[e] = cnt[e] == max? 1 : 0; } warn("- List of primary exon(s): ["+pri_list.join(",")+"]"); n_pri_exons = pri_list.length; } // convert strings to integers (for performance) var ghash = {}, glist = [], chash = {}, clist = [], elist = []; for (var i = 0; i < list.length; ++i) { if (ghash[list[i][1]] == null) { ghash[list[i][1]] = glist.length; glist.push(list[i][1]); } if (chash[list[i][0]] == null) { chash[list[i][0]] = clist.length; clist.push(list[i][0]); } var g = ghash[list[i][1]]; if (elist[g] == null) elist[g] = {}; elist[g][list[i][2]] = true; } // extract the 3rd and 4th digits var gsub = [], gsuf = []; for (var i = 0; i < glist.length; ++i) { var m = /^HLA-[^*\s]+\*\d+:(\d+).*([A-Z]?)$/.exec(glist[i]); gsub[i] = parseInt(m[1]); gsuf[i] = /[A-Z]$/.test(glist[i])? 
1 : 0; } /************************************************* * Collect genes with perfect matches on primary * *************************************************/ // collect exons with fully covered by perfect match(es) var perf_exons = []; function push_perf_exons(matches, last) { matches.sort(function(a, b) { return a[0]-b[0]; }); var cov = 0, start = 0, end = 0; for (var i = 0; i < matches.length; ++i) { if (matches[i][3] > 0) continue; if (matches[i][0] <= end) end = end > matches[i][1]? end : matches[i][1]; else cov += end - start, start = matches[i][0], end = matches[i][1]; } cov += end - start; if (matches[0][2] == cov) { if (perf_exons[last[1]] == null) perf_exons[last[1]] = []; //print(last[0], last[1], ghash[last[0]]); perf_exons[last[1]].push(ghash[last[0]]); } } var last = [null, -1], matches = []; for (var i = 0; i < list.length; ++i) { var li = list[i]; if (last[0] != li[1] || last[1] != li[2]) { if (matches.length) push_perf_exons(matches, last); matches = []; last = [li[1], li[2]]; } matches.push([li[7], li[8], li[9], li[5]+li[6]]); } if (matches.length) push_perf_exons(matches, last); // for each gene, count how many primary exons are perfect var pg_aux_cnt = {}; for (var e = 0; e < perf_exons.length; ++e) { if (!pri_exon[e]) continue; var pe = perf_exons[e]; var n = pe? 
pe.length : 0; for (var i = 0; i < n; ++i) { var g = pe[i]; if (pg_aux_cnt[g] == null) pg_aux_cnt[g] = 1; else ++pg_aux_cnt[g]; } } // find genes with perfect matches on the primary exons var perf_genes = []; for (var g in pg_aux_cnt) if (pg_aux_cnt[g] == n_pri_exons) perf_genes.push(parseInt(g)); warn("- Found " +perf_genes.length+ " genes fully covered by perfect matches on the primary exon(s)"); var h_perf_genes = {}; for (var i = 0; i < perf_genes.length; ++i) { if (dbg) print("PG", glist[perf_genes[i]]); h_perf_genes[perf_genes[i]] = true; } /******************* * Filter hit list * *******************/ // reorganize hits to exons function list2exons(list, flt_flag, perf_hash) { var exons = []; for (var i = 0; i < list.length; ++i) { var li = list[i], c = chash[li[0]], g = ghash[li[1]]; if (flt_flag != null && flt_flag[c] == 1) continue; if (perf_hash != null && !perf_hash[g]) continue; if (exons[li[2]] == null) exons[li[2]] = []; exons[li[2]].push([c, g, li[5] + li[6], li[4] - li[3]]); } return exons; } var exons = list2exons(list), flt_flag = [], ovlp_len = []; for (var c = 0; c < clist.length; ++c) flt_flag[c] = ovlp_len[c] = 0; for (var e = 0; e < exons.length; ++e) { if (!pri_exon[e]) continue; var ee = exons[e]; var max_len = []; for (var c = 0; c < clist.length; ++c) max_len[c] = 0; for (var i = 0; i < ee.length; ++i) { var l = ee[i][3] - ee[i][2]; if (l < 1) l = 1; if (max_len[ee[i][0]] < l) max_len[ee[i][0]] = l; } for (var c = 0; c < clist.length; ++c) ovlp_len[c] += max_len[c]; for (var i = 0; i < ee.length; ++i) flt_flag[ee[i][0]] |= (!h_perf_genes[ee[i][1]] || ee[i][2])? 
1 : 1<<1; } var l_cons = 0, l_incons = 0; for (var c = 0; c < clist.length; ++c) if (flt_flag[c]&2) l_cons += ovlp_len[c]; else if (flt_flag[c] == 1) l_incons += ovlp_len[c]; warn("- Total length of contigs consistent/inconsistent with perfect genes: " +l_cons+ "/" +l_incons); var attempt_perf = (l_incons/(l_cons+l_incons) < thres_frac); /******************************** * Core function for genotyping * ********************************/ function type_gene(perf_mode) { if (perf_mode) { var flt_list = []; for (var c = 0; c < clist.length; ++c) if (flt_flag[c] == 1) flt_list.push(clist[c]); warn(" - Filtered " +flt_list.length+ " inconsistent contig(s): [" +flt_list.join(",")+ "]"); exons = list2exons(list, flt_flag, h_perf_genes); } else exons = list2exons(list); /*********************** * Score each genotype * ***********************/ // initialize genotype scores var pair = []; for (var i = 0; i < glist.length; ++i) { pair[i] = []; for (var j = 0; j <= i; ++j) pair[i][j] = 0; } // these two arrays are used to output debugging information var score = [], ctg = []; function type_exon(e, gt_list) { function update_pair(x, m, is_pri) { var y, z; y = (x>>14&0xff) + m < 0xff? (x>>14&0xff) + m : 0xff; if (is_pri) z = (x>>22) + m < 0xff? (x>>22) + m : 0xff; else z = x>>22; return z<<22 | y<<14 | ((x&0x3fff) + (1<<6|is_pri)); } score[e] = []; ctg[e] = []; if (exons[e] == null) return; var ee = exons[e], is_pri = pri_exon[e]? 
1 : 0; // find contigs and genes associated with the current exon var ch = {}, gh = {}; for (var i = 0; i < ee.length; ++i) if (elist[ee[i][1]][e] != null) ch[ee[i][0]] = true, gh[ee[i][1]] = true; var ga = [], ca = ctg[e]; for (var c in ch) ca.push(parseInt(c)); for (var g in gh) ga.push(parseInt(g)); var named_ca = []; for (var i = 0; i < ca.length; ++i) named_ca.push(clist[ca[i]]); warn(" - Processing exon "+(e+1)+" (" +ga.length+ " genes; " +ca.length+ " contigs: [" +named_ca.join(", ")+ "])..."); // set unmapped entries to high mismatch var sc = score[e]; for (var k = 0; k < ga.length; ++k) { var g = ga[k]; if (sc[g] == null) sc[g] = []; for (var i = 0; i < ca.length; ++i) sc[g][ca[i]] = 0xff; } // convert representation again and compute max_len[] var max_len = []; for (var i = 0; i < ee.length; ++i) { var c = ee[i][0], g = ee[i][1]; if (gh[g] == null || ch[c] == null) continue; sc[g][c] = sc[g][c] < ee[i][2]? sc[g][c] : ee[i][2]; if (max_len[c] == null) max_len[c] = 0; max_len[c] = max_len[c] > ee[i][3]? max_len[c] : ee[i][3]; } // drop mismapped contigs var max_max_len = 0; for (var k = 0; k < ca.length; ++k) max_max_len = max_max_len > max_len[ca[k]]? max_max_len : max_len[ca[k]]; var dropped = []; for (var k = 0; k < ca.length; ++k) { var min = 0x7fffffff, c = ca[k]; for (var i = 0; i < ga.length; ++i) { var g = ga[i]; min = min < sc[g][c]? min : sc[g][c]; } dropped[c] = min > thres_nm? true : false; if (max_len[c] < thres_len && max_len[c] < thres_ratio * max_max_len) dropped[c] = true; if (dropped[c]) warn(" . 
Dropped low-quality contig " +clist[c]+ " (minNM=" +min+ "; maxLen=" +max_len[c]+ ")"); } // fill the pair array if (gt_list == null) { for (var i = 0; i < ga.length; ++i) { var m = 0, gi = ga[i], g1 = sc[gi]; // homozygous for (var k = 0; k < ca.length; ++k) { var c = ca[k]; if (!dropped[c]) m += g1[c]; } pair[gi][gi] = update_pair(pair[gi][gi], m, is_pri); // heterozygous for (var j = i + 1; j < ga.length; ++j) { var gj = ga[j], g2 = sc[gj], m = 0, a = [0, 0]; for (var k = 0; k < ca.length; ++k) { var c = ca[k]; if (!dropped[c]) { m += g1[c] < g2[c]? g1[c] : g2[c]; ++a[g1[c]>22? min_nm_pri : pair[i][j]>>22; var gt_list = []; for (var i = 0; i < glist.length; ++i) for (var j = 0; j <= i; ++j) if ((pair[i][j]&63) == n_pri_exons && pair[i][j]>>22 == min_nm_pri) gt_list.push([i, j]); warn(" - Collected " +gt_list.length+ " top genotypes on the primary exon(s); minimal edit distance: " +min_nm_pri); // type other exons warn(" - Processing other exon(s)..."); for (var e = 0; e < exons.length; ++e) if (!pri_exon[e]) type_exon(e, gt_list); /***************************** * Choose the best genotypes * *****************************/ // genotyping var min_nm = 0x7fffffff; for (var i = 0; i < glist.length; ++i) for (var j = 0; j <= i; ++j) if ((pair[i][j]&63) == n_pri_exons) min_nm = min_nm < pair[i][j]>>14? min_nm : pair[i][j]>>14; var out = []; for (var i = 0; i < glist.length; ++i) for (var j = 0; j <= i; ++j) if ((pair[i][j]&63) == n_pri_exons && pair[i][j]>>14 <= min_nm + 1) out.push([pair[i][j]>>14, pair[i][j]>>6&0xff, i, j, (gsuf[i] + gsuf[j])<<16|(gsub[i] + gsub[j])]); out.sort(function(a, b) { return a[0]!=b[0]? a[0]-b[0] : a[1]!=b[1]? b[1]-a[1] : a[4]!=b[4]? a[4]-b[4] : a[2]!=b[2]? 
a[2]-b[2] : a[3]-b[3]}); return out; } /********************** * Perform genotyping * **********************/ warn("- Typing in the imperfect mode..."); var rst = type_gene(false); if (attempt_perf) { warn("- Typing in the perfect mode..."); var rst_perf = type_gene(true); warn("- Imperfect vs perfect mode: [" +(rst[0][0]>>8&0xff)+ "," +(rst[0][0]&0xff)+ "] vs [" +(rst_perf[0][0]>>8&0xff)+ "," +(rst_perf[0][0]&0xff)+ "]"); if (rst_perf[0][0] < rst[0][0]) { warn("- Chose the result from the perfect mode"); rst = rst_perf; } else warn("- Chose the result from the imperfect mode"); } else warn("- Perfect mode is not attempted"); /********** * Output * **********/ for (var i = 0; i < rst.length; ++i) print("GT", glist[rst[i][3]], glist[rst[i][2]], rst[i][0]>>8&0xff, rst[i][0]&0xff, rst[i][1]); bwa-0.7.17/bwakit/typeHLA.sh000077500000000000000000000027611317342117100155130ustar00rootroot00000000000000#!/bin/bash is_ctg=0 if [ $# -gt 1 ] && [ $1 == '-A' ]; then is_ctg=1 shift fi if [ $# -lt 2 ]; then echo "Usage: $0 [-A] " exit 1 fi preres="resource-human-HLA" root=`dirname $0` pre=$1.$2 touch $pre.gt if [ ! -s $pre.fq ]; then echo '** Empty input file. Abort!' >&2 exit 0 fi if [ $is_ctg -eq 0 ]; then echo "** De novo assembling..." >&2 len=`$root/seqtk comp $pre.fq | awk '{++x;y+=$2}END{printf("%.0f\n", y/x)}'` $root/fermi2.pl unitig -f $root/fermi2 -r $root/ropebwt2 -t2 -l$len -p $pre.tmp $pre.fq > $pre.tmp.mak make -f $pre.tmp.mak >&2 cp $pre.tmp.mag.gz $pre.mag.gz else rm -f $pre.tmp.mag.gz ln -s $pre.fq $pre.tmp.mag.gz fi echo "** Selecting contigs overlapping target exons..." >&2 (ls $root/$preres/HLA-ALT-idx/*.fa.bwt | sed s,.bwt,, | xargs -i $root/bwa mem -t2 -B1 -O1 -E1 {} $pre.tmp.mag.gz 2>/dev/null) | grep -v ^@ | sort -k3,3 -k4,4n | gzip > $pre.tmp.ALT.sam.gz $root/k8 $root/typeHLA-selctg.js $2 $root/$preres/HLA-ALT-exons.bed $pre.tmp.ALT.sam.gz | $root/seqtk subseq $pre.tmp.mag.gz - | gzip -1 > $pre.tmp.fq.gz echo "** Mapping exons to de novo contigs..." 
>&2 $root/bwa index -p $pre.tmp $pre.tmp.fq.gz 2>/dev/null $root/seqtk comp $root/$preres/HLA-CDS.fa | cut -f1 | grep ^$2 | $root/seqtk subseq $root/$preres/HLA-CDS.fa - | $root/bwa mem -aD.1 -t2 $pre.tmp - 2>/dev/null | gzip -1 > $pre.sam.gz echo "** Typing..." >&2 $root/k8 $root/typeHLA.js $pre.sam.gz > $pre.gt # delete temporary files rm -f $pre.tmp.* [ $is_ctg -eq 1 ] && rm -f $pre.mag.gz bwa-0.7.17/bwamem.c000066400000000000000000001353751317342117100140110ustar00rootroot00000000000000#include #include #include #include #include #include #ifdef HAVE_PTHREAD #include #endif #include "kstring.h" #include "bwamem.h" #include "bntseq.h" #include "ksw.h" #include "kvec.h" #include "ksort.h" #include "utils.h" #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif /* Theory on probability and scoring *ungapped* alignment * * s'(a,b) = log[P(b|a)/P(b)] = log[4P(b|a)], assuming uniform base distribution * s'(a,a) = log(4), s'(a,b) = log(4e/3), where e is the error rate * * Scale s'(a,b) to s(a,a) s.t. s(a,a)=x. Then s(a,b) = x*s'(a,b)/log(4), or conversely: s'(a,b)=s(a,b)*log(4)/x * * If the matching score is x and mismatch penalty is -y, we can compute error rate e: * e = .75 * exp[-log(4) * y/x] * * log P(seq) = \sum_i log P(b_i|a_i) = \sum_i {s'(a,b) - log(4)} * = \sum_i { s(a,b)*log(4)/x - log(4) } = log(4) * (S/x - l) * * where S=\sum_i s(a,b) is the alignment score. Converting to the phred scale: * Q(seq) = -10/log(10) * log P(seq) = 10*log(4)/log(10) * (l - S/x) = 6.02 * (l - S/x) * * * Gap open (zero gap): q' = log[P(gap-open)], r' = log[P(gap-ext)] (see Durbin et al. (1998) Section 4.1) * Then q = x*log[P(gap-open)]/log(4), r = x*log[P(gap-ext)]/log(4) * * When there are gaps, l should be the length of alignment matches (i.e. 
the M operator in CIGAR) */ static const bntseq_t *global_bns = 0; // for debugging only mem_opt_t *mem_opt_init() { mem_opt_t *o; o = calloc(1, sizeof(mem_opt_t)); o->flag = 0; o->a = 1; o->b = 4; o->o_del = o->o_ins = 6; o->e_del = o->e_ins = 1; o->w = 100; o->T = 30; o->zdrop = 100; o->pen_unpaired = 17; o->pen_clip5 = o->pen_clip3 = 5; o->max_mem_intv = 20; o->min_seed_len = 19; o->split_width = 10; o->max_occ = 500; o->max_chain_gap = 10000; o->max_ins = 10000; o->mask_level = 0.50; o->drop_ratio = 0.50; o->XA_drop_ratio = 0.80; o->split_factor = 1.5; o->chunk_size = 10000000; o->n_threads = 1; o->max_XA_hits = 5; o->max_XA_hits_alt = 200; o->max_matesw = 50; o->mask_level_redun = 0.95; o->min_chain_weight = 0; o->max_chain_extend = 1<<30; o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len); bwa_fill_scmat(o->a, o->b, o->mat); return o; } /*************************** * Collection SA invervals * ***************************/ #define intv_lt(a, b) ((a).info < (b).info) KSORT_INIT(mem_intv, bwtintv_t, intv_lt) typedef struct { bwtintv_v mem, mem1, *tmpv[2]; } smem_aux_t; static smem_aux_t *smem_aux_init() { smem_aux_t *a; a = calloc(1, sizeof(smem_aux_t)); a->tmpv[0] = calloc(1, sizeof(bwtintv_v)); a->tmpv[1] = calloc(1, sizeof(bwtintv_v)); return a; } static void smem_aux_destroy(smem_aux_t *a) { free(a->tmpv[0]->a); free(a->tmpv[0]); free(a->tmpv[1]->a); free(a->tmpv[1]); free(a->mem.a); free(a->mem1.a); free(a); } static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq, smem_aux_t *a) { int i, k, x = 0, old_n; int start_width = 1; int split_len = (int)(opt->min_seed_len * opt->split_factor + .499); a->mem.n = 0; // first pass: find all SMEMs while (x < len) { if (seq[x] < 4) { x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv); for (i = 0; i < a->mem1.n; ++i) { bwtintv_t *p = &a->mem1.a[i]; int slen = (uint32_t)p->info - (p->info>>32); // seed length if (slen >= opt->min_seed_len) 
kv_push(bwtintv_t, a->mem, *p); } } else ++x; } // second pass: find MEMs inside a long SMEM old_n = a->mem.n; for (k = 0; k < old_n; ++k) { bwtintv_t *p = &a->mem.a[k]; int start = p->info>>32, end = (int32_t)p->info; if (end - start < split_len || p->x[2] > opt->split_width) continue; bwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv); for (i = 0; i < a->mem1.n; ++i) if ((uint32_t)a->mem1.a[i].info - (a->mem1.a[i].info>>32) >= opt->min_seed_len) kv_push(bwtintv_t, a->mem, a->mem1.a[i]); } // third pass: LAST-like if (opt->max_mem_intv > 0) { x = 0; while (x < len) { if (seq[x] < 4) { if (1) { bwtintv_t m; x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m); if (m.x[2] > 0) kv_push(bwtintv_t, a->mem, m); } else { // for now, we never come to this block which is slower x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, &a->mem1, a->tmpv); for (i = 0; i < a->mem1.n; ++i) kv_push(bwtintv_t, a->mem, a->mem1.a[i]); } } else ++x; } } // sort ks_introsort(mem_intv, a->mem.n, a->mem.a); } /************ * Chaining * ************/ typedef struct { int64_t rbeg; int32_t qbeg, len; int score; } mem_seed_t; // unaligned memory typedef struct { int n, m, first, rid; uint32_t w:29, kept:2, is_alt:1; float frac_rep; int64_t pos; mem_seed_t *seeds; } mem_chain_t; typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v; #include "kbtree.h" #define chain_cmp(a, b) (((b).pos < (a).pos) - ((a).pos < (b).pos)) KBTREE_INIT(chn, mem_chain_t, chain_cmp) // return 1 if the seed is merged into the chain static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, const mem_seed_t *p, int seed_rid) { int64_t qend, rend, x, y; const mem_seed_t *last = &c->seeds[c->n-1]; qend = last->qbeg + last->len; rend = last->rbeg + last->len; if (seed_rid != c->rid) return 0; // different chr; request a new chain if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + 
p->len <= rend) return 1; // contained seed; do nothing if ((last->rbeg < l_pac || c->seeds[0].rbeg < l_pac) && p->rbeg >= l_pac) return 0; // don't chain if on different strand x = p->qbeg - last->qbeg; // always non-negtive y = p->rbeg - last->rbeg; if (y >= 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { // grow the chain if (c->n == c->m) { c->m <<= 1; c->seeds = realloc(c->seeds, c->m * sizeof(mem_seed_t)); } c->seeds[c->n++] = *p; return 1; } return 0; // request to add a new chain } int mem_chain_weight(const mem_chain_t *c) { int64_t end; int j, w = 0, tmp; for (j = 0, end = 0; j < c->n; ++j) { const mem_seed_t *s = &c->seeds[j]; if (s->qbeg >= end) w += s->len; else if (s->qbeg + s->len > end) w += s->qbeg + s->len - end; end = end > s->qbeg + s->len? end : s->qbeg + s->len; } tmp = w; w = 0; for (j = 0, end = 0; j < c->n; ++j) { const mem_seed_t *s = &c->seeds[j]; if (s->rbeg >= end) w += s->len; else if (s->rbeg + s->len > end) w += s->rbeg + s->len - end; end = end > s->rbeg + s->len? end : s->rbeg + s->len; } w = w < tmp? w : tmp; return w < 1<<30? 
w : (1<<30)-1; } void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn) { int i, j; for (i = 0; i < chn->n; ++i) { mem_chain_t *p = &chn->a[i]; err_printf("* Found CHAIN(%d): n=%d; weight=%d", i, p->n, mem_chain_weight(p)); for (j = 0; j < p->n; ++j) { bwtint_t pos; int is_rev; pos = bns_depos(bns, p->seeds[j].rbeg, &is_rev); if (is_rev) pos -= p->seeds[j].len - 1; err_printf("\t%d;%d;%d,%ld(%s:%c%ld)", p->seeds[j].score, p->seeds[j].len, p->seeds[j].qbeg, (long)p->seeds[j].rbeg, bns->anns[p->rid].name, "+-"[is_rev], (long)(pos - bns->anns[p->rid].offset) + 1); } err_putchar('\n'); } } mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, int len, const uint8_t *seq, void *buf) { int i, b, e, l_rep; int64_t l_pac = bns->l_pac; mem_chain_v chain; kbtree_t(chn) *tree; smem_aux_t *aux; kv_init(chain); if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match tree = kb_init(chn, KB_DEFAULT_SIZE); aux = buf? (smem_aux_t*)buf : smem_aux_init(); mem_collect_intv(opt, bwt, len, seq, aux); for (i = 0, b = e = l_rep = 0; i < aux->mem.n; ++i) { // compute frac_rep bwtintv_t *p = &aux->mem.a[i]; int sb = (p->info>>32), se = (uint32_t)p->info; if (p->x[2] <= opt->max_occ) continue; if (sb > e) l_rep += e - b, b = sb, e = se; else e = e > se? e : se; } l_rep += e - b; for (i = 0; i < aux->mem.n; ++i) { bwtintv_t *p = &aux->mem.a[i]; int step, count, slen = (uint32_t)p->info - (p->info>>32); // seed length int64_t k; // if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive step = p->x[2] > opt->max_occ? 
p->x[2] / opt->max_occ : 1; for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) { mem_chain_t tmp, *lower, *upper; mem_seed_t s; int rid, to_add = 0; s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference s.qbeg = p->info>>32; s.score= s.len = slen; rid = bns_intv2rid(bns, s.rbeg, s.rbeg + s.len); if (rid < 0) continue; // bridging multiple reference sequences or the forward-reverse boundary; TODO: split the seed; don't discard it!!! if (kb_size(tree)) { kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain if (!lower || !test_and_merge(opt, l_pac, lower, &s, rid)) to_add = 1; } else to_add = 1; if (to_add) { // add the seed as a new chain tmp.n = 1; tmp.m = 4; tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t)); tmp.seeds[0] = s; tmp.rid = rid; tmp.is_alt = !!bns->anns[rid].is_alt; kb_putp(chn, tree, &tmp); } } } if (buf == 0) smem_aux_destroy(aux); kv_resize(mem_chain_t, chain, kb_size(tree)); #define traverse_func(p_) (chain.a[chain.n++] = *(p_)) __kb_traverse(mem_chain_t, tree, traverse_func); #undef traverse_func for (i = 0; i < chain.n; ++i) chain.a[i].frac_rep = (float)l_rep / len; if (bwa_verbose >= 4) printf("* fraction of repetitive seeds: %.3f\n", (float)l_rep / len); kb_destroy(chn, tree); return chain; } /******************** * Filtering chains * ********************/ #define chn_beg(ch) ((ch).seeds->qbeg) #define chn_end(ch) ((ch).seeds[(ch).n-1].qbeg + (ch).seeds[(ch).n-1].len) #define flt_lt(a, b) ((a).w > (b).w) KSORT_INIT(mem_flt, mem_chain_t, flt_lt) int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *a) { int i, k; kvec_t(int) chains = {0,0,0}; // this keeps int indices of the non-overlapping chains if (n_chn == 0) return 0; // no need to filter // compute the weight of each chain and drop chains with small weight for (i = k = 0; i < n_chn; ++i) { mem_chain_t *c = &a[i]; c->first = -1; c->kept = 0; c->w = mem_chain_weight(c); if (c->w < 
opt->min_chain_weight) free(c->seeds); else a[k++] = *c; } n_chn = k; ks_introsort(mem_flt, n_chn, a); // pairwise chain comparisons a[0].kept = 3; kv_push(int, chains, 0); for (i = 1; i < n_chn; ++i) { int large_ovlp = 0; for (k = 0; k < chains.n; ++k) { int j = chains.a[k]; int b_max = chn_beg(a[j]) > chn_beg(a[i])? chn_beg(a[j]) : chn_beg(a[i]); int e_min = chn_end(a[j]) < chn_end(a[i])? chn_end(a[j]) : chn_end(a[i]); if (e_min > b_max && (!a[j].is_alt || a[i].is_alt)) { // have overlap; don't consider ovlp where the kept chain is ALT while the current chain is primary int li = chn_end(a[i]) - chn_beg(a[i]); int lj = chn_end(a[j]) - chn_beg(a[j]); int min_l = li < lj? li : lj; if (e_min - b_max >= min_l * opt->mask_level && min_l < opt->max_chain_gap) { // significant overlap large_ovlp = 1; if (a[j].first < 0) a[j].first = i; // keep the first shadowed hit s.t. mapq can be more accurate if (a[i].w < a[j].w * opt->drop_ratio && a[j].w - a[i].w >= opt->min_seed_len<<1) break; } } } if (k == chains.n) { kv_push(int, chains, i); a[i].kept = large_ovlp? 
2 : 3; } } for (i = 0; i < chains.n; ++i) { mem_chain_t *c = &a[chains.a[i]]; if (c->first >= 0) a[c->first].kept = 1; } free(chains.a); for (i = k = 0; i < n_chn; ++i) { // don't extend more than opt->max_chain_extend .kept=1/2 chains if (a[i].kept == 0 || a[i].kept == 3) continue; if (++k >= opt->max_chain_extend) break; } for (; i < n_chn; ++i) if (a[i].kept < 3) a[i].kept = 0; for (i = k = 0; i < n_chn; ++i) { // free discarded chains mem_chain_t *c = &a[i]; if (c->kept == 0) free(c->seeds); else a[k++] = a[i]; } return k; } /****************************** * De-overlap single-end hits * ******************************/ #define alnreg_slt2(a, b) ((a).re < (b).re) KSORT_INIT(mem_ars2, mem_alnreg_t, alnreg_slt2) #define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb)))) KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt) #define alnreg_hlt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).is_alt < (b).is_alt || ((a).is_alt == (b).is_alt && (a).hash < (b).hash)))) KSORT_INIT(mem_ars_hash, mem_alnreg_t, alnreg_hlt) #define alnreg_hlt2(a, b) ((a).is_alt < (b).is_alt || ((a).is_alt == (b).is_alt && ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash)))) KSORT_INIT(mem_ars_hash2, mem_alnreg_t, alnreg_hlt2) #define PATCH_MAX_R_BW 0.05f #define PATCH_MIN_SC_RATIO 0.90f int mem_patch_reg(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, const mem_alnreg_t *a, const mem_alnreg_t *b, int *_w) { int w, score, q_s, r_s; double r; if (bns == 0 || pac == 0 || query == 0) return 0; assert(a->rid == b->rid && a->rb <= b->rb); if (a->rb < bns->l_pac && b->rb >= bns->l_pac) return 0; // on different strands if (a->qb >= b->qb || a->qe >= b->qe || a->re >= b->re) return 0; // not colinear w = (a->re - b->rb) - (a->qe - b->qb); // required bandwidth w = w > 0? 
w : -w; // l = abs(l) r = (double)(a->re - b->rb) / (b->re - a->rb) - (double)(a->qe - b->qb) / (b->qe - a->qb); // relative bandwidth r = r > 0.? r : -r; // r = fabs(r) if (bwa_verbose >= 4) printf("* potential hit merge between [%d,%d)<=>[%ld,%ld) and [%d,%d)<=>[%ld,%ld), @ %s; w=%d, r=%.4g\n", a->qb, a->qe, (long)a->rb, (long)a->re, b->qb, b->qe, (long)b->rb, (long)b->re, bns->anns[a->rid].name, w, r); if (a->re < b->rb || a->qe < b->qb) { // no overlap on query or on ref if (w > opt->w<<1 || r >= PATCH_MAX_R_BW) return 0; // the bandwidth or the relative bandwidth is too large } else if (w > opt->w<<2 || r >= PATCH_MAX_R_BW*2) return 0; // more permissive if overlapping on both ref and query // global alignment w += a->w + b->w; w = w < opt->w<<2? w : opt->w<<2; if (bwa_verbose >= 4) printf("* test potential hit merge with global alignment; w=%d\n", w); bwa_gen_cigar2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, w, bns->l_pac, pac, b->qe - a->qb, query + a->qb, a->rb, b->re, &score, 0, 0); q_s = (int)((double)(b->qe - a->qb) / ((b->qe - b->qb) + (a->qe - a->qb)) * (b->score + a->score) + .499); // predicted score from query r_s = (int)((double)(b->re - a->rb) / ((b->re - b->rb) + (a->re - a->rb)) * (b->score + a->score) + .499); // predicted score from ref if (bwa_verbose >= 4) printf("* score=%d;(%d,%d)\n", score, q_s, r_s); if ((double)score / (q_s > r_s? q_s : r_s) < PATCH_MIN_SC_RATIO) return 0; *_w = w; return score; } int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int n, mem_alnreg_t *a) { int m, i, j; if (n <= 1) return n; ks_introsort(mem_ars2, n, a); // sort by the END position, not START! 
for (i = 0; i < n; ++i) a[i].n_comp = 1; for (i = 1; i < n; ++i) { mem_alnreg_t *p = &a[i]; if (p->rid != a[i-1].rid || p->rb >= a[i-1].re + opt->max_chain_gap) continue; // then no need to go into the loop below for (j = i - 1; j >= 0 && p->rid == a[j].rid && p->rb < a[j].re + opt->max_chain_gap; --j) { mem_alnreg_t *q = &a[j]; int64_t or, oq, mr, mq; int score, w; if (q->qe == q->qb) continue; // a[j] has been excluded or = q->re - p->rb; // overlap length on the reference oq = q->qb < p->qb? q->qe - p->qb : p->qe - q->qb; // overlap length on the query mr = q->re - q->rb < p->re - p->rb? q->re - q->rb : p->re - p->rb; // min ref len in alignment mq = q->qe - q->qb < p->qe - p->qb? q->qe - q->qb : p->qe - p->qb; // min qry len in alignment if (or > opt->mask_level_redun * mr && oq > opt->mask_level_redun * mq) { // one of the hits is redundant if (p->score < q->score) { p->qe = p->qb; break; } else q->qe = q->qb; } else if (q->rb < p->rb && (score = mem_patch_reg(opt, bns, pac, query, q, p, &w)) > 0) { // then merge q into p p->n_comp += q->n_comp + 1; p->seedcov = p->seedcov > q->seedcov? p->seedcov : q->seedcov; p->sub = p->sub > q->sub? p->sub : q->sub; p->csub = p->csub > q->csub? p->csub : q->csub; p->qb = q->qb, p->rb = q->rb; p->truesc = p->score = score; p->w = w; q->qb = q->qe; } } } for (i = 0, m = 0; i < n; ++i) // exclude identical hits if (a[i].qe > a[i].qb) { if (m != i) a[m++] = a[i]; else ++m; } n = m; ks_introsort(mem_ars, n, a); for (i = 1; i < n; ++i) { // mark identical hits if (a[i].score == a[i-1].score && a[i].rb == a[i-1].rb && a[i].qb == a[i-1].qb) a[i].qe = a[i].qb; } for (i = 1, m = 1; i < n; ++i) // exclude identical hits if (a[i].qe > a[i].qb) { if (m != i) a[m++] = a[i]; else ++m; } return m; } typedef kvec_t(int) int_v; static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t *a, int_v *z) { // similar to the loop in mem_chain_flt() int i, k, tmp; tmp = opt->a + opt->b; tmp = opt->o_del + opt->e_del > tmp? 
opt->o_del + opt->e_del : tmp; tmp = opt->o_ins + opt->e_ins > tmp? opt->o_ins + opt->e_ins : tmp; z->n = 0; kv_push(int, *z, 0); for (i = 1; i < n; ++i) { for (k = 0; k < z->n; ++k) { int j = z->a[k]; int b_max = a[j].qb > a[i].qb? a[j].qb : a[i].qb; int e_min = a[j].qe < a[i].qe? a[j].qe : a[i].qe; if (e_min > b_max) { // have overlap int min_l = a[i].qe - a[i].qb < a[j].qe - a[j].qb? a[i].qe - a[i].qb : a[j].qe - a[j].qb; if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap if (a[j].sub == 0) a[j].sub = a[i].score; if (a[j].score - a[i].score <= tmp && (a[j].is_alt || !a[i].is_alt)) ++a[j].sub_n; break; } } } if (k == z->n) kv_push(int, *z, i); else a[i].secondary = z->a[k]; } } int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id) { int i, n_pri; int_v z = {0,0,0}; if (n == 0) return 0; for (i = n_pri = 0; i < n; ++i) { a[i].sub = a[i].alt_sc = 0, a[i].secondary = a[i].secondary_all = -1, a[i].hash = hash_64(id+i); if (!a[i].is_alt) ++n_pri; } ks_introsort(mem_ars_hash, n, a); mem_mark_primary_se_core(opt, n, a, &z); for (i = 0; i < n; ++i) { mem_alnreg_t *p = &a[i]; p->secondary_all = i; // keep the rank in the first round if (!p->is_alt && p->secondary >= 0 && a[p->secondary].is_alt) p->alt_sc = a[p->secondary].score; } if (n_pri >= 0 && n_pri < n) { kv_resize(int, z, n); if (n_pri > 0) ks_introsort(mem_ars_hash2, n, a); for (i = 0; i < n; ++i) z.a[a[i].secondary_all] = i; for (i = 0; i < n; ++i) { if (a[i].secondary >= 0) { a[i].secondary_all = z.a[a[i].secondary]; if (a[i].is_alt) a[i].secondary = INT_MAX; } else a[i].secondary_all = -1; } if (n_pri > 0) { // mark primary for hits to the primary assembly only for (i = 0; i < n_pri; ++i) a[i].sub = 0, a[i].secondary = -1; mem_mark_primary_se_core(opt, n_pri, a, &z); } } else { for (i = 0; i < n; ++i) a[i].secondary_all = a[i].secondary; } free(z.a); return n_pri; } /********************************* * Test if a seed is good enough * 
*********************************/ #define MEM_SHORT_EXT 50 #define MEM_SHORT_LEN 200 #define MEM_HSP_COEF 1.1f #define MEM_MINSC_COEF 5.5f #define MEM_SEEDSW_COEF 0.05f int mem_seed_sw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_seed_t *s) { int qb, qe, rid; int64_t rb, re, mid, l_pac = bns->l_pac; uint8_t *rseq = 0; kswr_t x; if (s->len >= MEM_SHORT_LEN) return -1; // the seed is longer than the max-extend; no need to do SW qb = s->qbeg, qe = s->qbeg + s->len; rb = s->rbeg, re = s->rbeg + s->len; mid = (rb + re) >> 1; qb -= MEM_SHORT_EXT; qb = qb > 0? qb : 0; qe += MEM_SHORT_EXT; qe = qe < l_query? qe : l_query; rb -= MEM_SHORT_EXT; rb = rb > 0? rb : 0; re += MEM_SHORT_EXT; re = re < l_pac<<1? re : l_pac<<1; if (rb < l_pac && l_pac < re) { if (mid < l_pac) re = l_pac; else rb = l_pac; } if (qe - qb >= MEM_SHORT_LEN || re - rb >= MEM_SHORT_LEN) return -1; // the seed seems good enough; no need to do SW rseq = bns_fetch_seq(bns, pac, &rb, mid, &re, &rid); x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, KSW_XSTART, 0); free(rseq); return x.score; } void mem_flt_chained_seeds(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, int n_chn, mem_chain_t *a) { double min_l = opt->min_chain_weight? MEM_HSP_COEF * opt->min_chain_weight : MEM_MINSC_COEF * log(l_query); int i, j, k, min_HSP_score = (int)(opt->a * min_l + .499); if (min_l > MEM_SEEDSW_COEF * l_query) return; // don't run the following for short reads for (i = 0; i < n_chn; ++i) { mem_chain_t *c = &a[i]; for (j = k = 0; j < c->n; ++j) { mem_seed_t *s = &c->seeds[j]; s->score = mem_seed_sw(opt, bns, pac, l_query, query, s); if (s->score < 0 || s->score >= min_HSP_score) { s->score = s->score < 0? 
s->len * opt->a : s->score; c->seeds[k++] = *s; } } c->n = k; } } /**************************************** * Construct the alignment from a chain * ****************************************/ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) { int l_del = (int)((double)(qlen * opt->a - opt->o_del) / opt->e_del + 1.); int l_ins = (int)((double)(qlen * opt->a - opt->o_ins) / opt->e_ins + 1.); int l = l_del > l_ins? l_del : l_ins; l = l > 1? l : 1; return l < opt->w<<1? l : opt->w<<1; } #define MAX_BAND_TRY 2 void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) { int i, k, rid, max_off[2], aw[2]; // aw: actual bandwidth used in extension int64_t l_pac = bns->l_pac, rmax[2], tmp, max = 0; const mem_seed_t *s; uint8_t *rseq = 0; uint64_t *srt; if (c->n == 0) return; // get the max possible span rmax[0] = l_pac<<1; rmax[1] = 0; for (i = 0; i < c->n; ++i) { int64_t b, e; const mem_seed_t *t = &c->seeds[i]; b = t->rbeg - (t->qbeg + cal_max_gap(opt, t->qbeg)); e = t->rbeg + t->len + ((l_query - t->qbeg - t->len) + cal_max_gap(opt, l_query - t->qbeg - t->len)); rmax[0] = rmax[0] < b? rmax[0] : b; rmax[1] = rmax[1] > e? rmax[1] : e; if (t->len > max) max = t->len; } rmax[0] = rmax[0] > 0? rmax[0] : 0; rmax[1] = rmax[1] < l_pac<<1? 
rmax[1] : l_pac<<1; if (rmax[0] < l_pac && l_pac < rmax[1]) { // crossing the forward-reverse boundary; then choose one side if (c->seeds[0].rbeg < l_pac) rmax[1] = l_pac; // this works because all seeds are guaranteed to be on the same strand else rmax[0] = l_pac; } // retrieve the reference sequence rseq = bns_fetch_seq(bns, pac, &rmax[0], c->seeds[0].rbeg, &rmax[1], &rid); assert(c->rid == rid); srt = malloc(c->n * 8); for (i = 0; i < c->n; ++i) srt[i] = (uint64_t)c->seeds[i].score<<32 | i; ks_introsort_64(c->n, srt); for (k = c->n - 1; k >= 0; --k) { mem_alnreg_t *a; s = &c->seeds[(uint32_t)srt[k]]; for (i = 0; i < av->n; ++i) { // test whether extension has been made before mem_alnreg_t *p = &av->a[i]; int64_t rd; int qd, w, max_gap; if (s->rbeg < p->rb || s->rbeg + s->len > p->re || s->qbeg < p->qb || s->qbeg + s->len > p->qe) continue; // not fully contained if (s->len - p->seedlen0 > .1 * l_query) continue; // this seed may give a better alignment // qd: distance ahead of the seed on query; rd: on reference qd = s->qbeg - p->qb; rd = s->rbeg - p->rb; max_gap = cal_max_gap(opt, qd < rd? qd : rd); // the maximal gap allowed in regions ahead of the seed w = max_gap < p->w? max_gap : p->w; // bounded by the band width if (qd - rd < w && rd - qd < w) break; // the seed is "around" a previous hit // similar to the previous four lines, but this time we look at the region behind qd = p->qe - (s->qbeg + s->len); rd = p->re - (s->rbeg + s->len); max_gap = cal_max_gap(opt, qd < rd? qd : rd); w = max_gap < p->w? 
max_gap : p->w; if (qd - rd < w && rd - qd < w) break; } if (i < av->n) { // the seed is (almost) contained in an existing alignment; further testing is needed to confirm it is not leading to a different aln if (bwa_verbose >= 4) printf("** Seed(%d) [%ld;%ld,%ld] is almost contained in an existing alignment [%d,%d) <=> [%ld,%ld)\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg, av->a[i].qb, av->a[i].qe, (long)av->a[i].rb, (long)av->a[i].re); for (i = k + 1; i < c->n; ++i) { // check overlapping seeds in the same chain const mem_seed_t *t; if (srt[i] == 0) continue; t = &c->seeds[(uint32_t)srt[i]]; if (t->len < s->len * .95) continue; // only check overlapping if t is long enough; TODO: more efficient by early stopping if (s->qbeg <= t->qbeg && s->qbeg + s->len - t->qbeg >= s->len>>2 && t->qbeg - s->qbeg != t->rbeg - s->rbeg) break; if (t->qbeg <= s->qbeg && t->qbeg + t->len - s->qbeg >= s->len>>2 && s->qbeg - t->qbeg != s->rbeg - t->rbeg) break; } if (i == c->n) { // no overlapping seeds; then skip extension srt[k] = 0; // mark that seed extension has not been performed continue; } if (bwa_verbose >= 4) printf("** Seed(%d) might lead to a different alignment even though it is contained. 
Extension will be performed.\n", k); } a = kv_pushp(mem_alnreg_t, *av); memset(a, 0, sizeof(mem_alnreg_t)); a->w = aw[0] = aw[1] = opt->w; a->score = a->truesc = -1; a->rid = c->rid; if (bwa_verbose >= 4) err_printf("** ---> Extending from seed(%d) [%ld;%ld,%ld] @ %s <---\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg, bns->anns[c->rid].name); if (s->qbeg) { // left extension uint8_t *rs, *qs; int qle, tle, gtle, gscore; qs = malloc(s->qbeg); for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i]; tmp = s->rbeg - rmax[0]; rs = malloc(tmp); for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; for (i = 0; i < MAX_BAND_TRY; ++i) { int prev = a->score; aw[0] = opt->w << i; if (bwa_verbose >= 4) { int j; printf("*** Left ref: "); for (j = 0; j < tmp; ++j) putchar("ACGTN"[(int)rs[j]]); putchar('\n'); printf("*** Left query: "); for (j = 0; j < s->qbeg; ++j) putchar("ACGTN"[(int)qs[j]]); putchar('\n'); } a->score = ksw_extend2(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]); if (bwa_verbose >= 4) { printf("*** Left extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); } if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break; } // check whether we prefer to reach the end of the query if (gscore <= 0 || gscore <= a->score - opt->pen_clip5) { // local extension a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; a->truesc = a->score; } else { // to-end extension a->qb = 0, a->rb = s->rbeg - gtle; a->truesc = gscore; } free(qs); free(rs); } else a->score = a->truesc = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; if (s->qbeg + s->len != l_query) { // right extension int qle, tle, qe, re, gtle, gscore, sc0 = a->score; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; assert(re >= 0); for (i = 0; i < MAX_BAND_TRY; ++i) { int prev = a->score; aw[1] = opt->w 
<< i; if (bwa_verbose >= 4) { int j; printf("*** Right ref: "); for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[re+j]]); putchar('\n'); printf("*** Right query: "); for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[qe+j]]); putchar('\n'); } a->score = ksw_extend2(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1]); if (bwa_verbose >= 4) { printf("*** Right extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); } if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; } // similar to the above if (gscore <= 0 || gscore <= a->score - opt->pen_clip3) { // local extension a->qe = qe + qle, a->re = rmax[0] + re + tle; a->truesc += a->score - sc0; } else { // to-end extension a->qe = l_query, a->re = rmax[0] + re + gtle; a->truesc += gscore - sc0; } } else a->qe = l_query, a->re = s->rbeg + s->len; if (bwa_verbose >= 4) printf("*** Added alignment region: [%d,%d) <=> [%ld,%ld); score=%d; {left,right}_bandwidth={%d,%d}\n", a->qb, a->qe, (long)a->rb, (long)a->re, a->score, aw[0], aw[1]); // compute seedcov for (i = 0, a->seedcov = 0; i < c->n; ++i) { const mem_seed_t *t = &c->seeds[i]; if (t->qbeg >= a->qb && t->qbeg + t->len <= a->qe && t->rbeg >= a->rb && t->rbeg + t->len <= a->re) // seed fully contained a->seedcov += t->len; // this is not very accurate, but for approx. mapQ, this is good enough } a->w = aw[0] > aw[1]? 
aw[0] : aw[1]; a->seedlen0 = s->len; a->frac_rep = c->frac_rep; } free(srt); free(rseq); } /***************************** * Basic hit->SAM conversion * *****************************/ static inline int infer_bw(int l1, int l2, int score, int a, int q, int r) { int w; if (l1 == l2 && l1 * a - score < (q + r - a)<<1) return 0; // to get equal alignment length, we need at least two gaps w = ((double)((l1 < l2? l1 : l2) * a - score - q) / r + 2.); if (w < abs(l1 - l2)) w = abs(l1 - l2); return w; } static inline int get_rlen(int n_cigar, const uint32_t *cigar) { int k, l; for (k = l = 0; k < n_cigar; ++k) { int op = cigar[k]&0xf; if (op == 0 || op == 2) l += cigar[k]>>4; } return l; } static inline void add_cigar(const mem_opt_t *opt, mem_aln_t *p, kstring_t *str, int which) { int i; if (p->n_cigar) { // aligned for (i = 0; i < p->n_cigar; ++i) { int c = p->cigar[i]&0xf; if (!(opt->flag&MEM_F_SOFTCLIP) && !p->is_alt && (c == 3 || c == 4)) c = which? 4 : 3; // use hard clipping for supplementary alignments kputw(p->cigar[i]>>4, str); kputc("MIDSH"[c], str); } } else kputc('*', str); // having a coordinate but unaligned (e.g. when copy_mate is true) } void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m_) { int i, l_name; mem_aln_t ptmp = list[which], *p = &ptmp, mtmp, *m = 0; // make a copy of the alignment to convert if (m_) mtmp = *m_, m = &mtmp; // set flag p->flag |= m? 0x1 : 0; // is paired in sequencing p->flag |= p->rid < 0? 0x4 : 0; // is mapped p->flag |= m && m->rid < 0? 0x8 : 0; // is mate mapped if (p->rid < 0 && m && m->rid >= 0) // copy mate to alignment p->rid = m->rid, p->pos = m->pos, p->is_rev = m->is_rev, p->n_cigar = 0; if (m && m->rid < 0 && p->rid >= 0) // copy alignment to mate m->rid = p->rid, m->pos = p->pos, m->is_rev = p->is_rev, m->n_cigar = 0; p->flag |= p->is_rev? 0x10 : 0; // is on the reverse strand p->flag |= m && m->is_rev? 
0x20 : 0; // is mate on the reverse strand // print up to CIGAR l_name = strlen(s->name); ks_resize(str, str->l + s->l_seq + l_name + (s->qual? s->l_seq : 0) + 20); kputsn(s->name, l_name, str); kputc('\t', str); // QNAME kputw((p->flag&0xffff) | (p->flag&0x10000? 0x100 : 0), str); kputc('\t', str); // FLAG if (p->rid >= 0) { // with coordinate kputs(bns->anns[p->rid].name, str); kputc('\t', str); // RNAME kputl(p->pos + 1, str); kputc('\t', str); // POS kputw(p->mapq, str); kputc('\t', str); // MAPQ add_cigar(opt, p, str, which); } else kputsn("*\t0\t0\t*", 7, str); // without coordinte kputc('\t', str); // print the mate position if applicable if (m && m->rid >= 0) { if (p->rid == m->rid) kputc('=', str); else kputs(bns->anns[m->rid].name, str); kputc('\t', str); kputl(m->pos + 1, str); kputc('\t', str); if (p->rid == m->rid) { int64_t p0 = p->pos + (p->is_rev? get_rlen(p->n_cigar, p->cigar) - 1 : 0); int64_t p1 = m->pos + (m->is_rev? get_rlen(m->n_cigar, m->cigar) - 1 : 0); if (m->n_cigar == 0 || p->n_cigar == 0) kputc('0', str); else kputl(-(p0 - p1 + (p0 > p1? 1 : p0 < p1? 
-1 : 0)), str); } else kputc('0', str); } else kputsn("*\t0\t0", 5, str); kputc('\t', str); // print SEQ and QUAL if (p->flag & 0x100) { // for secondary alignments, don't write SEQ and QUAL kputsn("*\t*", 3, str); } else if (!p->is_rev) { // the forward strand int i, qb = 0, qe = s->l_seq; if (p->n_cigar && which && !(opt->flag&MEM_F_SOFTCLIP) && !p->is_alt) { // have cigar && not the primary alignment && not softclip all if ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3) qb += p->cigar[0]>>4; if ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3) qe -= p->cigar[p->n_cigar-1]>>4; } ks_resize(str, str->l + (qe - qb) + 1); for (i = qb; i < qe; ++i) str->s[str->l++] = "ACGTN"[(int)s->seq[i]]; kputc('\t', str); if (s->qual) { // printf qual ks_resize(str, str->l + (qe - qb) + 1); for (i = qb; i < qe; ++i) str->s[str->l++] = s->qual[i]; str->s[str->l] = 0; } else kputc('*', str); } else { // the reverse strand int i, qb = 0, qe = s->l_seq; if (p->n_cigar && which && !(opt->flag&MEM_F_SOFTCLIP) && !p->is_alt) { if ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3) qe -= p->cigar[0]>>4; if ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3) qb += p->cigar[p->n_cigar-1]>>4; } ks_resize(str, str->l + (qe - qb) + 1); for (i = qe-1; i >= qb; --i) str->s[str->l++] = "TGCAN"[(int)s->seq[i]]; kputc('\t', str); if (s->qual) { // printf qual ks_resize(str, str->l + (qe - qb) + 1); for (i = qe-1; i >= qb; --i) str->s[str->l++] = s->qual[i]; str->s[str->l] = 0; } else kputc('*', str); } // print optional tags if (p->n_cigar) { kputsn("\tNM:i:", 6, str); kputw(p->NM, str); kputsn("\tMD:Z:", 6, str); kputs((char*)(p->cigar + p->n_cigar), str); } if (m && m->n_cigar) { kputsn("\tMC:Z:", 6, str); add_cigar(opt, m, str, which); } if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); } if 
(!(p->flag & 0x100)) { // not multi-hit for (i = 0; i < n; ++i) if (i != which && !(list[i].flag&0x100)) break; if (i < n) { // there are other primary hits; output them kputsn("\tSA:Z:", 6, str); for (i = 0; i < n; ++i) { const mem_aln_t *r = &list[i]; int k; if (i == which || (r->flag&0x100)) continue; // proceed if: 1) different from the current; 2) not shadowed multi hit kputs(bns->anns[r->rid].name, str); kputc(',', str); kputl(r->pos+1, str); kputc(',', str); kputc("+-"[r->is_rev], str); kputc(',', str); for (k = 0; k < r->n_cigar; ++k) { kputw(r->cigar[k]>>4, str); kputc("MIDSH"[r->cigar[k]&0xf], str); } kputc(',', str); kputw(r->mapq, str); kputc(',', str); kputw(r->NM, str); kputc(';', str); } } if (p->alt_sc > 0) ksprintf(str, "\tpa:f:%.3f", (double)p->score / p->alt_sc); } if (p->XA) { kputsn("\tXA:Z:", 6, str); kputs(p->XA, str); } if (s->comment) { kputc('\t', str); kputs(s->comment, str); } if ((opt->flag&MEM_F_REF_HDR) && p->rid >= 0 && bns->anns[p->rid].anno != 0 && bns->anns[p->rid].anno[0] != 0) { int tmp; kputsn("\tXR:Z:", 6, str); tmp = str->l; kputs(bns->anns[p->rid].anno, str); for (i = tmp; i < str->l; ++i) // replace TAB in the comment to SPACE if (str->s[i] == '\t') str->s[i] = ' '; } kputc('\n', str); } /************************ * Integrated interface * ************************/ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) { int mapq, l, sub = a->sub? a->sub : opt->min_seed_len * opt->a; double identity; sub = a->csub > sub? a->csub : sub; if (sub >= a->score) return 0; l = a->qe - a->qb > a->re - a->rb? a->qe - a->qb : a->re - a->rb; identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l; if (a->score == 0) { mapq = 0; } else if (opt->mapQ_coef_len > 0) { double tmp; tmp = l < opt->mapQ_coef_len? 1. : opt->mapQ_coef_fac / log(l); tmp *= identity * identity; mapq = (int)(6.02 * (a->score - sub) / opt->a * tmp * tmp + .499); } else { mapq = (int)(MEM_MAPQ_COEF * (1. 
- (double)sub / a->score) * log(a->seedcov) + .499); mapq = identity < 0.95? (int)(mapq * identity * identity + .499) : mapq; } if (a->sub_n > 0) mapq -= (int)(4.343 * log(a->sub_n+1) + .499); if (mapq > 60) mapq = 60; if (mapq < 0) mapq = 0; mapq = (int)(mapq * (1. - a->frac_rep) + .499); return mapq; } void mem_reorder_primary5(int T, mem_alnreg_v *a) { int k, n_pri = 0, left_st = INT_MAX, left_k = -1; mem_alnreg_t t; for (k = 0; k < a->n; ++k) if (a->a[k].secondary < 0 && !a->a[k].is_alt && a->a[k].score >= T) ++n_pri; if (n_pri <= 1) return; // only one alignment for (k = 0; k < a->n; ++k) { mem_alnreg_t *p = &a->a[k]; if (p->secondary >= 0 || p->is_alt || p->score < T) continue; if (p->qb < left_st) left_st = p->qb, left_k = k; } assert(a->a[0].secondary < 0); if (left_k == 0) return; // no need to reorder t = a->a[0], a->a[0] = a->a[left_k], a->a[left_k] = t; for (k = 1; k < a->n; ++k) { // update secondary and secondary_all mem_alnreg_t *p = &a->a[k]; if (p->secondary == 0) p->secondary = left_k; else if (p->secondary == left_k) p->secondary = 0; if (p->secondary_all == 0) p->secondary_all = left_k; else if (p->secondary_all == left_k) p->secondary_all = 0; } } // TODO (future plan): group hits into a uint64_t[] array. 
This will be cleaner and more flexible void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m) { extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query); kstring_t str; kvec_t(mem_aln_t) aa; int k, l; char **XA = 0; if (!(opt->flag & MEM_F_ALL)) XA = mem_gen_alt(opt, bns, pac, a, s->l_seq, s->seq); kv_init(aa); str.l = str.m = 0; str.s = 0; for (k = l = 0; k < a->n; ++k) { mem_alnreg_t *p = &a->a[k]; mem_aln_t *q; if (p->score < opt->T) continue; if (p->secondary >= 0 && (p->is_alt || !(opt->flag&MEM_F_ALL))) continue; if (p->secondary >= 0 && p->secondary < INT_MAX && p->score < a->a[p->secondary].score * opt->drop_ratio) continue; q = kv_pushp(mem_aln_t, aa); *q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p); assert(q->rid >= 0); // this should not happen with the new code q->XA = XA? XA[k] : 0; q->flag |= extra_flag; // flag secondary if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score if (l && p->secondary < 0) // if supplementary q->flag |= (opt->flag&MEM_F_NO_MULTI)? 
0x10000 : 0x800; if (!(opt->flag & MEM_F_KEEP_SUPP_MAPQ) && l && !p->is_alt && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq; // lower mapq for supplementary mappings, unless -5 or -q is applied ++l; } if (aa.n == 0) { // no alignments good enough; then write an unaligned record mem_aln_t t; t = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, 0); t.flag |= extra_flag; mem_aln2sam(opt, bns, &str, s, 1, &t, 0, m); } else { for (k = 0; k < aa.n; ++k) mem_aln2sam(opt, bns, &str, s, aa.n, aa.a, k, m); for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar); free(aa.a); } s->sam = str.s; if (XA) { for (k = 0; k < a->n; ++k) free(XA[k]); free(XA); } } mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq, void *buf) { int i; mem_chain_v chn; mem_alnreg_v regs; for (i = 0; i < l_seq; ++i) // convert to 2-bit encoding if we have not done so seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]]; chn = mem_chain(opt, bwt, bns, l_seq, (uint8_t*)seq, buf); chn.n = mem_chain_flt(opt, chn.n, chn.a); mem_flt_chained_seeds(opt, bns, pac, l_seq, (uint8_t*)seq, chn.n, chn.a); if (bwa_verbose >= 4) mem_print_chain(bns, &chn); kv_init(regs); for (i = 0; i < chn.n; ++i) { mem_chain_t *p = &chn.a[i]; if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", i); mem_chain2aln(opt, bns, pac, l_seq, (uint8_t*)seq, p, ®s); free(chn.a[i].seeds); } free(chn.a); regs.n = mem_sort_dedup_patch(opt, bns, pac, (uint8_t*)seq, regs.n, regs.a); if (bwa_verbose >= 4) { err_printf("* %ld chains remain after removing duplicated chains\n", regs.n); for (i = 0; i < regs.n; ++i) { mem_alnreg_t *p = ®s.a[i]; printf("** %d, [%d,%d) <=> [%ld,%ld)\n", p->score, p->qb, p->qe, (long)p->rb, (long)p->re); } } for (i = 0; i < regs.n; ++i) { mem_alnreg_t *p = ®s.a[i]; if (p->rid >= 0 && bns->anns[p->rid].is_alt) p->is_alt = 1; } return regs; } mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int 
l_query, const char *query_, const mem_alnreg_t *ar) { mem_aln_t a; int i, w2, tmp, qb, qe, NM, score, is_rev, last_sc = -(1<<30), l_MD; int64_t pos, rb, re; uint8_t *query; memset(&a, 0, sizeof(mem_aln_t)); if (ar == 0 || ar->rb < 0 || ar->re < 0) { // generate an unmapped record a.rid = -1; a.pos = -1; a.flag |= 0x4; return a; } qb = ar->qb, qe = ar->qe; rb = ar->rb, re = ar->re; query = malloc(l_query); for (i = 0; i < l_query; ++i) // convert to the nt4 encoding query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]]; a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0; if (ar->secondary >= 0) a.flag |= 0x100; // secondary alignment tmp = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_del, opt->e_del); w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_ins, opt->e_ins); w2 = w2 > tmp? w2 : tmp; if (bwa_verbose >= 4) printf("* Band width: inferred=%d, cmd_opt=%d, alnreg=%d\n", w2, opt->w, ar->w); if (w2 > opt->w) w2 = w2 < ar->w? w2 : ar->w; i = 0; a.cigar = 0; do { free(a.cigar); w2 = w2 < opt->w<<2? w2 : opt->w<<2; a.cigar = bwa_gen_cigar2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM); if (bwa_verbose >= 4) printf("* Final alignment: w2=%d, global_sc=%d, local_sc=%d\n", w2, score, ar->truesc); if (score == last_sc || w2 == opt->w<<2) break; // it is possible that global alignment and local alignment give different scores last_sc = score; w2 <<= 1; } while (++i < 3 && score < ar->truesc - opt->a); l_MD = strlen((char*)(a.cigar + a.n_cigar)) + 1; a.NM = NM; pos = bns_depos(bns, rb < bns->l_pac? 
rb : re - 1, &is_rev); a.is_rev = is_rev; if (a.n_cigar > 0) { // squeeze out leading or trailing deletions if ((a.cigar[0]&0xf) == 2) { pos += a.cigar[0]>>4; --a.n_cigar; memmove(a.cigar, a.cigar + 1, a.n_cigar * 4 + l_MD); } else if ((a.cigar[a.n_cigar-1]&0xf) == 2) { --a.n_cigar; memmove(a.cigar + a.n_cigar, a.cigar + a.n_cigar + 1, l_MD); // MD needs to be moved accordingly } } if (qb != 0 || qe != l_query) { // add clipping to CIGAR int clip5, clip3; clip5 = is_rev? l_query - qe : qb; clip3 = is_rev? qb : l_query - qe; a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2) + l_MD); if (clip5) { memmove(a.cigar+1, a.cigar, a.n_cigar * 4 + l_MD); // make room for 5'-end clipping a.cigar[0] = clip5<<4 | 3; ++a.n_cigar; } if (clip3) { memmove(a.cigar + a.n_cigar + 1, a.cigar + a.n_cigar, l_MD); // make room for 3'-end clipping a.cigar[a.n_cigar++] = clip3<<4 | 3; } } a.rid = bns_pos2rid(bns, pos); assert(a.rid == ar->rid); a.pos = pos - bns->anns[a.rid].offset; a.score = ar->score; a.sub = ar->sub > ar->csub? 
ar->sub : ar->csub; a.is_alt = ar->is_alt; a.alt_sc = ar->alt_sc; free(query); return a; } typedef struct { const mem_opt_t *opt; const bwt_t *bwt; const bntseq_t *bns; const uint8_t *pac; const mem_pestat_t *pes; smem_aux_t **aux; bseq1_t *seqs; mem_alnreg_v *regs; int64_t n_processed; } worker_t; static void worker1(void *data, int i, int tid) { worker_t *w = (worker_t*)data; if (!(w->opt->flag&MEM_F_PE)) { if (bwa_verbose >= 4) printf("=====> Processing read '%s' <=====\n", w->seqs[i].name); w->regs[i] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq, w->aux[tid]); } else { if (bwa_verbose >= 4) printf("=====> Processing read '%s'/1 <=====\n", w->seqs[i<<1|0].name); w->regs[i<<1|0] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq, w->aux[tid]); if (bwa_verbose >= 4) printf("=====> Processing read '%s'/2 <=====\n", w->seqs[i<<1|1].name); w->regs[i<<1|1] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq, w->aux[tid]); } } static void worker2(void *data, int i, int tid) { extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]); extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a); worker_t *w = (worker_t*)data; if (!(w->opt->flag&MEM_F_PE)) { if (bwa_verbose >= 4) printf("=====> Finalizing read '%s' <=====\n", w->seqs[i].name); mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i); if (w->opt->flag & MEM_F_PRIMARY5) mem_reorder_primary5(w->opt->T, &w->regs[i]); mem_reg2sam(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); free(w->regs[i].a); } else { if (bwa_verbose >= 4) printf("=====> Finalizing read pair '%s' <=====\n", w->seqs[i<<1|0].name); mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed>>1) + i, &w->seqs[i<<1], &w->regs[i<<1]); 
free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a); } } void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0) { extern void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n); worker_t w; mem_pestat_t pes[4]; double ctime, rtime; int i; ctime = cputime(); rtime = realtime(); global_bns = bns; w.regs = malloc(n * sizeof(mem_alnreg_v)); w.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac; w.seqs = seqs; w.n_processed = n_processed; w.pes = &pes[0]; w.aux = malloc(opt->n_threads * sizeof(smem_aux_t)); for (i = 0; i < opt->n_threads; ++i) w.aux[i] = smem_aux_init(); kt_for(opt->n_threads, worker1, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions for (i = 0; i < opt->n_threads; ++i) smem_aux_destroy(w.aux[i]); free(w.aux); if (opt->flag&MEM_F_PE) { // infer insert sizes if not provided if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0 else mem_pestat(opt, bns->l_pac, n, w.regs, pes); // otherwise, infer the insert size distribution from data } kt_for(opt->n_threads, worker2, &w, (opt->flag&MEM_F_PE)? 
n>>1 : n); // generate alignment free(w.regs); if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] Processed %d reads in %.3f CPU sec, %.3f real sec\n", __func__, n, cputime() - ctime, realtime() - rtime); } bwa-0.7.17/bwamem.h000066400000000000000000000174721317342117100140130ustar00rootroot00000000000000#ifndef BWAMEM_H_ #define BWAMEM_H_ #include "bwt.h" #include "bntseq.h" #include "bwa.h" #define MEM_MAPQ_COEF 30.0 #define MEM_MAPQ_MAX 60 struct __smem_i; typedef struct __smem_i smem_i; #define MEM_F_PE 0x2 #define MEM_F_NOPAIRING 0x4 #define MEM_F_ALL 0x8 #define MEM_F_NO_MULTI 0x10 #define MEM_F_NO_RESCUE 0x20 #define MEM_F_REF_HDR 0x100 #define MEM_F_SOFTCLIP 0x200 #define MEM_F_SMARTPE 0x400 #define MEM_F_PRIMARY5 0x800 #define MEM_F_KEEP_SUPP_MAPQ 0x1000 typedef struct { int a, b; // match score and mismatch penalty int o_del, e_del; int o_ins, e_ins; int pen_unpaired; // phred-scaled penalty for unpaired reads int pen_clip5,pen_clip3;// clipping penalty. This score is not deducted from the DP score. 
int w; // band width int zdrop; // Z-dropoff uint64_t max_mem_intv; int T; // output score threshold; only affecting output int flag; // see MEM_F_* macros int min_seed_len; // minimum seed length int min_chain_weight; int max_chain_extend; float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor int split_width; // split into a seed if its occurence is smaller than this value int max_occ; // skip a seed if its occurence is larger than this value int max_chain_gap; // do not chain seed if it is max_chain_gap-bp away from the closest seed int n_threads; // number of threads int chunk_size; // process chunk_size-bp sequences in a batch float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits float drop_ratio; // drop a chain if its seed coverage is below drop_ratio times the seed coverage of a better chain overlapping with the small chain float XA_drop_ratio; // when counting hits for the XA tag, ignore alignments with score < XA_drop_ratio * max_score; only effective for the XA tag float mask_level_redun; float mapQ_coef_len; int mapQ_coef_fac; int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end int max_XA_hits, max_XA_hits_alt; // if there are max_hits or fewer, output them all int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset } mem_opt_t; typedef struct { int64_t rb, re; // [rb,re): reference sequence in the alignment int qb, qe; // [qb,qe): query sequence in the alignment int rid; // reference seq ID int score; // best local SW score int truesc; // actual score corresponding to the aligned region; possibly smaller than $score int sub; // 2nd best SW score int alt_sc; int csub; // SW score of a tandem hit int sub_n; // approximate number of suboptimal hits int w; // actual band width used in extension int seedcov; 
// length of regions coverged by seeds int secondary; // index of the parent hit shadowing the current hit; <0 if primary int secondary_all; int seedlen0; // length of the starting seed int n_comp:30, is_alt:2; // number of sub-alignments chained together float frac_rep; uint64_t hash; } mem_alnreg_t; typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v; typedef struct { int low, high; // lower and upper bounds within which a read pair is considered to be properly paired int failed; // non-zero if the orientation is not supported by sufficient data double avg, std; // mean and stddev of the insert size distribution } mem_pestat_t; typedef struct { // This struct is only used for the convenience of API. int64_t pos; // forward strand 5'-end mapping position int rid; // reference sequence index in bntseq_t; <0 for unmapped int flag; // extra flag uint32_t is_rev:1, is_alt:1, mapq:8, NM:22; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance int n_cigar; // number of CIGAR operations uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234 char *XA; // alternative mappings int score, sub, alt_sc; } mem_aln_t; #ifdef __cplusplus extern "C" { #endif smem_i *smem_itr_init(const bwt_t *bwt); void smem_itr_destroy(smem_i *itr); void smem_set_query(smem_i *itr, int len, const uint8_t *query); void smem_config(smem_i *itr, int min_intv, int max_len, uint64_t max_intv); const bwtintv_v *smem_next(smem_i *itr); mem_opt_t *mem_opt_init(void); void mem_fill_scmat(int a, int b, int8_t mat[25]); /** * Align a batch of sequences and generate the alignments in the SAM format * * This routine requires $seqs[i].{l_seq,seq,name} and write $seqs[i].sam. * Note that $seqs[i].sam may consist of several SAM lines if the * corresponding sequence has multiple primary hits. * * In the paired-end mode (i.e. 
MEM_F_PE is set in $opt->flag), query
	 * sequences must be interleaved: $n must be an even number and the 2i-th
	 * sequence and the (2i+1)-th sequence constitute a read pair. In this
	 * mode, there should be enough (typically >50) unique pairs for the
	 * routine to infer the orientation and insert size.
	 *
	 * @param opt    alignment parameters
	 * @param bwt    FM-index of the reference sequence
	 * @param bns    Information of the reference
	 * @param pac    2-bit encoded reference
	 * @param n_processed  offset added to the index of a sequence (or pair) within
	 *               this batch to form the id used when marking primary hits;
	 *               presumably the number of sequences processed in earlier batches
	 * @param n      number of query sequences
	 * @param seqs   query sequences; $seqs[i].seq/sam to be modified after the call
	 * @param pes0   insert-size info; if NULL, infer from data; if not NULL, it should be an array with 4 elements,
	 *               corresponding to each FF, FR, RF and RR orientation. See mem_pestat() for more info.
	 */
	void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0);

	/**
	 * Find the aligned regions for one query sequence
	 *
	 * Note that this routine does not generate CIGAR. CIGAR should be
	 * generated later by mem_reg2aln() below.
	 *
	 * @param opt    alignment parameters
	 * @param bwt    FM-index of the reference sequence
	 * @param bns    Information of the reference
	 * @param pac    2-bit encoded reference
	 * @param l_seq  length of query sequence
	 * @param seq    query sequence
	 *
	 * @return       list of aligned regions.
*/ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq); /** * Generate CIGAR and forward-strand position from alignment region * * @param opt alignment parameters * @param bns Information of the reference * @param pac 2-bit encoded reference * @param l_seq length of query sequence * @param seq query sequence * @param ar one alignment region * * @return CIGAR, strand, mapping quality and forward-strand position */ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar); mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar, const char *name); /** * Infer the insert size distribution from interleaved alignment regions * * This function can be called after mem_align1(), as long as paired-end * reads are properly interleaved. * * @param opt alignment parameters * @param l_pac length of concatenated reference sequence * @param n number of query sequences; must be an even number * @param regs region array of size $n; 2i-th and (2i+1)-th elements constitute a pair * @param pes inferred insert size distribution (output) */ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]); #ifdef __cplusplus } #endif #endif bwa-0.7.17/bwamem_extra.c000066400000000000000000000105141317342117100151770ustar00rootroot00000000000000#include #include "bwa.h" #include "bwamem.h" #include "bntseq.h" #include "kstring.h" /*************************** * SMEM iterator interface * ***************************/ struct __smem_i { const bwt_t *bwt; const uint8_t *query; int start, len; int min_intv, max_len; uint64_t max_intv; bwtintv_v *matches; // matches; to be returned by smem_next() bwtintv_v *sub; // sub-matches inside the longest match; temporary bwtintv_v *tmpvec[2]; // temporary arrays }; smem_i 
*smem_itr_init(const bwt_t *bwt) { smem_i *itr; itr = calloc(1, sizeof(smem_i)); itr->bwt = bwt; itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v)); itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); itr->matches = calloc(1, sizeof(bwtintv_v)); itr->sub = calloc(1, sizeof(bwtintv_v)); itr->min_intv = 1; itr->max_len = INT_MAX; itr->max_intv = 0; return itr; } void smem_itr_destroy(smem_i *itr) { free(itr->tmpvec[0]->a); free(itr->tmpvec[0]); free(itr->tmpvec[1]->a); free(itr->tmpvec[1]); free(itr->matches->a); free(itr->matches); free(itr->sub->a); free(itr->sub); free(itr); } void smem_set_query(smem_i *itr, int len, const uint8_t *query) { itr->query = query; itr->start = 0; itr->len = len; } void smem_config(smem_i *itr, int min_intv, int max_len, uint64_t max_intv) { itr->min_intv = min_intv; itr->max_len = max_len; itr->max_intv = max_intv; } const bwtintv_v *smem_next(smem_i *itr) { int ori_start; itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0; if (itr->start >= itr->len || itr->start < 0) return 0; while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases if (itr->start == itr->len) return 0; ori_start = itr->start; itr->start = bwt_smem1a(itr->bwt, itr->len, itr->query, ori_start, itr->min_intv, itr->max_intv, itr->matches, itr->tmpvec); // search for SMEM return itr->matches; } /*********************** *** Extra functions *** ***********************/ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq_) { // the difference from mem_align1_core() is that this routine: 1) calls mem_mark_primary_se(); 2) does not modify the input sequence extern mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq, void *buf); extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id); mem_alnreg_v ar; char *seq; seq = malloc(l_seq); 
memcpy(seq, seq_, l_seq); // makes a copy of seq_ ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq, 0); mem_mark_primary_se(opt, ar.n, ar.a, lrand48()); free(seq); return ar; } static inline int get_pri_idx(double XA_drop_ratio, const mem_alnreg_t *a, int i) { int k = a[i].secondary_all; if (k >= 0 && a[i].score >= a[k].score * XA_drop_ratio) return k; return -1; } // Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup. char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query) // ONLY work after mem_mark_primary_se() { int i, k, r, *cnt, tot; kstring_t *aln = 0, str = {0,0,0}; char **XA = 0, *has_alt; cnt = calloc(a->n, sizeof(int)); has_alt = calloc(a->n, 1); for (i = 0, tot = 0; i < a->n; ++i) { r = get_pri_idx(opt->XA_drop_ratio, a->a, i); if (r >= 0) { ++cnt[r], ++tot; if (a->a[i].is_alt) has_alt[r] = 1; } } if (tot == 0) goto end_gen_alt; aln = calloc(a->n, sizeof(kstring_t)); for (i = 0; i < a->n; ++i) { mem_aln_t t; if ((r = get_pri_idx(opt->XA_drop_ratio, a->a, i)) < 0) continue; if (cnt[r] > opt->max_XA_hits_alt || (!has_alt[r] && cnt[r] > opt->max_XA_hits)) continue; t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]); str.l = 0; kputs(bns->anns[t.rid].name, &str); kputc(',', &str); kputc("+-"[t.is_rev], &str); kputl(t.pos + 1, &str); kputc(',', &str); for (k = 0; k < t.n_cigar; ++k) { kputw(t.cigar[k]>>4, &str); kputc("MIDSHN"[t.cigar[k]&0xf], &str); } kputc(',', &str); kputw(t.NM, &str); kputc(';', &str); free(t.cigar); kputsn(str.s, str.l, &aln[r]); } XA = calloc(a->n, sizeof(char*)); for (k = 0; k < a->n; ++k) XA[k] = aln[k].s; end_gen_alt: free(has_alt); free(cnt); free(aln); free(str.s); return XA; } bwa-0.7.17/bwamem_pair.c000066400000000000000000000417211317342117100150130ustar00rootroot00000000000000#include #include #include #include #include "kstring.h" #include "bwamem.h" #include "kvec.h" 
#include "utils.h" #include "ksw.h" #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif #define MIN_RATIO 0.8 #define MIN_DIR_CNT 10 #define MIN_DIR_RATIO 0.05 #define OUTLIER_BOUND 2.0 #define MAPPING_BOUND 3.0 #define MAX_STDDEV 4.0 static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t *dist) { int64_t p2; int r1 = (b1 >= l_pac), r2 = (b2 >= l_pac); p2 = r1 == r2? b2 : (l_pac<<1) - 1 - b2; // p2 is the coordinate of read 2 on the read 1 strand *dist = p2 > b1? p2 - b1 : b1 - p2; return (r1 == r2? 0 : 1) ^ (p2 > b1? 0 : 3); } static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) { int j; for (j = 1; j < r->n; ++j) { // choose unique alignment int b_max = r->a[j].qb > r->a[0].qb? r->a[j].qb : r->a[0].qb; int e_min = r->a[j].qe < r->a[0].qe? r->a[j].qe : r->a[0].qe; if (e_min > b_max) { // have overlap int min_l = r->a[j].qe - r->a[j].qb < r->a[0].qe - r->a[0].qb? r->a[j].qe - r->a[j].qb : r->a[0].qe - r->a[0].qb; if (e_min - b_max >= min_l * opt->mask_level) break; // significant overlap } } return j < r->n? 
r->a[j].score : opt->min_seed_len * opt->a; } void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]) { int i, d, max; uint64_v isize[4]; memset(pes, 0, 4 * sizeof(mem_pestat_t)); memset(isize, 0, sizeof(kvec_t(int)) * 4); for (i = 0; i < n>>1; ++i) { int dir; int64_t is; mem_alnreg_v *r[2]; r[0] = (mem_alnreg_v*)®s[i<<1|0]; r[1] = (mem_alnreg_v*)®s[i<<1|1]; if (r[0]->n == 0 || r[1]->n == 0) continue; if (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score) continue; if (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score) continue; if (r[0]->a[0].rid != r[1]->a[0].rid) continue; // not on the same chr dir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is); if (is && is <= opt->max_ins) kv_push(uint64_t, isize[dir], is); } if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n); for (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two. 
mem_pestat_t *r = &pes[d]; uint64_v *q = &isize[d]; int p25, p50, p75, x; if (q->n < MIN_DIR_CNT) { fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]); r->failed = 1; free(q->a); continue; } else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]); ks_introsort_64(q->n, q->a); p25 = q->a[(int)(.25 * q->n + .499)]; p50 = q->a[(int)(.50 * q->n + .499)]; p75 = q->a[(int)(.75 * q->n + .499)]; r->low = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499); if (r->low < 1) r->low = 1; r->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499); fprintf(stderr, "[M::%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75); fprintf(stderr, "[M::%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r->low, r->high); for (i = x = 0, r->avg = 0; i < q->n; ++i) if (q->a[i] >= r->low && q->a[i] <= r->high) r->avg += q->a[i], ++x; r->avg /= x; for (i = 0, r->std = 0; i < q->n; ++i) if (q->a[i] >= r->low && q->a[i] <= r->high) r->std += (q->a[i] - r->avg) * (q->a[i] - r->avg); r->std = sqrt(r->std / x); fprintf(stderr, "[M::%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r->avg, r->std); r->low = (int)(p25 - MAPPING_BOUND * (p75 - p25) + .499); r->high = (int)(p75 + MAPPING_BOUND * (p75 - p25) + .499); if (r->low > r->avg - MAX_STDDEV * r->std) r->low = (int)(r->avg - MAX_STDDEV * r->std + .499); if (r->high < r->avg + MAX_STDDEV * r->std) r->high = (int)(r->avg + MAX_STDDEV * r->std + .499); if (r->low < 1) r->low = 1; fprintf(stderr, "[M::%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r->low, r->high); free(q->a); } for (d = 0, max = 0; d < 4; ++d) max = max > isize[d].n? 
max : isize[d].n; for (d = 0; d < 4; ++d) if (pes[d].failed == 0 && isize[d].n < max * MIN_DIR_RATIO) { pes[d].failed = 1; fprintf(stderr, "[M::%s] skip orientation %c%c\n", __func__, "FR"[d>>1&1], "FR"[d&1]); } } int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) { extern int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int n, mem_alnreg_t *a); int64_t l_pac = bns->l_pac; int i, r, skip[4], n = 0, rid; for (r = 0; r < 4; ++r) skip[r] = pes[r].failed? 1 : 0; for (i = 0; i < ma->n; ++i) { // check which orinentation has been found int64_t dist; r = mem_infer_dir(l_pac, a->rb, ma->a[i].rb, &dist); if (dist >= pes[r].low && dist <= pes[r].high) skip[r] = 1; } if (skip[0] + skip[1] + skip[2] + skip[3] == 4) return 0; // consistent pair exist; no need to perform SW for (r = 0; r < 4; ++r) { int is_rev, is_larger; uint8_t *seq, *rev = 0, *ref = 0; int64_t rb, re; if (skip[r]) continue; is_rev = (r>>1 != (r&1)); // whether to reverse complement the mate is_larger = !(r>>1); // whether the mate has larger coordinate if (is_rev) { rev = malloc(l_ms); // this is the reverse complement of $ms for (i = 0; i < l_ms; ++i) rev[l_ms - 1 - i] = ms[i] < 4? 3 - ms[i] : 4; seq = rev; } else seq = (uint8_t*)ms; if (!is_rev) { rb = is_larger? a->rb + pes[r].low : a->rb - pes[r].high; re = (is_larger? a->rb + pes[r].high: a->rb - pes[r].low) + l_ms; // if on the same strand, end position should be larger to make room for the seq length } else { rb = (is_larger? a->rb + pes[r].low : a->rb - pes[r].high) - l_ms; // similarly on opposite strands re = is_larger? 
a->rb + pes[r].high: a->rb - pes[r].low; } if (rb < 0) rb = 0; if (re > l_pac<<1) re = l_pac<<1; if (rb < re) ref = bns_fetch_seq(bns, pac, &rb, (rb+re)>>1, &re, &rid); if (a->rid == rid && re - rb >= opt->min_seed_len) { // no funny things happening kswr_t aln; mem_alnreg_t b; int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a); aln = ksw_align2(l_ms, seq, re - rb, ref, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0); memset(&b, 0, sizeof(mem_alnreg_t)); if (aln.score >= opt->min_seed_len && aln.qb >= 0) { // something goes wrong if aln.qb < 0 b.rid = a->rid; b.is_alt = a->is_alt; b.qb = is_rev? l_ms - (aln.qe + 1) : aln.qb; b.qe = is_rev? l_ms - aln.qb : aln.qe + 1; b.rb = is_rev? (l_pac<<1) - (rb + aln.te + 1) : rb + aln.tb; b.re = is_rev? (l_pac<<1) - (rb + aln.tb) : rb + aln.te + 1; b.score = aln.score; b.csub = aln.score2; b.secondary = -1; b.seedcov = (b.re - b.rb < b.qe - b.qb? b.re - b.rb : b.qe - b.qb) >> 1; // printf("*** %d, [%lld,%lld], %d:%d, (%lld,%lld), (%lld,%lld) == (%lld,%lld)\n", aln.score, rb, re, is_rev, is_larger, a->rb, a->re, ma->a[0].rb, ma->a[0].re, b.rb, b.re); kv_push(mem_alnreg_t, *ma, b); // make room for a new element // move b s.t. 
ma is sorted for (i = 0; i < ma->n - 1; ++i) // find the insertion point if (ma->a[i].score < b.score) break; tmp = i; for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1]; ma->a[i] = b; } ++n; } if (n) ma->n = mem_sort_dedup_patch(opt, 0, 0, 0, ma->n, ma->a); if (rev) free(rev); free(ref); } return n; } int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2], int n_pri[2]) { pair64_v v, u; int r, i, k, y[4], ret; // y[] keeps the last hit int64_t l_pac = bns->l_pac; kv_init(v); kv_init(u); for (r = 0; r < 2; ++r) { // loop through read number for (i = 0; i < n_pri[r]; ++i) { pair64_t key; mem_alnreg_t *e = &a[r].a[i]; key.x = e->rb < l_pac? e->rb : (l_pac<<1) - 1 - e->rb; // forward position key.x = (uint64_t)e->rid<<32 | (key.x - bns->anns[e->rid].offset); key.y = (uint64_t)e->score << 32 | i << 2 | (e->rb >= l_pac)<<1 | r; kv_push(pair64_t, v, key); } } ks_introsort_128(v.n, v.a); y[0] = y[1] = y[2] = y[3] = -1; //for (i = 0; i < v.n; ++i) printf("[%d]\t%d\t%c%ld\n", i, (int)(v.a[i].y&1)+1, "+-"[v.a[i].y>>1&1], (long)v.a[i].x); for (i = 0; i < v.n; ++i) { for (r = 0; r < 2; ++r) { // loop through direction int dir = r<<1 | (v.a[i].y>>1&1), which; if (pes[dir].failed) continue; // invalid orientation which = r<<1 | ((v.a[i].y&1)^1); if (y[which] < 0) continue; // no previous hits for (k = y[which]; k >= 0; --k) { // TODO: this is a O(n^2) solution in the worst case; remember to check if this loop takes a lot of time (I doubt) int64_t dist; int q; double ns; pair64_t *p; if ((v.a[k].y&3) != which) continue; dist = (int64_t)v.a[i].x - v.a[k].x; //printf("%d: %lld\n", k, dist); if (dist > pes[dir].high) break; if (dist < pes[dir].low) continue; ns = (dist - pes[dir].avg) / pes[dir].std; q = (int)((v.a[i].y>>32) + (v.a[k].y>>32) + .721 * log(2. 
* erfc(fabs(ns) * M_SQRT1_2)) * opt->a + .499); // .721 = 1/log(4) if (q < 0) q = 0; p = kv_pushp(pair64_t, u); p->y = (uint64_t)k<<32 | i; p->x = (uint64_t)q<<32 | (hash_64(p->y ^ id<<8) & 0xffffffffU); //printf("[%lld,%lld]\t%d\tdist=%ld\n", v.a[k].x, v.a[i].x, q, (long)dist); } } y[v.a[i].y&3] = i; } if (u.n) { // found at least one proper pair int tmp = opt->a + opt->b; tmp = tmp > opt->o_del + opt->e_del? tmp : opt->o_del + opt->e_del; tmp = tmp > opt->o_ins + opt->e_ins? tmp : opt->o_ins + opt->e_ins; ks_introsort_128(u.n, u.a); i = u.a[u.n-1].y >> 32; k = u.a[u.n-1].y << 32 >> 32; z[v.a[i].y&1] = v.a[i].y<<32>>34; // index of the best pair z[v.a[k].y&1] = v.a[k].y<<32>>34; ret = u.a[u.n-1].x >> 32; *sub = u.n > 1? u.a[u.n-2].x>>32 : 0; for (i = (long)u.n - 2, *n_sub = 0; i >= 0; --i) if (*sub - (int)(u.a[i].x>>32) <= tmp) ++*n_sub; } else ret = 0, *sub = 0, *n_sub = 0; free(u.a); free(v.a); return ret; } void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m); void mem_reorder_primary5(int T, mem_alnreg_v *a); #define raw_mapq(diff, a) ((int)(6.02 * (diff) / (a) + .499)) int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]) { extern int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id); extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a); extern void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m); extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query); int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1, n_pri[2], n_aa[2]; kstring_t str; mem_aln_t h[2], g[2], aa[2][2]; str.l = str.m = 0; str.s = 0; memset(h, 0, sizeof(mem_aln_t) * 2); 
memset(g, 0, sizeof(mem_aln_t) * 2); n_aa[0] = n_aa[1] = 0; if (!(opt->flag & MEM_F_NO_RESCUE)) { // then perform SW for the best alignment mem_alnreg_v b[2]; kv_init(b[0]); kv_init(b[1]); for (i = 0; i < 2; ++i) for (j = 0; j < a[i].n; ++j) if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired) kv_push(mem_alnreg_t, b[i], a[i].a[j]); for (i = 0; i < 2; ++i) for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) n += mem_matesw(opt, bns, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); free(b[0].a); free(b[1].a); } n_pri[0] = mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0); n_pri[1] = mem_mark_primary_se(opt, a[1].n, a[1].a, id<<1|1); if (opt->flag & MEM_F_PRIMARY5) { mem_reorder_primary5(opt->T, &a[0]); mem_reorder_primary5(opt->T, &a[1]); } if (opt->flag&MEM_F_NOPAIRING) goto no_pairing; // pairing single-end hits if (n_pri[0] && n_pri[1] && (o = mem_pair(opt, bns, pac, pes, s, a, id, &subo, &n_sub, z, n_pri)) > 0) { int is_multi[2], q_pe, score_un, q_se[2]; char **XA[2]; // check if an end has multiple hits even after mate-SW for (i = 0; i < 2; ++i) { for (j = 1; j < n_pri[i]; ++j) if (a[i].a[j].secondary < 0 && a[i].a[j].score >= opt->T) break; is_multi[i] = j < n_pri[i]? 1 : 0; } if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score // compute mapQ for the best SE hit score_un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired; //q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0; subo = subo > score_un? subo : score_un; q_pe = raw_mapq(o - subo, opt->a); if (n_sub > 0) q_pe -= (int)(4.343 * log(n_sub+1) + .499); if (q_pe < 0) q_pe = 0; if (q_pe > 60) q_pe = 60; q_pe = (int)(q_pe * (1. 
- .5 * (a[0].a[0].frac_rep + a[1].a[0].frac_rep)) + .499); // the following assumes no split hits if (o > score_un) { // paired alignment is preferred mem_alnreg_t *c[2]; c[0] = &a[0].a[z[0]]; c[1] = &a[1].a[z[1]]; for (i = 0; i < 2; ++i) { if (c[i]->secondary >= 0) c[i]->sub = a[i].a[c[i]->secondary].score, c[i]->secondary = -2; q_se[i] = mem_approx_mapq_se(opt, c[i]); } q_se[0] = q_se[0] > q_pe? q_se[0] : q_pe < q_se[0] + 40? q_pe : q_se[0] + 40; q_se[1] = q_se[1] > q_pe? q_se[1] : q_pe < q_se[1] + 40? q_pe : q_se[1] + 40; extra_flag |= 2; // cap at the tandem repeat score q_se[0] = q_se[0] < raw_mapq(c[0]->score - c[0]->csub, opt->a)? q_se[0] : raw_mapq(c[0]->score - c[0]->csub, opt->a); q_se[1] = q_se[1] < raw_mapq(c[1]->score - c[1]->csub, opt->a)? q_se[1] : raw_mapq(c[1]->score - c[1]->csub, opt->a); } else { // the unpaired alignment is preferred z[0] = z[1] = 0; q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]); q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]); } for (i = 0; i < 2; ++i) { int k = a[i].a[z[i]].secondary_all; if (k >= 0 && k < n_pri[i]) { // switch secondary and primary if both of them are non-ALT assert(a[i].a[k].secondary_all < 0); for (j = 0; j < a[i].n; ++j) if (a[i].a[j].secondary_all == k || j == k) a[i].a[j].secondary_all = z[i]; a[i].a[z[i]].secondary_all = -1; } } if (!(opt->flag & MEM_F_ALL)) { for (i = 0; i < 2; ++i) XA[i] = mem_gen_alt(opt, bns, pac, &a[i], s[i].l_seq, s[i].seq); } else XA[0] = XA[1] = 0; // write SAM for (i = 0; i < 2; ++i) { h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[z[i]]); h[i].mapq = q_se[i]; h[i].flag |= 0x40<score < opt->T || p->secondary >= 0 || !p->is_alt) continue; g[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, p); g[i].flag |= 0x800 | 0x40<= opt->T) which = 0; else if (n_pri[i] < a[i].n && a[i].a[n_pri[i]].score >= opt->T) which = n_pri[i]; } if (which >= 0) h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[which]); else h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, 
s[i].seq, 0); } if (!(opt->flag & MEM_F_NOPAIRING) && h[0].rid == h[1].rid && h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it. int64_t dist; int d; d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist); if (!pes[d].failed && dist >= pes[d].low && dist <= pes[d].high) extra_flag |= 2; } mem_reg2sam(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1]); mem_reg2sam(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0]); if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name); free(h[0].cigar); free(h[1].cigar); return n; } bwa-0.7.17/bwape.c000066400000000000000000000713241317342117100136300ustar00rootroot00000000000000#include #include #include #include #include #include #include "bwtaln.h" #include "kvec.h" #include "bntseq.h" #include "utils.h" #include "bwase.h" #include "bwa.h" #include "ksw.h" #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif typedef struct { int n; bwtint_t *a; } poslist_t; typedef struct { double avg, std, ap_prior; bwtint_t low, high, high_bayesian; } isize_info_t; #define b128_eq(a, b) ((a).x == (b).x && (a).y == (b).y) #define b128_hash(a) ((uint32_t)(a).x) #include "khash.h" KHASH_INIT(b128, pair64_t, poslist_t, 1, b128_hash, b128_eq) typedef struct { pair64_v arr; pair64_v pos[2]; kvec_t(bwt_aln1_t) aln[2]; } pe_data_t; #define MIN_HASH_WIDTH 1000 extern int g_log_n[256]; // in bwase.c static kh_b128_t *g_hash; void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi); void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); int bwa_approx_mapQ(const bwa_seq_t *p, int mm); void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2); bntseq_t *bwa_open_nt(const char *prefix); void bwa_print_sam_SQ(const bntseq_t *bns); pe_opt_t *bwa_init_pe_opt() { pe_opt_t *po; po = (pe_opt_t*)calloc(1, sizeof(pe_opt_t)); 
po->max_isize = 500; po->force_isize = 0; po->max_occ = 100000; po->n_multi = 3; po->N_multi = 10; po->type = BWA_PET_STD; po->is_sw = 1; po->ap_prior = 1e-5; return po; } /* static double ierfc(double x) // inverse erfc(); iphi(x) = M_SQRT2 *ierfc(2 * x); { const double a = 0.140012; double b, c; b = log(x * (2 - x)); c = 2./M_PI/a + b / 2.; return sqrt(sqrt(c * c - b / a) - c); } */ // for normal distribution, this is about 3std #define OUTLIER_BOUND 2.0 static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double ap_prior, int64_t L) { uint64_t x, *isizes, n_ap = 0; int n, i, tot, p25, p75, p50, max_len = 1, tmp; double skewness = 0.0, kurtosis = 0.0, y; ii->avg = ii->std = -1.0; ii->low = ii->high = ii->high_bayesian = 0; isizes = (uint64_t*)calloc(n_seqs, 8); for (i = 0, tot = 0; i != n_seqs; ++i) { bwa_seq_t *p[2]; p[0] = seqs[0] + i; p[1] = seqs[1] + i; if (p[0]->mapQ >= 20 && p[1]->mapQ >= 20) { x = (p[0]->pos < p[1]->pos)? p[1]->pos + p[1]->len - p[0]->pos : p[0]->pos + p[0]->len - p[1]->pos; if (x < 100000) isizes[tot++] = x; } if (p[0]->len > max_len) max_len = p[0]->len; if (p[1]->len > max_len) max_len = p[1]->len; } if (tot < 20) { fprintf(stderr, "[infer_isize] fail to infer insert size: too few good pairs\n"); free(isizes); return -1; } ks_introsort_64(tot, isizes); p25 = isizes[(int)(tot*0.25 + 0.5)]; p50 = isizes[(int)(tot*0.50 + 0.5)]; p75 = isizes[(int)(tot*0.75 + 0.5)]; tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499); ii->low = tmp > max_len? 
tmp : max_len; // ii->low is unsigned ii->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499); if (ii->low > ii->high) { fprintf(stderr, "[infer_isize] fail to infer insert size: upper bound is smaller than read length\n"); free(isizes); return -1; } for (i = 0, x = n = 0; i < tot; ++i) if (isizes[i] >= ii->low && isizes[i] <= ii->high) ++n, x += isizes[i]; ii->avg = (double)x / n; for (i = 0; i < tot; ++i) { if (isizes[i] >= ii->low && isizes[i] <= ii->high) { double tmp = (isizes[i] - ii->avg) * (isizes[i] - ii->avg); ii->std += tmp; skewness += tmp * (isizes[i] - ii->avg); kurtosis += tmp * tmp; } } kurtosis = kurtosis/n / (ii->std / n * ii->std / n) - 3; ii->std = sqrt(ii->std / n); // it would be better as n-1, but n is usually very large skewness = skewness / n / (ii->std * ii->std * ii->std); for (y = 1.0; y < 10.0; y += 0.01) if (.5 * erfc(y / M_SQRT2) < ap_prior / L * (y * ii->std + ii->avg)) break; ii->high_bayesian = (bwtint_t)(y * ii->std + ii->avg + .499); for (i = 0; i < tot; ++i) if (isizes[i] > ii->high_bayesian) ++n_ap; ii->ap_prior = .01 * (n_ap + .01) / tot; if (ii->ap_prior < ap_prior) ii->ap_prior = ap_prior; free(isizes); fprintf(stderr, "[infer_isize] (25, 50, 75) percentile: (%d, %d, %d)\n", p25, p50, p75); if (isnan(ii->std) || p75 > 100000) { ii->low = ii->high = ii->high_bayesian = 0; ii->avg = ii->std = -1.0; fprintf(stderr, "[infer_isize] fail to infer insert size: weird pairing\n"); return -1; } for (y = 1.0; y < 10.0; y += 0.01) if (.5 * erfc(y / M_SQRT2) < ap_prior / L * (y * ii->std + ii->avg)) break; ii->high_bayesian = (bwtint_t)(y * ii->std + ii->avg + .499); fprintf(stderr, "[infer_isize] low and high boundaries: %ld and %ld for estimating avg and std\n", (long)ii->low, (long)ii->high); fprintf(stderr, "[infer_isize] inferred external isize from %d pairs: %.3lf +/- %.3lf\n", n, ii->avg, ii->std); fprintf(stderr, "[infer_isize] skewness: %.3lf; kurtosis: %.3lf; ap_prior: %.2e\n", skewness, kurtosis, ii->ap_prior); 
fprintf(stderr, "[infer_isize] inferred maximum insert size: %ld (%.2lf sigma)\n", (long)ii->high_bayesian, y); return 0; } static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, const isize_info_t *ii) { int i, j, o_n, subo_n, cnt_chg = 0, low_bound = ii->low, max_len; uint64_t o_score, subo_score; pair64_t last_pos[2][2], o_pos[2]; max_len = p[0]->full_len; if (max_len < p[1]->full_len) max_len = p[1]->full_len; if (low_bound < max_len) low_bound = max_len; // here v>=u. When ii is set, we check insert size with ii; otherwise with opt->max_isize #define __pairing_aux(u,v) do { \ bwtint_t l = (v).x + p[(v).y&1]->len - ((u).x); \ if ((u).x != (uint64_t)-1 && (v).x > (u).x && l >= max_len \ && ((ii->high && l <= ii->high_bayesian) || (ii->high == 0 && l <= opt->max_isize))) \ { \ uint64_t s = d->aln[(v).y&1].a[(v).y>>2].score + d->aln[(u).y&1].a[(u).y>>2].score; \ s *= 10; \ if (ii->high) s += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * fabs(l - ii->avg) / ii->std)) + .499); \ s = s<<32 | (uint32_t)hash_64((u).x<<32 | (v).x); \ if (s>>32 == o_score>>32) ++o_n; \ else if (s>>32 < o_score>>32) { subo_n += o_n; o_n = 1; } \ else ++subo_n; \ if (s < o_score) subo_score = o_score, o_score = s, o_pos[(u).y&1] = (u), o_pos[(v).y&1] = (v); \ else if (s < subo_score) subo_score = s; \ } \ } while (0) #define __pairing_aux2(q, w) do { \ const bwt_aln1_t *r = d->aln[(w).y&1].a + ((w).y>>2); \ (q)->extra_flag |= SAM_FPP; \ if ((q)->pos != (w).x || (q)->strand != ((w).y>>1&1)) { \ (q)->n_mm = r->n_mm; (q)->n_gapo = r->n_gapo; (q)->n_gape = r->n_gape; (q)->strand = (w).y>>1&1; \ (q)->score = r->score; \ (q)->pos = (w).x; \ if ((q)->mapQ > 0) ++cnt_chg; \ } \ } while (0) o_score = subo_score = (uint64_t)-1; o_n = subo_n = 0; ks_introsort_128(d->arr.n, d->arr.a); for (j = 0; j < 2; ++j) last_pos[j][0].x = last_pos[j][0].y = last_pos[j][1].x = last_pos[j][1].y = (uint64_t)-1; if (opt->type == BWA_PET_STD) { for (i = 0; i < d->arr.n; ++i) { pair64_t x = 
d->arr.a[i]; int strand = x.y>>1&1; if (strand == 1) { // reverse strand, then check int y = 1 - (x.y&1); __pairing_aux(last_pos[y][1], x); __pairing_aux(last_pos[y][0], x); } else { // forward strand, then push last_pos[x.y&1][0] = last_pos[x.y&1][1]; last_pos[x.y&1][1] = x; } } } else { fprintf(stderr, "[paring] not implemented yet!\n"); exit(1); } // set pairing //fprintf(stderr, "[%ld, %d, %d, %d]\n", d->arr.n, (int)(o_score>>32), (int)(subo_score>>32), o_n); if (o_score != (uint64_t)-1) { int mapQ_p = 0; // this is the maximum mapping quality when one end is moved //fprintf(stderr, "%d, %d\n", o_n, subo_n); if (o_n == 1) { if (subo_score == (uint64_t)-1) mapQ_p = 29; // no sub-optimal pair else if ((subo_score>>32) - (o_score>>32) > s_mm * 10) mapQ_p = 23; // poor sub-optimal pair else { int n = subo_n > 255? 255 : subo_n; mapQ_p = ((subo_score>>32) - (o_score>>32)) / 2 - g_log_n[n]; if (mapQ_p < 0) mapQ_p = 0; } } if ((p[0]->pos == o_pos[0].x && p[0]->strand == (o_pos[0].y>>1&1)) && (p[1]->pos == o_pos[1].x && p[1]->strand == (o_pos[1].y>>1&1))) { // both ends not moved if (p[0]->mapQ > 0 && p[1]->mapQ > 0) { int mapQ = p[0]->mapQ + p[1]->mapQ; if (mapQ > 60) mapQ = 60; p[0]->mapQ = p[1]->mapQ = mapQ; } else { if (p[0]->mapQ == 0) p[0]->mapQ = (mapQ_p + 7 < p[1]->mapQ)? mapQ_p + 7 : p[1]->mapQ; if (p[1]->mapQ == 0) p[1]->mapQ = (mapQ_p + 7 < p[0]->mapQ)? 
mapQ_p + 7 : p[0]->mapQ; } } else if (p[0]->pos == o_pos[0].x && p[0]->strand == (o_pos[0].y>>1&1)) { // [1] moved p[1]->seQ = 0; p[1]->mapQ = p[0]->mapQ; if (p[1]->mapQ > mapQ_p) p[1]->mapQ = mapQ_p; } else if (p[1]->pos == o_pos[1].x && p[1]->strand == (o_pos[1].y>>1&1)) { // [0] moved p[0]->seQ = 0; p[0]->mapQ = p[1]->mapQ; if (p[0]->mapQ > mapQ_p) p[0]->mapQ = mapQ_p; } else { // both ends moved p[0]->seQ = p[1]->seQ = 0; mapQ_p -= 20; if (mapQ_p < 0) mapQ_p = 0; p[0]->mapQ = p[1]->mapQ = mapQ_p; } __pairing_aux2(p[0], o_pos[0]); __pairing_aux2(p[1], o_pos[1]); } return cnt_chg; } typedef struct { kvec_t(bwt_aln1_t) aln; } aln_buf_t; int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bwt, int n_seqs, bwa_seq_t *seqs[2], FILE *fp_sa[2], isize_info_t *ii, const pe_opt_t *opt, const gap_opt_t *gopt, const isize_info_t *last_ii) { int i, j, cnt_chg = 0; char str[1024]; bwt_t *bwt; pe_data_t *d; aln_buf_t *buf[2]; d = (pe_data_t*)calloc(1, sizeof(pe_data_t)); buf[0] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t)); buf[1] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t)); if (_bwt == 0) { // load forward SA strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt); } else bwt = _bwt; // SE for (i = 0; i != n_seqs; ++i) { bwa_seq_t *p[2]; for (j = 0; j < 2; ++j) { int n_aln; p[j] = seqs[j] + i; p[j]->n_multi = 0; p[j]->extra_flag |= SAM_FPD | (j == 0? SAM_FR1 : SAM_FR2); err_fread_noeof(&n_aln, 4, 1, fp_sa[j]); if (n_aln > kv_max(d->aln[j])) kv_resize(bwt_aln1_t, d->aln[j], n_aln); d->aln[j].n = n_aln; err_fread_noeof(d->aln[j].a, sizeof(bwt_aln1_t), n_aln, fp_sa[j]); kv_copy(bwt_aln1_t, buf[j][i].aln, d->aln[j]); // backup d->aln[j] // generate SE alignment and mapping quality bwa_aln2seq(n_aln, d->aln[j].a, p[j]); if (p[j]->type == BWA_TYPE_UNIQUE || p[j]->type == BWA_TYPE_REPEAT) { int strand; int max_diff = gopt->fnr > 0.0? 
bwa_cal_maxdiff(p[j]->len, BWA_AVG_ERR, gopt->fnr) : gopt->max_diff; p[j]->seQ = p[j]->mapQ = bwa_approx_mapQ(p[j], max_diff); p[j]->pos = bwa_sa2pos(bns, bwt, p[j]->sa, p[j]->len + p[j]->ref_shift, &strand); p[j]->strand = strand; if (p[j]->pos == (bwtint_t)-1) p[j]->type = BWA_TYPE_NO_MATCH; } } } // infer isize infer_isize(n_seqs, seqs, ii, opt->ap_prior, bwt->seq_len/2); if (ii->avg < 0.0 && last_ii->avg > 0.0) *ii = *last_ii; if (opt->force_isize) { fprintf(stderr, "[%s] discard insert size estimate as user's request.\n", __func__); ii->low = ii->high = 0; ii->avg = ii->std = -1.0; } // PE for (i = 0; i != n_seqs; ++i) { bwa_seq_t *p[2]; for (j = 0; j < 2; ++j) { p[j] = seqs[j] + i; kv_copy(bwt_aln1_t, d->aln[j], buf[j][i].aln); } if ((p[0]->type == BWA_TYPE_UNIQUE || p[0]->type == BWA_TYPE_REPEAT) && (p[1]->type == BWA_TYPE_UNIQUE || p[1]->type == BWA_TYPE_REPEAT)) { // only when both ends mapped pair64_t x; int j, k; long long n_occ[2]; for (j = 0; j < 2; ++j) { n_occ[j] = 0; for (k = 0; k < d->aln[j].n; ++k) n_occ[j] += d->aln[j].a[k].l - d->aln[j].a[k].k + 1; } if (n_occ[0] > opt->max_occ || n_occ[1] > opt->max_occ) continue; d->arr.n = 0; for (j = 0; j < 2; ++j) { for (k = 0; k < d->aln[j].n; ++k) { bwt_aln1_t *r = d->aln[j].a + k; bwtint_t l; if (0 && r->l - r->k + 1 >= MIN_HASH_WIDTH) { // then check hash table pair64_t key; int ret; key.x = r->k; key.y = r->l; khint_t iter = kh_put(b128, g_hash, key, &ret); if (ret) { // not in the hash table; ret must equal 1 as we never remove elements poslist_t *z = &kh_val(g_hash, iter); z->n = r->l - r->k + 1; z->a = (bwtint_t*)malloc(sizeof(bwtint_t) * z->n); for (l = r->k; l <= r->l; ++l) { int strand; z->a[l - r->k] = bwa_sa2pos(bns, bwt, l, p[j]->len + p[j]->ref_shift, &strand)<<1; z->a[l - r->k] |= strand; } } for (l = 0; l < kh_val(g_hash, iter).n; ++l) { x.x = kh_val(g_hash, iter).a[l]>>1; x.y = k<<2 | (kh_val(g_hash, iter).a[l]&1)<<1 | j; kv_push(pair64_t, d->arr, x); } } else { // then calculate on the 
fly for (l = r->k; l <= r->l; ++l) { int strand; x.x = bwa_sa2pos(bns, bwt, l, p[j]->len + p[j]->ref_shift, &strand); x.y = k<<2 | strand<<1 | j; kv_push(pair64_t, d->arr, x); } } } } cnt_chg += pairing(p, d, opt, gopt->s_mm, ii); } if (opt->N_multi || opt->n_multi) { for (j = 0; j < 2; ++j) { if (p[j]->type != BWA_TYPE_NO_MATCH) { int k, n_multi; if (!(p[j]->extra_flag&SAM_FPP) && p[1-j]->type != BWA_TYPE_NO_MATCH) { bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, p[j]->c1+p[j]->c2-1 > opt->N_multi? opt->n_multi : opt->N_multi); } else bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, opt->n_multi); for (k = 0, n_multi = 0; k < p[j]->n_multi; ++k) { int strand; bwt_multi1_t *q = p[j]->multi + k; q->pos = bwa_sa2pos(bns, bwt, q->pos, p[j]->len + q->ref_shift, &strand); q->strand = strand; if (q->pos != p[j]->pos && q->pos != (bwtint_t)-1) p[j]->multi[n_multi++] = *q; } p[j]->n_multi = n_multi; } } } } // free for (i = 0; i < n_seqs; ++i) { kv_destroy(buf[0][i].aln); kv_destroy(buf[1][i].aln); } free(buf[0]); free(buf[1]); if (_bwt == 0) bwt_destroy(bwt); kv_destroy(d->arr); kv_destroy(d->pos[0]); kv_destroy(d->pos[1]); kv_destroy(d->aln[0]); kv_destroy(d->aln[1]); free(d); return cnt_chg; } #define SW_MIN_MATCH_LEN 20 #define SW_MIN_MAPQ 17 // cnt = n_mm<<16 | n_gapo<<8 | n_gape bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen, int *n_cigar, uint32_t *_cnt) { kswr_t r; uint32_t *cigar32 = 0; bwa_cigar_t *cigar = 0; ubyte_t *ref_seq; bwtint_t k, x, y, l; int xtra, gscore; int8_t mat[25]; bwa_fill_scmat(1, 3, mat); // check whether there are too many N's if (reglen < SW_MIN_MATCH_LEN || (int64_t)l_pac - *beg < len) return 0; for (k = 0, x = 0; k < len; ++k) if (seq[k] >= 4) ++x; if ((float)x/len >= 0.25 || len - x < SW_MIN_MATCH_LEN) return 0; // get reference subsequence ref_seq = (ubyte_t*)calloc(reglen, 1); for (k = *beg, l = 0; l < reglen && k < l_pac; ++k) ref_seq[l++] = pacseq[k>>2] >> 
((~k&3)<<1) & 3; // do alignment xtra = KSW_XSUBO | KSW_XSTART | (len < 250? KSW_XBYTE : 0); r = ksw_align(len, (uint8_t*)seq, l, ref_seq, 5, mat, 5, 1, xtra, 0); gscore = ksw_global(r.qe - r.qb + 1, &seq[r.qb], r.te - r.tb + 1, &ref_seq[r.tb], 5, mat, 5, 1, 50, n_cigar, &cigar32); cigar = (bwa_cigar_t*)cigar32; for (k = 0; k < *n_cigar; ++k) cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4)); if (r.score < SW_MIN_MATCH_LEN || r.score2 == r.score || gscore != r.score) { // poor hit or tandem hits or weird alignment free(cigar); free(ref_seq); *n_cigar = 0; return 0; } // check whether the alignment is good enough for (k = 0, x = y = 0; k < *n_cigar; ++k) { bwa_cigar_t c = cigar[k]; if (__cigar_op(c) == FROM_M) x += __cigar_len(c), y += __cigar_len(c); else if (__cigar_op(c) == FROM_D) x += __cigar_len(c); else y += __cigar_len(c); } if (x < SW_MIN_MATCH_LEN || y < SW_MIN_MATCH_LEN) { // not good enough free(cigar); free(ref_seq); *n_cigar = 0; return 0; } { // update cigar and coordinate; int start = r.qb, end = r.qe + 1; *beg += r.tb; cigar = (bwa_cigar_t*)realloc(cigar, sizeof(bwa_cigar_t) * (*n_cigar + 2)); if (start) { memmove(cigar + 1, cigar, sizeof(bwa_cigar_t) * (*n_cigar)); cigar[0] = __cigar_create(3, start); ++(*n_cigar); } if (end < len) { /*cigar[*n_cigar] = 3<<14 | (len - end);*/ cigar[*n_cigar] = __cigar_create(3, (len - end)); ++(*n_cigar); } } { // set *cnt int n_mm, n_gapo, n_gape; n_mm = n_gapo = n_gape = 0; x = r.tb; y = r.qb; for (k = 0; k < *n_cigar; ++k) { bwa_cigar_t c = cigar[k]; if (__cigar_op(c) == FROM_M) { for (l = 0; l < (__cigar_len(c)); ++l) if (ref_seq[x+l] < 4 && seq[y+l] < 4 && ref_seq[x+l] != seq[y+l]) ++n_mm; x += __cigar_len(c), y += __cigar_len(c); } else if (__cigar_op(c) == FROM_D) { x += __cigar_len(c), ++n_gapo, n_gape += (__cigar_len(c)) - 1; } else if (__cigar_op(c) == FROM_I) { y += __cigar_len(c), ++n_gapo, n_gape += (__cigar_len(c)) - 1; } } *_cnt = (uint32_t)n_mm<<16 | n_gapo<<8 | n_gape; } free(ref_seq); 
return cigar; } ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, bwa_seq_t *seqs[2], const pe_opt_t *popt, const isize_info_t *ii) { ubyte_t *pacseq; int i; uint64_t n_tot[2], n_mapped[2]; // load reference sequence if (_pacseq == 0) { pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1); err_rewind(bns->fp_pac); err_fread_noeof(pacseq, 1, bns->l_pac/4+1, bns->fp_pac); } else pacseq = (ubyte_t*)_pacseq; if (!popt->is_sw || ii->avg < 0.0) return pacseq; // perform mate alignment n_tot[0] = n_tot[1] = n_mapped[0] = n_mapped[1] = 0; for (i = 0; i != n_seqs; ++i) { bwa_seq_t *p[2]; p[0] = seqs[0] + i; p[1] = seqs[1] + i; if ((p[0]->mapQ >= SW_MIN_MAPQ || p[1]->mapQ >= SW_MIN_MAPQ) && (p[0]->extra_flag&SAM_FPP) == 0) { // unpaired and one read has high mapQ int k, n_cigar[2], is_singleton, mapQ = 0, mq_adjust[2]; int64_t beg[2], end[2]; bwa_cigar_t *cigar[2]; uint32_t cnt[2]; /* In the following, _pref points to the reference read * which must be aligned; _pmate points to its mate which is * considered to be modified. */ #define __set_rght_coor(_a, _b, _pref, _pmate) do { \ (_a) = (int64_t)_pref->pos + ii->avg - 3 * ii->std - _pmate->len * 1.5; \ (_b) = (_a) + 6 * ii->std + 2 * _pmate->len; \ if ((_a) < (int64_t)_pref->pos + _pref->len) (_a) = _pref->pos + _pref->len; \ if ((_b) > bns->l_pac) (_b) = bns->l_pac; \ } while (0) #define __set_left_coor(_a, _b, _pref, _pmate) do { \ (_a) = (int64_t)_pref->pos + _pref->len - ii->avg - 3 * ii->std - _pmate->len * 0.5; \ (_b) = (_a) + 6 * ii->std + 2 * _pmate->len; \ if ((_a) < 0) (_a) = 0; \ if ((_b) > _pref->pos) (_b) = _pref->pos; \ } while (0) #define __set_fixed(_pref, _pmate, _beg, _cnt) do { \ _pmate->type = BWA_TYPE_MATESW; \ _pmate->pos = _beg; \ _pmate->seQ = _pref->seQ; \ _pmate->strand = (popt->type == BWA_PET_STD)? 
1 - _pref->strand : _pref->strand; \ _pmate->n_mm = _cnt>>16; _pmate->n_gapo = _cnt>>8&0xff; _pmate->n_gape = _cnt&0xff; \ _pmate->extra_flag |= SAM_FPP; \ _pref->extra_flag |= SAM_FPP; \ } while (0) mq_adjust[0] = mq_adjust[1] = 255; // not effective is_singleton = (p[0]->type == BWA_TYPE_NO_MATCH || p[1]->type == BWA_TYPE_NO_MATCH)? 1 : 0; ++n_tot[is_singleton]; cigar[0] = cigar[1] = 0; n_cigar[0] = n_cigar[1] = 0; if (popt->type != BWA_PET_STD) continue; // other types of pairing is not considered for (k = 0; k < 2; ++k) { // p[1-k] is the reference read and p[k] is the read considered to be modified ubyte_t *seq; if (p[1-k]->type == BWA_TYPE_NO_MATCH) continue; // if p[1-k] is unmapped, skip { // note that popt->type == BWA_PET_STD always true; in older versions, there was a branch for color-space FF/RR reads if (p[1-k]->strand == 0) { // then the mate is on the reverse strand and has larger coordinate __set_rght_coor(beg[k], end[k], p[1-k], p[k]); seq = p[k]->rseq; } else { // then the mate is on forward stand and has smaller coordinate __set_left_coor(beg[k], end[k], p[1-k], p[k]); seq = p[k]->seq; seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed; this will reversed back shortly } } // perform SW alignment cigar[k] = bwa_sw_core(bns->l_pac, pacseq, p[k]->len, seq, &beg[k], end[k] - beg[k], &n_cigar[k], &cnt[k]); if (cigar[k] && p[k]->type != BWA_TYPE_NO_MATCH) { // re-evaluate cigar[k] int s_old, clip = 0, s_new; if (__cigar_op(cigar[k][0]) == 3) clip += __cigar_len(cigar[k][0]); if (__cigar_op(cigar[k][n_cigar[k]-1]) == 3) clip += __cigar_len(cigar[k][n_cigar[k]-1]); s_old = (int)((p[k]->n_mm * 9 + p[k]->n_gapo * 13 + p[k]->n_gape * 2) / 3. * 8. + .499); s_new = (int)(((cnt[k]>>16) * 9 + (cnt[k]>>8&0xff) * 13 + (cnt[k]&0xff) * 2 + clip * 3) / 3. * 8. 
+ .499); s_old += -4.343 * log(ii->ap_prior / bns->l_pac); s_new += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * 1.5) + .499)); // assume the mapped isize is 1.5\sigma if (s_old < s_new) { // reject SW alignment mq_adjust[k] = s_new - s_old; free(cigar[k]); cigar[k] = 0; n_cigar[k] = 0; } else mq_adjust[k] = s_old - s_new; } // now revserse sequence back such that p[*]->seq looks untouched if (popt->type == BWA_PET_STD) { if (p[1-k]->strand == 1) seq_reverse(p[k]->len, seq, 0); } else { if (p[1-k]->strand == 0) seq_reverse(p[k]->len, seq, 0); } } k = -1; // no read to be changed if (cigar[0] && cigar[1]) { k = p[0]->mapQ < p[1]->mapQ? 0 : 1; // p[k] to be fixed mapQ = abs(p[1]->mapQ - p[0]->mapQ); } else if (cigar[0]) k = 0, mapQ = p[1]->mapQ; else if (cigar[1]) k = 1, mapQ = p[0]->mapQ; if (k >= 0 && p[k]->pos != beg[k]) { ++n_mapped[is_singleton]; { // recalculate mapping quality int tmp = (int)p[1-k]->mapQ - p[k]->mapQ/2 - 8; if (tmp <= 0) tmp = 1; if (mapQ > tmp) mapQ = tmp; p[k]->mapQ = p[1-k]->mapQ = mapQ; p[k]->seQ = p[1-k]->seQ = p[1-k]->seQ < mapQ? 
p[1-k]->seQ : mapQ; if (p[k]->mapQ > mq_adjust[k]) p[k]->mapQ = mq_adjust[k]; if (p[k]->seQ > mq_adjust[k]) p[k]->seQ = mq_adjust[k]; } // update CIGAR free(p[k]->cigar); p[k]->cigar = cigar[k]; cigar[k] = 0; p[k]->n_cigar = n_cigar[k]; // update the rest of information __set_fixed(p[1-k], p[k], beg[k], cnt[k]); } free(cigar[0]); free(cigar[1]); } } fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d singletons are mated.\n", (long long)n_mapped[1], (long long)n_tot[1], SW_MIN_MAPQ); fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d discordant pairs are fixed.\n", (long long)n_mapped[0], (long long)n_tot[0], SW_MIN_MAPQ); return pacseq; } void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt, const char *rg_line) { extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); int i, j, n_seqs; long long tot_seqs = 0; bwa_seq_t *seqs[2]; bwa_seqio_t *ks[2]; clock_t t; bntseq_t *bns; FILE *fp_sa[2]; gap_opt_t opt, opt0; khint_t iter; isize_info_t last_ii; // this is for the last batch of reads char str[1024], magic[2][4]; bwt_t *bwt; uint8_t *pac; // initialization bwase_initialize(); // initialize g_log_n[] in bwase.c pac = 0; bwt = 0; for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5); bns = bns_restore(prefix); srand48(bns->seed); fp_sa[0] = xopen(fn_sa[0], "r"); fp_sa[1] = xopen(fn_sa[1], "r"); g_hash = kh_init(b128); last_ii.avg = -1.0; err_fread_noeof(magic[0], 1, 4, fp_sa[0]); err_fread_noeof(magic[1], 1, 4, fp_sa[1]); if (strncmp(magic[0], SAI_MAGIC, 4) != 0 || strncmp(magic[1], SAI_MAGIC, 4) != 0) { fprintf(stderr, "[E::%s] Unmatched SAI magic. Please re-run `aln' with the same version of bwa.\n", __func__); exit(1); } err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa[0]); ks[0] = bwa_open_reads(opt.mode, fn_fa[0]); opt0 = opt; err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten! 
ks[1] = bwa_open_reads(opt.mode, fn_fa[1]); { // for Illumina alignment only if (popt->is_preload) { strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt); pac = (ubyte_t*)calloc(bns->l_pac/4+1, 1); err_rewind(bns->fp_pac); err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac); } } // core loop bwa_print_sam_hdr(bns, rg_line); while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) { int cnt_chg; isize_info_t ii; ubyte_t *pacseq; seqs[1] = bwa_read_seq(ks[1], 0x40000, &n_seqs, opt.mode, opt.trim_qual); tot_seqs += n_seqs; t = clock(); fprintf(stderr, "[bwa_sai2sam_pe_core] convert to sequence coordinate... \n"); cnt_chg = bwa_cal_pac_pos_pe(bns, prefix, bwt, n_seqs, seqs, fp_sa, &ii, popt, &opt, &last_ii); fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); fprintf(stderr, "[bwa_sai2sam_pe_core] changing coordinates of %d alignments.\n", cnt_chg); fprintf(stderr, "[bwa_sai2sam_pe_core] align unmapped mate...\n"); pacseq = bwa_paired_sw(bns, pac, n_seqs, seqs, popt, &ii); fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... "); for (j = 0; j < 2; ++j) bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); if (pac == 0) free(pacseq); fprintf(stderr, "[bwa_sai2sam_pe_core] print alignments... 
"); for (i = 0; i < n_seqs; ++i) { bwa_seq_t *p[2]; p[0] = seqs[0] + i; p[1] = seqs[1] + i; if (p[0]->bc[0] || p[1]->bc[0]) { strcat(p[0]->bc, p[1]->bc); strcpy(p[1]->bc, p[0]->bc); } bwa_print_sam1(bns, p[0], p[1], opt.mode, opt.max_top2); bwa_print_sam1(bns, p[1], p[0], opt.mode, opt.max_top2); if (strcmp(p[0]->name, p[1]->name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", p[0]->name, p[1]->name); } fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); for (j = 0; j < 2; ++j) bwa_free_read_seq(n_seqs, seqs[j]); fprintf(stderr, "[bwa_sai2sam_pe_core] %lld sequences have been processed.\n", tot_seqs); last_ii = ii; } // destroy bns_destroy(bns); for (i = 0; i < 2; ++i) { bwa_seq_close(ks[i]); err_fclose(fp_sa[i]); } for (iter = kh_begin(g_hash); iter != kh_end(g_hash); ++iter) if (kh_exist(g_hash, iter)) free(kh_val(g_hash, iter).a); kh_destroy(b128, g_hash); if (pac) { free(pac); bwt_destroy(bwt); } } int bwa_sai2sam_pe(int argc, char *argv[]) { int c; pe_opt_t *popt; char *prefix, *rg_line = 0; popt = bwa_init_pe_opt(); while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) { switch (c) { case 'r': if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; break; case 'a': popt->max_isize = atoi(optarg); break; case 'o': popt->max_occ = atoi(optarg); break; case 's': popt->is_sw = 0; break; case 'P': popt->is_preload = 1; break; case 'n': popt->n_multi = atoi(optarg); break; case 'N': popt->N_multi = atoi(optarg); break; case 'c': popt->ap_prior = atof(optarg); break; case 'f': xreopen(optarg, "w", stdout); break; case 'A': popt->force_isize = 1; break; default: return 1; } } if (optind + 5 > argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: bwa sampe [options] \n\n"); fprintf(stderr, "Options: -a INT maximum insert size [%d]\n", popt->max_isize); fprintf(stderr, " -o INT maximum occurrences for one end [%d]\n", popt->max_occ); fprintf(stderr, " -n INT maximum hits to output for paired reads 
[%d]\n", popt->n_multi); fprintf(stderr, " -N INT maximum hits to output for discordant pairs [%d]\n", popt->N_multi); fprintf(stderr, " -c FLOAT prior of chimeric rate (lower bound) [%.1le]\n", popt->ap_prior); fprintf(stderr, " -f FILE sam file to output results to [stdout]\n"); fprintf(stderr, " -r STR read group header line such as `@RG\\tID:foo\\tSM:bar' [null]\n"); fprintf(stderr, " -P preload index into memory (for base-space reads only)\n"); fprintf(stderr, " -s disable Smith-Waterman for the unmapped mate\n"); fprintf(stderr, " -A disable insert size estimate (force -s)\n\n"); fprintf(stderr, "Notes: 1. For SOLiD reads, corresponds R3 reads and to F3.\n"); fprintf(stderr, " 2. For reads shorter than 30bp, applying a smaller -o is recommended to\n"); fprintf(stderr, " to get a sensible speed at the cost of pairing accuracy.\n"); fprintf(stderr, "\n"); return 1; } if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { fprintf(stderr, "[%s] fail to locate the index\n", __func__); return 1; } bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt, rg_line); free(prefix); free(popt); return 0; } bwa-0.7.17/bwase.c000066400000000000000000000461561317342117100136400ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "bwase.h" #include "bwtaln.h" #include "bntseq.h" #include "utils.h" #include "kstring.h" #include "bwa.h" #include "ksw.h" #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif int g_log_n[256]; void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi) { int i, cnt, best; if (n_aln == 0) { s->type = BWA_TYPE_NO_MATCH; s->c1 = s->c2 = 0; return; } if (set_main) { best = aln[0].score; for (i = cnt = 0; i < n_aln; ++i) { const bwt_aln1_t *p = aln + i; if (p->score > best) break; if (drand48() * (p->l - p->k + 1 + cnt) > (double)cnt) { s->n_mm = p->n_mm; s->n_gapo = p->n_gapo; s->n_gape = p->n_gape; s->ref_shift = (int)p->n_del - (int)p->n_ins; 
s->score = p->score; s->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48()); } cnt += p->l - p->k + 1; } s->c1 = cnt; for (; i < n_aln; ++i) cnt += aln[i].l - aln[i].k + 1; s->c2 = cnt - s->c1; s->type = s->c1 > 1? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE; } if (n_multi) { int k, rest, n_occ, z = 0; for (k = n_occ = 0; k < n_aln; ++k) { const bwt_aln1_t *q = aln + k; n_occ += q->l - q->k + 1; } if (s->multi) free(s->multi); if (n_occ > n_multi + 1) { // if there are too many hits, generate none of them s->multi = 0; s->n_multi = 0; return; } /* The following code is more flexible than what is required * here. In principle, due to the requirement above, we can * simply output all hits, but the following samples "rest" * number of random hits. */ rest = n_occ > n_multi + 1? n_multi + 1 : n_occ; // find one additional for ->sa s->multi = calloc(rest, sizeof(bwt_multi1_t)); for (k = 0; k < n_aln; ++k) { const bwt_aln1_t *q = aln + k; if (q->l - q->k + 1 <= rest) { bwtint_t l; for (l = q->k; l <= q->l; ++l) { s->multi[z].pos = l; s->multi[z].gap = q->n_gapo + q->n_gape; s->multi[z].ref_shift = (int)q->n_del - (int)q->n_ins; s->multi[z++].mm = q->n_mm; } rest -= q->l - q->k + 1; } else { // Random sampling (http://code.activestate.com/recipes/272884/). In fact, we never come here. int j, i; for (j = rest, i = q->l - q->k + 1; j > 0; --j) { double p = 1.0, x = drand48(); while (x < p) p -= p * j / (i--); s->multi[z].pos = q->l - i; s->multi[z].gap = q->n_gapo + q->n_gape; s->multi[z].ref_shift = (int)q->n_del - (int)q->n_ins; s->multi[z++].mm = q->n_mm; } rest = 0; break; } } s->n_multi = z; } } void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s) { bwa_aln2seq_core(n_aln, aln, s, 1, 0); } int bwa_approx_mapQ(const bwa_seq_t *p, int mm) { int n; if (p->c1 == 0) return 23; if (p->c1 > 1) return 0; if (p->n_mm == mm) return 25; if (p->c2 == 0) return 37; n = (p->c2 >= 255)? 255 : p->c2; return (23 < g_log_n[n])? 
0 : 23 - g_log_n[n]; } bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int ref_len, int *strand) { bwtint_t pos_f; int is_rev; *strand = 0; // initialise strand to 0 otherwise we could return without setting it pos_f = bwt_sa(bwt, sapos); // position on the forward-reverse coordinate if (pos_f < bns->l_pac && bns->l_pac < pos_f + ref_len) return (bwtint_t)-1; pos_f = bns_depos(bns, pos_f, &is_rev); // position on the forward strand; this may be the first base or the last base *strand = !is_rev; if (is_rev) pos_f = pos_f + 1 < ref_len? 0 : pos_f - ref_len + 1; // position of the first base return pos_f; // FIXME: it is possible that pos_f < bns->anns[ref_id].offset } /** * Derive the actual position in the read from the given suffix array * coordinates. Note that the position will be approximate based on * whether indels appear in the read and whether calculations are * performed from the start or end of the read. */ void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t *bwt, bwa_seq_t *seq, const int max_mm, const float fnr) { int max_diff, strand; if (seq->type != BWA_TYPE_UNIQUE && seq->type != BWA_TYPE_REPEAT) return; max_diff = fnr > 0.0? 
bwa_cal_maxdiff(seq->len, BWA_AVG_ERR, fnr) : max_mm; seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff); //fprintf(stderr, "%d\n", seq->ref_shift); seq->pos = bwa_sa2pos(bns, bwt, seq->sa, seq->len + seq->ref_shift, &strand); seq->strand = strand; seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff); if (seq->pos == (bwtint_t)-1) seq->type = BWA_TYPE_NO_MATCH; } void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr) { int i, j, strand, n_multi; char str[1024]; bwt_t *bwt; // load forward SA strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt); for (i = 0; i != n_seqs; ++i) { bwa_seq_t *p = &seqs[i]; bwa_cal_pac_pos_core(bns, bwt, p, max_mm, fnr); for (j = n_multi = 0; j < p->n_multi; ++j) { bwt_multi1_t *q = p->multi + j; q->pos = bwa_sa2pos(bns, bwt, q->pos, p->len + q->ref_shift, &strand); q->strand = strand; if (q->pos != p->pos && q->pos != (bwtint_t)-1) p->multi[n_multi++] = *q; } p->n_multi = n_multi; } bwt_destroy(bwt); } #define SW_BW 50 bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, ubyte_t *seq, int ref_shift, bwtint_t *_rb, int *n_cigar) { bwa_cigar_t *cigar = 0; uint32_t *cigar32 = 0; ubyte_t *rseq; int64_t k, rb, re, rlen; int8_t mat[25]; int w; bwa_fill_scmat(1, 3, mat); rb = *_rb; re = rb + len + ref_shift; assert(re <= l_pac); rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); assert(re - rb == rlen); w = abs((int)rlen - len) * 1.5; ksw_global(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW > w? 
SW_BW : w, n_cigar, &cigar32); assert(*n_cigar > 0); if ((cigar32[*n_cigar - 1]&0xf) == 1) cigar32[*n_cigar - 1] = (cigar32[*n_cigar - 1]>>4<<4) | 3; // change endding ins to soft clipping if ((cigar32[0]&0xf) == 1) cigar32[0] = (cigar32[0]>>4<<4) | 3; // change beginning ins to soft clipping if ((cigar32[*n_cigar - 1]&0xf) == 2) --*n_cigar; // delete endding del if ((cigar32[0]&0xf) == 2) { // delete beginning del *_rb += cigar32[0]>>4; --*n_cigar; memmove(cigar32, cigar32+1, (*n_cigar) * 4); } cigar = (bwa_cigar_t*)cigar32; for (k = 0; k < *n_cigar; ++k) cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4)); free(rseq); return cigar; } char *bwa_cal_md1(int n_cigar, bwa_cigar_t *cigar, int len, bwtint_t pos, ubyte_t *seq, bwtint_t l_pac, ubyte_t *pacseq, kstring_t *str, int *_nm) { bwtint_t x, y; int z, u, c, nm = 0; str->l = 0; // reset x = pos; y = 0; if (cigar) { int k, l; for (k = u = 0; k < n_cigar; ++k) { l = __cigar_len(cigar[k]); if (__cigar_op(cigar[k]) == FROM_M) { for (z = 0; z < l && x+z < l_pac; ++z) { c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3; if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) { ksprintf(str, "%d", u); kputc("ACGTN"[c], str); ++nm; u = 0; } else ++u; } x += l; y += l; } else if (__cigar_op(cigar[k]) == FROM_I || __cigar_op(cigar[k]) == FROM_S) { y += l; if (__cigar_op(cigar[k]) == FROM_I) nm += l; } else if (__cigar_op(cigar[k]) == FROM_D) { ksprintf(str, "%d", u); kputc('^', str); for (z = 0; z < l && x+z < l_pac; ++z) kputc("ACGT"[pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3], str); u = 0; x += l; nm += l; } } } else { // no gaps for (z = u = 0; z < (bwtint_t)len && x+z < l_pac; ++z) { c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3; if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) { ksprintf(str, "%d", u); kputc("ACGTN"[c], str); ++nm; u = 0; } else ++u; } } ksprintf(str, "%d", u); *_nm = nm; return strdup(str->s); } void bwa_correct_trimmed(bwa_seq_t *s) { if (s->len == s->full_len) return; if (s->strand == 0) { // forward if (s->cigar && 
__cigar_op(s->cigar[s->n_cigar-1]) == FROM_S) { // the last is S s->cigar[s->n_cigar-1] += s->full_len - s->len; } else { if (s->cigar == 0) { s->n_cigar = 2; s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t)); s->cigar[0] = __cigar_create(0, s->len); } else { ++s->n_cigar; s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t)); } s->cigar[s->n_cigar-1] = __cigar_create(3, (s->full_len - s->len)); } } else { // reverse if (s->cigar && __cigar_op(s->cigar[0]) == FROM_S) { // the first is S s->cigar[0] += s->full_len - s->len; } else { if (s->cigar == 0) { s->n_cigar = 2; s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t)); s->cigar[1] = __cigar_create(0, s->len); } else { ++s->n_cigar; s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t)); memmove(s->cigar + 1, s->cigar, (s->n_cigar-1) * sizeof(bwa_cigar_t)); } s->cigar[0] = __cigar_create(3, (s->full_len - s->len)); } } s->len = s->full_len; } void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq) { ubyte_t *pacseq; int i, j, k; kstring_t *str; if (!_pacseq) { pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1); err_rewind(bns->fp_pac); err_fread_noeof(pacseq, 1, bns->l_pac/4+1, bns->fp_pac); } else pacseq = _pacseq; for (i = 0; i != n_seqs; ++i) { bwa_seq_t *s = seqs + i; seq_reverse(s->len, s->seq, 0); // IMPORTANT: s->seq is reversed here!!! for (j = k = 0; j < s->n_multi; ++j) { bwt_multi1_t *q = s->multi + j; int n_cigar; if (q->gap) { // gapped alignment q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, q->ref_shift, &q->pos, &n_cigar); q->n_cigar = n_cigar; if (q->cigar) s->multi[k++] = *q; } else s->multi[k++] = *q; } s->n_multi = k; // this squeezes out gapped alignments which failed the CIGAR generation if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue; s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? 
s->rseq : s->seq, s->ref_shift, &s->pos, &s->n_cigar); if (s->cigar == 0) s->type = BWA_TYPE_NO_MATCH; } // generate MD tag str = (kstring_t*)calloc(1, sizeof(kstring_t)); for (i = 0; i != n_seqs; ++i) { bwa_seq_t *s = seqs + i; if (s->type != BWA_TYPE_NO_MATCH) { int nm; s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq, bns->l_pac, pacseq, str, &nm); s->nm = nm; } } free(str->s); free(str); // correct for trimmed reads for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i); if (!_pacseq) free(pacseq); } int64_t pos_end(const bwa_seq_t *p) { if (p->cigar) { int j; int64_t x = p->pos; for (j = 0; j != p->n_cigar; ++j) { int op = __cigar_op(p->cigar[j]); if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]); } return x; } else return p->pos + p->len; } int64_t pos_end_multi(const bwt_multi1_t *p, int len) // analogy to pos_end() { if (p->cigar) { int j; int64_t x = p->pos; for (j = 0; j != p->n_cigar; ++j) { int op = __cigar_op(p->cigar[j]); if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]); } return x; } else return p->pos + len; } static int64_t pos_5(const bwa_seq_t *p) { if (p->type != BWA_TYPE_NO_MATCH) return p->strand? pos_end(p) : p->pos; return -1; } void bwa_print_seq(FILE *stream, bwa_seq_t *seq) { char buffer[4096]; const int bsz = sizeof(buffer); int i, j, l; if (seq->strand == 0) { for (i = 0; i < seq->full_len; i += bsz) { l = seq->full_len - i > bsz ? bsz : seq->full_len - i; for (j = 0; j < l; j++) buffer[j] = "ACGTN"[seq->seq[i + j]]; err_fwrite(buffer, 1, l, stream); } } else { for (i = seq->full_len - 1; i >= 0; i -= bsz) { l = i + 1 > bsz ? 
bsz : i + 1; for (j = 0; j < l; j++) buffer[j] = "TGCAN"[seq->seq[i - j]]; err_fwrite(buffer, 1, l, stream); } } } void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2) { int j; if (p->type != BWA_TYPE_NO_MATCH || (mate && mate->type != BWA_TYPE_NO_MATCH)) { int seqid, nn, am = 0, flag = p->extra_flag; char XT; if (p->type == BWA_TYPE_NO_MATCH) { p->pos = mate->pos; p->strand = mate->strand; flag |= SAM_FSU; j = 1; } else j = pos_end(p) - p->pos; // j is the length of the reference in the alignment // get seqid nn = bns_cnt_ambi(bns, p->pos, j, &seqid); if (p->type != BWA_TYPE_NO_MATCH && p->pos + j - bns->anns[seqid].offset > bns->anns[seqid].len) flag |= SAM_FSU; // flag UNMAP as this alignment bridges two adjacent reference sequences // update flag and print it if (p->strand) flag |= SAM_FSR; if (mate) { if (mate->type != BWA_TYPE_NO_MATCH) { if (mate->strand) flag |= SAM_FMR; } else flag |= SAM_FMU; } err_printf("%s\t%d\t%s\t", p->name, flag, bns->anns[seqid].name); err_printf("%d\t%d\t", (int)(p->pos - bns->anns[seqid].offset + 1), p->mapQ); // print CIGAR if (p->cigar) { for (j = 0; j != p->n_cigar; ++j) err_printf("%d%c", __cigar_len(p->cigar[j]), "MIDS"[__cigar_op(p->cigar[j])]); } else if (p->type == BWA_TYPE_NO_MATCH) err_printf("*"); else err_printf("%dM", p->len); // print mate coordinate if (mate && mate->type != BWA_TYPE_NO_MATCH) { int m_seqid; long long isize; am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality // redundant calculation here, but should not matter too much bns_cnt_ambi(bns, mate->pos, mate->len, &m_seqid); err_printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name); isize = (seqid == m_seqid)? 
pos_5(mate) - pos_5(p) : 0; if (p->type == BWA_TYPE_NO_MATCH) isize = 0; err_printf("%d\t%lld\t", (int)(mate->pos - bns->anns[m_seqid].offset + 1), isize); } else if (mate) err_printf("\t=\t%d\t0\t", (int)(p->pos - bns->anns[seqid].offset + 1)); else err_printf("\t*\t0\t0\t"); // print sequence and quality bwa_print_seq(stdout, p); err_putchar('\t'); if (p->qual) { if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality err_printf("%s", p->qual); } else err_printf("*"); if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id); if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc); if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len); if (p->type != BWA_TYPE_NO_MATCH) { int i; // calculate XT tag XT = "NURM"[p->type]; if (nn > 10) XT = 'N'; // print tags err_printf("\tXT:A:%c\t%s:i:%d", XT, (mode & BWA_MODE_COMPREAD)? "NM" : "CM", p->nm); if (nn) err_printf("\tXN:i:%d", nn); if (mate) err_printf("\tSM:i:%d\tAM:i:%d", p->seQ, am); if (p->type != BWA_TYPE_MATESW) { // X0 and X1 are not available for this type of alignment err_printf("\tX0:i:%d", p->c1); if (p->c1 <= max_top2) err_printf("\tX1:i:%d", p->c2); } err_printf("\tXM:i:%d\tXO:i:%d\tXG:i:%d", p->n_mm, p->n_gapo, p->n_gapo+p->n_gape); if (p->md) err_printf("\tMD:Z:%s", p->md); // print multiple hits if (p->n_multi) { err_printf("\tXA:Z:"); for (i = 0; i < p->n_multi; ++i) { bwt_multi1_t *q = p->multi + i; int k; j = pos_end_multi(q, p->len) - q->pos; nn = bns_cnt_ambi(bns, q->pos, j, &seqid); err_printf("%s,%c%d,", bns->anns[seqid].name, q->strand? '-' : '+', (int)(q->pos - bns->anns[seqid].offset + 1)); if (q->cigar) { for (k = 0; k < q->n_cigar; ++k) err_printf("%d%c", __cigar_len(q->cigar[k]), "MIDS"[__cigar_op(q->cigar[k])]); } else err_printf("%dM", p->len); err_printf(",%d;", q->gap + q->mm); } } } err_putchar('\n'); } else { // this read has no match //ubyte_t *s = p->strand? 
p->rseq : p->seq; int flag = p->extra_flag | SAM_FSU; if (mate && mate->type == BWA_TYPE_NO_MATCH) flag |= SAM_FMU; err_printf("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t", p->name, flag); //Why did this work differently to the version above?? //for (j = 0; j != p->len; ++j) putchar("ACGTN"[(int)s[j]]); bwa_print_seq(stdout, p); err_putchar('\t'); if (p->qual) { if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality err_printf("%s", p->qual); } else err_printf("*"); if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id); if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc); if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len); err_putchar('\n'); } } void bwase_initialize() { int i; for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5); } void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ, const char *rg_line) { extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); int i, n_seqs, m_aln; long long tot_seqs = 0; bwt_aln1_t *aln = 0; bwa_seq_t *seqs; bwa_seqio_t *ks; clock_t t; bntseq_t *bns; FILE *fp_sa; gap_opt_t opt; char magic[4]; // initialization bwase_initialize(); bns = bns_restore(prefix); srand48(bns->seed); fp_sa = xopen(fn_sa, "r"); m_aln = 0; err_fread_noeof(magic, 1, 4, fp_sa); if (strncmp(magic, SAI_MAGIC, 4) != 0) { fprintf(stderr, "[E::%s] Unmatched SAI magic. 
Please re-run `aln' with the same version of bwa.\n", __func__); exit(1); } err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa); bwa_print_sam_hdr(bns, rg_line); // set ks ks = bwa_open_reads(opt.mode, fn_fa); // core loop while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode, opt.trim_qual)) != 0) { tot_seqs += n_seqs; t = clock(); // read alignment for (i = 0; i < n_seqs; ++i) { bwa_seq_t *p = seqs + i; int n_aln; err_fread_noeof(&n_aln, 4, 1, fp_sa); if (n_aln > m_aln) { m_aln = n_aln; aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln); } err_fread_noeof(aln, sizeof(bwt_aln1_t), n_aln, fp_sa); bwa_aln2seq_core(n_aln, aln, p, 1, n_occ); } fprintf(stderr, "[bwa_aln_core] convert to sequence coordinate... "); bwa_cal_pac_pos(bns, prefix, n_seqs, seqs, opt.max_diff, opt.fnr); // forward bwt will be destroyed here fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); fprintf(stderr, "[bwa_aln_core] refine gapped alignments... "); bwa_refine_gapped(bns, n_seqs, seqs, 0); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); fprintf(stderr, "[bwa_aln_core] print alignments... 
"); for (i = 0; i < n_seqs; ++i) bwa_print_sam1(bns, seqs + i, 0, opt.mode, opt.max_top2); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); bwa_free_read_seq(n_seqs, seqs); fprintf(stderr, "[bwa_aln_core] %lld sequences have been processed.\n", tot_seqs); } // destroy bwa_seq_close(ks); bns_destroy(bns); err_fclose(fp_sa); free(aln); } int bwa_sai2sam_se(int argc, char *argv[]) { int c, n_occ = 3; char *prefix, *rg_line = 0; while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) { switch (c) { case 'h': break; case 'r': if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; break; case 'n': n_occ = atoi(optarg); break; case 'f': xreopen(optarg, "w", stdout); break; default: return 1; } } if (optind + 3 > argc) { fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] \n"); return 1; } if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { fprintf(stderr, "[%s] fail to locate the index\n", __func__); return 1; } bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line); free(prefix); return 0; } bwa-0.7.17/bwase.h000066400000000000000000000020111317342117100136230ustar00rootroot00000000000000#ifndef BWASE_H #define BWASE_H #include "bntseq.h" #include "bwt.h" #include "bwtaln.h" #ifdef __cplusplus extern "C" { #endif // Initialize mapping tables in the bwa single-end mapper. void bwase_initialize(); // Calculate the approximate position of the sequence from the specified bwt with loaded suffix array. void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t* bwt, bwa_seq_t* seq, const int max_mm, const float fnr); // Refine the approximate position of the sequence to an actual placement for the sequence. void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq); // Backfill certain alignment properties mainly centering around number of matches. void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); // Calculate the end position of a read given a certain sequence. 
int64_t pos_end(const bwa_seq_t *p); // bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand); #ifdef __cplusplus } #endif #endif // BWASE_H bwa-0.7.17/bwaseqio.c000066400000000000000000000153241317342117100143420ustar00rootroot00000000000000#include #include #include "bwtaln.h" #include "utils.h" #include "bamlite.h" #include "kseq.h" KSEQ_DECLARE(gzFile) #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif extern unsigned char nst_nt4_table[256]; static char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; struct __bwa_seqio_t { // for BAM input int is_bam, which; // 1st bit: read1, 2nd bit: read2, 3rd: SE bamFile fp; // for fastq input kseq_t *ks; }; bwa_seqio_t *bwa_bam_open(const char *fn, int which) { bwa_seqio_t *bs; bam_header_t *h; bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t)); bs->is_bam = 1; bs->which = which; bs->fp = bam_open(fn, "r"); if (0 == bs->fp) err_fatal_simple("Couldn't open bam file"); h = bam_header_read(bs->fp); bam_header_destroy(h); return bs; } bwa_seqio_t *bwa_seq_open(const char *fn) { gzFile fp; bwa_seqio_t *bs; bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t)); fp = xzopen(fn, "r"); bs->ks = kseq_init(fp); return bs; } void bwa_seq_close(bwa_seqio_t *bs) { if (bs == 0) return; if (bs->is_bam) { if (0 != bam_close(bs->fp)) err_fatal_simple("Error closing bam file"); } else { err_gzclose(bs->ks->f->f); kseq_destroy(bs->ks); } free(bs); } void seq_reverse(int len, ubyte_t *seq, int is_comp) { int i; if (is_comp) { for (i = 0; i < len>>1; ++i) { char tmp = seq[len-1-i]; if (tmp < 4) tmp = 3 - tmp; seq[len-1-i] = (seq[i] >= 4)? seq[i] : 3 - seq[i]; seq[i] = tmp; } if (len&1) seq[i] = (seq[i] >= 4)? 
seq[i] : 3 - seq[i]; } else { for (i = 0; i < len>>1; ++i) { char tmp = seq[len-1-i]; seq[len-1-i] = seq[i]; seq[i] = tmp; } } } int bwa_trim_read(int trim_qual, bwa_seq_t *p) { int s = 0, l, max = 0, max_l = p->len; if (trim_qual < 1 || p->qual == 0) return 0; for (l = p->len - 1; l >= BWA_MIN_RDLEN; --l) { s += trim_qual - (p->qual[l] - 33); if (s < 0) break; if (s > max) max = s, max_l = l; } p->clip_len = p->len = max_l; return p->full_len - p->len; } static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual) { bwa_seq_t *seqs, *p; int n_seqs, l, i; long n_trimmed = 0, n_tot = 0; bam1_t *b; int res; b = bam_init1(); n_seqs = 0; seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); while ((res = bam_read1(bs->fp, b)) >= 0) { uint8_t *s, *q; int go = 0; if ((bs->which & 1) && (b->core.flag & BAM_FREAD1)) go = 1; if ((bs->which & 2) && (b->core.flag & BAM_FREAD2)) go = 1; if ((bs->which & 4) && !(b->core.flag& BAM_FREAD1) && !(b->core.flag& BAM_FREAD2))go = 1; if (go == 0) continue; l = b->core.l_qseq; p = &seqs[n_seqs++]; p->tid = -1; // no assigned to a thread p->qual = 0; p->full_len = p->clip_len = p->len = l; n_tot += p->full_len; s = bam1_seq(b); q = bam1_qual(b); p->seq = (ubyte_t*)calloc(p->len + 1, 1); p->qual = (ubyte_t*)calloc(p->len + 1, 1); for (i = 0; i != p->full_len; ++i) { p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)]; p->qual[i] = q[i] + 33 < 126? 
q[i] + 33 : 126; } if (bam1_strand(b)) { // then reverse seq_reverse(p->len, p->seq, 1); seq_reverse(p->len, p->qual, 0); } if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p); p->rseq = (ubyte_t*)calloc(p->full_len, 1); memcpy(p->rseq, p->seq, p->len); seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped() seq_reverse(p->len, p->rseq, is_comp); p->name = strdup((const char*)bam1_qname(b)); if (n_seqs == n_needed) break; } if (res < 0 && res != -1) err_fatal_simple("Error reading bam file"); *n = n_seqs; if (n_seqs && trim_qual >= 1) fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot); if (n_seqs == 0) { free(seqs); bam_destroy1(b); return 0; } bam_destroy1(b); return seqs; } #define BARCODE_LOW_QUAL 13 bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual) { bwa_seq_t *seqs, *p; kseq_t *seq = bs->ks; int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24; long n_trimmed = 0, n_tot = 0; if (l_bc > BWA_MAX_BCLEN) { fprintf(stderr, "[%s] the maximum barcode length is %d.\n", __func__, BWA_MAX_BCLEN); return 0; } if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input n_seqs = 0; seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); while ((l = kseq_read(seq)) >= 0) { if ((mode & BWA_MODE_CFY) && (seq->comment.l != 0)) { // skip reads that are marked to be filtered by Casava char *s = index(seq->comment.s, ':'); if (s && *(++s) == 'Y') { continue; } } if (is_64 && seq->qual.l) for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31; if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length p = &seqs[n_seqs++]; if (l_bc) { // then trim barcode for (i = 0; i < l_bc; ++i) p->bc[i] = (seq->qual.l && seq->qual.s[i]-33 < BARCODE_LOW_QUAL)? 
tolower(seq->seq.s[i]) : toupper(seq->seq.s[i]); p->bc[i] = 0; for (; i < seq->seq.l; ++i) seq->seq.s[i - l_bc] = seq->seq.s[i]; seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0; if (seq->qual.l) { for (i = l_bc; i < seq->qual.l; ++i) seq->qual.s[i - l_bc] = seq->qual.s[i]; seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0; } l = seq->seq.l; } else p->bc[0] = 0; p->tid = -1; // no assigned to a thread p->qual = 0; p->full_len = p->clip_len = p->len = l; n_tot += p->full_len; p->seq = (ubyte_t*)calloc(p->full_len, 1); for (i = 0; i != p->full_len; ++i) p->seq[i] = nst_nt4_table[(int)seq->seq.s[i]]; if (seq->qual.l) { // copy quality p->qual = (ubyte_t*)strdup((char*)seq->qual.s); if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p); } p->rseq = (ubyte_t*)calloc(p->full_len, 1); memcpy(p->rseq, p->seq, p->len); seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped() seq_reverse(p->len, p->rseq, is_comp); p->name = strdup((const char*)seq->name.s); { // trim /[12]$ int t = strlen(p->name); if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0'; } if (n_seqs == n_needed) break; } *n = n_seqs; if (n_seqs && trim_qual >= 1) fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot); if (n_seqs == 0) { free(seqs); return 0; } return seqs; } void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs) { int i, j; for (i = 0; i != n_seqs; ++i) { bwa_seq_t *p = seqs + i; for (j = 0; j < p->n_multi; ++j) if (p->multi[j].cigar) free(p->multi[j].cigar); free(p->name); free(p->seq); free(p->rseq); free(p->qual); free(p->aln); free(p->md); free(p->multi); free(p->cigar); } free(seqs); } bwa-0.7.17/bwashm.c000066400000000000000000000132701317342117100140070ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "bwa.h" int bwa_shm_stage(bwaidx_t *idx, const char *hint, const char *_tmpfn) { const char *name; 
uint8_t *shm, *shm_idx; uint16_t *cnt; int shmid, to_init = 0, l; char path[PATH_MAX + 1], *tmpfn = (char*)_tmpfn; if (hint == 0 || hint[0] == 0) return -1; for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name); ++name; if ((shmid = shm_open("/bwactl", O_RDWR, 0)) < 0) { shmid = shm_open("/bwactl", O_CREAT|O_RDWR|O_EXCL, 0644); to_init = 1; } if (shmid < 0) return -1; ftruncate(shmid, BWA_CTL_SIZE); shm = mmap(0, BWA_CTL_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0); cnt = (uint16_t*)shm; if (to_init) { memset(shm, 0, BWA_CTL_SIZE); cnt[1] = 4; } if (idx->mem == 0) bwa_idx2mem(idx); if (tmpfn) { FILE *fp; if ((fp = fopen(tmpfn, "wb")) != 0) { int64_t rest = idx->l_mem; while (rest > 0) { int64_t l = rest < 0x1000000? rest : 0x1000000; rest -= fwrite(&idx->mem[idx->l_mem - rest], 1, l, fp); } fclose(fp); free(idx->mem); idx->mem = 0; } else { fprintf(stderr, "[W::%s] fail to create the temporary file. Option '-f' is ignored.\n", __func__); tmpfn = 0; } } strcat(strcpy(path, "/bwaidx-"), name); if ((shmid = shm_open(path, O_CREAT|O_RDWR|O_EXCL, 0644)) < 0) { shm_unlink(path); perror("shm_open()"); return -1; } l = 8 + strlen(name) + 1; if (cnt[1] + l > BWA_CTL_SIZE) return -1; memcpy(shm + cnt[1], &idx->l_mem, 8); memcpy(shm + cnt[1] + 8, name, l - 8); cnt[1] += l; ++cnt[0]; ftruncate(shmid, idx->l_mem); shm_idx = mmap(0, idx->l_mem, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0); if (tmpfn) { FILE *fp; fp = fopen(tmpfn, "rb"); int64_t rest = idx->l_mem; while (rest > 0) { int64_t l = rest < 0x1000000? 
rest : 0x1000000; rest -= fread(&shm_idx[idx->l_mem - rest], 1, l, fp); } fclose(fp); unlink(tmpfn); } else { memcpy(shm_idx, idx->mem, idx->l_mem); free(idx->mem); } bwa_mem2idx(idx->l_mem, shm_idx, idx); idx->is_shm = 1; return 0; } bwaidx_t *bwa_idx_load_from_shm(const char *hint) { const char *name; uint8_t *shm, *shm_idx; uint16_t *cnt, i; char *p, path[PATH_MAX + 1]; int shmid; int64_t l_mem; bwaidx_t *idx; if (hint == 0 || hint[0] == 0) return 0; for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name); ++name; if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return 0; shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0); cnt = (uint16_t*)shm; if (cnt[0] == 0) return 0; for (i = 0, p = (char*)(shm + 4); i < cnt[0]; ++i) { memcpy(&l_mem, p, 8); p += 8; if (strcmp(p, name) == 0) break; p += strlen(p) + 1; } if (i == cnt[0]) return 0; strcat(strcpy(path, "/bwaidx-"), name); if ((shmid = shm_open(path, O_RDONLY, 0)) < 0) return 0; shm_idx = mmap(0, l_mem, PROT_READ, MAP_SHARED, shmid, 0); idx = calloc(1, sizeof(bwaidx_t)); bwa_mem2idx(l_mem, shm_idx, idx); idx->is_shm = 1; return idx; } int bwa_shm_test(const char *hint) { int shmid; uint16_t *cnt, i; char *p, *shm; const char *name; if (hint == 0 || hint[0] == 0) return 0; for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name); ++name; if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return 0; shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0); cnt = (uint16_t*)shm; for (i = 0, p = shm + 4; i < cnt[0]; ++i) { if (strcmp(p + 8, name) == 0) return 1; p += strlen(p) + 9; } return 0; } int bwa_shm_list(void) { int shmid; uint16_t *cnt, i; char *p, *shm; if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return -1; shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0); cnt = (uint16_t*)shm; for (i = 0, p = shm + 4; i < cnt[0]; ++i) { int64_t l_mem; memcpy(&l_mem, p, 8); p += 8; printf("%s\t%ld\n", p, (long)l_mem); p += strlen(p) + 1; } return 0; } 
int bwa_shm_destroy(void) { int shmid; uint16_t *cnt, i; char *p, *shm; char path[PATH_MAX + 1]; if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return -1; shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0); cnt = (uint16_t*)shm; for (i = 0, p = shm + 4; i < cnt[0]; ++i) { int64_t l_mem; memcpy(&l_mem, p, 8); p += 8; strcat(strcpy(path, "/bwaidx-"), p); shm_unlink(path); p += strlen(p) + 1; } munmap(shm, BWA_CTL_SIZE); shm_unlink("/bwactl"); return 0; } int main_shm(int argc, char *argv[]) { int c, to_list = 0, to_drop = 0, ret = 0; char *tmpfn = 0; while ((c = getopt(argc, argv, "ldf:")) >= 0) { if (c == 'l') to_list = 1; else if (c == 'd') to_drop = 1; else if (c == 'f') tmpfn = optarg; } if (optind == argc && !to_list && !to_drop) { fprintf(stderr, "\nUsage: bwa shm [-d|-l] [-f tmpFile] [idxbase]\n\n"); fprintf(stderr, "Options: -d destroy all indices in shared memory\n"); fprintf(stderr, " -l list names of indices in shared memory\n"); fprintf(stderr, " -f FILE temporary file to reduce peak memory\n\n"); return 1; } if (optind < argc && (to_list || to_drop)) { fprintf(stderr, "[E::%s] open -l or -d cannot be used when 'idxbase' is present\n", __func__); return 1; } if (optind < argc) { if (bwa_shm_test(argv[optind]) == 0) { bwaidx_t *idx; idx = bwa_idx_load_from_disk(argv[optind], BWA_IDX_ALL); if (bwa_shm_stage(idx, argv[optind], tmpfn) < 0) { fprintf(stderr, "[E::%s] failed to stage the index in shared memory\n", __func__); ret = 1; } bwa_idx_destroy(idx); } else fprintf(stderr, "[M::%s] index '%s' is already in shared memory\n", __func__, argv[optind]); } if (to_list) bwa_shm_list(); if (to_drop) bwa_shm_destroy(); return ret; } bwa-0.7.17/bwt.c000066400000000000000000000357561317342117100133370ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008 Genome Research Ltd (GRL). 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* Contact: Heng Li */ #include #include #include #include #include #include #include "utils.h" #include "bwt.h" #include "kvec.h" #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif void bwt_gen_cnt_table(bwt_t *bwt) { int i, j; for (i = 0; i != 256; ++i) { uint32_t x = 0; for (j = 0; j != 4; ++j) x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3); bwt->cnt_table[i] = x; } } static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) // compute inverse CSA { bwtint_t x = k - (k > bwt->primary); x = bwt_B0(bwt, x); x = bwt->L2[x] + bwt_occ(bwt, k, x); return k == bwt->primary? 
0 : x; } // bwt->bwt and bwt->occ must be precalculated void bwt_cal_sa(bwt_t *bwt, int intv) { bwtint_t isa, sa, i; // S(isa) = sa int intv_round = intv; kv_roundup32(intv_round); xassert(intv_round == intv, "SA sample interval is not a power of 2."); xassert(bwt->bwt, "bwt_t::bwt is not initialized."); if (bwt->sa) free(bwt->sa); bwt->sa_intv = intv; bwt->n_sa = (bwt->seq_len + intv) / intv; bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); // calculate SA value isa = 0; sa = bwt->seq_len; for (i = 0; i < bwt->seq_len; ++i) { if (isa % intv == 0) bwt->sa[isa/intv] = sa; --sa; isa = bwt_invPsi(bwt, isa); } if (isa % intv == 0) bwt->sa[isa/intv] = sa; bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len } bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k) { bwtint_t sa = 0, mask = bwt->sa_intv - 1; while (k & mask) { ++sa; k = bwt_invPsi(bwt, k); } /* without setting bwt->sa[0] = -1, the following line should be changed to (sa + bwt->sa[k/bwt->sa_intv]) % (bwt->seq_len + 1) */ return sa + bwt->sa[k/bwt->sa_intv]; } static inline int __occ_aux(uint64_t y, int c) { // reduce nucleotide counting to bits counting y = ((c&2)? y : ~y) >> 1 & ((c&1)? 
y : ~y) & 0x5555555555555555ull; // count the number of 1s in y y = (y & 0x3333333333333333ull) + (y >> 2 & 0x3333333333333333ull); return ((y + (y >> 4)) & 0xf0f0f0f0f0f0f0full) * 0x101010101010101ull >> 56; } bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c) { bwtint_t n; uint32_t *p, *end; if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c]; if (k == (bwtint_t)(-1)) return 0; k -= (k >= bwt->primary); // because $ is not in bwt // retrieve Occ at k/OCC_INTERVAL n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c]; p += sizeof(bwtint_t); // jump to the start of the first BWT cell // calculate Occ up to the last k/32 end = p + (((k>>5) - ((k&~OCC_INTV_MASK)>>5))<<1); for (; p < end; p += 2) n += __occ_aux((uint64_t)p[0]<<32 | p[1], c); // calculate Occ n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c); if (c == 0) n -= ~k&31; // corrected for the masked bits return n; } // an analogy to bwt_occ() but more efficient, requiring k <= l void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol) { bwtint_t _k, _l; _k = (k >= bwt->primary)? k-1 : k; _l = (l >= bwt->primary)? 
l-1 : l; if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) { *ok = bwt_occ(bwt, k, c); *ol = bwt_occ(bwt, l, c); } else { bwtint_t m, n, i, j; uint32_t *p; if (k >= bwt->primary) --k; if (l >= bwt->primary) --l; n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c]; p += sizeof(bwtint_t); // calculate *ok j = k >> 5 << 5; for (i = k/OCC_INTERVAL*OCC_INTERVAL; i < j; i += 32, p += 2) n += __occ_aux((uint64_t)p[0]<<32 | p[1], c); m = n; n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c); if (c == 0) n -= ~k&31; // corrected for the masked bits *ok = n; // calculate *ol j = l >> 5 << 5; for (; i < j; i += 32, p += 2) m += __occ_aux((uint64_t)p[0]<<32 | p[1], c); m += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~l&31)<<1)) - 1), c); if (c == 0) m -= ~l&31; // corrected for the masked bits *ol = m; } } #define __occ_aux4(bwt, b) \ ((bwt)->cnt_table[(b)&0xff] + (bwt)->cnt_table[(b)>>8&0xff] \ + (bwt)->cnt_table[(b)>>16&0xff] + (bwt)->cnt_table[(b)>>24]) void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) { bwtint_t x; uint32_t *p, tmp, *end; if (k == (bwtint_t)(-1)) { memset(cnt, 0, 4 * sizeof(bwtint_t)); return; } k -= (k >= bwt->primary); // because $ is not in bwt p = bwt_occ_intv(bwt, k); memcpy(cnt, p, 4 * sizeof(bwtint_t)); p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t)) end = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4)); // this is the end point of the following loop for (x = 0; p < end; ++p) x += __occ_aux4(bwt, *p); tmp = *p & ~((1U<<((~k&15)<<1)) - 1); x += __occ_aux4(bwt, tmp) - (~k&15); cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24; } // an analogy to bwt_occ4() but more efficient, requiring k <= l void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]) { bwtint_t _k, _l; _k = k - (k >= bwt->primary); _l = l - (l >= bwt->primary); if (_l>>OCC_INTV_SHIFT != _k>>OCC_INTV_SHIFT || k == 
(bwtint_t)(-1) || l == (bwtint_t)(-1)) { bwt_occ4(bwt, k, cntk); bwt_occ4(bwt, l, cntl); } else { bwtint_t x, y; uint32_t *p, tmp, *endk, *endl; k -= (k >= bwt->primary); // because $ is not in bwt l -= (l >= bwt->primary); p = bwt_occ_intv(bwt, k); memcpy(cntk, p, 4 * sizeof(bwtint_t)); p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t)) // prepare cntk[] endk = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4)); endl = p + ((l>>4) - ((l&~OCC_INTV_MASK)>>4)); for (x = 0; p < endk; ++p) x += __occ_aux4(bwt, *p); y = x; tmp = *p & ~((1U<<((~k&15)<<1)) - 1); x += __occ_aux4(bwt, tmp) - (~k&15); // calculate cntl[] and finalize cntk[] for (; p < endl; ++p) y += __occ_aux4(bwt, *p); tmp = *p & ~((1U<<((~l&15)<<1)) - 1); y += __occ_aux4(bwt, tmp) - (~l&15); memcpy(cntl, cntk, 4 * sizeof(bwtint_t)); cntk[0] += x&0xff; cntk[1] += x>>8&0xff; cntk[2] += x>>16&0xff; cntk[3] += x>>24; cntl[0] += y&0xff; cntl[1] += y>>8&0xff; cntl[2] += y>>16&0xff; cntl[3] += y>>24; } } int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end) { bwtint_t k, l, ok, ol; int i; k = 0; l = bwt->seq_len; for (i = len - 1; i >= 0; --i) { ubyte_t c = str[i]; if (c > 3) return 0; // no match bwt_2occ(bwt, k - 1, l, c, &ok, &ol); k = bwt->L2[c] + ok + 1; l = bwt->L2[c] + ol; if (k > l) break; // no match } if (k > l) return 0; // no match if (sa_begin) *sa_begin = k; if (sa_end) *sa_end = l; return l - k + 1; } int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0) { int i; bwtint_t k, l, ok, ol; k = *k0; l = *l0; for (i = len - 1; i >= 0; --i) { ubyte_t c = str[i]; if (c > 3) return 0; // there is an N here. 
no match bwt_2occ(bwt, k - 1, l, c, &ok, &ol); k = bwt->L2[c] + ok + 1; l = bwt->L2[c] + ol; if (k > l) return 0; // no match } *k0 = k; *l0 = l; return l - k + 1; } /********************* * Bidirectional BWT * *********************/ void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back) { bwtint_t tk[4], tl[4]; int i; bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl); for (i = 0; i != 4; ++i) { ok[i].x[!is_back] = bwt->L2[i] + 1 + tk[i]; ok[i].x[2] = tl[i] - tk[i]; } ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= bwt->primary && ik->x[!is_back] + ik->x[2] - 1 >= bwt->primary); ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2]; ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2]; ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2]; } static void bwt_reverse_intvs(bwtintv_v *p) { if (p->n > 1) { int j; for (j = 0; j < p->n>>1; ++j) { bwtintv_t tmp = p->a[p->n - 1 - j]; p->a[p->n - 1 - j] = p->a[j]; p->a[j] = tmp; } } } // NOTE: $max_intv is not currently used in BWA-MEM int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]) { int i, j, c, ret; bwtintv_t ik, ok[4]; bwtintv_v a[2], *prev, *curr, *swap; mem->n = 0; if (q[x] > 3) return x + 1; if (min_intv < 1) min_intv = 1; // the interval size should be at least 1 kv_init(a[0]); kv_init(a[1]); prev = tmpvec && tmpvec[0]? tmpvec[0] : &a[0]; // use the temporary vector if provided curr = tmpvec && tmpvec[1]? 
tmpvec[1] : &a[1]; bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base ik.info = x + 1; for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search if (ik.x[2] < max_intv) { // an interval small enough kv_push(bwtintv_t, *curr, ik); break; } else if (q[i] < 4) { // an A/C/G/T base c = 3 - q[i]; // complement of q[i] bwt_extend(bwt, &ik, ok, 0); if (ok[c].x[2] != ik.x[2]) { // change of the interval size kv_push(bwtintv_t, *curr, ik); if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further } ik = ok[c]; ik.info = i + 1; } else { // an ambiguous base kv_push(bwtintv_t, *curr, ik); break; // always terminate extension at an ambiguous base; in this case, ia[0].info; // this will be the returned value swap = curr; curr = prev; prev = swap; for (i = x - 1; i >= -1; --i) { // backward search for MEMs c = i < 0? -1 : q[i] < 4? q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base for (j = 0, curr->n = 0; j < prev->n; ++j) { bwtintv_t *p = &prev->a[j]; if (c >= 0 && ik.x[2] >= max_intv) bwt_extend(bwt, p, ok, 1); if (c < 0 || ik.x[2] < max_intv || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches ik = *p; ik.info |= (uint64_t)(i + 1)<<32; kv_push(bwtintv_t, *mem, ik); } } // otherwise the match is contained in another longer match } else if (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2]) { ok[c].info = p->info; kv_push(bwtintv_t, *curr, ok[c]); } } if (curr->n == 0) break; swap = curr; curr = prev; prev = swap; } bwt_reverse_intvs(mem); // s.t. 
sorted by the start coordinate if (tmpvec == 0 || tmpvec[0] == 0) free(a[0].a); if (tmpvec == 0 || tmpvec[1] == 0) free(a[1].a); return ret; } int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]) { return bwt_smem1a(bwt, len, q, x, min_intv, 0, mem, tmpvec); } int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem) { int i, c; bwtintv_t ik, ok[4]; memset(mem, 0, sizeof(bwtintv_t)); if (q[x] > 3) return x + 1; bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base for (i = x + 1; i < len; ++i) { // forward search if (q[i] < 4) { // an A/C/G/T base c = 3 - q[i]; // complement of q[i] bwt_extend(bwt, &ik, ok, 0); if (ok[c].x[2] < max_intv && i - x >= min_len) { *mem = ok[c]; mem->info = (uint64_t)x<<32 | (i + 1); return i + 1; } ik = ok[c]; } else return i + 1; } return len; } /************************* * Read/write BWT and SA * *************************/ void bwt_dump_bwt(const char *fn, const bwt_t *bwt) { FILE *fp; fp = xopen(fn, "wb"); err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); err_fwrite(bwt->bwt, 4, bwt->bwt_size, fp); err_fflush(fp); err_fclose(fp); } void bwt_dump_sa(const char *fn, const bwt_t *bwt) { FILE *fp; fp = xopen(fn, "wb"); err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); err_fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); err_fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp); err_fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); err_fflush(fp); err_fclose(fp); } static bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a) { // Mac/Darwin has a bug when reading data longer than 2GB. This function fixes this issue by reading data in small chunks const int bufsize = 0x1000000; // 16M block bwtint_t offset = 0; while (size) { int x = bufsize < size? 
bufsize : size; if ((x = err_fread_noeof(a + offset, 1, x, fp)) == 0) break; size -= x; offset += x; } return offset; } void bwt_restore_sa(const char *fn, bwt_t *bwt) { char skipped[256]; FILE *fp; bwtint_t primary; fp = xopen(fn, "rb"); err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp); xassert(primary == bwt->primary, "SA-BWT inconsistency: primary is not the same."); err_fread_noeof(skipped, sizeof(bwtint_t), 4, fp); // skip err_fread_noeof(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp); xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same."); bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv; bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); bwt->sa[0] = -1; fread_fix(fp, sizeof(bwtint_t) * (bwt->n_sa - 1), bwt->sa + 1); err_fclose(fp); } bwt_t *bwt_restore_bwt(const char *fn) { bwt_t *bwt; FILE *fp; bwt = (bwt_t*)calloc(1, sizeof(bwt_t)); fp = xopen(fn, "rb"); err_fseek(fp, 0, SEEK_END); bwt->bwt_size = (err_ftell(fp) - sizeof(bwtint_t) * 5) >> 2; bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4); err_fseek(fp, 0, SEEK_SET); err_fread_noeof(&bwt->primary, sizeof(bwtint_t), 1, fp); err_fread_noeof(bwt->L2+1, sizeof(bwtint_t), 4, fp); fread_fix(fp, bwt->bwt_size<<2, bwt->bwt); bwt->seq_len = bwt->L2[4]; err_fclose(fp); bwt_gen_cnt_table(bwt); return bwt; } void bwt_destroy(bwt_t *bwt) { if (bwt == 0) return; free(bwt->sa); free(bwt->bwt); free(bwt); } bwa-0.7.17/bwt.h000066400000000000000000000115461317342117100133330ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008 Genome Research Ltd (GRL). 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* Contact: Heng Li */ #ifndef BWA_BWT_H #define BWA_BWT_H #include #include // requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line because some part of the code assume OCC_INTERVAL=0x80 #define OCC_INTV_SHIFT 7 #define OCC_INTERVAL (1LL<bwt[(k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) + sizeof(bwtint_t)/4*4 + (k)%OCC_INTERVAL/16]) #define bwt_occ_intv(b, k) ((b)->bwt + (k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) */ // The following two lines are ONLY correct when OCC_INTERVAL==0x80 #define bwt_bwt(b, k) ((b)->bwt[((k)>>7<<4) + sizeof(bwtint_t) + (((k)&0x7f)>>4)]) #define bwt_occ_intv(b, k) ((b)->bwt + ((k)>>7<<4)) /* retrieve a character from the $-removed BWT string. 
Note that * bwt_t::bwt is not exactly the BWT string and therefore this macro is * called bwt_B0 instead of bwt_B */ #define bwt_B0(b, k) (bwt_bwt(b, k)>>((~(k)&0xf)<<1)&3) #define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)]+1, (ik).x[2] = (bwt)->L2[(int)(c)+1]-(bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3-(c)]+1, (ik).info = 0) #ifdef __cplusplus extern "C" { #endif void bwt_dump_bwt(const char *fn, const bwt_t *bwt); void bwt_dump_sa(const char *fn, const bwt_t *bwt); bwt_t *bwt_restore_bwt(const char *fn); void bwt_restore_sa(const char *fn, bwt_t *bwt); void bwt_destroy(bwt_t *bwt); void bwt_bwtgen(const char *fn_pac, const char *fn_bwt); // from BWT-SW void bwt_bwtgen2(const char *fn_pac, const char *fn_bwt, int block_size); // from BWT-SW void bwt_cal_sa(bwt_t *bwt, int intv); void bwt_bwtupdate_core(bwt_t *bwt); bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c); void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]); bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k); // more efficient version of bwt_occ/bwt_occ4 for retrieving two close Occ values void bwt_gen_cnt_table(bwt_t *bwt); void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol); void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]); int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end); int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0); /** * Extend bi-SA-interval _ik_ */ void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back); /** * Given a query _q_, collect potential SMEMs covering position _x_ and store them in _mem_. * Return the end of the longest exact match starting from _x_. 
*/ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem); #ifdef __cplusplus } #endif #endif bwa-0.7.17/bwt_gen.c000066400000000000000000001514131317342117100141550ustar00rootroot00000000000000/* BWTConstruct.c BWT-Index Construction This module constructs BWT and auxiliary data structures. Copyright (C) 2004, Wong Chi Kwong. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ #include #include #include #include #include #include #include "QSufSort.h" #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif typedef uint64_t bgint_t; typedef int64_t sbgint_t; #define ALPHABET_SIZE 4 #define BIT_PER_CHAR 2 #define CHAR_PER_WORD 16 #define CHAR_PER_BYTE 4 #define BITS_IN_WORD 32 #define BITS_IN_BYTE 8 #define BYTES_IN_WORD 4 #define ALL_ONE_MASK 0xFFFFFFFF #define DNA_OCC_CNT_TABLE_SIZE_IN_WORD 65536 #define BITS_PER_OCC_VALUE 16 #define OCC_VALUE_PER_WORD 2 #define OCC_INTERVAL 256 #define OCC_INTERVAL_MAJOR 65536 #define TRUE 1 #define FALSE 0 #define BWTINC_INSERT_SORT_NUM_ITEM 7 #define MIN_AVAILABLE_WORD 0x10000 #define average(value1, value2) ( ((value1) & (value2)) + ((value1) ^ (value2)) / 2 ) #define min(value1, value2) ( ((value1) < (value2)) ? (value1) : (value2) ) #define max(value1, value2) ( ((value1) > (value2)) ? (value1) : (value2) ) #define med3(a, b, c) ( ac ? b : a>c ? c : a)) #define swap(a, b, t); t = a; a = b; b = t; #define truncateLeft(value, offset) ( (value) << (offset) >> (offset) ) #define truncateRight(value, offset) ( (value) >> (offset) << (offset) ) #define DNA_OCC_SUM_EXCEPTION(sum) ((sum & 0xfefefeff) == 0) typedef struct BWT { bgint_t textLength; // length of the text bgint_t inverseSa0; // SA-1[0] bgint_t *cumulativeFreq; // cumulative frequency unsigned int *bwtCode; // BWT code unsigned int *occValue; // Occurrence values stored explicitly bgint_t *occValueMajor; // Occurrence values stored explicitly unsigned int *decodeTable; // For decoding BWT by table lookup bgint_t bwtSizeInWord; // Temporary variable to hold the memory allocated bgint_t occSizeInWord; // Temporary variable to hold the memory allocated bgint_t occMajorSizeInWord; // Temporary variable to hold the memory allocated } BWT; typedef struct BWTInc { BWT *bwt; unsigned int numberOfIterationDone; bgint_t *cumulativeCountInCurrentBuild; bgint_t availableWord; bgint_t buildSize; bgint_t initialMaxBuildSize; bgint_t incMaxBuildSize; 
unsigned int firstCharInLastIteration; unsigned int *workingMemory; unsigned int *packedText; unsigned char *textBuffer; unsigned int *packedShift; } BWTInc; static bgint_t TextLengthFromBytePacked(bgint_t bytePackedLength, unsigned int bitPerChar, unsigned int lastByteLength) { return (bytePackedLength - 1) * (BITS_IN_BYTE / bitPerChar) + lastByteLength; } static void initializeVAL(unsigned int *startAddr, const bgint_t length, const unsigned int initValue) { bgint_t i; for (i=0; i>= 2; } } } // for BWTIncCreate() static bgint_t BWTOccValueMajorSizeInWord(const bgint_t numChar) { bgint_t numOfOccValue; unsigned numOfOccIntervalPerMajor; numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding numOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL; return (numOfOccValue + numOfOccIntervalPerMajor - 1) / numOfOccIntervalPerMajor * ALPHABET_SIZE; } // for BWTIncCreate() static bgint_t BWTOccValueMinorSizeInWord(const bgint_t numChar) { bgint_t numOfOccValue; numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding return (numOfOccValue + OCC_VALUE_PER_WORD - 1) / OCC_VALUE_PER_WORD * ALPHABET_SIZE; } // for BWTIncCreate() static bgint_t BWTResidentSizeInWord(const bgint_t numChar) { bgint_t numCharRoundUpToOccInterval; // The $ in BWT at the position of inverseSa0 is not encoded numCharRoundUpToOccInterval = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL; return (numCharRoundUpToOccInterval + CHAR_PER_WORD - 1) / CHAR_PER_WORD; } static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc) { bgint_t maxBuildSize; if (bwtInc->bwt->textLength == 0) { // initial build // Minus 2 because n+1 entries of seq and rank needed for n char maxBuildSize = (bwtInc->availableWord - (2 + OCC_INTERVAL / CHAR_PER_WORD) * (sizeof(bgint_t) / 4)) / (2 * CHAR_PER_WORD + 1) * CHAR_PER_WORD / (sizeof(bgint_t) / 4); if (bwtInc->initialMaxBuildSize > 0) { 
bwtInc->buildSize = min(bwtInc->initialMaxBuildSize, maxBuildSize); } else { bwtInc->buildSize = maxBuildSize; } } else { // Minus 3 because n+1 entries of sorted rank, seq and rank needed for n char // Minus numberOfIterationDone because bwt slightly shift to left in each iteration maxBuildSize = (bwtInc->availableWord - bwtInc->bwt->bwtSizeInWord - bwtInc->bwt->occSizeInWord - (3 + bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR) * (sizeof(bgint_t) / 4)) / 3 / (sizeof(bgint_t) / 4); if (maxBuildSize < CHAR_PER_WORD) { fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n"); exit(1); } if (bwtInc->incMaxBuildSize > 0) { bwtInc->buildSize = min(bwtInc->incMaxBuildSize, maxBuildSize); } else { bwtInc->buildSize = maxBuildSize; } if (bwtInc->buildSize < CHAR_PER_WORD) bwtInc->buildSize = CHAR_PER_WORD; } if (bwtInc->buildSize < CHAR_PER_WORD) { fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n"); exit(1); } bwtInc->buildSize = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD; bwtInc->packedText = bwtInc->workingMemory + 2 * (bwtInc->buildSize + 1) * (sizeof(bgint_t) / 4); bwtInc->textBuffer = (unsigned char*)(bwtInc->workingMemory + (bwtInc->buildSize + 1) * (sizeof(bgint_t) / 4)); } // for ceilLog2() unsigned int leadingZero(const unsigned int input) { unsigned int l; const static unsigned int leadingZero8bit[256] = {8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; if (input & 
0xFFFF0000) { if (input & 0xFF000000) { l = leadingZero8bit[input >> 24]; } else { l = 8 + leadingZero8bit[input >> 16]; } } else { if (input & 0x0000FF00) { l = 16 + leadingZero8bit[input >> 8]; } else { l = 24 + leadingZero8bit[input]; } } return l; } // for BitPerBytePackedChar() static unsigned int ceilLog2(const unsigned int input) { if (input <= 1) return 0; return BITS_IN_WORD - leadingZero(input - 1); } // for ConvertBytePackedToWordPacked() static unsigned int BitPerBytePackedChar(const unsigned int alphabetSize) { unsigned int bitPerChar; bitPerChar = ceilLog2(alphabetSize); // Return the largest number of bit that does not affect packing efficiency if (BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar) > bitPerChar) bitPerChar = BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar); return bitPerChar; } // for ConvertBytePackedToWordPacked() static unsigned int BitPerWordPackedChar(const unsigned int alphabetSize) { return ceilLog2(alphabetSize); } static void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned int *output, const unsigned int alphabetSize, const bgint_t textLength) { bgint_t i; unsigned int j, k, c; unsigned int bitPerBytePackedChar; unsigned int bitPerWordPackedChar; unsigned int charPerWord; unsigned int charPerByte; unsigned int bytePerIteration; bgint_t byteProcessed = 0; bgint_t wordProcessed = 0; unsigned int mask, shift; unsigned int buffer[BITS_IN_WORD]; bitPerBytePackedChar = BitPerBytePackedChar(alphabetSize); bitPerWordPackedChar = BitPerWordPackedChar(alphabetSize); charPerByte = BITS_IN_BYTE / bitPerBytePackedChar; charPerWord = BITS_IN_WORD / bitPerWordPackedChar; bytePerIteration = charPerWord / charPerByte; mask = truncateRight(ALL_ONE_MASK, BITS_IN_WORD - bitPerWordPackedChar); shift = BITS_IN_WORD - BITS_IN_BYTE + bitPerBytePackedChar - bitPerWordPackedChar; while ((wordProcessed + 1) * charPerWord < textLength) { k = 0; for (i=0; i> bitPerWordPackedChar * i; } output[wordProcessed] = c; wordProcessed++; } k = 0; for (i=0; 
i < (textLength - wordProcessed * charPerWord - 1) / charPerByte + 1; i++) { c = (unsigned int)input[byteProcessed] << shift; for (j=0; j> bitPerWordPackedChar * i; } output[wordProcessed] = c; } BWT *BWTCreate(const bgint_t textLength, unsigned int *decodeTable) { BWT *bwt; bwt = (BWT*)calloc(1, sizeof(BWT)); bwt->textLength = 0; bwt->cumulativeFreq = (bgint_t*)calloc((ALPHABET_SIZE + 1), sizeof(bgint_t)); initializeVAL_bg(bwt->cumulativeFreq, ALPHABET_SIZE + 1, 0); bwt->bwtSizeInWord = 0; // Generate decode tables if (decodeTable == NULL) { bwt->decodeTable = (unsigned*)calloc(DNA_OCC_CNT_TABLE_SIZE_IN_WORD, sizeof(unsigned int)); GenerateDNAOccCountTable(bwt->decodeTable); } else { // FIXME Prevent BWTFree() from freeing decodeTable in this case bwt->decodeTable = decodeTable; } bwt->occMajorSizeInWord = BWTOccValueMajorSizeInWord(textLength); bwt->occValueMajor = (bgint_t*)calloc(bwt->occMajorSizeInWord, sizeof(bgint_t)); bwt->occSizeInWord = 0; bwt->occValue = NULL; return bwt; } BWTInc *BWTIncCreate(const bgint_t textLength, unsigned int initialMaxBuildSize, unsigned int incMaxBuildSize) { BWTInc *bwtInc; unsigned int i, n_iter; if (textLength < incMaxBuildSize) incMaxBuildSize = textLength; if (textLength < initialMaxBuildSize) initialMaxBuildSize = textLength; bwtInc = (BWTInc*)calloc(1, sizeof(BWTInc)); bwtInc->numberOfIterationDone = 0; bwtInc->bwt = BWTCreate(textLength, NULL); bwtInc->initialMaxBuildSize = initialMaxBuildSize; bwtInc->incMaxBuildSize = incMaxBuildSize; bwtInc->cumulativeCountInCurrentBuild = (bgint_t*)calloc((ALPHABET_SIZE + 1), sizeof(bgint_t)); initializeVAL_bg(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); // Build frequently accessed data bwtInc->packedShift = (unsigned*)calloc(CHAR_PER_WORD, sizeof(unsigned int)); for (i=0; ipackedShift[i] = BITS_IN_WORD - (i+1) * BIT_PER_CHAR; n_iter = (textLength - initialMaxBuildSize) / incMaxBuildSize + 1; bwtInc->availableWord = BWTResidentSizeInWord(textLength) + 
BWTOccValueMinorSizeInWord(textLength) // minimal memory requirement + OCC_INTERVAL / BIT_PER_CHAR * n_iter * 2 * (sizeof(bgint_t) / 4) // buffer at the end of occ array + incMaxBuildSize/5 * 3 * (sizeof(bgint_t) / 4); // space for the 3 temporary arrays in each iteration if (bwtInc->availableWord < MIN_AVAILABLE_WORD) bwtInc->availableWord = MIN_AVAILABLE_WORD; // lh3: otherwise segfaul when availableWord is too small fprintf(stderr, "[%s] textLength=%ld, availableWord=%ld\n", __func__, (long)textLength, (long)bwtInc->availableWord); bwtInc->workingMemory = (unsigned*)calloc(bwtInc->availableWord, BYTES_IN_WORD); return bwtInc; } // for BWTIncConstruct() static void BWTIncPutPackedTextToRank(const unsigned int *packedText, bgint_t* __restrict rank, bgint_t* __restrict cumulativeCount, const bgint_t numChar) { bgint_t i; unsigned int j; unsigned int c, t; unsigned int packedMask; bgint_t rankIndex; bgint_t lastWord; unsigned int numCharInLastWord; lastWord = (numChar - 1) / CHAR_PER_WORD; numCharInLastWord = numChar - lastWord * CHAR_PER_WORD; packedMask = ALL_ONE_MASK >> (BITS_IN_WORD - BIT_PER_CHAR); rankIndex = numChar - 1; t = packedText[lastWord] >> (BITS_IN_WORD - numCharInLastWord * BIT_PER_CHAR); for (i=0; i>= BIT_PER_CHAR; } for (i=lastWord; i--;) { // loop from lastWord - 1 to 0 t = packedText[i]; for (j=0; j>= BIT_PER_CHAR; } } // Convert occurrence to cumulativeCount cumulativeCount[2] += cumulativeCount[1]; cumulativeCount[3] += cumulativeCount[2]; cumulativeCount[4] += cumulativeCount[3]; } static void ForwardDNAAllOccCountNoLimit(const unsigned int* dna, const bgint_t index, bgint_t* __restrict occCount, const unsigned int* dnaDecodeTable) { static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000, 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000, 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00, 0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC }; bgint_t iteration, i; unsigned int wordToCount, charToCount; 
unsigned int j, c, sum; occCount[0] = 0; occCount[1] = 0; occCount[2] = 0; occCount[3] = 0; iteration = index / 256; wordToCount = (index - iteration * 256) / 16; charToCount = index - iteration * 256 - wordToCount * 16; for (i=0; i> 16]; sum += dnaDecodeTable[*dna & 0x0000FFFF]; dna++; } if (!DNA_OCC_SUM_EXCEPTION(sum)) { occCount[0] += sum & 0x000000FF; sum >>= 8; occCount[1] += sum & 0x000000FF; sum >>= 8; occCount[2] += sum & 0x000000FF; sum >>= 8; occCount[3] += sum; } else { // only some or all of the 3 bits are on // in reality, only one of the four cases are possible if (sum == 0x00000100) { occCount[0] += 256; } else if (sum == 0x00010000) { occCount[1] += 256; } else if (sum == 0x01000000) { occCount[2] += 256; } else if (sum == 0x00000000) { occCount[3] += 256; } else { fprintf(stderr, "ForwardDNAAllOccCountNoLimit(): DNA occ sum exception!\n"); exit(1); } } } sum = 0; for (j=0; j> 16]; sum += dnaDecodeTable[*dna & 0x0000FFFF]; dna++; } if (charToCount > 0) { c = *dna & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c; sum += dnaDecodeTable[c >> 16]; sum += dnaDecodeTable[c & 0xFFFF]; sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess } occCount[0] += sum & 0x000000FF; sum >>= 8; occCount[1] += sum & 0x000000FF; sum >>= 8; occCount[2] += sum & 0x000000FF; sum >>= 8; occCount[3] += sum; } static void BWTIncBuildPackedBwt(const bgint_t *relativeRank, unsigned int* __restrict bwt, const bgint_t numChar, const bgint_t *cumulativeCount, const unsigned int *packedShift) { bgint_t i, r; unsigned int c; bgint_t previousRank, currentRank; bgint_t wordIndex, charIndex; bgint_t inverseSa0; inverseSa0 = previousRank = relativeRank[0]; for (i=1; i<=numChar; i++) { currentRank = relativeRank[i]; // previousRank > cumulativeCount[c] because $ is one of the char c = (previousRank > cumulativeCount[1]) + (previousRank > cumulativeCount[2]) + (previousRank > cumulativeCount[3]); // set bwt for currentRank if (c > 0) { // c <> 'a' 
r = currentRank; if (r > inverseSa0) { // - 1 because $ at inverseSa0 is not encoded r--; } wordIndex = r / CHAR_PER_WORD; charIndex = r - wordIndex * CHAR_PER_WORD; bwt[wordIndex] |= c << packedShift[charIndex]; } previousRank = currentRank; } } static inline bgint_t BWTOccValueExplicit(const BWT *bwt, const bgint_t occIndexExplicit, const unsigned int character) { bgint_t occIndexMajor; occIndexMajor = occIndexExplicit * OCC_INTERVAL / OCC_INTERVAL_MAJOR; if (occIndexExplicit % OCC_VALUE_PER_WORD == 0) { return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] + (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] >> 16); } else { return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] + (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] & 0x0000FFFF); } } static unsigned int ForwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character, const unsigned int* dnaDecodeTable) { static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000, 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000, 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00, 0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC }; unsigned int wordToCount, charToCount; unsigned int i, c; unsigned int sum = 0; wordToCount = index / 16; charToCount = index - wordToCount * 16; for (i=0; i> 16]; sum += dnaDecodeTable[dna[i] & 0x0000FFFF]; } if (charToCount > 0) { c = dna[i] & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c; sum += dnaDecodeTable[c >> 16]; sum += dnaDecodeTable[c & 0xFFFF]; sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess } return (sum >> (character * 8)) & 0x000000FF; } static unsigned int BackwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character, const unsigned int* dnaDecodeTable) { static const unsigned int truncateLeftMask[16] = { 0x00000000, 0x00000003, 
0x0000000F, 0x0000003F, 0x000000FF, 0x000003FF, 0x00000FFF, 0x00003FFF, 0x0000FFFF, 0x0003FFFF, 0x000FFFFF, 0x003FFFFF, 0x00FFFFFF, 0x03FFFFFF, 0x0FFFFFFF, 0x3FFFFFFF }; unsigned int wordToCount, charToCount; unsigned int i, c; unsigned int sum = 0; wordToCount = index / 16; charToCount = index - wordToCount * 16; dna -= wordToCount + 1; if (charToCount > 0) { c = *dna & truncateLeftMask[charToCount]; // increase count of 'a' by 16 - c; sum += dnaDecodeTable[c >> 16]; sum += dnaDecodeTable[c & 0xFFFF]; sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess } for (i=0; i> 16]; sum += dnaDecodeTable[*dna & 0x0000FFFF]; } return (sum >> (character * 8)) & 0x000000FF; } bgint_t BWTOccValue(const BWT *bwt, bgint_t index, const unsigned int character) { bgint_t occValue; bgint_t occExplicitIndex, occIndex; // $ is supposed to be positioned at inverseSa0 but it is not encoded // therefore index is subtracted by 1 for adjustment if (index > bwt->inverseSa0) index--; occExplicitIndex = (index + OCC_INTERVAL / 2 - 1) / OCC_INTERVAL; // Bidirectional encoding occIndex = occExplicitIndex * OCC_INTERVAL; occValue = BWTOccValueExplicit(bwt, occExplicitIndex, character); if (occIndex == index) return occValue; if (occIndex < index) { return occValue + ForwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, index - occIndex, character, bwt->decodeTable); } else { return occValue - BackwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, occIndex - index, character, bwt->decodeTable); } } static bgint_t BWTIncGetAbsoluteRank(BWT *bwt, bgint_t* __restrict absoluteRank, bgint_t* __restrict seq, const unsigned int *packedText, const bgint_t numChar, const bgint_t* cumulativeCount, const unsigned int firstCharInLastIteration) { bgint_t saIndex; bgint_t lastWord; unsigned int packedMask; bgint_t i; unsigned int c, t, j; bgint_t rankIndex; unsigned int shift; bgint_t seqIndexFromStart[ALPHABET_SIZE]; bgint_t seqIndexFromEnd[ALPHABET_SIZE]; for (i=0; i> shift; 
saIndex = bwt->inverseSa0; rankIndex = numChar - 1; lastWord = numChar / CHAR_PER_WORD; for (i=lastWord; i--;) { // loop from lastWord - 1 to 0 t = packedText[i]; for (j=0; jcumulativeFreq[c] + BWTOccValue(bwt, saIndex, c) + 1; // A counting sort using the first character of suffix is done here // If rank > inverseSa0 -> fill seq from end, otherwise fill seq from start -> to leave the right entry for inverseSa0 if (saIndex > bwt->inverseSa0) { seq[seqIndexFromEnd[c]] = rankIndex; absoluteRank[seqIndexFromEnd[c]] = saIndex; seqIndexFromEnd[c]--; } else { seq[seqIndexFromStart[c]] = rankIndex; absoluteRank[seqIndexFromStart[c]] = saIndex; seqIndexFromStart[c]++; } rankIndex--; t >>= BIT_PER_CHAR; } } absoluteRank[seqIndexFromStart[firstCharInLastIteration]] = bwt->inverseSa0; // representing the substring of all preceding characters seq[seqIndexFromStart[firstCharInLastIteration]] = numChar; return seqIndexFromStart[firstCharInLastIteration]; } static void BWTIncSortKey(bgint_t* __restrict key, bgint_t* __restrict seq, const bgint_t numItem) { #define EQUAL_KEY_THRESHOLD 4 // Partition for equal key if data array size / the number of data with equal value with pivot < EQUAL_KEY_THRESHOLD int64_t lowIndex, highIndex, midIndex; int64_t lowPartitionIndex, highPartitionIndex; int64_t lowStack[32], highStack[32]; int stackDepth; int64_t i, j; bgint_t tempSeq, tempKey; int64_t numberOfEqualKey; if (numItem < 2) return; stackDepth = 0; lowIndex = 0; highIndex = numItem - 1; for (;;) { for (;;) { // Sort small array of data if (highIndex - lowIndex < BWTINC_INSERT_SORT_NUM_ITEM) { // Insertion sort on smallest arrays for (i=lowIndex+1; i<=highIndex; i++) { tempSeq = seq[i]; tempKey = key[i]; for (j = i; j > lowIndex && key[j-1] > tempKey; j--) { seq[j] = seq[j-1]; key[j] = key[j-1]; } if (j != i) { seq[j] = tempSeq; key[j] = tempKey; } } break; } // Choose pivot as median of the lowest, middle, and highest data; sort the three data midIndex = average(lowIndex, highIndex); if 
(key[lowIndex] > key[midIndex]) { tempSeq = seq[lowIndex]; tempKey = key[lowIndex]; seq[lowIndex] = seq[midIndex]; key[lowIndex] = key[midIndex]; seq[midIndex] = tempSeq; key[midIndex] = tempKey; } if (key[lowIndex] > key[highIndex]) { tempSeq = seq[lowIndex]; tempKey = key[lowIndex]; seq[lowIndex] = seq[highIndex]; key[lowIndex] = key[highIndex]; seq[highIndex] = tempSeq; key[highIndex] = tempKey; } if (key[midIndex] > key[highIndex]) { tempSeq = seq[midIndex]; tempKey = key[midIndex]; seq[midIndex] = seq[highIndex]; key[midIndex] = key[highIndex]; seq[highIndex] = tempSeq; key[highIndex] = tempKey; } // Partition data numberOfEqualKey = 0; lowPartitionIndex = lowIndex + 1; highPartitionIndex = highIndex - 1; for (;;) { while (lowPartitionIndex <= highPartitionIndex && key[lowPartitionIndex] <= key[midIndex]) { numberOfEqualKey += (key[lowPartitionIndex] == key[midIndex]); lowPartitionIndex++; } while (lowPartitionIndex < highPartitionIndex) { if (key[midIndex] >= key[highPartitionIndex]) { numberOfEqualKey += (key[midIndex] == key[highPartitionIndex]); break; } highPartitionIndex--; } if (lowPartitionIndex >= highPartitionIndex) { break; } tempSeq = seq[lowPartitionIndex]; tempKey = key[lowPartitionIndex]; seq[lowPartitionIndex] = seq[highPartitionIndex]; key[lowPartitionIndex] = key[highPartitionIndex]; seq[highPartitionIndex] = tempSeq; key[highPartitionIndex] = tempKey; if (highPartitionIndex == midIndex) { // partition key has been moved midIndex = lowPartitionIndex; } lowPartitionIndex++; highPartitionIndex--; } // Adjust the partition index highPartitionIndex = lowPartitionIndex; lowPartitionIndex--; // move the partition key to end of low partition tempSeq = seq[midIndex]; tempKey = key[midIndex]; seq[midIndex] = seq[lowPartitionIndex]; key[midIndex] = key[lowPartitionIndex]; seq[lowPartitionIndex] = tempSeq; key[lowPartitionIndex] = tempKey; if (highIndex - lowIndex + BWTINC_INSERT_SORT_NUM_ITEM <= EQUAL_KEY_THRESHOLD * numberOfEqualKey) { // Many keys = 
partition key; separate the equal key data from the lower partition midIndex = lowIndex; for (;;) { while (midIndex < lowPartitionIndex && key[midIndex] < key[lowPartitionIndex]) { midIndex++; } while (midIndex < lowPartitionIndex && key[lowPartitionIndex] == key[lowPartitionIndex - 1]) { lowPartitionIndex--; } if (midIndex >= lowPartitionIndex) { break; } tempSeq = seq[midIndex]; tempKey = key[midIndex]; seq[midIndex] = seq[lowPartitionIndex - 1]; key[midIndex] = key[lowPartitionIndex - 1]; seq[lowPartitionIndex - 1] = tempSeq; key[lowPartitionIndex - 1] = tempKey; midIndex++; lowPartitionIndex--; } } if (lowPartitionIndex - lowIndex > highIndex - highPartitionIndex) { // put the larger partition to stack lowStack[stackDepth] = lowIndex; highStack[stackDepth] = lowPartitionIndex - 1; stackDepth++; // sort the smaller partition first lowIndex = highPartitionIndex; } else { // put the larger partition to stack lowStack[stackDepth] = highPartitionIndex; highStack[stackDepth] = highIndex; stackDepth++; // sort the smaller partition first if (lowPartitionIndex > lowIndex) { highIndex = lowPartitionIndex - 1; } else { // all keys in the partition equals to the partition key break; } } continue; } // Pop a range from stack if (stackDepth > 0) { stackDepth--; lowIndex = lowStack[stackDepth]; highIndex = highStack[stackDepth]; continue; } else return; } } static void BWTIncBuildRelativeRank(bgint_t* __restrict sortedRank, bgint_t* __restrict seq, bgint_t* __restrict relativeRank, const bgint_t numItem, bgint_t oldInverseSa0, const bgint_t *cumulativeCount) { bgint_t i, c; bgint_t s, r; bgint_t lastRank, lastIndex; bgint_t oldInverseSa0RelativeRank = 0; bgint_t freq; lastIndex = numItem; lastRank = sortedRank[numItem]; if (lastRank > oldInverseSa0) { sortedRank[numItem]--; // to prepare for merging; $ is not encoded in bwt } s = seq[numItem]; relativeRank[s] = numItem; if (lastRank == oldInverseSa0) { oldInverseSa0RelativeRank = numItem; oldInverseSa0++; // so that this 
segment of code is not run again lastRank++; // so that oldInverseSa0 become a sorted group with 1 item } c = ALPHABET_SIZE - 1; freq = cumulativeCount[c]; for (i=numItem; i--;) { // from numItem - 1 to 0 r = sortedRank[i]; if (r > oldInverseSa0) sortedRank[i]--; // to prepare for merging; $ is not encoded in bwt s = seq[i]; if (i < freq) { if (lastIndex >= freq) lastRank++; // to trigger the group across alphabet boundary to be split c--; freq = cumulativeCount[c]; } if (r == lastRank) { relativeRank[s] = lastIndex; } else { if (i == lastIndex - 1) { if (lastIndex < numItem && (sbgint_t)seq[lastIndex + 1] < 0) { seq[lastIndex] = seq[lastIndex + 1] - 1; } else { seq[lastIndex] = (bgint_t)-1; } } lastIndex = i; lastRank = r; relativeRank[s] = i; if (r == oldInverseSa0) { oldInverseSa0RelativeRank = i; oldInverseSa0++; // so that this segment of code is not run again lastRank++; // so that oldInverseSa0 become a sorted group with 1 item } } } } static void BWTIncBuildBwt(unsigned int* insertBwt, const bgint_t *relativeRank, const bgint_t numChar, const bgint_t *cumulativeCount) { unsigned int c; bgint_t i; bgint_t previousRank, currentRank; previousRank = relativeRank[0]; for (i=1; i<=numChar; i++) { currentRank = relativeRank[i]; c = (previousRank >= cumulativeCount[1]) + (previousRank >= cumulativeCount[2]) + (previousRank >= cumulativeCount[3]); insertBwt[currentRank] = c; previousRank = currentRank; } } static void BWTIncMergeBwt(const bgint_t *sortedRank, const unsigned int* oldBwt, const unsigned int *insertBwt, unsigned int* __restrict mergedBwt, const bgint_t numOldBwt, const bgint_t numInsertBwt) { unsigned int bitsInWordMinusBitPerChar; bgint_t leftShift, rightShift; bgint_t o; bgint_t oIndex, iIndex, mIndex; bgint_t mWord, mChar, oWord, oChar; bgint_t numInsert; bitsInWordMinusBitPerChar = BITS_IN_WORD - BIT_PER_CHAR; oIndex = 0; iIndex = 0; mIndex = 0; mWord = 0; mChar = 0; mergedBwt[0] = 0; // this can be cleared as merged Bwt slightly shift to the left 
in each iteration while (oIndex < numOldBwt) { // copy from insertBwt while (iIndex <= numInsertBwt && sortedRank[iIndex] <= oIndex) { if (sortedRank[iIndex] != 0) { // special value to indicate that this is for new inverseSa0 mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR); mIndex++; mChar++; if (mChar == CHAR_PER_WORD) { mChar = 0; mWord++; mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary } } iIndex++; } // Copy from oldBwt to mergedBwt if (iIndex <= numInsertBwt) { o = sortedRank[iIndex]; } else { o = numOldBwt; } numInsert = o - oIndex; oWord = oIndex / CHAR_PER_WORD; oChar = oIndex - oWord * CHAR_PER_WORD; if (oChar > mChar) { leftShift = (oChar - mChar) * BIT_PER_CHAR; rightShift = (CHAR_PER_WORD + mChar - oChar) * BIT_PER_CHAR; mergedBwt[mWord] = mergedBwt[mWord] | (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR)) | (oldBwt[oWord+1] >> rightShift); oIndex += min(numInsert, CHAR_PER_WORD - mChar); while (o > oIndex) { oWord++; mWord++; mergedBwt[mWord] = (oldBwt[oWord] << leftShift) | (oldBwt[oWord+1] >> rightShift); oIndex += CHAR_PER_WORD; } } else if (oChar < mChar) { rightShift = (mChar - oChar) * BIT_PER_CHAR; leftShift = (CHAR_PER_WORD + oChar - mChar) * BIT_PER_CHAR; mergedBwt[mWord] = mergedBwt[mWord] | (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR)); oIndex += min(numInsert, CHAR_PER_WORD - mChar); while (o > oIndex) { oWord++; mWord++; mergedBwt[mWord] = (oldBwt[oWord-1] << leftShift) | (oldBwt[oWord] >> rightShift); oIndex += CHAR_PER_WORD; } } else { // oChar == mChar mergedBwt[mWord] = mergedBwt[mWord] | truncateLeft(oldBwt[oWord], mChar * BIT_PER_CHAR); oIndex += min(numInsert, CHAR_PER_WORD - mChar); while (o > oIndex) { oWord++; mWord++; mergedBwt[mWord] = oldBwt[oWord]; oIndex += CHAR_PER_WORD; } } oIndex = o; mIndex += numInsert; // Clear the trailing garbage in mergedBwt mWord = mIndex / CHAR_PER_WORD; mChar = mIndex - mWord * 
CHAR_PER_WORD; if (mChar == 0) { mergedBwt[mWord] = 0; } else { mergedBwt[mWord] = truncateRight(mergedBwt[mWord], (BITS_IN_WORD - mChar * BIT_PER_CHAR)); } } // copy from insertBwt while (iIndex <= numInsertBwt) { if (sortedRank[iIndex] != 0) { mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR); mIndex++; mChar++; if (mChar == CHAR_PER_WORD) { mChar = 0; mWord++; mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary } } iIndex++; } } void BWTClearTrailingBwtCode(BWT *bwt) { bgint_t bwtResidentSizeInWord; bgint_t wordIndex, offset; bgint_t i; bwtResidentSizeInWord = BWTResidentSizeInWord(bwt->textLength); wordIndex = bwt->textLength / CHAR_PER_WORD; offset = (bwt->textLength - wordIndex * CHAR_PER_WORD) * BIT_PER_CHAR; if (offset > 0) { bwt->bwtCode[wordIndex] = truncateRight(bwt->bwtCode[wordIndex], BITS_IN_WORD - offset); } else { if (wordIndex < bwtResidentSizeInWord) { bwt->bwtCode[wordIndex] = 0; } } for (i=wordIndex+1; ibwtCode[i] = 0; } } void BWTGenerateOccValueFromBwt(const unsigned int* bwt, unsigned int* __restrict occValue, bgint_t* __restrict occValueMajor, const bgint_t textLength, const unsigned int* decodeTable) { bgint_t numberOfOccValueMajor, numberOfOccValue; unsigned int wordBetweenOccValue; bgint_t numberOfOccIntervalPerMajor; unsigned int c; bgint_t i, j; bgint_t occMajorIndex; bgint_t occIndex, bwtIndex; bgint_t sum; // perhaps unsigned is big enough bgint_t tempOccValue0[ALPHABET_SIZE], tempOccValue1[ALPHABET_SIZE]; wordBetweenOccValue = OCC_INTERVAL / CHAR_PER_WORD; // Calculate occValue numberOfOccValue = (textLength + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding numberOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL; numberOfOccValueMajor = (numberOfOccValue + numberOfOccIntervalPerMajor - 1) / numberOfOccIntervalPerMajor; tempOccValue0[0] = 0; tempOccValue0[1] = 0; tempOccValue0[2] = 0; tempOccValue0[3] = 0; occValueMajor[0] = 0; 
occValueMajor[1] = 0; occValueMajor[2] = 0; occValueMajor[3] = 0; occIndex = 0; bwtIndex = 0; for (occMajorIndex=1; occMajorIndex> 16]; sum += decodeTable[c & 0x0000FFFF]; bwtIndex++; } if (!DNA_OCC_SUM_EXCEPTION(sum)) { tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8; tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8; tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8; tempOccValue1[3] += sum; } else { if (sum == 0x00000100) { tempOccValue1[0] += 256; } else if (sum == 0x00010000) { tempOccValue1[1] += 256; } else if (sum == 0x01000000) { tempOccValue1[2] += 256; } else { tempOccValue1[3] += 256; } } occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0]; occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1]; occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2]; occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3]; tempOccValue0[0] = tempOccValue1[0]; tempOccValue0[1] = tempOccValue1[1]; tempOccValue0[2] = tempOccValue1[2]; tempOccValue0[3] = tempOccValue1[3]; sum = 0; occIndex++; for (j=0; j> 16]; sum += decodeTable[c & 0x0000FFFF]; bwtIndex++; } if (!DNA_OCC_SUM_EXCEPTION(sum)) { tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8; tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8; tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8; tempOccValue0[3] += sum; } else { if (sum == 0x00000100) { tempOccValue0[0] += 256; } else if (sum == 0x00010000) { tempOccValue0[1] += 256; } else if (sum == 0x01000000) { tempOccValue0[2] += 256; } else { tempOccValue0[3] += 256; } } } occValueMajor[occMajorIndex * 4 + 0] = occValueMajor[(occMajorIndex - 1) * 4 + 0] + tempOccValue0[0]; occValueMajor[occMajorIndex * 4 + 1] = occValueMajor[(occMajorIndex - 1) * 4 + 1] + tempOccValue0[1]; occValueMajor[occMajorIndex * 4 + 2] = occValueMajor[(occMajorIndex - 1) * 4 + 2] + tempOccValue0[2]; occValueMajor[occMajorIndex * 4 + 3] = occValueMajor[(occMajorIndex - 1) * 4 + 3] + tempOccValue0[3]; tempOccValue0[0] = 0; 
tempOccValue0[1] = 0; tempOccValue0[2] = 0; tempOccValue0[3] = 0; } while (occIndex < (numberOfOccValue-1)/2) { sum = 0; tempOccValue1[0] = tempOccValue0[0]; tempOccValue1[1] = tempOccValue0[1]; tempOccValue1[2] = tempOccValue0[2]; tempOccValue1[3] = tempOccValue0[3]; for (j=0; j> 16]; sum += decodeTable[c & 0x0000FFFF]; bwtIndex++; } if (!DNA_OCC_SUM_EXCEPTION(sum)) { tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8; tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8; tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8; tempOccValue1[3] += sum; } else { if (sum == 0x00000100) { tempOccValue1[0] += 256; } else if (sum == 0x00010000) { tempOccValue1[1] += 256; } else if (sum == 0x01000000) { tempOccValue1[2] += 256; } else { tempOccValue1[3] += 256; } } occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0]; occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1]; occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2]; occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3]; tempOccValue0[0] = tempOccValue1[0]; tempOccValue0[1] = tempOccValue1[1]; tempOccValue0[2] = tempOccValue1[2]; tempOccValue0[3] = tempOccValue1[3]; sum = 0; occIndex++; for (j=0; j> 16]; sum += decodeTable[c & 0x0000FFFF]; bwtIndex++; } if (!DNA_OCC_SUM_EXCEPTION(sum)) { tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8; tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8; tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8; tempOccValue0[3] += sum; } else { if (sum == 0x00000100) { tempOccValue0[0] += 256; } else if (sum == 0x00010000) { tempOccValue0[1] += 256; } else if (sum == 0x01000000) { tempOccValue0[2] += 256; } else { tempOccValue0[3] += 256; } } } sum = 0; tempOccValue1[0] = tempOccValue0[0]; tempOccValue1[1] = tempOccValue0[1]; tempOccValue1[2] = tempOccValue0[2]; tempOccValue1[3] = tempOccValue0[3]; if (occIndex * 2 < numberOfOccValue - 1) { for (j=0; j> 16]; sum += decodeTable[c & 0x0000FFFF]; bwtIndex++; } if 
(!DNA_OCC_SUM_EXCEPTION(sum)) { tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8; tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8; tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8; tempOccValue1[3] += sum; } else { if (sum == 0x00000100) { tempOccValue1[0] += 256; } else if (sum == 0x00010000) { tempOccValue1[1] += 256; } else if (sum == 0x01000000) { tempOccValue1[2] += 256; } else { tempOccValue1[3] += 256; } } } occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0]; occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1]; occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2]; occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3]; } static void BWTIncConstruct(BWTInc *bwtInc, const bgint_t numChar) { unsigned int i; bgint_t mergedBwtSizeInWord, mergedOccSizeInWord; unsigned int firstCharInThisIteration; bgint_t *relativeRank, *seq, *sortedRank; unsigned int *insertBwt, *mergedBwt; bgint_t newInverseSa0RelativeRank, oldInverseSa0RelativeRank, newInverseSa0; mergedBwtSizeInWord = BWTResidentSizeInWord(bwtInc->bwt->textLength + numChar); mergedOccSizeInWord = BWTOccValueMinorSizeInWord(bwtInc->bwt->textLength + numChar); initializeVAL_bg(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); if (bwtInc->bwt->textLength == 0) { // Initial build // Set address seq = (bgint_t*)bwtInc->workingMemory; relativeRank = seq + bwtInc->buildSize + 1; // mergedBwt and packedTex may share memory mergedBwt = insertBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord; // build in place assert((void*)(relativeRank + bwtInc->buildSize + 1) <= (void*)bwtInc->packedText); assert((void*)(relativeRank + bwtInc->buildSize + 1) <= (void*)mergedBwt); // ->packedText is not used any more and may be overwritten by mergedBwt BWTIncPutPackedTextToRank(bwtInc->packedText, relativeRank, bwtInc->cumulativeCountInCurrentBuild, numChar); firstCharInThisIteration = relativeRank[0]; 
relativeRank[numChar] = 0; // Sort suffix QSufSortSuffixSort((qsint_t*)relativeRank, (qsint_t*)seq, (qsint_t)numChar, (qsint_t)ALPHABET_SIZE - 1, 0, FALSE); newInverseSa0 = relativeRank[0]; // Clear BWT area initializeVAL(insertBwt, mergedBwtSizeInWord, 0); // Build BWT BWTIncBuildPackedBwt(relativeRank, insertBwt, numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->packedShift); // so that the cumulativeCount is not deducted bwtInc->firstCharInLastIteration = ALPHABET_SIZE; } else { // Incremental build // Set address sortedRank = (bgint_t*)bwtInc->workingMemory; seq = sortedRank + bwtInc->buildSize + 1; insertBwt = (unsigned*)seq; // insertBwt and seq share memory // relativeRank and ->packedText may share memory relativeRank = seq + bwtInc->buildSize + 1; assert((void*)relativeRank <= (void*)bwtInc->packedText); // Store the first character of this iteration firstCharInThisIteration = bwtInc->packedText[0] >> (BITS_IN_WORD - BIT_PER_CHAR); // Count occurrence of input text ForwardDNAAllOccCountNoLimit(bwtInc->packedText, numChar, bwtInc->cumulativeCountInCurrentBuild + 1, bwtInc->bwt->decodeTable); // Add the first character of the previous iteration to represent the inverseSa0 of the previous iteration bwtInc->cumulativeCountInCurrentBuild[bwtInc->firstCharInLastIteration + 1]++; bwtInc->cumulativeCountInCurrentBuild[2] += bwtInc->cumulativeCountInCurrentBuild[1]; bwtInc->cumulativeCountInCurrentBuild[3] += bwtInc->cumulativeCountInCurrentBuild[2]; bwtInc->cumulativeCountInCurrentBuild[4] += bwtInc->cumulativeCountInCurrentBuild[3]; // Get rank of new suffix among processed suffix // The seq array is built into ALPHABET_SIZE + 2 groups; ALPHABET_SIZE groups + 1 group divided into 2 by inverseSa0 + inverseSa0 as 1 group // ->packedText is not used any more and will be overwritten by relativeRank oldInverseSa0RelativeRank = BWTIncGetAbsoluteRank(bwtInc->bwt, sortedRank, seq, bwtInc->packedText, numChar, bwtInc->cumulativeCountInCurrentBuild, 
bwtInc->firstCharInLastIteration); // Sort rank by ALPHABET_SIZE + 2 groups (or ALPHABET_SIZE + 1 groups when inverseSa0 sit on the border of a group) for (i=0; icumulativeCountInCurrentBuild[i] > oldInverseSa0RelativeRank || bwtInc->cumulativeCountInCurrentBuild[i+1] <= oldInverseSa0RelativeRank) { BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], bwtInc->cumulativeCountInCurrentBuild[i+1] - bwtInc->cumulativeCountInCurrentBuild[i]); } else { if (bwtInc->cumulativeCountInCurrentBuild[i] < oldInverseSa0RelativeRank) { BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], oldInverseSa0RelativeRank - bwtInc->cumulativeCountInCurrentBuild[i]); } if (bwtInc->cumulativeCountInCurrentBuild[i+1] > oldInverseSa0RelativeRank + 1) { BWTIncSortKey(sortedRank + oldInverseSa0RelativeRank + 1, seq + oldInverseSa0RelativeRank + 1, bwtInc->cumulativeCountInCurrentBuild[i+1] - oldInverseSa0RelativeRank - 1); } } } // build relative rank; sortedRank is updated for merging to cater for the fact that $ is not encoded in bwt // the cumulative freq information is used to make sure that inverseSa0 and suffix beginning with different characters are kept in different unsorted groups) BWTIncBuildRelativeRank(sortedRank, seq, relativeRank, numChar, bwtInc->bwt->inverseSa0, bwtInc->cumulativeCountInCurrentBuild); assert(relativeRank[numChar] == oldInverseSa0RelativeRank); // Sort suffix QSufSortSuffixSort((qsint_t*)relativeRank, (qsint_t*)seq, (qsint_t)numChar, (qsint_t)numChar, 1, TRUE); newInverseSa0RelativeRank = relativeRank[0]; newInverseSa0 = sortedRank[newInverseSa0RelativeRank] + newInverseSa0RelativeRank; sortedRank[newInverseSa0RelativeRank] = 0; // a special value so that this is skipped in the merged bwt // Build BWT; seq is overwritten by insertBwt BWTIncBuildBwt(insertBwt, relativeRank, numChar, bwtInc->cumulativeCountInCurrentBuild); // Merge BWT; 
relativeRank may be overwritten by mergedBwt mergedBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR * (sizeof(bgint_t) / 4); // minus numberOfIteration * occInterval to create a buffer for merging assert(mergedBwt >= insertBwt + numChar); BWTIncMergeBwt(sortedRank, bwtInc->bwt->bwtCode, insertBwt, mergedBwt, bwtInc->bwt->textLength, numChar); } // Build auxiliary structure and update info and pointers in BWT bwtInc->bwt->textLength += numChar; bwtInc->bwt->bwtCode = mergedBwt; bwtInc->bwt->bwtSizeInWord = mergedBwtSizeInWord; bwtInc->bwt->occSizeInWord = mergedOccSizeInWord; assert(mergedBwt >= bwtInc->workingMemory + mergedOccSizeInWord); bwtInc->bwt->occValue = mergedBwt - mergedOccSizeInWord; BWTClearTrailingBwtCode(bwtInc->bwt); BWTGenerateOccValueFromBwt(bwtInc->bwt->bwtCode, bwtInc->bwt->occValue, bwtInc->bwt->occValueMajor, bwtInc->bwt->textLength, bwtInc->bwt->decodeTable); bwtInc->bwt->inverseSa0 = newInverseSa0; bwtInc->bwt->cumulativeFreq[1] += bwtInc->cumulativeCountInCurrentBuild[1] - (bwtInc->firstCharInLastIteration <= 0); bwtInc->bwt->cumulativeFreq[2] += bwtInc->cumulativeCountInCurrentBuild[2] - (bwtInc->firstCharInLastIteration <= 1); bwtInc->bwt->cumulativeFreq[3] += bwtInc->cumulativeCountInCurrentBuild[3] - (bwtInc->firstCharInLastIteration <= 2); bwtInc->bwt->cumulativeFreq[4] += bwtInc->cumulativeCountInCurrentBuild[4] - (bwtInc->firstCharInLastIteration <= 3); bwtInc->firstCharInLastIteration = firstCharInThisIteration; // Set build size and text address for the next build BWTIncSetBuildSizeAndTextAddr(bwtInc); bwtInc->numberOfIterationDone++; } BWTInc *BWTIncConstructFromPacked(const char *inputFileName, bgint_t initialMaxBuildSize, bgint_t incMaxBuildSize) { FILE *packedFile; bgint_t packedFileLen; bgint_t totalTextLength; bgint_t textToLoad, textSizeInByte; bgint_t processedTextLength; unsigned char lastByteLength; BWTInc *bwtInc; packedFile = 
(FILE*)fopen(inputFileName, "rb"); if (packedFile == NULL) { fprintf(stderr, "BWTIncConstructFromPacked() : Cannot open %s : %s\n", inputFileName, strerror(errno)); exit(1); } if (fseek(packedFile, -1, SEEK_END) != 0) { fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n", inputFileName, strerror(errno)); exit(1); } packedFileLen = ftell(packedFile); if (packedFileLen == -1) { fprintf(stderr, "BWTIncConstructFromPacked() : Can't ftell on %s : %s\n", inputFileName, strerror(errno)); exit(1); } if (fread(&lastByteLength, sizeof(unsigned char), 1, packedFile) != 1) { fprintf(stderr, "BWTIncConstructFromPacked() : Can't read from %s : %s\n", inputFileName, ferror(packedFile)? strerror(errno) : "Unexpected end of file"); exit(1); } totalTextLength = TextLengthFromBytePacked(packedFileLen, BIT_PER_CHAR, lastByteLength); bwtInc = BWTIncCreate(totalTextLength, initialMaxBuildSize, incMaxBuildSize); BWTIncSetBuildSizeAndTextAddr(bwtInc); if (bwtInc->buildSize > totalTextLength) { textToLoad = totalTextLength; } else { textToLoad = totalTextLength - ((totalTextLength - bwtInc->buildSize + CHAR_PER_WORD - 1) / CHAR_PER_WORD * CHAR_PER_WORD); } textSizeInByte = textToLoad / CHAR_PER_BYTE; // excluded the odd byte if (fseek(packedFile, -((long)textSizeInByte + 2), SEEK_CUR) != 0) { fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n", inputFileName, strerror(errno)); exit(1); } if (fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte + 1, packedFile) != textSizeInByte + 1) { fprintf(stderr, "BWTIncConstructFromPacked() : Can't read from %s : %s\n", inputFileName, ferror(packedFile)? 
strerror(errno) : "Unexpected end of file"); exit(1); } if (fseek(packedFile, -((long)textSizeInByte + 1), SEEK_CUR) != 0) { fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n", inputFileName, strerror(errno)); exit(1); } ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); BWTIncConstruct(bwtInc, textToLoad); processedTextLength = textToLoad; while (processedTextLength < totalTextLength) { textToLoad = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD; if (textToLoad > totalTextLength - processedTextLength) { textToLoad = totalTextLength - processedTextLength; } textSizeInByte = textToLoad / CHAR_PER_BYTE; if (fseek(packedFile, -((long)textSizeInByte), SEEK_CUR) != 0) { fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n", inputFileName, strerror(errno)); exit(1); } if (fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte, packedFile) != textSizeInByte) { fprintf(stderr, "BWTIncConstructFromPacked() : Can't read from %s : %s\n", inputFileName, ferror(packedFile)? strerror(errno) : "Unexpected end of file"); exit(1); } if (fseek(packedFile, -((long)textSizeInByte), SEEK_CUR) != 0) { fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n", inputFileName, strerror(errno)); exit(1); } ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); BWTIncConstruct(bwtInc, textToLoad); processedTextLength += textToLoad; if (bwtInc->numberOfIterationDone % 10 == 0) { fprintf(stderr, "[BWTIncConstructFromPacked] %lu iterations done. 
%lu characters processed.\n", (long)bwtInc->numberOfIterationDone, (long)processedTextLength); } } fclose(packedFile); return bwtInc; } void BWTIncFree(BWTInc *bwtInc) { if (bwtInc == 0) return; free(bwtInc->bwt->cumulativeFreq); free(bwtInc->bwt->occValueMajor); free(bwtInc->bwt->decodeTable); free(bwtInc->bwt); free(bwtInc->workingMemory); free(bwtInc->cumulativeCountInCurrentBuild); free(bwtInc->packedShift); free(bwtInc); } static bgint_t BWTFileSizeInWord(const bgint_t numChar) { // The $ in BWT at the position of inverseSa0 is not encoded return (numChar + CHAR_PER_WORD - 1) / CHAR_PER_WORD; } void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *occValueFileName) { FILE *bwtFile; /* FILE *occValueFile; */ bgint_t bwtLength; bwtFile = (FILE*)fopen(bwtFileName, "wb"); if (bwtFile == NULL) { fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Cannot open %s for writing: %s\n", bwtFileName, strerror(errno)); exit(1); } bwtLength = BWTFileSizeInWord(bwt->textLength); if (fwrite(&bwt->inverseSa0, sizeof(bgint_t), 1, bwtFile) != 1 || fwrite(bwt->cumulativeFreq + 1, sizeof(bgint_t), ALPHABET_SIZE, bwtFile) != ALPHABET_SIZE || fwrite(bwt->bwtCode, sizeof(unsigned int), bwtLength, bwtFile) != bwtLength) { fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Error writing to %s : %s\n", bwtFileName, strerror(errno)); exit(1); } if (fclose(bwtFile) != 0) { fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Error on closing %s : %s\n", bwtFileName, strerror(errno)); exit(1); } } void bwt_bwtgen2(const char *fn_pac, const char *fn_bwt, int block_size) { BWTInc *bwtInc; bwtInc = BWTIncConstructFromPacked(fn_pac, block_size, block_size); fprintf(stderr, "[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone); BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0); BWTIncFree(bwtInc); } void bwt_bwtgen(const char *fn_pac, const char *fn_bwt) { bwt_bwtgen2(fn_pac, fn_bwt, 10000000); } int bwt_bwtgen_main(int argc, char *argv[]) { if (argc < 3) { 
fprintf(stderr, "Usage: bwtgen \n"); return 1; } bwt_bwtgen(argv[1], argv[2]); return 0; } #ifdef MAIN_BWT_GEN int main(int argc, char *argv[]) { return bwt_bwtgen_main(argc, argv); } #endif bwa-0.7.17/bwt_lite.c000066400000000000000000000051721317342117100143410ustar00rootroot00000000000000#include #include #include #include "bwt_lite.h" #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif int is_sa(const uint8_t *T, int *SA, int n); int is_bwt(uint8_t *T, int n); bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq) { bwtl_t *b; int i; b = (bwtl_t*)calloc(1, sizeof(bwtl_t)); b->seq_len = len; { // calculate b->bwt uint8_t *s; b->sa = (uint32_t*)calloc(len + 1, 4); is_sa(seq, (int*)b->sa, len); s = (uint8_t*)calloc(len + 1, 1); for (i = 0; i <= len; ++i) { if (b->sa[i] == 0) b->primary = i; else s[i] = seq[b->sa[i] - 1]; } for (i = b->primary; i < len; ++i) s[i] = s[i + 1]; b->bwt_size = (len + 15) / 16; b->bwt = (uint32_t*)calloc(b->bwt_size, 4); for (i = 0; i < len; ++i) b->bwt[i>>4] |= s[i] << ((15 - (i&15)) << 1); free(s); } { // calculate b->occ uint32_t c[4]; b->n_occ = (len + 15) / 16 * 4; b->occ = (uint32_t*)calloc(b->n_occ, 4); memset(c, 0, 16); for (i = 0; i < len; ++i) { if (i % 16 == 0) memcpy(b->occ + (i/16) * 4, c, 16); ++c[bwtl_B0(b, i)]; } memcpy(b->L2+1, c, 16); for (i = 2; i < 5; ++i) b->L2[i] += b->L2[i-1]; } { // generate cnt_table for (i = 0; i != 256; ++i) { uint32_t j, x = 0; for (j = 0; j != 4; ++j) x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3); b->cnt_table[i] = x; } } return b; } uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c) { uint32_t n, b; if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c]; if (k == (uint32_t)(-1)) return 0; if (k >= bwt->primary) --k; // because $ is not in bwt n = bwt->occ[k/16<<2|c]; b = bwt->bwt[k/16] & ~((1U<<((15-(k&15))<<1)) - 1); n += (bwt->cnt_table[b&0xff] + bwt->cnt_table[b>>8&0xff] + bwt->cnt_table[b>>16&0xff] + bwt->cnt_table[b>>24]) >> (c<<3) & 0xff; 
if (c == 0) n -= 15 - (k&15); // corrected for the masked bits return n; } void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]) { uint32_t x, b; if (k == (uint32_t)(-1)) { memset(cnt, 0, 16); return; } if (k >= bwt->primary) --k; // because $ is not in bwt memcpy(cnt, bwt->occ + (k>>4<<2), 16); b = bwt->bwt[k>>4] & ~((1U<<((~k&15)<<1)) - 1); x = bwt->cnt_table[b&0xff] + bwt->cnt_table[b>>8&0xff] + bwt->cnt_table[b>>16&0xff] + bwt->cnt_table[b>>24]; x -= 15 - (k&15); cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24; } void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]) { bwtl_occ4(bwt, k, cntk); bwtl_occ4(bwt, l, cntl); } void bwtl_destroy(bwtl_t *bwt) { if (bwt) { free(bwt->occ); free(bwt->bwt); free(bwt->sa); free(bwt); } } bwa-0.7.17/bwt_lite.h000066400000000000000000000012151317342117100143400ustar00rootroot00000000000000#ifndef BWT_LITE_H_ #define BWT_LITE_H_ #include typedef struct { uint32_t seq_len, bwt_size, n_occ; uint32_t primary; uint32_t *bwt, *occ, *sa, L2[5]; uint32_t cnt_table[256]; } bwtl_t; #define bwtl_B0(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3) #ifdef __cplusplus extern "C" { #endif bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq); uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c); void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]); void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]); void bwtl_destroy(bwtl_t *bwt); #ifdef __cplusplus } #endif #endif bwa-0.7.17/bwtaln.c000066400000000000000000000254111317342117100140150ustar00rootroot00000000000000#include #include #include #include #include #include #include #ifdef HAVE_CONFIG_H #include "config.h" #endif #include "bwtaln.h" #include "bwtgap.h" #include "utils.h" #include "bwa.h" #ifdef HAVE_PTHREAD #include #endif #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif gap_opt_t *gap_init_opt() { gap_opt_t *o; o = (gap_opt_t*)calloc(1, 
sizeof(gap_opt_t)); /* IMPORTANT: s_mm*10 should be about the average base error rate. Voilating this requirement will break pairing! */ o->s_mm = 3; o->s_gapo = 11; o->s_gape = 4; o->max_diff = -1; o->max_gapo = 1; o->max_gape = 6; o->indel_end_skip = 5; o->max_del_occ = 10; o->max_entries = 2000000; o->mode = BWA_MODE_GAPE | BWA_MODE_COMPREAD; o->seed_len = 32; o->max_seed_diff = 2; o->fnr = 0.04; o->n_threads = 1; o->max_top2 = 30; o->trim_qual = 0; return o; } int bwa_cal_maxdiff(int l, double err, double thres) { double elambda = exp(-l * err); double sum, y = 1.0; int k, x = 1; for (k = 1, sum = elambda; k < 1000; ++k) { y *= l * err; x *= k; sum += elambda * y / x; if (1.0 - sum < thres) return k; } return 2; } // width must be filled as zero int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width) { bwtint_t k, l, ok, ol; int i, bid; bid = 0; k = 0; l = bwt->seq_len; for (i = 0; i < len; ++i) { ubyte_t c = str[i]; if (c < 4) { bwt_2occ(bwt, k - 1, l, c, &ok, &ol); k = bwt->L2[c] + ok + 1; l = bwt->L2[c] + ol; } if (k > l || c > 3) { // then restart k = 0; l = bwt->seq_len; ++bid; } width[i].w = l - k + 1; width[i].bid = bid; } width[len].w = 0; width[len].bid = ++bid; return bid; } void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt) { int i, j, max_l = 0, max_len; gap_stack_t *stack; bwt_width_t *w, *seed_w; gap_opt_t local_opt = *opt; // initiate priority stack for (i = max_len = 0; i != n_seqs; ++i) if (seqs[i].len > max_len) max_len = seqs[i].len; if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(max_len, BWA_AVG_ERR, opt->fnr); if (local_opt.max_diff < local_opt.max_gapo) local_opt.max_gapo = local_opt.max_diff; stack = gap_init_stack(local_opt.max_diff, local_opt.max_gapo, local_opt.max_gape, &local_opt); seed_w = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t)); w = 0; for (i = 0; i != n_seqs; ++i) { bwa_seq_t *p = seqs + i; #ifdef HAVE_PTHREAD if (i % 
opt->n_threads != tid) continue; #endif p->sa = 0; p->type = BWA_TYPE_NO_MATCH; p->c1 = p->c2 = 0; p->n_aln = 0; p->aln = 0; if (max_l < p->len) { max_l = p->len; w = (bwt_width_t*)realloc(w, (max_l + 1) * sizeof(bwt_width_t)); memset(w, 0, (max_l + 1) * sizeof(bwt_width_t)); } bwt_cal_width(bwt, p->len, p->seq, w); if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(p->len, BWA_AVG_ERR, opt->fnr); local_opt.seed_len = opt->seed_len < p->len? opt->seed_len : 0x7fffffff; if (p->len > opt->seed_len) bwt_cal_width(bwt, opt->seed_len, p->seq + (p->len - opt->seed_len), seed_w); // core function for (j = 0; j < p->len; ++j) // we need to complement p->seq[j] = p->seq[j] > 3? 4 : 3 - p->seq[j]; p->aln = bwt_match_gap(bwt, p->len, p->seq, w, p->len <= opt->seed_len? 0 : seed_w, &local_opt, &p->n_aln, stack); //fprintf(stderr, "mm=%lld,ins=%lld,del=%lld,gapo=%lld\n", p->aln->n_mm, p->aln->n_ins, p->aln->n_del, p->aln->n_gapo); // clean up the unused data in the record free(p->name); free(p->seq); free(p->rseq); free(p->qual); p->name = 0; p->seq = p->rseq = p->qual = 0; } free(seed_w); free(w); gap_destroy_stack(stack); } #ifdef HAVE_PTHREAD typedef struct { int tid; bwt_t *bwt; int n_seqs; bwa_seq_t *seqs; const gap_opt_t *opt; } thread_aux_t; static void *worker(void *data) { thread_aux_t *d = (thread_aux_t*)data; bwa_cal_sa_reg_gap(d->tid, d->bwt, d->n_seqs, d->seqs, d->opt); return 0; } #endif bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa) { bwa_seqio_t *ks; if (mode & BWA_MODE_BAM) { // open BAM int which = 0; if (mode & BWA_MODE_BAM_SE) which |= 4; if (mode & BWA_MODE_BAM_READ1) which |= 1; if (mode & BWA_MODE_BAM_READ2) which |= 2; if (which == 0) which = 7; // then read all reads ks = bwa_bam_open(fn_fa, which); } else ks = bwa_seq_open(fn_fa); return ks; } void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) { int i, n_seqs; long long tot_seqs = 0; bwa_seq_t *seqs; bwa_seqio_t *ks; clock_t t; bwt_t *bwt; // initialization 
ks = bwa_open_reads(opt->mode, fn_fa); { // load BWT char *str = (char*)calloc(strlen(prefix) + 10, 1); strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); free(str); } // core loop err_fwrite(SAI_MAGIC, 1, 4, stdout); err_fwrite(opt, sizeof(gap_opt_t), 1, stdout); while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode, opt->trim_qual)) != 0) { tot_seqs += n_seqs; t = clock(); fprintf(stderr, "[bwa_aln_core] calculate SA coordinate... "); #ifdef HAVE_PTHREAD if (opt->n_threads <= 1) { // no multi-threading at all bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt); } else { pthread_t *tid; pthread_attr_t attr; thread_aux_t *data; int j; pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t)); tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); for (j = 0; j < opt->n_threads; ++j) { data[j].tid = j; data[j].bwt = bwt; data[j].n_seqs = n_seqs; data[j].seqs = seqs; data[j].opt = opt; pthread_create(&tid[j], &attr, worker, data + j); } for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0); free(data); free(tid); } #else bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt); #endif fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); fprintf(stderr, "[bwa_aln_core] write to the disk... 
"); for (i = 0; i < n_seqs; ++i) { bwa_seq_t *p = seqs + i; err_fwrite(&p->n_aln, 4, 1, stdout); if (p->n_aln) err_fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout); } fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); bwa_free_read_seq(n_seqs, seqs); fprintf(stderr, "[bwa_aln_core] %lld sequences have been processed.\n", tot_seqs); } // destroy bwt_destroy(bwt); bwa_seq_close(ks); } int bwa_aln(int argc, char *argv[]) { int c, opte = -1; gap_opt_t *opt; char *prefix; opt = gap_init_opt(); while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:LR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) { switch (c) { case 'n': if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1; else opt->max_diff = atoi(optarg), opt->fnr = -1.0; break; case 'o': opt->max_gapo = atoi(optarg); break; case 'e': opte = atoi(optarg); break; case 'M': opt->s_mm = atoi(optarg); break; case 'O': opt->s_gapo = atoi(optarg); break; case 'E': opt->s_gape = atoi(optarg); break; case 'd': opt->max_del_occ = atoi(optarg); break; case 'i': opt->indel_end_skip = atoi(optarg); break; case 'l': opt->seed_len = atoi(optarg); break; case 'k': opt->max_seed_diff = atoi(optarg); break; case 'm': opt->max_entries = atoi(optarg); break; case 't': opt->n_threads = atoi(optarg); break; case 'L': opt->mode |= BWA_MODE_LOGGAP; break; case 'R': opt->max_top2 = atoi(optarg); break; case 'q': opt->trim_qual = atoi(optarg); break; case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break; case 'f': xreopen(optarg, "wb", stdout); break; case 'b': opt->mode |= BWA_MODE_BAM; break; case '0': opt->mode |= BWA_MODE_BAM_SE; break; case '1': opt->mode |= BWA_MODE_BAM_READ1; break; case '2': opt->mode |= BWA_MODE_BAM_READ2; break; case 'I': opt->mode |= BWA_MODE_IL13; break; case 'Y': opt->mode |= BWA_MODE_CFY; break; case 'B': opt->mode |= atoi(optarg) << 24; break; default: return 1; } } if (opte > 0) { opt->max_gape = opte; opt->mode &= ~BWA_MODE_GAPE; } if (optind + 2 > argc) { fprintf(stderr, 
"\n"); fprintf(stderr, "Usage: bwa aln [options] \n\n"); fprintf(stderr, "Options: -n NUM max #diff (int) or missing prob under %.2f err rate (float) [%.2f]\n", BWA_AVG_ERR, opt->fnr); fprintf(stderr, " -o INT maximum number or fraction of gap opens [%d]\n", opt->max_gapo); fprintf(stderr, " -e INT maximum number of gap extensions, -1 for disabling long gaps [-1]\n"); fprintf(stderr, " -i INT do not put an indel within INT bp towards the ends [%d]\n", opt->indel_end_skip); fprintf(stderr, " -d INT maximum occurrences for extending a long deletion [%d]\n", opt->max_del_occ); fprintf(stderr, " -l INT seed length [%d]\n", opt->seed_len); fprintf(stderr, " -k INT maximum differences in the seed [%d]\n", opt->max_seed_diff); fprintf(stderr, " -m INT maximum entries in the queue [%d]\n", opt->max_entries); fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); fprintf(stderr, " -M INT mismatch penalty [%d]\n", opt->s_mm); fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->s_gapo); fprintf(stderr, " -E INT gap extension penalty [%d]\n", opt->s_gape); fprintf(stderr, " -R INT stop searching when there are >INT equally best hits [%d]\n", opt->max_top2); fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual); fprintf(stderr, " -f FILE file to write output to instead of stdout\n"); fprintf(stderr, " -B INT length of barcode\n"); fprintf(stderr, " -L log-scaled gap penalty for long deletions\n"); fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n"); fprintf(stderr, " -I the input is in the Illumina 1.3+ FASTQ-like format\n"); fprintf(stderr, " -b the input read file is in the BAM format\n"); fprintf(stderr, " -0 use single-end reads only (effective with -b)\n"); fprintf(stderr, " -1 use the 1st read in a pair (effective with -b)\n"); fprintf(stderr, " -2 use the 2nd read in a pair (effective with -b)\n"); fprintf(stderr, " -Y filter Casava-filtered sequences\n"); 
fprintf(stderr, "\n"); return 1; } if (opt->fnr > 0.0) { int i, k; for (i = 17, k = 0; i <= 250; ++i) { int l = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr); if (l != k) fprintf(stderr, "[bwa_aln] %dbp reads: max_diff = %d\n", i, l); k = l; } } if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { fprintf(stderr, "[%s] fail to locate the index\n", __func__); free(opt); return 1; } bwa_aln_core(prefix, argv[optind+1], opt); free(opt); free(prefix); return 0; } bwa-0.7.17/bwtaln.h000066400000000000000000000073711317342117100140270ustar00rootroot00000000000000#ifndef BWTALN_H #define BWTALN_H #include #include "bwt.h" #define BWA_TYPE_NO_MATCH 0 #define BWA_TYPE_UNIQUE 1 #define BWA_TYPE_REPEAT 2 #define BWA_TYPE_MATESW 3 #define SAM_FPD 1 // paired #define SAM_FPP 2 // properly paired #define SAM_FSU 4 // self-unmapped #define SAM_FMU 8 // mate-unmapped #define SAM_FSR 16 // self on the reverse strand #define SAM_FMR 32 // mate on the reverse strand #define SAM_FR1 64 // this is read one #define SAM_FR2 128 // this is read two #define SAM_FSC 256 // secondary alignment #define BWA_AVG_ERR 0.02 #define BWA_MIN_RDLEN 35 // for read trimming #define BWA_MAX_BCLEN 63 // maximum barcode length; 127 is the maximum #ifndef bns_pac #define bns_pac(pac, k) ((pac)[(k)>>2] >> ((~(k)&3)<<1) & 3) #endif #define FROM_M 0 #define FROM_I 1 #define FROM_D 2 #define FROM_S 3 #define SAI_MAGIC "SAI\1" typedef struct { bwtint_t w; int bid; } bwt_width_t; typedef struct { uint64_t n_mm:8, n_gapo:8, n_gape:8, score:20, n_ins:10, n_del:10; bwtint_t k, l; } bwt_aln1_t; typedef uint16_t bwa_cigar_t; /* rgoya: If changing order of bytes, beware of operations like: * s->cigar[0] += s->full_len - s->len; */ #define CIGAR_OP_SHIFT 14 #define CIGAR_LN_MASK 0x3fff #define __cigar_op(__cigar) ((__cigar)>>CIGAR_OP_SHIFT) #define __cigar_len(__cigar) ((__cigar)&CIGAR_LN_MASK) #define __cigar_create(__op, __len) ((__op)< #include #include #include "bwtgap.h" #include "bwtaln.h" #ifdef 
USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif #define STATE_M 0 #define STATE_I 1 #define STATE_D 2 #define aln_score(m,o,e,p) ((m)*(p)->s_mm + (o)*(p)->s_gapo + (e)*(p)->s_gape) gap_stack_t *gap_init_stack2(int max_score) { gap_stack_t *stack; stack = (gap_stack_t*)calloc(1, sizeof(gap_stack_t)); stack->n_stacks = max_score; stack->stacks = (gap_stack1_t*)calloc(stack->n_stacks, sizeof(gap_stack1_t)); return stack; } gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt) { return gap_init_stack2(aln_score(max_mm+1, max_gapo+1, max_gape+1, opt)); } void gap_destroy_stack(gap_stack_t *stack) { int i; for (i = 0; i != stack->n_stacks; ++i) free(stack->stacks[i].stack); free(stack->stacks); free(stack); } static void gap_reset_stack(gap_stack_t *stack) { int i; for (i = 0; i != stack->n_stacks; ++i) stack->stacks[i].n_entries = 0; stack->best = stack->n_stacks; stack->n_entries = 0; } static inline void gap_push(gap_stack_t *stack, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape, int n_ins, int n_del, int state, int is_diff, const gap_opt_t *opt) { int score; gap_entry_t *p; gap_stack1_t *q; score = aln_score(n_mm, n_gapo, n_gape, opt); q = stack->stacks + score; if (q->n_entries == q->m_entries) { q->m_entries = q->m_entries? q->m_entries<<1 : 4; q->stack = (gap_entry_t*)realloc(q->stack, sizeof(gap_entry_t) * q->m_entries); } p = q->stack + q->n_entries; p->info = (uint32_t)score<<21 | i; p->k = k; p->l = l; p->n_mm = n_mm; p->n_gapo = n_gapo; p->n_gape = n_gape; p->n_ins = n_ins; p->n_del = n_del; p->state = state; p->last_diff_pos = is_diff? 
i : 0; ++(q->n_entries); ++(stack->n_entries); if (stack->best > score) stack->best = score; } static inline void gap_pop(gap_stack_t *stack, gap_entry_t *e) { gap_stack1_t *q; q = stack->stacks + stack->best; *e = q->stack[q->n_entries - 1]; --(q->n_entries); --(stack->n_entries); if (q->n_entries == 0 && stack->n_entries) { // reset best int i; for (i = stack->best + 1; i < stack->n_stacks; ++i) if (stack->stacks[i].n_entries != 0) break; stack->best = i; } else if (stack->n_entries == 0) stack->best = stack->n_stacks; } static inline void gap_shadow(int x, int len, bwtint_t max, int last_diff_pos, bwt_width_t *w) { int i, j; for (i = j = 0; i < last_diff_pos; ++i) { if (w[i].w > x) w[i].w -= x; else if (w[i].w == x) { w[i].bid = 1; w[i].w = max - (++j); } // else should not happen } } static inline int int_log2(uint32_t v) { int c = 0; if (v & 0xffff0000u) { v >>= 16; c |= 16; } if (v & 0xff00) { v >>= 8; c |= 8; } if (v & 0xf0) { v >>= 4; c |= 4; } if (v & 0xc) { v >>= 2; c |= 2; } if (v & 0x2) c |= 1; return c; } bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *width, bwt_width_t *seed_width, const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack) { // $seq is the reverse complement of the input read int best_score = aln_score(opt->max_diff+1, opt->max_gapo+1, opt->max_gape+1, opt); int best_diff = opt->max_diff + 1, max_diff = opt->max_diff; int best_cnt = 0; int max_entries = 0, j, _j, n_aln, m_aln; bwt_aln1_t *aln; m_aln = 4; n_aln = 0; aln = (bwt_aln1_t*)calloc(m_aln, sizeof(bwt_aln1_t)); // check whether there are too many N for (j = _j = 0; j < len; ++j) if (seq[j] > 3) ++_j; if (_j > max_diff) { *_n_aln = n_aln; return aln; } //for (j = 0; j != len; ++j) printf("#0 %d: [%d,%u]\t[%d,%u]\n", j, w[0][j].bid, w[0][j].w, w[1][j].bid, w[1][j].w); gap_reset_stack(stack); // reset stack gap_push(stack, len, 0, bwt->seq_len, 0, 0, 0, 0, 0, 0, 0, opt); while (stack->n_entries) { gap_entry_t e; int i, m, m_seed = 0, hit_found, 
allow_diff, allow_M, tmp; bwtint_t k, l, cnt_k[4], cnt_l[4], occ; if (max_entries < stack->n_entries) max_entries = stack->n_entries; if (stack->n_entries > opt->max_entries) break; gap_pop(stack, &e); // get the best entry k = e.k; l = e.l; // SA interval i = e.info&0xffff; // length if (!(opt->mode & BWA_MODE_NONSTOP) && e.info>>21 > best_score + opt->s_mm) break; // no need to proceed m = max_diff - (e.n_mm + e.n_gapo); if (opt->mode & BWA_MODE_GAPE) m -= e.n_gape; if (m < 0) continue; if (seed_width) { // apply seeding m_seed = opt->max_seed_diff - (e.n_mm + e.n_gapo); if (opt->mode & BWA_MODE_GAPE) m_seed -= e.n_gape; } //printf("#1\t[%d,%d,%d,%c]\t[%d,%d,%d]\t[%u,%u]\t[%u,%u]\t%d\n", stack->n_entries, a, i, "MID"[e.state], e.n_mm, e.n_gapo, e.n_gape, width[i-1].bid, width[i-1].w, k, l, e.last_diff_pos); if (i > 0 && m < width[i-1].bid) continue; // check whether a hit is found hit_found = 0; if (i == 0) hit_found = 1; else if (m == 0 && (e.state == STATE_M || (opt->mode&BWA_MODE_GAPE) || e.n_gape == opt->max_gape)) { // no diff allowed if (bwt_match_exact_alt(bwt, i, seq, &k, &l)) hit_found = 1; else continue; // no hit, skip } if (hit_found) { // action for found hits int score = aln_score(e.n_mm, e.n_gapo, e.n_gape, opt); int do_add = 1; //printf("#2 hits found: %d:(%u,%u)\n", e.n_mm+e.n_gapo, k, l); if (n_aln == 0) { best_score = score; best_diff = e.n_mm + e.n_gapo; if (opt->mode & BWA_MODE_GAPE) best_diff += e.n_gape; if (!(opt->mode & BWA_MODE_NONSTOP)) max_diff = (best_diff + 1 > opt->max_diff)? opt->max_diff : best_diff + 1; // top2 behaviour } if (score == best_score) best_cnt += l - k + 1; else if (best_cnt > opt->max_top2) break; // top2b behaviour if (e.n_gapo) { // check whether the hit has been found. 
this may happen when a gap occurs in a tandem repeat for (j = 0; j != n_aln; ++j) if (aln[j].k == k && aln[j].l == l) break; if (j < n_aln) do_add = 0; } if (do_add) { // append bwt_aln1_t *p; gap_shadow(l - k + 1, len, bwt->seq_len, e.last_diff_pos, width); if (n_aln == m_aln) { m_aln <<= 1; aln = (bwt_aln1_t*)realloc(aln, m_aln * sizeof(bwt_aln1_t)); memset(aln + m_aln/2, 0, m_aln/2*sizeof(bwt_aln1_t)); } p = aln + n_aln; p->n_mm = e.n_mm; p->n_gapo = e.n_gapo; p->n_gape = e.n_gape; p->n_ins = e.n_ins; p->n_del = e.n_del; p->k = k; p->l = l; p->score = score; //fprintf(stderr, "*** n_mm=%d,n_gapo=%d,n_gape=%d,n_ins=%d,n_del=%d\n", e.n_mm, e.n_gapo, e.n_gape, e.n_ins, e.n_del); ++n_aln; } continue; } --i; bwt_2occ4(bwt, k - 1, l, cnt_k, cnt_l); // retrieve Occ values occ = l - k + 1; // test whether diff is allowed allow_diff = allow_M = 1; if (i > 0) { int ii = i - (len - opt->seed_len); if (width[i-1].bid > m-1) allow_diff = 0; else if (width[i-1].bid == m-1 && width[i].bid == m-1 && width[i-1].w == width[i].w) allow_M = 0; if (seed_width && ii > 0) { if (seed_width[ii-1].bid > m_seed-1) allow_diff = 0; else if (seed_width[ii-1].bid == m_seed-1 && seed_width[ii].bid == m_seed-1 && seed_width[ii-1].w == seed_width[ii].w) allow_M = 0; } } // indels tmp = (opt->mode & BWA_MODE_LOGGAP)? 
int_log2(e.n_gape + e.n_gapo)/2+1 : e.n_gapo + e.n_gape; if (allow_diff && i >= opt->indel_end_skip + tmp && len - i >= opt->indel_end_skip + tmp) { if (e.state == STATE_M) { // gap open if (e.n_gapo < opt->max_gapo) { // gap open is allowed // insertion gap_push(stack, i, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, e.n_ins + 1, e.n_del, STATE_I, 1, opt); // deletion for (j = 0; j != 4; ++j) { k = bwt->L2[j] + cnt_k[j] + 1; l = bwt->L2[j] + cnt_l[j]; if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, e.n_ins, e.n_del + 1, STATE_D, 1, opt); } } } else if (e.state == STATE_I) { // extention of an insertion if (e.n_gape < opt->max_gape) // gap extention is allowed gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, e.n_ins + 1, e.n_del, STATE_I, 1, opt); } else if (e.state == STATE_D) { // extention of a deletion if (e.n_gape < opt->max_gape) { // gap extention is allowed if (e.n_gape + e.n_gapo < max_diff || occ < opt->max_del_occ) { for (j = 0; j != 4; ++j) { k = bwt->L2[j] + cnt_k[j] + 1; l = bwt->L2[j] + cnt_l[j]; if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, e.n_ins, e.n_del + 1, STATE_D, 1, opt); } } } } } // mismatches if (allow_diff && allow_M) { // mismatch is allowed for (j = 1; j <= 4; ++j) { int c = (seq[i] + j) & 3; int is_mm = (j != 4 || seq[i] > 3); k = bwt->L2[c] + cnt_k[c] + 1; l = bwt->L2[c] + cnt_l[c]; if (k <= l) gap_push(stack, i, k, l, e.n_mm + is_mm, e.n_gapo, e.n_gape, e.n_ins, e.n_del, STATE_M, is_mm, opt); } } else if (seq[i] < 4) { // try exact match only int c = seq[i] & 3; k = bwt->L2[c] + cnt_k[c] + 1; l = bwt->L2[c] + cnt_l[c]; if (k <= l) gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape, e.n_ins, e.n_del, STATE_M, 0, opt); } } *_n_aln = n_aln; //fprintf(stderr, "max_entries = %d\n", max_entries); return aln; } bwa-0.7.17/bwtgap.h000066400000000000000000000017441317342117100140220ustar00rootroot00000000000000#ifndef BWTGAP_H_ #define BWTGAP_H_ #include "bwt.h" #include "bwtaln.h" typedef 
struct { // recursion stack uint32_t info; // score<<21 | i uint32_t n_mm:8, n_gapo:8, n_gape:8, state:2, n_seed_mm:6; uint32_t n_ins:16, n_del:16; int last_diff_pos; bwtint_t k, l; // (k,l) is the SA region of [i,n-1] } gap_entry_t; typedef struct { int n_entries, m_entries; gap_entry_t *stack; } gap_stack1_t; typedef struct { int n_stacks, best, n_entries; gap_stack1_t *stacks; } gap_stack_t; #ifdef __cplusplus extern "C" { #endif gap_stack_t *gap_init_stack2(int max_score); gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt); void gap_destroy_stack(gap_stack_t *stack); bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *w, bwt_width_t *seed_w, const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack); void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); #ifdef __cplusplus } #endif #endif bwa-0.7.17/bwtindex.c000066400000000000000000000231561317342117100143560ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008 Genome Research Ltd (GRL). Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* Contact: Heng Li */ #include #include #include #include #include #include #include "bntseq.h" #include "bwa.h" #include "bwt.h" #include "utils.h" #include "rle.h" #include "rope.h" #ifdef _DIVBWT #include "divsufsort.h" #endif #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif int is_bwt(ubyte_t *T, int n); int64_t bwa_seq_len(const char *fn_pac) { FILE *fp; int64_t pac_len; ubyte_t c; fp = xopen(fn_pac, "rb"); err_fseek(fp, -1, SEEK_END); pac_len = err_ftell(fp); err_fread_noeof(&c, 1, 1, fp); err_fclose(fp); return (pac_len - 1) * 4 + (int)c; } bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is) { bwt_t *bwt; ubyte_t *buf, *buf2; int64_t i, pac_size; FILE *fp; // initialization bwt = (bwt_t*)calloc(1, sizeof(bwt_t)); bwt->seq_len = bwa_seq_len(fn_pac); bwt->bwt_size = (bwt->seq_len + 15) >> 4; fp = xopen(fn_pac, "rb"); // prepare sequence pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 
0 : 1); buf2 = (ubyte_t*)calloc(pac_size, 1); err_fread_noeof(buf2, 1, pac_size, fp); err_fclose(fp); memset(bwt->L2, 0, 5 * 4); buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1); for (i = 0; i < bwt->seq_len; ++i) { buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3; ++bwt->L2[1+buf[i]]; } for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1]; free(buf2); // Burrows-Wheeler Transform if (use_is) { bwt->primary = is_bwt(buf, bwt->seq_len); } else { rope_t *r; int64_t x; rpitr_t itr; const uint8_t *blk; r = rope_init(ROPE_DEF_MAX_NODES, ROPE_DEF_BLOCK_LEN); for (i = bwt->seq_len - 1, x = 0; i >= 0; --i) { int c = buf[i] + 1; x = rope_insert_run(r, x, c, 1, 0) + 1; while (--c >= 0) x += r->c[c]; } bwt->primary = x; rope_itr_first(r, &itr); x = 0; while ((blk = rope_itr_next_block(&itr)) != 0) { const uint8_t *q = blk + 2, *end = blk + 2 + *rle_nptr(blk); while (q < end) { int c = 0; int64_t l; rle_dec1(q, c, l); for (i = 0; i < l; ++i) buf[x++] = c - 1; } } rope_destroy(r); } bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4); for (i = 0; i < bwt->seq_len; ++i) bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1); free(buf); return bwt; } int bwa_pac2bwt(int argc, char *argv[]) // the "pac2bwt" command; IMPORTANT: bwt generated at this step CANNOT be used with BWA. bwtupdate is required! 
{ bwt_t *bwt; int c, use_is = 1; while ((c = getopt(argc, argv, "d")) >= 0) { switch (c) { case 'd': use_is = 0; break; default: return 1; } } if (optind + 2 > argc) { fprintf(stderr, "Usage: bwa pac2bwt [-d] \n"); return 1; } bwt = bwt_pac2bwt(argv[optind], use_is); bwt_dump_bwt(argv[optind+1], bwt); bwt_destroy(bwt); return 0; } #define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3) void bwt_bwtupdate_core(bwt_t *bwt) { bwtint_t i, k, c[4], n_occ; uint32_t *buf; n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt c[0] = c[1] = c[2] = c[3] = 0; for (i = k = 0; i < bwt->seq_len; ++i) { if (i % OCC_INTERVAL == 0) { memcpy(buf + k, c, sizeof(bwtint_t) * 4); k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4) } if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; // 16 == sizeof(uint32_t)/2 ++c[bwt_B00(bwt, i)]; } // the last element memcpy(buf + k, c, sizeof(bwtint_t) * 4); xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size"); // update bwt free(bwt->bwt); bwt->bwt = buf; } int bwa_bwtupdate(int argc, char *argv[]) // the "bwtupdate" command { bwt_t *bwt; if (argc != 2) { fprintf(stderr, "Usage: bwa bwtupdate \n"); return 1; } bwt = bwt_restore_bwt(argv[1]); bwt_bwtupdate_core(bwt); bwt_dump_bwt(argv[1], bwt); bwt_destroy(bwt); return 0; } int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command { bwt_t *bwt; int c, sa_intv = 32; while ((c = getopt(argc, argv, "i:")) >= 0) { switch (c) { case 'i': sa_intv = atoi(optarg); break; default: return 1; } } if (optind + 2 > argc) { fprintf(stderr, "Usage: bwa bwt2sa [-i %d] \n", sa_intv); return 1; } bwt = bwt_restore_bwt(argv[optind]); bwt_cal_sa(bwt, sa_intv); bwt_dump_sa(argv[optind+1], bwt); bwt_destroy(bwt); return 0; } int bwa_index(int argc, char *argv[]) // the "index" command { int c, algo_type = BWTALGO_AUTO, is_64 = 0, block_size = 
10000000; char *prefix = 0, *str; while ((c = getopt(argc, argv, "6a:p:b:")) >= 0) { switch (c) { case 'a': // if -a is not set, algo_type will be determined later if (strcmp(optarg, "rb2") == 0) algo_type = BWTALGO_RB2; else if (strcmp(optarg, "bwtsw") == 0) algo_type = BWTALGO_BWTSW; else if (strcmp(optarg, "is") == 0) algo_type = BWTALGO_IS; else err_fatal(__func__, "unknown algorithm: '%s'.", optarg); break; case 'p': prefix = strdup(optarg); break; case '6': is_64 = 1; break; case 'b': block_size = strtol(optarg, &str, 10); if (*str == 'G' || *str == 'g') block_size *= 1024 * 1024 * 1024; else if (*str == 'M' || *str == 'm') block_size *= 1024 * 1024; else if (*str == 'K' || *str == 'k') block_size *= 1024; break; default: return 1; } } if (optind + 1 > argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: bwa index [options] \n\n"); fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw, is or rb2 [auto]\n"); fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n"); fprintf(stderr, " -b INT block size for the bwtsw algorithm (effective with -a bwtsw) [%d]\n", block_size); fprintf(stderr, " -6 index files named as .64.* instead of .* \n"); fprintf(stderr, "\n"); fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n"); fprintf(stderr, " `-a div' do not work not for long genomes.\n\n"); return 1; } if (prefix == 0) { prefix = malloc(strlen(argv[optind]) + 4); strcpy(prefix, argv[optind]); if (is_64) strcat(prefix, ".64"); } bwa_idx_build(argv[optind], prefix, algo_type, block_size); free(prefix); return 0; } int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_size) { extern void bwa_pac_rev_core(const char *fn, const char *fn_rev); char *str, *str2, *str3; clock_t t; int64_t l_pac; str = (char*)calloc(strlen(prefix) + 10, 1); str2 = (char*)calloc(strlen(prefix) + 10, 1); str3 = (char*)calloc(strlen(prefix) + 10, 1); { // nucleotide indexing gzFile fp = xzopen(fa, "r"); t = 
clock(); if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Pack FASTA... "); l_pac = bns_fasta2bntseq(fp, prefix, 0); if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); err_gzclose(fp); } if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT { strcpy(str, prefix); strcat(str, ".pac"); strcpy(str2, prefix); strcat(str2, ".bwt"); t = clock(); if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n"); if (algo_type == 2) bwt_bwtgen2(str, str2, block_size); else if (algo_type == 1 || algo_type == 3) { bwt_t *bwt; bwt = bwt_pac2bwt(str, algo_type == 3); bwt_dump_bwt(str2, bwt); bwt_destroy(bwt); } if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC); } { bwt_t *bwt; strcpy(str, prefix); strcat(str, ".bwt"); t = clock(); if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Update BWT... "); bwt = bwt_restore_bwt(str); bwt_bwtupdate_core(bwt); bwt_dump_bwt(str, bwt); bwt_destroy(bwt); if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); } { gzFile fp = xzopen(fa, "r"); t = clock(); if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Pack forward-only FASTA... "); l_pac = bns_fasta2bntseq(fp, prefix, 1); if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); err_gzclose(fp); } { bwt_t *bwt; strcpy(str, prefix); strcat(str, ".bwt"); strcpy(str3, prefix); strcat(str3, ".sa"); t = clock(); if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Construct SA from BWT and Occ... 
"); bwt = bwt_restore_bwt(str); bwt_cal_sa(bwt, 32); bwt_dump_sa(str3, bwt); bwt_destroy(bwt); if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); } free(str3); free(str2); free(str); return 0; } bwa-0.7.17/bwtsw2.h000066400000000000000000000027641317342117100137710ustar00rootroot00000000000000#ifndef LH3_BWTSW2_H #define LH3_BWTSW2_H #include #include "bntseq.h" #include "bwt_lite.h" #include "bwt.h" #define BSW2_FLAG_MATESW 0x100 #define BSW2_FLAG_TANDEM 0x200 #define BSW2_FLAG_MOVED 0x400 #define BSW2_FLAG_RESCUED 0x800 typedef struct { int skip_sw:8, cpy_cmt:8, hard_clip:16; int a, b, q, r, t, qr, bw, max_ins, max_chain_gap; int z, is, t_seeds, multi_2nd; float mask_level, coef; int n_threads, chunk_size; } bsw2opt_t; typedef struct { bwtint_t k, l; uint32_t flag:18, n_seeds:13, is_rev:1; int len, G, G2; int beg, end; } bsw2hit_t; typedef struct { int flag, nn, n_cigar, chr, pos, qual, mchr, mpos, pqual, isize, nm; uint32_t *cigar; } bsw2aux_t; typedef struct { int n, max; bsw2hit_t *hits; bsw2aux_t *aux; } bwtsw2_t; typedef struct { void *stack; int max_l; uint8_t *aln_mem; } bsw2global_t; typedef struct { int l, tid; char *name, *seq, *qual, *sam, *comment; } bsw2seq1_t; #ifdef __cplusplus extern "C" { #endif bsw2opt_t *bsw2_init_opt(); bwtsw2_t **bsw2_core(const bntseq_t *bns, const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool); void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2); void bsw2_destroy(bwtsw2_t *b); bsw2global_t *bsw2_global_init(); void bsw2_global_destroy(bsw2global_t *_pool); void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hit); #ifdef __cplusplus } #endif #endif bwa-0.7.17/bwtsw2_aux.c000066400000000000000000000606041317342117100146360ustar00rootroot00000000000000#include #include #include #ifdef HAVE_CONFIG_H #include "config.h" #endif #ifdef 
HAVE_PTHREAD #include #endif #include "bntseq.h" #include "bwt_lite.h" #include "utils.h" #include "bwtsw2.h" #include "kstring.h" #include "bwa.h" #include "ksw.h" #include "kseq.h" KSEQ_DECLARE(gzFile) #include "ksort.h" #define __left_lt(a, b) ((a).end > (b).end) KSORT_INIT(hit, bsw2hit_t, __left_lt) #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif extern unsigned char nst_nt4_table[256]; unsigned char nt_comp_table[256] = { 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','T','V','G', 'H','N','N','C', 'D','N','N','M', 'N','K','N','N', 'N','N','Y','S', 'A','N','B','W', 'X','R','N','N', 'N','N','N','N', 'n','t','v','g', 'h','n','n','c', 'd','n','n','m', 'n','k','n','n', 'n','n','y','s', 'a','n','b','w', 'x','r','n}; extern int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS); extern int bsw2_resolve_query_overlaps(bwtsw2_t *b, float mask_level); bsw2opt_t *bsw2_init_opt() { bsw2opt_t *o = (bsw2opt_t*)calloc(1, sizeof(bsw2opt_t)); o->a = 1; o->b = 3; o->q = 5; o->r = 2; o->t = 30; o->bw = 50; o->max_ins = 20000; o->z = 1; o->is = 3; o->t_seeds = 5; o->hard_clip = 0; o->skip_sw = 0; o->mask_level = 0.50f; o->coef = 5.5f; o->qr = o->q + o->r; o->n_threads = 1; o->chunk_size = 10000000; o->max_chain_gap = 10000; o->cpy_cmt = 0; return o; } void bsw2_destroy(bwtsw2_t *b) { int i; if (b == 0) return; if (b->aux) for (i = 0; i < b->n; ++i) free(b->aux[i].cigar); free(b->aux); free(b->hits); free(b); } bwtsw2_t *bsw2_dup_no_cigar(const bwtsw2_t *b) { bwtsw2_t *p; p = calloc(1, sizeof(bwtsw2_t)); p->max = p->n = b->n; if (b->n) { kroundup32(p->max); p->hits = calloc(p->max, sizeof(bsw2hit_t)); memcpy(p->hits, b->hits, p->n * sizeof(bsw2hit_t)); } return p; } #define __gen_ap(par, opt) do { \ int i; \ for (i 
= 0; i < 25; ++i) (par).matrix[i] = -(opt)->b; \ for (i = 0; i < 4; ++i) (par).matrix[i*5+i] = (opt)->a; \ (par).gap_open = (opt)->q; (par).gap_ext = (opt)->r; \ (par).gap_end = (opt)->r; \ (par).row = 5; (par).band_width = opt->bw; \ } while (0) void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem) { int i; bwtint_t k; uint8_t *target = 0, *query; int8_t mat[25]; bwa_fill_scmat(opt->a, opt->b, mat); query = calloc(lq, 1); // sort according to the descending order of query end ks_introsort(hit, b->n, b->hits); target = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1); // reverse _query for (i = 0; i < lq; ++i) query[lq - i - 1] = _query[i]; // core loop for (i = 0; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; int lt = ((p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq; int score, j, qle, tle; p->n_seeds = 1; if (p->l || p->k == 0) continue; for (j = score = 0; j < i; ++j) { bsw2hit_t *q = b->hits + j; if (q->beg <= p->beg && q->k <= p->k && q->k + q->len >= p->k + p->len) { if (q->n_seeds < (1<<13) - 2) ++q->n_seeds; ++score; } } if (score) continue; if (lt > p->k) lt = p->k; for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered! 
target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; lt = j; score = ksw_extend(p->beg, &query[lq - p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, p->G, &qle, &tle, 0, 0, 0); if (score > p->G) { // extensible p->G = score; p->k -= tle; p->len += tle; p->beg -= qle; } } free(query); free(target); } void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem) { int i; bwtint_t k; uint8_t *target; int8_t mat[25]; bwa_fill_scmat(opt->a, opt->b, mat); target = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1); for (i = 0; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; int lt = ((lq - p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq; int j, score, qle, tle; if (p->l) continue; for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k) target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; lt = j; score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, 1, &qle, &tle, 0, 0, 0) - 1; // if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G); if (score >= p->G) { p->G = score; p->len = tle; p->end = p->beg + qle; } } free(target); } /* generate CIGAR array(s) in b->cigar[] */ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], int64_t l_pac, const uint8_t *pac, bwtsw2_t *b, const char *name) { int i; int8_t mat[25]; bwa_fill_scmat(opt->a, opt->b, mat); for (i = 0; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; bsw2aux_t *q = b->aux + i; uint8_t *query; int beg, end, score; if (p->l) continue; beg = (p->flag & 0x10)? lq - p->end : p->beg; end = (p->flag & 0x10)? lq - p->beg : p->end; query = seq[(p->flag & 0x10)? 
1 : 0] + beg; q->cigar = bwa_gen_cigar(mat, opt->q, opt->r, opt->bw, l_pac, pac, end - beg, query, p->k, p->k + p->len, &score, &q->n_cigar, &q->nm); #if 0 if (name && score != p->G) { // debugging only int j, glen = 0; for (j = 0; j < q->n_cigar; ++j) if ((q->cigar[j]&0xf) == 1 || (q->cigar[j]&0xf) == 2) glen += q->cigar[j]>>4; fprintf(stderr, "[E::%s] %s - unequal score: %d != %d; (qlen, aqlen, arlen, glen, bw) = (%d, %d, %d, %d, %d)\n", __func__, name, score, p->G, lq, end - beg, p->len, glen, opt->bw); } #endif if (q->cigar && (beg != 0 || end < lq)) { // write soft clipping q->cigar = realloc(q->cigar, 4 * (q->n_cigar + 2)); if (beg != 0) { memmove(q->cigar + 1, q->cigar, q->n_cigar * 4); q->cigar[0] = beg<<4 | 4; ++q->n_cigar; } if (end < lq) { q->cigar[q->n_cigar] = (lq - end)<<4 | 4; ++q->n_cigar; } } } } /* this is for the debugging purpose only */ void bsw2_debug_hits(const bwtsw2_t *b) { int i; printf("# raw hits: %d\n", b->n); for (i = 0; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; if (p->G > 0) printf("G=%d, G2=%d, len=%d, [%d,%d), k=%lu, l=%lu, #seeds=%d, is_rev=%d\n", p->G, p->G2, p->len, p->beg, p->end, (long)p->k, (long)p->l, p->n_seeds, p->is_rev); } } static void merge_hits(bwtsw2_t *b[2], int l, int is_reverse) { int i; if (b[0]->n + b[1]->n > b[0]->max) { b[0]->max = b[0]->n + b[1]->n; b[0]->hits = realloc(b[0]->hits, b[0]->max * sizeof(bsw2hit_t)); } for (i = 0; i < b[1]->n; ++i) { bsw2hit_t *p = b[0]->hits + b[0]->n + i; *p = b[1]->hits[i]; if (is_reverse) { int x = p->beg; p->beg = l - p->end; p->end = l - x; p->flag |= 0x10; } } b[0]->n += b[1]->n; bsw2_destroy(b[1]); b[1] = 0; } /* seq[0] is the forward sequence and seq[1] is the reverse complement. 
*/ static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, int l, uint8_t *seq[2], bsw2global_t *pool) { extern void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]); bwtsw2_t *b[2], **bb[2], **_b, *p; int k, j; bwtl_t *query; query = bwtl_seq2bwtl(l, seq[0]); _b = bsw2_core(bns, opt, query, target, pool); bwtl_destroy(query); for (k = 0; k < 2; ++k) { bb[k] = calloc(2, sizeof(void*)); bb[k][0] = calloc(1, sizeof(bwtsw2_t)); bb[k][1] = calloc(1, sizeof(bwtsw2_t)); } for (k = 0; k < 2; ++k) { // separate _b into bb[2] based on the strand for (j = 0; j < _b[k]->n; ++j) { bsw2hit_t *q; p = bb[_b[k]->hits[j].is_rev][k]; if (p->n == p->max) { p->max = p->max? p->max<<1 : 8; p->hits = realloc(p->hits, p->max * sizeof(bsw2hit_t)); } q = &p->hits[p->n++]; *q = _b[k]->hits[j]; if (_b[k]->hits[j].is_rev) { int x = q->beg; q->beg = l - q->end; q->end = l - x; } } } b[0] = bb[0][1]; b[1] = bb[1][1]; // bb[*][1] are "narrow SA hits" bsw2_chain_filter(opt, l, b); // NB: only unique seeds are chained for (k = 0; k < 2; ++k) { bsw2_extend_left(opt, bb[k][1], seq[k], l, pac, bns->l_pac, pool->aln_mem); merge_hits(bb[k], l, 0); // bb[k][1] is merged to bb[k][0] here bsw2_resolve_duphits(0, 0, bb[k][0], 0); bsw2_extend_rght(opt, bb[k][0], seq[k], l, pac, bns->l_pac, pool->aln_mem); bsw2_resolve_duphits(0, 0, bb[k][0], 0); b[k] = bb[k][0]; free(bb[k]); } merge_hits(b, l, 1); // again, b[1] is merged to b[0] bsw2_resolve_query_overlaps(b[0], opt->mask_level); bsw2_destroy(_b[0]); bsw2_destroy(_b[1]); free(_b); return b[0]; } /* set ->flag to records the origin of the hit (to forward bwt or reverse bwt) */ static void flag_fr(bwtsw2_t *b[2]) { int i, j; for (i = 0; i < b[0]->n; ++i) { bsw2hit_t *p = b[0]->hits + i; p->flag |= 0x10000; } for (i = 0; i < b[1]->n; ++i) { bsw2hit_t *p = b[1]->hits + i; p->flag |= 0x20000; } for (i = 0; i < b[0]->n; ++i) { bsw2hit_t *p = b[0]->hits + i; for (j = 0; j < b[1]->n; ++j) { 
bsw2hit_t *q = b[1]->hits + j; if (q->beg == p->beg && q->end == p->end && q->k == p->k && q->len == p->len && q->G == p->G) { q->flag |= 0x30000; p->flag |= 0x30000; break; } } } } typedef struct { int n, max; bsw2seq1_t *seq; } bsw2seq_t; static int fix_cigar(const bntseq_t *bns, bsw2hit_t *p, int n_cigar, uint32_t *cigar) { // FIXME: this routine does not work if the query bridge three reference sequences int32_t coor, refl, lq; int x, y, i, seqid; bns_cnt_ambi(bns, p->k, p->len, &seqid); coor = p->k - bns->anns[seqid].offset; refl = bns->anns[seqid].len; x = coor; y = 0; // test if the alignment goes beyond the boundary for (i = 0; i < n_cigar; ++i) { int op = cigar[i]&0xf, ln = cigar[i]>>4; if (op == 1 || op == 4 || op == 5) y += ln; else if (op == 2) x += ln; else x += ln, y += ln; } lq = y; // length of the query sequence if (x > refl) { // then fix it int j, nc, mq[2], nlen[2]; uint32_t *cn; bwtint_t kk = 0; nc = mq[0] = mq[1] = nlen[0] = nlen[1] = 0; cn = calloc(n_cigar + 3, 4); x = coor; y = 0; for (i = j = 0; i < n_cigar; ++i) { int op = cigar[i]&0xf, ln = cigar[i]>>4; if (op == 4 || op == 5 || op == 1) { // ins or clipping y += ln; cn[j++] = cigar[i]; } else if (op == 2) { // del if (x + ln >= refl && nc == 0) { cn[j++] = (uint32_t)(lq - y)<<4 | 4; nc = j; cn[j++] = (uint32_t)y<<4 | 4; kk = p->k + (x + ln - refl); nlen[0] = x - coor; nlen[1] = p->len - nlen[0] - ln; } else cn[j++] = cigar[i]; x += ln; } else if (op == 0) { // match if (x + ln >= refl && nc == 0) { // FIXME: not consider a special case where a split right between M and I cn[j++] = (uint32_t)(refl - x)<<4 | 0; // write M cn[j++] = (uint32_t)(lq - y - (refl - x))<<4 | 4; // write S nc = j; mq[0] += refl - x; cn[j++] = (uint32_t)(y + (refl - x))<<4 | 4; if (x + ln - refl) cn[j++] = (uint32_t)(x + ln - refl)<<4 | 0; mq[1] += x + ln - refl; kk = bns->anns[seqid].offset + refl; nlen[0] = refl - coor; nlen[1] = p->len - nlen[0]; } else { cn[j++] = cigar[i]; mq[nc?1:0] += ln; } x += ln; y += ln; 
} } if (mq[0] > mq[1]) { // then take the first alignment n_cigar = nc; memcpy(cigar, cn, 4 * nc); p->len = nlen[0]; } else { p->k = kk; p->len = nlen[1]; n_cigar = j - nc; memcpy(cigar, cn + nc, 4 * (j - nc)); } free(cn); } return n_cigar; } static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8_t *seq[2], const uint8_t *pac, bwtsw2_t *b, const char *name) { int i; // allocate for b->aux if (b->n<<1 < b->max) { b->max = b->n; kroundup32(b->max); b->hits = realloc(b->hits, b->max * sizeof(bsw2hit_t)); } b->aux = calloc(b->n, sizeof(bsw2aux_t)); // generate CIGAR gen_cigar(opt, qlen, seq, bns->l_pac, pac, b, name); // fix CIGAR, generate mapQ, and write chromosomal position for (i = 0; i < b->n; ++i) { bsw2hit_t *p = &b->hits[i]; bsw2aux_t *q = &b->aux[i]; q->flag = p->flag & 0xfe; q->isize = 0; if (p->l == 0) { // unique hit float c = 1.0; int subo; // fix out-of-boundary CIGAR q->n_cigar = fix_cigar(bns, p, q->n_cigar, q->cigar); // compute mapQ subo = p->G2 > opt->t? 
p->G2 : opt->t; if (p->flag>>16 == 1 || p->flag>>16 == 2) c *= .5; if (p->n_seeds < 2) c *= .2; q->qual = (int)(c * (p->G - subo) * (250.0 / p->G + 0.03 / opt->a) + .499); if (q->qual > 250) q->qual = 250; if (q->qual < 0) q->qual = 0; if (p->flag&1) q->qual = 0; // this is a random hit q->pqual = q->qual; // set the paired qual as qual // get the chromosomal position q->nn = bns_cnt_ambi(bns, p->k, p->len, &q->chr); q->pos = p->k - bns->anns[q->chr].offset; } else q->qual = 0, q->n_cigar = 0, q->chr = q->pos = -1, q->nn = 0; } } static void update_mate_aux(bwtsw2_t *b, const bwtsw2_t *m) { int i; if (m == 0) return; // update flag, mchr and mpos for (i = 0; i < b->n; ++i) { bsw2aux_t *q = &b->aux[i]; q->flag |= 1; // paired if (m->n == 0) q->flag |= 8; // mate unmapped if (m->n == 1) { q->mchr = m->aux[0].chr; q->mpos = m->aux[0].pos; if (m->aux[0].flag&0x10) q->flag |= 0x20; // mate reverse strand if (q->chr == q->mchr) { // set insert size if (q->mpos + m->hits[0].len > q->pos) q->isize = q->mpos + m->hits[0].len - q->pos; else q->isize = q->mpos - q->pos - b->hits[0].len; } else q->isize = 0; } else q->mchr = q->mpos = -1; } // update mapping quality if (b->n == 1 && m->n == 1) { bsw2hit_t *p = &b->hits[0]; if (p->flag & BSW2_FLAG_MATESW) { // this alignment is found by Smith-Waterman if (!(p->flag & BSW2_FLAG_TANDEM) && b->aux[0].pqual < 20) b->aux[0].pqual = 20; if (b->aux[0].pqual >= m->aux[0].qual) b->aux[0].pqual = m->aux[0].qual; } else if ((p->flag & 2) && !(m->hits[0].flag & BSW2_FLAG_MATESW)) { // properly paired if (!(p->flag & BSW2_FLAG_TANDEM)) { // pqual is bounded by [b->aux[0].qual,m->aux[0].qual] b->aux[0].pqual += 20; if (b->aux[0].pqual > m->aux[0].qual) b->aux[0].pqual = m->aux[0].qual; if (b->aux[0].pqual < b->aux[0].qual) b->aux[0].pqual = b->aux[0].qual; } } } } /* generate SAM lines for a sequence in ks with alignment stored in * b. ks->name and ks->seq will be freed and set to NULL in the end. 
*/ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks, bwtsw2_t *b, int is_pe, bwtsw2_t *bmate) { int i, k; kstring_t str; memset(&str, 0, sizeof(kstring_t)); if (b == 0 || b->n == 0) { // no hits ksprintf(&str, "%s\t4\t*\t0\t0\t*\t*\t0\t0\t", ks->name); for (i = 0; i < ks->l; ++i) kputc(ks->seq[i], &str); if (ks->qual) { kputc('\t', &str); for (i = 0; i < ks->l; ++i) kputc(ks->qual[i], &str); } else kputs("\t*", &str); kputc('\n', &str); } for (i = 0; b && i < b->n; ++i) { bsw2hit_t *p = b->hits + i; bsw2aux_t *q = b->aux + i; int j, beg, end, type = 0; // print mandatory fields before SEQ if (q->cigar == 0) q->flag |= 0x4; ksprintf(&str, "%s\t%d", ks->name, q->flag | (opt->multi_2nd && i? 0x100 : 0)); ksprintf(&str, "\t%s\t%ld", q->chr>=0? bns->anns[q->chr].name : "*", (long)q->pos + 1); if (p->l == 0 && q->cigar) { // not a repetitive hit ksprintf(&str, "\t%d\t", q->pqual); for (k = 0; k < q->n_cigar; ++k) ksprintf(&str, "%d%c", q->cigar[k]>>4, (opt->hard_clip? "MIDNHHP" : "MIDNSHP")[q->cigar[k]&0xf]); } else ksprintf(&str, "\t0\t*"); if (!is_pe) kputs("\t*\t0\t0\t", &str); else ksprintf(&str, "\t%s\t%d\t%d\t", q->mchr==q->chr? "=" : (q->mchr<0? 
"*" : bns->anns[q->mchr].name), q->mpos+1, q->isize); // get the sequence begin and end beg = 0; end = ks->l; if (opt->hard_clip && q->cigar) { if ((q->cigar[0]&0xf) == 4) beg += q->cigar[0]>>4; if ((q->cigar[q->n_cigar-1]&0xf) == 4) end -= q->cigar[q->n_cigar-1]>>4; } for (j = beg; j < end; ++j) { if (p->flag&0x10) kputc(nt_comp_table[(int)ks->seq[ks->l - 1 - j]], &str); else kputc(ks->seq[j], &str); } // print base quality if present if (ks->qual) { kputc('\t', &str); for (j = beg; j < end; ++j) { if (p->flag&0x10) kputc(ks->qual[ks->l - 1 - j], &str); else kputc(ks->qual[j], &str); } } else kputs("\t*", &str); // print optional tags ksprintf(&str, "\tAS:i:%d\tXS:i:%d\tXF:i:%d\tXE:i:%d\tNM:i:%d", p->G, p->G2, p->flag>>16, p->n_seeds, q->nm); if (q->nn) ksprintf(&str, "\tXN:i:%d", q->nn); if (p->l) ksprintf(&str, "\tXI:i:%d", p->l - p->k + 1); if (p->flag&BSW2_FLAG_MATESW) type |= 1; if (p->flag&BSW2_FLAG_TANDEM) type |= 2; if (type) ksprintf(&str, "\tXT:i:%d", type); if (opt->cpy_cmt && ks->comment) { int l = strlen(ks->comment); if (l >= 6 && ks->comment[2] == ':' && ks->comment[4] == ':') { kputc('\t', &str); kputs(ks->comment, &str); } } kputc('\n', &str); } ks->sam = str.s; free(ks->seq); ks->seq = 0; free(ks->qual); ks->qual = 0; free(ks->name); ks->name = 0; } static void update_opt(bsw2opt_t *dst, const bsw2opt_t *src, int qlen) { double ll = log(qlen); int i, k; *dst = *src; if (dst->t < ll * dst->coef) dst->t = (int)(ll * dst->coef + .499); // set band width: the query length sets a boundary on the maximum band width k = (qlen * dst->a - 2 * dst->q) / (2 * dst->r + dst->a); i = (qlen * dst->a - dst->a - dst->t) / dst->r; if (k > i) k = i; if (k < 1) k = 1; // I do not know if k==0 causes troubles dst->bw = src->bw < k? src->bw : k; } /* Core routine to align reads in _seq. 
It is separated from * process_seqs() to realize multi-threading */ static void bsw2_aln_core(bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, int is_pe) { int x; bsw2opt_t opt; bsw2global_t *pool = bsw2_global_init(); bwtsw2_t **buf; buf = calloc(_seq->n, sizeof(void*)); for (x = 0; x < _seq->n; ++x) { bsw2seq1_t *p = _seq->seq + x; uint8_t *seq[2], *rseq[2]; int i, l, k; bwtsw2_t *b[2]; l = p->l; update_opt(&opt, _opt, p->l); if (pool->max_l < l) { // then enlarge working space for aln_extend_core() int tmp = ((l + 1) / 2 * opt.a + opt.r) / opt.r + l; pool->max_l = l; pool->aln_mem = realloc(pool->aln_mem, (tmp + 2) * 24); } // set seq[2] and rseq[2] seq[0] = calloc(l * 4, 1); seq[1] = seq[0] + l; rseq[0] = seq[1] + l; rseq[1] = rseq[0] + l; // convert sequences to 2-bit representation for (i = k = 0; i < l; ++i) { int c = nst_nt4_table[(int)p->seq[i]]; if (c >= 4) { c = (int)(drand48() * 4); ++k; } // FIXME: ambiguous bases are not properly handled seq[0][i] = c; seq[1][l-1-i] = 3 - c; rseq[0][l-1-i] = 3 - c; rseq[1][i] = c; } if (l - k < opt.t) { // too few unambiguous bases buf[x] = calloc(1, sizeof(bwtsw2_t)); free(seq[0]); continue; } // alignment b[0] = bsw2_aln1_core(&opt, bns, pac, target, l, seq, pool); for (k = 0; k < b[0]->n; ++k) if (b[0]->hits[k].n_seeds < opt.t_seeds) break; if (k < b[0]->n) { b[1] = bsw2_aln1_core(&opt, bns, pac, target, l, rseq, pool); for (i = 0; i < b[1]->n; ++i) { bsw2hit_t *p = &b[1]->hits[i]; int x = p->beg; p->flag ^= 0x10, p->is_rev ^= 1; // flip the strand p->beg = l - p->end; p->end = l - x; } flag_fr(b); merge_hits(b, l, 0); bsw2_resolve_duphits(0, 0, b[0], 0); bsw2_resolve_query_overlaps(b[0], opt.mask_level); } else b[1] = 0; // generate CIGAR and print SAM buf[x] = bsw2_dup_no_cigar(b[0]); // free free(seq[0]); bsw2_destroy(b[0]); } if (is_pe) bsw2_pair(&opt, bns->l_pac, pac, _seq->n, _seq->seq, buf); for (x = 0; x < _seq->n; ++x) { bsw2seq1_t *p = _seq->seq + x; uint8_t 
*seq[2]; int i; seq[0] = malloc(p->l * 2); seq[1] = seq[0] + p->l; for (i = 0; i < p->l; ++i) { int c = nst_nt4_table[(int)p->seq[i]]; if (c >= 4) c = (int)(drand48() * 4); seq[0][i] = c; seq[1][p->l-1-i] = 3 - c; } update_opt(&opt, _opt, p->l); write_aux(&opt, bns, p->l, seq, pac, buf[x], _seq->seq[x].name); free(seq[0]); } for (x = 0; x < _seq->n; ++x) { if (is_pe) update_mate_aux(buf[x], buf[x^1]); print_hits(bns, &opt, &_seq->seq[x], buf[x], is_pe, buf[x^1]); } for (x = 0; x < _seq->n; ++x) bsw2_destroy(buf[x]); free(buf); bsw2_global_destroy(pool); } #ifdef HAVE_PTHREAD typedef struct { int tid, is_pe; bsw2seq_t *_seq; const bsw2opt_t *_opt; const bntseq_t *bns; uint8_t *pac; const bwt_t *target; } thread_aux_t; /* another interface to bsw2_aln_core() to facilitate pthread_create() */ static void *worker(void *data) { thread_aux_t *p = (thread_aux_t*)data; bsw2_aln_core(p->_seq, p->_opt, p->bns, p->pac, p->target, p->is_pe); return 0; } #endif /* process sequences stored in _seq, generate SAM lines for these * sequences and reset _seq afterwards. */ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, int is_pe) { int i; is_pe = is_pe? 
1 : 0; #ifdef HAVE_PTHREAD if (opt->n_threads <= 1) { bsw2_aln_core(_seq, opt, bns, pac, target, is_pe); } else { pthread_t *tid; pthread_attr_t attr; thread_aux_t *data; int j; pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t)); tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); for (j = 0; j < opt->n_threads; ++j) { thread_aux_t *p = data + j; p->tid = j; p->_opt = opt; p->bns = bns; p->is_pe = is_pe; p->pac = pac; p->target = target; p->_seq = calloc(1, sizeof(bsw2seq_t)); p->_seq->max = (_seq->n + opt->n_threads - 1) / opt->n_threads + 1; p->_seq->n = 0; p->_seq->seq = calloc(p->_seq->max, sizeof(bsw2seq1_t)); } for (i = 0; i < _seq->n; ++i) { // assign sequences to each thread bsw2seq_t *p = data[(i>>is_pe)%opt->n_threads]._seq; p->seq[p->n++] = _seq->seq[i]; } for (j = 0; j < opt->n_threads; ++j) pthread_create(&tid[j], &attr, worker, &data[j]); for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0); for (j = 0; j < opt->n_threads; ++j) data[j]._seq->n = 0; for (i = 0; i < _seq->n; ++i) { // copy the result from each thread back bsw2seq_t *p = data[(i>>is_pe)%opt->n_threads]._seq; _seq->seq[i] = p->seq[p->n++]; } for (j = 0; j < opt->n_threads; ++j) { thread_aux_t *p = data + j; free(p->_seq->seq); free(p->_seq); } free(data); free(tid); } #else bsw2_aln_core(_seq, opt, bns, pac, target, is_pe); #endif // print and reset for (i = 0; i < _seq->n; ++i) { bsw2seq1_t *p = _seq->seq + i; if (p->sam) err_printf("%s", p->sam); free(p->name); free(p->seq); free(p->qual); free(p->sam); p->tid = -1; p->l = 0; p->name = p->seq = p->qual = p->sam = 0; } err_fflush(stdout); _seq->n = 0; } void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2) { gzFile fp, fp2; kseq_t *ks, *ks2; int l, is_pe = 0, i, n; uint8_t *pac; bsw2seq_t *_seq; bseq1_t *bseq; pac = calloc(bns->l_pac/4+1, 1); for (l = 0; l < 
bns->n_seqs; ++l) err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len); err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac); fp = xzopen(fn, "r"); ks = kseq_init(fp); _seq = calloc(1, sizeof(bsw2seq_t)); if (fn2) { fp2 = xzopen(fn2, "r"); ks2 = kseq_init(fp2); is_pe = 1; } else fp2 = 0, ks2 = 0, is_pe = 0; while ((bseq = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) { int size = 0; if (n > _seq->max) { _seq->max = n; kroundup32(_seq->max); _seq->seq = realloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t)); } _seq->n = n; for (i = 0; i < n; ++i) { bseq1_t *b = &bseq[i]; bsw2seq1_t *p = &_seq->seq[i]; p->tid = -1; p->l = b->l_seq; p->name = b->name; p->seq = b->seq; p->qual = b->qual; p->comment = b->comment; p->sam = 0; size += p->l; } fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp) ...\n", n, size); free(bseq); process_seqs(_seq, opt, bns, pac, target, is_pe); } // free free(pac); free(_seq->seq); free(_seq); kseq_destroy(ks); err_gzclose(fp); if (fn2) { kseq_destroy(ks2); err_gzclose(fp2); } } bwa-0.7.17/bwtsw2_chain.c000066400000000000000000000057671317342117100151340ustar00rootroot00000000000000#include #include "bwtsw2.h" #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif typedef struct { uint32_t tbeg, tend; int qbeg, qend; uint32_t flag:1, idx:31; int chain; // also reuse as a counter } hsaip_t; #define _hsaip_lt(a, b) ((a).qbeg < (b).qbeg) #include "ksort.h" KSORT_INIT(hsaip, hsaip_t, _hsaip_lt) static int chaining(const bsw2opt_t *opt, int shift, int n, hsaip_t *z, hsaip_t *chain) { int j, k, m = 0; ks_introsort(hsaip, n, z); for (j = 0; j < n; ++j) { hsaip_t *p = z + j; for (k = m - 1; k >= 0; --k) { hsaip_t *q = chain + k; int x = p->qbeg - q->qbeg; // always positive int y = p->tbeg - q->tbeg; if (y > 0 && x < opt->max_chain_gap && y < opt->max_chain_gap && x - y <= opt->bw && y - x <= opt->bw) { // chained if (p->qend > q->qend) q->qend = p->qend; if (p->tend > q->tend) q->tend = p->tend; ++q->chain; 
p->chain = shift + k; break; } else if (q->chain > opt->t_seeds * 2) k = 0; // if the chain is strong enough, do not check the previous chains } if (k < 0) { // not added to any previous chains chain[m] = *p; chain[m].chain = 1; chain[m].idx = p->chain = shift + m; ++m; } } return m; } void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]) { hsaip_t *z[2], *chain[2]; int i, j, k, n[2], m[2], thres = opt->t_seeds * 2; char *flag; // initialization n[0] = b[0]->n; n[1] = b[1]->n; z[0] = calloc(n[0] + n[1], sizeof(hsaip_t)); z[1] = z[0] + n[0]; chain[0] = calloc(n[0] + n[1], sizeof(hsaip_t)); for (k = j = 0; k < 2; ++k) { for (i = 0; i < b[k]->n; ++i) { bsw2hit_t *p = b[k]->hits + i; hsaip_t *q = z[k] + i; q->flag = k; q->idx = i; q->tbeg = p->k; q->tend = p->k + p->len; q->chain = -1; q->qbeg = p->beg; q->qend = p->end; } } // chaining m[0] = chaining(opt, 0, n[0], z[0], chain[0]); chain[1] = chain[0] + m[0]; m[1] = chaining(opt, m[0], n[1], z[1], chain[1]); // change query coordinate on the reverse strand for (k = 0; k < m[1]; ++k) { hsaip_t *p = chain[1] + k; int tmp = p->qbeg; p->qbeg = len - p->qend; p->qend = len - tmp; } //for (k = 0; k < m[0]; ++k) printf("%d, [%d,%d), [%d,%d)\n", chain[0][k].chain, chain[0][k].tbeg, chain[0][k].tend, chain[0][k].qbeg, chain[0][k].qend); // filtering flag = calloc(m[0] + m[1], 1); ks_introsort(hsaip, m[0] + m[1], chain[0]); for (k = 1; k < m[0] + m[1]; ++k) { hsaip_t *p = chain[0] + k; for (j = 0; j < k; ++j) { hsaip_t *q = chain[0] + j; if (flag[q->idx]) continue; if (q->qend >= p->qend && q->chain > p->chain * thres && p->chain < thres) { flag[p->idx] = 1; break; } } } for (k = 0; k < n[0] + n[1]; ++k) { hsaip_t *p = z[0] + k; if (flag[p->chain]) b[p->flag]->hits[p->idx].G = 0; } free(flag); // squeeze out filtered elements in b[2] for (k = 0; k < 2; ++k) { for (j = i = 0; j < n[k]; ++j) { bsw2hit_t *p = b[k]->hits + j; if (p->G) { if (i != j) b[k]->hits[i++] = *p; else ++i; } } b[k]->n = i; } // free 
free(z[0]); free(chain[0]); } bwa-0.7.17/bwtsw2_core.c000066400000000000000000000450131317342117100147660ustar00rootroot00000000000000#include #include #include #include #include #include "bwt_lite.h" #include "bwtsw2.h" #include "bwt.h" #include "kvec.h" #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif typedef struct { bwtint_t k, l; } qintv_t; #define qintv_eq(a, b) ((a).k == (b).k && (a).l == (b).l) #define qintv_hash(a) ((a).k>>7^(a).l<<17) #include "khash.h" KHASH_INIT(qintv, qintv_t, uint64_t, 1, qintv_hash, qintv_eq) KHASH_MAP_INIT_INT64(64, uint64_t) #define MINUS_INF -0x3fffffff #define MASK_LEVEL 0.90f struct __mempool_t; static void mp_destroy(struct __mempool_t*); typedef struct { bwtint_t qk, ql; int I, D, G; uint32_t pj:2, qlen:30; int tlen; int ppos, upos; int cpos[4]; } bsw2cell_t; #include "ksort.h" KSORT_INIT_GENERIC(int) #define __hitG_lt(a, b) (((a).G + ((int)(a).n_seeds<<2)) > (b).G + ((int)(b).n_seeds<<2)) KSORT_INIT(hitG, bsw2hit_t, __hitG_lt) static const bsw2cell_t g_default_cell = { 0, 0, MINUS_INF, MINUS_INF, MINUS_INF, 0, 0, 0, -1, -1, {-1, -1, -1, -1} }; typedef struct { int n, max; uint32_t tk, tl; // this is fine bsw2cell_t *array; } bsw2entry_t, *bsw2entry_p; /* --- BEGIN: Stack operations --- */ typedef struct { int n_pending; kvec_t(bsw2entry_p) stack0, pending; struct __mempool_t *pool; } bsw2stack_t; #define stack_isempty(s) (kv_size(s->stack0) == 0 && s->n_pending == 0) static void stack_destroy(bsw2stack_t *s) { mp_destroy(s->pool); kv_destroy(s->stack0); kv_destroy(s->pending); free(s); } inline static void stack_push0(bsw2stack_t *s, bsw2entry_p e) { kv_push(bsw2entry_p, s->stack0, e); } inline static bsw2entry_p stack_pop(bsw2stack_t *s) { assert(!(kv_size(s->stack0) == 0 && s->n_pending != 0)); return kv_pop(s->stack0); } /* --- END: Stack operations --- */ /* --- BEGIN: memory pool --- */ typedef struct __mempool_t { int cnt; // if cnt!=0, then there must be memory leak kvec_t(bsw2entry_p) pool; } mempool_t; 
inline static bsw2entry_p mp_alloc(mempool_t *mp) { ++mp->cnt; if (kv_size(mp->pool) == 0) return (bsw2entry_t*)calloc(1, sizeof(bsw2entry_t)); else return kv_pop(mp->pool); } inline static void mp_free(mempool_t *mp, bsw2entry_p e) { --mp->cnt; e->n = 0; kv_push(bsw2entry_p, mp->pool, e); } static void mp_destroy(struct __mempool_t *mp) { int i; for (i = 0; i != kv_size(mp->pool); ++i) { free(kv_A(mp->pool, i)->array); free(kv_A(mp->pool, i)); } kv_destroy(mp->pool); free(mp); } /* --- END: memory pool --- */ /* --- BEGIN: utilities --- */ static khash_t(64) *bsw2_connectivity(const bwtl_t *b) { khash_t(64) *h; uint32_t k, l, cntk[4], cntl[4]; // this is fine uint64_t x; khiter_t iter; int j, ret; kvec_t(uint64_t) stack; kv_init(stack); h = kh_init(64); kh_resize(64, h, b->seq_len * 4); x = b->seq_len; kv_push(uint64_t, stack, x); while (kv_size(stack)) { x = kv_pop(stack); k = x>>32; l = (uint32_t)x; bwtl_2occ4(b, k-1, l, cntk, cntl); for (j = 0; j != 4; ++j) { k = b->L2[j] + cntk[j] + 1; l = b->L2[j] + cntl[j]; if (k > l) continue; x = (uint64_t)k << 32 | l; iter = kh_put(64, h, x, &ret); if (ret) { // if not present kh_value(h, iter) = 1; kv_push(uint64_t, stack, x); } else ++kh_value(h, iter); } } kv_destroy(stack); //fprintf(stderr, "[bsw2_connectivity] %u nodes in the DAG\n", kh_size(h)); return h; } // pick up top T matches at a node static void cut_tail(bsw2entry_t *u, int T, bsw2entry_t *aux) { int i, *a, n, x; if (u->n <= T) return; if (aux->max < u->n) { aux->max = u->n; aux->array = (bsw2cell_t*)realloc(aux->array, aux->max * sizeof(bsw2cell_t)); } a = (int*)aux->array; for (i = n = 0; i != u->n; ++i) if (u->array[i].ql && u->array[i].G > 0) a[n++] = -u->array[i].G; if (n <= T) return; x = -ks_ksmall(int, n, a, T); n = 0; for (i = 0; i < u->n; ++i) { bsw2cell_t *p = u->array + i; if (p->G == x) ++n; if (p->G < x || (p->G == x && n >= T)) { p->qk = p->ql = 0; p->G = 0; if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -1; } } } // remove duplicated 
cells static inline void remove_duplicate(bsw2entry_t *u, khash_t(qintv) *hash) { int i, ret, j; khiter_t k; qintv_t key; kh_clear(qintv, hash); for (i = 0; i != u->n; ++i) { bsw2cell_t *p = u->array + i; if (p->ql == 0) continue; key.k = p->qk; key.l = p->ql; k = kh_put(qintv, hash, key, &ret); j = -1; if (ret == 0) { if ((uint32_t)kh_value(hash, k) >= p->G) j = i; else { j = kh_value(hash, k)>>32; kh_value(hash, k) = (uint64_t)i<<32 | p->G; } } else kh_value(hash, k) = (uint64_t)i<<32 | p->G; if (j >= 0) { p = u->array + j; p->qk = p->ql = 0; p->G = 0; if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -3; } } } // merge two entries static void merge_entry(const bsw2opt_t * __restrict opt, bsw2entry_t *u, bsw2entry_t *v, bwtsw2_t *b) { int i; if (u->n + v->n >= u->max) { u->max = u->n + v->n; u->array = (bsw2cell_t*)realloc(u->array, u->max * sizeof(bsw2cell_t)); } for (i = 0; i != v->n; ++i) { bsw2cell_t *p = v->array + i; if (p->ppos >= 0) p->ppos += u->n; if (p->cpos[0] >= 0) p->cpos[0] += u->n; if (p->cpos[1] >= 0) p->cpos[1] += u->n; if (p->cpos[2] >= 0) p->cpos[2] += u->n; if (p->cpos[3] >= 0) p->cpos[3] += u->n; } memcpy(u->array + u->n, v->array, v->n * sizeof(bsw2cell_t)); u->n += v->n; } static inline bsw2cell_t *push_array_p(bsw2entry_t *e) { if (e->n == e->max) { e->max = e->max? 
e->max<<1 : 256; e->array = (bsw2cell_t*)realloc(e->array, sizeof(bsw2cell_t) * e->max); } return e->array + e->n; } static inline double time_elapse(const struct rusage *curr, const struct rusage *last) { long t1 = (curr->ru_utime.tv_sec - last->ru_utime.tv_sec) + (curr->ru_stime.tv_sec - last->ru_stime.tv_sec); long t2 = (curr->ru_utime.tv_usec - last->ru_utime.tv_usec) + (curr->ru_stime.tv_usec - last->ru_stime.tv_usec); return (double)t1 + t2 * 1e-6; } /* --- END: utilities --- */ /* --- BEGIN: processing partial hits --- */ static void save_hits(const bwtl_t *bwt, int thres, bsw2hit_t *hits, bsw2entry_t *u) { int i; uint32_t k; // this is fine for (i = 0; i < u->n; ++i) { bsw2cell_t *p = u->array + i; if (p->G < thres) continue; for (k = u->tk; k <= u->tl; ++k) { int beg, end; bsw2hit_t *q = 0; beg = bwt->sa[k]; end = beg + p->tlen; if (p->G > hits[beg*2].G) { hits[beg*2+1] = hits[beg*2]; q = hits + beg * 2; } else if (p->G > hits[beg*2+1].G) q = hits + beg * 2 + 1; if (q) { q->k = p->qk; q->l = p->ql; q->len = p->qlen; q->G = p->G; q->beg = beg; q->end = end; q->G2 = q->k == q->l? 0 : q->G; q->flag = q->n_seeds = 0; } } } } /* "narrow hits" are node-to-node hits that have a high score and * are not so repetitive (|SA interval|<=IS). */ static void save_narrow_hits(const bwtl_t *bwtl, bsw2entry_t *u, bwtsw2_t *b1, int t, int IS) { int i; for (i = 0; i < u->n; ++i) { bsw2hit_t *q; bsw2cell_t *p = u->array + i; if (p->G >= t && p->ql - p->qk + 1 <= IS) { // good narrow hit if (b1->max == b1->n) { b1->max = b1->max? b1->max<<1 : 4; b1->hits = realloc(b1->hits, b1->max * sizeof(bsw2hit_t)); } q = &b1->hits[b1->n++]; q->k = p->qk; q->l = p->ql; q->len = p->qlen; q->G = p->G; q->G2 = 0; q->beg = bwtl->sa[u->tk]; q->end = q->beg + p->tlen; q->flag = 0; // delete p p->qk = p->ql = 0; p->G = 0; if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -3; } } } /* after this, "narrow SA hits" will be expanded and the coordinates * will be obtained and stored in b->hits[*].k. 
*/ int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS) { int i, j, n, is_rev; if (b->n == 0) return 0; if (bwt && bns) { // convert to chromosomal coordinates if requested int old_n = b->n; bsw2hit_t *old_hits = b->hits; for (i = n = 0; i < b->n; ++i) { // compute the memory to allocated bsw2hit_t *p = old_hits + i; if (p->l - p->k + 1 <= IS) n += p->l - p->k + 1; else if (p->G > 0) ++n; } b->n = b->max = n; b->hits = calloc(b->max, sizeof(bsw2hit_t)); for (i = j = 0; i < old_n; ++i) { bsw2hit_t *p = old_hits + i; if (p->l - p->k + 1 <= IS) { // the hit is no so repetitive bwtint_t k; if (p->G == 0 && p->k == 0 && p->l == 0 && p->len == 0) continue; for (k = p->k; k <= p->l; ++k) { b->hits[j] = *p; b->hits[j].k = bns_depos(bns, bwt_sa(bwt, k), &is_rev); b->hits[j].l = 0; b->hits[j].is_rev = is_rev; if (is_rev) b->hits[j].k -= p->len - 1; ++j; } } else if (p->G > 0) { b->hits[j] = *p; b->hits[j].k = bns_depos(bns, bwt_sa(bwt, p->k), &is_rev); b->hits[j].l = 0; b->hits[j].flag |= 1; b->hits[j].is_rev = is_rev; if (is_rev) b->hits[j].k -= p->len - 1; ++j; } } free(old_hits); } for (i = j = 0; i < b->n; ++i) // squeeze out empty elements if (b->hits[i].G) b->hits[j++] = b->hits[i]; b->n = j; ks_introsort(hitG, b->n, b->hits); for (i = 1; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; for (j = 0; j < i; ++j) { bsw2hit_t *q = b->hits + j; int compatible = 1; if (p->is_rev != q->is_rev) continue; // hits from opposite strands are not duplicates if (p->l == 0 && q->l == 0) { int qol = (p->end < q->end? p->end : q->end) - (p->beg > q->beg? p->beg : q->beg); // length of query overlap if (qol < 0) qol = 0; if ((float)qol / (p->end - p->beg) > MASK_LEVEL || (float)qol / (q->end - q->beg) > MASK_LEVEL) { int64_t tol = (int64_t)(p->k + p->len < q->k + q->len? p->k + p->len : q->k + q->len) - (int64_t)(p->k > q->k? 
p->k : q->k); // length of target overlap if ((double)tol / p->len > MASK_LEVEL || (double)tol / q->len > MASK_LEVEL) compatible = 0; } } if (!compatible) { p->G = 0; if (q->G2 < p->G2) q->G2 = p->G2; break; } } } n = i; for (i = j = 0; i < n; ++i) { if (b->hits[i].G == 0) continue; if (i != j) b->hits[j++] = b->hits[i]; else ++j; } b->n = j; return b->n; } int bsw2_resolve_query_overlaps(bwtsw2_t *b, float mask_level) { int i, j, n; if (b->n == 0) return 0; ks_introsort(hitG, b->n, b->hits); { // choose a random one int G0 = b->hits[0].G; for (i = 1; i < b->n; ++i) if (b->hits[i].G != G0) break; j = (int)(i * drand48()); if (j) { bsw2hit_t tmp; tmp = b->hits[0]; b->hits[0] = b->hits[j]; b->hits[j] = tmp; } } for (i = 1; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; int all_compatible = 1; if (p->G == 0) break; for (j = 0; j < i; ++j) { bsw2hit_t *q = b->hits + j; int64_t tol = 0; int qol, compatible = 0; float fol; if (q->G == 0) continue; qol = (p->end < q->end? p->end : q->end) - (p->beg > q->beg? p->beg : q->beg); if (qol < 0) qol = 0; if (p->l == 0 && q->l == 0) { tol = (int64_t)(p->k + p->len < q->k + q->len? p->k + p->len : q->k + q->len) - (p->k > q->k? p->k : q->k); if (tol < 0) tol = 0; } fol = (float)qol / (p->end - p->beg < q->end - q->beg? 
p->end - p->beg : q->end - q->beg); if (fol < mask_level || (tol > 0 && qol < p->end - p->beg && qol < q->end - q->beg)) compatible = 1; if (!compatible) { if (q->G2 < p->G) q->G2 = p->G; all_compatible = 0; } } if (!all_compatible) p->G = 0; } n = i; for (i = j = 0; i < n; ++i) { if (b->hits[i].G == 0) continue; if (i != j) b->hits[j++] = b->hits[i]; else ++j; } b->n = j; return j; } /* --- END: processing partial hits --- */ /* --- BEGIN: global mem pool --- */ bsw2global_t *bsw2_global_init() { bsw2global_t *pool; bsw2stack_t *stack; pool = calloc(1, sizeof(bsw2global_t)); stack = calloc(1, sizeof(bsw2stack_t)); stack->pool = (mempool_t*)calloc(1, sizeof(mempool_t)); pool->stack = (void*)stack; return pool; } void bsw2_global_destroy(bsw2global_t *pool) { stack_destroy((bsw2stack_t*)pool->stack); free(pool->aln_mem); free(pool); } /* --- END: global mem pool --- */ static inline int fill_cell(const bsw2opt_t *o, int match_score, bsw2cell_t *c[4]) { int G = c[3]? c[3]->G + match_score : MINUS_INF; if (c[1]) { c[0]->I = c[1]->I > c[1]->G - o->q? c[1]->I - o->r : c[1]->G - o->qr; if (c[0]->I > G) G = c[0]->I; } else c[0]->I = MINUS_INF; if (c[2]) { c[0]->D = c[2]->D > c[2]->G - o->q? c[2]->D - o->r : c[2]->G - o->qr; if (c[0]->D > G) G = c[0]->D; } else c[0]->D = MINUS_INF; return(c[0]->G = G); } static void init_bwtsw2(const bwtl_t *target, const bwt_t *query, bsw2stack_t *s) { bsw2entry_t *u; bsw2cell_t *x; u = mp_alloc(s->pool); u->tk = 0; u->tl = target->seq_len; x = push_array_p(u); *x = g_default_cell; x->G = 0; x->qk = 0; x->ql = query->seq_len; u->n++; stack_push0(s, u); } /* On return, ret[1] keeps not-so-repetitive hits (narrow SA hits); ret[0] keeps all hits (right?) 
*/ bwtsw2_t **bsw2_core(const bntseq_t *bns, const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool) { bsw2stack_t *stack = (bsw2stack_t*)pool->stack; bwtsw2_t *b, *b1, **b_ret; int i, j, score_mat[16], *heap, heap_size, n_tot = 0; struct rusage curr, last; khash_t(qintv) *rhash; khash_t(64) *chash; // initialize connectivity hash (chash) chash = bsw2_connectivity(target); // calculate score matrix for (i = 0; i != 4; ++i) for (j = 0; j != 4; ++j) score_mat[i<<2|j] = (i == j)? opt->a : -opt->b; // initialize other variables rhash = kh_init(qintv); init_bwtsw2(target, query, stack); heap_size = opt->z; heap = calloc(heap_size, sizeof(int)); // initialize the return struct b = (bwtsw2_t*)calloc(1, sizeof(bwtsw2_t)); b->n = b->max = target->seq_len * 2; b->hits = calloc(b->max, sizeof(bsw2hit_t)); b1 = (bwtsw2_t*)calloc(1, sizeof(bwtsw2_t)); b_ret = calloc(2, sizeof(void*)); b_ret[0] = b; b_ret[1] = b1; // initialize timer getrusage(0, &last); // the main loop: traversal of the DAG while (!stack_isempty(stack)) { int old_n, tj; bsw2entry_t *v; uint32_t tcntk[4], tcntl[4]; bwtint_t k, l; v = stack_pop(stack); old_n = v->n; n_tot += v->n; for (i = 0; i < v->n; ++i) { // test max depth and band width bsw2cell_t *p = v->array + i; if (p->ql == 0) continue; if (p->tlen - (int)p->qlen > opt->bw || (int)p->qlen - p->tlen > opt->bw) { p->qk = p->ql = 0; if (p->ppos >= 0) v->array[p->ppos].cpos[p->pj] = -5; } } // get Occ for the DAG bwtl_2occ4(target, v->tk - 1, v->tl, tcntk, tcntl); for (tj = 0; tj != 4; ++tj) { // descend to the children bwtint_t qcntk[4], qcntl[4]; int qj, *curr_score_mat = score_mat + tj * 4; khiter_t iter; bsw2entry_t *u; k = target->L2[tj] + tcntk[tj] + 1; l = target->L2[tj] + tcntl[tj]; if (k > l) continue; // update counter iter = kh_get(64, chash, (uint64_t)k<<32 | l); --kh_value(chash, iter); // initialization u = mp_alloc(stack->pool); u->tk = k; u->tl = l; memset(heap, 0, sizeof(int) * opt->z); // loop through all the 
nodes in v for (i = 0; i < v->n; ++i) { bsw2cell_t *p = v->array + i, *x, *c[4]; // c[0]=>current, c[1]=>I, c[2]=>D, c[3]=>G int is_added = 0; if (p->ql == 0) continue; // deleted node c[0] = x = push_array_p(u); x->G = MINUS_INF; p->upos = x->upos = -1; if (p->ppos >= 0) { // parent has been visited c[1] = (v->array[p->ppos].upos >= 0)? u->array + v->array[p->ppos].upos : 0; c[3] = v->array + p->ppos; c[2] = p; if (fill_cell(opt, curr_score_mat[p->pj], c) > 0) { // then update topology at p and x x->ppos = v->array[p->ppos].upos; // the parent pos in u p->upos = u->n++; // the current pos in u if (x->ppos >= 0) u->array[x->ppos].cpos[p->pj] = p->upos; // the child pos of its parent in u is_added = 1; } } else { x->D = p->D > p->G - opt->q? p->D - opt->r : p->G - opt->qr; if (x->D > 0) { x->G = x->D; x->I = MINUS_INF; x->ppos = -1; p->upos = u->n++; is_added = 1; } } if (is_added) { // x has been added to u->array. fill the remaining variables x->cpos[0] = x->cpos[1] = x->cpos[2] = x->cpos[3] = -1; x->pj = p->pj; x->qk = p->qk; x->ql = p->ql; x->qlen = p->qlen; x->tlen = p->tlen + 1; if (x->G > -heap[0]) { heap[0] = -x->G; ks_heapadjust(int, 0, heap_size, heap); } } if ((x->G > opt->qr && x->G >= -heap[0]) || i < old_n) { // good node in u, or in v if (p->cpos[0] == -1 || p->cpos[1] == -1 || p->cpos[2] == -1 || p->cpos[3] == -1) { bwt_2occ4(query, p->qk - 1, p->ql, qcntk, qcntl); for (qj = 0; qj != 4; ++qj) { // descend to the prefix trie if (p->cpos[qj] != -1) continue; // this node will be visited later k = query->L2[qj] + qcntk[qj] + 1; l = query->L2[qj] + qcntl[qj]; if (k > l) { p->cpos[qj] = -2; continue; } x = push_array_p(v); p = v->array + i; // p may not point to the correct position after realloc x->G = x->I = x->D = MINUS_INF; x->qk = k; x->ql = l; x->pj = qj; x->qlen = p->qlen + 1; x->ppos = i; x->tlen = p->tlen; x->cpos[0] = x->cpos[1] = x->cpos[2] = x->cpos[3] = -1; p->cpos[qj] = v->n++; } // ~for(qj) } // ~if(p->cpos[]) } // ~if } // ~for(i) if 
(u->n) save_hits(target, opt->t, b->hits, u); { // push u to the stack (or to the pending array) uint32_t cnt, pos; cnt = (uint32_t)kh_value(chash, iter); pos = kh_value(chash, iter)>>32; if (pos) { // something in the pending array, then merge bsw2entry_t *w = kv_A(stack->pending, pos-1); if (u->n) { if (w->n < u->n) { // swap w = u; u = kv_A(stack->pending, pos-1); kv_A(stack->pending, pos-1) = w; } merge_entry(opt, w, u, b); } if (cnt == 0) { // move from pending to stack0 remove_duplicate(w, rhash); save_narrow_hits(target, w, b1, opt->t, opt->is); cut_tail(w, opt->z, u); stack_push0(stack, w); kv_A(stack->pending, pos-1) = 0; --stack->n_pending; } mp_free(stack->pool, u); } else if (cnt) { // the first time if (u->n) { // push to the pending queue ++stack->n_pending; kv_push(bsw2entry_p, stack->pending, u); kh_value(chash, iter) = (uint64_t)kv_size(stack->pending)<<32 | cnt; } else mp_free(stack->pool, u); } else { // cnt == 0, then push to the stack bsw2entry_t *w = mp_alloc(stack->pool); save_narrow_hits(target, u, b1, opt->t, opt->is); cut_tail(u, opt->z, w); mp_free(stack->pool, w); stack_push0(stack, u); } } } // ~for(tj) mp_free(stack->pool, v); } // while(top) getrusage(0, &curr); for (i = 0; i < 2; ++i) for (j = 0; j < b_ret[i]->n; ++j) b_ret[i]->hits[j].n_seeds = 0; bsw2_resolve_duphits(bns, query, b, opt->is); bsw2_resolve_duphits(bns, query, b1, opt->is); //fprintf(stderr, "stats: %.3lf sec; %d elems\n", time_elapse(&curr, &last), n_tot); // free free(heap); kh_destroy(qintv, rhash); kh_destroy(64, chash); stack->pending.n = stack->stack0.n = 0; return b_ret; } bwa-0.7.17/bwtsw2_main.c000066400000000000000000000073521317342117100147660ustar00rootroot00000000000000#include #include #include #include #include #include "bwt.h" #include "bwtsw2.h" #include "utils.h" #include "bwa.h" int bwa_bwtsw2(int argc, char *argv[]) { bsw2opt_t *opt; bwaidx_t *idx; int c; opt = bsw2_init_opt(); srand48(11); while ((c = getopt(argc, argv, 
"q:r:a:b:t:T:w:d:z:m:s:c:N:Hf:MI:SG:C")) >= 0) { switch (c) { case 'q': opt->q = atoi(optarg); break; case 'r': opt->r = atoi(optarg); break; case 'a': opt->a = atoi(optarg); break; case 'b': opt->b = atoi(optarg); break; case 'w': opt->bw = atoi(optarg); break; case 'T': opt->t = atoi(optarg); break; case 't': opt->n_threads = atoi(optarg); break; case 'z': opt->z = atoi(optarg); break; case 's': opt->is = atoi(optarg); break; case 'm': opt->mask_level = atof(optarg); break; case 'c': opt->coef = atof(optarg); break; case 'N': opt->t_seeds = atoi(optarg); break; case 'M': opt->multi_2nd = 1; break; case 'H': opt->hard_clip = 1; break; case 'f': xreopen(optarg, "w", stdout); break; case 'I': opt->max_ins = atoi(optarg); break; case 'S': opt->skip_sw = 1; break; case 'C': opt->cpy_cmt = 1; break; case 'G': opt->max_chain_gap = atoi(optarg); break; default: return 1; } } opt->qr = opt->q + opt->r; if (optind + 2 > argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: bwa bwasw [options] [query2.fa]\n\n"); fprintf(stderr, "Options: -a INT score for a match [%d]\n", opt->a); fprintf(stderr, " -b INT mismatch penalty [%d]\n", opt->b); fprintf(stderr, " -q INT gap open penalty [%d]\n", opt->q); fprintf(stderr, " -r INT gap extension penalty [%d]\n", opt->r); fprintf(stderr, " -w INT band width [%d]\n", opt->bw); fprintf(stderr, " -m FLOAT mask level [%.2f]\n", opt->mask_level); fprintf(stderr, "\n"); fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); fprintf(stderr, " -f FILE file to output results to instead of stdout\n"); fprintf(stderr, " -H in SAM output, use hard clipping instead of soft clipping\n"); fprintf(stderr, " -C copy FASTA/Q comment to SAM output\n"); fprintf(stderr, " -M mark multi-part alignments as secondary\n"); fprintf(stderr, " -S skip Smith-Waterman read pairing\n"); fprintf(stderr, " -I INT ignore pairs with insert >=INT for inferring the size distr [%d]\n", opt->max_ins); fprintf(stderr, "\n"); fprintf(stderr, " -T INT score 
threshold divided by a [%d]\n", opt->t); fprintf(stderr, " -c FLOAT coefficient of length-threshold adjustment [%.1f]\n", opt->coef); fprintf(stderr, " -z INT Z-best [%d]\n", opt->z); fprintf(stderr, " -s INT maximum seeding interval size [%d]\n", opt->is); fprintf(stderr, " -N INT # seeds to trigger rev aln; 2*INT is also the chaining threshold [%d]\n", opt->t_seeds); fprintf(stderr, " -G INT maximum gap size during chaining [%d]\n", opt->max_chain_gap); fprintf(stderr, "\n"); fprintf(stderr, "Note: For long Illumina, 454 and Sanger reads, assembly contigs, fosmids and\n"); fprintf(stderr, " BACs, the default setting usually works well. For the current PacBio\n"); fprintf(stderr, " reads (end of 2010), '-b5 -q2 -r1 -z10' is recommended. One may also\n"); fprintf(stderr, " increase '-z' for better sensitivity.\n"); fprintf(stderr, "\n"); return 1; } // adjust opt for opt->a opt->t *= opt->a; opt->coef *= opt->a; if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1; bsw2_aln(opt, idx->bns, idx->bwt, argv[optind+1], optind+2 < argc? 
argv[optind+2] : 0); bwa_idx_destroy(idx); free(opt); return 0; } bwa-0.7.17/bwtsw2_pair.c000066400000000000000000000244251317342117100147750ustar00rootroot00000000000000#include #include #include #include #include "utils.h" #include "bwt.h" #include "bntseq.h" #include "bwtsw2.h" #include "kstring.h" #include "ksw.h" #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif #define MIN_RATIO 0.8 #define OUTLIER_BOUND 2.0 #define MAX_STDDEV 4.0 #define EXT_STDDEV 4.0 typedef struct { int low, high, failed; double avg, std; } bsw2pestat_t; bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins) { int i, k, x, p25, p50, p75, tmp, max_len = 0; uint64_t *isize; bsw2pestat_t r; memset(&r, 0, sizeof(bsw2pestat_t)); isize = calloc(n, 8); for (i = k = 0; i < n; i += 2) { bsw2hit_t *t[2]; int l; if (buf[i] == 0 || buf[i]->n != 1 || buf[i+1]->n != 1) continue; // more than 1 hits t[0] = &buf[i]->hits[0]; t[1] = &buf[i+1]->hits[0]; if (t[0]->G2 > 0.8 * t[0]->G) continue; // the best hit is not good enough if (t[1]->G2 > 0.8 * t[1]->G) continue; // the best hit is not good enough l = t[0]->k > t[1]->k? t[0]->k - t[1]->k + t[1]->len : t[1]->k - t[0]->k + t[0]->len; if (l >= max_ins) continue; // skip pairs with excessively large insert max_len = max_len > t[0]->end - t[0]->beg? max_len : t[0]->end - t[0]->beg; max_len = max_len > t[1]->end - t[1]->beg? max_len : t[1]->end - t[1]->beg; isize[k++] = l; } ks_introsort_64(k, isize); p25 = isize[(int)(.25 * k + .499)]; p50 = isize[(int)(.50 * k + .499)]; p75 = isize[(int)(.75 * k + .499)]; ksprintf(msg, "[%s] infer the insert size distribution from %d high-quality pairs.\n", __func__, k); if (k < 8) { ksprintf(msg, "[%s] fail to infer the insert size distribution: too few good pairs.\n", __func__); free(isize); r.failed = 1; return r; } tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499); r.low = tmp > max_len? 
tmp : max_len; if (r.low < 1) r.low = 1; r.high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499); if (r.low > r.high) { ksprintf(msg, "[%s] fail to infer the insert size distribution: upper bound is smaller than max read length.\n", __func__); free(isize); r.failed = 1; return r; } ksprintf(msg, "[%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75); ksprintf(msg, "[%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r.low, r.high); for (i = x = 0, r.avg = 0; i < k; ++i) if (isize[i] >= r.low && isize[i] <= r.high) r.avg += isize[i], ++x; if (x == 0) { ksprintf(msg, "[%s] fail to infer the insert size distribution: no pairs within boundaries.\n", __func__); free(isize); r.failed = 1; return r; } r.avg /= x; for (i = 0, r.std = 0; i < k; ++i) if (isize[i] >= r.low && isize[i] <= r.high) r.std += (isize[i] - r.avg) * (isize[i] - r.avg); r.std = sqrt(r.std / x); ksprintf(msg, "[%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r.avg, r.std); tmp = (int)(p25 - 3. * (p75 - p25) + .499); r.low = tmp > max_len? tmp : max_len; if (r.low < 1) r.low = 1; r.high = (int)(p75 + 3. * (p75 - p25) + .499); if (r.low > r.avg - MAX_STDDEV * r.std) r.low = (int)(r.avg - MAX_STDDEV * r.std + .499); r.low = tmp > max_len? 
tmp : max_len; if (r.high < r.avg + MAX_STDDEV * r.std) r.high = (int)(r.avg + MAX_STDDEV * r.std + .499); ksprintf(msg, "[%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r.low, r.high); free(isize); return r; } typedef struct { int n_cigar, beg, end, len; int64_t pos; uint32_t *cigar; } pairaux_t; extern unsigned char nst_nt4_table[256]; void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const bsw2pestat_t *st, const bsw2hit_t *h, int l_mseq, const char *mseq, bsw2hit_t *a, int8_t g_mat[25]) { extern void seq_reverse(int len, ubyte_t *seq, int is_comp); int64_t k, beg, end; uint8_t *seq, *ref; int i; // compute the region start and end a->n_seeds = 1; a->flag |= BSW2_FLAG_MATESW; // before calling this routine, *a has been cleared with memset(0); the flag is set with 1<<6/7 if (h->is_rev == 0) { beg = (int64_t)(h->k + st->avg - EXT_STDDEV * st->std - l_mseq + .499); if (beg < h->k) beg = h->k; end = (int64_t)(h->k + st->avg + EXT_STDDEV * st->std + .499); a->is_rev = 1; a->flag |= 16; } else { beg = (int64_t)(h->k + h->end - h->beg - st->avg - EXT_STDDEV * st->std + .499); end = (int64_t)(h->k + h->end - h->beg - st->avg + EXT_STDDEV * st->std + l_mseq + .499); if (end > h->k + (h->end - h->beg)) end = h->k + (h->end - h->beg); a->is_rev = 0; } if (beg < 1) beg = 1; if (end > l_pac) end = l_pac; if (end - beg < l_mseq) return; // generate the sequence seq = malloc(l_mseq + (end - beg)); ref = seq + l_mseq; for (k = beg; k < end; ++k) ref[k - beg] = pac[k>>2] >> ((~k&3)<<1) & 0x3; if (h->is_rev == 0) { for (i = 0; i < l_mseq; ++i) { // on the reverse strand int c = nst_nt4_table[(int)mseq[i]]; seq[l_mseq - 1 - i] = c > 3? 4 : 3 - c; } } else { for (i = 0; i < l_mseq; ++i) // on the forward strand seq[i] = nst_nt4_table[(int)mseq[i]]; } { int flag = KSW_XSUBO | KSW_XSTART | (l_mseq * g_mat[0] < 250? 
/*
 * Paired-end post-processing for a batch of reads laid out as consecutive
 * mate pairs: hits[i] / seq[i] is read 1 and hits[i+1] / seq[i+1] is read 2.
 *
 * For every pair the function
 *   1. tags each existing hit with the read1 (1<<6) or read2 (1<<7) flag;
 *   2. when at least one mate has exactly one hit and neither has more than
 *      one, calls bsw2_pair1() to align the other mate near that hit;
 *   3. uses the result to rescue an unmapped mate, to replace a worse
 *      original hit ("fixed"), to mark tandem hits, or to move one mate so
 *      the pair becomes consistent ("moved").
 *
 * opt   - options; a/b build the scoring matrix, max_ins is handed to
 *         bsw2_stat(), skip_sw disables the bsw2_pair1() calls
 * l_pac - passed through to bsw2_pair1()
 * pac   - passed through to bsw2_pair1()
 * n     - number of entries in seq[] and hits[]; the loop reads i and i+1,
 *         so it presumably has to be even -- TODO confirm with callers
 * seq   - the reads
 * hits  - per-read hit lists, modified in place; an entry may be NULL
 *
 * A one-line summary of the counters is appended to the message produced by
 * bsw2_stat() and written to stderr.
 */
void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hits)
{
	extern int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS);
	bsw2pestat_t pes;
	int i, j, k, n_rescued = 0, n_moved = 0, n_fixed = 0;
	int8_t g_mat[25];
	kstring_t msg;
	memset(&msg, 0, sizeof(kstring_t));
	pes = bsw2_stat(n, hits, &msg, opt->max_ins);
	// 5x5 scoring matrix: opt->a on the diagonal of the first four columns,
	// -opt->b elsewhere in those columns, and 0 in the fifth column
	for (i = k = 0; i < 5; ++i) {
		for (j = 0; j < 4; ++j)
			g_mat[k++] = i == j? opt->a : -opt->b;
		g_mat[k++] = 0;
	}
	for (i = 0; i < n; i += 2) {
		// a[0] / a[1]: candidate hit for read 1 / read 2 found next to the mate's hit
		bsw2hit_t a[2];
		memset(&a, 0, sizeof(bsw2hit_t) * 2);
		a[0].flag = 1<<6; a[1].flag = 1<<7;
		for (j = 0; j < 2; ++j) { // set the read1/2 flag
			if (hits[i+j] == 0) continue;
			for (k = 0; k < hits[i+j]->n; ++k) {
				bsw2hit_t *p = &hits[i+j]->hits[k];
				p->flag |= 1<<(6+j);
			}
		}
		if (pes.failed) continue;
		if (hits[i] == 0 || hits[i+1] == 0) continue; // one end has excessive N
		if (hits[i]->n != 1 && hits[i+1]->n != 1) continue; // no end has exactly one hit
		if (hits[i]->n > 1 || hits[i+1]->n > 1) continue; // one read has more than one hit
		// from here on the hit counts are (1,0), (0,1) or (1,1)
		if (!opt->skip_sw) {
			// the hit of one read anchors the search for its mate, hence the crossed indices
			if (hits[i+0]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+0]->hits[0], seq[i+1].l, seq[i+1].seq, &a[1], g_mat);
			if (hits[i+1]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+1]->hits[0], seq[i+0].l, seq[i+0].seq, &a[0], g_mat);
		} // else a[0].G == a[1].G == a[0].G2 == a[1].G2 == 0
		// the following enumerate all possibilities. It is tedious but necessary...
		if (hits[i]->n + hits[i+1]->n == 1) { // one end mapped; the other not;
			bwtsw2_t *p[2]; // p[0]: the mapped read; p[1]: the unmapped read
			int which;      // index into a[] of the unmapped read
			if (hits[i]->n == 1) p[0] = hits[i], p[1] = hits[i+1], which = 1;
			else p[0] = hits[i+1], p[1] = hits[i], which = 0;
			if (a[which].G == 0) continue; // nothing found for the mate
			a[which].flag |= BSW2_FLAG_RESCUED;
			if (p[1]->max == 0) { // no slot allocated yet for the rescued hit
				p[1]->max = 1;
				p[1]->hits = malloc(sizeof(bsw2hit_t));
			}
			p[1]->hits[0] = a[which];
			p[1]->n = 1;
			// NOTE(review): flag bit 2 looks like the "properly paired" marker -- confirm
			p[0]->hits[0].flag |= 2;
			p[1]->hits[0].flag |= 2;
			++n_rescued;
		} else { // then both ends mapped
			int is_fixed = 0;
			//fprintf(stderr, "%d; %lld,%lld; %d,%d\n", a[0].is_rev, hits[i]->hits[0].k, a[0].k, hits[i]->hits[0].end, a[0].end);
			for (j = 0; j < 2; ++j) { // fix wrong mappings and wrong suboptimal alignment score
				bsw2hit_t *p = &hits[i+j]->hits[0];
				if (p->G < a[j].G) { // the original mapping is suboptimal
					a[j].G2 = a[j].G2 > p->G? a[j].G2 : p->G; // FIXME: reset BSW2_FLAG_TANDEM?
					*p = a[j];
					++n_fixed;
					is_fixed = 1;
				} else if (p->k != a[j].k && p->G2 < a[j].G) {
					// the candidate sits elsewhere: record its score as the suboptimal score
					p->G2 = a[j].G;
				} else if (p->k == a[j].k && p->G2 < a[j].G2) {
					// same position: take over the larger suboptimal score
					p->G2 = a[j].G2;
				}
			}
			if (hits[i]->hits[0].k == a[0].k && hits[i+1]->hits[0].k == a[1].k) { // properly paired and no ends need to be moved
				for (j = 0; j < 2; ++j)
					hits[i+j]->hits[0].flag |= 2 | (a[j].flag & BSW2_FLAG_TANDEM);
			} else if (hits[i]->hits[0].k == a[0].k || hits[i+1]->hits[0].k == a[1].k) { // a tandem match
				for (j = 0; j < 2; ++j) {
					hits[i+j]->hits[0].flag |= 2;
					if (hits[i+j]->hits[0].k != a[j].k)
						hits[i+j]->hits[0].flag |= BSW2_FLAG_TANDEM;
				}
			} else if (!is_fixed && (a[0].G || a[1].G)) { // it is possible to move one end
				if (a[0].G && a[1].G) { // now we have two "proper pairs"
					int G[2];
					double diff;
					G[0] = hits[i]->hits[0].G + a[1].G;
					G[1] = hits[i+1]->hits[0].G + a[0].G;
					// score difference of the two candidate pairs, scaled by (a+b) and the mean pair length
					diff = fabs((double)(G[0] - G[1])) / (opt->a + opt->b) / ((hits[i]->hits[0].len + a[1].len + hits[i+1]->hits[0].len + a[0].len) / 2.);
					// clearly different: zero a[0].G when G[0] is larger, a[1].G otherwise
					if (diff > 0.05) a[G[0] > G[1]? 0 : 1].G = 0;
				}
				if (a[0].G == 0 || a[1].G == 0) { // one proper pair only
					bsw2hit_t *p[2]; // p[0] points the unchanged hit; p[1] to the hit to be moved
					int which, isize;
					double dev, diff;
					if (a[0].G) p[0] = &hits[i+1]->hits[0], p[1] = &hits[i]->hits[0], which = 0;
					else p[0] = &hits[i]->hits[0], p[1] = &hits[i+1]->hits[0], which = 1;
					// insert size spanned by the unchanged hit and the candidate
					isize = p[0]->is_rev? p[0]->k + p[0]->len - a[which].k : a[which].k + a[which].len - p[0]->k;
					// deviation from the estimated mean, in units of pes.std
					dev = fabs(isize - pes.avg) / pes.std;
					// score lost by moving, per 100 query bases, in units of (a+b)
					diff = (double)(p[1]->G - a[which].G) / (opt->a + opt->b) / (p[1]->end - p[1]->beg) * 100.0;
					if (diff < dev * 2.) { // then move (heuristic)
						a[which].G2 = a[which].G;
						p[1][0] = a[which];
						p[1]->flag |= BSW2_FLAG_MOVED | 2;
						p[0]->flag |= 2;
						++n_moved;
					}
				}
			} else if (is_fixed) {
				hits[i+0]->hits[0].flag |= 2;
				hits[i+1]->hits[0].flag |= 2;
			}
		}
	}
	ksprintf(&msg, "[%s] #fixed=%d, #rescued=%d, #moved=%d\n", __func__, n_fixed, n_rescued, n_moved);
	fputs(msg.s, stderr);
	free(msg.s);
}
strerror(errno) : "Out of memory"); exit(EXIT_FAILURE); } ks = kseq_init(fp); // initialize the FASTA/Q parser opt = mem_opt_init(); // initialize the BWA-MEM parameters to the default values while (kseq_read(ks) >= 0) { // read one sequence mem_alnreg_v ar; int i, k; ar = mem_align1(opt, idx->bwt, idx->bns, idx->pac, ks->seq.l, ks->seq.s); // get all the hits for (i = 0; i < ar.n; ++i) { // traverse each hit mem_aln_t a; if (ar.a[i].secondary >= 0) continue; // skip secondary alignments a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR // print alignment printf("%s\t%c\t%s\t%ld\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, (long)a.pos, a.mapq); for (k = 0; k < a.n_cigar; ++k) // print CIGAR printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]); printf("\t%d\n", a.NM); // print edit distance free(a.cigar); // don't forget to deallocate CIGAR } free(ar.a); // and deallocate the hit list } free(opt); kseq_destroy(ks); gzclose(fp); bwa_idx_destroy(idx); return 0; } bwa-0.7.17/fastmap.c000066400000000000000000000452341317342117100141660ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include "bwa.h" #include "bwamem.h" #include "kvec.h" #include "utils.h" #include "bntseq.h" #include "kseq.h" KSEQ_DECLARE(gzFile) extern unsigned char nst_nt4_table[256]; void *kopen(const char *fn, int *_fd); int kclose(void *a); void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps); typedef struct { kseq_t *ks, *ks2; mem_opt_t *opt; mem_pestat_t *pes0; int64_t n_processed; int copy_comment, actual_chunk_size; bwaidx_t *idx; } ktp_aux_t; typedef struct { ktp_aux_t *aux; int n_seqs; bseq1_t *seqs; } ktp_data_t; static void *process(void *shared, int step, void *_data) { ktp_aux_t *aux = (ktp_aux_t*)shared; ktp_data_t *data = (ktp_data_t*)_data; int i; if (step == 0) { ktp_data_t *ret; int64_t size = 0; ret = 
calloc(1, sizeof(ktp_data_t)); ret->seqs = bseq_read(aux->actual_chunk_size, &ret->n_seqs, aux->ks, aux->ks2); if (ret->seqs == 0) { free(ret); return 0; } if (!aux->copy_comment) for (i = 0; i < ret->n_seqs; ++i) { free(ret->seqs[i].comment); ret->seqs[i].comment = 0; } for (i = 0; i < ret->n_seqs; ++i) size += ret->seqs[i].l_seq; if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, ret->n_seqs, (long)size); return ret; } else if (step == 1) { const mem_opt_t *opt = aux->opt; const bwaidx_t *idx = aux->idx; if (opt->flag & MEM_F_SMARTPE) { bseq1_t *sep[2]; int n_sep[2]; mem_opt_t tmp_opt = *opt; bseq_classify(data->n_seqs, data->seqs, n_sep, sep); if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] %d single-end sequences; %d paired-end sequences\n", __func__, n_sep[0], n_sep[1]); if (n_sep[0]) { tmp_opt.flag &= ~MEM_F_PE; mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, aux->n_processed, n_sep[0], sep[0], 0); for (i = 0; i < n_sep[0]; ++i) data->seqs[sep[0][i].id].sam = sep[0][i].sam; } if (n_sep[1]) { tmp_opt.flag |= MEM_F_PE; mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, aux->n_processed + n_sep[0], n_sep[1], sep[1], aux->pes0); for (i = 0; i < n_sep[1]; ++i) data->seqs[sep[1][i].id].sam = sep[1][i].sam; } free(sep[0]); free(sep[1]); } else mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, aux->n_processed, data->n_seqs, data->seqs, aux->pes0); aux->n_processed += data->n_seqs; return data; } else if (step == 2) { for (i = 0; i < data->n_seqs; ++i) { if (data->seqs[i].sam) err_fputs(data->seqs[i].sam, stdout); free(data->seqs[i].name); free(data->seqs[i].comment); free(data->seqs[i].seq); free(data->seqs[i].qual); free(data->seqs[i].sam); } free(data->seqs); free(data); return 0; } return 0; } static void update_a(mem_opt_t *opt, const mem_opt_t *opt0) { if (opt0->a) { // matching score is changed if (!opt0->b) opt->b *= opt->a; if (!opt0->T) opt->T *= opt->a; if (!opt0->o_del) opt->o_del *= opt->a; if 
(!opt0->e_del) opt->e_del *= opt->a; if (!opt0->o_ins) opt->o_ins *= opt->a; if (!opt0->e_ins) opt->e_ins *= opt->a; if (!opt0->zdrop) opt->zdrop *= opt->a; if (!opt0->pen_clip5) opt->pen_clip5 *= opt->a; if (!opt0->pen_clip3) opt->pen_clip3 *= opt->a; if (!opt0->pen_unpaired) opt->pen_unpaired *= opt->a; } } int main_mem(int argc, char *argv[]) { mem_opt_t *opt, opt0; int fd, fd2, i, c, ignore_alt = 0, no_mt_io = 0; int fixed_chunk_size = -1; gzFile fp, fp2 = 0; char *p, *rg_line = 0, *hdr_line = 0; const char *mode = 0; void *ko = 0, *ko2 = 0; mem_pestat_t pes[4]; ktp_aux_t aux; memset(&aux, 0, sizeof(ktp_aux_t)); memset(pes, 0, 4 * sizeof(mem_pestat_t)); for (i = 0; i < 4; ++i) pes[i].failed = 1; aux.opt = opt = mem_opt_init(); memset(&opt0, 0, sizeof(mem_opt_t)); while ((c = getopt(argc, argv, "51qpaMCSPVYjk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:o:f:W:x:G:h:y:K:X:H:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1; else if (c == '1') no_mt_io = 1; else if (c == 'x') mode = optarg; else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1; else if (c == 'A') opt->a = atoi(optarg), opt0.a = 1; else if (c == 'B') opt->b = atoi(optarg), opt0.b = 1; else if (c == 'T') opt->T = atoi(optarg), opt0.T = 1; else if (c == 'U') opt->pen_unpaired = atoi(optarg), opt0.pen_unpaired = 1; else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? 
opt->n_threads : 1; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; else if (c == 'a') opt->flag |= MEM_F_ALL; else if (c == 'p') opt->flag |= MEM_F_PE | MEM_F_SMARTPE; else if (c == 'M') opt->flag |= MEM_F_NO_MULTI; else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE; else if (c == 'Y') opt->flag |= MEM_F_SOFTCLIP; else if (c == 'V') opt->flag |= MEM_F_REF_HDR; else if (c == '5') opt->flag |= MEM_F_PRIMARY5 | MEM_F_KEEP_SUPP_MAPQ; // always apply MEM_F_KEEP_SUPP_MAPQ with -5 else if (c == 'q') opt->flag |= MEM_F_KEEP_SUPP_MAPQ; else if (c == 'c') opt->max_occ = atoi(optarg), opt0.max_occ = 1; else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1; else if (c == 'v') bwa_verbose = atoi(optarg); else if (c == 'j') ignore_alt = 1; else if (c == 'r') opt->split_factor = atof(optarg), opt0.split_factor = 1.; else if (c == 'D') opt->drop_ratio = atof(optarg), opt0.drop_ratio = 1.; else if (c == 'm') opt->max_matesw = atoi(optarg), opt0.max_matesw = 1; else if (c == 's') opt->split_width = atoi(optarg), opt0.split_width = 1; else if (c == 'G') opt->max_chain_gap = atoi(optarg), opt0.max_chain_gap = 1; else if (c == 'N') opt->max_chain_extend = atoi(optarg), opt0.max_chain_extend = 1; else if (c == 'o' || c == 'f') xreopen(optarg, "wb", stdout); else if (c == 'W') opt->min_chain_weight = atoi(optarg), opt0.min_chain_weight = 1; else if (c == 'y') opt->max_mem_intv = atol(optarg), opt0.max_mem_intv = 1; else if (c == 'C') aux.copy_comment = 1; else if (c == 'K') fixed_chunk_size = atoi(optarg); else if (c == 'X') opt->mask_level = atof(optarg); else if (c == 'h') { opt0.max_XA_hits = opt0.max_XA_hits_alt = 1; opt->max_XA_hits = opt->max_XA_hits_alt = strtol(optarg, &p, 10); if (*p != 0 && ispunct(*p) && isdigit(p[1])) opt->max_XA_hits_alt = strtol(p+1, &p, 10); } else if (c == 'Q') { opt0.mapQ_coef_len = 1; opt->mapQ_coef_len = atoi(optarg); opt->mapQ_coef_fac = opt->mapQ_coef_len > 0? 
log(opt->mapQ_coef_len) : 0; } else if (c == 'O') { opt0.o_del = opt0.o_ins = 1; opt->o_del = opt->o_ins = strtol(optarg, &p, 10); if (*p != 0 && ispunct(*p) && isdigit(p[1])) opt->o_ins = strtol(p+1, &p, 10); } else if (c == 'E') { opt0.e_del = opt0.e_ins = 1; opt->e_del = opt->e_ins = strtol(optarg, &p, 10); if (*p != 0 && ispunct(*p) && isdigit(p[1])) opt->e_ins = strtol(p+1, &p, 10); } else if (c == 'L') { opt0.pen_clip5 = opt0.pen_clip3 = 1; opt->pen_clip5 = opt->pen_clip3 = strtol(optarg, &p, 10); if (*p != 0 && ispunct(*p) && isdigit(p[1])) opt->pen_clip3 = strtol(p+1, &p, 10); } else if (c == 'R') { if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak } else if (c == 'H') { if (optarg[0] != '@') { FILE *fp; if ((fp = fopen(optarg, "r")) != 0) { char *buf; buf = calloc(1, 0x10000); while (fgets(buf, 0xffff, fp)) { i = strlen(buf); assert(buf[i-1] == '\n'); // a long line buf[i-1] = 0; hdr_line = bwa_insert_header(buf, hdr_line); } free(buf); fclose(fp); } } else hdr_line = bwa_insert_header(optarg, hdr_line); } else if (c == 'I') { // specify the insert size distribution aux.pes0 = pes; pes[1].failed = 0; pes[1].avg = strtod(optarg, &p); pes[1].std = pes[1].avg * .1; if (*p != 0 && ispunct(*p) && isdigit(p[1])) pes[1].std = strtod(p+1, &p); pes[1].high = (int)(pes[1].avg + 4. * pes[1].std + .499); pes[1].low = (int)(pes[1].avg - 4. 
* pes[1].std + .499); if (pes[1].low < 1) pes[1].low = 1; if (*p != 0 && ispunct(*p) && isdigit(p[1])) pes[1].high = (int)(strtod(p+1, &p) + .499); if (*p != 0 && ispunct(*p) && isdigit(p[1])) pes[1].low = (int)(strtod(p+1, &p) + .499); if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] mean insert size: %.3f, stddev: %.3f, max: %d, min: %d\n", __func__, pes[1].avg, pes[1].std, pes[1].high, pes[1].low); } else return 1; } if (rg_line) { hdr_line = bwa_insert_header(rg_line, hdr_line); free(rg_line); } if (opt->n_threads < 1) opt->n_threads = 1; if (optind + 1 >= argc || optind + 3 < argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n\n"); fprintf(stderr, "Algorithm options:\n\n"); fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len); fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w); fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); fprintf(stderr, " -y INT seed occurrence for the 3rd round seeding [%ld]\n", (long)opt->max_mem_intv); // fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->drop_ratio); fprintf(stderr, " -W INT discard a chain if seeded bases shorter than INT [0]\n"); fprintf(stderr, " -m INT perform at most INT rounds of mate rescues for each read [%d]\n", opt->max_matesw); fprintf(stderr, " -S skip mate rescue\n"); fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n"); fprintf(stderr, "\nScoring options:\n\n"); fprintf(stderr, " -A INT score for a sequence match, which scales 
options -TdBOELU unless overridden [%d]\n", opt->a); fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b); fprintf(stderr, " -O INT[,INT] gap open penalties for deletions and insertions [%d,%d]\n", opt->o_del, opt->o_ins); fprintf(stderr, " -E INT[,INT] gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [%d,%d]\n", opt->e_del, opt->e_ins); fprintf(stderr, " -L INT[,INT] penalty for 5'- and 3'-end clipping [%d,%d]\n", opt->pen_clip5, opt->pen_clip3); fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n\n", opt->pen_unpaired); fprintf(stderr, " -x STR read type. Setting -x changes multiple parameters unless overridden [null]\n"); fprintf(stderr, " pacbio: -k17 -W40 -r10 -A1 -B1 -O1 -E1 -L0 (PacBio reads to ref)\n"); fprintf(stderr, " ont2d: -k14 -W20 -r10 -A1 -B1 -O1 -E1 -L0 (Oxford Nanopore 2D-reads to ref)\n"); fprintf(stderr, " intractg: -B9 -O16 -L5 (intra-species contigs to ref)\n"); fprintf(stderr, "\nInput/output options:\n\n"); fprintf(stderr, " -p smart pairing (ignoring in2.fq)\n"); fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); fprintf(stderr, " -H STR/FILE insert STR to header if it starts with @; or insert lines in FILE [null]\n"); fprintf(stderr, " -o FILE sam file to output results to [stdout]\n"); fprintf(stderr, " -j treat ALT contigs as part of the primary assembly (i.e. 
ignore .alt file)\n"); fprintf(stderr, " -5 for split alignment, take the alignment with the smallest coordinate as primary\n"); fprintf(stderr, " -q don't modify mapQ of supplementary alignments\n"); fprintf(stderr, " -K INT process INT input bases in each batch regardless of nThreads (for reproducibility) []\n"); fprintf(stderr, "\n"); fprintf(stderr, " -v INT verbosity level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose); fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T); fprintf(stderr, " -h INT[,INT] if there are 80%% of the max score, output all in XA [%d,%d]\n", opt->max_XA_hits, opt->max_XA_hits_alt); fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); fprintf(stderr, " -V output the reference FASTA header in the XR tag\n"); fprintf(stderr, " -Y use soft clipping for supplementary alignments\n"); fprintf(stderr, " -M mark shorter split hits as secondary\n\n"); fprintf(stderr, " -I FLOAT[,FLOAT[,INT[,INT]]]\n"); fprintf(stderr, " specify the mean, standard deviation (10%% of the mean if absent), max\n"); fprintf(stderr, " (4 sigma from the mean if absent) and min of the insert size distribution.\n"); fprintf(stderr, " FR orientation only. [inferred]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Note: Please read the man page for detailed description of the command line and options.\n"); fprintf(stderr, "\n"); free(opt); return 1; } if (mode) { if (strcmp(mode, "intractg") == 0) { if (!opt0.o_del) opt->o_del = 16; if (!opt0.o_ins) opt->o_ins = 16; if (!opt0.b) opt->b = 9; if (!opt0.pen_clip5) opt->pen_clip5 = 5; if (!opt0.pen_clip3) opt->pen_clip3 = 5; } else if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "ont2d") == 0) { if (!opt0.o_del) opt->o_del = 1; if (!opt0.e_del) opt->e_del = 1; if (!opt0.o_ins) opt->o_ins = 1; if (!opt0.e_ins) opt->e_ins = 1; if (!opt0.b) opt->b = 1; if (opt0.split_factor == 0.) 
opt->split_factor = 10.; if (strcmp(mode, "ont2d") == 0) { if (!opt0.min_chain_weight) opt->min_chain_weight = 20; if (!opt0.min_seed_len) opt->min_seed_len = 14; if (!opt0.pen_clip5) opt->pen_clip5 = 0; if (!opt0.pen_clip3) opt->pen_clip3 = 0; } else { if (!opt0.min_chain_weight) opt->min_chain_weight = 40; if (!opt0.min_seed_len) opt->min_seed_len = 17; if (!opt0.pen_clip5) opt->pen_clip5 = 0; if (!opt0.pen_clip3) opt->pen_clip3 = 0; } } else { fprintf(stderr, "[E::%s] unknown read type '%s'\n", __func__, mode); return 1; // FIXME memory leak } } else update_a(opt, &opt0); bwa_fill_scmat(opt->a, opt->b, opt->mat); aux.idx = bwa_idx_load_from_shm(argv[optind]); if (aux.idx == 0) { if ((aux.idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak } else if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] load the bwa index from shared memory\n", __func__); if (ignore_alt) for (i = 0; i < aux.idx->bns->n_seqs; ++i) aux.idx->bns->anns[i].is_alt = 0; ko = kopen(argv[optind + 1], &fd); if (ko == 0) { if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 1]); return 1; } fp = gzdopen(fd, "r"); aux.ks = kseq_init(fp); if (optind + 2 < argc) { if (opt->flag&MEM_F_PE) { if (bwa_verbose >= 2) fprintf(stderr, "[W::%s] when '-p' is in use, the second query file is ignored.\n", __func__); } else { ko2 = kopen(argv[optind + 2], &fd2); if (ko2 == 0) { if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 2]); return 1; } fp2 = gzdopen(fd2, "r"); aux.ks2 = kseq_init(fp2); opt->flag |= MEM_F_PE; } } bwa_print_sam_hdr(aux.idx->bns, hdr_line); aux.actual_chunk_size = fixed_chunk_size > 0? fixed_chunk_size : opt->chunk_size * opt->n_threads; kt_pipeline(no_mt_io? 
1 : 2, process, &aux, 3); free(hdr_line); free(opt); bwa_idx_destroy(aux.idx); kseq_destroy(aux.ks); err_gzclose(fp); kclose(ko); if (aux.ks2) { kseq_destroy(aux.ks2); err_gzclose(fp2); kclose(ko2); } return 0; } int main_fastmap(int argc, char *argv[]) { int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, min_intv = 1, max_len = INT_MAX; uint64_t max_intv = 0; kseq_t *seq; bwtint_t k; gzFile fp; smem_i *itr; const bwtintv_v *a; bwaidx_t *idx; while ((c = getopt(argc, argv, "w:l:pi:I:L:")) >= 0) { switch (c) { case 'p': print_seq = 1; break; case 'w': min_iwidth = atoi(optarg); break; case 'l': min_len = atoi(optarg); break; case 'i': min_intv = atoi(optarg); break; case 'I': max_intv = atol(optarg); break; case 'L': max_len = atoi(optarg); break; default: return 1; } } if (optind + 1 >= argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: bwa fastmap [options] \n\n"); fprintf(stderr, "Options: -l INT min SMEM length to output [%d]\n", min_len); fprintf(stderr, " -w INT max interval size to find coordiantes [%d]\n", min_iwidth); fprintf(stderr, " -i INT min SMEM interval size [%d]\n", min_intv); fprintf(stderr, " -L INT max MEM length [%d]\n", max_len); fprintf(stderr, " -I INT stop if MEM is longer than -l with a size less than INT [%ld]\n", (long)max_intv); fprintf(stderr, "\n"); return 1; } fp = xzopen(argv[optind + 1], "r"); seq = kseq_init(fp); if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1; itr = smem_itr_init(idx->bwt); smem_config(itr, min_intv, max_len, max_intv); while (kseq_read(seq) >= 0) { err_printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l); if (print_seq) { err_putchar('\t'); err_puts(seq->seq.s); } else err_putchar('\n'); for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; smem_set_query(itr, seq->seq.l, (uint8_t*)seq->seq.s); while ((a = smem_next(itr)) != 0) { for (i = 0; i < a->n; ++i) { bwtintv_t *p = &a->a[i]; if ((uint32_t)p->info - (p->info>>32) < min_len) continue; 
err_printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]); if (p->x[2] <= min_iwidth) { for (k = 0; k < p->x[2]; ++k) { bwtint_t pos; int len, is_rev, ref_id; len = (uint32_t)p->info - (p->info>>32); pos = bns_depos(idx->bns, bwt_sa(idx->bwt, p->x[0] + k), &is_rev); if (is_rev) pos -= len - 1; bns_cnt_ambi(idx->bns, pos, len, &ref_id); err_printf("\t%s:%c%ld", idx->bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - idx->bns->anns[ref_id].offset) + 1); } } else err_puts("\t*"); err_putchar('\n'); } } err_puts("//"); } smem_itr_destroy(itr); bwa_idx_destroy(idx); kseq_destroy(seq); err_gzclose(fp); return 0; } bwa-0.7.17/is.c000066400000000000000000000150221317342117100131360ustar00rootroot00000000000000/* * sais.c for sais-lite * Copyright (c) 2008 Yuta Mori All Rights Reserved. * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, * copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. 
#include <stdlib.h>

#ifdef USE_MALLOC_WRAPPERS
#  include "malloc_wrap.h"
#endif

typedef unsigned char ubyte_t;
/* Read symbol i of T: T is an int array when cs == sizeof(int), a byte array otherwise. */
#define chr(i) (cs == sizeof(int) ? ((const int *)T)[i]:((const unsigned char *)T)[i])

/* find the start or end of each bucket */
/* Count the occurrences of each of the k symbols in T[0..n-1] into C[0..k-1]. */
static void getCounts(const unsigned char *T, int *C, int n, int k, int cs)
{
	int i;
	for (i = 0; i < k; ++i) C[i] = 0;
	for (i = 0; i < n; ++i) ++C[chr(i)];
}

/*
 * Turn the counts C into bucket boundaries B: one past the last slot of each
 * bucket when end != 0, the first slot otherwise. C and B may be the same
 * array; C[i] is read before B[i] is written.
 */
static void getBuckets(const int *C, int *B, int k, int end)
{
	int i, sum = 0;
	if (end) {
		for (i = 0; i < k; ++i) {
			sum += C[i];
			B[i] = sum;
		}
	} else {
		for (i = 0; i < k; ++i) {
			sum += C[i];
			B[i] = sum - C[i];
		}
	}
}

/* compute SA */
/*
 * Induced-sorting step: a left-to-right scan fills buckets from their starts
 * ("SAl"), then a right-to-left scan fills them from their ends ("SAs").
 * Entries are stored bit-complemented (~j) as temporary markers while the
 * scans run. When C == B the counts are recomputed before each scan because
 * getBuckets() overwrote them.
 */
static void induceSA(const unsigned char *T, int *SA, int *C, int *B, int n, int k, int cs)
{
	int *b, i, j;
	int  c0, c1;
	/* compute SAl */
	if (C == B) getCounts(T, C, n, k, cs);
	getBuckets(C, B, k, 0);	/* find starts of buckets */
	j = n - 1;
	b = SA + B[c1 = chr(j)];
	*b++ = ((0 < j) && (chr(j - 1) < c1)) ? ~j : j;
	for (i = 0; i < n; ++i) {
		j = SA[i], SA[i] = ~j;
		if (0 < j) {
			--j;
			if ((c0 = chr(j)) != c1) {
				B[c1] = b - SA;
				b = SA + B[c1 = c0];
			}
			*b++ = ((0 < j) && (chr(j - 1) < c1)) ? ~j : j;
		}
	}
	/* compute SAs */
	if (C == B) getCounts(T, C, n, k, cs);
	getBuckets(C, B, k, 1);	/* find ends of buckets */
	for (i = n - 1, b = SA + B[c1 = 0]; 0 <= i; --i) {
		if (0 < (j = SA[i])) {
			--j;
			if ((c0 = chr(j)) != c1) {
				B[c1] = b - SA;
				b = SA + B[c1 = c0];
			}
			*--b = ((j == 0) || (chr(j - 1) > c1)) ? ~j : j;
		} else SA[i] = ~j;
	}
}

/*
 * find the suffix array SA of T[0..n-1] in {0..k-1}^n use a working
 * space (excluding T and SA) of at most 2n+O(1) for a constant alphabet
 *
 * fs is the number of spare ints available after SA[n-1]; the bucket arrays
 * live there when k <= fs and are malloc'ed otherwise. cs is the symbol
 * width in bytes (see chr). Returns 0 on success, -2 when an allocation or
 * the recursive call fails.
 */
static int sais_main(const unsigned char *T, int *SA, int fs, int n, int k, int cs)
{
	int *C, *B, *RA;
	int  i, j, c, m, p, q, plen, qlen, name;
	int  c0, c1;
	int  diff;

	/* stage 1: reduce the problem by at least 1/2 sort all the
	 * S-substrings */
	if (k <= fs) {
		C = SA + n;
		B = (k <= (fs - k)) ? C + k : C;
	} else if ((C = B = (int *) malloc(k * sizeof(int))) == NULL) return -2;
	getCounts(T, C, n, k, cs);
	getBuckets(C, B, k, 1);	/* find ends of buckets */
	for (i = 0; i < n; ++i) SA[i] = 0;
	for (i = n - 2, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
		if ((c0 = chr(i)) < (c1 + c)) c = 1;
		else if (c != 0) SA[--B[c1]] = i + 1, c = 0;
	}
	induceSA(T, SA, C, B, n, k, cs);
	if (fs < k) free(C);
	/* compact all the sorted substrings into the first m items of SA
	 * 2*m must be not larger than n (proveable) */
	for (i = 0, m = 0; i < n; ++i) {
		p = SA[i];
		if ((0 < p) && (chr(p - 1) > (c0 = chr(p)))) {
			for (j = p + 1; (j < n) && (c0 == (c1 = chr(j))); ++j);
			if ((j < n) && (c0 < c1)) SA[m++] = p;
		}
	}
	for (i = m; i < n; ++i) SA[i] = 0;	/* init the name array buffer */
	/* store the length of all substrings */
	for (i = n - 2, j = n, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
		if ((c0 = chr(i)) < (c1 + c)) c = 1;
		else if (c != 0) {
			SA[m + ((i + 1) >> 1)] = j - i - 1;
			j = i + 1;
			c = 0;
		}
	}
	/* find the lexicographic names of all substrings */
	for (i = 0, name = 0, q = n, qlen = 0; i < m; ++i) {
		p = SA[i], plen = SA[m + (p >> 1)], diff = 1;
		if (plen == qlen) {
			for (j = 0; (j < plen) && (chr(p + j) == chr(q + j)); j++);
			if (j == plen) diff = 0;
		}
		if (diff != 0) ++name, q = p, qlen = plen;
		SA[m + (p >> 1)] = name;
	}

	/* stage 2: solve the reduced problem recurse if names are not yet
	 * unique */
	if (name < m) {
		RA = SA + n + fs - m;
		for (i = n - 1, j = m - 1; m <= i; --i) {
			if (SA[i] != 0) RA[j--] = SA[i] - 1;
		}
		if (sais_main((unsigned char *) RA, SA, fs + n - m * 2, m, name, sizeof(int)) != 0) return -2;
		for (i = n - 2, j = m - 1, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
			if ((c0 = chr(i)) < (c1 + c)) c = 1;
			else if (c != 0) RA[j--] = i + 1, c = 0; /* get p1 */
		}
		for (i = 0; i < m; ++i) SA[i] = RA[SA[i]]; /* get index */
	}

	/* stage 3: induce the result for the original problem */
	if (k <= fs) {
		C = SA + n;
		B = (k <= (fs - k)) ? C + k : C;
	} else if ((C = B = (int *) malloc(k * sizeof(int))) == NULL) return -2;
	/* put all left-most S characters into their buckets */
	getCounts(T, C, n, k, cs);
	getBuckets(C, B, k, 1);	/* find ends of buckets */
	for (i = m; i < n; ++i) SA[i] = 0; /* init SA[m..n-1] */
	for (i = m - 1; 0 <= i; --i) {
		j = SA[i], SA[i] = 0;
		SA[--B[chr(j)]] = j;
	}
	induceSA(T, SA, C, B, n, k, cs);
	if (fs < k) free(C);
	return 0;
}

/**
 * Constructs the suffix array of a given string.
 * SA[0] is always set to n (the empty suffix); SA[1..n] receive the sorted
 * suffix start positions.
 * @param T[0..n-1] The input string.
 * @param SA[0..n] The output array of suffixes.
 * @param n The length of the given string.
 * @return 0 if no error occurred, -1 on invalid arguments, -2 if the
 *         construction itself failed
 */
int is_sa(const ubyte_t *T, int *SA, int n)
{
	if ((T == NULL) || (SA == NULL) || (n < 0)) return -1;
	SA[0] = n;
	if (n <= 1) {
		if (n == 1) SA[1] = 0;
		return 0;
	}
	return sais_main(T, SA+1, 0, n, 256, 1);
}

/**
 * Constructs the burrows-wheeler transformed string of a given string.
 * T is overwritten in place with the n transformed symbols; the slot of the
 * implicit end-of-string sentinel is not stored, its row is the return value.
 * @param T[0..n-1] The input string.
 * @param n The length of the given string.
 * @return The primary index if no error occurred, -1 otherwise (invalid
 *         arguments, out of memory, or suffix-array construction failure).
 */
int is_bwt(ubyte_t *T, int n)
{
	int *SA, i, primary = 0;
	/* reject bad arguments before n + 1 is used as an allocation size */
	if (T == NULL || n < 0) return -1;
	SA = (int*)calloc((size_t)n + 1, sizeof(int));
	if (SA == NULL) return -1;
	if (is_sa(T, SA, n)) {
		free(SA); /* the work array was leaked on this path */
		return -1;
	}

	/* replace each suffix position by the symbol preceding it; the suffix
	 * starting at 0 is preceded by the sentinel, so remember its row */
	for (i = 0; i <= n; ++i) {
		if (SA[i] == 0) primary = i;
		else SA[i] = T[SA[i] - 1];
	}
	/* copy back, skipping the sentinel row */
	for (i = 0; i < primary; ++i) T[i] = SA[i];
	for (; i < n; ++i) T[i] = SA[i + 1];
	free(SA);
	return primary;
}
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef __AC_KBTREE_H #define __AC_KBTREE_H #include #include #include #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif typedef struct { int32_t is_internal:1, n:31; } kbnode_t; #define __KB_KEY(type, x) ((type*)((char*)x + 4)) #define __KB_PTR(btr, x) ((kbnode_t**)((char*)x + btr->off_ptr)) #define __KB_TREE_T(name) \ typedef struct { \ kbnode_t *root; \ int off_key, off_ptr, ilen, elen; \ int n, t; \ int n_keys, n_nodes; \ } kbtree_##name##_t; #define __KB_INIT(name, key_t) \ kbtree_##name##_t *kb_init_##name(int size) \ { \ kbtree_##name##_t *b; \ b = (kbtree_##name##_t*)calloc(1, sizeof(kbtree_##name##_t)); \ b->t = ((size - 4 - sizeof(void*)) / (sizeof(void*) + sizeof(key_t)) + 1) >> 1; \ if (b->t < 2) { \ free(b); return 0; \ } \ b->n = 2 * b->t - 1; \ b->off_ptr = 4 + b->n * sizeof(key_t); \ b->ilen = (4 + sizeof(void*) + b->n * (sizeof(void*) + sizeof(key_t)) + 3) >> 2 << 2; \ b->elen = (b->off_ptr + 3) >> 2 << 2; \ b->root = (kbnode_t*)calloc(1, b->ilen); \ ++b->n_nodes; \ return b; \ } 
/* Frees every node of tree b, then b itself. Nodes are visited with an
 * explicit heap-allocated stack that doubles when full; a null b is accepted.
 * NOTE(review): neither the calloc nor the realloc result is checked. */
#define __kb_destroy(b) do { \
		int i, max = 8; \
		kbnode_t *x, **top, **stack = 0; \
		if (b) { \
			top = stack = (kbnode_t**)calloc(max, sizeof(kbnode_t*)); \
			*top++ = (b)->root; \
			while (top != stack) { \
				x = *--top; \
				if (x == 0 || x->is_internal == 0) { free(x); continue; } \
				for (i = 0; i <= x->n; ++i) \
					if (__KB_PTR(b, x)[i]) { \
						if (top - stack == max) { \
							max <<= 1; \
							stack = (kbnode_t**)realloc(stack, max * sizeof(kbnode_t*)); \
							top = stack + (max>>1); \
						} \
						*top++ = __KB_PTR(b, x)[i]; \
					} \
				free(x); \
			} \
		} \
		free(b); free(stack); \
	} while (0)

/* Follows child pointer 0 from the root until it is null, then stores key 0
 * of the node reached into ret.
 * NOTE(review): the loop reads the pointer slot of every node it reaches,
 * including nodes allocated with elen bytes by __kb_split, and elen does not
 * cover the pointer area -- confirm this cannot read past the allocation. */
#define __kb_get_first(key_t, b, ret) do { \
		kbnode_t *__x = (b)->root; \
		while (__KB_PTR(b, __x)[0] != 0) \
			__x = __KB_PTR(b, __x)[0]; \
		(ret) = __KB_KEY(key_t, __x)[0]; \
	} while (0)

/* Defines __kb_get_aux_<name>: picks the lower or upper half of the node by
 * comparing *k with the middle key, then scans that half backwards. Returns
 * the index where the scan stops (-1 for an empty node); the last comparison
 * result is stored through r when r is non-null. KBTREE_INIT does not
 * instantiate this variant. */
#define __KB_GET_AUX0(name, key_t, __cmp) \
	static inline int __kb_get_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \
	{ \
		int tr, *rr, begin, end, n = x->n >> 1; \
		if (x->n == 0) return -1; \
		if (__cmp(*k, __KB_KEY(key_t, x)[n]) < 0) { \
			begin = 0; end = n; \
		} else { begin = n; end = x->n - 1; } \
		rr = r? r : &tr; \
		n = end; \
		while (n >= begin && (*rr = __cmp(*k, __KB_KEY(key_t, x)[n])) < 0) --n; \
		return n; \
	}

/* Defines __kb_getp_aux_<name>: binary search for the first key that does not
 * compare less than *k. Returns that index when the key there compares equal
 * to or less than *k, otherwise the index before it (possibly -1); returns
 * x->n - 1 when every key is smaller and -1 for an empty node. When r is
 * non-null it receives 1 in the "every key is smaller" case and otherwise the
 * result of comparing *k with the key found by the search. */
#define __KB_GET_AUX1(name, key_t, __cmp) \
	static inline int __kb_getp_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \
	{ \
		int tr, *rr, begin = 0, end = x->n; \
		if (x->n == 0) return -1; \
		rr = r? r : &tr; \
		while (begin < end) { \
			int mid = (begin + end) >> 1; \
			if (__cmp(__KB_KEY(key_t, x)[mid], *k) < 0) begin = mid + 1; \
			else end = mid; \
		} \
		if (begin == x->n) { *rr = 1; return x->n - 1; } \
		if ((*rr = __cmp(*k, __KB_KEY(key_t, x)[begin])) < 0) --begin; \
		return begin; \
	}

/* Defines kb_getp_<name> (key passed by pointer) and kb_get_<name> (key
 * passed by value). Both walk down from the root and return a pointer to the
 * stored key that compares equal, or 0 when a leaf is reached without a
 * match. */
#define __KB_GET(name, key_t) \
	static key_t *kb_getp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
	{ \
		int i, r = 0; \
		kbnode_t *x = b->root; \
		while (x) { \
			i = __kb_getp_aux_##name(x, k, &r); \
			if (i >= 0 && r == 0) return &__KB_KEY(key_t, x)[i]; \
			if (x->is_internal == 0) return 0; \
			x = __KB_PTR(b, x)[i + 1]; \
		} \
		return 0; \
	} \
	static inline key_t *kb_get_##name(kbtree_##name##_t *b, const key_t k) \
	{ \
		return kb_getp_##name(b, &k); \
	}

/* Defines kb_intervalp_<name> / kb_interval_<name>. On an exact match both
 * *lower and *upper point at the matching key. Otherwise they are updated at
 * each level on the way down with the keys on either side of the search
 * position; either one stays 0 if no such key was seen. */
#define __KB_INTERVAL(name, key_t) \
	static void kb_intervalp_##name(kbtree_##name##_t *b, const key_t * __restrict k, key_t **lower, key_t **upper) \
	{ \
		int i, r = 0; \
		kbnode_t *x = b->root; \
		*lower = *upper = 0; \
		while (x) { \
			i = __kb_getp_aux_##name(x, k, &r); \
			if (i >= 0 && r == 0) { \
				*lower = *upper = &__KB_KEY(key_t, x)[i]; \
				return; \
			} \
			if (i >= 0) *lower = &__KB_KEY(key_t, x)[i]; \
			if (i < x->n - 1) *upper = &__KB_KEY(key_t, x)[i + 1]; \
			if (x->is_internal == 0) return; \
			x = __KB_PTR(b, x)[i + 1]; \
		} \
	} \
	static inline void kb_interval_##name(kbtree_##name##_t *b, const key_t k, key_t **lower, key_t **upper) \
	{ \
		kb_intervalp_##name(b, &k, lower, upper); \
	}

/* Defines the insertion routines:
 *   __kb_split_<name>    moves the upper t-1 keys (and t children) of the full
 *                        child y into a new node z, and lifts y's middle key
 *                        into parent x at index i with z as child i+1;
 *   __kb_putp_aux_<name> inserts *k below x, splitting a full child before
 *                        descending into it;
 *   kb_putp_<name>       entry point; first grows a new root when the current
 *                        root is full;
 *   kb_put_<name>        by-value wrapper.
 * n_keys is incremented on every call, with no check for an existing equal
 * key. NOTE(review): the calloc results are not checked. */
#define __KB_PUT(name, key_t, __cmp) \
	/* x must be an internal node */ \
	static void __kb_split_##name(kbtree_##name##_t *b, kbnode_t *x, int i, kbnode_t *y) \
	{ \
		kbnode_t *z; \
		z = (kbnode_t*)calloc(1, y->is_internal? b->ilen : b->elen); \
		++b->n_nodes; \
		z->is_internal = y->is_internal; \
		z->n = b->t - 1; \
		memcpy(__KB_KEY(key_t, z), __KB_KEY(key_t, y) + b->t, sizeof(key_t) * (b->t - 1)); \
		if (y->is_internal) memcpy(__KB_PTR(b, z), __KB_PTR(b, y) + b->t, sizeof(void*) * b->t); \
		y->n = b->t - 1; \
		memmove(__KB_PTR(b, x) + i + 2, __KB_PTR(b, x) + i + 1, sizeof(void*) * (x->n - i)); \
		__KB_PTR(b, x)[i + 1] = z; \
		memmove(__KB_KEY(key_t, x) + i + 1, __KB_KEY(key_t, x) + i, sizeof(key_t) * (x->n - i)); \
		__KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[b->t - 1]; \
		++x->n; \
	} \
	static void __kb_putp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k) \
	{ \
		int i = x->n - 1; \
		if (x->is_internal == 0) { \
			i = __kb_getp_aux_##name(x, k, 0); \
			if (i != x->n - 1) \
				memmove(__KB_KEY(key_t, x) + i + 2, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
			__KB_KEY(key_t, x)[i + 1] = *k; \
			++x->n; \
		} else { \
			i = __kb_getp_aux_##name(x, k, 0) + 1; \
			if (__KB_PTR(b, x)[i]->n == 2 * b->t - 1) { \
				__kb_split_##name(b, x, i, __KB_PTR(b, x)[i]); \
				if (__cmp(*k, __KB_KEY(key_t, x)[i]) > 0) ++i; \
			} \
			__kb_putp_aux_##name(b, __KB_PTR(b, x)[i], k); \
		} \
	} \
	static void kb_putp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
	{ \
		kbnode_t *r, *s; \
		++b->n_keys; \
		r = b->root; \
		if (r->n == 2 * b->t - 1) { \
			++b->n_nodes; \
			s = (kbnode_t*)calloc(1, b->ilen); \
			b->root = s; s->is_internal = 1; s->n = 0; \
			__KB_PTR(b, s)[0] = r; \
			__kb_split_##name(b, s, 0, r); \
			r = s; \
		} \
		__kb_putp_aux_##name(b, r, k); \
	} \
	static inline void kb_put_##name(kbtree_##name##_t *b, const key_t k) \
	{ \
		kb_putp_##name(b, &k); \
	}

/* Defines the deletion routines:
 *   __kb_delp_aux_<name> recursive worker. s == 0 searches for *k; s == 1
 *                        removes and returns the last key of the subtree;
 *                        s == 2 removes and returns the first key. Children
 *                        holding only t-1 keys borrow from a sibling or are
 *                        merged before the descent;
 *   kb_delp_<name>       entry point; drops an emptied internal root and
 *                        returns the removed key by value;
 *   kb_del_<name>        by-value wrapper.
 * NOTE(review): n_keys is decremented unconditionally and the leaf branch
 * does not test whether *k was found -- presumably callers must only delete
 * keys that are present; confirm. free(z) / free(xp) / free(y) on merges do
 * not decrement n_nodes. */
#define __KB_DEL(name, key_t) \
	static key_t __kb_delp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k, int s) \
	{ \
		int yn, zn, i, r = 0; \
		kbnode_t *xp, *y, *z; \
		key_t kp; \
		if (x == 0) return *k; \
		if (s) { /* s can only be 0, 1 or 2 */ \
			r = x->is_internal == 0? 0 : s == 1? 1 : -1; \
			i = s == 1? x->n - 1 : -1; \
		} else i = __kb_getp_aux_##name(x, k, &r); \
		if (x->is_internal == 0) { \
			if (s == 2) ++i; \
			kp = __KB_KEY(key_t, x)[i]; \
			memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
			--x->n; \
			return kp; \
		} \
		if (r == 0) { \
			if ((yn = __KB_PTR(b, x)[i]->n) >= b->t) { \
				xp = __KB_PTR(b, x)[i]; \
				kp = __KB_KEY(key_t, x)[i]; \
				__KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 1); \
				return kp; \
			} else if ((zn = __KB_PTR(b, x)[i + 1]->n) >= b->t) { \
				xp = __KB_PTR(b, x)[i + 1]; \
				kp = __KB_KEY(key_t, x)[i]; \
				__KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 2); \
				return kp; \
			} else if (yn == b->t - 1 && zn == b->t - 1) { \
				y = __KB_PTR(b, x)[i]; z = __KB_PTR(b, x)[i + 1]; \
				__KB_KEY(key_t, y)[y->n++] = *k; \
				memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, z), z->n * sizeof(key_t)); \
				if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, z), (z->n + 1) * sizeof(void*)); \
				y->n += z->n; \
				memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
				memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \
				--x->n; \
				free(z); \
				return __kb_delp_aux_##name(b, y, k, s); \
			} \
		} \
		++i; \
		if ((xp = __KB_PTR(b, x)[i])->n == b->t - 1) { \
			if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n >= b->t) { \
				memmove(__KB_KEY(key_t, xp) + 1, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \
				if (xp->is_internal) memmove(__KB_PTR(b, xp) + 1, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \
				__KB_KEY(key_t, xp)[0] = __KB_KEY(key_t, x)[i - 1]; \
				__KB_KEY(key_t, x)[i - 1] = __KB_KEY(key_t, y)[y->n - 1]; \
				if (xp->is_internal) __KB_PTR(b, xp)[0] = __KB_PTR(b, y)[y->n]; \
				--y->n; ++xp->n; \
			} else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n >= b->t) { \
				__KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i]; \
				__KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[0]; \
				if (xp->is_internal) __KB_PTR(b, xp)[xp->n] = __KB_PTR(b, y)[0]; \
				--y->n; \
				memmove(__KB_KEY(key_t, y), __KB_KEY(key_t, y) + 1, y->n * sizeof(key_t)); \
				if (y->is_internal) memmove(__KB_PTR(b, y), __KB_PTR(b, y) + 1, (y->n + 1) * sizeof(void*)); \
			} else if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n == b->t - 1) { \
				__KB_KEY(key_t, y)[y->n++] = __KB_KEY(key_t, x)[i - 1]; \
				memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \
				if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \
				y->n += xp->n; \
				memmove(__KB_KEY(key_t, x) + i - 1, __KB_KEY(key_t, x) + i, (x->n - i) * sizeof(key_t)); \
				memmove(__KB_PTR(b, x) + i, __KB_PTR(b, x) + i + 1, (x->n - i) * sizeof(void*)); \
				--x->n; \
				free(xp); \
				xp = y; \
			} else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n == b->t - 1) { \
				__KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i]; \
				memmove(__KB_KEY(key_t, xp) + xp->n, __KB_KEY(key_t, y), y->n * sizeof(key_t)); \
				if (xp->is_internal) memmove(__KB_PTR(b, xp) + xp->n, __KB_PTR(b, y), (y->n + 1) * sizeof(void*)); \
				xp->n += y->n; \
				memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
				memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \
				--x->n; \
				free(y); \
			} \
		} \
		return __kb_delp_aux_##name(b, xp, k, s); \
	} \
	static key_t kb_delp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
	{ \
		kbnode_t *x; \
		key_t ret; \
		ret = __kb_delp_aux_##name(b, b->root, k, 0); \
		--b->n_keys; \
		if (b->root->n == 0 && b->root->is_internal) { \
			--b->n_nodes; \
			x = b->root; \
			b->root = __KB_PTR(b, x)[0]; \
			free(x); \
		} \
		return ret; \
	} \
	static inline key_t kb_del_##name(kbtree_##name##_t *b, const key_t k) \
	{ \
		return kb_delp_##name(b, &k); \
	}

/* One frame of the explicit traversal stack: a node and the index of the
 * child/key currently being visited in it. */
typedef struct {
	kbnode_t *x;
	int i;
} __kbstack_t;

/* Calls __func(&key) for every key of tree b. For each node the walk
 * descends into child i before visiting key i. The frame stack lives on the
 * heap and doubles when full.
 * NOTE(review): the calloc and realloc results are not checked. */
#define __kb_traverse(key_t, b, __func) do { \
		int __kmax = 8; \
		__kbstack_t *__kstack, *__kp; \
		__kp = __kstack = (__kbstack_t*)calloc(__kmax, sizeof(__kbstack_t)); \
		__kp->x = (b)->root; __kp->i = 0; \
		for (;;) { \
			while (__kp->x && __kp->i <= __kp->x->n) { \
				if (__kp - __kstack == __kmax - 1) { \
					__kmax <<= 1; \
					__kstack = (__kbstack_t*)realloc(__kstack, __kmax * sizeof(__kbstack_t)); \
					__kp = __kstack + (__kmax>>1) - 1; \
				} \
				(__kp+1)->i = 0; (__kp+1)->x = __kp->x->is_internal? __KB_PTR(b, __kp->x)[__kp->i] : 0; \
				++__kp; \
			} \
			--__kp; \
			if (__kp >= __kstack) { \
				if (__kp->x && __kp->i < __kp->x->n) __func(&__KB_KEY(key_t, __kp->x)[__kp->i]); \
				++__kp->i; \
			} else break; \
		} \
		free(__kstack); \
	} while (0)

/* Instantiates the tree type and all routines for key type key_t ordered by
 * __cmp. The binary-search lookup (__KB_GET_AUX1) is the one used. */
#define KBTREE_INIT(name, key_t, __cmp) \
	__KB_TREE_T(name) \
	__KB_INIT(name, key_t) \
	__KB_GET_AUX1(name, key_t, __cmp) \
	__KB_GET(name, key_t) \
	__KB_INTERVAL(name, key_t) \
	__KB_PUT(name, key_t, __cmp) \
	__KB_DEL(name, key_t)

#define KB_DEFAULT_SIZE 512

/* Name-dispatching aliases for the generated type and functions. */
#define kbtree_t(name) kbtree_##name##_t
#define kb_init(name, s) kb_init_##name(s)
#define kb_destroy(name, b) __kb_destroy(b)
#define kb_get(name, b, k) kb_get_##name(b, k)
#define kb_put(name, b, k) kb_put_##name(b, k)
#define kb_del(name, b, k) kb_del_##name(b, k)
#define kb_interval(name, b, k, l, u) kb_interval_##name(b, k, l, u)
#define kb_getp(name, b, k) kb_getp_##name(b, k)
#define kb_putp(name, b, k) kb_putp_##name(b, k)
#define kb_delp(name, b, k) kb_delp_##name(b, k)
#define kb_intervalp(name, b, k, l, u) kb_intervalp_##name(b, k, l, u)

/* Number of keys recorded in the tree descriptor. */
#define kb_size(b) ((b)->n_keys)

/* Three-way comparison built from operator <: yields -1, 0 or 1. */
#define kb_generic_cmp(a, b) (((b) < (a)) - ((a) < (b)))
#define kb_str_cmp(a, b) strcmp(a, b)

#endif
bwa-0.7.17/khash.h000066400000000000000000000506301317342117100136320ustar00rootroot00000000000000
/* The MIT License

   Copyright (c) 2008, 2009, 2011 by Attractive Chaos

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* An example: #include "khash.h" KHASH_MAP_INIT_INT(32, char) int main() { int ret, is_missing; khiter_t k; khash_t(32) *h = kh_init(32); k = kh_put(32, h, 5, &ret); kh_value(h, k) = 10; k = kh_get(32, h, 10); is_missing = (k == kh_end(h)); k = kh_get(32, h, 5); kh_del(32, h, k); for (k = kh_begin(h); k != kh_end(h); ++k) if (kh_exist(h, k)) kh_value(h, k) = 1; kh_destroy(32, h); return 0; } */ /* 2011-12-29 (0.2.7): * Minor code clean up; no actual effect. 2011-09-16 (0.2.6): * The capacity is a power of 2. This seems to dramatically improve the speed for simple keys. Thank Zilong Tan for the suggestion. Reference: - http://code.google.com/p/ulib/ - http://nothings.org/computer/judy/ * Allow to optionally use linear probing which usually has better performance for random input. Double hashing is still the default as it is more robust to certain non-random input. * Added Wang's integer hash function (not used by default). This hash function is more robust to certain non-random input. 2011-02-14 (0.2.5): * Allow to declare global functions. 2009-09-26 (0.2.4): * Improve portability 2008-09-19 (0.2.3): * Corrected the example * Improved interfaces 2008-09-11 (0.2.2): * Improved speed a little in kh_put() 2008-09-10 (0.2.1): * Added kh_clear() * Fixed a compiling error 2008-09-02 (0.2.0): * Changed to token concatenation which increases flexibility. 
2008-08-31 (0.1.2): * Fixed a bug in kh_get(), which has not been tested previously. 2008-08-31 (0.1.1): * Added destructor */ #ifndef __AC_KHASH_H #define __AC_KHASH_H /*! @header Generic hash table library. */ #define AC_VERSION_KHASH_H "0.2.6" #include #include #include #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif /* compipler specific configuration */ #if UINT_MAX == 0xffffffffu typedef unsigned int khint32_t; #elif ULONG_MAX == 0xffffffffu typedef unsigned long khint32_t; #endif #if ULONG_MAX == ULLONG_MAX typedef unsigned long khint64_t; #else typedef unsigned long long khint64_t; #endif #ifdef _MSC_VER #define kh_inline __inline #else #define kh_inline inline #endif typedef khint32_t khint_t; typedef khint_t khiter_t; #define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) #define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) #define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) #define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) #define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) #define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) #define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) #ifdef KHASH_LINEAR #define __ac_inc(k, m) 1 #else #define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m) #endif #define __ac_fsize(m) ((m) < 16? 
1 : (m)>>4) #ifndef kroundup32 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif #ifndef kcalloc #define kcalloc(N,Z) calloc(N,Z) #endif #ifndef kmalloc #define kmalloc(Z) malloc(Z) #endif #ifndef krealloc #define krealloc(P,Z) realloc(P,Z) #endif #ifndef kfree #define kfree(P) free(P) #endif static const double __ac_HASH_UPPER = 0.77; #define __KHASH_TYPE(name, khkey_t, khval_t) \ typedef struct { \ khint_t n_buckets, size, n_occupied, upper_bound; \ khint32_t *flags; \ khkey_t *keys; \ khval_t *vals; \ } kh_##name##_t; #define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ extern kh_##name##_t *kh_init_##name(void); \ extern void kh_destroy_##name(kh_##name##_t *h); \ extern void kh_clear_##name(kh_##name##_t *h); \ extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ extern void kh_del_##name(kh_##name##_t *h, khint_t x); #define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ SCOPE kh_##name##_t *kh_init_##name(void) { \ return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \ } \ SCOPE void kh_destroy_##name(kh_##name##_t *h) \ { \ if (h) { \ kfree((void *)h->keys); kfree(h->flags); \ kfree((void *)h->vals); \ kfree(h); \ } \ } \ SCOPE void kh_clear_##name(kh_##name##_t *h) \ { \ if (h && h->flags) { \ memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ h->size = h->n_occupied = 0; \ } \ } \ SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ { \ if (h->n_buckets) { \ khint_t inc, k, i, last, mask; \ mask = h->n_buckets - 1; \ k = __hash_func(key); i = k & mask; \ inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ i = (i + inc) & mask; \ if (i == last) 
return h->n_buckets; \ } \ return __ac_iseither(h->flags, i)? h->n_buckets : i; \ } else return 0; \ } \ SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ { /* This function uses 0.25*n_bucktes bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ khint32_t *new_flags = 0; \ khint_t j = 1; \ { \ kroundup32(new_n_buckets); \ if (new_n_buckets < 4) new_n_buckets = 4; \ if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ else { /* hash table size to be changed (shrink or expand); rehash */ \ new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ if (!new_flags) return -1; \ memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ if (h->n_buckets < new_n_buckets) { /* expand */ \ khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ if (!new_keys) return -1; \ h->keys = new_keys; \ if (kh_is_map) { \ khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ if (!new_vals) return -1; \ h->vals = new_vals; \ } \ } /* otherwise shrink */ \ } \ } \ if (j) { /* rehashing is needed */ \ for (j = 0; j != h->n_buckets; ++j) { \ if (__ac_iseither(h->flags, j) == 0) { \ khkey_t key = h->keys[j]; \ khval_t val; \ khint_t new_mask; \ new_mask = new_n_buckets - 1; \ if (kh_is_map) val = h->vals[j]; \ __ac_set_isdel_true(h->flags, j); \ while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ khint_t inc, k, i; \ k = __hash_func(key); \ i = k & new_mask; \ inc = __ac_inc(k, new_mask); \ while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \ __ac_set_isempty_false(new_flags, i); \ if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ __ac_set_isdel_true(h->flags, i); /* mark 
it as deleted in the old hash table */ \ } else { /* write the element and jump out of the loop */ \ h->keys[i] = key; \ if (kh_is_map) h->vals[i] = val; \ break; \ } \ } \ } \ } \ if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ } \ kfree(h->flags); /* free the working space */ \ h->flags = new_flags; \ h->n_buckets = new_n_buckets; \ h->n_occupied = h->size; \ h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ } \ return 0; \ } \ SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ { \ khint_t x; \ if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ if (h->n_buckets > (h->size<<1)) { \ if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ *ret = -1; return h->n_buckets; \ } \ } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ *ret = -1; return h->n_buckets; \ } \ } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ { \ khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \ x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ else { \ inc = __ac_inc(k, mask); last = i; \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ if (__ac_isdel(h->flags, i)) site = i; \ i = (i + inc) & mask; \ if (i == last) { x = site; break; } \ } \ if (x == h->n_buckets) { \ if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ else x = i; \ } \ } \ } \ if (__ac_isempty(h->flags, x)) { /* not present at all */ \ h->keys[x] = key; \ __ac_set_isboth_false(h->flags, x); \ ++h->size; ++h->n_occupied; \ *ret = 1; \ } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ h->keys[x] = key; \ 
__ac_set_isboth_false(h->flags, x); \ ++h->size; \ *ret = 2; \ } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ return x; \ } \ SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ { \ if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ __ac_set_isdel_true(h->flags, x); \ --h->size; \ } \ } #define KHASH_DECLARE(name, khkey_t, khval_t) \ __KHASH_TYPE(name, khkey_t, khval_t) \ __KHASH_PROTOTYPES(name, khkey_t, khval_t) #define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ __KHASH_TYPE(name, khkey_t, khval_t) \ __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) #define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) /* --- BEGIN OF HASH FUNCTIONS --- */ /*! @function @abstract Integer hash function @param key The integer [khint32_t] @return The hash value [khint_t] */ #define kh_int_hash_func(key) (khint32_t)(key) /*! @function @abstract Integer comparison function */ #define kh_int_hash_equal(a, b) ((a) == (b)) /*! @function @abstract 64-bit integer hash function @param key The integer [khint64_t] @return The hash value [khint_t] */ #define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) /*! @function @abstract 64-bit integer comparison function */ #define kh_int64_hash_equal(a, b) ((a) == (b)) /*! @function @abstract const char* hash function @param s Pointer to a null terminated string @return The hash value */ static kh_inline khint_t __ac_X31_hash_string(const char *s) { khint_t h = (khint_t)*s; if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; return h; } /*! @function @abstract Another interface to const char* hash function @param key Pointer to a null terminated string [const char*] @return The hash value [khint_t] */ #define kh_str_hash_func(key) __ac_X31_hash_string(key) /*! 
@function @abstract Const char* comparison function */ #define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) static kh_inline khint_t __ac_Wang_hash(khint_t key) { key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16); return key; } #define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)key) /* --- END OF HASH FUNCTIONS --- */ /* Other convenient macros... */ /*! @abstract Type of the hash table. @param name Name of the hash table [symbol] */ #define khash_t(name) kh_##name##_t /*! @function @abstract Initiate a hash table. @param name Name of the hash table [symbol] @return Pointer to the hash table [khash_t(name)*] */ #define kh_init(name) kh_init_##name() /*! @function @abstract Destroy a hash table. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] */ #define kh_destroy(name, h) kh_destroy_##name(h) /*! @function @abstract Reset a hash table without deallocating memory. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] */ #define kh_clear(name, h) kh_clear_##name(h) /*! @function @abstract Resize a hash table. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param s New size [khint_t] */ #define kh_resize(name, h, s) kh_resize_##name(h, s) /*! @function @abstract Insert a key to the hash table. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param k Key [type of keys] @param r Extra return code: 0 if the key is present in the hash table; 1 if the bucket is empty (never used); 2 if the element in the bucket has been deleted [int*] @return Iterator to the inserted element [khint_t] */ #define kh_put(name, h, k, r) kh_put_##name(h, k, r) /*! @function @abstract Retrieve a key from the hash table. 
@param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  k     Key [type of keys]
  @return       Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
 */
#define kh_get(name, h, k) kh_get_##name(h, k)

/*! @function
  @abstract     Remove a key from the hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  k     Iterator to the element to be deleted [khint_t]
 */
#define kh_del(name, h, k) kh_del_##name(h, k)

/*! @function
  @abstract     Test whether a bucket contains data.
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  x     Iterator to the bucket [khint_t]
  @return       1 if containing data; 0 otherwise [int]
 */
#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))

/*! @function
  @abstract     Get key given an iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  x     Iterator to the bucket [khint_t]
  @return       Key [type of keys]
 */
#define kh_key(h, x) ((h)->keys[x])

/*! @function
  @abstract     Get value given an iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  x     Iterator to the bucket [khint_t]
  @return       Value [type of values]
  @discussion   For hash sets, calling this results in segfault.
 */
#define kh_val(h, x) ((h)->vals[x])

/*! @function
  @abstract     Alias of kh_val()
 */
#define kh_value(h, x) ((h)->vals[x])

/*! @function
  @abstract     Get the start iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       The start iterator [khint_t]
 */
#define kh_begin(h) (khint_t)(0)

/*! @function
  @abstract     Get the end iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       The end iterator [khint_t]
 */
#define kh_end(h) ((h)->n_buckets)

/*! @function
  @abstract     Get the number of elements in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       Number of elements in the hash table [khint_t]
 */
#define kh_size(h) ((h)->size)

/*!
@function
  @abstract     Get the number of buckets in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       Number of buckets in the hash table [khint_t]
 */
#define kh_n_buckets(h) ((h)->n_buckets)

/*! @function
  @abstract     Iterate over the entries in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  kvar  Variable to which key will be assigned
  @param  vvar  Variable to which value will be assigned
  @param  code  Block of code to execute
  @discussion   Buckets for which kh_exist() is false are skipped. The
                expansion declares a local named __i.
 */
#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \
	for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
		if (!kh_exist(h,__i)) continue; \
		(kvar) = kh_key(h,__i); \
		(vvar) = kh_val(h,__i); \
		code; \
	} }

/*! @function
  @abstract     Iterate over the values in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  vvar  Variable to which value will be assigned
  @param  code  Block of code to execute
  @discussion   Buckets for which kh_exist() is false are skipped. The
                expansion declares a local named __i.
 */
#define kh_foreach_value(h, vvar, code) { khint_t __i; \
	for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
		if (!kh_exist(h,__i)) continue; \
		(vvar) = kh_val(h,__i); \
		code; \
	} }

/* More convenient interfaces */

/*! @function
  @abstract     Instantiate a hash set containing integer keys
  @param  name  Name of the hash table [symbol]
 */
#define KHASH_SET_INIT_INT(name) \
	KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)

/*! @function
  @abstract     Instantiate a hash map containing integer keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
#define KHASH_MAP_INIT_INT(name, khval_t) \
	KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)

/*! @function
  @abstract     Instantiate a hash set containing 64-bit integer keys
  @param  name  Name of the hash table [symbol]
 */
#define KHASH_SET_INIT_INT64(name) \
	KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)

/*!
/**
 * Wait up to five seconds for a descriptor to become ready.
 *
 * @param fd       descriptor to poll with select()
 * @param is_read  non-zero to wait for readability, zero to wait for writability
 *
 * @return  >0 if the descriptor is ready, 0 if the wait timed out, and -1 on
 *          error (select() failed, or fd cannot be represented in an fd_set)
 */
static int socket_wait(int fd, int is_read)
{
	fd_set fds, *fdr = 0, *fdw = 0;
	struct timeval tv;
	int ret;
	/* FD_SET() is undefined for descriptors outside [0, FD_SETSIZE) */
	if (fd < 0 || fd >= FD_SETSIZE) {
		errno = EBADF;
		return -1;
	}
	tv.tv_sec = 5; tv.tv_usec = 0; /* 5-second timeout */
	FD_ZERO(&fds);
	FD_SET(fd, &fds);
	if (is_read) fdr = &fds;
	else fdw = &fds;
	ret = select(fd+1, fdr, fdw, 0, &tv);
	if (ret == -1) perror("select");
	return ret;
}
/**
 * Write exactly len bytes from buf to fd, retrying on short writes and on
 * transient errors (EAGAIN, EWOULDBLOCK, EINTR).
 *
 * @param fd   descriptor to write to
 * @param buf  data to send
 * @param len  number of bytes to send
 *
 * @return  0 once everything has been written, -1 on any other write() error
 */
static int write_bytes(int fd, const char *buf, size_t len)
{
	ssize_t bytes;
	do {
		bytes = write(fd, buf, len);
		if (bytes >= 0) {
			/* a short write must resume after the bytes already sent,
			 * not from the start of the buffer again */
			buf += bytes;
			len -= bytes;
		} else if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
			return -1;
		}
	} while (len > 0);
	return 0;
}
l += snprintf(buf + l, bufsz, "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n", path, http_host); if (write_bytes(fd, buf, l) != 0) { close(fd); fd = -1; goto out; } l = 0; retry: while (l < bufsz && (bytes = read(fd, buf + l, 1)) > 0) { // read HTTP header; FIXME: bad efficiency if (buf[l] == '\n' && l >= 3) if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break; ++l; } if (bytes < 0 && (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR)) goto retry; buf[l] = 0; if (bytes < 0 || l < 14) { // prematured header close(fd); fd = -1; goto out; } ret = strtol(buf + 8, &p, 0); // HTTP return code if (ret != 200) { close(fd); fd = -1; } out: free(buf); free(http_host); free(host); free(port); free(path); return fd; } typedef struct { int max_response, ctrl_fd; char *response; } ftpaux_t; static int kftp_get_response(ftpaux_t *aux) { unsigned char c; int n = 0; char *p; if (socket_wait(aux->ctrl_fd, 1) <= 0) return 0; while (read(aux->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O if (n >= aux->max_response) { aux->max_response = aux->max_response? aux->max_response<<1 : 256; aux->response = realloc(aux->response, aux->max_response); } aux->response[n++] = c; if (c == '\n') { if (n >= 4 && isdigit(aux->response[0]) && isdigit(aux->response[1]) && isdigit(aux->response[2]) && aux->response[3] != '-') break; n = 0; continue; } } if (n < 2) return -1; aux->response[n-2] = 0; return strtol(aux->response, &p, 0); } static int kftp_send_cmd(ftpaux_t *aux, const char *cmd, int is_get) { if (socket_wait(aux->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing if (write_bytes(aux->ctrl_fd, cmd, strlen(cmd)) != 0) return -1; return is_get? 
kftp_get_response(aux) : 0; } static int ftp_open(const char *fn) { char *p, *host = 0, *port = 0, *retr = 0; char host2[80], port2[10]; int v[6], l, fd = -1, ret, pasv_port, pasv_ip[4]; ftpaux_t aux; /* parse URL */ if (strstr(fn, "ftp://") != fn) return 0; for (p = (char*)fn + 6; *p && *p != '/'; ++p); if (*p != '/') return 0; l = p - fn - 6; port = strdup("21"); host = calloc(l + 1, 1); strncpy(host, fn + 6, l); retr = calloc(strlen(p) + 8, 1); sprintf(retr, "RETR %s\r\n", p); /* connect to ctrl */ memset(&aux, 0, sizeof(ftpaux_t)); aux.ctrl_fd = socket_connect(host, port); if (aux.ctrl_fd == -1) goto ftp_open_end; /* fail to connect ctrl */ /* connect to the data stream */ kftp_get_response(&aux); kftp_send_cmd(&aux, "USER anonymous\r\n", 1); kftp_send_cmd(&aux, "PASS kopen@\r\n", 1); kftp_send_cmd(&aux, "TYPE I\r\n", 1); kftp_send_cmd(&aux, "PASV\r\n", 1); for (p = aux.response; *p && *p != '('; ++p); if (*p != '(') goto ftp_open_end; ++p; sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]); memcpy(pasv_ip, v, 4 * sizeof(int)); pasv_port = (v[4]<<8&0xff00) + v[5]; kftp_send_cmd(&aux, retr, 0); sprintf(host2, "%d.%d.%d.%d", pasv_ip[0], pasv_ip[1], pasv_ip[2], pasv_ip[3]); sprintf(port2, "%d", pasv_port); fd = socket_connect(host2, port2); if (fd == -1) goto ftp_open_end; ret = kftp_get_response(&aux); if (ret != 150) { close(fd); fd = -1; } close(aux.ctrl_fd); ftp_open_end: free(host); free(port); free(retr); free(aux.response); return fd; } #endif /* !defined(_KO_NO_NET) */ static char **cmd2argv(const char *cmd) { int i, beg, end, argc; char **argv, *str; end = strlen(cmd); for (i = end - 1; i >= 0; --i) if (!isspace(cmd[i])) break; end = i + 1; for (beg = 0; beg < end; ++beg) if (!isspace(cmd[beg])) break; if (beg == end) return 0; for (i = beg + 1, argc = 0; i < end; ++i) if (isspace(cmd[i]) && !isspace(cmd[i-1])) ++argc; argv = (char**)calloc(argc + 2, sizeof(void*)); argv[0] = str = (char*)calloc(end - beg + 1, 1); strncpy(argv[0], 
cmd + beg, end - beg); for (i = argc = 1; i < end - beg; ++i) if (isspace(str[i])) str[i] = 0; else if (str[i] && str[i-1] == 0) argv[argc++] = &str[i]; return argv; } #define KO_STDIN 1 #define KO_FILE 2 #define KO_PIPE 3 #define KO_HTTP 4 #define KO_FTP 5 typedef struct { int type, fd; pid_t pid; } koaux_t; void *kopen(const char *fn, int *_fd) { koaux_t *aux = 0; *_fd = -1; if (strstr(fn, "http://") == fn) { aux = calloc(1, sizeof(koaux_t)); aux->type = KO_HTTP; aux->fd = http_open(fn); } else if (strstr(fn, "ftp://") == fn) { aux = calloc(1, sizeof(koaux_t)); aux->type = KO_FTP; aux->fd = ftp_open(fn); } else if (strcmp(fn, "-") == 0) { aux = calloc(1, sizeof(koaux_t)); aux->type = KO_STDIN; aux->fd = STDIN_FILENO; } else { const char *p, *q; for (p = fn; *p; ++p) if (!isspace(*p)) break; if (*p == '<') { // pipe open int need_shell, pfd[2]; pid_t pid; // a simple check to see if we need to invoke a shell; not always working for (q = p + 1; *q; ++q) if (ispunct(*q) && *q != '.' && *q != '_' && *q != '-' && *q != ':') break; need_shell = (*q != 0); if (pipe(pfd) != 0) return 0; pid = vfork(); if (pid == -1) { /* vfork() error */ close(pfd[0]); close(pfd[1]); return 0; } if (pid == 0) { /* the child process */ char **argv; /* FIXME: I do not know if this will lead to a memory leak */ close(pfd[0]); dup2(pfd[1], STDOUT_FILENO); close(pfd[1]); if (!need_shell) { argv = cmd2argv(p + 1); execvp(argv[0], argv); free(argv[0]); free(argv); } else execl("/bin/sh", "sh", "-c", p + 1, NULL); exit(1); } else { /* parent process */ close(pfd[1]); aux = calloc(1, sizeof(koaux_t)); aux->type = KO_PIPE; aux->fd = pfd[0]; aux->pid = pid; } } else { #ifdef _WIN32 *_fd = open(fn, O_RDONLY | O_BINARY); #else *_fd = open(fn, O_RDONLY); #endif if (*_fd >= 0) { aux = calloc(1, sizeof(koaux_t)); aux->type = KO_FILE; aux->fd = *_fd; } } } if (aux) *_fd = aux->fd; return aux; } int kclose(void *a) { koaux_t *aux = (koaux_t*)a; if (aux->type == KO_PIPE) { int status; pid_t pid; pid = 
waitpid(aux->pid, &status, WNOHANG); if (pid != aux->pid) kill(aux->pid, 15); } free(aux); return 0; } #ifdef _KO_MAIN #define BUF_SIZE 0x10000 int main(int argc, char *argv[]) { void *x; int l, fd; unsigned char buf[BUF_SIZE]; FILE *fp; if (argc == 1) { fprintf(stderr, "Usage: kopen \n"); return 1; } x = kopen(argv[1], &fd); fp = fdopen(fd, "r"); if (fp == 0) { fprintf(stderr, "ERROR: fail to open the input\n"); return 1; } do { if ((l = fread(buf, 1, BUF_SIZE, fp)) != 0) fwrite(buf, 1, l, stdout); } while (l == BUF_SIZE); fclose(fp); kclose(x); return 0; } #endif bwa-0.7.17/kseq.h000066400000000000000000000212311317342117100134720ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008, 2009, 2011 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
/*
 * __KS_GETUNTIL(__read, __bufsize) defines ks_getuntil2() and ks_getuntil().
 *
 * ks_getuntil2(ks, delimiter, str, dret, append) copies bytes from the stream
 * into str until a delimiter is met. delimiter is KS_SEP_SPACE, KS_SEP_TAB,
 * KS_SEP_LINE, or (when greater than KS_SEP_MAX) a literal byte value. The
 * delimiter itself is consumed but not stored; if dret is not NULL it
 * receives the delimiter byte, or 0 if none was met. When append is zero
 * str is emptied first. For KS_SEP_LINE one trailing '\r' is dropped.
 *
 * Returns the length of str, or -1 if the stream was already exhausted.
 * A __read result of 0 (end of file) or a negative value (read error) both
 * end the stream, so a failing reader cannot make the loop spin forever.
 *
 * ks_getuntil() is ks_getuntil2() with append = 0.
 */
#define __KS_GETUNTIL(__read, __bufsize)								\
	static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
	{																	\
		int gotany = 0;													\
		if (dret) *dret = 0;											\
		str->l = append? str->l : 0;									\
		for (;;) {														\
			int i;														\
			if (ks->begin >= ks->end) {									\
				if (!ks->is_eof) {										\
					ks->begin = 0;										\
					ks->end = __read(ks->f, ks->buf, __bufsize);		\
					if (ks->end <= 0) { /* end of file or read error */	\
						ks->end = 0; ks->is_eof = 1; break;				\
					}													\
				} else break;											\
			}															\
			if (delimiter == KS_SEP_LINE) {								\
				for (i = ks->begin; i < ks->end; ++i)					\
					if (ks->buf[i] == '\n') break;						\
			} else if (delimiter > KS_SEP_MAX) {						\
				for (i = ks->begin; i < ks->end; ++i)					\
					if (ks->buf[i] == delimiter) break;					\
			} else if (delimiter == KS_SEP_SPACE) {						\
				for (i = ks->begin; i < ks->end; ++i)					\
					if (isspace(ks->buf[i])) break;						\
			} else if (delimiter == KS_SEP_TAB) {						\
				for (i = ks->begin; i < ks->end; ++i)					\
					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
			} else i = 0; /* never come to here! */						\
			if (str->m - str->l < (size_t)(i - ks->begin + 1)) {		\
				str->m = str->l + (i - ks->begin) + 1;					\
				kroundup32(str->m);										\
				str->s = (char*)realloc(str->s, str->m);				\
			}															\
			gotany = 1;													\
			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
			str->l = str->l + (i - ks->begin);							\
			ks->begin = i + 1;											\
			if (i < ks->end) {											\
				if (dret) *dret = ks->buf[i];							\
				break;													\
			}															\
		}																\
		if (!gotany && ks_eof(ks)) return -1;							\
		if (str->s == 0) {												\
			str->m = 1;													\
			str->s = (char*)calloc(1, 1);								\
		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
		str->s[str->l] = '\0';											\
		return str->l;													\
	}																	\
	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
/*
 * __KSEQ_READ(SCOPE) defines kseq_read(), which parses the next FASTA or
 * FASTQ record from seq->f into seq->name, seq->comment, seq->seq and
 * seq->qual.
 *
 * A record starts at a '>' or '@' character. The name runs up to the first
 * whitespace; if that whitespace is not a newline, the rest of the line is
 * stored as the comment. Sequence lines are concatenated until a line
 * starting with '>', '@' or '+' is read. A '+' introduces the quality
 * string, which is read line by line until it is at least as long as the
 * sequence.
 *
 * seq->last_char remembers a header character that was already consumed
 * while reading the previous record, so the next call can continue from it.
 *
 * NOTE(review): the results of malloc()/realloc() below are used without
 * being checked.
 *
 * Return value:
   >=0  length of the sequence (normal)
   -1   end-of-file
   -2   truncated quality string
 */
#define __KSEQ_READ(SCOPE) \
	SCOPE int kseq_read(kseq_t *seq) \
	{ \
		int c; \
		kstream_t *ks = seq->f; \
		if (seq->last_char == 0) { /* then jump to the next header line */ \
			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
			if (c == -1) return -1; /* end of file */ \
			seq->last_char = c; \
		} /* else: the first header char has been read in the previous call */ \
		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
		if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
		if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
			seq->seq.m = 256; \
			seq->seq.s = (char*)malloc(seq->seq.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
			if (c == '\n') continue; /* skip empty lines */ \
			seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
			ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
		} \
		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
		if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
			seq->seq.m = seq->seq.l + 2; \
			kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
		} \
		seq->seq.s[seq->seq.l] = 0;	/* null terminated string */ \
		if (c != '+') return seq->seq.l; /* FASTA */ \
		if (seq->qual.m < seq->seq.m) {	/* allocate memory for qual in case insufficient */ \
			seq->qual.m = seq->seq.m; \
			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
		if (c == -1) return -2; /* error: no quality string */ \
		/* keep appending quality lines until qual is as long as seq or input ends */ \
		while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
		seq->last_char = 0;	/* we have not come to the next header line */ \
		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
		return seq->seq.l; \
	}
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* 2008-11-16 (0.1.4): * Fixed a bug in introsort() that happens in rare cases. 2008-11-05 (0.1.3): * Fixed a bug in introsort() for complex comparisons. * Fixed a bug in mergesort(). The previous version is not stable. 2008-09-15 (0.1.2): * Accelerated introsort. On my Mac (not on another Linux machine), my implementation is as fast as std::sort on random input. * Added combsort and in introsort, switch to combsort if the recursion is too deep. 2008-09-13 (0.1.1): * Added k-small algorithm 2008-09-05 (0.1.0): * Initial version */ #ifndef AC_KSORT_H #define AC_KSORT_H #include #include #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif typedef struct { void *left, *right; int depth; } ks_isort_stack_t; #define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } #define KSORT_INIT(name, type_t, __sort_lt) \ void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \ { \ type_t *a2[2], *a, *b; \ int curr, shift; \ \ a2[0] = array; \ a2[1] = temp? 
temp : (type_t*)malloc(sizeof(type_t) * n); \ for (curr = 0, shift = 0; (1ul<> 1) - 1; i != (size_t)(-1); --i) \ ks_heapadjust_##name(i, lsize, l); \ } \ void ks_heapsort_##name(size_t lsize, type_t l[]) \ { \ size_t i; \ for (i = lsize - 1; i > 0; --i) { \ type_t tmp; \ tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \ } \ } \ static inline void __ks_insertsort_##name(type_t *s, type_t *t) \ { \ type_t *i, *j, swap_tmp; \ for (i = s + 1; i < t; ++i) \ for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \ swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \ } \ } \ void ks_combsort_##name(size_t n, type_t a[]) \ { \ const double shrink_factor = 1.2473309501039786540366528676643; \ int do_swap; \ size_t gap = n; \ type_t tmp, *i, *j; \ do { \ if (gap > 2) { \ gap = (size_t)(gap / shrink_factor); \ if (gap == 9 || gap == 10) gap = 11; \ } \ do_swap = 0; \ for (i = a; i < a + n - gap; ++i) { \ j = i + gap; \ if (__sort_lt(*j, *i)) { \ tmp = *i; *i = *j; *j = tmp; \ do_swap = 1; \ } \ } \ } while (do_swap || gap > 2); \ if (gap != 1) __ks_insertsort_##name(a, a + n); \ } \ void ks_introsort_##name(size_t n, type_t a[]) \ { \ int d; \ ks_isort_stack_t *top, *stack; \ type_t rp, swap_tmp; \ type_t *s, *t, *i, *j, *k; \ \ if (n < 1) return; \ else if (n == 2) { \ if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \ return; \ } \ for (d = 2; 1ul<>1) + 1; \ if (__sort_lt(*k, *i)) { \ if (__sort_lt(*k, *j)) k = j; \ } else k = __sort_lt(*j, *i)? i : j; \ rp = *k; \ if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \ for (;;) { \ do ++i; while (__sort_lt(*i, rp)); \ do --j; while (i <= j && __sort_lt(rp, *j)); \ if (j <= i) break; \ swap_tmp = *i; *i = *j; *j = swap_tmp; \ } \ swap_tmp = *i; *i = *t; *t = swap_tmp; \ if (i-s > t-i) { \ if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \ s = t-i > 16? i+1 : t; \ } else { \ if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \ t = i-s > 16? 
i-1 : s; \ } \ } else { \ if (top == stack) { \ free(stack); \ __ks_insertsort_##name(a, a+n); \ return; \ } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \ } \ } \ } \ /* This function is adapted from: http://ndevilla.free.fr/median/ */ \ /* 0 <= kk < n */ \ type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \ { \ type_t *low, *high, *k, *ll, *hh, *mid; \ low = arr; high = arr + n - 1; k = arr + kk; \ for (;;) { \ if (high <= low) return *k; \ if (high == low + 1) { \ if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ return *k; \ } \ mid = low + (high - low) / 2; \ if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \ if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \ KSORT_SWAP(type_t, *mid, *(low+1)); \ ll = low + 1; hh = high; \ for (;;) { \ do ++ll; while (__sort_lt(*ll, *low)); \ do --hh; while (__sort_lt(*low, *hh)); \ if (hh < ll) break; \ KSORT_SWAP(type_t, *ll, *hh); \ } \ KSORT_SWAP(type_t, *low, *hh); \ if (hh <= k) low = ll; \ if (hh >= k) high = hh - 1; \ } \ } #define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t) #define ks_introsort(name, n, a) ks_introsort_##name(n, a) #define ks_combsort(name, n, a) ks_combsort_##name(n, a) #define ks_heapsort(name, n, a) ks_heapsort_##name(n, a) #define ks_heapmake(name, n, a) ks_heapmake_##name(n, a) #define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a) #define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k) #define ks_lt_generic(a, b) ((a) < (b)) #define ks_lt_str(a, b) (strcmp((a), (b)) < 0) typedef const char *ksstr_t; #define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic) #define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) #endif bwa-0.7.17/kstring.c000066400000000000000000000012771317342117100142130ustar00rootroot00000000000000#include #include #include "kstring.h" #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" 
#endif int ksprintf(kstring_t *s, const char *fmt, ...) { va_list ap; int l; va_start(ap, fmt); l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); va_end(ap); if (l + 1 > s->m - s->l) { s->m = s->l + l + 2; kroundup32(s->m); s->s = (char*)realloc(s->s, s->m); va_start(ap, fmt); l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); } va_end(ap); s->l += l; return l; } #ifdef KSTRING_MAIN #include int main() { kstring_t *s; s = (kstring_t*)calloc(1, sizeof(kstring_t)); ksprintf(s, "abcdefg: %d", 100); printf("%s\n", s->s); free(s); return 0; } #endif bwa-0.7.17/kstring.h000066400000000000000000000044341317342117100142160ustar00rootroot00000000000000#ifndef KSTRING_H #define KSTRING_H #include #include #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif #ifndef kroundup32 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif #ifndef KSTRING_T #define KSTRING_T kstring_t typedef struct __kstring_t { size_t l, m; char *s; } kstring_t; #endif static inline void ks_resize(kstring_t *s, size_t size) { if (s->m < size) { s->m = size; kroundup32(s->m); s->s = (char*)realloc(s->s, s->m); } } static inline int kputsn(const char *p, int l, kstring_t *s) { if (s->l + l + 1 >= s->m) { s->m = s->l + l + 2; kroundup32(s->m); s->s = (char*)realloc(s->s, s->m); } memcpy(s->s + s->l, p, l); s->l += l; s->s[s->l] = 0; return l; } static inline int kputs(const char *p, kstring_t *s) { return kputsn(p, strlen(p), s); } static inline int kputc(int c, kstring_t *s) { if (s->l + 1 >= s->m) { s->m = s->l + 2; kroundup32(s->m); s->s = (char*)realloc(s->s, s->m); } s->s[s->l++] = c; s->s[s->l] = 0; return c; } static inline int kputw(int c, kstring_t *s) { char buf[16]; int l, x; if (c == 0) return kputc('0', s); for (l = 0, x = c < 0? 
-c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; if (c < 0) buf[l++] = '-'; if (s->l + l + 1 >= s->m) { s->m = s->l + l + 2; kroundup32(s->m); s->s = (char*)realloc(s->s, s->m); } for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; s->s[s->l] = 0; return 0; } static inline int kputuw(unsigned c, kstring_t *s) { char buf[16]; int l, i; unsigned x; if (c == 0) return kputc('0', s); for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0'; if (s->l + l + 1 >= s->m) { s->m = s->l + l + 2; kroundup32(s->m); s->s = (char*)realloc(s->s, s->m); } for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; s->s[s->l] = 0; return 0; } static inline int kputl(long c, kstring_t *s) { char buf[32]; long l, x; if (c == 0) return kputc('0', s); for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; if (c < 0) buf[l++] = '-'; if (s->l + l + 1 >= s->m) { s->m = s->l + l + 2; kroundup32(s->m); s->s = (char*)realloc(s->s, s->m); } for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; s->s[s->l] = 0; return 0; } int ksprintf(kstring_t *s, const char *fmt, ...); #endif bwa-0.7.17/ksw.c000066400000000000000000000630001317342117100133260ustar00rootroot00000000000000/* The MIT License Copyright (c) 2011 by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
/**
 * Initialize the query data structure
 *
 * Builds a striped query profile for the SSE2 Smith-Waterman kernels. The
 * header, the profile (m * slen vectors) and the four work arrays H0, H1, E
 * and Hmax (slen vectors each) all live in one malloc'd block.
 *
 * @param size   Number of bytes used to store a score; valid values are 1 or 2
 *               (any value greater than 1 is treated as 2)
 * @param qlen   Length of the query sequence
 * @param query  Query sequence
 * @param m      Size of the alphabet
 * @param mat    Scoring matrix in a one-dimension array
 *
 * @return       Query data structure
 *
 * NOTE(review): the malloc() result is not checked before use. Since
 * everything is carved out of one allocation, the result is presumably
 * released with a single free() -- confirm against the callers.
 */
kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat)
{
	kswq_t *q;
	int slen, a, tmp, p;

	size = size > 1? 2 : 1;
	p = 8 * (3 - size); // # values per __m128i: 16 for 1-byte scores, 8 for 2-byte scores
	slen = (qlen + p - 1) / p; // segmented length, i.e. qlen / p rounded up
	// 16 * slen bytes per row: m profile rows plus H0, H1, E and Hmax; the extra 256 bytes leave room for alignment
	q = (kswq_t*)malloc(sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory
	q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory: round up to a multiple of 16
	q->H0 = q->qp + slen * m;
	q->H1 = q->H0 + slen;
	q->E  = q->H1 + slen;
	q->Hmax = q->E + slen;
	q->slen = slen; q->qlen = qlen; q->size = size;
	// compute shift
	tmp = m * m;
	for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score
		if (mat[a] < (int8_t)q->shift) q->shift = mat[a];
		if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a];
	}
	q->max = q->mdiff;
	q->shift = 256 - q->shift; // NB: q->shift is uint8_t, so this stores the negated minimum score modulo 256
	q->mdiff += q->shift; // this is the difference between the min and max scores
	// An example: p=8, qlen=19, slen=3 and segmentation:
	//  {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}}
	if (size == 1) {
		// 1-byte profile: every score is stored with q->shift added
		int8_t *t = (int8_t*)q->qp;
		for (a = 0; a < m; ++a) {
			int i, k, nlen = slen * p;
			const int8_t *ma = mat + a * m;
			for (i = 0; i < slen; ++i)
				for (k = i; k < nlen; k += slen) // p iterations
					*t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift; // positions past the query end score 0
		}
	} else {
		// 2-byte profile: scores are stored as they are, without the shift
		int16_t *t = (int16_t*)q->qp;
		for (a = 0; a < m; ++a) {
			int i, k, nlen = slen * p;
			const int8_t *ma = mat + a * m;
			for (i = 0; i < slen; ++i)
				for (k = i; k < nlen; k += slen) // p iterations
					*t++ = (k >= qlen? 0 : ma[query[k]]); // positions past the query end score 0
		}
	}
	return q;
}
max = _mm_max_epu8(max, h); // set max _mm_store_si128(H1 + j, h); // save to H'(i,j) // now compute E'(i+1,j) e = _mm_subs_epu8(e, e_del); // e=E'(i,j) - e_del t = _mm_subs_epu8(h, oe_del); // h=H'(i,j) - o_del - e_del e = _mm_max_epu8(e, t); // e=E'(i+1,j) _mm_store_si128(E + j, e); // save to E'(i+1,j) // now compute F'(i,j+1) f = _mm_subs_epu8(f, e_ins); t = _mm_subs_epu8(h, oe_ins); // h=H'(i,j) - o_ins - e_ins f = _mm_max_epu8(f, t); // get H'(i-1,j) and prepare for the next j h = _mm_load_si128(H0 + j); // h=H'(i-1,j) } // NB: we do not need to set E(i,j) as we disallow adjecent insertion and then deletion for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max f = _mm_slli_si128(f, 1); for (j = 0; LIKELY(j < slen); ++j) { h = _mm_load_si128(H1 + j); h = _mm_max_epu8(h, f); // h=H'(i,j) _mm_store_si128(H1 + j, h); h = _mm_subs_epu8(h, oe_ins); f = _mm_subs_epu8(f, e_ins); cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero)); if (UNLIKELY(cmp == 0xffff)) goto end_loop16; } } end_loop16: //int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n"); __max_16(imax, max); // imax is the maximum number in max if (imax >= minsc) { // write the b array; this condition adds branching unfornately if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append if (n_b == m_b) { m_b = m_b? m_b<<1 : 8; b = (uint64_t*)realloc(b, 8 * m_b); } b[n_b++] = (uint64_t)imax<<32 | i; } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last } if (imax > gmax) { gmax = imax; te = i; // te is the end position on the target for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); if (gmax + q->shift >= 255 || gmax >= endsc) break; } S = H1; H1 = H0; H0 = S; // swap H0 and H1 } r.score = gmax + q->shift < 255? 
gmax : 255; r.te = te; if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score int max = -1, tmp, low, high, qlen = slen * 16; uint8_t *t = (uint8_t*)Hmax; for (i = 0; i < qlen; ++i, ++t) if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen; else if ((int)*t == max && (tmp = i / 16 + i % 16 * slen) < r.qe) r.qe = tmp; //printf("%d,%d\n", max, gmax); if (b) { i = (r.score + q->max - 1) / q->max; low = te - i; high = te + i; for (i = 0; i < n_b; ++i) { int e = (int32_t)b[i]; if ((e < low || e > high) && (int)(b[i]>>32) > r.score2) r.score2 = b[i]>>32, r.te2 = e; } } } free(b); return r; } kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e) { int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; uint64_t *b; __m128i zero, oe_del, e_del, oe_ins, e_ins, *H0, *H1, *E, *Hmax; kswr_t r; #define __max_8(ret, xx) do { \ (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \ (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \ (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \ (ret) = _mm_extract_epi16((xx), 0); \ } while (0) // initialization r = g_defr; minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000; endsc = (xtra&KSW_XSTOP)? 
xtra&0xffff : 0x10000; m_b = n_b = 0; b = 0; zero = _mm_set1_epi32(0); oe_del = _mm_set1_epi16(_o_del + _e_del); e_del = _mm_set1_epi16(_e_del); oe_ins = _mm_set1_epi16(_o_ins + _e_ins); e_ins = _mm_set1_epi16(_e_ins); H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; slen = q->slen; for (i = 0; i < slen; ++i) { _mm_store_si128(E + i, zero); _mm_store_si128(H0 + i, zero); _mm_store_si128(Hmax + i, zero); } // the core loop for (i = 0; i < tlen; ++i) { int j, k, imax; __m128i e, t, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example h = _mm_slli_si128(h, 2); for (j = 0; LIKELY(j < slen); ++j) { h = _mm_adds_epi16(h, *S++); e = _mm_load_si128(E + j); h = _mm_max_epi16(h, e); h = _mm_max_epi16(h, f); max = _mm_max_epi16(max, h); _mm_store_si128(H1 + j, h); e = _mm_subs_epu16(e, e_del); t = _mm_subs_epu16(h, oe_del); e = _mm_max_epi16(e, t); _mm_store_si128(E + j, e); f = _mm_subs_epu16(f, e_ins); t = _mm_subs_epu16(h, oe_ins); f = _mm_max_epi16(f, t); h = _mm_load_si128(H0 + j); } for (k = 0; LIKELY(k < 16); ++k) { f = _mm_slli_si128(f, 2); for (j = 0; LIKELY(j < slen); ++j) { h = _mm_load_si128(H1 + j); h = _mm_max_epi16(h, f); _mm_store_si128(H1 + j, h); h = _mm_subs_epu16(h, oe_ins); f = _mm_subs_epu16(f, e_ins); if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8; } } end_loop8: __max_8(imax, max); if (imax >= minsc) { if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { if (n_b == m_b) { m_b = m_b? 
m_b<<1 : 8; b = (uint64_t*)realloc(b, 8 * m_b); } b[n_b++] = (uint64_t)imax<<32 | i; } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last } if (imax > gmax) { gmax = imax; te = i; for (j = 0; LIKELY(j < slen); ++j) _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); if (gmax >= endsc) break; } S = H1; H1 = H0; H0 = S; } r.score = gmax; r.te = te; { int max = -1, tmp, low, high, qlen = slen * 8; uint16_t *t = (uint16_t*)Hmax; for (i = 0, r.qe = -1; i < qlen; ++i, ++t) if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen; else if ((int)*t == max && (tmp = i / 8 + i % 8 * slen) < r.qe) r.qe = tmp; if (b) { i = (r.score + q->max - 1) / q->max; low = te - i; high = te + i; for (i = 0; i < n_b; ++i) { int e = (int32_t)b[i]; if ((e < low || e > high) && (int)(b[i]>>32) > r.score2) r.score2 = b[i]>>32, r.te2 = e; } } } free(b); return r; } static inline void revseq(int l, uint8_t *s) { int i, t; for (i = 0; i < l>>1; ++i) t = s[i], s[i] = s[l - 1 - i], s[l - 1 - i] = t; } kswr_t ksw_align2(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int xtra, kswq_t **qry) { int size; kswq_t *q; kswr_t r, rr; kswr_t (*func)(kswq_t*, int, const uint8_t*, int, int, int, int, int); q = (qry && *qry)? *qry : ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat); if (qry && *qry == 0) *qry = q; func = q->size == 2? 
ksw_i16 : ksw_u8; size = q->size; r = func(q, tlen, target, o_del, e_del, o_ins, e_ins, xtra); if (qry == 0) free(q); if ((xtra&KSW_XSTART) == 0 || ((xtra&KSW_XSUBO) && r.score < (xtra&0xffff))) return r; revseq(r.qe + 1, query); revseq(r.te + 1, target); // +1 because qe/te points to the exact end, not the position after the end q = ksw_qinit(size, r.qe + 1, query, m, mat); rr = func(q, tlen, target, o_del, e_del, o_ins, e_ins, KSW_XSTOP | r.score); revseq(r.qe + 1, query); revseq(r.te + 1, target); free(q); if (r.score == rr.score) r.tb = r.te - rr.te, r.qb = r.qe - rr.qe; return r; } kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry) { return ksw_align2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, xtra, qry); } /******************** *** SW extension *** ********************/ typedef struct { int32_t h, e; } eh_t; int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off) { eh_t *eh; // score array int8_t *qp; // query profile int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; assert(h0 > 0); // allocate memory qp = malloc(qlen * m); eh = calloc(qlen + 1, 8); // generate the query profile for (k = i = 0; k < m; ++k) { const int8_t *p = &mat[k * m]; for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]]; } // fill the first row eh[0].h = h0; eh[1].h = h0 > oe_ins? h0 - oe_ins : 0; for (j = 2; j <= qlen && eh[j-1].h > e_ins; ++j) eh[j].h = eh[j-1].h - e_ins; // adjust $w if it is too large k = m * m; for (i = 0, max = 0; i < k; ++i) // get the max score max = max > mat[i]? max : mat[i]; max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.); max_ins = max_ins > 1? 
max_ins : 1; w = w < max_ins? w : max_ins; max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.); max_del = max_del > 1? max_del : 1; w = w < max_del? w : max_del; // TODO: is this necessary? // DP loop max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1; max_off = 0; beg = 0, end = qlen; for (i = 0; LIKELY(i < tlen); ++i) { int t, f = 0, h1, m = 0, mj = -1; int8_t *q = &qp[target[i] * qlen]; // apply the band and the constraint (if provided) if (beg < i - w) beg = i - w; if (end > i + w + 1) end = i + w + 1; if (end > qlen) end = qlen; // compute the first column if (beg == 0) { h1 = h0 - (o_del + e_del * (i + 1)); if (h1 < 0) h1 = 0; } else h1 = 0; for (j = beg; LIKELY(j < end); ++j) { // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) // Similar to SSE2-SW, cells are computed in the following order: // H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} // E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape // F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape eh_t *p = &eh[j]; int h, M = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j) p->h = h1; // set H(i,j-1) for the next row M = M? M + q[j] : 0;// separating H and M to disallow a cigar like "100M3I3D20M" h = M > e? M : e; // e and f are guaranteed to be non-negative, so h>=0 even if M<0 h = h > f? h : f; h1 = h; // save H(i,j) to h1 for the next column mj = m > h? mj : j; // record the position where max score is achieved m = m > h? m : h; // m is stored at eh[mj+1] t = M - oe_del; t = t > 0? t : 0; e -= e_del; e = e > t? e : t; // computed E(i+1,j) p->e = e; // save E(i+1,j) for the next row t = M - oe_ins; t = t > 0? t : 0; f -= e_ins; f = f > t? f : t; // computed F(i,j+1) } eh[end].h = h1; eh[end].e = 0; if (j == qlen) { max_ie = gscore > h1? max_ie : i; gscore = gscore > h1? gscore : h1; } if (m == 0) break; if (m > max) { max = m, max_i = i, max_j = mj; max_off = max_off > abs(mj - i)? 
max_off : abs(mj - i); } else if (zdrop > 0) { if (i - max_i > mj - max_j) { if (max - m - ((i - max_i) - (mj - max_j)) * e_del > zdrop) break; } else { if (max - m - ((mj - max_j) - (i - max_i)) * e_ins > zdrop) break; } } // update beg and end for the next round for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j); beg = j; for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j); end = j + 2 < qlen? j + 2 : qlen; //beg = 0; end = qlen; // uncomment this line for debugging } free(eh); free(qp); if (_qle) *_qle = max_j + 1; if (_tle) *_tle = max_i + 1; if (_gtle) *_gtle = max_ie + 1; if (_gscore) *_gscore = gscore; if (_max_off) *_max_off = max_off; return max; } int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off) { return ksw_extend2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, w, end_bonus, zdrop, h0, qle, tle, gtle, gscore, max_off); } /******************** * Global alignment * ********************/ #define MINUS_INF -0x40000000 static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, int op, int len) { if (*n_cigar == 0 || op != (cigar[(*n_cigar) - 1]&0xf)) { if (*n_cigar == *m_cigar) { *m_cigar = *m_cigar? 
(*m_cigar)<<1 : 4; cigar = realloc(cigar, (*m_cigar) << 2); } cigar[(*n_cigar)++] = len<<4 | op; } else cigar[(*n_cigar)-1] += len<<4; return cigar; } int ksw_global2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int *n_cigar_, uint32_t **cigar_) { eh_t *eh; int8_t *qp; // query profile int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, score, n_col; uint8_t *z; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will be a little more complex if (n_cigar_) *n_cigar_ = 0; // allocate memory n_col = qlen < 2*w+1? qlen : 2*w+1; // maximum #columns of the backtrack matrix z = n_cigar_ && cigar_? malloc((long)n_col * tlen) : 0; qp = malloc(qlen * m); eh = calloc(qlen + 1, 8); // generate the query profile for (k = i = 0; k < m; ++k) { const int8_t *p = &mat[k * m]; for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]]; } // fill the first row eh[0].h = 0; eh[0].e = MINUS_INF; for (j = 1; j <= qlen && j <= w; ++j) eh[j].h = -(o_ins + e_ins * j), eh[j].e = MINUS_INF; for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; // everything is -inf outside the band // DP loop for (i = 0; LIKELY(i < tlen); ++i) { // target sequence is in the outer loop int32_t f = MINUS_INF, h1, beg, end, t; int8_t *q = &qp[target[i] * qlen]; beg = i > w? i - w : 0; end = i + w + 1 < qlen? i + w + 1 : qlen; // only loop through [beg,end) of the query sequence h1 = beg == 0? 
-(o_del + e_del * (i + 1)) : MINUS_INF; if (n_cigar_ && cigar_) { uint8_t *zi = &z[(long)i * n_col]; for (j = beg; LIKELY(j < end); ++j) { // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) // Cells are computed in the following order: // M(i,j) = H(i-1,j-1) + S(i,j) // H(i,j) = max{M(i,j), E(i,j), F(i,j)} // E(i+1,j) = max{M(i,j)-gapo, E(i,j)} - gape // F(i,j+1) = max{M(i,j)-gapo, F(i,j)} - gape // We have to separate M(i,j); otherwise the direction may not be recorded correctly. // However, a CIGAR like "10M3I3D10M" allowed by local() is disallowed by global(). // Such a CIGAR may occur, in theory, if mismatch_penalty > 2*gap_ext_penalty + 2*gap_open_penalty/k. // In practice, this should happen very rarely given a reasonable scoring system. eh_t *p = &eh[j]; int32_t h, m = p->h, e = p->e; uint8_t d; // direction p->h = h1; m += q[j]; d = m >= e? 0 : 1; h = m >= e? m : e; d = h >= f? d : 2; h = h >= f? h : f; h1 = h; t = m - oe_del; e -= e_del; d |= e > t? 1<<2 : 0; e = e > t? e : t; p->e = e; t = m - oe_ins; f -= e_ins; d |= f > t? 2<<4 : 0; // if we want to halve the memory, use one bit only, instead of two f = f > t? f : t; zi[j - beg] = d; // z[i,j] keeps h for the current cell and e/f for the next cell } } else { for (j = beg; LIKELY(j < end); ++j) { eh_t *p = &eh[j]; int32_t h, m = p->h, e = p->e; p->h = h1; m += q[j]; h = m >= e? m : e; h = h >= f? h : f; h1 = h; t = m - oe_del; e -= e_del; e = e > t? e : t; p->e = e; t = m - oe_ins; f -= e_ins; f = f > t? f : t; } } eh[end].h = h1; eh[end].e = MINUS_INF; } score = eh[qlen].h; if (n_cigar_ && cigar_) { // backtrack int n_cigar = 0, m_cigar = 0, which = 0; uint32_t *cigar = 0, tmp; i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell while (i >= 0 && k >= 0) { which = z[(long)i * n_col + (k - (i > w? 
i - w : 0))] >> (which<<1) & 3; if (which == 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --k; else if (which == 1) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --i; else cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --k; } if (i >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, i + 1); if (k >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, k + 1); for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp; *n_cigar_ = n_cigar, *cigar_ = cigar; } free(eh); free(qp); free(z); return score; } int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_) { return ksw_global2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, w, n_cigar_, cigar_); } /******************************************* * Main function (not compiled by default) * *******************************************/ #ifdef _KSW_MAIN #include #include #include #include "kseq.h" KSEQ_INIT(gzFile, err_gzread) unsigned char seq_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }; int main(int argc, char *argv[]) { int c, sa = 1, sb = 3, i, j, k, forward_only = 0, max_rseq 
= 0; int8_t mat[25]; int gapo = 5, gape = 2, minsc = 0, xtra = KSW_XSTART; uint8_t *rseq = 0; gzFile fpt, fpq; kseq_t *kst, *ksq; // parse command line while ((c = getopt(argc, argv, "a:b:q:r:ft:1")) >= 0) { switch (c) { case 'a': sa = atoi(optarg); break; case 'b': sb = atoi(optarg); break; case 'q': gapo = atoi(optarg); break; case 'r': gape = atoi(optarg); break; case 't': minsc = atoi(optarg); break; case 'f': forward_only = 1; break; case '1': xtra |= KSW_XBYTE; break; } } if (optind + 2 > argc) { fprintf(stderr, "Usage: ksw [-1] [-f] [-a%d] [-b%d] [-q%d] [-r%d] [-t%d] \n", sa, sb, gapo, gape, minsc); return 1; } if (minsc > 0xffff) minsc = 0xffff; xtra |= KSW_XSUBO | minsc; // initialize scoring matrix for (i = k = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) mat[k++] = i == j? sa : -sb; mat[k++] = 0; // ambiguous base } for (j = 0; j < 5; ++j) mat[k++] = 0; // open file fpt = xzopen(argv[optind], "r"); kst = kseq_init(fpt); fpq = xzopen(argv[optind+1], "r"); ksq = kseq_init(fpq); // all-pair alignment while (kseq_read(ksq) > 0) { kswq_t *q[2] = {0, 0}; kswr_t r; for (i = 0; i < (int)ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]]; if (!forward_only) { // reverse if ((int)ksq->seq.m > max_rseq) { max_rseq = ksq->seq.m; rseq = (uint8_t*)realloc(rseq, max_rseq); } for (i = 0, j = ksq->seq.l - 1; i < (int)ksq->seq.l; ++i, --j) rseq[j] = ksq->seq.s[i] == 4? 
4 : 3 - ksq->seq.s[i]; } gzrewind(fpt); kseq_rewind(kst); while (kseq_read(kst) > 0) { for (i = 0; i < (int)kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]]; r = ksw_align(ksq->seq.l, (uint8_t*)ksq->seq.s, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[0]); if (r.score >= minsc) err_printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, r.qb, r.qe+1, r.score, r.score2, r.te2); if (rseq) { r = ksw_align(ksq->seq.l, rseq, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[1]); if (r.score >= minsc) err_printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, (int)ksq->seq.l - r.qb, (int)ksq->seq.l - 1 - r.qe, r.score, r.score2, r.te2); } } free(q[0]); free(q[1]); } free(rseq); kseq_destroy(kst); err_gzclose(fpt); kseq_destroy(ksq); err_gzclose(fpq); return 0; } #endif bwa-0.7.17/ksw.h000066400000000000000000000117431317342117100133420ustar00rootroot00000000000000#ifndef __AC_KSW_H #define __AC_KSW_H #include #define KSW_XBYTE 0x10000 #define KSW_XSTOP 0x20000 #define KSW_XSUBO 0x40000 #define KSW_XSTART 0x80000 struct _kswq_t; typedef struct _kswq_t kswq_t; typedef struct { int score; // best score int te, qe; // target end and query end int score2, te2; // second best score and ending position on the target int tb, qb; // target start and query start } kswr_t; #ifdef __cplusplus extern "C" { #endif /** * Aligning two sequences * * @param qlen length of the query sequence (typically =0, *gscore keeps the best score such that * the entire query sequence is aligned; *gtle keeps the position on the * target where *gscore is achieved. Returning *gscore and *gtle helps the * caller to decide whether an end-to-end hit or a partial hit is preferred. 
* * The first 9 parameters are identical to those in ksw_global() * * @param h0 alignment score of upstream sequences * @param _qle (out) length of the query in the alignment * @param _tle (out) length of the target in the alignment * @param _gtle (out) length of the target if query is fully aligned * @param _gscore (out) score of the best end-to-end alignment; negative if not found * * @return best semi-local alignment score */ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off); int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off); #ifdef __cplusplus } #endif #endif bwa-0.7.17/kthread.c000066400000000000000000000072401317342117100141500ustar00rootroot00000000000000#include #include #include #include /************ * kt_for() * ************/ struct kt_for_t; typedef struct { struct kt_for_t *t; long i; } ktf_worker_t; typedef struct kt_for_t { int n_threads; long n; ktf_worker_t *w; void (*func)(void*,long,int); void *data; } kt_for_t; static inline long steal_work(kt_for_t *t) { int i, min_i = -1; long k, min = LONG_MAX; for (i = 0; i < t->n_threads; ++i) if (min > t->w[i].i) min = t->w[i].i, min_i = i; k = __sync_fetch_and_add(&t->w[min_i].i, t->n_threads); return k >= t->n? 
-1 : k; } static void *ktf_worker(void *data) { ktf_worker_t *w = (ktf_worker_t*)data; long i; for (;;) { i = __sync_fetch_and_add(&w->i, w->t->n_threads); if (i >= w->t->n) break; w->t->func(w->t->data, i, w - w->t->w); } while ((i = steal_work(w->t)) >= 0) w->t->func(w->t->data, i, w - w->t->w); pthread_exit(0); } void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n) { int i; kt_for_t t; pthread_t *tid; t.func = func, t.data = data, t.n_threads = n_threads, t.n = n; t.w = (ktf_worker_t*)alloca(n_threads * sizeof(ktf_worker_t)); tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t)); for (i = 0; i < n_threads; ++i) t.w[i].t = &t, t.w[i].i = i; for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktf_worker, &t.w[i]); for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0); } /***************** * kt_pipeline() * *****************/ struct ktp_t; typedef struct { struct ktp_t *pl; int64_t index; int step; void *data; } ktp_worker_t; typedef struct ktp_t { void *shared; void *(*func)(void*, int, void*); int64_t index; int n_workers, n_steps; ktp_worker_t *workers; pthread_mutex_t mutex; pthread_cond_t cv; } ktp_t; static void *ktp_worker(void *data) { ktp_worker_t *w = (ktp_worker_t*)data; ktp_t *p = w->pl; while (w->step < p->n_steps) { // test whether we can kick off the job with this worker pthread_mutex_lock(&p->mutex); for (;;) { int i; // test whether another worker is doing the same step for (i = 0; i < p->n_workers; ++i) { if (w == &p->workers[i]) continue; // ignore itself if (p->workers[i].step <= w->step && p->workers[i].index < w->index) break; } if (i == p->n_workers) break; // no workers with smaller indices are doing w->step or the previous steps pthread_cond_wait(&p->cv, &p->mutex); } pthread_mutex_unlock(&p->mutex); // working on w->step w->data = p->func(p->shared, w->step, w->step? 
w->data : 0); // for the first step, input is NULL // update step and let other workers know pthread_mutex_lock(&p->mutex); w->step = w->step == p->n_steps - 1 || w->data? (w->step + 1) % p->n_steps : p->n_steps; if (w->step == 0) w->index = p->index++; pthread_cond_broadcast(&p->cv); pthread_mutex_unlock(&p->mutex); } pthread_exit(0); } void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps) { ktp_t aux; pthread_t *tid; int i; if (n_threads < 1) n_threads = 1; aux.n_workers = n_threads; aux.n_steps = n_steps; aux.func = func; aux.shared = shared_data; aux.index = 0; pthread_mutex_init(&aux.mutex, 0); pthread_cond_init(&aux.cv, 0); aux.workers = (ktp_worker_t*)alloca(n_threads * sizeof(ktp_worker_t)); for (i = 0; i < n_threads; ++i) { ktp_worker_t *w = &aux.workers[i]; w->step = 0; w->pl = &aux; w->data = 0; w->index = aux.index++; } tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t)); for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktp_worker, &aux.workers[i]); for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0); pthread_mutex_destroy(&aux.mutex); pthread_cond_destroy(&aux.cv); } bwa-0.7.17/kvec.h000066400000000000000000000055741317342117100134730ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008, by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* An example: #include "kvec.h" int main() { kvec_t(int) array; kv_init(array); kv_push(int, array, 10); // append kv_a(int, array, 20) = 5; // dynamic kv_A(array, 20) = 4; // static kv_destroy(array); return 0; } */ /* 2008-09-22 (0.1.0): * The initial version. */ #ifndef AC_KVEC_H #define AC_KVEC_H #include #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif #define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #define kvec_t(type) struct { size_t n, m; type *a; } #define kv_init(v) ((v).n = (v).m = 0, (v).a = 0) #define kv_destroy(v) free((v).a) #define kv_A(v, i) ((v).a[(i)]) #define kv_pop(v) ((v).a[--(v).n]) #define kv_size(v) ((v).n) #define kv_max(v) ((v).m) #define kv_resize(type, v, s) ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m)) #define kv_copy(type, v1, v0) do { \ if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \ (v1).n = (v0).n; \ memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \ } while (0) \ #define kv_push(type, v, x) do { \ if ((v).n == (v).m) { \ (v).m = (v).m? (v).m<<1 : 2; \ (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ } \ (v).a[(v).n++] = (x); \ } while (0) #define kv_pushp(type, v) ((((v).n == (v).m)? \ ((v).m = ((v).m? (v).m<<1 : 2), \ (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ : 0), &(v).a[(v).n++]) #define kv_a(type, v, i) (((v).m <= (size_t)(i)? \ ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \ (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ : (v).n <= (size_t)(i)? 
(v).n = (i) + 1 \ : 0), (v).a[(i)]) #endif bwa-0.7.17/main.c000066400000000000000000000107541317342117100134560ustar00rootroot00000000000000#include #include #include "kstring.h" #include "utils.h" #ifndef PACKAGE_VERSION #define PACKAGE_VERSION "0.7.17-r1188" #endif int bwa_fa2pac(int argc, char *argv[]); int bwa_pac2bwt(int argc, char *argv[]); int bwa_bwtupdate(int argc, char *argv[]); int bwa_bwt2sa(int argc, char *argv[]); int bwa_index(int argc, char *argv[]); int bwt_bwtgen_main(int argc, char *argv[]); int bwa_aln(int argc, char *argv[]); int bwa_sai2sam_se(int argc, char *argv[]); int bwa_sai2sam_pe(int argc, char *argv[]); int bwa_bwtsw2(int argc, char *argv[]); int main_fastmap(int argc, char *argv[]); int main_mem(int argc, char *argv[]); int main_shm(int argc, char *argv[]); int main_pemerge(int argc, char *argv[]); int main_maxk(int argc, char *argv[]); static int usage() { fprintf(stderr, "\n"); fprintf(stderr, "Program: bwa (alignment via Burrows-Wheeler transformation)\n"); fprintf(stderr, "Version: %s\n", PACKAGE_VERSION); fprintf(stderr, "Contact: Heng Li \n\n"); fprintf(stderr, "Usage: bwa [options]\n\n"); fprintf(stderr, "Command: index index sequences in the FASTA format\n"); fprintf(stderr, " mem BWA-MEM algorithm\n"); fprintf(stderr, " fastmap identify super-maximal exact matches\n"); fprintf(stderr, " pemerge merge overlapping paired ends (EXPERIMENTAL)\n"); fprintf(stderr, " aln gapped/ungapped alignment\n"); fprintf(stderr, " samse generate alignment (single ended)\n"); fprintf(stderr, " sampe generate alignment (paired ended)\n"); fprintf(stderr, " bwasw BWA-SW for long queries\n"); fprintf(stderr, "\n"); fprintf(stderr, " shm manage indices in shared memory\n"); fprintf(stderr, " fa2pac convert FASTA to PAC format\n"); fprintf(stderr, " pac2bwt generate BWT from PAC\n"); fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n"); fprintf(stderr, " bwtupdate update .bwt to the new format\n"); fprintf(stderr, " bwt2sa 
generate SA from BWT and Occ\n"); fprintf(stderr, "\n"); fprintf(stderr, "Note: To use BWA, you need to first index the genome with `bwa index'.\n" " There are three alignment algorithms in BWA: `mem', `bwasw', and\n" " `aln/samse/sampe'. If you are not sure which to use, try `bwa mem'\n" " first. Please `man ./bwa.1' for the manual.\n\n"); return 1; } int main(int argc, char *argv[]) { extern char *bwa_pg; int i, ret; double t_real; kstring_t pg = {0,0,0}; t_real = realtime(); ksprintf(&pg, "@PG\tID:bwa\tPN:bwa\tVN:%s\tCL:%s", PACKAGE_VERSION, argv[0]); for (i = 1; i < argc; ++i) ksprintf(&pg, " %s", argv[i]); bwa_pg = pg.s; if (argc < 2) return usage(); if (strcmp(argv[1], "fa2pac") == 0) ret = bwa_fa2pac(argc-1, argv+1); else if (strcmp(argv[1], "pac2bwt") == 0) ret = bwa_pac2bwt(argc-1, argv+1); else if (strcmp(argv[1], "pac2bwtgen") == 0) ret = bwt_bwtgen_main(argc-1, argv+1); else if (strcmp(argv[1], "bwtupdate") == 0) ret = bwa_bwtupdate(argc-1, argv+1); else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1); else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1); else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1); else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1); else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1); else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1); else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1); else if (strcmp(argv[1], "shm") == 0) ret = main_shm(argc-1, argv+1); else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1); else if (strcmp(argv[1], "maxk") == 0) ret = main_maxk(argc-1, argv+1); else { fprintf(stderr, "[main] unrecognized command 
'%s'\n", argv[1]); return 1; } err_fflush(stdout); err_fclose(stdout); if (ret == 0) { fprintf(stderr, "[%s] Version: %s\n", __func__, PACKAGE_VERSION); fprintf(stderr, "[%s] CMD:", __func__); for (i = 0; i < argc; ++i) fprintf(stderr, " %s", argv[i]); fprintf(stderr, "\n[%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, realtime() - t_real, cputime()); } free(bwa_pg); return ret; } bwa-0.7.17/malloc_wrap.c000066400000000000000000000026511317342117100150270ustar00rootroot00000000000000#include #include #include #include #ifdef USE_MALLOC_WRAPPERS /* Don't wrap ourselves */ # undef USE_MALLOC_WRAPPERS #endif #include "malloc_wrap.h" void *wrap_calloc(size_t nmemb, size_t size, const char *file, unsigned int line, const char *func) { void *p = calloc(nmemb, size); if (NULL == p) { fprintf(stderr, "[%s] Failed to allocate %zu bytes at %s line %u: %s\n", func, nmemb * size, file, line, strerror(errno)); exit(EXIT_FAILURE); } return p; } void *wrap_malloc(size_t size, const char *file, unsigned int line, const char *func) { void *p = malloc(size); if (NULL == p) { fprintf(stderr, "[%s] Failed to allocate %zu bytes at %s line %u: %s\n", func, size, file, line, strerror(errno)); exit(EXIT_FAILURE); } return p; } void *wrap_realloc(void *ptr, size_t size, const char *file, unsigned int line, const char *func) { void *p = realloc(ptr, size); if (NULL == p) { fprintf(stderr, "[%s] Failed to allocate %zu bytes at %s line %u: %s\n", func, size, file, line, strerror(errno)); exit(EXIT_FAILURE); } return p; } char *wrap_strdup(const char *s, const char *file, unsigned int line, const char *func) { char *p = strdup(s); if (NULL == p) { fprintf(stderr, "[%s] Failed to allocate %zu bytes at %s line %u: %s\n", func, strlen(s), file, line, strerror(errno)); exit(EXIT_FAILURE); } return p; } bwa-0.7.17/malloc_wrap.h000066400000000000000000000022371317342117100150340ustar00rootroot00000000000000#ifndef MALLOC_WRAP_H #define MALLOC_WRAP_H #include /* Avoid breaking the usual 
definitions */ #include #ifdef __cplusplus extern "C" { #endif void *wrap_calloc(size_t nmemb, size_t size, const char *file, unsigned int line, const char *func); void *wrap_malloc(size_t size, const char *file, unsigned int line, const char *func); void *wrap_realloc(void *ptr, size_t size, const char *file, unsigned int line, const char *func); char *wrap_strdup(const char *s, const char *file, unsigned int line, const char *func); #ifdef __cplusplus } #endif #ifdef USE_MALLOC_WRAPPERS # ifdef calloc # undef calloc # endif # define calloc(n, s) wrap_calloc( (n), (s), __FILE__, __LINE__, __func__) # ifdef malloc # undef malloc # endif # define malloc(s) wrap_malloc( (s), __FILE__, __LINE__, __func__) # ifdef realloc # undef realloc # endif # define realloc(p, s) wrap_realloc((p), (s), __FILE__, __LINE__, __func__) # ifdef strdup # undef strdup # endif # define strdup(s) wrap_strdup( (s), __FILE__, __LINE__, __func__) #endif /* USE_MALLOC_WRAPPERS */ #endif /* MALLOC_WRAP_H */ bwa-0.7.17/maxk.c000066400000000000000000000031621317342117100134650ustar00rootroot00000000000000#include #include #include #include #include #include #include "bwa.h" #include "bwamem.h" #include "kseq.h" KSEQ_DECLARE(gzFile) int main_maxk(int argc, char *argv[]) { int i, c, self = 0, max_len = 0; uint8_t *cnt = 0; uint64_t hist[256]; bwt_t *bwt; kseq_t *ks; smem_i *itr; gzFile fp; while ((c = getopt(argc, argv, "s")) >= 0) { if (c == 's') self = 1; } if (optind + 2 > argc) { fprintf(stderr, "Usage: bwa maxk [-s] \n"); return 1; } fp = strcmp(argv[optind+1], "-")? 
gzopen(argv[optind+1], "rb") : gzdopen(fileno(stdin), "rb"); ks = kseq_init(fp); bwt = bwt_restore_bwt(argv[optind]); itr = smem_itr_init(bwt); if (self) smem_config(itr, 2, INT_MAX, 0); memset(hist, 0, 8 * 256); while (kseq_read(ks) >= 0) { const bwtintv_v *a; if (ks->seq.l > max_len) { max_len = ks->seq.l; kroundup32(max_len); cnt = realloc(cnt, max_len); } memset(cnt, 0, ks->seq.l); for (i = 0; i < ks->seq.l; ++i) ks->seq.s[i] = nst_nt4_table[(int)ks->seq.s[i]]; smem_set_query(itr, ks->seq.l, (uint8_t*)ks->seq.s); while ((a = smem_next(itr)) != 0) { for (i = 0; i < a->n; ++i) { bwtintv_t *p = &a->a[i]; int j, l, start = p->info>>32, end = (uint32_t)p->info; l = end - start < 255? end - start : 255; for (j = start; j < end; ++j) cnt[j] = cnt[j] > l? cnt[j] : l; } } for (i = 0; i < ks->seq.l; ++i) ++hist[cnt[i]]; } for (i = 0; i < 256; ++i) printf("%d\t%lld\n", i, (long long)hist[i]); free(cnt); smem_itr_destroy(itr); bwt_destroy(bwt); kseq_destroy(ks); gzclose(fp); return 0; } bwa-0.7.17/pemerge.c000066400000000000000000000211711317342117100141510ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "ksw.h" #include "kseq.h" #include "kstring.h" #include "bwa.h" #include "utils.h" KSEQ_DECLARE(gzFile) #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif #define MAX_SCORE_RATIO 0.9f #define MAX_ERR 8 static const char *err_msg[MAX_ERR+1] = { "successful merges", "low-scoring pairs", "pairs where the best SW alignment is not an overlap (long left end)", "pairs where the best SW alignment is not an overlap (long right end)", "pairs with large 2nd best SW score", "pairs with gapped overlap", "pairs where the end-to-end alignment is inconsistent with SW", "pairs potentially with tandem overlaps", "pairs with high sum of errors" }; typedef struct { int a, b, q, r, w; int q_def, q_thres; int T; int chunk_size; int n_threads; int flag; // bit 1: print merged; 2: print unmerged int8_t mat[25]; } pem_opt_t; pem_opt_t 
*pem_opt_init() { pem_opt_t *opt; opt = calloc(1, sizeof(pem_opt_t)); opt->a = 5; opt->b = 4; opt->q = 2, opt->r = 17; opt->w = 20; opt->T = opt->a * 10; opt->q_def = 20; opt->q_thres = 70; opt->chunk_size = 10000000; opt->n_threads = 1; opt->flag = 3; bwa_fill_scmat(opt->a, opt->b, opt->mat); return opt; } int bwa_pemerge(const pem_opt_t *opt, bseq1_t x[2]) { uint8_t *s[2], *q[2], *seq, *qual; int i, xtra, l, l_seq, sum_q, ret = 0; kswr_t r; s[0] = malloc(x[0].l_seq); q[0] = malloc(x[0].l_seq); s[1] = malloc(x[1].l_seq); q[1] = malloc(x[1].l_seq); for (i = 0; i < x[0].l_seq; ++i) { int c = x[0].seq[i]; s[0][i] = c < 0 || c > 127? 4 : c <= 4? c : nst_nt4_table[c]; q[0][i] = x[0].qual? x[0].qual[i] - 33 : opt->q_def; } for (i = 0; i < x[1].l_seq; ++i) { int c = x[1].seq[x[1].l_seq - 1 - i]; c = c < 0 || c > 127? 4 : c < 4? c : nst_nt4_table[c]; s[1][i] = c < 4? 3 - c : 4; q[1][i] = x[1].qual? x[1].qual[x[1].l_seq - 1 - i] - 33 : opt->q_def; } xtra = KSW_XSTART | KSW_XSUBO; r = ksw_align(x[1].l_seq, s[1], x[0].l_seq, s[0], 5, opt->mat, opt->q, opt->r, xtra, 0); ++r.qe; ++r.te; // change to the half-close-half-open coordinates if (r.score < opt->T) { ret = -1; goto pem_ret; } // poor alignment if (r.tb < r.qb) { ret = -2; goto pem_ret; } // no enough space for the left end if (x[0].l_seq - r.te > x[1].l_seq - r.qe) { ret = -3; goto pem_ret; } // no enough space for the right end if ((double)r.score2 / r.score >= MAX_SCORE_RATIO) { ret = -4; goto pem_ret; } // the second best score is too large if (r.qe - r.qb != r.te - r.tb) { ret = -5; goto pem_ret; } // we do not allow gaps { // test tandem match; O(n^2) int max_m, max_m2, min_l, max_l, max_l2; max_m = max_m2 = 0; max_l = max_l2 = 0; min_l = x[0].l_seq < x[1].l_seq? x[0].l_seq : x[1].l_seq; for (l = 1; l < min_l; ++l) { int m = 0, o = x[0].l_seq - l; uint8_t *s0o = &s[0][o], *s1 = s[1]; for (i = 0; i < l; ++i) // TODO: in principle, this can be done with SSE2. It is the bottleneck! 
m += opt->mat[(s1[i]<<2) + s1[i] + s0o[i]]; // equivalent to s[1][i]*5 + s[0][o+i] if (m > max_m) max_m2 = max_m, max_m = m, max_l2 = max_l, max_l = l; else if (m > max_m2) max_m2 = m, max_l2 = l; } if (max_m < opt->T || max_l != x[0].l_seq - (r.tb - r.qb)) { ret = -6; goto pem_ret; } if (max_l2 < max_l && max_m2 >= opt->T && (double)(max_m2 + (max_l - max_l2) * opt->a) / max_m >= MAX_SCORE_RATIO) { ret = -7; goto pem_ret; } if (max_l2 > max_l && (double)max_m2 / max_m >= MAX_SCORE_RATIO) { ret = -7; goto pem_ret; } } l = x[0].l_seq - (r.tb - r.qb); // length to merge l_seq = x[0].l_seq + x[1].l_seq - l; seq = malloc(l_seq + 1); qual = malloc(l_seq + 1); memcpy(seq, s[0], x[0].l_seq); memcpy(seq + x[0].l_seq, &s[1][l], x[1].l_seq - l); memcpy(qual, q[0], x[0].l_seq); memcpy(qual + x[0].l_seq, &q[1][l], x[1].l_seq - l); for (i = 0, sum_q = 0; i < l; ++i) { int k = x[0].l_seq - l + i; if (s[0][k] == 4) { // ambiguous seq[k] = s[1][i]; qual[k] = q[1][i]; } else if (s[1][i] == 4) { // do nothing } else if (s[0][k] == s[1][i]) { qual[k] = qual[k] > q[1][i]? qual[k] : q[1][i]; } else { // s[0][k] != s[1][i] and neither is N int qq = q[0][k] < q[1][i]? q[0][k] : q[1][i]; sum_q += qq >= 3? qq<<1 : 1; seq[k] = q[0][k] > q[1][i]? s[0][k] : s[1][i]; qual[k] = abs((int)q[0][k] - (int)q[1][i]); } } if (sum_q>>1 > opt->q_thres) { // too many mismatches free(seq); free(qual); ret = -8; goto pem_ret; } for (i = 0; i < l_seq; ++i) seq[i] = "ACGTN"[(int)seq[i]], qual[i] += 33; seq[l_seq] = qual[l_seq] = 0; free(x[1].name); free(x[1].seq); free(x[1].qual); free(x[1].comment); memset(&x[1], 0, sizeof(bseq1_t)); free(x[0].seq); free(x[0].qual); x[0].l_seq = l_seq; x[0].seq = (char*)seq; x[0].qual = (char*)qual; pem_ret: free(s[0]); free(s[1]); free(q[0]); free(q[1]); return ret; } static inline void print_bseq(const bseq1_t *s, int rn) { err_putchar(s->qual? 
'@' : '>'); err_fputs(s->name, stdout); if (rn == 1 || rn == 2) { err_putchar('/'); err_putchar('0' + rn); err_putchar('\n'); } else err_puts(" merged"); err_puts(s->seq); if (s->qual) { err_puts("+"); err_puts(s->qual); } } typedef struct { int n, start; bseq1_t *seqs; int64_t cnt[MAX_ERR+1]; const pem_opt_t *opt; } worker_t; void *worker(void *data) { worker_t *w = (worker_t*)data; int i; for (i = w->start; i < w->n>>1; i += w->opt->n_threads) ++w->cnt[-bwa_pemerge(w->opt, &w->seqs[i<<1])]; return 0; } static void process_seqs(const pem_opt_t *opt, int n_, bseq1_t *seqs, int64_t cnt[MAX_ERR+1]) { int i, j, n = n_>>1<<1; worker_t *w; w = calloc(opt->n_threads, sizeof(worker_t)); for (i = 0; i < opt->n_threads; ++i) { worker_t *p = &w[i]; p->start = i; p->n = n; p->opt = opt; p->seqs = seqs; } if (opt->n_threads == 1) { worker(w); } else { pthread_t *tid; tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker, &w[i]); for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); free(tid); } for (i = 0; i < opt->n_threads; ++i) { worker_t *p = &w[i]; for (j = 0; j <= MAX_ERR; ++j) cnt[j] += p->cnt[j]; } free(w); for (i = 0; i < n>>1; ++i) { if (seqs[i<<1|1].l_seq != 0) { if (opt->flag&2) { print_bseq(&seqs[i<<1|0], 1); print_bseq(&seqs[i<<1|1], 2); } } else if (opt->flag&1) print_bseq(&seqs[i<<1|0], 0); } for (i = 0; i < n; ++i) { bseq1_t *s = &seqs[i]; free(s->name); free(s->seq); free(s->qual); free(s->comment); } } int main_pemerge(int argc, char *argv[]) { int c, flag = 0, i, n, min_ovlp = 10; int64_t cnt[MAX_ERR+1]; bseq1_t *bseq; gzFile fp, fp2 = 0; kseq_t *ks, *ks2 = 0; pem_opt_t *opt; opt = pem_opt_init(); while ((c = getopt(argc, argv, "muQ:t:T:")) >= 0) { if (c == 'm') flag |= 1; else if (c == 'u') flag |= 2; else if (c == 'Q') opt->q_thres = atoi(optarg); else if (c == 't') opt->n_threads = atoi(optarg); else if (c == 'T') min_ovlp = atoi(optarg); else return 1; } if (flag == 0) 
flag = 3; opt->flag = flag; opt->T = opt->a * min_ovlp; if (optind == argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: bwa pemerge [-mu] [read2.fq]\n\n"); fprintf(stderr, "Options: -m output merged reads only\n"); fprintf(stderr, " -u output unmerged reads only\n"); fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); fprintf(stderr, " -T INT minimum end overlap [%d]\n", min_ovlp); fprintf(stderr, " -Q INT max sum of errors [%d]\n", opt->q_thres); fprintf(stderr, "\n"); free(opt); return 1; } fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); if (NULL == fp) { fprintf(stderr, "Couldn't open %s : %s\n", strcmp(argv[optind], "-") ? argv[optind] : "stdin", errno ? strerror(errno) : "Out of memory"); exit(EXIT_FAILURE); } ks = kseq_init(fp); if (optind + 1 < argc) { fp2 = strcmp(argv[optind+1], "-")? gzopen(argv[optind+1], "r") : gzdopen(fileno(stdin), "r"); if (NULL == fp) { fprintf(stderr, "Couldn't open %s : %s\n", strcmp(argv[optind+1], "-") ? argv[optind+1] : "stdin", errno ? strerror(errno) : "Out of memory"); exit(EXIT_FAILURE); } ks2 = kseq_init(fp2); } memset(cnt, 0, 8 * (MAX_ERR+1)); while ((bseq = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) { process_seqs(opt, n, bseq, cnt); free(bseq); } fprintf(stderr, "%12ld %s\n", (long)cnt[0], err_msg[0]); for (i = 1; i <= MAX_ERR; ++i) fprintf(stderr, "%12ld %s\n", (long)cnt[i], err_msg[i]); kseq_destroy(ks); err_gzclose(fp); if (ks2) { kseq_destroy(ks2); err_gzclose(fp2); } free(opt); err_fflush(stdout); return 0; } bwa-0.7.17/qualfa2fq.pl000077500000000000000000000011511317342117100145770ustar00rootroot00000000000000#!/usr/bin/env perl use strict; use warnings; die("Usage: qualfa2fq.pl \n") if (@ARGV != 2); my ($fhs, $fhq, $q); open($fhs, ($ARGV[0] =~ /\.gz$/)? "gzip -dc $ARGV[0] |" : $ARGV[0]) || die; open($fhq, ($ARGV[1] =~ /\.gz$/)? 
"gzip -dc $ARGV[1] |" : $ARGV[1]) || die; $/ = ">"; <$fhs>; <$fhq>; $/ = "\n"; while (<$fhs>) { $q = <$fhq>; print "\@$_"; $/ = ">"; $_ = <$fhs>; $q = <$fhq>; chomp; chomp($q); $q =~ s/\s*(\d+)\s*/chr($1+33)/eg; print $_, "+\n"; for (my $i = 0; $i < length($q); $i += 60) { print substr($q, $i, 60), "\n"; } $/ = "\n"; } close($fhs); close($fhq); bwa-0.7.17/rle.c000066400000000000000000000113251317342117100133070ustar00rootroot00000000000000#include #include #include #include #include "rle.h" const uint8_t rle_auxtab[8] = { 0x01, 0x11, 0x21, 0x31, 0x03, 0x13, 0x07, 0x17 }; // insert symbol $a after $x symbols in $str; marginal counts added to $cnt; returns the size increase int rle_insert_cached(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6], int *beg, int64_t bc[6]) { uint16_t *nptr = (uint16_t*)block; int diff; block += 2; // skip the first 2 counting bytes if (*nptr == 0) { memset(cnt, 0, 48); diff = rle_enc1(block, a, rl); } else { uint8_t *p, *end = block + *nptr, *q; int64_t pre, z, l = 0, tot, beg_l; int c = -1, n_bytes = 0, n_bytes2, t = 0; uint8_t tmp[24]; beg_l = bc[0] + bc[1] + bc[2] + bc[3] + bc[4] + bc[5]; tot = ec[0] + ec[1] + ec[2] + ec[3] + ec[4] + ec[5]; if (x < beg_l) { beg_l = 0, *beg = 0; memset(bc, 0, 48); } if (x == beg_l) { p = q = block + (*beg); z = beg_l; memcpy(cnt, bc, 48); } else if (x - beg_l <= ((tot-beg_l)>>1) + ((tot-beg_l)>>3)) { // forward z = beg_l; p = block + (*beg); memcpy(cnt, bc, 48); while (z < x) { rle_dec1(p, c, l); z += l; cnt[c] += l; } for (q = p - 1; *q>>6 == 2; --q); } else { // backward memcpy(cnt, ec, 48); z = tot; p = end; while (z >= x) { --p; if (*p>>6 != 2) { l |= *p>>7? 
(int64_t)rle_auxtab[*p>>3&7]>>4 << t : *p>>3; z -= l; cnt[*p&7] -= l; l = 0; t = 0; } else { l |= (*p&0x3fL) << t; t += 6; } } q = p; rle_dec1(p, c, l); z += l; cnt[c] += l; } *beg = q - block; memcpy(bc, cnt, 48); bc[c] -= l; n_bytes = p - q; if (x == z && a != c && p < end) { // then try the next run int tc; int64_t tl; q = p; rle_dec1(q, tc, tl); if (a == tc) c = tc, n_bytes = q - p, l = tl, z += l, p = q, cnt[tc] += tl; } if (z != x) cnt[c] -= z - x; pre = x - (z - l); p -= n_bytes; if (a == c) { // insert to the same run n_bytes2 = rle_enc1(tmp, c, l + rl); } else if (x == z) { // at the end; append to the existing run p += n_bytes; n_bytes = 0; n_bytes2 = rle_enc1(tmp, a, rl); } else { // break the current run n_bytes2 = rle_enc1(tmp, c, pre); n_bytes2 += rle_enc1(tmp + n_bytes2, a, rl); n_bytes2 += rle_enc1(tmp + n_bytes2, c, l - pre); } if (n_bytes != n_bytes2 && end != p + n_bytes) // size changed memmove(p + n_bytes2, p + n_bytes, end - p - n_bytes); memcpy(p, tmp, n_bytes2); diff = n_bytes2 - n_bytes; } return (*nptr += diff); } int rle_insert(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6]) { int beg = 0; int64_t bc[6]; memset(bc, 0, 48); return rle_insert_cached(block, x, a, rl, cnt, ec, &beg, bc); } void rle_split(uint8_t *block, uint8_t *new_block) { int n = *(uint16_t*)block; uint8_t *end = block + 2 + n, *q = block + 2 + (n>>1); while (*q>>6 == 2) --q; memcpy(new_block + 2, q, end - q); *(uint16_t*)new_block = end - q; *(uint16_t*)block = q - block - 2; } void rle_count(const uint8_t *block, int64_t cnt[6]) { const uint8_t *q = block + 2, *end = q + *(uint16_t*)block; while (q < end) { int c; int64_t l; rle_dec1(q, c, l); cnt[c] += l; } } void rle_print(const uint8_t *block, int expand) { const uint16_t *p = (const uint16_t*)block; const uint8_t *q = block + 2, *end = block + 2 + *p; while (q < end) { int c; int64_t l, x; rle_dec1(q, c, l); if (expand) for (x = 0; x < l; ++x) putchar("$ACGTN"[c]); else 
printf("%c%ld", "$ACGTN"[c], (long)l); } putchar('\n'); } void rle_rank2a(const uint8_t *block, int64_t x, int64_t y, int64_t *cx, int64_t *cy, const int64_t ec[6]) { int a; int64_t tot, cnt[6]; const uint8_t *p; y = y >= x? y : x; tot = ec[0] + ec[1] + ec[2] + ec[3] + ec[4] + ec[5]; if (tot == 0) return; if (x <= (tot - y) + (tot>>3)) { int c = 0; int64_t l, z = 0; memset(cnt, 0, 48); p = block + 2; while (z < x) { rle_dec1(p, c, l); z += l; cnt[c] += l; } for (a = 0; a != 6; ++a) cx[a] += cnt[a]; cx[c] -= z - x; if (cy) { while (z < y) { rle_dec1(p, c, l); z += l; cnt[c] += l; } for (a = 0; a != 6; ++a) cy[a] += cnt[a]; cy[c] -= z - y; } } else { #define move_backward(_x) \ while (z >= (_x)) { \ --p; \ if (*p>>6 != 2) { \ l |= *p>>7? (int64_t)rle_auxtab[*p>>3&7]>>4 << t : *p>>3; \ z -= l; cnt[*p&7] -= l; \ l = 0; t = 0; \ } else { \ l |= (*p&0x3fL) << t; \ t += 6; \ } \ } \ int t = 0; int64_t l = 0, z = tot; memcpy(cnt, ec, 48); p = block + 2 + *(const uint16_t*)block; if (cy) { move_backward(y) for (a = 0; a != 6; ++a) cy[a] += cnt[a]; cy[*p&7] += y - z; } move_backward(x) for (a = 0; a != 6; ++a) cx[a] += cnt[a]; cx[*p&7] += x - z; #undef move_backward } } bwa-0.7.17/rle.h000066400000000000000000000035571317342117100133240ustar00rootroot00000000000000#ifndef RLE6_H_ #define RLE6_H_ #include #ifdef __GNUC__ #define LIKELY(x) __builtin_expect((x),1) #else #define LIKELY(x) (x) #endif #ifdef __cplusplus extern "C" { #endif int rle_insert_cached(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6], int *beg, int64_t bc[6]); int rle_insert(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t end_cnt[6]); void rle_split(uint8_t *block, uint8_t *new_block); void rle_count(const uint8_t *block, int64_t cnt[6]); void rle_rank2a(const uint8_t *block, int64_t x, int64_t y, int64_t *cx, int64_t *cy, const int64_t ec[6]); #define rle_rank1a(block, x, cx, ec) rle_rank2a(block, x, -1, cx, 0, ec) void rle_print(const uint8_t 
*block, int expand); #ifdef __cplusplus } #endif /****************** *** 43+3 codec *** ******************/ const uint8_t rle_auxtab[8]; #define RLE_MIN_SPACE 18 #define rle_nptr(block) ((uint16_t*)(block)) // decode one run (c,l) and move the pointer p #define rle_dec1(p, c, l) do { \ (c) = *(p) & 7; \ if (LIKELY((*(p)&0x80) == 0)) { \ (l) = *(p)++ >> 3; \ } else if (LIKELY(*(p)>>5 == 6)) { \ (l) = (*(p)&0x18L)<<3L | ((p)[1]&0x3fL); \ (p) += 2; \ } else { \ int n = ((*(p)&0x10) >> 2) + 4; \ (l) = *(p)++ >> 3 & 1; \ while (--n) (l) = ((l)<<6) | (*(p)++&0x3fL); \ } \ } while (0) static inline int rle_enc1(uint8_t *p, int c, int64_t l) { if (l < 1LL<<4) { *p = l << 3 | c; return 1; } else if (l < 1LL<<8) { *p = 0xC0 | l >> 6 << 3 | c; p[1] = 0x80 | (l & 0x3f); return 2; } else if (l < 1LL<<19) { *p = 0xE0 | l >> 18 << 3 | c; p[1] = 0x80 | (l >> 12 & 0x3f); p[2] = 0x80 | (l >> 6 & 0x3f); p[3] = 0x80 | (l & 0x3f); return 4; } else { int i, shift = 36; *p = 0xF0 | l >> 42 << 3 | c; for (i = 1; i < 8; ++i, shift -= 6) p[i] = 0x80 | (l>>shift & 0x3f); return 8; } } #endif bwa-0.7.17/rope.c000066400000000000000000000214731317342117100134770ustar00rootroot00000000000000#include #include #include #include #include #include "rle.h" #include "rope.h" /******************* *** Memory Pool *** *******************/ #define MP_CHUNK_SIZE 0x100000 // 1MB per chunk typedef struct { // memory pool for fast and compact memory allocation (no free) int size, i, n_elems; int64_t top, max; uint8_t **mem; } mempool_t; static mempool_t *mp_init(int size) { mempool_t *mp; mp = calloc(1, sizeof(mempool_t)); mp->size = size; mp->i = mp->n_elems = MP_CHUNK_SIZE / size; mp->top = -1; return mp; } static void mp_destroy(mempool_t *mp) { int64_t i; for (i = 0; i <= mp->top; ++i) free(mp->mem[i]); free(mp->mem); free(mp); } static inline void *mp_alloc(mempool_t *mp) { if (mp->i == mp->n_elems) { if (++mp->top == mp->max) { mp->max = mp->max? 
mp->max<<1 : 1; mp->mem = realloc(mp->mem, sizeof(void*) * mp->max); } mp->mem[mp->top] = calloc(mp->n_elems, mp->size); mp->i = 0; } return mp->mem[mp->top] + (mp->i++) * mp->size; } /*************** *** B+ rope *** ***************/ rope_t *rope_init(int max_nodes, int block_len) { rope_t *rope; rope = calloc(1, sizeof(rope_t)); if (block_len < 32) block_len = 32; rope->max_nodes = (max_nodes+ 1)>>1<<1; rope->block_len = (block_len + 7) >> 3 << 3; rope->node = mp_init(sizeof(rpnode_t) * rope->max_nodes); rope->leaf = mp_init(rope->block_len); rope->root = mp_alloc(rope->node); rope->root->n = 1; rope->root->is_bottom = 1; rope->root->p = mp_alloc(rope->leaf); return rope; } void rope_destroy(rope_t *rope) { mp_destroy(rope->node); mp_destroy(rope->leaf); free(rope); } static inline rpnode_t *split_node(rope_t *rope, rpnode_t *u, rpnode_t *v) { // split $v's child. $u is the first node in the bucket. $v and $u are in the same bucket. IMPORTANT: there is always enough room in $u int j, i = v - u; rpnode_t *w; // $w is the sibling of $v if (u == 0) { // only happens at the root; add a new root u = v = mp_alloc(rope->node); v->n = 1; v->p = rope->root; // the new root has the old root as the only child memcpy(v->c, rope->c, 48); for (j = 0; j < 6; ++j) v->l += v->c[j]; rope->root = v; } if (i != u->n - 1) // then make room for a new node memmove(v + 2, v + 1, sizeof(rpnode_t) * (u->n - i - 1)); ++u->n; w = v + 1; memset(w, 0, sizeof(rpnode_t)); w->p = mp_alloc(u->is_bottom? 
rope->leaf : rope->node); if (u->is_bottom) { // we are at the bottom level; $v->p is a string instead of a node uint8_t *p = (uint8_t*)v->p, *q = (uint8_t*)w->p; rle_split(p, q); rle_count(q, w->c); } else { // $v->p is a node, not a string rpnode_t *p = v->p, *q = w->p; // $v and $w are siblings and thus $p and $q are cousins p->n -= rope->max_nodes>>1; memcpy(q, p + p->n, sizeof(rpnode_t) * (rope->max_nodes>>1)); q->n = rope->max_nodes>>1; // NB: this line must below memcpy() as $q->n and $q->is_bottom are modified by memcpy() q->is_bottom = p->is_bottom; for (i = 0; i < q->n; ++i) for (j = 0; j < 6; ++j) w->c[j] += q[i].c[j]; } for (j = 0; j < 6; ++j) // compute $w->l and update $v->c w->l += w->c[j], v->c[j] -= w->c[j]; v->l -= w->l; // update $v->c return v; } int64_t rope_insert_run(rope_t *rope, int64_t x, int a, int64_t rl, rpcache_t *cache) { // insert $a after $x symbols in $rope and the returns rank(a, x) rpnode_t *u = 0, *v = 0, *p = rope->root; // $v is the parent of $p; $u and $v are at the same level and $u is the first node in the bucket int64_t y = 0, z = 0, cnt[6]; int n_runs; do { // top-down update. Searching and node splitting are done together in one pass. 
if (p->n == rope->max_nodes) { // node is full; split v = split_node(rope, u, v); // $v points to the parent of $p; when a new root is added, $v points to the root if (y + v->l < x) // if $v is not long enough after the split, we need to move both $p and its parent $v y += v->l, z += v->c[a], ++v, p = v->p; } u = p; if (v && x - y > v->l>>1) { // then search backwardly for the right node to descend p += p->n - 1; y += v->l; z += v->c[a]; for (; y >= x; --p) y -= p->l, z -= p->c[a]; ++p; } else for (; y + p->l < x; ++p) y += p->l, z += p->c[a]; // then search forwardly assert(p - u < u->n); if (v) v->c[a] += rl, v->l += rl; // we should not change p->c[a] because this may cause troubles when p's child is split v = p; p = p->p; // descend } while (!u->is_bottom); rope->c[a] += rl; // $rope->c should be updated after the loop as adding a new root needs the old $rope->c counts if (cache) { if (cache->p != (uint8_t*)p) memset(cache, 0, sizeof(rpcache_t)); n_runs = rle_insert_cached((uint8_t*)p, x - y, a, rl, cnt, v->c, &cache->beg, cache->bc); cache->p = (uint8_t*)p; } else n_runs = rle_insert((uint8_t*)p, x - y, a, rl, cnt, v->c); z += cnt[a]; v->c[a] += rl; v->l += rl; // this should be after rle_insert(); otherwise rle_insert() won't work if (n_runs + RLE_MIN_SPACE > rope->block_len) { split_node(rope, u, v); if (cache) memset(cache, 0, sizeof(rpcache_t)); } return z; } static rpnode_t *rope_count_to_leaf(const rope_t *rope, int64_t x, int64_t cx[6], int64_t *rest) { rpnode_t *u, *v = 0, *p = rope->root; int64_t y = 0; int a; memset(cx, 0, 48); do { u = p; if (v && x - y > v->l>>1) { p += p->n - 1; y += v->l; for (a = 0; a != 6; ++a) cx[a] += v->c[a]; for (; y >= x; --p) { y -= p->l; for (a = 0; a != 6; ++a) cx[a] -= p->c[a]; } ++p; } else { for (; y + p->l < x; ++p) { y += p->l; for (a = 0; a != 6; ++a) cx[a] += p->c[a]; } } v = p; p = p->p; } while (!u->is_bottom); *rest = x - y; return v; } void rope_rank2a(const rope_t *rope, int64_t x, int64_t y, int64_t *cx, 
int64_t *cy) { rpnode_t *v; int64_t rest; v = rope_count_to_leaf(rope, x, cx, &rest); if (y < x || cy == 0) { rle_rank1a((const uint8_t*)v->p, rest, cx, v->c); } else if (rest + (y - x) <= v->l) { memcpy(cy, cx, 48); rle_rank2a((const uint8_t*)v->p, rest, rest + (y - x), cx, cy, v->c); } else { rle_rank1a((const uint8_t*)v->p, rest, cx, v->c); v = rope_count_to_leaf(rope, y, cy, &rest); rle_rank1a((const uint8_t*)v->p, rest, cy, v->c); } } /********************* *** Rope iterator *** *********************/ void rope_itr_first(const rope_t *rope, rpitr_t *i) { memset(i, 0, sizeof(rpitr_t)); i->rope = rope; for (i->pa[i->d] = rope->root; !i->pa[i->d]->is_bottom;) // descend to the leftmost leaf ++i->d, i->pa[i->d] = i->pa[i->d - 1]->p; } const uint8_t *rope_itr_next_block(rpitr_t *i) { const uint8_t *ret; assert(i->d < ROPE_MAX_DEPTH); // a B+ tree should not be that tall if (i->d < 0) return 0; ret = (uint8_t*)i->pa[i->d][i->ia[i->d]].p; while (i->d >= 0 && ++i->ia[i->d] == i->pa[i->d]->n) i->ia[i->d--] = 0; // backtracking if (i->d >= 0) while (!i->pa[i->d]->is_bottom) // descend to the leftmost leaf ++i->d, i->pa[i->d] = i->pa[i->d - 1][i->ia[i->d - 1]].p; return ret; } /*********** *** I/O *** ***********/ void rope_print_node(const rpnode_t *p) { if (p->is_bottom) { int i; putchar('('); for (i = 0; i < p->n; ++i) { uint8_t *block = (uint8_t*)p[i].p; const uint8_t *q = block + 2, *end = block + 2 + *rle_nptr(block); if (i) putchar(','); while (q < end) { int c = 0; int64_t j, l; rle_dec1(q, c, l); for (j = 0; j < l; ++j) putchar("$ACGTN"[c]); } } putchar(')'); } else { int i; putchar('('); for (i = 0; i < p->n; ++i) { if (i) putchar(','); rope_print_node(p[i].p); } putchar(')'); } } void rope_dump_node(const rpnode_t *p, FILE *fp) { int16_t i, n = p->n; uint8_t is_bottom = p->is_bottom; fwrite(&is_bottom, 1, 1, fp); fwrite(&n, 2, 1, fp); if (is_bottom) { for (i = 0; i < n; ++i) { fwrite(p[i].c, 8, 6, fp); fwrite(p[i].p, 1, *rle_nptr(p[i].p) + 2, fp); } } else { 
for (i = 0; i < p->n; ++i) rope_dump_node(p[i].p, fp); } } void rope_dump(const rope_t *r, FILE *fp) { fwrite(&r->max_nodes, 4, 1, fp); fwrite(&r->block_len, 4, 1, fp); rope_dump_node(r->root, fp); } rpnode_t *rope_restore_node(const rope_t *r, FILE *fp, int64_t c[6]) { uint8_t is_bottom, a; int16_t i, n; rpnode_t *p; fread(&is_bottom, 1, 1, fp); fread(&n, 2, 1, fp); p = mp_alloc(r->node); p->is_bottom = is_bottom, p->n = n; if (is_bottom) { for (i = 0; i < n; ++i) { uint16_t *q; p[i].p = mp_alloc(r->leaf); q = rle_nptr(p[i].p); fread(p[i].c, 8, 6, fp); fread(q, 2, 1, fp); fread(q + 1, 1, *q, fp); } } else { for (i = 0; i < n; ++i) p[i].p = rope_restore_node(r, fp, p[i].c); } memset(c, 0, 48); for (i = 0; i < n; ++i) { p[i].l = 0; for (a = 0; a < 6; ++a) c[a] += p[i].c[a], p[i].l += p[i].c[a]; } return p; } rope_t *rope_restore(FILE *fp) { rope_t *r; r = calloc(1, sizeof(rope_t)); fread(&r->max_nodes, 4, 1, fp); fread(&r->block_len, 4, 1, fp); r->node = mp_init(sizeof(rpnode_t) * r->max_nodes); r->leaf = mp_init(r->block_len); r->root = rope_restore_node(r, fp, r->c); return r; } bwa-0.7.17/rope.h000066400000000000000000000030141317342117100134730ustar00rootroot00000000000000#ifndef ROPE_H_ #define ROPE_H_ #include #include #define ROPE_MAX_DEPTH 80 #define ROPE_DEF_MAX_NODES 64 #define ROPE_DEF_BLOCK_LEN 512 typedef struct rpnode_s { struct rpnode_s *p; // child; at the bottom level, $p points to a string with the first 2 bytes giving the number of runs (#runs) uint64_t l:54, n:9, is_bottom:1; // $n and $is_bottom are only set for the first node in a bucket int64_t c[6]; // marginal counts } rpnode_t; typedef struct { int32_t max_nodes, block_len; // both MUST BE even numbers int64_t c[6]; // marginal counts rpnode_t *root; void *node, *leaf; // memory pool } rope_t; typedef struct { const rope_t *rope; // the rope const rpnode_t *pa[ROPE_MAX_DEPTH]; // parent nodes int ia[ROPE_MAX_DEPTH]; // index in the parent nodes int d; // the current depth in the B+-tree } 
rpitr_t; typedef struct { int beg; int64_t bc[6]; uint8_t *p; } rpcache_t; #ifdef __cplusplus extern "C" { #endif rope_t *rope_init(int max_nodes, int block_len); void rope_destroy(rope_t *rope); int64_t rope_insert_run(rope_t *rope, int64_t x, int a, int64_t rl, rpcache_t *cache); void rope_rank2a(const rope_t *rope, int64_t x, int64_t y, int64_t *cx, int64_t *cy); #define rope_rank1a(rope, x, cx) rope_rank2a(rope, x, -1, cx, 0) void rope_itr_first(const rope_t *rope, rpitr_t *i); const uint8_t *rope_itr_next_block(rpitr_t *i); void rope_print_node(const rpnode_t *p); void rope_dump(const rope_t *r, FILE *fp); rope_t *rope_restore(FILE *fp); #ifdef __cplusplus } #endif #endif bwa-0.7.17/utils.c000066400000000000000000000154401317342117100136670ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008 Genome Research Ltd (GRL). Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/
/* Contact: Heng Li */

/* When defined, err_fflush() additionally calls fsync() on regular files
 * after a successful fflush(), and three extra headers are included. */
#define FSYNC_ON_FLUSH

/* NOTE(review): every bare #include below has lost its system header operand
 * in this copy of the file; the names must be restored from upstream before
 * this translation unit can compile. */
#include
#include
#include
#include
#include
#include
#ifdef FSYNC_ON_FLUSH
#include
#include
#include
#endif
#include
#include
#include "utils.h"

#include "ksort.h"
/* Strict ordering on pair64_t: compare x first, break ties on y. */
#define pair64_lt(a, b) ((a).x < (b).x || ((a).x == (b).x && (a).y < (b).y))
/* Presumably instantiates the ksort.h routines for pair64_t (suffix 128) and
 * uint64_t (suffix 64); utils.h declares ks_introsort_128 and ks_introsort_64.
 * TODO confirm against ksort.h. */
KSORT_INIT(128, pair64_t, pair64_lt)
KSORT_INIT(64, uint64_t, ks_lt_generic)

#include "kseq.h"
/* Presumably instantiates the kseq.h reader over gzFile with err_gzread as the
 * read callback, so read errors become fatal; confirm against kseq.h. */
KSEQ_INIT2(, gzFile, err_gzread)

/********************
 * System utilities *
 ********************/

/* Open `fn` with fopen(fn, mode) and return the stream.
 * If fn is "-", no file is opened: stdin is returned when `mode` contains an
 * 'r', stdout otherwise.
 * On fopen() failure err_fatal() is called with `func` as the message tag and
 * the strerror(errno) text, so the function only returns on success. */
FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
{
    FILE *fp = 0;
    if (strcmp(fn, "-") == 0)
        return (strstr(mode, "r"))? stdin : stdout;
    if ((fp = fopen(fn, mode)) == 0) {
        err_fatal(func, "fail to open file '%s' : %s", fn, strerror(errno));
    }
    return fp;
}

/* Re-target the existing stream `fp` at `fn` via freopen(fn, mode, fp).
 * A null result from freopen() is fatal (err_fatal tagged with `func`).
 * Returns the `fp` argument that was passed in, not freopen()'s return value. */
FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp)
{
    if (freopen(fn, mode, fp) == 0) {
        err_fatal(func, "fail to open file '%s' : %s", fn, strerror(errno));
    }
    return fp;
}

/* Open `fn` as a gzFile.
 * If fn is "-", gzdopen() is applied to the descriptor of stdin (when `mode`
 * contains an 'r') or stdout (otherwise); a null result is reported as
 * "Out of memory".
 * Otherwise gzopen(fn, mode) is used; on failure the message is
 * strerror(errno) when errno is non-zero, else "Out of memory".
 * All failures go through err_fatal() tagged with `func`. */
gzFile err_xzopen_core(const char *func, const char *fn, const char *mode)
{
    gzFile fp;
    if (strcmp(fn, "-") == 0) {
        fp = gzdopen(fileno((strstr(mode, "r"))? stdin : stdout), mode);
        /* According to zlib.h, this is the only reason gzdopen can fail */
        if (!fp) err_fatal(func, "Out of memory");
        return fp;
    }
    /* NOTE(review): errno is not cleared before gzopen(), so a non-zero value
     * left over from an earlier call could be reported here instead of
     * "Out of memory"; confirm whether that matters to callers. */
    if ((fp = gzopen(fn, mode)) == 0) {
        err_fatal(func, "fail to open file '%s' : %s", fn, errno ? strerror(errno) : "Out of memory");
    }
    return fp;
}

/* Print "[header] " followed by the printf-style message and a newline to
 * stderr, then terminate with exit(EXIT_FAILURE). Does not return. */
void err_fatal(const char *header, const char *fmt, ...)
{
    va_list args;
    va_start(args, fmt);
    fprintf(stderr, "[%s] ", header);
    vfprintf(stderr, fmt, args);
    fprintf(stderr, "\n");
    va_end(args);
    exit(EXIT_FAILURE);
}

/* Same output as err_fatal() but with " Abort!" appended, and the process is
 * ended with abort() instead of exit(). Does not return. */
void err_fatal_core(const char *header, const char *fmt, ...)
{
    va_list args;
    va_start(args, fmt);
    fprintf(stderr, "[%s] ", header);
    vfprintf(stderr, fmt, args);
    fprintf(stderr, " Abort!\n");
    va_end(args);
    abort();
}

/* Print "[func] msg" and a newline to stderr, then exit(EXIT_FAILURE).
 * `msg` is printed through "%s", so it is never interpreted as a format. */
void _err_fatal_simple(const char *func, const char *msg)
{
    fprintf(stderr, "[%s] %s\n", func, msg);
    exit(EXIT_FAILURE);
}

/* Print "[func] msg Abort!" and a newline to stderr, then abort(). */
void _err_fatal_simple_core(const char *func, const char *msg)
{
    fprintf(stderr, "[%s] %s Abort!\n", func, msg);
    abort();
}

/* fwrite() wrapper: if fewer than `nmemb` items were written the process is
 * terminated with the strerror(errno) text; otherwise the item count
 * (equal to nmemb) is returned. */
size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream)
{
    size_t ret = fwrite(ptr, size, nmemb, stream);
    if (ret != nmemb)
        _err_fatal_simple("fwrite", strerror(errno));
    return ret;
}

/* fread() wrapper that treats any short read as fatal.
 * The message is strerror(errno) when ferror(stream) is set, and
 * "Unexpected end of file" otherwise. Returns nmemb on success. */
size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream)
{
    size_t ret = fread(ptr, size, nmemb, stream);
    if (ret != nmemb)
    {
        _err_fatal_simple("fread", ferror(stream) ? strerror(errno) : "Unexpected end of file");
    }
    return ret;
}

/* gzread() wrapper: a negative return value is fatal.
 * The error text comes from gzerror(); when gzerror() reports Z_ERRNO the
 * strerror(errno) text is used instead. Non-negative results are returned
 * unchanged. */
int err_gzread(gzFile file, void *ptr, unsigned int len)
{
    int ret = gzread(file, ptr, len);

    if (ret < 0)
    {
        int errnum = 0;
        const char *msg = gzerror(file, &errnum);
        _err_fatal_simple("gzread", Z_ERRNO == errnum ? strerror(errno) : msg);
    }

    return ret;
}

/* fseek() wrapper: any non-zero result is fatal, so the value returned to the
 * caller is always 0. */
int err_fseek(FILE *stream, long offset, int whence)
{
    int ret = fseek(stream, offset, whence);
    if (0 != ret)
    {
        _err_fatal_simple("fseek", strerror(errno));
    }
    return ret;
}

/* ftell() wrapper: a result of -1 is fatal; otherwise the position reported by
 * ftell() is returned. */
long err_ftell(FILE *stream)
{
    long ret = ftell(stream);
    if (-1 == ret)
    {
        _err_fatal_simple("ftell", strerror(errno));
    }
    return ret;
}

/* printf-style output to stdout with error checking.
 * Returns vfprintf()'s result; a negative result is fatal. */
int err_printf(const char *format, ...)
{
    va_list arg;
    int done;
    va_start(arg, format);
    done = vfprintf(stdout, format, arg);
    /* Capture errno right after vfprintf() so later calls cannot overwrite it
     * before the message is built. */
    int saveErrno = errno;
    va_end(arg);
    if (done < 0) _err_fatal_simple("vfprintf(stdout)", strerror(saveErrno));
    return done;
}

/* printf-style output to `stream` with error checking.
 * Returns vfprintf()'s result; a negative result is fatal. */
int err_fprintf(FILE *stream, const char *format, ...)
{
    va_list arg;
    int done;
    va_start(arg, format);
    done = vfprintf(stream, format, arg);
    /* Capture errno right after vfprintf() so later calls cannot overwrite it
     * before the message is built. */
    int saveErrno = errno;
    va_end(arg);
    if (done < 0) _err_fatal_simple("vfprintf", strerror(saveErrno));
    return done;
}

/* Write character `c` to `stream` with putc(); an EOF result is fatal
 * (reported under the tag "fputc"). Returns putc()'s result otherwise. */
int err_fputc(int c, FILE *stream)
{
    int ret = putc(c, stream);
    if (EOF == ret)
    {
        _err_fatal_simple("fputc", strerror(errno));
    }

    return ret;
}

/* fputs() wrapper: an EOF result is fatal; otherwise fputs()'s result is
 * returned. */
int err_fputs(const char *s, FILE *stream)
{
    int ret = fputs(s, stream);
    if (EOF == ret)
    {
        _err_fatal_simple("fputs", strerror(errno));
    }

    return ret;
}

/* puts() wrapper: an EOF result is fatal; otherwise puts()'s result is
 * returned. */
int err_puts(const char *s)
{
    int ret = puts(s);
    if (EOF == ret)
    {
        _err_fatal_simple("puts", strerror(errno));
    }

    return ret;
}

/* fflush() wrapper: a non-zero result is fatal.
 * With FSYNC_ON_FLUSH defined, the stream's descriptor is then fstat()ed and,
 * if it refers to a regular file, fsync()ed; failures of either call are also
 * fatal. Returns fflush()'s result, which is 0 whenever this returns. */
int err_fflush(FILE *stream)
{
    int ret = fflush(stream);
    if (ret != 0) _err_fatal_simple("fflush", strerror(errno));

#ifdef FSYNC_ON_FLUSH
    /* Calling fflush() ensures that all the data has made it to the
       kernel buffers, but this may not be sufficient for remote filesystems
       (e.g. NFS, lustre) as an error may still occur while the kernel
       is copying the buffered data to the file server.  To be sure of
       catching these errors, we need to call fsync() on the file
       descriptor, but only if it is a regular file.  */
    {
        struct stat sbuf;
        if (0 != fstat(fileno(stream), &sbuf))
            _err_fatal_simple("fstat", strerror(errno));

        if (S_ISREG(sbuf.st_mode))
        {
            if (0 != fsync(fileno(stream)))
                _err_fatal_simple("fsync", strerror(errno));
        }
    }
#endif
    return ret;
}

/* fclose() wrapper: a non-zero result is fatal, so the value returned to the
 * caller is always 0. */
int err_fclose(FILE *stream)
{
    int ret = fclose(stream);
    if (ret != 0) _err_fatal_simple("fclose", strerror(errno));
    return ret;
}

/* gzclose() wrapper: any result other than Z_OK is fatal.
 * The message is strerror(errno) for Z_ERRNO and zError(ret) for every other
 * code. Returns Z_OK whenever this returns. */
int err_gzclose(gzFile file)
{
    int ret = gzclose(file);
    if (Z_OK != ret)
    {
        _err_fatal_simple("gzclose", Z_ERRNO == ret ? strerror(errno) : zError(ret));
    }

    return ret;
}

/*********
 * Timer *
 *********/

/* Return the sum of the user and system CPU time reported by
 * getrusage(RUSAGE_SELF), in seconds; microseconds are scaled by 1e-6.
 * NOTE(review): the return value of getrusage() is not checked. */
double cputime()
{
    struct rusage r;
    getrusage(RUSAGE_SELF, &r);
    return r.ru_utime.tv_sec + r.ru_stime.tv_sec + 1e-6 * (r.ru_utime.tv_usec + r.ru_stime.tv_usec);
}

/* Return the time reported by gettimeofday() as seconds plus microseconds
 * scaled by 1e-6.
 * NOTE(review): `tzp` is passed to gettimeofday() but never read afterwards,
 * and the call's return value is not checked. */
double realtime()
{
    struct timeval tp;
    struct timezone tzp;
    gettimeofday(&tp, &tzp);
    return tp.tv_sec + tp.tv_usec * 1e-6;
}
bwa-0.7.17/utils.h000066400000000000000000000073551317342117100137020ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008 Genome Research Ltd (GRL). Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ /* Contact: Heng Li */ /* utils.h: declares the err_* wrappers that terminate the process on I/O failure, the pair64_t / uint64_v / pair64_v typedefs, the cputime/realtime timers and the inline hash_64(). */ #ifndef LH3_UTILS_H #define LH3_UTILS_H /* NOTE(review): the three bare #include directives that follow have lost their header operands in this copy and must be restored from upstream. */ #include #include #include /* ATTRIBUTE(list) expands to __attribute__ (list) when __GNUC__ is defined and to nothing otherwise. */ #ifdef __GNUC__ // Tell GCC to validate printf format string and args #define ATTRIBUTE(list) __attribute__ (list) #else #define ATTRIBUTE(list) #endif /* The macros below forward to the matching *_core / _err_* function and pass __func__ as the caller tag. */ #define err_fatal_simple(msg) _err_fatal_simple(__func__, msg) #define err_fatal_simple_core(msg) _err_fatal_simple_core(__func__, msg) #define xopen(fn, mode) err_xopen_core(__func__, fn, mode) #define xreopen(fn, mode, fp) err_xreopen_core(__func__, fn, mode, fp) #define xzopen(fn, mode) err_xzopen_core(__func__, fn, mode) /* xassert(cond, msg): calls _err_fatal_simple_core (which aborts) when cond compares equal to 0. NOTE(review): the expansion is a bare `if` statement, so an `else` written after xassert(...) would bind to it; wrapping in do/while(0) would avoid that. Confirm no caller relies on the current form. */ #define xassert(cond, msg) if ((cond) == 0) _err_fatal_simple_core(__func__, msg) /* pair64_t: two uint64_t fields x and y; utils.c orders them by x, then y. */ typedef struct { uint64_t x, y; } pair64_t; /* uint64_v and pair64_v: two size_t fields (n, m) plus an element pointer a; presumably n is the used length and m the allocated capacity. TODO confirm against callers. */ typedef struct { size_t n, m; uint64_t *a; } uint64_v; typedef struct { size_t n, m; pair64_t *a; } pair64_v; #ifdef __cplusplus extern "C" { #endif /* Fatal-error reporters, all marked noreturn through ATTRIBUTE. */ void err_fatal(const char *header, const char *fmt, ...) ATTRIBUTE((noreturn)); void err_fatal_core(const char *header, const char *fmt, ...) ATTRIBUTE((noreturn)); void _err_fatal_simple(const char *func, const char *msg) ATTRIBUTE((noreturn)); void _err_fatal_simple_core(const char *func, const char *msg) ATTRIBUTE((noreturn)); /* Checked open/reopen helpers; normally reached through xopen / xreopen / xzopen. */ FILE *err_xopen_core(const char *func, const char *fn, const char *mode); FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp); gzFile err_xzopen_core(const char *func, const char *fn, const char *mode); /* Checked counterparts of fwrite, fread, gzread, fseek and ftell. */ size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream); size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream); int err_gzread(gzFile file, void *ptr, unsigned int len); int err_fseek(FILE *stream, long offset, int whence); /* err_rewind(FP): err_fseek to offset 0 relative to SEEK_SET. */ #define err_rewind(FP) err_fseek((FP), 0, SEEK_SET) long err_ftell(FILE *stream); /* Checked printf-style writers; the format attribute lets GCC check the arguments against the format string. */ int err_fprintf(FILE *stream, const char *format, ...) ATTRIBUTE((format(printf, 2, 3))); int err_printf(const char *format, ...) 
    ATTRIBUTE((format(printf, 1, 2)));
/* Checked counterparts of fputc/putchar, fputs, puts, fflush, fclose and
 * gzclose. */
int err_fputc(int c, FILE *stream);
/* err_putchar(C): err_fputc() with stdout as the stream. */
#define err_putchar(C) err_fputc((C), stdout)
int err_fputs(const char *s, FILE *stream);
int err_puts(const char *s);
int err_fflush(FILE *stream);
int err_fclose(FILE *stream);
int err_gzclose(gzFile file);

/* Timers returning seconds as a double.
 * NOTE(review): the empty parameter lists are not prototypes in C before C23;
 * `(void)` would let the compiler reject stray arguments. */
double cputime();
double realtime();

/* Sort entry points; presumably produced by the KSORT_INIT(64, ...) and
 * KSORT_INIT(128, ...) instantiations in utils.c. TODO confirm in ksort.h. */
void ks_introsort_64 (size_t n, uint64_t *a);
void ks_introsort_128(size_t n, pair64_t *a);

#ifdef __cplusplus
}
#endif

/* Scramble a 64-bit key through a fixed sequence of shift, add, complement
 * and xor steps and return the result. All arithmetic is on uint64_t, so
 * additions wrap modulo 2^64. */
static inline uint64_t hash_64(uint64_t key)
{
    key += ~(key << 32);
    key ^= (key >> 22);
    key += ~(key << 13);
    key ^= (key >> 8);
    key += (key << 3);
    key ^= (key >> 15);
    key += ~(key << 27);
    key ^= (key >> 31);
    return key;
}

#endif
bwa-0.7.17/xa2multi.pl000077500000000000000000000013331317342117100144640ustar00rootroot00000000000000#!/usr/bin/env perl use strict; use warnings; while (<>) { if (/\tXA:Z:(\S+)/) { my $l = $1; print; my @t = split("\t"); while ($l =~ /([^,;]+),([-+]\d+),([^,]+),(\d+);/g) { my $mchr = ($t[6] eq $1)? '=' : $t[6]; # FIXME: TLEN/ISIZE is not calculated! my $seq = $t[9]; my $phred = $t[10]; # if alternative alignment has other orientation than primary, # then print the reverse (complement) of sequence and phred string if ((($t[1]&0x10)>0) xor ($2<0)) { $seq = reverse $seq; $seq =~ tr/ACGTacgt/TGCAtgca/; $phred = reverse $phred; } print(join("\t", $t[0], 0x100|($t[1]&0x6e9)|($2<0?0x10:0), $1, abs($2), 0, $3, @t[6..7], 0, $seq, $phred, "NM:i:$4"), "\n"); } } else { print; } }