pax_global_header00006660000000000000000000000064130441510340014505gustar00rootroot0000000000000052 comment=052e579b897ba8adc188a9aca0f91093edf33136 mptp-0.2.2/000077500000000000000000000000001304415103400124665ustar00rootroot00000000000000mptp-0.2.2/.gitignore000066400000000000000000000003061304415103400144550ustar00rootroot00000000000000*.a *.o *.pdf *~ .deps .dirstamp /aclocal.m4 /autom4te.cache /bin /compile /config.h /config.h.in /config.log /config.status /configure /depcomp /install-sh /missing /stamp-h1 Makefile Makefile.in mptp-0.2.2/.travis.yml000066400000000000000000000001441304415103400145760ustar00rootroot00000000000000language: c compiler: - gcc - clang script: ./autogen.sh && ./configure && make && make check mptp-0.2.2/ChangeLog.md000066400000000000000000000015701304415103400146420ustar00rootroot00000000000000# Change Log All notable changes to `mptp` will be documented in this file. This project adheres to [Semantic Versioning](http://semver.org/). ## [0.2.2] - 2017-01-31 ### Fixed - Regular expressions now allow scientific notation when parsing branch lengths - Improved accuracy of ASV score (takes into account tip species) - Memory leaks when parsing incorrectly formatted trees ## [0.2.1] - 2016-10-18 ### Fixed - Updated ASV to consider only coalescent roots of ML delimitation - Assertion stopping mptp when using random starting delimitations for MCMC ## [0.2.0] - 2016-09-27 ### Fixed - Floating point exception error when constructing random trees caused from division by zero - Allocation with malloc caused uninitialized variables when converting unrooted tree to rooted for the MCMC method - Sample size for the the AIC with a correction for finite sample sizes mptp-0.2.2/LICENSE.txt000066400000000000000000001033301304415103400143110ustar00rootroot00000000000000 GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 Copyright (C) 2007 Free Software Foundation, Inc. 
Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU Affero General Public License is a free, copyleft license for software and other kinds of works, specifically designed to ensure cooperation with the community in the case of network server software. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, our General Public Licenses are intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. Developers that use our General Public Licenses protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License which gives you legal permission to copy, distribute and/or modify the software. A secondary benefit of defending all users' freedom is that improvements made in alternate versions of the program, if they receive widespread use, become available for other developers to incorporate. Many developers of free software are heartened and encouraged by the resulting cooperation. However, in the case of software used on network servers, this result may fail to come about. The GNU General Public License permits making a modified version and letting the public access it on a server without ever releasing its source code to the public. The GNU Affero General Public License is designed specifically to ensure that, in such cases, the modified source code becomes available to the community. 
It requires the operator of a network server to provide the source code of the modified version running there to the users of that server. Therefore, public use of a modified version, on a publicly accessible server, gives the public access to the source code of the modified version. An older license, called the Affero General Public License and published by Affero, was designed to accomplish similar goals. This is a different license, not a version of the Affero GPL, but Affero has released a new version of the Affero GPL which permits relicensing under this license. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU Affero General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. 
Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. 
However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. 
Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. 
b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 
b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 
A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. 
But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Remote Network Interaction; Use with the GNU General Public License. Notwithstanding any other provision of this License, if you modify the Program, your modified version must prominently offer all users interacting with it remotely through a computer network (if your version supports such interaction) an opportunity to receive the Corresponding Source of your version by providing access to the Corresponding Source from a network server at no charge, through some standard or customary means of facilitating copying of software. This Corresponding Source shall include the Corresponding Source for any work covered by version 3 of the GNU General Public License that is incorporated pursuant to the following paragraph. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the work with which it is combined will remain governed by version 3 of the GNU General Public License. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU Affero General Public License from time to time. 
Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU Affero General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU Affero General Public License, you may choose any version ever published by the Free Software Foundation. If the Program specifies that a proxy can decide which future versions of the GNU Affero General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . Also add information on how to contact you by electronic and paper mail. If your software can interact with users remotely through a computer network, you should also make sure that it provides a way for users to get its source. For example, if your program is a web application, its interface could display a "Source" link that leads users to an archive of the code. There are many ways you could offer source, and different solutions will be better for different programs; see section 13 for the specific requirements. You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU AGPL, see . mptp-0.2.2/Makefile.am000066400000000000000000000001631304415103400145220ustar00rootroot00000000000000AUTOMAKE_OPTIONS = foreign SUBDIRS = src man completion EXTRA_DIST = autogen.sh LICENSE.txt README.md ChangeLog.md mptp-0.2.2/README.md000066400000000000000000000231111304415103400137430ustar00rootroot00000000000000# Species Delimitation [![License](https://img.shields.io/badge/license-AGPL-blue.svg)](http://www.gnu.org/licenses/agpl-3.0.en.html) [![Build Status](https://travis-ci.org/Pas-Kapli/mptp.svg?branch=master)](https://travis-ci.com/Pas-Kapli/mptp) ## Introduction The aim of this project is to implement a fast species delimitation method, based on PTP (Zhang et al. 2013). The new tool should: * have an open source code with an appropriate open source license. * 64-bit multi-threaded design that handles very large datasets. 
We have implemented a tool called mPTP which can handle very large biodiversity datasets. It implements a fast method to compute the ML delimitation from an inferred phylogenetic tree of the samples. Using MCMC, it also computes the support values for each clade, which can be used to assess the confidence of the ML delimitation. **ML delimitation** mPTP implements two flavours of the point-estimate solution. First, it implements the original method from (Zhang et al. 2013) where all within-species processes are modelled with a single exponential distribution. mPTP uses a dynamic programming implementation which estimates the ML delimitation faster and more accurately than the original PTP. The dynamic programming implementation has similar properties to (Gulek et al. 2010). See the [wiki](https://github.com/Pas-Kapli/mptp/wiki) for more information. The second method assumes a distinct exponential distribution for the branching events of each of the delimited species allowing it to fit to a wider range of empirical datasets. **MCMC method** mPTP generates support values for each clade. They represent the ratio of the number of samples for which a particular node was in the between-species process, to the total number of samples. ## Compilation instructions **Cloning the repo** Clone the repo and build the executable and the documentation using the following commands. ```bash git clone https://github.com/Pas-Kapli/mptp.git cd mptp ./autogen.sh ./configure make make install # as root, or run sudo make install ``` You will need [GNU Bison](http://www.gnu.org/software/bison/) and [Flex](http://flex.sourceforge.net/) installed on your system. When using the cloned repository version, you will also need [autoconf](https://www.gnu.org/software/autoconf/autoconf.html) and [automake](https://www.gnu.org/software/automake/) installed. Optionally, you will need the [GNU Scientific Library](http://www.gnu.org/software/gsl/) for the likelihood ratio test.
If it is not available on your system, the likelihood ratio test will be disabled. On a Debian-based Linux system, the four packages can be installed using the command ```bash sudo apt-get install libgsl0-dev flex bison autotools-dev ``` Optionally, you can install the bash auto-completion for mptp. To do that, replace the `./configure` step above with ```bash ./configure --with-bash-completions=DIR ``` where `DIR` is the directory where bash autocompletion is stored. You can use `pkg-config` as follows: ```bash ./configure --with-bash-completions=`pkg-config --variable=completionsdir bash-completion` ``` **Source distribution** To download the source distribution from a [release](https://github.com/Pas-Kapli/mptp/releases) and build the executable and the documentation, use the following commands: ```bash wget https://github.com/Pas-Kapli/mptp/releases/download/v0.2.2/mptp-src-0.2.2.tar.gz tar zxvf mptp-src-0.2.2.tar.gz cd mptp-src-0.2.2 ./configure make make install # as root, or run sudo make install ``` Note that, similarly to cloning the repository, you will need [GNU Bison](http://www.gnu.org/software/bison/) and [Flex](http://flex.sourceforge.net/) installed on your system, and optionally, the [GNU Scientific Library](http://www.gnu.org/software/gsl/). However, you do not need [autoconf](https://www.gnu.org/software/autoconf/autoconf.html) and [automake](https://www.gnu.org/software/automake/) installed (note the missing `./autogen.sh`). See also the notes for installing the bash auto-completion, as described in the *Cloning the repo* section. **Binary distribution** Starting with version 0.2.0, binary distribution files (.tar.gz) for GNU/Linux on x86-64 containing pre-compiled binaries as well as the documentation (man and pdf files) will be made available as part of each [release](https://github.com/Pas-Kapli/mptp/releases). The included executables currently are not compiled with [`libgsl`](http://www.gnu.org/software/gsl/) support.
This means, Likelihood Ratio Test (LRT) is disabled for the single-rate PTP model. However, we intend to implement dynamic loading for `libgsl` and therefore this issue will disappear in the next releases. Until then, please consider compiling from source in order to enable `libgsl`. To use the pre-compiled binary, download the appropriate executable for your system using the following commands if you are using a Linux system: ```bash wget https://github.com/Pas-Kapli/mptp/releases/download/v0.2.2/mptp-0.2.2-linux-x86_64.tar.gz tar zxvf mptp-0.2.2-linux-x86_64.tar.gz ``` You will now have the binary distribution in a folder called `mptp-0.2.2-linux-x86_64` in which you will find three subfolders `bin`, `man` and `doc`. We recommend making a copy or a symbolic link to the mptp binary `bin/mptp` in a folder included in your `$PATH`, and a copy or a symbolic link to the mptp man page `man/mptp.1` in a folder included in your `$MANPATH`. The PDF version of the manual is available in `doc/mptp_manual.pdf`. 
## Command-line options General options: * `--help` * `--version` * `--quiet` * `--tree_show` * `--multi` * `--single` * `--ml` * `--mcmc INT` * `--mcmc_sample INT` * `--mcmc_log` * `--mcmc_burnin INT` * `--mcmc_startnull` * `--mcmc_startrandom` * `--mcmc_startml` * `--mcmc_credible REAL` * `--mcmc_runs INT` * `--outgroup TAXA` * `--outgroup_crop` * `--minbr REAL` * `--minbr_auto FILENAME` * `--pvalue REAL` * `--precision INT` Input and output options: * `--tree_file FILENAME` * `--output_file FILENAME` Visualization options: * `--svg_width INT` * `--svg_fontsize INT` * `--svg_tipspacing INT` * `--svg_legend_ratio <0..1>` * `--svg_nolegend` * `--svg_marginleft INT` * `--svg_marginright INT` * `--svg_margintop INT` * `--svg_marginbottom INT` * `--svg_inner_radius INT` ## Usage example ```bash mptp --ml --multi --tree_file testTree --output_file out --outgroup A,C --tree_show mptp --mcmc 50000000 --multi --mcmc_sample 1000000 --mcmc_burnin 1000000 --tree_file tree.newick --output_file out ``` ## Documentation If `mptp` was installed according to the [Compilation instructions](https://github.com/Pas-Kapli/mptp#compilation-instructions) you can access the man pages by: ```bash man mptp ``` A comprehensive documentation is also available in the [wiki](https://github.com/Pas-Kapli/mptp/wiki). ## License and third party licenses The code is currently licensed under the [GNU Affero General Public License version 3](http://www.gnu.org/licenses/agpl-3.0.en.html). ## Code File | Description --------------------|---------------- **arch.c** | Architecture specific code (Mac/Linux). **auto.c** | Code for auto-detecting minimum branch length. **aic.c** | Code for Bayesian Single- and multi-rate PTP. **mptp.c** | Main file handling command-line parameters and executing corresponding parts. **mptp.h** | MPTP Header file. **dp.c** | Single- and multi-rate DP heuristics for solving the PTP problem. **fasta.c** | Code for reading FASTA files. 
**lex_rtree.l** | Lexical analyzer parsing newick rooted trees. **lex_utree.l** | Lexical analyzer parsing newick unrooted trees. **likelihood.c** | Likelihood related functions. **Makefile.am** | Automake file for generating Makefile.in. **maps.c** | Character mapping arrays for converting sequences to the internal representation. **multirun.c** | Functions to execute multiple MCMC runs and compute ASD of support values. **output.c** | Output related functions. **parse_rtree.y** | Functions for parsing rooted trees in newick format. **parse_utree.y** | Functions for parsing unrooted trees in newick format. **random.c** | Functions for creating a random delimitation. **rtree.c** | Rooted tree manipulation functions. **svg.c** | SVG visualization of delimited tree. **svg_landscape.c** | SVG visualization of likelihood landscape. **util.c** | Various common utility functions. **utree.c** | Unrooted tree manipulation functions. ## The team * Paschalia Kapli * Sarah Lutteropp * Kassian Kobert * Pavlos Pavlidis * Jiajie Zhang * Alexandros Stamatakis * Tomáš Flouri # References * Zhang J., Kapli P., Pavlidis P., Stamatakis A. (2013) **A general species delimitation method with applications to phylogenetic placements.** *Bioinformatics*, 29(22):2869-2876. doi:[10.1093/bioinformatics/btt499](http://dx.doi.org/10.1093/bioinformatics/btt499) * Nguyen XV, Epps J., Bailey J. (2010) **Information Theoretic Measures for Clustering Comparison: Variants, Properties, Normalization and Correction for Chance.** *Journal of Machine Learning Research*, 11:2837-2854. [PDF](http://www.jmlr.org/papers/volume11/vinh10a/vinh10a.pdf) * Gulek M., Toroslu IH. (2010) **A dynamic programming algorithm for tree-like weighted set packing problem.** *Information Sciences*, 180(20):3974-3979. doi:[10.1016/j.ins.2010.06.035](http://dx.doi.org/10.1016/j.ins.2010.06.035) * Powell JR.
(2012) **Accounting for uncertainty in species delineation during the analysis of environmental DNA sequence data.** *Methods in Ecology and Evolution*, 3(1):1-11. doi:[10.1111/j.2041-210X.2011.00122.x](http://dx.doi.org/10.1111/j.2041-210X.2011.00122.x) mptp-0.2.2/autogen.sh000077500000000000000000000000501304415103400144620ustar00rootroot00000000000000#!/bin/sh autoreconf --force --install mptp-0.2.2/completion/000077500000000000000000000000001304415103400146375ustar00rootroot00000000000000mptp-0.2.2/completion/Makefile.am000066400000000000000000000001571304415103400166760ustar00rootroot00000000000000if HAVE_BASH_COMPLETIONS bashcompletiondir = $(bash_completions_dir) dist_bashcompletion_DATA = mptp endif mptp-0.2.2/completion/mptp000066400000000000000000000014671304415103400155520ustar00rootroot00000000000000
# Bash programmable-completion handler for the `mptp` command line.
#
# Reads the word under the cursor and the word before it from COMP_WORDS and
# fills COMPREPLY with the candidate completions:
#   - after an option that takes a file name, candidates are file names
#     (delegated to _filedir, which this script expects to be defined already);
#   - otherwise, candidates are the mptp long options matching the prefix.
_mptp()
{
  local cur prev opts

  # Start from an empty reply. The original spelled this "COMREPLY", which
  # only created a stray variable and never reset the real reply array.
  COMPREPLY=()
  cur="${COMP_WORDS[COMP_CWORD]}"
  prev="${COMP_WORDS[COMP_CWORD-1]}"
  opts="--help --version --tree_show --multi --single --ml --mcmc --mcmc_sample --mcmc_log --mcmc_burnin --mcmc_runs --mcmc_credible --mcmc_startnull --mcmc_startrandom --mcmc_startml --pvalue --minbr --minbr_auto --outgroup --outgroup_crop --quiet --precision --seed --tree_file --output_file --svg_width --svg_fontsize --svg_tipspacing --svg_legend_ratio --svg_nolegend --svg_marginleft --svg_marginright --svg_margintop --svg_marginbottom --svg_inner_radius"

  case "${prev}" in
    # Options whose argument is a file name: offer paths, not option names.
    '--tree_file'|'--output_file'|'--minbr_auto')
      _filedir
      return 0
      ;;
    *)
      ;;
  esac

  # Quote the current word so a partially typed word containing spaces or
  # glob characters is passed to compgen as a single prefix.
  COMPREPLY=( $(compgen -W "${opts}" -- "${cur}") )
}
complete -F _mptp mptp
mptp-0.2.2/configure.ac000066400000000000000000000051471304415103400147630ustar00rootroot00000000000000# -*- Autoconf -*- # Process this file with autoconf to produce a configure script. AC_PREREQ([2.63]) AC_INIT([mptp], [0.2.2], [Tomas.Flouri@h-its.org]) AM_INIT_AUTOMAKE([subdir-objects]) AC_LANG([C]) AC_CONFIG_SRCDIR([src/mptp.c]) AC_CONFIG_HEADERS([config.h]) AC_CANONICAL_HOST # Checks for programs.
AC_PROG_CC AC_PROG_RANLIB AC_PROG_SED AC_PROG_LEX if test "x$LEX" != xflex; then AC_MSG_ERROR(could not find required installation of FLEX) fi AC_PROG_YACC if test "x$YACC" != x"bison -y"; then AC_MSG_ERROR(could not find required installation of BISON) fi AC_PROG_INSTALL # Checks for header files. AC_CHECK_HEADERS([assert.h stdio.h stdarg.h string.h getopt.h stdlib.h regex.h ctype.h locale.h limits.h string.h sys/time.h]) # Checks for typedefs, structures, and compiler characteristics. AC_C_INLINE AC_TYPE_SIZE_T # Checks for library functions. AC_FUNC_MALLOC AC_FUNC_STRTOD AC_FUNC_ALLOCA AC_FUNC_REALLOC AC_CHECK_FUNCS([memmove memcpy gettimeofday memchr memset pow regcomp strcasecmp strchr strcspn sysinfo]) AC_CHECK_LIB([m],[cos]) AC_CHECK_LIB([gslcblas], [cblas_dgemm]) AC_CHECK_LIB([gsl], [gsl_cdf_chisq_P]) # Bash completions AC_ARG_WITH([bash-completions], AC_HELP_STRING([--with-bash-completions=[DIR]], [Bash completions directory [default=no]]), [with_bash_completions="$withval"], [with_bash_completions="no"] ) AS_CASE([$with_bash_completions], # [yes], [PKG_CHECK_VAR([bash_completions_dir], [bash-completion], [completionsdir], [], [AC_MSG_ERROR([bash completions not found])])], [no], [bash_completions_dir=], [bash_completions_dir="$with_bash_completions"] ) AC_SUBST([bash_completions_dir]) AM_CONDITIONAL(HAVE_BASH_COMPLETIONS, test -n "$bash_completions_dir") AS_IF([test -n "$bash_completions_dir"], [bash_completions_output="${bash_completions_dir}"], [bash_completions_output=no] ) have_ps2pdf=no AC_ARG_ENABLE(pdfman, AS_HELP_STRING([--disable-pdfman], [Disable PDF manual creation])) AS_IF([test "x$enable_pdfman" != "xno"], [ have_ps2pdf=yes AC_CHECK_PROG(HAVE_PS2PDF, ps2pdf, yes, no) if test "x$HAVE_PS2PDF" = "xno"; then AC_MSG_WARN([*** ps2pdf is required to build a PDF version of the manual]) have_ps2pdf=no fi ]) AM_CONDITIONAL(HAVE_PS2PDF, test "x${have_ps2pdf}" = "xyes") AM_PROG_CC_C_O AC_CONFIG_FILES([Makefile src/Makefile man/Makefile 
completion/Makefile]) AC_OUTPUT AC_MSG_RESULT([ $PACKAGE $VERSION Target: $host_os $host_cpu Compiler: ${CC} CFLAGS: ${CFLAGS} ${CPPFLAGS} LIBS: ${LIBS} ${LDFLAGS} Continue with 'make' command ]) mptp-0.2.2/man/000077500000000000000000000000001304415103400132415ustar00rootroot00000000000000mptp-0.2.2/man/Makefile.am000066400000000000000000000006751304415103400153050ustar00rootroot00000000000000# Makefile for creating PDF manual from man file dist_man_MANS = mptp.1 if HAVE_PS2PDF doc_DATA = mptp_manual.pdf mptp_manual.pdf : mptp.1 TEMP=$$(mktemp temp.XXXXXXXX) ; \ if [ $$(uname) == "Darwin" ] ; then \ ${SED} -e 's/\\-/-/g' $< | \ iconv -f UTF-8 -t ISO-8859-1 > $$TEMP ; \ else \ ${SED} -e 's/\\-/-/g' $< > $$TEMP ; \ fi ; \ man -t ./$$TEMP | ps2pdf -sPAPERSIZE=a4 - $@ ; \ rm $$TEMP CLEANFILES=mptp_manual.pdf endif mptp-0.2.2/man/mptp.1000066400000000000000000000344711304415103400143140ustar00rootroot00000000000000.\" -*- coding: utf-8 -*- .\" ============================================================================ .TH mptp 1 "January 31, 2017" "mptp 0.2.2" "USER COMMANDS" .\" ============================================================================ .SH NAME mptp \(em single-locus species delimitation .\" ============================================================================ .SH SYNOPSIS .\" left justified, ragged right .ad l Maximum-likelihood species delimitation: .RS \fBmptp\fR \-\-ml (\-\-single | \-\-multi) \-\-tree_file \fInewickfile\fR \-\-output_file \fIoutputfile\fR [\fIoptions\fR] .PP .RE Species delimitation with support values: .RS \fBmptp\fR \-\-mcmc \fIpositive integer\fR (\-\-single | \-\-multi) (\-\-mcmc_startnull | \-\-mcmc_startrandom | \-\-mcmc_startml) \-\-mcmc_log \fIpositive integer\fR \-\-tree_file \fInewickfile\fR \-\-output_file \fIoutputfile\fR [\fIoptions\fR] .PP .RE .\" left and right justified (default) .ad b .\" ============================================================================ .SH DESCRIPTION Species is one of the 
fundamental units of comparison in virtually all subfields of biology, from systematics to anatomy, development, ecology, evolution, genetics and molecular biology. The aim of \fBmptp\fR is to offer an open source tool to infer species boundaries on a given phylogenetic tree based on the Poisson Tree Process (PTP) and the Multiple Poisson Tree Process (mPTP) models. .PP \fBmptp\fR offers two methods for inferring species delimitation. First, a maximum-likelihood based method that uses a dynamic programming approach to infer an ML estimate. Second, an mcmc approach for sampling the space of possible delimitations providing the user with support values on the tree clades. Both approaches are available in two flavours: the PTP and the mPTP model. The PTP model is specified by using the \fIsingle\fR switch and the mPTP by using \fImulti\fR. .\" ============================================================================ .SS Input The input for \fBmptp\fR is a newick file that contains one phylogenetic tree, i.e., branches express the expected number of substitutions per alignment site. .\" ============================================================================ .SS Options \fBmptp\fR parses a large number of command-line options. For easier navigation, options are grouped below by theme. .PP General options: .RS .TP 9 .B \-\-help Display help text and exit. .TP .B \-\-version Output version information and exit. .TP .B \-\-quiet Suppress all output to stdout except for warnings and fatal error messages. .TP .BI \-\-tree_file \0filename Input newick file that contains a phylogenetic tree. Can be rooted or unrooted. .TP .BI \-\-output_file \0filename Specifies the prefix used for generating output files. For maximum-likelihood species delimitation two files will be created. First, \fIfilename\fR.txt that contains the actual delimitation and \fIfilename\fR.svg that contains an SVG figure of the computed delimitation.
For mcmc analyses, a file \fIfilename\fR.txt is created that contains the newick tree with support values. .TP .BI \-\-outgroup\~ "comma-separated list of taxa" All computations for species delimitation are carried out on rooted trees. This option is used only (and is required) in case an unrooted tree was specified with the \-\-tree_file option. \fImptp\fR roots the unrooted tree by splitting the branch leading to the most recent common ancestor (MRCA) of the comma-separated list of taxa into two branches of equal size and introducing a new node (the root of the new rooted tree) that connects these two branches. .TP .BI \-\-outgroup_crop Crops taxa specified with the \-\-outgroup option from the tree. .TP .BI \-\-minbr \0real Any branch lengths in the input tree smaller than or equal to \fIreal\fR are excluded (ignored) from the computations. In addition, for mcmc analyses, subtrees that exclusively consist of branch lengths smaller than or equal to \fIreal\fR are completely ignored from the proposals (support values for those clades are set to 0). (default: 0.0001) .TP .BI \-\-precision\~ "positive integer" Specifies the precision of the decimal part of floating point numbers on output (default: 7) .TP .BI \-\-minbr_auto \0filename Automatically detects the minimum branch length from the p-distances of the FASTA file \fIfilename\fR. .TP .BI \-\-tree_show Show an ASCII version of the processed input tree (i.e. after it is rooted by, potentially cropping, the outgroup). .RE .PP .\" ============================================================================ Maximum-likelihood estimations: .PP .RS Estimating the maximum-likelihood delimitation is triggered by the switch \-\-ml followed by \-\-single (the PTP model) or \-\-ml \-\-multi (the mPTP model). Note that these two methods affect how options \-\-output_file behaves and can be controlled using the \-\-minbr switch.
Both methods require a rooted phylogenetic tree, however an unrooted tree may be specified in conjunction with the option \-\-outgroup. In this case, \fImptp\fR roots it at that outgroup (see General options, \-\-outgroup for more info). Note that both methods output an SVG depiction of the ML delimitation. See SVG Output for more information on adjusting and fine-tuning the SVG output. .PP Both methods discard branch lengths of size smaller than the size specified using the \-\-minbr option. The PTP model then attempts to find a connected subgraph of the rooted tree that (a) contains the root, and (b) the sum of likelihoods of fitting the edges of that subgraph in one exponential distribution and the remaining edges in another (exponential distribution) is maximized. With likelihood we mean the sums of the probability density function with the mean defined as the reciprocal of the average of edge lengths in the particular distribution. .PP .TP 9 .B \-\-ml \-\-single Triggers the algorithm for computing an ML estimate of the delimitation using the PTP model. .TP .B \-\-ml \-\-multi Triggers the algorithm for computing an ML estimate of the delimitation using the mPTP model. .TP .B \-\-pvalue \0real Only used with the PTP model (specified with \-\-single). Sets the p-value for performing a likelihood ratio test. Note that there is no likelihood ratio test for the mPTP model, so this test is not done when \-\-multi is used. (default: 0.001) .RE .PP .\" ============================================================================ MCMC method: .PP .RS The MCMC method is triggered with the \-\-mcmc switch combined with either \-\-single (the PTP model) or \-\-multi (the mPTP model). .PP Some more stuff to write .PP .TP 9 .B \-\-mcmc\~ "positive integer" \-\-single Triggers the algorithm for computing support values by taking the specified number of MCMC samples (delimitations) using the PTP model.
.TP .B \-\-mcmc\~ "positive integer" \-\-multi Triggers the algorithm for computing support values by taking the specified number of MCMC samples (delimitations) using the mPTP model. .TP .B \-\-mcmc_sample\~ "positive integer" Sample only every n-th MCMC step. .TP .B \-\-mcmc_log Log the scores (log-likelihood) for each MCMC sample in a file and create an SVG plot. .TP .B \-\-mcmc_burnin\~ "positive integer" Ignore all MCMC samples generated before the specified step. (default: 1) .TP .B \-\-mcmc_runs\~ "positive integer" Perform multiple MCMC runs. If more than 1 run is specified, mptp will generate one seed for each run based on the provided seed using the \-\-seed switch. Output files will be generated for each run. (default: 1) .TP .B \-\-mcmc_credible \0real Specify the probability (0.0 to 1.0) for which to generate the credible interval i.e., the probability the true number of species will fall within the credible interval given the observed data. (default: 0.95) .TP .B \-\-mcmc_startnull Start MCMC sampling from the null-model. .TP .B \-\-mcmc_startrandom Start MCMC sampling from a random delimitation. .TP .B \-\-mcmc_startml Start MCMC sampling from the ML delimitation. .TP .B \-\-seed\~ "positive integer" Specifies the seed for the pseudo-random number generator. (default: randomly generated based on system time) .RE .PP .\" ============================================================================ SVG Output: .PP .RS The ML method generates one SVG file that visualizes the processed input tree (i.e. after it is rooted by, potentially cropping, the outgroup) and marks the subtrees corresponding to coalescent processes (the detected species groups) with red color, while the speciation process is colored green.
.PP The MCMC method generates one SVG file per run visualizing the processed tree, and indicates the support value for each node, i.e., the percentage of MCMC samples (delimitations) in which the particular node was part of the speciation process. A value of 1 means it was always in the speciation process while a value of 0 means it was always in a coalescent process. The tree branches are colored according to the support values of descendant nodes; a support value of 0 is colored with red, 1 with black, and values in between are gradients of the two colors. Only support values above 0.5 are shown to avoid packed numbers in dense branching events. In addition, if \-\-mcmc_log is specified, an additional SVG image of log-likelihoods plots for each sampled delimitation is created. .PP .TP 9 .B \-\-svg_width\~ "positive integer" Sets the total width (including margins) of the SVG in pixels. (default: 1920) .TP .B \-\-svg_fontsize\~ "positive integer" Size of font in SVG image. (default: 12) .TP .B \-\-svg_tipspacing\~ "positive integer" Vertical space in pixels between taxa in SVG tree. (default: 20) .TP .B \-\-svg_legend_ratio \0real Ratio (value between 0.0 and 1.0) of total tree length to be displayed as legend line. (default: 0.1) .TP .B \-\-svg_nolegend Hide legend. .TP .B \-\-svg_marginleft\~ "positive integer" Left margin in pixels. (default: 20) .TP .B \-\-svg_marginright\~ "positive integer" Right margin in pixels. (default: 20) .TP .B \-\-svg_margintop\~ "positive integer" Top margin in pixels. (default: 20) .TP .B \-\-svg_marginbottom\~ "positive integer" Bottom margin in pixels. (default: 20) .TP .B \-\-svg_inner_radius\~ "positive integer" Radius of inner nodes in pixels.
(default: 0) .RE .PP .\" ============================================================================ .SH EXAMPLES .PP Compute the maximum likelihood estimate using the mPTP model by discarding all branches with length below or equal to 0.0001. .PP .RS \fBmptp\fR \-\-ml \-\-multi \-\-minbr 0.0001 \-\-tree_file \fInewick.txt\fR \-\-output_file \fIout\fR .RE .PP Run an MCMC analysis of 100 million steps with the mPTP model, that logs every one million-th step, ignores the first 2 million steps and discards all branches with lengths smaller than or equal to 0.0001. Use 777 as seed. The chain will start from the ML delimitation (default). .PP .RS \fBmptp\fR \-\-mcmc 100000000 \-\-multi \-\-minbr 0.0001 \-\-tree_file \fInewick.txt\fR \-\-output_file \fIout\fR \-\-mcmc_sample 1000000 \-\-mcmc_log \-\-mcmc_burnin 2000000 \-\-seed 777 .RE .PP Perform an MCMC analysis of 5 runs, each of 100 million steps with the mPTP model, log every one million-th step, ignore the first 2 million steps, and detect the minimum branch length by specifying the FASTA file alignment.fa that contains the alignment. Use 777 as seed. Start each run from a random delimitation. .PP .RS \fBmptp\fR \-\-mcmc 100000000 \-\-multi \-\-mcmc_runs 5 \-\-mcmc_sample 1000000 \-\-mcmc_log \-\-minbr_auto \fIalignment.fa\fR \-\-tree_file \fInewick.txt\fR \-\-output_file \fIout\fR \-\-mcmc_burnin 2000000 \-\-seed 777 \-\-mcmc_startrandom .RE .PP .\" .\" ============================================================================ .SH AUTHORS Implementation by Tomas Flouri, Sarah Lutteropp and Paschalia Kapli. Additional PTP and mPTP model authors include Kassian Kobert, Jiajie Zhang, Pavlos Pavlidis, and Alexandros Stamatakis. .SH REPORTING BUGS Submit suggestions and bug-reports at <https://github.com/Pas-Kapli/mptp/issues>, or e-mail Tomas Flouri <Tomas.Flouri@h-its.org>. .\" ============================================================================ .SH AVAILABILITY Source code and binaries are available at <https://github.com/Pas-Kapli/mptp>.
.\" ============================================================================ .SH COPYRIGHT Copyright (C) 2015-2017, Tomas Flouri, Sarah Lutteropp, Paschalia Kapli .PP All rights reserved. .PP Contact: Tomas Flouri <Tomas.Flouri@h-its.org>, Scientific Computing, Heidelberg Institute for Theoretical Studies, 69118 Heidelberg, Germany .PP This software is licensed under the terms of the GNU Affero General Public License version 3. .PP \fBGNU Affero General Public License version 3\fR .PP This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. .PP This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. .PP You should have received a copy of the GNU Affero General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. .SH VERSION HISTORY New features and important modifications of \fBmptp\fR (short lived or minor bug releases may not be mentioned): .RS .TP .BR v0.1.0\~ "released June 27th, 2016" First public release. .TP .BR v0.1.1\~ "released July 15th, 2016" Bug fix (now LRT test is not printed in output file when using --multi) .TP .BR v0.2.0\~ "released September 27th, 2016" Fixed floating point exception error when constructing random trees, caused by dividing by zero. Changed allocation from malloc to calloc, as it caused uninitialized variables when converting unrooted trees to rooted when using the MCMC method. Fixed sample size for the AIC with a correction for finite sample sizes. .TP .BR v0.2.1\~ "released October 18th, 2016" Updated ASV to consider only coalescent roots of ML delimitation. Removed assertion stopping mptp when using random starting delimitations for the MCMC method.
.TP .BR v0.2.2\~ "released January 31st, 2017" Fixed regular expressions to allow scientific notation for branch lengths when parsing trees. Improved the accuracy of ASV score by also taking into account tips forming coalescent roots. Fixed memory leaks that occur when parsing incorrectly formatted trees. .RE .LP mptp-0.2.2/src/000077500000000000000000000000001304415103400132555ustar00rootroot00000000000000mptp-0.2.2/src/Makefile.am000066400000000000000000000011701304415103400153100ustar00rootroot00000000000000bin_PROGRAMS = $(top_builddir)/bin/mptp libparse_utree_a_SOURCES = parse_utree.y lex_utree.l libparse_rtree_a_SOURCES = parse_rtree.y lex_rtree.l noinst_LIBRARIES = libparse_utree.a libparse_rtree.a AM_CFLAGS=-I${srcdir} -O3 -mtune=native -Wall -Wsign-compare -g ${LIBS} AM_YFLAGS = -d -p `${SED} -n 's/.*_\(.*\)/\1_/p' <<<"$*"` AM_LFLAGS = -o lex.yy.c __top_builddir__bin_mptp_LDADD = libparse_utree.a libparse_rtree.a __top_builddir__bin_mptp_SOURCES = arch.c \ auto.c \ aic.c \ mptp.c \ mptp.h \ dp.c \ fasta.c \ likelihood.c \ maps.c \ multirun.c \ output.c \ random.c \ rtree.c \ svg.c \ svg_landscape.c \ util.c \ utree.c mptp-0.2.2/src/aic.c000066400000000000000000001003731304415103400141610ustar00rootroot00000000000000/* Copyright (C) 2015 Tomas Flouri, Sarah Lutteropp This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . 
Contact: Tomas Flouri , Heidelberg Institute for Theoretical Studies, Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany */ #include "mptp.h" typedef struct density_s { double logl; long species_count; } density_t; static rtree_t ** crnodes; static rtree_t ** snodes; static long crnodes_count = 0; static long snodes_count = 0; static long accept_count = 0; static FILE * fp_log = NULL; static long species_count = 0; static density_t * densities = NULL; static void mcmc_log(double logl, long sc) { if (opt_mcmc_log) fprintf(fp_log, "%f,%ld\n", logl, sc); } static int cb_desc(const void * va, const void * vb) { const density_t * a = va; const density_t * b = vb; if (a->logl - b->logl < 0) return 1; else if (a->logl - b->logl > 0) return -1; return 0; } static void mcmc_init(rtree_t * root, long seed) { long i; crnodes = (rtree_t **)xmalloc((size_t)(root->leaves)*sizeof(rtree_t *)); snodes = (rtree_t **)xmalloc((size_t)(root->leaves)*sizeof(rtree_t *)); crnodes_count = 0; snodes_count = 0; accept_count = 0; densities = (density_t *)xmalloc((size_t)(root->leaves+1)*sizeof(density_t)); memset(densities, 0, (size_t)(root->leaves+1) * sizeof(density_t)); for (i = 0; i < root->leaves+1; ++i) densities[i].species_count = i; /* open log file */ if (opt_mcmc_log) fp_log = open_file_ext("log", seed); } static void init_null(rtree_t * root) { int i; rtree_t ** inner_node_list = (rtree_t **)xmalloc((size_t)(root->leaves-1) * sizeof(rtree_t *)); rtree_query_innernodes(root, inner_node_list); /* start mcmc analysis from null model */ for (i = 0; i < root->leaves - 1; ++i) inner_node_list[i]->event = EVENT_COALESCENT; free(inner_node_list); } static void mcmc_stats_init(rtree_t * root) { int i; rtree_t ** inner_node_list = (rtree_t **)xmalloc((size_t)(root->leaves-1) * sizeof(rtree_t *)); rtree_query_innernodes(root, inner_node_list); for (i = 0; i < root->leaves - 1; ++i) { if (inner_node_list[i]->event == EVENT_COALESCENT) { inner_node_list[i]->speciation_start = -1; 
inner_node_list[i]->aic_weight_start = 0; // Just to initialize - it's not used } else { inner_node_list[i]->speciation_start = opt_mcmc_burnin-1; inner_node_list[i]->aic_weight_start = 0; // This one should be used } inner_node_list[i]->speciation_count = 0; } free(inner_node_list); } static void hpd(long n, FILE * fp) { long i; long min, max; double densities_sum = 0; double acc_sum = 0; long * indices = NULL; indices = (long *)xmalloc((size_t)(n+2)*sizeof(long)); memset(indices, 0, (size_t)(n+2) * sizeof(long)); for (i = 1; i <= n; ++i) densities_sum += densities[i].logl; max = 0; min = n+1; for (i = 1; i <= n; ++i) { acc_sum += densities[i].logl; indices[densities[i].species_count] = 1; if (densities[i].species_count < min) min = densities[i].species_count; if (densities[i].species_count > max) max = densities[i].species_count; if (acc_sum / densities_sum >= opt_mcmc_credible) break; } fprintf(fp, "CCI (%ld,%ld)\n", min, max); if (!opt_quiet) fprintf(stdout, "CCI (%ld,%ld)\n", min, max); fprintf(fp, "HPD "); if (!opt_quiet) printf("HPD "); for (i = 1; i <= n+1; ++i) { if (indices[i] == 1 && indices[i-1] == 0) { fprintf(fp, "(%ld,", i); if (!opt_quiet) printf("(%ld,", i); } if (indices[i] == 0 && indices[i-1] == 1) { fprintf(fp, "%ld) ", i-1); if (!opt_quiet) printf("%ld) ", i-1); } } fprintf(fp,"\n"); if (!opt_quiet) printf("\n"); free(indices); } static void mcmc_finalize(rtree_t * root, double mcmc_min_logl, double mcmc_max_logl, long seed, double aic_weight_prefix_sum) { long i; if (!opt_quiet) { printf ("Minimum log-likelihood observed in mcmc run: %f\n", mcmc_min_logl); printf ("Maximum log-likelihood observed in mcmc run: %f\n", mcmc_max_logl); } /* write support values to all nodes */ rtree_t ** inner_node_list = (rtree_t **)xmalloc((size_t)(root->leaves-1) * sizeof(rtree_t *)); rtree_query_innernodes(root, inner_node_list); for (i = 0; i < root->leaves - 1; ++i) { if (inner_node_list[i]->speciation_start != -1) { inner_node_list[i]->speciation_count = 
inner_node_list[i]->speciation_count + opt_mcmc_steps - inner_node_list[i]->speciation_start; inner_node_list[i]->aic_support += aic_weight_prefix_sum - inner_node_list[i]->aic_weight_start; } inner_node_list[i]->aic_support /= aic_weight_prefix_sum; inner_node_list[i]->support = inner_node_list[i]->aic_support; /*inner_node_list[i]->support = inner_node_list[i]->speciation_count / (double)(opt_mcmc_steps-opt_mcmc_burnin+1);*/ } free(inner_node_list); free(crnodes); free(snodes); if (opt_mcmc_log) { if (!opt_quiet) fprintf(stdout, "Log written in %s.%ld.log ...\n", opt_outfile, seed); fclose(fp_log); } FILE * fp_stats = open_file_ext("stats", seed); double densities_sum = 0; for (i = 1; i <= root->leaves; ++i) densities_sum += densities[i].logl; for (i = 1; i <= root->leaves; ++i) { fprintf(fp_stats, "%ld,%f\n", i, (densities[i].logl/densities_sum)*100); } /* compute a HPD */ qsort(densities+1, (size_t)(root->leaves), sizeof(density_t), cb_desc); hpd(root->leaves, fp_stats); if (!opt_quiet) fprintf(stdout, "Statistics written in %s.%ld.stats ...\n", opt_outfile, seed); fclose(fp_stats); free(densities); } static void dp_recurse(rtree_t * node, int method) { int k,j; /* bottom-up recursion */ if (node->left) dp_recurse(node->left, method); if (node->right) dp_recurse(node->right, method); /* u_vec * / \ / \ v_vec * * w_vec */ dp_vector_t * u_vec = node->vector; double spec_logl = loglikelihood(node->spec_edge_count, node->spec_edgelen_sum); u_vec[0].spec_edgelen_sum = 0; u_vec[0].score_multi = node->coal_logl + spec_logl; u_vec[0].score_single = node->coal_logl + spec_logl; u_vec[0].coal_multi_logl = node->coal_logl; u_vec[0].species_count = 1; u_vec[0].filled = 1; if (!node->left) return; dp_vector_t * v_vec = node->left->vector; dp_vector_t * w_vec = node->right->vector; assert(node->spec_edge_count >= 0); int u_edge_count = 0; double u_edgelen_sum = 0; /* check whether edges (u,v) and (u,w) are > min branch length */ if (node->left->length > opt_minbr) { 
u_edge_count++; u_edgelen_sum += node->left->length; } if (node->right->length > opt_minbr) { u_edge_count++; u_edgelen_sum += node->right->length; } for (j = 0; j <= node->left->edge_count; ++j) { for (k = 0; k <= node->right->edge_count; ++k) { /* if at least one of the two entries is not valid/filled, skip */ if (!v_vec[j].filled || !w_vec[k].filled) continue; int i = j + k + u_edge_count; /* set the number of species */ unsigned int u_species_count = v_vec[j].species_count + w_vec[k].species_count; /* compute multi-rate coalescent log-likelihood */ double coal_multi_logl = v_vec[j].coal_multi_logl + w_vec[k].coal_multi_logl; /* compute coalescent edge count and length sum of subtree u */ double u_spec_edgelen_sum = v_vec[j].spec_edgelen_sum + w_vec[k].spec_edgelen_sum + u_edgelen_sum; int coal_edge_count = node->edge_count - i; /* change to int */ double coal_edgelen_sum = node->edgelen_sum - u_spec_edgelen_sum; /* compute single-rate coalescent log-likelihood */ double coal_single_logl = loglikelihood(coal_edge_count,coal_edgelen_sum); /* compute total speciation log-likelihood */ double spec_edgelen_sum = node->spec_edgelen_sum + u_edgelen_sum + v_vec[j].spec_edgelen_sum + w_vec[k].spec_edgelen_sum; int spec_edge_count = node->spec_edge_count + i; assert(u_species_count > 0); spec_logl = loglikelihood(spec_edge_count,spec_edgelen_sum); /* compute single- and multi-rate scores */ double score_multi = coal_multi_logl + spec_logl; double score_single = coal_single_logl + spec_logl; double score = score_multi; double best_score = u_vec[i].score_multi; if (method == PTP_METHOD_SINGLE) { score = score_single; best_score = u_vec[i].score_single; } if (!u_vec[i].filled || score > best_score) { u_vec[i].score_multi = score_multi; u_vec[i].score_single = score_single; u_vec[i].spec_edgelen_sum = u_spec_edgelen_sum; u_vec[i].coal_multi_logl = coal_multi_logl; u_vec[i].vec_left = j; u_vec[i].vec_right = k; u_vec[i].species_count = u_species_count; u_vec[i].filled = 1; } 
} } } static void backtrack_random(rtree_t * node, bool *warning_minbr) { node->mcmc_slot = -1; if (node->event == EVENT_SPECIATION) { if (node->length <= opt_minbr && node->parent) *warning_minbr = true; backtrack_random(node->left, warning_minbr); backtrack_random(node->right, warning_minbr); /* add to list of speciation nodes only if its two direct descendents are coalescent roots and also the subtree at node has at least one branch length greater than minbr */ if ((node->left->event == EVENT_COALESCENT) && (node->right->event == EVENT_COALESCENT) && (node->edge_count)) { node->mcmc_slot = snodes_count; snodes[snodes_count++] = node; } } else { node->event = EVENT_COALESCENT; /* add to list of coalescent roots in case it is not a tip AND if the subtree rooted at node has at least one edge longer than minbr */ if (node->edge_count) { node->mcmc_slot = crnodes_count; crnodes[crnodes_count++] = node; } } } static void backtrack(rtree_t * node, long index, bool *warning_minbr) { dp_vector_t * vec = node->vector; node->mcmc_slot = -1; if ((vec[index].vec_left != -1) && (vec[index].vec_right != -1)) { node->event = EVENT_SPECIATION; if (node->length <= opt_minbr && node->parent) *warning_minbr = true; backtrack(node->left, vec[index].vec_left, warning_minbr); backtrack(node->right,vec[index].vec_right,warning_minbr); /* add to list of speciation nodes only if its two direct descendents are coalescent roots and also the subtree at node has at least one branch length greater than minbr */ if ((node->left->event == EVENT_COALESCENT) && (node->right->event == EVENT_COALESCENT) && (node->edge_count)) { node->mcmc_slot = snodes_count; snodes[snodes_count++] = node; } } else { node->event = EVENT_COALESCENT; /* add to list of coalescent roots in case it is not a tip AND if the subtree rooted at node has at least one edge longer than minbr */ if (node->edge_count) { node->mcmc_slot = crnodes_count; crnodes[crnodes_count++] = node; } } } static void speciate(long r) { /* CR S 
* * / \ -> / \ / \ / \ C * * C CR * * CR */ /* select the coalescent root at position r and split it into two coalescent root nodes */ rtree_t * node = crnodes[r]; /* move the last node of the list to the position of the node we just used */ if (r != (crnodes_count-1)) { crnodes[r] = crnodes[crnodes_count-1]; crnodes[r]->mcmc_slot = r; } --crnodes_count; /* eliminate parent from snodes if both its children were coalescent roots, i.e. we had the case below: S S * * / \ / \ / \ / \ CR * * CR -> CR * * S / \ / \ / \ / \ C * * C CR * * CR */ if (node->parent && node->parent->left->event == EVENT_COALESCENT && node->parent->right->event == EVENT_COALESCENT) { assert(node->parent->mcmc_slot != -1); assert(node->edge_count); /* perform the following only if the parent is not the last node in the list */ if (node->parent->mcmc_slot != snodes_count-1) { /* set slot of last node in snodes to the slot we will place it */ snodes[snodes_count-1]->mcmc_slot = node->parent->mcmc_slot; /* move this last node to its new slot */ snodes[node->parent->mcmc_slot] = snodes[snodes_count-1]; } /* reset slot of the removed node and decrease count */ node->parent->mcmc_slot = -1; --snodes_count; } /* add select node to the list of speciation nodes */ node->mcmc_slot = snodes_count; snodes[snodes_count++] = node; node->event = EVENT_SPECIATION; /* add left child to coalescent roots unless it is a leaf OR the tree rooted at node->left has all branch lengths smaller than minbr */ if (node->left->edge_count) { crnodes[crnodes_count] = node->left; node->left->mcmc_slot = crnodes_count++; } /* add right child to coalescent roots unless it is a leaf OR the tree rooted at node->right has all branch lengths smaller than minbr */ if (node->right->edge_count) { crnodes[crnodes_count] = node->right; node->right->mcmc_slot = crnodes_count++; } } static void coalesce(long r) { /* S CR * * / \ -> / \ / \ / \ CR * * CR C * * C */ rtree_t * node = snodes[r]; /* move the last node of the list to the position 
of the node we just used */ if (r != (snodes_count-1)) { snodes[r] = snodes[snodes_count-1]; snodes[r]->mcmc_slot = r; } --snodes_count; /* add the current node to the list of coalescent roots */ node->mcmc_slot = crnodes_count; crnodes[crnodes_count++] = node; node->event = EVENT_COALESCENT; /* remove left child from coalescent roots unless it is a leaf OR the tree rooted at node->left has all branch lengths smaller than minbr */ if (node->left->edge_count) { /* perform the following only if it is not the last node in the list */ if (node->left->mcmc_slot != crnodes_count-1) { /* set slot of last node in crnodes to the slot we will place it */ crnodes[crnodes_count-1]->mcmc_slot = node->left->mcmc_slot; /* move this last node to its new slot */ crnodes[node->left->mcmc_slot] = crnodes[crnodes_count-1]; } /* reset slot of the removed node and decrease count */ node->left->mcmc_slot = -1; crnodes_count--; } /* now do the same for the right child */ if (node->right->edge_count) { /* perform the following only if the parent is not the last node in the list */ if (node->right->mcmc_slot != crnodes_count-1) { /* set slot of last node in crnodes to the slot we will place it */ crnodes[crnodes_count-1]->mcmc_slot = node->right->mcmc_slot; /* move this last node to its new slot */ crnodes[node->right->mcmc_slot] = crnodes[crnodes_count-1]; } /* reset slot of removed node and decrease count */ node->right->mcmc_slot = -1; crnodes_count--; } /* if the parent of the node has two coalescent roots as children now, then add it to snodes, i.e. 
the following case: S S * * / \ / \ / \ / \ CR * * S -> CR * * CR / \ / \ / \ / \ CR * * CR C * * C */ if (node->parent && node->parent->left->event == EVENT_COALESCENT && node->parent->right->event == EVENT_COALESCENT) { assert(node->parent->mcmc_slot == -1); /* set slot of parent */ node->parent->mcmc_slot = snodes_count; /* place parent to the last slot in snodes and increase count */ snodes[snodes_count++] = node->parent; } } static double aic_weight_nominator(double aic_score) { return exp(-0.5 * aic_score); } void aic_mcmc(rtree_t * tree, long method, unsigned short * rstate, long seed, double * mcmc_min_logl, double * mcmc_max_logl) { long i; long best_index = 0; long rand_long = 0; double rand_double = 0; double max = 0; double logl = 0; double aic_weight_prefix_sum = 0.0; *mcmc_max_logl = 0; *mcmc_min_logl = 0; if (!opt_quiet) fprintf(stdout,"Computing initial delimitation...\n"); /* check whether all edges are smaller or equal than minbr */ if (!tree->edge_count) { fprintf(stderr,"WARNING: All branch lengths are smaller or equal to the " "threshold specified by --minbr. Delimitation equals to " "the null model\n"); tree->support = 1; tree->aic_support = 1; tree->event = EVENT_COALESCENT; return; } mcmc_init(tree, seed); /* fill DP table */ dp_recurse(tree, method); /* obtain best entry in the root DP table */ dp_vector_t * vec = tree->vector; if (method == PTP_METHOD_MULTI) { max = vec[0].score_multi; for (i = 1; i < tree->edge_count; i++) { if (max < vec[i].score_multi && vec[i].filled) { max = vec[i].score_multi; best_index = i; } } } else { max = vec[0].score_single; for (i = 1; i < tree->edge_count; i++) { //printf("vec[%d].score_single: %.6f\n", i, vec[i].score_single); if (max < vec[i].score_single && vec[i].filled) { max = vec[i].score_single; best_index = i; } } } species_count = vec[best_index].species_count; double max_logl_aic = (method == PTP_METHOD_MULTI) ? 
vec[best_index].score_multi : vec[best_index].score_single; double max_aic = aic(max_logl_aic, species_count, tree->leaves+2); long coal_edge_count = 0; long spec_edge_count = 0; double spec_edgelen_sum = 0; double coal_edgelen_sum = 0; double coal_score = 0; if (opt_mcmc_startnull && opt_mcmc_startrandom) { fatal("Cannot specify --mcmc_startnull and --mcmc_startrandom together"); } else if (opt_mcmc_startnull) { tree->event = EVENT_COALESCENT; crnodes[crnodes_count++] = tree; logl = tree->coal_logl; best_index = 0; species_count = 1; /* set parameters */ coal_edge_count = tree->edge_count; spec_edge_count = 0; spec_edgelen_sum = 0; coal_edgelen_sum = tree->edgelen_sum; coal_score = tree->coal_logl; /* set all nodes to coalescent */ init_null(tree); /* log log-likelihood at step 0 */ if (opt_mcmc_burnin == 1) mcmc_log(logl,species_count); } else if (opt_mcmc_startrandom) { bool warning_minbr = false; logl = random_delimitation(tree, &species_count, &coal_edge_count, &coal_edgelen_sum, &spec_edge_count, &spec_edgelen_sum, &coal_score, rstate); backtrack_random(tree, &warning_minbr); if (warning_minbr) fprintf(stderr,"WARNING: A speciation edge is smaller than the specified " "minimum branch length.\n"); /* log log-likelihood at step 0 */ if (opt_mcmc_burnin == 1) mcmc_log(logl,species_count); } else { /* ML starting delimitation */ bool warning_minbr = false; backtrack(tree, best_index, &warning_minbr); if (warning_minbr) fprintf(stderr,"WARNING: A speciation edge is smaller than the specified " "minimum branch length.\n"); logl = (method == PTP_METHOD_MULTI) ? 
vec[best_index].score_multi : vec[best_index].score_single; /* log log-likelihood at step 0 */ if (opt_mcmc_burnin == 1) mcmc_log(logl,species_count); } if (!opt_mcmc_startnull && !opt_mcmc_startrandom) { if (method == PTP_METHOD_SINGLE) { coal_edge_count = tree->edge_count - best_index; spec_edge_count = best_index; spec_edgelen_sum = tree->vector[best_index].spec_edgelen_sum; coal_edgelen_sum = tree->edgelen_sum - spec_edgelen_sum; } else { spec_edge_count = best_index; spec_edgelen_sum = tree->vector[best_index].spec_edgelen_sum; coal_score = tree->vector[best_index].score_multi - loglikelihood(spec_edge_count, spec_edgelen_sum); } } *mcmc_max_logl = logl; *mcmc_min_logl = logl; if (!opt_quiet) { if (opt_mcmc_startnull) fprintf(stdout, "Null model log-likelihood: %f\n", logl); else if (opt_mcmc_startrandom) fprintf(stdout, "Random delimitation log-likelihood: %f\n", logl); else fprintf(stdout, "ML delimitation log-likelihood: %f\n", logl); } if (opt_mcmc_burnin == 1) { //densities[species_count].logl += logl; densities[species_count].logl += -aic(logl, species_count, tree->leaves+2); } if (opt_mcmc_sample == 1) { if (!opt_quiet) printf("1 Log-L: %f\n", logl); } mcmc_stats_init(tree); for (i = 1; i < opt_mcmc_steps; ++i) { /* throw a coin to decide whether to convert a coalescent root to a speciation or the other way round */ rand_double = erand48(rstate); int speciation = (rand_double >= 0.5) ? 
1 : 0; if ((speciation && crnodes_count) || (snodes_count == 0)) { /* CR S * * / \ -> / \ / \ / \ C * * C CR * * CR */ /* select a coalescent root, split it into two coalescent nodes */ rand_long = nrand48(rstate); long r = rand_long % crnodes_count; rtree_t * node = crnodes[r]; /* store the count of crnodes for the Hasting ratio */ double old_crnodes_count = crnodes_count; /* speciate */ speciate(r); /* store the new count of snodes for the Hasting ratio */ double new_snodes_count = snodes_count; /* TODO: distinguish between single- and multi-rate methods */ /* subtract the two edges (left and right) from the coalescent distribution and add them to the speciation distribution */ unsigned int edge_count_diff = 0; double edgelen_sum_diff = 0; if (node->left->length > opt_minbr) { ++edge_count_diff; edgelen_sum_diff += node->left->length; } if (node->right->length > opt_minbr) { ++edge_count_diff; edgelen_sum_diff += node->right->length; } if (method == PTP_METHOD_SINGLE) { coal_edgelen_sum -= edgelen_sum_diff; coal_edge_count -= edge_count_diff; } spec_edgelen_sum += edgelen_sum_diff; spec_edge_count += edge_count_diff; /* compute new log-likelihood */ double new_logl; if (spec_edge_count == 0 || (method == PTP_METHOD_SINGLE && coal_edge_count == 0)) new_logl = tree->coal_logl; else { assert((method == PTP_METHOD_MULTI) || (coal_edge_count > 0)); assert(spec_edge_count > 0); if (method == PTP_METHOD_SINGLE) new_logl = loglikelihood(coal_edge_count, coal_edgelen_sum) + loglikelihood(spec_edge_count, spec_edgelen_sum); else new_logl = coal_score - node->coal_logl + node->left->coal_logl + node->right->coal_logl + loglikelihood(spec_edge_count, spec_edgelen_sum); } if (new_logl > *mcmc_max_logl) *mcmc_max_logl = new_logl; if (i+1 < opt_mcmc_burnin) *mcmc_min_logl = *mcmc_max_logl; else if (new_logl < *mcmc_min_logl) *mcmc_min_logl = new_logl; double aic_new_logl = -aic(new_logl, species_count+1, tree->leaves+2); double aic_logl = -aic(logl, species_count, 
tree->leaves+2); /* Hastings ratio */ double a = exp(aic_new_logl - aic_logl) * (old_crnodes_count / new_snodes_count); /* update densities */ if (i+1 >= opt_mcmc_burnin) { //densities[species_count+1].logl += new_logl; densities[species_count+1].logl += aic_new_logl; } /* decide whether to accept or reject proposal */ rand_double = erand48(rstate); if (rand_double <= a) { /* accept */ if ((i+1) % opt_mcmc_sample == 0) { if (!opt_quiet) printf("%ld Log-L: %f\n", i+1, new_logl); if (i+1 >= opt_mcmc_burnin) mcmc_log(new_logl,species_count+1); } /* update support values information */ if (i+1 >= opt_mcmc_burnin) { node->speciation_start = i; aic_weight_prefix_sum += aic_weight_nominator(-aic_new_logl/max_aic); node->aic_weight_start = aic_weight_prefix_sum; } else { node->speciation_start = opt_mcmc_burnin; } accept_count++; species_count++; logl = new_logl; if (method == PTP_METHOD_MULTI) coal_score = coal_score - node->coal_logl + node->left->coal_logl + node->right->coal_logl; continue; } else { /* reject */ if ((i+1) % opt_mcmc_sample == 0) { if (!opt_quiet) printf("%ld Log-L: %f\n", i+1, new_logl); if (i+1 >= opt_mcmc_burnin) mcmc_log(new_logl,species_count+1); } if (i+1 >= opt_mcmc_burnin) node->speciation_count++; if (method == PTP_METHOD_SINGLE) { coal_edgelen_sum += edgelen_sum_diff; coal_edge_count += edge_count_diff; } spec_edgelen_sum -= edgelen_sum_diff; spec_edge_count -= edge_count_diff; coalesce(node->mcmc_slot); } } else { /* S CR * * / \ -> / \ / \ / \ CR * * CR C * * C */ rand_long = nrand48(rstate); long r = rand_long % snodes_count; rtree_t * node = snodes[r]; /* store the count of snodes for the Hastings ratio */ double old_snodes_count = snodes_count; /* coalesce */ coalesce(r); double new_crnodes_count = crnodes_count; /* TODO: distinguish between single- and multi-rate methods */ /* subtract the two edges (left and right) from the speciation distribution and add them to the coalescent distribution */ int edge_count_diff = 0; double 
edgelen_sum_diff = 0; if (node->left->length > opt_minbr) { ++edge_count_diff; edgelen_sum_diff += node->left->length; } if (node->right->length > opt_minbr) { ++edge_count_diff; edgelen_sum_diff += node->right->length; } if (method == PTP_METHOD_SINGLE) { coal_edgelen_sum += edgelen_sum_diff; coal_edge_count += edge_count_diff; } spec_edgelen_sum -= edgelen_sum_diff; spec_edge_count -= edge_count_diff; /* compute new log-likelihood */ double new_logl; if (spec_edge_count == 0 || (method == PTP_METHOD_SINGLE && coal_edge_count == 0)) new_logl = tree->coal_logl; else { assert((method == PTP_METHOD_MULTI) || (coal_edge_count > 0)); assert(spec_edge_count > 0); if (method == PTP_METHOD_SINGLE) new_logl = loglikelihood(coal_edge_count, coal_edgelen_sum) + loglikelihood(spec_edge_count, spec_edgelen_sum); else new_logl = coal_score - node->left->coal_logl - node->right->coal_logl + node->coal_logl + loglikelihood(spec_edge_count, spec_edgelen_sum); } if (new_logl > *mcmc_max_logl) *mcmc_max_logl = new_logl; if (i+1 < opt_mcmc_burnin) *mcmc_min_logl = *mcmc_max_logl; else if (new_logl < *mcmc_min_logl) *mcmc_min_logl = new_logl; double aic_new_logl = -aic(new_logl, species_count-1, tree->leaves+2); double aic_logl = -aic(logl, species_count, tree->leaves+2); /* Hastings ratio */ double a = exp(aic_new_logl - aic_logl) * (old_snodes_count / new_crnodes_count); /* update densities */ if (i+1 >= opt_mcmc_burnin) { //densities[species_count-1].logl += new_logl; densities[species_count-1].logl += aic_new_logl; } /* decide whether to accept or reject proposal */ rand_double = erand48(rstate); if (rand_double <= a) { /* accept */ if ((i+1) % opt_mcmc_sample == 0) { if (!opt_quiet) printf("%ld Log-L: %f\n", i+1, new_logl); if (i+1 >= opt_mcmc_burnin) mcmc_log(new_logl,species_count-1); } /* update support values information */ if (i+1 >= opt_mcmc_burnin) { node->speciation_count = node->speciation_count + i - node->speciation_start; aic_weight_prefix_sum += 
aic_weight_nominator(-aic_new_logl/max_aic); node->aic_support += aic_weight_prefix_sum - node->aic_weight_start; } node->speciation_start = -1; accept_count++; species_count--; logl = new_logl; if (method == PTP_METHOD_MULTI) coal_score = coal_score - node->left->coal_logl - node->right->coal_logl + node->coal_logl; continue; } else { /* reject */ if ((i+1) % opt_mcmc_sample == 0) { if (!opt_quiet) printf("%ld Log-L: %f\n", i+1, new_logl); if (i+1 >= opt_mcmc_burnin) mcmc_log(new_logl,species_count-1); } if (method == PTP_METHOD_SINGLE) { coal_edgelen_sum -= edgelen_sum_diff; coal_edge_count -= edge_count_diff; } spec_edgelen_sum += edgelen_sum_diff; spec_edge_count += edge_count_diff; speciate(node->mcmc_slot); if (i+1 >= opt_mcmc_burnin) { node->speciation_count--; } } } } //printf("Acceptance: %ld\n", accept_count); /* TODO: DEBUG variables for checking the max likelihood mcmc runs give. Must be removed */ mcmc_finalize(tree, *mcmc_min_logl, *mcmc_max_logl, seed, aic_weight_prefix_sum); } mptp-0.2.2/src/arch.c000066400000000000000000000043711304415103400143430ustar00rootroot00000000000000/* Copyright (C) 2014-2015 Tomas Flouri, Torbjorn Rognes, Jeff Epler This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . 
Contact: Tomas Flouri , Heidelberg Institute for Theoretical Studies, Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany */ #include "mptp.h" unsigned long arch_get_memused() { struct rusage r_usage; getrusage(RUSAGE_SELF, & r_usage); #if defined __APPLE__ /* Mac: ru_maxrss gives the size in bytes */ return (unsigned long)(r_usage.ru_maxrss); #else /* Linux: ru_maxrss gives the size in kilobytes */ return (unsigned long)r_usage.ru_maxrss * 1024; #endif } unsigned long arch_get_memtotal() { #if defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) long phys_pages = sysconf(_SC_PHYS_PAGES); long pagesize = sysconf(_SC_PAGESIZE); if ((phys_pages == -1) || (pagesize == -1)) fatal("Cannot determine amount of RAM"); // sysconf(3) notes that pagesize * phys_pages can overflow, such as // when long is 32-bits and there's more than 4GB RAM. Since vsearch // apparently targets LP64 systems like x86_64 linux, this will not // arise in practice on the intended platform. if (pagesize > LONG_MAX / phys_pages) return LONG_MAX; else return (unsigned long)pagesize * (unsigned long)phys_pages; #elif defined(__APPLE__) int mib [] = { CTL_HW, HW_MEMSIZE }; int64_t ram = 0; size_t length = sizeof(ram); if(-1 == sysctl(mib, 2, &ram, &length, NULL, 0)) fatal("Cannot determine amount of RAM"); return ram; #else struct sysinfo si; if (sysinfo(&si)) fatal("Cannot determine amount of RAM"); return si.totalram * si.mem_unit; #endif } mptp-0.2.2/src/auto.c000066400000000000000000000225031304415103400143730ustar00rootroot00000000000000/* Copyright (C) 2015 Tomas Flouri This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . Contact: Tomas Flouri , Heidelberg Institute for Theoretical Studies, Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany */ #include "mptp.h" static double minbr; static const unsigned int mask[256] = { 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; static int pdist(char * a, char * b, long len) { long i; int pdist = 0; for (i = 0; i < len; ++i) { if (mask[(int)a[i]] && mask[(int)b[i]] && (a[i] != b[i])) pdist++; } return pdist; } static long load_fasta(int tip_nodes_count, char ** headers, char ** seqdata) { int i; /* open FASTA file */ pll_fasta_t * fp = pll_fasta_open(opt_pdist_file, pll_map_fasta); if (!fp) fatal("Error opening file %s", opt_pdist_file); char * seq = NULL; char * hdr = NULL; long seqlen; long hdrlen; long seqno; /* read FASTA sequences and make sure they are all of the same length */ long sites = -1; for (i = 0; pll_fasta_getnext(fp,&hdr,&hdrlen,&seq,&seqlen,&seqno); ++i) { if (i >= tip_nodes_count) fatal("FASTA file contains more sequences than expected"); if (sites != -1 && sites != seqlen) fatal("FASTA file does not contain equal size sequences\n"); if 
(sites == -1) sites = seqlen; headers[i] = hdr; seqdata[i] = seq; } /* did we stop reading the file because we reached EOF? */ if (pll_errno != PLL_ERROR_FILE_EOF) fatal("Error while reading file %s", opt_pdist_file); /* close FASTA file */ pll_fasta_close(fp); if (sites == -1) fatal("Unable to read alignment"); if (i != tip_nodes_count) fatal("Some taxa are missing from FASTA file"); return sites; } static int cb_ascending(const void * a, const void * b) { if (*(double *)(a) < *(double *)(b)) return -1; else if (*(double *)(a) > *(double *)(b)) return 1; return 0; } static int cb_allnodes(rtree_t * node) { return 1; } static int cb_short_trees(rtree_t * node) { /* mark tip down but don't include them in the list */ if (!node->left) { node->mark = 1; return 0; } if (node->left->mark && node->right->mark && node->left->length <= minbr && node->right->length <= minbr) { node->mark = 1; if (node->parent) { /* if it's parent is the root of a short tree then dont include current node in the list, otherwise include it */ if (node->parent->left->length <= minbr && node->parent->right->length <= minbr) { return 0; } else { return 1; } } else /* the current node is the root */ { return 1; } } return 0; } static void hash_tips(rtree_t * root) { int i; /* obtain an array of pointers to tip names */ rtree_t ** tipnodes = (rtree_t **)xmalloc((size_t)(root->leaves) * sizeof(rtree_t *)); rtree_query_tipnodes(root, tipnodes); /* create a libc hash table of size tip_count */ hcreate(2*(size_t)(root->leaves)); /* populate a libc hash table with tree tip labels */ for (i = 0; i < root->leaves; ++i) { ENTRY entry; entry.key = tipnodes[i]->label; entry.data = (void *)(tipnodes[i]); hsearch(entry, ENTER); } free(tipnodes); } static void set_encode_sequence(rtree_t * node, char * sequence, long seqlen, const unsigned int * map) { unsigned int c; long i; /* iterate through sites and encode */ for (i = 0; i < seqlen; ++i) { if ((c = map[(int)sequence[i]]) == 0) fatal("Illegal state code in 
tip \"%c\"", sequence[i]); assert(c < 256); sequence[i] = (char)c; } /* set sequence to tip */ node->sequence = sequence; } static void link_sequences(rtree_t * root, char ** headers, char ** sequence, long seqlen) { int i; for (i = 0; i < root->leaves; ++i) { ENTRY query; // printf("Linking %s\n", headers[i]); query.key = headers[i]; ENTRY * found = NULL; found = hsearch(query,FIND); if (!found) fatal("Sequence with header %s does not appear in the tree", headers[i]); set_encode_sequence((rtree_t *)(found->data), sequence[i], seqlen, pll_map_nt); } } static int all_pairwise_dist(rtree_t ** tip_node_list, int tip_list_count, long seqlen) { int j,k; for (j = 0; j < tip_list_count; ++j) for (k = j+1; k < tip_list_count; ++k) if (pdist(tip_node_list[j]->sequence, tip_node_list[k]->sequence, seqlen)) return 1; return 0; } void detect_min_bl(rtree_t * rtree) { rtree_t ** inner_node_list; rtree_t ** tip_node_list = NULL; int inner_list_count = 0; int tip_list_count = 0; int i,n; char ** seqdata = NULL; char ** headers = NULL; long seqlen = 0; /* for p-distance computation load an alignment from a FASTA file and map the sequences to the tree tips */ if (!opt_quiet) fprintf(stdout, "Parsing FASTA file %s...\n", opt_pdist_file); /* allocate arrays to store FASTA headers and sequences */ headers = (char **)calloc((size_t)(rtree->leaves), sizeof(char *)); seqdata = (char **)calloc((size_t)(rtree->leaves), sizeof(char *)); seqlen = load_fasta(rtree->leaves, headers, seqdata); hash_tips(rtree); /* find sequences in hash table and link them with the corresponding taxa */ link_sequences(rtree, headers, seqdata, seqlen); /* destroy hash table */ hdestroy(); /* get inner nodes that are roots of of the largest short subtrees. Short are such subtrees where all branch lengths within them are less or equal to opt_subtree_short. The largest such subtrees are those that are not subtrees of short subtrees. 
*/ inner_node_list = (rtree_t **)xmalloc((size_t)(rtree->leaves-1) * sizeof(rtree_t *)); double * branch_lengths = (double *)xmalloc((size_t)(2*rtree->leaves-1) * sizeof(double)); rtree_t ** allnodes_list = (rtree_t **)xmalloc((size_t)(2*rtree->leaves-1) * sizeof(rtree_t *)); int allnodes_count; /* get list of all nodes, extract branch lengths and sort them in ascending order */ allnodes_count = rtree_traverse_postorder(rtree, cb_allnodes, allnodes_list); assert(allnodes_count == 2*rtree->leaves-1); for (i = 0; i < allnodes_count; ++i) branch_lengths[i] = allnodes_list[i]->length; qsort(branch_lengths, (size_t)allnodes_count, sizeof(double), cb_ascending); free(allnodes_list); printf("Computing all pairwise p-distances ...\n"); tip_node_list = (rtree_t **)xmalloc((size_t)(rtree->leaves) * sizeof(rtree_t *)); int minfound = 0; /* go through all branch lengths */ for (n = 1; n < allnodes_count && !minfound; ++n) { minbr = branch_lengths[n]; inner_list_count = rtree_traverse_postorder(rtree, cb_short_trees, inner_node_list); for (i = 0; i < inner_list_count && !minfound; ++i) { /* traverse the roots and grab the tips */ tip_list_count = rtree_query_tipnodes(inner_node_list[i], tip_node_list); minfound = all_pairwise_dist(tip_node_list, tip_list_count, seqlen); if (minfound) break; } } if (minfound && n != 1) printf("Minimum branch length (--minbr) should be set to %.10f\n", branch_lengths[n-1]); else printf("Minimum branch length (--minbr) should be set to 0\n"); free(branch_lengths); free(inner_node_list); free(tip_node_list); for (i = 0; i < rtree->leaves; ++i) { free(seqdata[i]); free(headers[i]); } free(seqdata); free(headers); } mptp-0.2.2/src/dp.c000066400000000000000000000241031304415103400140240ustar00rootroot00000000000000/* Copyright (C) 2015 Tomas Flouri, Sarah Lutteropp This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 
3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . Contact: Tomas Flouri , Heidelberg Institute for Theoretical Studies, Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany */ #include "mptp.h" static unsigned int species_iter = 0; static void dp_recurse(rtree_t * node, long method) { int k,j; /* bottom-up recursion */ if (node->left) dp_recurse(node->left, method); if (node->right) dp_recurse(node->right, method); /* u_vec * / \ / \ v_vec * * w_vec */ dp_vector_t * u_vec = node->vector; double spec_logl = loglikelihood(node->spec_edge_count, node->spec_edgelen_sum); u_vec[0].spec_edgelen_sum = 0; u_vec[0].score_multi = node->coal_logl + spec_logl; u_vec[0].score_single = node->coal_logl + spec_logl; u_vec[0].coal_multi_logl = node->coal_logl; u_vec[0].species_count = 1; u_vec[0].filled = 1; if (!node->left) return; dp_vector_t * v_vec = node->left->vector; dp_vector_t * w_vec = node->right->vector; assert(node->spec_edge_count >= 0); int u_edge_count = 0; double u_edgelen_sum = 0; /* check whether edges (u,v) and (u,w) are > min branch length */ if (node->left->length > opt_minbr) { u_edge_count++; u_edgelen_sum += node->left->length; } if (node->right->length > opt_minbr) { u_edge_count++; u_edgelen_sum += node->right->length; } for (j = 0; j <= node->left->edge_count; ++j) { for (k = 0; k <= node->right->edge_count; ++k) { /* if at least one of the two entries is not valid/filled, skip */ if (!v_vec[j].filled || !w_vec[k].filled) continue; int i = j + k + u_edge_count; /* set the number of species */ unsigned int species_count = v_vec[j].species_count + w_vec[k].species_count; /* compute multi-rate 
coalescent log-likelihood */ double coal_multi_logl = v_vec[j].coal_multi_logl + w_vec[k].coal_multi_logl; /* compute coalescent edge count and length sum of subtree u */ double u_spec_edgelen_sum = v_vec[j].spec_edgelen_sum + w_vec[k].spec_edgelen_sum + u_edgelen_sum; int coal_edge_count = node->edge_count - i; /* change to int */ double coal_edgelen_sum = node->edgelen_sum - u_spec_edgelen_sum; /* compute single-rate coalescent log-likelihood */ double coal_single_logl = loglikelihood(coal_edge_count,coal_edgelen_sum); /* compute total speciation log-likelihood */ double spec_edgelen_sum = node->spec_edgelen_sum + u_edgelen_sum + v_vec[j].spec_edgelen_sum + w_vec[k].spec_edgelen_sum; int spec_edge_count = node->spec_edge_count + i; assert(species_count > 0); spec_logl = loglikelihood(spec_edge_count,spec_edgelen_sum); /* compute single- and multi-rate scores */ double score_multi = coal_multi_logl + spec_logl; double score_single = coal_single_logl + spec_logl; double score = score_multi; double best_score = u_vec[i].score_multi; if (method == PTP_METHOD_SINGLE) { score = score_single; best_score = u_vec[i].score_single; } if (!u_vec[i].filled || score > best_score) { u_vec[i].score_multi = score_multi; u_vec[i].score_single = score_single; u_vec[i].spec_edgelen_sum = u_spec_edgelen_sum; u_vec[i].coal_multi_logl = coal_multi_logl; u_vec[i].vec_left = j; u_vec[i].vec_right = k; u_vec[i].species_count = species_count; u_vec[i].filled = 1; } } } } static void backtrack(rtree_t * node, int index, bool *warning_minbr, FILE * out) { dp_vector_t * vec = node->vector; if ((vec[index].vec_left != -1) && (vec[index].vec_right != -1)) { node->event = EVENT_SPECIATION; if (node->length <= opt_minbr && node->parent) *warning_minbr = true; backtrack(node->left, vec[index].vec_left, warning_minbr, out); backtrack(node->right,vec[index].vec_right,warning_minbr, out); } else { species_iter++; node->event = EVENT_COALESCENT; fprintf(out, "\nSpecies %d:\n", species_iter); 
rtree_print_tips(node,out); } } void dp_ptp(rtree_t * tree, long method) { int i; int lrt_pass; int best_index = 0; unsigned int species_count; double max = 0; double pvalue = -1; /* reset species counter */ species_iter = 0; /* fill DP table */ dp_recurse(tree, method); /* obtain best entry in the root DP table */ dp_vector_t * vec = tree->vector; if (method == PTP_METHOD_MULTI) { max = vec[0].score_multi; double min_aic_score = aic(vec[0].score_multi, vec[0].species_count, tree->leaves+2); for (i = 1; i < tree->edge_count; i++) { if (vec[i].filled) { double aic_score = aic(vec[i].score_multi, vec[i].species_count, tree->leaves+2); //printf("edges: %d logl: %f aic: %f species: %d\n", i, vec[i].score_multi, aic_score, vec[i].species_count); if (aic_score < min_aic_score) { min_aic_score = aic_score; best_index = i; } } } } else { max = vec[0].score_single; for (i = 1; i < tree->edge_count; i++) { if (max < vec[i].score_single && vec[i].filled) { max = vec[i].score_single; best_index = i; } } } /* output some statistics */ if (!opt_quiet) { fprintf(stdout, "Number of edges greater than minimum branch length: %d / %d\n", tree->edge_count, 2 * tree->leaves - 2); printf("Score Null Model: %.6f\n", tree->coal_logl); fprintf(stdout, "Best score for single coalescent rate: %.6f\n", vec[best_index].score_single); fprintf(stdout, "Best score for multi coalescent rate: %.6f\n", vec[best_index].score_multi); } /* do a Likelihood Ratio Test (lrt) and return the computed p-value */ species_count = vec[best_index].species_count; // only do LRT for PTP, not for mPTP lrt_pass = (method == PTP_METHOD_MULTI) ? 1 : lrt(tree->coal_logl, vec[best_index].score_single, 1, &pvalue); #ifndef HAVE_LIBGSL fprintf(stderr, "WARNING: delimit was not compiled with libgsl. 
" "Likelihood ratio test disabled.\n"); #endif #ifdef HAVE_LIBGSL if (!opt_quiet && method == PTP_METHOD_SINGLE) fprintf(stdout,"LRT computed p-value: %.6f\n", pvalue); #endif /* initialize file name */ FILE * out = open_file_ext("txt", opt_seed); if (!opt_quiet) fprintf(stdout, "Writing delimitation file %s.txt ...\n", opt_outfile); /* write information about delimitation to file */ output_info(out, method, tree->coal_logl, max, pvalue, lrt_pass, tree, species_count); /* if LRT passed, then back-track the DP table and print the delimitation, otherwise print the null-model (one single species) */ if (lrt_pass) { bool warning_minbr = false; backtrack(tree, best_index, &warning_minbr,out); if (warning_minbr) fprintf(stderr,"WARNING: A speciation edge is smaller than the specified " "minimum branch length.\n"); } else { species_iter = 1; fprintf(stdout, "LRT failed -- null-model is preferred and printed\n"); fprintf(out,"\nSpecies 1:\n"); rtree_print_tips(tree,out); } if (!opt_quiet) printf("Number of delimited species: %d\n", species_iter); if (tree->edge_count == 0) fprintf(stderr, "WARNING: The tree has no edges > %f. " "All edges have been ignored. \n", opt_minbr); fclose(out); } void dp_init(rtree_t * tree) { int i; if (tree->left) dp_init(tree->left); if (tree->right) dp_init(tree->right); // TODO: Check whether this is the best way to handle those // nasty zero-length edges. 
tree->vector = calloc((size_t)(tree->edge_count + 1), sizeof(dp_vector_t)); for (i = 0; i <= tree->edge_count; i++) { tree->vector[i].vec_left = -1; tree->vector[i].vec_right = -1; } assert(tree->edge_count >= 0); tree->coal_logl = loglikelihood(tree->edge_count, tree->edgelen_sum); } void dp_free(rtree_t * tree) { if (tree->left) dp_free(tree->left); if (tree->right) dp_free(tree->right); if (tree->vector) free(tree->vector); } void dp_set_pernode_spec_edges(rtree_t * node) { if (!node) return; node->spec_edge_count = 0; node->spec_edgelen_sum = 0; /* for each node set spec_edge_count (and spec_edgelen_sum) as the count (or sum) of edges (edge-lengths) of all direct child edges of nodes on the path to root excluding the current node */ if (node->parent) { node->spec_edge_count = node->parent->spec_edge_count; node->spec_edgelen_sum = node->parent->spec_edgelen_sum; double len = node->parent->left->length; if (len > opt_minbr) { node->spec_edge_count++; node->spec_edgelen_sum += len; } len = node->parent->right->length; if (len > opt_minbr) { node->spec_edge_count++; node->spec_edgelen_sum += len; } } dp_set_pernode_spec_edges(node->left); dp_set_pernode_spec_edges(node->right); } mptp-0.2.2/src/fasta.c000066400000000000000000000170731304415103400145270ustar00rootroot00000000000000/* Copyright (C) 2015 Tomas Flouri This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . 
Contact: Tomas Flouri , Exelixis Lab, Heidelberg Instutute for Theoretical Studies Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany */ #include "mptp.h" #define MEMCHUNK 4096 /* please note that these functions will return a pointer to a buffer allocated here for the query header and sequence. This buffers will be overwritten on the next call of query_getnext. */ pll_fasta_t * pll_fasta_open(const char * filename, const unsigned int * map) { int i; pll_fasta_t * fd = (pll_fasta_t *)malloc(sizeof(pll_fasta_t)); if (!fd) return NULL; /* allocate space */ fd->lineno = 0; fd->no = -1; fd->chrstatus = map; /* open file */ fd->fp = fopen(filename, "r"); if (!(fd->fp)) { pll_errno = PLL_ERROR_FILE_OPEN; snprintf(errmsg, 200, "Unable to open file (%s)", filename); free(fd); return PLL_FAILURE; } /* get filesize */ if (fseek(fd->fp, 0, SEEK_END)) { pll_errno = PLL_ERROR_FILE_SEEK; snprintf(errmsg, 200, "Unable to seek in file (%s)", filename); free(fd); return PLL_FAILURE; } fd->filesize = ftell(fd->fp); rewind(fd->fp); /* reset stripped char frequencies */ fd->stripped_count = 0; for(i=0; i<256; i++) fd->stripped[i] = 0; fd->line[0] = 0; if (!fgets(fd->line, PLL_LINEALLOC, fd->fp)) { pll_errno = PLL_ERROR_FILE_SEEK; snprintf(errmsg, 200, "Unable to read file (%s)", filename); free(fd); return PLL_FAILURE; } fd->lineno = 1; return fd; } int pll_fasta_rewind(pll_fasta_t * fd) { int i; rewind(fd->fp); /* reset stripped char frequencies */ fd->stripped_count = 0; for(i=0; i<256; i++) fd->stripped[i] = 0; fd->line[0] = 0; if (!fgets(fd->line, PLL_LINEALLOC, fd->fp)) { pll_errno = PLL_ERROR_FILE_SEEK; snprintf(errmsg, 200, "Unable to rewind and cache data"); return PLL_FAILURE; } fd->lineno = 1; return PLL_SUCCESS; } void pll_fasta_close(pll_fasta_t * fd) { fclose(fd->fp); free(fd); } int pll_fasta_getnext(pll_fasta_t * fd, char ** head, long * head_len, char ** seq, long * seq_len, long * seqno) { void * mem; long head_alloc = MEMCHUNK; long seq_alloc = MEMCHUNK; 
*head_len = 0; *seq_len = 0; /* allocate sequence buffers */ *head = (char *)malloc((size_t)(head_alloc)); if (!(*head)) return PLL_FAILURE; *seq = (char *)malloc((size_t)(seq_alloc)); if (!(*seq)) { free(*head); return PLL_FAILURE; } /* read line and increase line number */ while (fd->line[0]) { /* read header */ if (fd->line[0] != '>') { pll_errno = PLL_ERROR_FASTA_INVALIDHEADER; snprintf(errmsg, 200, "Illegal header line in query fasta file"); free(*head); free(*seq); return PLL_FAILURE; } long headerlen; if (strchr(fd->line+1,'\r')) headerlen = xstrchrnul(fd->line+1, '\r') - (fd->line+1); else headerlen = xstrchrnul(fd->line+1, '\n') - (fd->line+1); *head_len = headerlen; if (headerlen + 1 > head_alloc) { head_alloc = headerlen + 1; mem = realloc(*head, (size_t)(head_alloc)); if (!mem) { pll_errno = PLL_ERROR_MEM_ALLOC; snprintf(errmsg, 200, "Unable to allocate enough memory."); free(*head); free(*seq); return PLL_FAILURE; } *head = (char *)mem; } memcpy(*head, fd->line + 1, (size_t)headerlen); *(*head + headerlen) = 0; /* get next line */ fd->line[0] = 0; if (!fgets(fd->line, PLL_LINEALLOC, fd->fp)) { /* do nothing */ } fd->lineno++; /* read sequence */ *seq_len = 0; while (fd->line[0] && (fd->line[0] != '>')) { char c; char m; char * p = fd->line; while((c = *p++)) { m = (char) fd->chrstatus[(int)c]; switch(m) { case 0: /* character to be stripped */ fd->stripped_count++; fd->stripped[(int)c]++; break; case 1: /* legal character */ if (*seq_len + 1 > seq_alloc) { seq_alloc += MEMCHUNK; mem = realloc(*seq, (size_t)(seq_alloc)); if (!mem) { pll_errno = PLL_ERROR_MEM_ALLOC; snprintf(errmsg, 200, "Unable to allocate enough memory."); free(*head); free(*seq); return PLL_FAILURE; } *seq = (char *)mem; } *(*seq + *seq_len) = c; (*seq_len)++; break; case 2: /* fatal character */ if (c>=32) { pll_errno = PLL_ERROR_FASTA_ILLEGALCHAR; snprintf(errmsg, 200, "illegal character '%c' " "on line %ld in the fasta file", c, fd->lineno); } else { pll_errno = 
PLL_ERROR_FASTA_UNPRINTABLECHAR; snprintf(errmsg, 200, "illegal unprintable character " "%#.2x (hexadecimal) on line %ld " "in the fasta file", c, fd->lineno); } return PLL_FAILURE; case 3: /* silently stripped chars */ break; } } fd->line[0] = 0; if (!fgets(fd->line, PLL_LINEALLOC, fd->fp)) { /* do nothing */ } fd->lineno++; } /* add zero after sequence */ if (*seq_len + 1 > seq_alloc) { seq_alloc += MEMCHUNK; mem = realloc(*seq, (size_t)seq_alloc); if (!mem) { pll_errno = PLL_ERROR_MEM_ALLOC; snprintf(errmsg, 200, "Unable to allocate enough memory."); free(*head); free(*seq); return PLL_FAILURE; } *seq = (char *)mem; } *(*seq + *seq_len) = 0; fd->no++; *seqno = fd->no; return PLL_SUCCESS; } snprintf(errmsg, 200, "End of file\n"); pll_errno = PLL_ERROR_FILE_EOF; free(*head); free(*seq); return PLL_FAILURE; } long pll_fasta_getfilesize(pll_fasta_t * fd) { return fd->filesize; } long pll_fasta_getfilepos(pll_fasta_t * fd) { return ftell(fd->fp); } mptp-0.2.2/src/lex_rtree.l000066400000000000000000000070621304415103400154300ustar00rootroot00000000000000/* Copyright (C) 2015 Tomas Flouri This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . 
Contact: Tomas Flouri , Heidelberg Institute for Theoretical Studies, Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany */ %{ #include "parse_rtree.h" #include "mptp.h" static size_t string_length = 0; static char * append(size_t * dstlen, const char * src, size_t srclen) { char * mem = (char *)xmalloc((*dstlen + srclen + 1)*sizeof(char)); memcpy(mem,rtree_lval.s,*dstlen); strncpy(mem+(*dstlen),src,srclen); mem[*dstlen+srclen] = 0; if (*dstlen) free(rtree_lval.s); rtree_lval.s = mem; *dstlen += srclen; return rtree_lval.s; } %} %option noyywrap %option prefix="rtree_" %option nounput %option noinput %x apos %x quot %% { \\\" { append(&string_length, "\\\"", 2); } \' { append(&string_length, "\'", 1); } \" { BEGIN(INITIAL); return STRING; } } { \\\' { append(&string_length, "\\\'", 2); } \" { append(&string_length, "\"", 1); } \' { BEGIN(INITIAL); return STRING; } } { \\n { append(&string_length, "\\n", 2); } \\t { append(&string_length, "\\t", 2); } \\ { append(&string_length, "\\", 1); } \\\\ { append(&string_length, "\\\\", 2); } ([^\"\'\\]|\n)+ { append(&string_length, rtree_text, rtree_leng); } } \: { return COLON; } \; { return SEMICOLON; } \) { return CPAR; } \( { return OPAR; } \, { return COMMA; } \" { string_length = 0; BEGIN(quot); } \' { string_length = 0; BEGIN(apos); } [-+]?[0-9]+ { rtree_lval.d = xstrndup(rtree_text,rtree_leng); return NUMBER; } [+-]?(([0-9]+[\.]?[0-9]*)|([0-9]*[\.]?[0-9]+))([eE][+-]?[0-9]+)? { rtree_lval.d = xstrndup(rtree_text,rtree_leng); return NUMBER; } [^ \'\",\(\):;\[\]\t\n\r][^ \t\n\r\)\(\[\]\,:;]* { rtree_lval.s = xstrndup(rtree_text,rtree_leng); return STRING; } [ \t\n\r] { ; } . 
{ fatal("Syntax error (%c)\n", rtree_text[0]); } %% mptp-0.2.2/src/lex_utree.l000066400000000000000000000070561304415103400154360ustar00rootroot00000000000000/* Copyright (C) 2015 Tomas Flouri This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . Contact: Tomas Flouri , Heidelberg Institute for Theoretical Studies, Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany */ %{ #include "parse_utree.h" #include "mptp.h" static size_t string_length = 0; static char * append(size_t * dstlen, const char * src, size_t srclen) { char * mem = (char *)xmalloc((*dstlen + srclen + 1)*sizeof(char)); memcpy(mem,utree_lval.s,*dstlen); strncpy(mem+(*dstlen),src,srclen); mem[*dstlen+srclen] = 0; if (*dstlen) free(utree_lval.s); utree_lval.s = mem; *dstlen += srclen; return utree_lval.s; } %} %option noyywrap %option prefix="utree_" %option nounput %option noinput %x apos %x quot %% { \\\" { append(&string_length, "\\\"", 2); } \' { append(&string_length, "\'", 1); } \" { BEGIN(INITIAL); return STRING; } } { \\\' { append(&string_length, "\\\'", 2); } \" { append(&string_length, "\"", 1); } \' { BEGIN(INITIAL);return STRING;} } { \\n { append(&string_length, "\\n", 2); } \\t { append(&string_length, "\\t", 2); } \\ { append(&string_length, "\\", 1); } \\\\ { append(&string_length, "\\\\", 2); } ([^\"\'\\]|\n)+ { append(&string_length, utree_text, utree_leng); } } \: { return COLON; } \; { return SEMICOLON; } \) { return CPAR; } \( { return OPAR; } 
\, { return COMMA; } \" { string_length = 0; BEGIN(quot); } \' { string_length = 0; BEGIN(apos); } [-+]?[0-9]+ { utree_lval.d = xstrndup(utree_text,utree_leng); return NUMBER; } [+-]?(([0-9]+[\.]?[0-9]*)|([0-9]*[\.]?[0-9]+))([eE][+-]?[0-9]+)? { utree_lval.d = xstrndup(utree_text,utree_leng); return NUMBER; } [^ \'\",\(\):;\[\]\t\n\r][^ \t\n\r\)\(\[\]\,:;]* { utree_lval.s = xstrndup(utree_text,utree_leng); return STRING; } [ \t\n\r] { ; } . { fatal("Syntax error (%c)\n", utree_text[0]); } %% mptp-0.2.2/src/likelihood.c000066400000000000000000000031131304415103400155420ustar00rootroot00000000000000/* Copyright (C) 2015 Tomas Flouri, Sarah Lutteropp This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . 
Contact: Tomas Flouri , Heidelberg Institute for Theoretical Studies, Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany */ #include "mptp.h" double loglikelihood(long edge_count, double edgelen_sum) { assert(edge_count >= 0); if (edge_count == 0 || edgelen_sum < __DBL_MIN__) return 0; return edge_count * (log(edge_count) - 1 - log(edgelen_sum)); } int lrt(double nullmodel_logl, double ptp_logl, unsigned int df, double * pvalue) { #ifdef HAVE_LIBGSL double diff = 2*(ptp_logl - nullmodel_logl); /* http://docs.scipy.org/doc/scipy/reference/generated/scipy.special.chdtr.html */ *pvalue = 1 - gsl_cdf_chisq_P(diff,df); if ((*pvalue) > opt_pvalue) return 0; #endif return 1; } double aic(double logl, long k, long n) { if (k > 1) k++; return -2*logl + 2*k + (double)(2*k*(k + 1)) / (double)(n-k-1); } mptp-0.2.2/src/maps.c000066400000000000000000000072341304415103400143670ustar00rootroot00000000000000/* Copyright (C) 2015 Tomas Flouri This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . 
Contact: Tomas Flouri , Exelixis Lab, Heidelberg Instutute for Theoretical Studies Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany */ #include "mptp.h" /* maps for encoding sequences */ const unsigned int pll_map_nt[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 1, 14, 2, 13, 0, 0, 4, 11, 0, 0, 12, 0, 3, 15, 15, 0, 0, 5, 6, 8, 8, 7, 9, 15, 10, 0, 0, 0, 0, 0, 0, 0, 1, 14, 2, 13, 0, 0, 4, 11, 0, 0, 12, 0, 3, 15, 15, 0, 0, 5, 6, 8, 8, 7, 9, 15, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; /* map for fasta parsing legal symbols: *abcdefghiklmnpqrstuvxyz (all except j and o), also upper case fatal symbols: .- fatal: ascii 0-26 except tab (9), newline (10 and 13), vt (11), formfeed (12) stripped: !"#$&'()+,/0123456789:;<=>?@JO^_`joæøåÆØÅ§¨´ as well as chrs 9-13 includes both amino acid and nucleotide sequences, adapt to nt only */ const unsigned int pll_map_fasta[256] = { /* 0=stripped, 1=legal, 2=fatal, 3=silently stripped @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; mptp-0.2.2/src/mptp.c000066400000000000000000000410631304415103400144050ustar00rootroot00000000000000/* Copyright (C) 2015 Tomas Flouri, Sarah Lutteropp This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . 
Contact: Tomas Flouri , Heidelberg Institute for Theoretical Studies, Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany */ #include "mptp.h" static char * progname; static char progheader[80]; char * cmdline; /* global error message buffer */ char errmsg[200] = {0}; /* global pseudo-random number generator 48-bit state */ unsigned short global_xsubi[3]; /* number of mandatory options for the user to input */ static const char mandatory_options_count = 2; static const char * mandatory_options_list = " --tree_file --output_file"; /* options */ int pll_errno; int opt_quiet; int opt_precision; int opt_svg_showlegend; long opt_help; long opt_version; long opt_treeshow; long opt_method; long opt_mcmc_sample; long opt_mcmc_steps; long opt_mcmc_log; long opt_mcmc_startnull; long opt_mcmc_startrandom; long opt_mcmc_startml; long opt_mcmc_burnin; long opt_mcmc_runs; long opt_seed; long opt_mcmc; long opt_ml; long opt_multi; long opt_single; long opt_crop; long opt_svg; long opt_svg_width; long opt_svg_fontsize; long opt_svg_tipspace; long opt_svg_marginleft; long opt_svg_marginright; long opt_svg_margintop; long opt_svg_marginbottom; long opt_svg_inner_radius; double opt_mcmc_credible; double opt_svg_legend_ratio; double opt_pvalue; double opt_minbr; char * opt_treefile; char * opt_outfile; char * opt_outgroup; char * opt_pdist_file; static struct option long_options[] = { {"help", no_argument, 0, 0 }, /* 0 */ {"version", no_argument, 0, 0 }, /* 1 */ {"quiet", no_argument, 0, 0 }, /* 2 */ {"tree_file", required_argument, 0, 0 }, /* 3 */ {"tree_show", no_argument, 0, 0 }, /* 4 */ {"output_file", required_argument, 0, 0 }, /* 5 */ {"outgroup", required_argument, 0, 0 }, /* 6 */ {"pvalue", required_argument, 0, 0 }, /* 7 */ {"minbr", required_argument, 0, 0 }, /* 8 */ {"svg_width", required_argument, 0, 0 }, /* 9 */ {"svg_fontsize", required_argument, 0, 0 }, /* 10 */ {"svg_tipspacing", required_argument, 0, 0 }, /* 11 */ {"svg_legend_ratio", required_argument, 0, 0 }, /* 
12 */ {"svg_nolegend", no_argument, 0, 0 }, /* 13 */ {"svg_marginleft", required_argument, 0, 0 }, /* 14 */ {"svg_marginright", required_argument, 0, 0 }, /* 15 */ {"svg_margintop", required_argument, 0, 0 }, /* 16 */ {"svg_marginbottom", required_argument, 0, 0 }, /* 17 */ {"svg_inner_radius", required_argument, 0, 0 }, /* 18 */ {"precision", required_argument, 0, 0 }, /* 19 */ {"mcmc_sample", required_argument, 0, 0 }, /* 20 */ {"mcmc_log", no_argument, 0, 0 }, /* 21 */ {"seed", required_argument, 0, 0 }, /* 22 */ {"mcmc_startnull", no_argument, 0, 0 }, /* 23 */ {"mcmc_burnin", required_argument, 0, 0 }, /* 24 */ {"mcmc_startrandom", no_argument, 0, 0 }, /* 25 */ {"mcmc_runs", required_argument, 0, 0 }, /* 26 */ {"minbr_auto", required_argument, 0, 0 }, /* 27 */ {"outgroup_crop", no_argument, 0, 0 }, /* 28 */ {"mcmc_credible", required_argument, 0, 0 }, /* 29 */ {"mcmc", required_argument, 0, 0 }, /* 30 */ {"ml", no_argument, 0, 0 }, /* 31 */ {"single", no_argument, 0, 0 }, /* 32 */ {"multi", no_argument, 0, 0 }, /* 33 */ {"mcmc_startml", no_argument, 0, 0 }, /* 34 */ { 0, 0, 0, 0 } }; void args_init(int argc, char ** argv) { int option_index = 0; int c; int mand_options = 0; /* set defaults */ progname = argv[0]; opt_help = 0; opt_version = 0; opt_treeshow = 0; opt_treefile = NULL; opt_outfile = NULL; opt_outgroup = NULL; opt_pdist_file = NULL; opt_quiet = 0; opt_pvalue = 0.001; opt_minbr = 0.0001; opt_precision = 7; opt_mcmc_steps = 0; opt_mcmc_sample = 1000; opt_mcmc_startnull = 0; opt_mcmc_startrandom = 0; opt_mcmc_startml = 0; opt_mcmc_log = 0; opt_mcmc_burnin = 1; opt_mcmc_runs = 1; opt_mcmc_credible = 0.95; opt_seed = (long)time(NULL); opt_crop = 0; opt_ml = 0; opt_mcmc = 0; opt_method = PTP_METHOD_MULTI; opt_multi = 0; opt_single = 0; opt_svg_width = 1920; opt_svg_fontsize = 12; opt_svg_tipspace = 20; opt_svg_legend_ratio = 0.1; opt_svg_showlegend = 1; opt_svg_marginleft = 20; opt_svg_marginright = 20; opt_svg_margintop = 20; opt_svg_marginbottom = 20; 
opt_svg_inner_radius = 0;

  /* Parse the command line.  Every entry of long_options has flag == 0 and
     val == 0, so getopt_long_only() returns 0 for each recognised option and
     option_index tells which table entry was matched.  The case labels below
     are therefore the indices of the long_options table. */
  while ((c = getopt_long_only(argc, argv, "", long_options, &option_index)) == 0)
  {
    char * end;

    switch (option_index)
    {
      case 0:   /* --help */
        opt_help = 1;
        break;

      case 1:   /* --version */
        opt_version = 1;
        break;

      case 2:   /* --quiet */
        opt_quiet = 1;
        break;

      case 3:   /* --tree_file */
        /* optarg points into argv; it is not heap memory and must never be
           passed to free(), even when the option is given more than once */
        opt_treefile = optarg;
        break;

      case 4:   /* --tree_show */
        opt_treeshow = 1;
        break;

      case 5:   /* --output_file */
        opt_outfile = optarg;
        break;

      case 6:   /* --outgroup */
        opt_outgroup = optarg;
        break;

      case 7:   /* --pvalue */
        opt_pvalue = strtod(optarg, &end);
        if (end == optarg)
          fatal("--pvalue: '%s' is not a valid number.\n", optarg);
        break;

      case 8:   /* --minbr */
        opt_minbr = strtod(optarg, &end);
        if (end == optarg)
          fatal("--minbr: '%s' is not a valid number.\n", optarg);
        break;

      case 9:   /* --svg_width (stored in a long, hence atol) */
        opt_svg_width = atol(optarg);
        break;

      case 10:  /* --svg_fontsize */
        opt_svg_fontsize = atol(optarg);
        break;

      case 11:  /* --svg_tipspacing */
        opt_svg_tipspace = atol(optarg);
        break;

      case 12:  /* --svg_legend_ratio */
        opt_svg_legend_ratio = atof(optarg);
        break;

      case 13:  /* --svg_nolegend */
        opt_svg_showlegend = 0;
        break;

      case 14:  /* --svg_marginleft */
        opt_svg_marginleft = atol(optarg);
        break;

      case 15:  /* --svg_marginright */
        opt_svg_marginright = atol(optarg);
        break;

      case 16:  /* --svg_margintop */
        opt_svg_margintop = atol(optarg);
        break;

      case 17:  /* --svg_marginbottom */
        opt_svg_marginbottom = atol(optarg);
        break;

      case 18:  /* --svg_inner_radius */
        opt_svg_inner_radius = atol(optarg);
        break;

      case 19:  /* --precision */
        opt_precision = atoi(optarg);
        break;

      case 20:  /* --mcmc_sample */
        opt_mcmc_sample = atol(optarg);
        break;

      case 21:  /* --mcmc_log */
        opt_mcmc_log = 1;
        break;

      case 22:  /* --seed */
        opt_seed = atol(optarg);
        break;

      case 23:  /* --mcmc_startnull */
        opt_mcmc_startnull = 1;
        break;

      case 24:  /* --mcmc_burnin */
        opt_mcmc_burnin = atol(optarg);
        break;

      case 25:  /* --mcmc_startrandom */
        opt_mcmc_startrandom = 1;
        break;

      case 26:  /* --mcmc_runs */
        opt_mcmc_runs = atol(optarg);
        break;

      case 27:  /* --minbr_auto */
        /* same as --tree_file: optarg is owned by argv, do not free it */
        opt_pdist_file = optarg;
        break;

      case 28:  /* --outgroup_crop */
        opt_crop = 1;
        break;

      case 29:  /* --mcmc_credible */
        opt_mcmc_credible = atof(optarg);
        break;

      case 30:  /* --mcmc INT: selects the MCMC command and sets its step count */
        opt_mcmc = 1;
        opt_mcmc_steps = atol(optarg);
        break;

      case 31:  /* --ml */
        opt_ml = 1;
        break;

      case 32:  /* --single */
        opt_method = PTP_METHOD_SINGLE;
        opt_single = 1;
        break;

      case 33:  /* --multi */
        opt_method = PTP_METHOD_MULTI;
        opt_multi = 1;
        break;

      case 34:  /* --mcmc_startml */
        opt_mcmc_startml = 1;
        break;

      default:
        fatal("Internal error in option parsing");
    }
  }

  /* the loop ends with -1 once all options are consumed; any other value
     means getopt_long_only() rejected the command line */
  if (c != -1)
    exit(EXIT_FAILURE);

  int commands = 0;

  /* check for mandatory options */
  if
(opt_treefile) mand_options++; if (opt_outfile) mand_options++; /* check for number of independent commands selected */ if (opt_version) commands++; if (opt_help) commands++; if (opt_pdist_file) commands++; if (opt_mcmc) commands++; if (opt_ml) commands++; /* if more than one independent command, fail */ if (commands > 1) fatal("More than one command specified"); /* if more than one independent command, fail */ if (opt_mcmc_startrandom + opt_mcmc_startnull + opt_mcmc_startml > 1) fatal("You can only select one out of --mcmc_startrandom, --mcmc_startnull, --mcmc_startml"); /* if more than one independent command, fail */ if (opt_multi && opt_single) fatal("You can either specify --multi or --single, but not both at once."); /* if no command specified, turn on --help */ if (!commands) { opt_help = 1; return; } /* check for mandatory options */ if (!opt_version && !opt_help) if (mand_options != mandatory_options_count) fatal("Mandatory options are:\n\n%s", mandatory_options_list); } void cmd_help() { fprintf(stderr, "Usage: %s [OPTIONS]\n", progname); fprintf(stderr, "\n" "Examples:\n" " mptp --ml --multi --tree_file tree.newick --output_file output\n" " mptp --mcmc 50000000 --multi --mcmc_sample 1000000 --mcmc_burnin 1000000 --tree_file tree.newick --output_file output\n\n" "General options:\n" " --help display help information.\n" " --version display version information.\n" " --tree_show display an ASCII version of the tree.\n" " --multi Use one lambda per coalescent (this is default).\n" " --single Use one lambda for all coalescent.\n" " --ml Maximum-likelihood heuristic.\n" " --mcmc INT Support values for the delimitation (INT steps).\n" " --mcmc_sample INT Sample every INT iteration (default: 1000).\n" " --mcmc_log Log samples and create SVG plot of log-likelihoods.\n" " --mcmc_burnin INT Ignore all MCMC steps below threshold.\n" " --mcmc_runs INT Perform multiple MCMC runs.\n" " --mcmc_credible <0..1> Credible interval (default: 0.95).\n" " --mcmc_startnull 
Start each run with the null model (one single species).\n" " --mcmc_startrandom Start each run with a random delimitation.\n" " --mcmc_startml Start each run with the delimitation obtained by the Maximum-likelihood heuristic.\n" " --pvalue REAL Set p-value for LRT (default: 0.001)\n" " --minbr REAL Set minimum branch length (default: 0.0001)\n" " --minbr_auto FILENAME Detect minimum branch length from FASTA p-distances\n" " --outgroup TAXA Root unrooted tree at outgroup (default: taxon with longest branch).\n" " --outgroup_crop Crop outgroup from tree\n" " --quiet only output warnings and fatal errors to stderr.\n" " --precision INT Precision of floating point numbers on output (default: 7).\n" " --seed Seed for pseudo-random number generator.\n" "\n" "Input and output options:\n" " --tree_file FILENAME tree file in newick format.\n" " --output_file FILENAME output file name.\n" "\n" "Visualization options:\n" " --svg_width INT Width of SVG tree in pixels (default: 1920).\n" " --svg_fontsize INT Size of font in SVG image. 
(default: 12)\n" " --svg_tipspacing INT Vertical space between taxa in SVG tree (default: 20).\n" " --svg_legend_ratio <0..1> Ratio of total tree length to be displayed as legend line.\n" " --svg_nolegend Hides legend.\n" " --svg_marginleft INT Left margin in pixels (default: 20).\n" " --svg_marginright INT Right margin in pixels (default: 20).\n" " --svg_margintop INT Top margin in pixels (default: 20).\n" " --svg_marginbottom INT Bottom margin in pixels (default: 20).\n" " --svg_inner_radius INT Radius of inner nodes in pixels (default: 0).\n" ); } static rtree_t * load_tree(void) { /* parse tree */ if (!opt_quiet) fprintf(stdout, "Parsing tree file...\n"); rtree_t * rtree = rtree_parse_newick(opt_treefile); if (!rtree) { unsigned int tip_count; utree_t * utree = utree_parse_newick(opt_treefile, &tip_count); if (!utree) fatal("Tree is neither unrooted nor rooted."); if (!opt_quiet) { fprintf(stdout, "Loaded unrooted tree...\n"); fprintf(stdout, "Converting to rooted tree...\n"); } /* if outgroup was not specified, get the node with the longest branch */ utree_t * og_root = NULL; /* if outgroup was not specified, get the tip with the longest branch */ if (!opt_outgroup) { og_root = utree_longest_branchtip(utree, tip_count); assert(og_root); fprintf(stdout, "Selected %s as outgroup based on longest tip-branch criterion\n", og_root->label); } else { /* get LCA of out group */ og_root = utree_outgroup_lca(utree, tip_count); if (!og_root) { utree_destroy(utree); fatal("Outgroup must be a single tip or a list of all tips of a subtree"); } } if (opt_crop) { rtree = utree_crop(og_root); } else { rtree = utree_convert_rtree(og_root); } utree_destroy(utree); } else { if (!opt_quiet) fprintf(stdout, "Loaded rooted tree...\n"); if (opt_crop) { if (!opt_outgroup) fatal("--outgroup must be specified when using --outgroup_crop."); /* get LCA of outgroup */ rtree_t * og_root = get_outgroup_lca(rtree); /* crop outgroup from tree */ rtree = rtree_crop(rtree,og_root); if (!rtree) 
fatal("Cropping the outgroup leads to less than two tips."); } } return rtree; } void cmd_auto() { rtree_t * rtree = load_tree(); detect_min_bl(rtree); /* deallocate tree structure */ rtree_destroy(rtree); } void cmd_ml(void) { rtree_t * rtree = load_tree(); dp_init(rtree); dp_set_pernode_spec_edges(rtree); dp_ptp(rtree, opt_method); dp_free(rtree); if (opt_treeshow) rtree_show_ascii(rtree); cmd_svg(rtree, opt_seed, "svg"); /* deallocate tree structure */ rtree_destroy(rtree); if (!opt_quiet) fprintf(stdout, "Done...\n"); } void cmd_multirun(void) { if (opt_mcmc_steps == 0) fatal("The number of runs specified after --mcmc must be a positive integer greater than zero"); if (opt_mcmc_burnin < 1 || opt_mcmc_burnin > opt_mcmc_steps) fatal("--opt_mcmc_burnin must be a positive integer smaller or equal to --opt_mcmc_steps"); if (opt_mcmc_credible < 0 || opt_mcmc_credible > 1) fatal("--opt_mcmc_credible must be a real number between 0 and 1"); rtree_t * rtree = load_tree(); multirun(rtree, opt_method); if (opt_treeshow) rtree_show_ascii(rtree); if (!opt_quiet) fprintf(stdout, "Done...\n"); } void getentirecommandline(int argc, char * argv[]) { int len = 0; int i; for (i = 0; i < argc; ++i) len += strlen(argv[i]); cmdline = (char *)xmalloc((size_t)(len + argc + 1)); cmdline[0] = 0; for (i = 0; i < argc; ++i) { strcat(cmdline, argv[i]); strcat(cmdline, " "); } } void fillheader() { snprintf(progheader, 80, "%s %s_%s, %1.fGB RAM, %ld cores", PROG_NAME, PROG_VERSION, PROG_ARCH, arch_get_memtotal() / 1024.0 / 1024.0 / 1024.0, sysconf(_SC_NPROCESSORS_ONLN)); } void show_header() { fprintf(stdout, "%s\n", progheader); fprintf(stdout, "https://github.com/Pas-Kapli/mptp\n"); fprintf(stdout,"\n"); } int main (int argc, char * argv[]) { fillheader(); getentirecommandline(argc, argv); args_init(argc, argv); show_header(); /* init random number generator and maintain compatibility with srand48 */ random_init(global_xsubi,opt_seed); if (opt_help) { cmd_help(); } else if 
(opt_pdist_file) { cmd_auto(); } else if (opt_mcmc) { cmd_multirun(); } else if (opt_ml) { cmd_ml(); } free(cmdline); return (0); } mptp-0.2.2/src/mptp.h000066400000000000000000000257561304415103400144250ustar00rootroot00000000000000/* Copyright (C) 2015 Tomas Flouri, Sarah Lutteropp This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . Contact: Tomas Flouri , Heidelberg Institute for Theoretical Studies, Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_CONFIG_H #include "config.h" #endif #if (defined(HAVE_CONFIG_H) && defined(HAVE_LIBGSL)) #include #endif /* constants */ #define PROG_NAME PACKAGE #define PROG_VERSION PACKAGE_VERSION #ifdef __APPLE__ #define PROG_ARCH "macosx_x86_64" #else #define PROG_ARCH "linux_x86_64" #endif #define PLL_FAILURE 0 #define PLL_SUCCESS 1 #define PLL_LINEALLOC 2048 #define PLL_ERROR_FILE_OPEN 1 #define PLL_ERROR_FILE_SEEK 2 #define PLL_ERROR_FILE_EOF 3 #define PLL_ERROR_FASTA_ILLEGALCHAR 4 #define PLL_ERROR_FASTA_UNPRINTABLECHAR 5 #define PLL_ERROR_FASTA_INVALIDHEADER 6 #define PLL_ERROR_MEM_ALLOC 7 #define LINEALLOC 2048 #define EVENT_SPECIATION 0 #define EVENT_COALESCENT 1 #define PTP_METHOD_SINGLE 0 #define PTP_METHOD_MULTI 1 #define REGEX_REAL "([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)" /* structures and data 
types */ typedef unsigned int UINT32; typedef unsigned short WORD; typedef unsigned char BYTE; typedef struct dp_vector_s { /* sum of speciation edge lengths of current subtree */ double spec_edgelen_sum; /* coalescent logl of subtree for multi lambda */ double coal_multi_logl; /* best single- and multi-rate log-likelihood for current subtree */ double score_multi; double score_single; /* back-tracking information */ int vec_left; int vec_right; unsigned int species_count; int filled; } dp_vector_t; typedef struct utree_s { char * label; double length; int height; struct utree_s * next; struct utree_s * back; void * data; /* for finding the lca */ int mark; } utree_t; typedef struct rtree_s { char * label; double length; struct rtree_s * left; struct rtree_s * right; struct rtree_s * parent; int leaves; /* number of edges within current subtree with lengths greater than opt_minbr and corresponding sum */ int edge_count; double edgelen_sum; double coal_logl; /* minimum number of speciation edges if current node is the start of a coalescent event, and the respective sum of lengths */ int spec_edge_count; double spec_edgelen_sum; /* which process does this node belong to (coalesent or speciation) */ int event; /* slot in which the node resides when doing mcmc analysis */ long mcmc_slot; long speciation_start; long speciation_count; double aic_weight_start; double aic_support; double support; /* dynamic programming vector */ dp_vector_t * vector; /* auxialiary data */ void * data; /* for generating random delimitations */ int max_species_count; /* mark */ int mark; char * sequence; } rtree_t; typedef struct pll_fasta { FILE * fp; char line[LINEALLOC]; const unsigned int * chrstatus; long no; long filesize; long lineno; long stripped_count; long stripped[256]; } pll_fasta_t; /* macros */ #define MIN(a,b) ((a) < (b) ? (a) : (b)) #define MAX(a,b) ((a) > (b) ? 
(a) : (b)) /* options */ extern int opt_quiet; extern int opt_precision; extern int opt_svg_showlegend; extern long opt_help; extern long opt_version; extern long opt_treeshow; extern long opt_mcmc_sample; extern long opt_mcmc_steps; extern long opt_mcmc_log; extern long opt_mcmc_startml; extern long opt_mcmc_startnull; extern long opt_mcmc_startrandom; extern long opt_mcmc_burnin; extern long opt_mcmc_runs; extern long opt_seed; extern long opt_mcmc; extern long opt_ml; extern long opt_multi; extern long opt_single; extern long opt_method; extern long opt_crop; extern long opt_svg; extern long opt_svg_width; extern long opt_svg_fontsize; extern long opt_svg_tipspace; extern long opt_svg_marginleft; extern long opt_svg_marginright; extern long opt_svg_margintop; extern long opt_svg_marginbottom; extern long opt_svg_inner_radius; extern double opt_mcmc_credible; extern double opt_svg_legend_ratio; extern double opt_pvalue; extern double opt_minbr; extern char * opt_treefile; extern char * opt_outfile; extern char * opt_outgroup; extern char * opt_pdist_file; extern char * cmdline; /* common data */ extern char errmsg[200]; extern int pll_errno; extern unsigned short global_xsubi[3]; extern const unsigned int pll_map_nt[256]; extern const unsigned int pll_map_fasta[256]; extern long mmx_present; extern long sse_present; extern long sse2_present; extern long sse3_present; extern long ssse3_present; extern long sse41_present; extern long sse42_present; extern long popcnt_present; extern long avx_present; extern long avx2_present; /* functions in util.c */ void fatal(const char * format, ...) 
__attribute__ ((noreturn)); void progress_init(const char * prompt, unsigned long size); void progress_update(unsigned int progress); void progress_done(void); void * xmalloc(size_t size); void * xcalloc(size_t nmemb, size_t size); void * xrealloc(void *ptr, size_t size); char * xstrchrnul(char *s, int c); char * xstrdup(const char * s); char * xstrndup(const char * s, size_t len); long getusec(void); void show_rusage(void); FILE * xopen(const char * filename, const char * mode); void random_init(unsigned short * rstate, long seedval); /* functions in mptp.c */ void args_init(int argc, char ** argv); void cmd_help(void); void getentirecommandline(int argc, char * argv[]); void fillheader(void); void show_header(void); void cmd_ml(void); void cmd_multirun(void); void cmd_auto(void); /* functions in parse_rtree.y */ rtree_t * rtree_parse_newick(const char * filename); void rtree_destroy(rtree_t * root); /* functions in parse_utree.y */ utree_t * utree_parse_newick(const char * filename, unsigned int * tip_count); void utree_destroy(utree_t * root); /* functions in utree.c */ void utree_show_ascii(utree_t * tree); char * utree_export_newick(utree_t * root); int utree_query_tipnodes(utree_t * root, utree_t ** node_list); int utree_query_innernodes(utree_t * root, utree_t ** node_list); rtree_t * utree_convert_rtree(utree_t * root); int utree_traverse(utree_t * root, int (*cbtrav)(utree_t *), utree_t ** outbuffer); utree_t * utree_longest_branchtip(utree_t * node, unsigned int tip_count); utree_t * utree_outgroup_lca(utree_t * root, unsigned int tip_count); rtree_t * utree_crop(utree_t * lca); /* functions in rtree.c */ void rtree_show_ascii(rtree_t * tree); char * rtree_export_newick(rtree_t * root); int rtree_query_tipnodes(rtree_t * root, rtree_t ** node_list); int rtree_query_innernodes(rtree_t * root, rtree_t ** node_list); void rtree_reset_info(rtree_t * root); void rtree_print_tips(rtree_t * node, FILE * out); int rtree_traverse(rtree_t * root, int 
(*cbtrav)(rtree_t *), unsigned short * rstate, rtree_t ** outbuffer); rtree_t * rtree_clone(rtree_t * node, rtree_t * parent); int rtree_traverse_postorder(rtree_t * root, int (*cbtrav)(rtree_t *), rtree_t ** outbuffer); rtree_t ** rtree_tipstring_nodes(rtree_t * root, char * tipstring, unsigned int * tiplist_count); rtree_t * get_outgroup_lca(rtree_t * root); rtree_t * rtree_lca(rtree_t * root, rtree_t ** tip_nodes, unsigned int count); rtree_t * rtree_crop(rtree_t * root, rtree_t * crop_root); int rtree_height(rtree_t * root); /* functions in parse_rtree.y */ rtree_t * rtree_parse_newick(const char * filename); /* functions in lca_utree.c */ void lca_init(utree_t * root); utree_t * lca_compute(utree_t * tip1, utree_t * tip2); void lca_destroy(void); /* functions in arch.c */ unsigned long arch_get_memused(void); unsigned long arch_get_memtotal(void); /* functions in dp.c */ void dp_init(rtree_t * tree); void dp_free(rtree_t * tree); void dp_ptp(rtree_t * rtree, long method); void dp_set_pernode_spec_edges(rtree_t * node); /* functions in svg.c */ void cmd_svg(rtree_t * rtree, long seed, const char * ext); /* functions in likelihood.c */ double loglikelihood(long edge_count, double edgelen_sum); int lrt(double nullmodel_logl, double ptp_logl, unsigned int df, double * pvalue); double aic(double logl, long k, long n); /* functions in output.c */ void output_info(FILE * out, long method, double nullmodel_logl, double logl, double pvalue, int lrt_result, rtree_t * root, unsigned int species_count); FILE * open_file_ext(const char * extension, long seed); /* functions in svg_landscape.c */ void svg_landscape(double mcmc_min_log, double mcmc_max_logl, long seed); void svg_landscape_combined(double mcmc_min_log, double mcmc_max_logl, long runs, long * seed); /* functions in random.c */ double random_delimitation(rtree_t * root, long * delimited_species, long * coal_edge_count, double * coal_edgelen_sum, long * spec_edge_count, double * spec_edgelen_sum, double * 
coal_score, unsigned short * rstate); /* functions in multirun.c */ void multirun(rtree_t * root, long method); /* functions in fasta.c */ pll_fasta_t * pll_fasta_open(const char * filename, const unsigned int * map); int pll_fasta_getnext(pll_fasta_t * fd, char ** head, long * head_len, char ** seq, long * seq_len, long * seqno); void pll_fasta_close(pll_fasta_t * fd); long pll_fasta_getfilesize(pll_fasta_t * fd); long pll_fasta_getfilepos(pll_fasta_t * fd); int pll_fasta_rewind(pll_fasta_t * fd); /* functions in auto.c */ void detect_min_bl(rtree_t * rtree); /* functions in aic.c */ void aic_mcmc(rtree_t * tree, long method, unsigned short * rstate, long seed, double * mcmc_min_logl, double * mcmc_max_logl); mptp-0.2.2/src/multirun.c000066400000000000000000000243411304415103400153040ustar00rootroot00000000000000/* Copyright (C) 2015 Tomas Flouri This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . 
Contact: Tomas Flouri , Heidelberg Institute for Theoretical Studies, Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany */ #include "mptp.h" #define MPTP_INNER_CROOT 1 #define MPTP_TIP_CROOT 2 static double asv(int * mlcroots, double * support, int count) { int i; double sum = 0; int croots_count = 0; for (i = 0; i < count; ++i) { if (mlcroots[i] == MPTP_INNER_CROOT) { sum += (1-support[i]); croots_count++; } else if (mlcroots[i] == MPTP_TIP_CROOT) { sum += support[i]; croots_count++; } } return sum / croots_count; } static void extract_croots_recursive(rtree_t * node, int * index, int * outbuffer) { if (!node->edge_count) return; if (node->parent) { outbuffer[*index] = 0; if (node->event == EVENT_COALESCENT && node->parent->event == EVENT_SPECIATION) { outbuffer[*index] = MPTP_INNER_CROOT; } else { if ((node->event == EVENT_SPECIATION) && (node->left->edge_count == 0 || node->right->edge_count == 0)) outbuffer[*index] = MPTP_TIP_CROOT; } } else { outbuffer[*index] = 0; if (node->event == EVENT_COALESCENT) outbuffer[*index] = MPTP_INNER_CROOT; } *index = *index+1; extract_croots_recursive(node->left, index, outbuffer); extract_croots_recursive(node->right, index, outbuffer); } /* recursively extract support values from a tree into an array */ static int extract_croots(rtree_t * root, int * outbuffer) { int index = 0; int count = 0; int i; if (!root->edge_count) return -1; extract_croots_recursive(root, &index, outbuffer); for (i = 0; i < index; ++i) if (outbuffer[i]) ++count; return count; } static void extract_support_recursive(rtree_t * node, int * index, double * outbuffer) { if (!node->edge_count) return; outbuffer[*index] = node->support; *index = *index + 1; extract_support_recursive(node->left, index, outbuffer); extract_support_recursive(node->right, index, outbuffer); } /* recursively extract support values from a tree into an array */ static int extract_support(rtree_t * root, double * outbuffer) { int index = 0; if (!root->edge_count) return -1; 
extract_support_recursive(root, &index, outbuffer); return index; } void multirun(rtree_t * root, long method) { long i,j; long * seeds; rtree_t * mltree; rtree_t * ctree; rtree_t ** trees; unsigned short ** rstates; double * mcmc_min_logl; double * mcmc_max_logl; trees = (rtree_t **)xmalloc((size_t)opt_mcmc_runs * sizeof(rtree_t *)); trees[0] = root; /* clone trees in order to have one independent tree per run */ for (i = 1; i < opt_mcmc_runs; ++i) trees[i] = rtree_clone(root, NULL); mltree = rtree_clone(root,NULL); ctree = rtree_clone(root,NULL); /* allocate memory for storing min and max logl for each run */ mcmc_min_logl = (double *)xmalloc((size_t)opt_mcmc_runs * sizeof(double)); mcmc_max_logl = (double *)xmalloc((size_t)opt_mcmc_runs * sizeof(double)); /* reset to zero */ memset(mcmc_min_logl, 0, (size_t)opt_mcmc_runs * sizeof(double)); memset(mcmc_max_logl, 0, (size_t)opt_mcmc_runs * sizeof(double)); /* generate one seed for each run */ seeds = (long *)xmalloc((size_t)opt_mcmc_runs * sizeof(long)); for (i = 0; i < opt_mcmc_runs; ++i) seeds[i] = nrand48(global_xsubi); if (opt_mcmc_runs == 1) seeds[0] = opt_seed; /* initialize states for random number generators */ rstates = (unsigned short **)xmalloc((size_t)opt_mcmc_runs * sizeof(unsigned short *)); for (i = 0; i < opt_mcmc_runs; ++i) rstates[i] = (unsigned short *)xmalloc(3*sizeof(unsigned short *)); /* initialize a pseudo-random number generator for each run */ for (i = 0; i < opt_mcmc_runs; ++i) random_init(rstates[i], seeds[i]); /* create an array for storing the sum of support values for each node across all MCMC runs */ double * combined_val; combined_val = (double *)xmalloc((size_t)(root->leaves-1) * sizeof(double)); memset(combined_val,0,(root->leaves-1)*sizeof(double)); rtree_t ** inner_node_list = (rtree_t **)xmalloc((size_t)(root->leaves-1) * sizeof(rtree_t *)); /* execute each run sequentially */ for (i = 0; i < opt_mcmc_runs; ++i) { dp_init(trees[i]); dp_set_pernode_spec_edges(trees[i]); if 
(!opt_quiet) fprintf(stdout, "\nMCMC run %ld...\n", i); aic_mcmc(trees[i], method, rstates[i], seeds[i], mcmc_min_logl+i, mcmc_max_logl+i); dp_free(trees[i]); /* add up support values */ rtree_query_innernodes(trees[i], inner_node_list); for (j = 0; j < trees[i]->leaves-1; ++j) combined_val[j] += inner_node_list[j]->support; /* print SVG log-likelihood landscape of current run given its generated seed */ if (opt_mcmc_log) { svg_landscape(mcmc_min_logl[i], mcmc_max_logl[i], seeds[i]); } /* output SVG tree with support values for current run */ char * newick = rtree_export_newick(trees[i]); if (!opt_quiet) fprintf(stdout, "Creating tree with support values in %s.%ld.tree ...\n", opt_outfile, seeds[i]); FILE * newick_fp = open_file_ext("tree", seeds[i]); fprintf(newick_fp, "%s\n", newick); fclose(newick_fp); cmd_svg(trees[i], seeds[i], "svg"); free(newick); } /* compute the min and max log-l values among all runs */ double min_logl = mcmc_min_logl[0]; double max_logl = mcmc_max_logl[0]; for (i = 1; i < opt_mcmc_runs; ++i) { if (mcmc_min_logl[i] < min_logl) min_logl = mcmc_min_logl[i]; if (mcmc_max_logl[i] > max_logl) max_logl = mcmc_max_logl[i]; } /* generate the SVG log-likelihood landscape for all runs combined */ if (!opt_quiet && opt_mcmc_log && (opt_mcmc_runs > 1)) fprintf(stdout, "\nPreparing overall log-likelihood landscape ...\n"); if (opt_mcmc_log && (opt_mcmc_runs > 1)) svg_landscape_combined(min_logl, max_logl, opt_mcmc_runs, seeds); /* free min and max logl arrays */ free(mcmc_min_logl); free(mcmc_max_logl); /* allocate memory for support values */ double ** support = (double **)xmalloc((size_t)opt_mcmc_runs * sizeof(double *)); int support_count = 0; for (i = 0; i < opt_mcmc_runs; ++i) { support[i] = (double *)xmalloc((size_t)(trees[i]->leaves) * sizeof(double)); support_count = extract_support(trees[i], support[i]); rtree_destroy(trees[i]); } /* compute ML tree */ dp_init(mltree); dp_set_pernode_spec_edges(mltree); dp_ptp(mltree, method); int * mlcroots 
= (int *)xmalloc((size_t)(mltree->leaves) * sizeof(int)); int croots_count = extract_croots(mltree, mlcroots); /* If any of the two following conditions hold then the ML solution is the null-model in the following form: 0 : we have n species (n = tips) -1 : we have one species In this case, ASV is not informative and hence it is skipped */ if (croots_count == 0 || croots_count == -1) fprintf(stderr, "WARNING: ML delimitation is the null-model - ASV is skipped\n"); else { for (i = 0; i < opt_mcmc_runs; ++i) { printf("ML average support based on run with seed %ld : %.17f\n", seeds[i], asv(mlcroots, support[i], support_count)); } } dp_free(mltree); rtree_destroy(mltree); free(mlcroots); /* compute the standard deviation of each support value given the runs, and then compute a consensus average standard deviation for all support values */ double mean, var, stdev, avg_stdev = 0; for (i = 0; i < support_count; ++i) { int j; mean = var = stdev = 0; for (j = 0; j < opt_mcmc_runs; ++j) mean += support[j][i]; mean /= opt_mcmc_runs; for (j = 0; j < opt_mcmc_runs; ++j) var += (mean - support[j][i])*(mean - support[j][i]); var /= opt_mcmc_runs; stdev = sqrt(var); avg_stdev += stdev; } avg_stdev /= support_count; if (!opt_quiet) printf("Average standard deviation of support values among runs: %f\n", avg_stdev); /* compute the combined support values */ for (j = 0; j < ctree->leaves-1; ++j) combined_val[j] /= opt_mcmc_runs; /* query inner nodes and set the combined support values */ rtree_query_innernodes(ctree, inner_node_list); for (j = 0; j < ctree->leaves-1; ++j) inner_node_list[j]->support = combined_val[j]; /* deallocate the structures */ free(inner_node_list); free(combined_val); /* export the combined tree */ char * newick = rtree_export_newick(ctree); if (!opt_quiet) fprintf(stdout, "Creating tree with combined support values in %s.%ld.combined.tree ...\n", opt_outfile, opt_seed); /* open, write, close, free newick */ FILE * newick_fp = open_file_ext("combined.tree", 
opt_seed); fprintf(newick_fp, "%s\n", newick); fclose(newick_fp); free(newick); /* create an SVG of the combined tree with support values */ cmd_svg(ctree, opt_seed, "combined.svg"); /* destroy combined tree */ rtree_destroy(ctree); /* deallocate support values array */ for (i = 0; i < opt_mcmc_runs; ++i) free(support[i]); free(support); /* deallocate all cloned trees (except from the original) */ for (i = 0; i < opt_mcmc_runs; ++i) free(rstates[i]); free(rstates); free(seeds); free(trees); } mptp-0.2.2/src/output.c000066400000000000000000000044071304415103400147660ustar00rootroot00000000000000/* Copyright (C) 2015 Tomas Flouri, Sarah Lutteropp This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . 
Contact: Tomas Flouri , Heidelberg Institute for Theoretical Studies, Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany */ #include "mptp.h" FILE * open_file_ext(const char * extension, long seed) { char * filename = NULL; if (opt_mcmc) { if (asprintf(&filename, "%s.%ld.%s", opt_outfile, seed, extension) == -1) fatal("Unable to allocate enough memory."); } else { if (asprintf(&filename, "%s.%s", opt_outfile, extension) == -1) fatal("Unable to allocate enough memory."); } FILE * out = xopen(filename,"w"); free(filename); return out; } void output_info(FILE * out, long method, double nullmodel_logl, double logl, double pvalue, int lrt_result, rtree_t * root, unsigned int species_count) { fprintf(out, "Command: %s\n", cmdline); fprintf(out, "Number of edges greater than minimum branch length: %d / %d\n", root->edge_count, 2 * root->leaves - 2); fprintf(out, "Null-model score: %.6f\n", nullmodel_logl); fprintf(out, "Best score for %s coalescent rate: %.6f\n", (method == PTP_METHOD_SINGLE) ? "single" : "multi", logl); #ifdef HAVE_LIBGSL if (method == PTP_METHOD_SINGLE) { fprintf(out, "LRT computed p-value: %.6f\n", pvalue); fprintf(out, "LRT: %s\n", lrt_result ? "passed" : "failed"); } #endif fprintf(out, "Number of delimited species: %d\n", species_count); } mptp-0.2.2/src/parse_rtree.y000066400000000000000000000105021304415103400157600ustar00rootroot00000000000000/* Copyright (C) 2015 Tomas Flouri This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. 
You should have received a copy of the GNU Affero General Public License along with this program. If not, see . Contact: Tomas Flouri , Heidelberg Institute for Theoretical Studies, Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany */ %{ #include "mptp.h" extern int rtree_lex(); extern FILE * rtree_in; extern void rtree_lex_destroy(); void rtree_destroy(rtree_t * root) { if (!root) return; rtree_destroy(root->left); rtree_destroy(root->right); if (root->data) free(root->data); free(root->label); free(root); } static void rtree_error(rtree_t * tree, const char * s) { } %} %union { char * s; char * d; struct rtree_s * tree; } %error-verbose %parse-param {struct rtree_s * tree} %destructor { rtree_destroy($$); } subtree %destructor { free($$); } STRING %destructor { free($$); } NUMBER %destructor { free($$); } label %token OPAR %token CPAR %token COMMA %token COLON SEMICOLON %token STRING %token NUMBER %type label optional_label %type number optional_length %type subtree %start input %% input: OPAR subtree COMMA subtree CPAR optional_label optional_length SEMICOLON { tree->left = $2; tree->right = $4; tree->label = $6; tree->length = $7 ? atof($7) : 0; tree->leaves = $2->leaves + $4->leaves; tree->parent = NULL; tree->event = EVENT_COALESCENT; tree->data = NULL; free($7); tree->left->parent = tree; tree->right->parent = tree; tree->edge_count = $2->edge_count + $4->edge_count; tree->edgelen_sum = $2->edgelen_sum + $4->edgelen_sum; if ($2->length > opt_minbr) { tree->edge_count++; tree->edgelen_sum += $2->length; } if ($4->length > opt_minbr) { tree->edge_count++; tree->edgelen_sum += $4->length; } tree->max_species_count = 1; if (tree->edge_count > 0) tree->max_species_count = $2->max_species_count + $4->max_species_count; tree->mark = 0; }; subtree: OPAR subtree COMMA subtree CPAR optional_label optional_length { $$ = (rtree_t *)calloc(1, sizeof(rtree_t)); $$->left = $2; $$->right = $4; $$->label = $6; $$->length = $7 ? 
atof($7) : 0; $$->leaves = $2->leaves + $4->leaves; $$->event = EVENT_COALESCENT; free($7); $$->left->parent = $$; $$->right->parent = $$; $$->edge_count = $2->edge_count + $4->edge_count; $$->edgelen_sum = $2->edgelen_sum + $4->edgelen_sum; if ($2->length > opt_minbr) { $$->edge_count++; $$->edgelen_sum += $2->length; } if ($4->length > opt_minbr) { $$->edge_count++; $$->edgelen_sum += $4->length; } $$->max_species_count = 1; if ($$->edge_count > 0) $$->max_species_count = $2->max_species_count + $4->max_species_count; $$->mark = 0; $$->data = NULL; } | label optional_length { $$ = (rtree_t *)calloc(1, sizeof(rtree_t)); $$->label = $1; $$->length = $2 ? atof($2) : 0; $$->left = NULL; $$->right = NULL; $$->leaves = 1; $$->event = EVENT_COALESCENT; $$->edge_count = 0; $$->edgelen_sum = 0; $$->max_species_count = 1; $$->mark = 0; $$->data = NULL; free($2); }; optional_label: {$$ = NULL;} | label {$$ = $1;}; optional_length: {$$ = NULL;} | COLON number {$$ = $2;}; label: STRING {$$=$1;} | NUMBER {$$=$1;}; number: NUMBER {$$=$1;}; %% rtree_t * rtree_parse_newick(const char * filename) { struct rtree_s * tree; tree = (rtree_t *)calloc(1, sizeof(rtree_t)); rtree_in = fopen(filename, "r"); if (!rtree_in) { rtree_destroy(tree); snprintf(errmsg, 200, "Unable to open file (%s)", filename); return NULL; } else if (rtree_parse(tree)) { rtree_destroy(tree); tree = NULL; fclose(rtree_in); rtree_lex_destroy(); return NULL; } if (rtree_in) fclose(rtree_in); rtree_lex_destroy(); return tree; } mptp-0.2.2/src/parse_utree.y000066400000000000000000000121101304415103400157600ustar00rootroot00000000000000/* Copyright (C) 2015 Tomas Flouri This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . Contact: Tomas Flouri , Heidelberg Institute for Theoretical Studies, Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany */ %{ #include "mptp.h" extern int utree_lex(); extern FILE * utree_in; extern void utree_lex_destroy(); static unsigned int tip_cnt = 0; static void dealloc_tree_recursive(utree_t * node) { if (!node->next) { free(node->label); free(node); return; } dealloc_tree_recursive(node->next->back); dealloc_tree_recursive(node->next->next->back); free(node->next->next); free(node->next); free(node->label); free(node); } void utree_destroy(utree_t * root) { if (!root) return; if (!(root->next)) { free(root->label); free(root); return; } if (root->next) dealloc_tree_recursive(root->next->back); if (root->next->next) dealloc_tree_recursive(root->next->next->back); if (root->back) dealloc_tree_recursive(root->back); free(root->label); free(root->next->next); free(root->next); free(root); } static void utree_error(utree_t * tree, const char * s) { } %} %union { char * s; char * d; struct utree_s * tree; } %error-verbose %parse-param {struct utree_s * tree} %destructor { utree_destroy($$); } subtree %token OPAR %token CPAR %token COMMA %token COLON SEMICOLON %token STRING %token NUMBER %type label optional_label %type number optional_length %type subtree %start input %% input: OPAR subtree COMMA subtree COMMA subtree CPAR optional_label optional_length SEMICOLON { tree->next = (utree_t *)calloc(1, sizeof(utree_t)); tree->next->next = (utree_t *)calloc(1, sizeof(utree_t)); tree->next->next->next = tree; tree->back = $2; tree->next->back = $4; tree->next->next->back = $6; $2->back = tree; $4->back = tree->next; 
$6->back = tree->next->next; tree->label = $8; tree->next->label = $8; tree->next->next->label = $8; tree->length = $2->length; tree->next->length = $4->length; tree->next->next->length = $6->length; tree->height = ($2->height > $4->height) ? (($2->height > $6->height) ? $2->height + 1 : $6->height + 1) : (($4->height > $6->height) ? $4->height + 1 : $6->height + 1); tree->next->height = tree->height; tree->next->next->height = tree->height; free($9); }; subtree: OPAR subtree COMMA subtree CPAR optional_label optional_length { $$ = (utree_t *)calloc(1, sizeof(utree_t)); $$->next = (utree_t *)calloc(1, sizeof(utree_t)); $$->next->next = (utree_t *)calloc(1, sizeof(utree_t)); $$->next->next->next = $$; $$->next->back = $2; $$->next->next->back = $4; $2->back = $$->next; $4->back = $$->next->next; $$->label = $6; $$->next->label = $6; $$->next->next->label = $6; $$->length = $7 ? atof($7) : 0; $$->height = ($2->height > $4->height) ? $2->height + 1 : $4->height + 1; $$->next->height = $$->height; $$->next->next->height = $$->height; $$->mark = 0; $$->next->mark = 0; $$->next->next->mark = 0; free($7); $$->next->length = $2->length; $$->next->next->length = $4->length; } | label optional_length { $$ = (utree_t *)calloc(1, sizeof(utree_t)); $$->label = $1; $$->length = $2 ? 
atof($2) : 0; $$->next = NULL; $$->height = 0; $$->mark = 0; tip_cnt++; free($2); }; optional_label: { $$ = NULL;} | label {$$ = $1;}; optional_length: { $$ = NULL;} | COLON number {$$ = $2;}; label: STRING { $$=$1;} | NUMBER {$$=$1;}; number: NUMBER { $$=$1;}; %% utree_t * utree_parse_newick(const char * filename, unsigned int * tip_count) { struct utree_s * tree; /* reset tip count */ tip_cnt = 0; tree = (utree_t *)calloc(1, sizeof(utree_t)); utree_in = fopen(filename, "r"); if (!utree_in) { utree_destroy(tree); snprintf(errmsg, 200, "Unable to open file (%s)", filename); return NULL; } else if (utree_parse(tree)) { utree_destroy(tree); tree = NULL; fclose(utree_in); utree_lex_destroy(); return NULL; } if (utree_in) fclose(utree_in); utree_lex_destroy(); *tip_count = tip_cnt; return tree; } mptp-0.2.2/src/python/000077500000000000000000000000001304415103400145765ustar00rootroot00000000000000mptp-0.2.2/src/python/compare.py000077500000000000000000000050571304415103400166100ustar00rootroot00000000000000#! /usr/bin/env python import commands import time def evaluate(treeFile, rooted): cmd_multi = './delimit --ptp_multi --tree_file ' + treeFile + ' --output_file foo' cmd_single = './delimit --ptp_single --tree_file ' + treeFile + ' --output_file foo' cmd_ptp_rooted = './PTP/PTP.py -t ' + treeFile + ' -p -minbr 0 -o output -pvalue 1' cmd_ptp_unrooted = './PTP/PTP.py -t ' + treeFile + ' -p -minbr 0 -o output -pvalue 1 -r' if (rooted): programs = [cmd_multi, cmd_single, cmd_ptp_rooted] cmd_ptp = cmd_ptp_rooted else: programs = [cmd_multi, cmd_single, cmd_ptp_unrooted] cmd_ptp = cmd_ptp_unrooted scores = {} times = {} print "Testing " + treeFile + "..." 
# cmd_ptp: ts = time.time() ( stat, output ) = commands.getstatusoutput(cmd_ptp) te = time.time() times['ptp'] = te-ts #print output left = output.find("MAX logl: ") right = output[left+10:].find("\n") score = output[left+10:right+left+10] scores['ptp'] = score # cmd_multi: ts = time.time() ( stat, output ) = commands.getstatusoutput(cmd_multi) te = time.time() times['multi'] = te-ts #print output left = output.find("Best score found single: ") right = output[left+25:].find("\n") score = output[left+25:right+left+25] scores['multi'] = score # cmd_single: ts = time.time() ( stat, output ) = commands.getstatusoutput(cmd_single) te = time.time() times['single'] = te-ts #print output left = output.find("Best score found single: ") right = output[left+25:].find("\n") score = output[left+25:right+left+25] scores['single'] = score print 'scores: ' print scores print 'times: ' print times print '\n' return scores def compare_rooted(): with open('tree_names_rooted') as f_rooted: content = f_rooted.read().splitlines() #gnuplotOut = open('workfile', 'w') for i in range (0, len(content)): scores = evaluate('trees/' + content[i], True) #gnuplotOut.write(str(i) + ' ' + scores['ptp'] + ' ' + scores['multi'] + ' ' + scores['single'] + '\n') #print evaluate('trees/' + name) #gnuplotOut.close() #commands.getstatusoutput('gnuplot plotscript') f_rooted.close() def compare_unrooted(): with open('tree_names_unrooted') as f_unrooted: content = f_unrooted.read().splitlines() #gnuplotOut = open('workfile', 'w') for i in range (0, len(content)): scores = evaluate('trees/' + content[i], False) #gnuplotOut.write(str(i) + ' ' + scores['ptp'] + ' ' + scores['multi'] + ' ' + scores['single'] + '\n') #print evaluate('trees/' + name) #gnuplotOut.close() #commands.getstatusoutput('gnuplot plotscript') f_unrooted.close() compare_unrooted() compare_rooted() mptp-0.2.2/src/python/create_delimit_results.py000077500000000000000000000072461304415103400217170ustar00rootroot00000000000000#! 
/usr/bin/env python import os import commands def run_delimit_on_data(input_tree_file, output_delimit_single_minbr_0_file, output_delimit_multi_minbr_0_file, output_delimit_single_minbr_default_file, output_delimit_multi_minbr_default_file): try: open(input_tree_file) if not os.path.exists(os.path.dirname(output_delimit_single_minbr_0_file)): os.makedirs(os.path.dirname(output_delimit_single_minbr_0_file)) if not os.path.exists(os.path.dirname(output_delimit_multi_minbr_0_file)): os.makedirs(os.path.dirname(output_delimit_multi_minbr_0_file)) if not os.path.exists(os.path.dirname(output_delimit_single_minbr_default_file)): os.makedirs(os.path.dirname(output_delimit_single_minbr_default_file)) if not os.path.exists(os.path.dirname(output_delimit_multi_minbr_default_file)): os.makedirs(os.path.dirname(output_delimit_multi_minbr_default_file)) delimit_single_minbr_0_call = "./delimit --ml_single --min_br 0 --tree_file " + input_tree_file + " --output_file foo" delimit_multi_minbr_0_call = "./delimit --ml_multi --min_br 0 --tree_file " + input_tree_file + " --output_file foo" delimit_single_minbr_default_call = "./delimit --ml_single --tree_file " + input_tree_file + " --output_file foo" delimit_multi_minbr_default_call = "./delimit --ml_multi --tree_file " + input_tree_file + " --output_file foo" (stat_single_minbr_0, output_single_minbr_0) = commands.getstatusoutput(delimit_single_minbr_0_call) (stat_multi_minbr_0, output_multi_minbr_0) = commands.getstatusoutput(delimit_multi_minbr_0_call) (stat_single_minbr_default, output_single_minbr_default) = commands.getstatusoutput(delimit_single_minbr_default_call) (stat_multi_minbr_default, output_multi_minbr_default) = commands.getstatusoutput(delimit_multi_minbr_default_call) delimit_single_minbr_0_out = open(output_delimit_single_minbr_0_file, 'w') delimit_multi_minbr_0_out = open(output_delimit_multi_minbr_0_file, 'w') delimit_single_minbr_default_out = open(output_delimit_single_minbr_default_file, 'w') 
delimit_multi_minbr_default_out = open(output_delimit_multi_minbr_default_file, 'w') delimit_single_minbr_0_out.write(output_single_minbr_0) delimit_multi_minbr_0_out.write(output_multi_minbr_0) delimit_single_minbr_default_out.write(output_single_minbr_default) delimit_multi_minbr_default_out.write(output_multi_minbr_default) delimit_single_minbr_0_out.close() delimit_multi_minbr_0_out.close() delimit_single_minbr_default_out.close() delimit_multi_minbr_default_out.close() except IOError: print "File not found: " + input_tree_file set_names = ["Ne10000", "Ne100000", "Ne500000", "Ne1000000"] for set_name in set_names: for i in range(1,101): input_tree_file = "similar_to_GMYC/15-08-2015.16-40/set_BIRTH0.27_" + set_name + "/rooted.RAxML_result.inferred.simulated_set_BIRTH0.27_" + set_name + "_" + str(i) + ".phy" output_delimit_single_minbr_0_file = "similar_to_GMYC_delimit_single_minbr_0/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt" output_delimit_multi_minbr_0_file = "similar_to_GMYC_delimit_multi_minbr_0/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt" output_delimit_single_minbr_default_file = "similar_to_GMYC_delimit_single_minbr_default/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt" output_delimit_multi_minbr_default_file = "similar_to_GMYC_delimit_multi_minbr_default/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt" run_delimit_on_data(input_tree_file, output_delimit_single_minbr_0_file, output_delimit_multi_minbr_0_file, output_delimit_single_minbr_default_file, output_delimit_multi_minbr_default_file) mptp-0.2.2/src/python/create_delimit_results_simu_data.py000077500000000000000000000071071304415103400237410ustar00rootroot00000000000000#! 
/usr/bin/env python import os import commands def run_delimit_on_data(input_tree_file, output_delimit_single_minbr_0_file, output_delimit_multi_minbr_0_file, output_delimit_single_minbr_default_file, output_delimit_multi_minbr_default_file): try: open(input_tree_file) if not os.path.exists(os.path.dirname(output_delimit_single_minbr_0_file)): os.makedirs(os.path.dirname(output_delimit_single_minbr_0_file)) if not os.path.exists(os.path.dirname(output_delimit_multi_minbr_0_file)): os.makedirs(os.path.dirname(output_delimit_multi_minbr_0_file)) if not os.path.exists(os.path.dirname(output_delimit_single_minbr_default_file)): os.makedirs(os.path.dirname(output_delimit_single_minbr_default_file)) if not os.path.exists(os.path.dirname(output_delimit_multi_minbr_default_file)): os.makedirs(os.path.dirname(output_delimit_multi_minbr_default_file)) delimit_single_minbr_0_call = "./delimit --ml_single --min_br 0 --tree_file " + input_tree_file + " --output_file foo" delimit_multi_minbr_0_call = "./delimit --ml_multi --min_br 0 --tree_file " + input_tree_file + " --output_file foo" delimit_single_minbr_default_call = "./delimit --ml_single --tree_file " + input_tree_file + " --output_file foo" delimit_multi_minbr_default_call = "./delimit --ml_multi --tree_file " + input_tree_file + " --output_file foo" (stat_single_minbr_0, output_single_minbr_0) = commands.getstatusoutput(delimit_single_minbr_0_call) (stat_multi_minbr_0, output_multi_minbr_0) = commands.getstatusoutput(delimit_multi_minbr_0_call) (stat_single_minbr_default, output_single_minbr_default) = commands.getstatusoutput(delimit_single_minbr_default_call) (stat_multi_minbr_default, output_multi_minbr_default) = commands.getstatusoutput(delimit_multi_minbr_default_call) delimit_single_minbr_0_out = open(output_delimit_single_minbr_0_file, 'w') delimit_multi_minbr_0_out = open(output_delimit_multi_minbr_0_file, 'w') delimit_single_minbr_default_out = open(output_delimit_single_minbr_default_file, 'w') 
delimit_multi_minbr_default_out = open(output_delimit_multi_minbr_default_file, 'w') delimit_single_minbr_0_out.write(output_single_minbr_0) delimit_multi_minbr_0_out.write(output_multi_minbr_0) delimit_single_minbr_default_out.write(output_single_minbr_default) delimit_multi_minbr_default_out.write(output_multi_minbr_default) delimit_single_minbr_0_out.close() delimit_multi_minbr_0_out.close() delimit_single_minbr_default_out.close() delimit_multi_minbr_default_out.close() except IOError: print "File not found: " + input_tree_file set_names = ["Ne1e+05", "Ne1e+06", "Ne5e+05", "Ne10000"] for set_name in set_names: for i in range(1,101): input_tree_file = "SimulB_C_trees/set_" + set_name + "/SimulB_C_tree_set_" + set_name + "." + str(i) + ".txt" output_delimit_single_minbr_0_file = "SimulB_C_delimit_single_minbr_0/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt" output_delimit_multi_minbr_0_file = "SimulB_C_delimit_multi_minbr_0/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt" output_delimit_single_minbr_default_file = "SimulB_C_delimit_single_minbr_default/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt" output_delimit_multi_minbr_default_file = "SimulB_C_delimit_multi_minbr_default/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt" run_delimit_on_data(input_tree_file, output_delimit_single_minbr_0_file, output_delimit_multi_minbr_0_file, output_delimit_single_minbr_default_file, output_delimit_multi_minbr_default_file) mptp-0.2.2/src/python/create_scoring_results.py000077500000000000000000000426461304415103400217370ustar00rootroot00000000000000#! 
/usr/bin/env python import os import commands def extract_tree_score(input_text): lines = input_text.split('\n') for line in lines: if line.startswith("Tree penalty score:"): return int(line.split(': ')[1]) break def extract_nmi_score(input_text): lines = input_text.split('\n') for line in lines: if line.startswith("NMI score:"): return float(line.split(': ')[1]) break def extract_num_species(input_text): lines = input_text.split('\n') for line in lines: if line.startswith("Number of species in input file:"): return int(line.split(': ')[1]) if (int(line.split(': ')[1]) == 1): print "Baaaaad data" break def extract_num_real_species(input_text): lines = input_text.split('\n') for line in lines: if line.startswith("Number of real species:"): return int(line.split(': ')[1]) break def extract_score_real_single(input_text): lines = input_text.split('\n') for line in lines: if line.startswith("Score real single:"): return float(line.split(': ')[1]) break def extract_score_real_multi(input_text): lines = input_text.split('\n') for line in lines: if line.startswith("Score real multi:"): return float(line.split(': ')[1]) break def extract_score_input_single(input_text): lines = input_text.split('\n') for line in lines: if line.startswith("Score input single:"): return float(line.split(': ')[1]) break def extract_score_input_multi(input_text): lines = input_text.split('\n') for line in lines: if line.startswith("Score input multi:"): return float(line.split(': ')[1]) break def grab_scorings(input_tree_file, output_delimit_single_minbr_0, output_delimit_multi_minbr_0, output_delimit_single_minbr_default, output_delimit_multi_minbr_default, output_PTP_minbr_0): try: open(input_tree_file) programNames = ['delimit_single_minbr_0', 'delimit_multi_minbr_0', 'delimit_single_minbr_default', 'delimit_multi_minbr_default', 'PTP_minbr_0'] tree_scores = {} nmi_scores = {} num_species = {} single_scores = {} multi_scores = {} num_real_species = 0 score_real_single_minbr_0 = 0 
score_real_multi_minbr_0 = 0 score_real_single_minbr_default = 0 score_real_multi_minbr_default = 0 tree_scores['delimit_single_minbr_0'] = extract_tree_score(output_delimit_single_minbr_0) tree_scores['delimit_multi_minbr_0'] = extract_tree_score(output_delimit_multi_minbr_0) tree_scores['delimit_single_minbr_default'] = extract_tree_score(output_delimit_single_minbr_default) tree_scores['delimit_multi_minbr_default'] = extract_tree_score(output_delimit_multi_minbr_default) tree_scores['PTP_minbr_0'] = extract_tree_score(output_PTP_minbr_0) nmi_scores['delimit_single_minbr_0'] = extract_nmi_score(output_delimit_single_minbr_0) nmi_scores['delimit_multi_minbr_0'] = extract_nmi_score(output_delimit_multi_minbr_0) nmi_scores['delimit_single_minbr_default'] = extract_nmi_score(output_delimit_single_minbr_default) nmi_scores['delimit_multi_minbr_default'] = extract_nmi_score(output_delimit_multi_minbr_default) nmi_scores['PTP_minbr_0'] = extract_nmi_score(output_PTP_minbr_0) num_species['delimit_single_minbr_0'] = extract_num_species(output_delimit_single_minbr_0) num_species['delimit_multi_minbr_0'] = extract_num_species(output_delimit_multi_minbr_0) num_species['delimit_single_minbr_default'] = extract_num_species(output_delimit_single_minbr_default) num_species['delimit_multi_minbr_default'] = extract_num_species(output_delimit_multi_minbr_default) num_species['PTP_minbr_0'] = extract_num_species(output_PTP_minbr_0) single_scores['delimit_single_minbr_0'] = extract_score_input_single(output_delimit_single_minbr_0) single_scores['delimit_multi_minbr_0'] = extract_score_input_single(output_delimit_multi_minbr_0) single_scores['delimit_single_minbr_default'] = extract_score_input_single(output_delimit_single_minbr_default) single_scores['delimit_multi_minbr_default'] = extract_score_input_single(output_delimit_multi_minbr_default) single_scores['PTP_minbr_0'] = extract_score_input_single(output_PTP_minbr_0) multi_scores['delimit_single_minbr_0'] = 
extract_score_input_multi(output_delimit_single_minbr_0) multi_scores['delimit_multi_minbr_0'] = extract_score_input_multi(output_delimit_multi_minbr_0) multi_scores['delimit_single_minbr_default'] = extract_score_input_multi(output_delimit_single_minbr_default) multi_scores['delimit_multi_minbr_default'] = extract_score_input_multi(output_delimit_multi_minbr_default) multi_scores['PTP_minbr_0'] = extract_score_input_multi(output_PTP_minbr_0) score_real_single_minbr_0 = extract_score_real_single(output_delimit_single_minbr_0) score_real_multi_minbr_0 = extract_score_real_multi(output_delimit_single_minbr_0) score_real_single_minbr_default = extract_score_real_single(output_delimit_single_minbr_default) score_real_multi_minbr_default = extract_score_real_multi(output_delimit_single_minbr_default) num_real_species = extract_num_real_species(output_delimit_single_minbr_0) return (tree_scores, nmi_scores, num_species, single_scores, multi_scores, score_real_single_minbr_0, score_real_multi_minbr_0, score_real_single_minbr_default, score_real_multi_minbr_default, num_real_species) except IOError: print "File not found: " + input_tree_file def create_scoring_results(input_tree_file, input_delimit_single_minbr_0_file, input_delimit_multi_minbr_0_file, input_delimit_single_minbr_default_file, input_delimit_multi_minbr_default_file, input_PTP_minbr_0_file, output_delimit_single_minbr_0_file, output_delimit_multi_minbr_0_file, output_delimit_single_minbr_default_file, output_delimit_multi_minbr_default_file, output_PTP_minbr_0_file): try: open(input_tree_file) if not os.path.exists(os.path.dirname(output_delimit_single_minbr_0_file)): os.makedirs(os.path.dirname(output_delimit_single_minbr_0_file)) if not os.path.exists(os.path.dirname(output_delimit_multi_minbr_0_file)): os.makedirs(os.path.dirname(output_delimit_multi_minbr_0_file)) if not os.path.exists(os.path.dirname(output_delimit_single_minbr_default_file)): 
os.makedirs(os.path.dirname(output_delimit_single_minbr_default_file)) if not os.path.exists(os.path.dirname(output_delimit_multi_minbr_default_file)): os.makedirs(os.path.dirname(output_delimit_multi_minbr_default_file)) if not os.path.exists(os.path.dirname(output_PTP_minbr_0_file)): os.makedirs(os.path.dirname(output_PTP_minbr_0_file)) call_delimit_single_minbr_0 = "./delimit --score " + input_delimit_single_minbr_0_file + " --min_br 0 --tree_file " + input_tree_file + " --output_file foo" call_delimit_multi_minbr_0 = "./delimit --score " + input_delimit_multi_minbr_0_file + " --min_br 0 --tree_file " + input_tree_file + " --output_file foo" call_delimit_single_minbr_default = "./delimit --score " + input_delimit_single_minbr_default_file + " --tree_file " + input_tree_file + " --output_file foo" call_delimit_multi_minbr_default = "./delimit --score " + input_delimit_multi_minbr_default_file + " --tree_file " + input_tree_file + " --output_file foo" call_PTP_minbr_0 = "./delimit --score " + input_PTP_minbr_0_file + " --min_br 0 --tree_file " + input_tree_file + " --output_file foo" (stat_delimit_single_minbr_0, output_delimit_single_minbr_0) = commands.getstatusoutput(call_delimit_single_minbr_0) (stat_delimit_multi_minbr_0, output_delimit_multi_minbr_0) = commands.getstatusoutput(call_delimit_multi_minbr_0) (stat_delimit_single_minbr_default, output_delimit_single_minbr_default) = commands.getstatusoutput(call_delimit_single_minbr_default) (stat_delimit_multi_minbr_default, output_delimit_multi_minbr_default) = commands.getstatusoutput(call_delimit_multi_minbr_default) (stat_PTP_minbr_0, output_PTP_minbr_0) = commands.getstatusoutput(call_PTP_minbr_0) delimit_single_minbr_0_out = open(output_delimit_single_minbr_0_file, 'w') delimit_multi_minbr_0_out = open(output_delimit_multi_minbr_0_file, 'w') delimit_single_minbr_default_out = open(output_delimit_single_minbr_default_file, 'w') delimit_multi_minbr_default_out = open(output_delimit_multi_minbr_default_file, 
'w') PTP_minbr_0_out = open(output_PTP_minbr_0_file, 'w') delimit_single_minbr_0_out.write(output_delimit_single_minbr_0) delimit_multi_minbr_0_out.write(output_delimit_multi_minbr_0) delimit_single_minbr_default_out.write(output_delimit_single_minbr_default) delimit_multi_minbr_default_out.write(output_delimit_multi_minbr_default) PTP_minbr_0_out.write(output_PTP_minbr_0) delimit_single_minbr_0_out.close() delimit_multi_minbr_0_out.close() delimit_single_minbr_default_out.close() delimit_multi_minbr_default_out.close() PTP_minbr_0_out.close() return grab_scorings(input_tree_file, output_delimit_single_minbr_0, output_delimit_multi_minbr_0, output_delimit_single_minbr_default, output_delimit_multi_minbr_default, output_PTP_minbr_0) except IOError: print "File not found: " + input_tree_file set_names = ["1", "5", "10", "20", "40", "80", "160"] names = ['delimit_single_minbr_0', 'delimit_multi_minbr_0', 'delimit_single_minbr_default', 'delimit_multi_minbr_default', 'PTP_minbr_0'] gnuplotOut_tree_scores = open('workfile_tree_scores', 'w') gnuplotOut_nmi_scores = open('workfile_nmi_scores', 'w') gnuplotOut_single_scores = open('workfile_single_scores', 'w') gnuplotOut_multi_scores = open('workfile_multi_scores', 'w') gnuplotOut_num_species = open('workfile_num_species', 'w') for set_name in set_names: num_valid_indices = 0 average_tree_scores = {} average_nmi_scores = {} average_num_species = {} average_single_scores = {} average_multi_scores = {} average_real_num_species = 0 average_real_score_single_minbr_0 = 0 average_real_score_multi_minbr_0 = 0 average_real_score_single_minbr_default = 0 average_real_score_multi_minbr_default = 0 for name in names: average_tree_scores[name] = 0 average_nmi_scores[name] = 0 average_num_species[name] = 0 average_single_scores[name] = 0 average_multi_scores[name] = 0 for i in range(1,101): if (set_name == "1"): input_tree_file = "unique_taxa_trees_big_dataset/set_" + set_name + 
"/RAxML_inferred_trees_unique_taxa/rooted.inferred_unique_taxa." + str(i) else: input_tree_file = "unique_taxa_trees_big_dataset/set_" + set_name + "/RAxML_inferred_trees_unique_taxa/rooted.inferred_unique_taxa_set_" + set_name + "." + str(i) try: open(input_tree_file) input_delimit_single_minbr_0_file = "unique_taxa_big_delimit_single_minbr_0/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt" input_delimit_multi_minbr_0_file = "unique_taxa_big_delimit_multi_minbr_0/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt" input_delimit_single_minbr_default_file = "unique_taxa_big_delimit_single_minbr_default/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt" input_delimit_multi_minbr_default_file = "unique_taxa_big_delimit_multi_minbr_default/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt" input_PTP_minbr_0_file = "unique_taxa_big_PTP_minbr_0/set_" + set_name + "/PTP_results_set_" + set_name + "." + str(i) + ".txt" score_path = "unique_taxa_big_scoring_results/" output_delimit_single_minbr_0_file = score_path + "delimit_single_minbr_0/set_" + set_name + "/delimit_score_set_" + set_name + "." + str(i) + ".txt" output_delimit_multi_minbr_0_file = score_path + "delimit_multi_minbr_0/set_" + set_name + "/delimit_score_set_" + set_name + "." + str(i) + ".txt" output_delimit_single_minbr_default_file = score_path + "delimit_single_minbr_default/set_" + set_name + "/delimit_score_set_" + set_name + "." + str(i) + ".txt" output_delimit_multi_minbr_default_file = score_path + "delimit_multi_minbr_default/set_" + set_name + "/delimit_score_set_" + set_name + "." + str(i) + ".txt" output_PTP_minbr_0_file = score_path + "PTP_minbr_0/set_" + set_name + "/PTP_score_set_" + set_name + "." 
+ str(i) + ".txt" (tree_scores, nmi_scores, num_species, single_scores, multi_scores, score_real_single_minbr_0, score_real_multi_minbr_0, score_real_single_minbr_default, score_real_multi_minbr_default, num_real_species) = create_scoring_results(input_tree_file, input_delimit_single_minbr_0_file, input_delimit_multi_minbr_0_file, input_delimit_single_minbr_default_file, input_delimit_multi_minbr_default_file, input_PTP_minbr_0_file, output_delimit_single_minbr_0_file, output_delimit_multi_minbr_0_file, output_delimit_single_minbr_default_file, output_delimit_multi_minbr_default_file, output_PTP_minbr_0_file) try: for name in names: average_tree_scores[name] = average_tree_scores[name] + tree_scores[name] average_nmi_scores[name] = average_nmi_scores[name] + nmi_scores[name] average_num_species[name] = average_num_species[name] + num_species[name] average_single_scores[name] = average_single_scores[name] + single_scores[name] average_multi_scores[name] = average_multi_scores[name] + multi_scores[name] average_real_num_species = average_real_num_species + num_real_species average_real_score_single_minbr_0 = average_real_score_single_minbr_0 + score_real_single_minbr_0 average_real_score_multi_minbr_0 = average_real_score_multi_minbr_0 + score_real_multi_minbr_0 average_real_score_single_minbr_default = average_real_score_single_minbr_default + score_real_single_minbr_default average_real_score_multi_minbr_default = average_real_score_multi_minbr_default + score_real_multi_minbr_default except: print "File is bad: " + input_tree_file num_valid_indices = num_valid_indices - 1 num_valid_indices = num_valid_indices + 1 except IOError: #1 print "File not found: " + input_tree_file if (num_valid_indices > 0): for name in names: average_tree_scores[name] = float(average_tree_scores[name]) / float(num_valid_indices) average_nmi_scores[name] = float(average_nmi_scores[name]) / float(num_valid_indices) average_num_species[name] = float(average_num_species[name]) / 
float(num_valid_indices) average_single_scores[name] = float(average_single_scores[name]) / float(num_valid_indices) average_multi_scores[name] = float(average_multi_scores[name]) / float(num_valid_indices) #print "Set " + set_name + ": Average tree score " + name #print average_tree_scores[name] #print "Set " + set_name + ": Average NMI score " + name #print average_nmi_scores[name] #print "Set " + set_name + ": Average num species " + name #print average_num_species[name] #print "Set " + set_name + ": Average input score single " + name #print average_single_scores[name] #print "Set " + set_name + ": Average input score multi " + name #print average_multi_scores[name] average_real_num_species = float(average_real_num_species) / float(num_valid_indices) average_real_score_single_minbr_0 = float(average_real_score_single_minbr_0) / float(num_valid_indices) average_real_score_multi_minbr_0 = float(average_real_score_multi_minbr_0) / float(num_valid_indices) average_real_score_single_minbr_default = float(average_real_score_single_minbr_default) / float(num_valid_indices) average_real_score_multi_minbr_default = float(average_real_score_multi_minbr_default) / float(num_valid_indices) #print "Set " + set_name + ": Average real num species " #print average_real_num_species #print "Set " + set_name + ": Average real score single " #print average_real_score_single #print "Set " + set_name + ": Average real score multi " #print average_real_score_multi gnuplotOut_tree_scores.write(set_name + ' ' + str(average_tree_scores['delimit_single_minbr_0']) + ' ' + str(average_tree_scores['delimit_multi_minbr_0']) + ' ' + str(average_tree_scores['delimit_single_minbr_default']) + ' ' + str(average_tree_scores['delimit_multi_minbr_default']) + ' ' + str(average_tree_scores['PTP_minbr_0']) + '\n') gnuplotOut_nmi_scores.write(set_name + ' ' + str(average_nmi_scores['delimit_single_minbr_0']) + ' ' + str(average_nmi_scores['delimit_multi_minbr_0']) + ' ' + 
str(average_nmi_scores['delimit_single_minbr_default']) + ' ' + str(average_nmi_scores['delimit_multi_minbr_default']) + ' ' + str(average_nmi_scores['PTP_minbr_0']) + '\n') gnuplotOut_single_scores.write(set_name + ' ' + str(average_single_scores['delimit_single_minbr_0']) + ' ' + str(average_single_scores['delimit_multi_minbr_0']) + ' ' + str(average_single_scores['delimit_single_minbr_default']) + ' ' + str(average_single_scores['delimit_multi_minbr_default']) + ' ' + str(average_single_scores['PTP_minbr_0']) + ' ' + str(average_real_score_single_minbr_0) + ' ' + str(average_real_score_single_minbr_default) + '\n') gnuplotOut_multi_scores.write(set_name + ' ' + str(average_multi_scores['delimit_single_minbr_0']) + ' ' + str(average_multi_scores['delimit_multi_minbr_0']) + ' ' + str(average_multi_scores['delimit_single_minbr_default']) + ' ' + str(average_multi_scores['delimit_multi_minbr_default']) + ' ' + str(average_multi_scores['PTP_minbr_0']) + ' ' + str(average_real_score_multi_minbr_0) + ' ' + str(average_real_score_multi_minbr_default) + '\n') gnuplotOut_num_species.write(set_name + ' ' + str(average_num_species['delimit_single_minbr_0']) + ' ' + str(average_num_species['delimit_multi_minbr_0']) + ' ' + str(average_num_species['delimit_single_minbr_default']) + ' ' + str(average_num_species['delimit_multi_minbr_default']) + ' ' + str(average_num_species['PTP_minbr_0']) + ' ' + str(average_real_num_species) + '\n') gnuplotOut_tree_scores.close() gnuplotOut_nmi_scores.close() gnuplotOut_single_scores.close() gnuplotOut_multi_scores.close() gnuplotOut_num_species.close() commands.getstatusoutput('gnuplot plotscript') mptp-0.2.2/src/python/create_scoring_results_with_gmyc.py000077500000000000000000000547211304415103400240060ustar00rootroot00000000000000#! 
#! /usr/bin/env python
"""Score species delimitations on simulated trees and write gnuplot data files.

For every population-size set in ``set_names`` and every tree index 1..100,
the ``./delimit`` binary is run in ``--score`` mode on the delimitation
produced by each program in ``names``.  The raw score reports are stored
under ``similar_to_GMYC_scoring_results/``, per-tree rows are written to
``workfile_<kind>_<set>`` and per-set averages to ``workfile_<kind>``.
Finally ``gnuplot plotscript`` is invoked.

The script runs unchanged under Python 2 and Python 3.
"""

import os

try:
    from commands import getstatusoutput  # Python 2
except ImportError:
    from subprocess import getstatusoutput  # Python 3


set_names = ["Ne10000", "Ne100000", "Ne500000", "Ne1000000"]

# Programs whose delimitations are scored.  The order is significant: it is
# the column order of the work files and the positional order of the
# per-program arguments of grab_scorings() / create_scoring_results().
names = ['delimit_single_minbr_0',
         'delimit_multi_minbr_0',
         'delimit_single_minbr_default',
         'delimit_multi_minbr_default',
         'PTP_minbr_default',
         'gmyc_minbr_0']

# Whether "--min_br 0" is passed to ./delimit when scoring a program's result.
# NOTE(review): PTP_minbr_default is scored with "--min_br 0" despite its
# name; this is kept exactly as in the original command line.
_MIN_BR_ZERO = {'delimit_single_minbr_0': True,
                'delimit_multi_minbr_0': True,
                'delimit_single_minbr_default': False,
                'delimit_multi_minbr_default': False,
                'PTP_minbr_default': True,
                'gmyc_minbr_0': True}

# File-name prefixes of the delimitation results that are read ...
_RESULT_PREFIX = {'delimit_single_minbr_0': 'delimit',
                  'delimit_multi_minbr_0': 'delimit',
                  'delimit_single_minbr_default': 'delimit',
                  'delimit_multi_minbr_default': 'delimit',
                  'PTP_minbr_default': 'PTP',
                  'gmyc_minbr_0': 'gmyc'}

# ... and of the score reports that are written.
# NOTE(review): the gmyc report is named "PTP_score_set_..." (probably a
# copy-paste slip); the path is left untouched because other tooling may
# read these files.
_SCORE_PREFIX = {'delimit_single_minbr_0': 'delimit',
                 'delimit_multi_minbr_0': 'delimit',
                 'delimit_single_minbr_default': 'delimit',
                 'delimit_multi_minbr_default': 'delimit',
                 'PTP_minbr_default': 'PTP',
                 'gmyc_minbr_0': 'PTP'}

_SUMMARY_KINDS = ['tree_scores', 'nmi_scores', 'single_scores',
                  'multi_scores', 'num_species']
_PER_SET_KINDS = _SUMMARY_KINDS + ['delta_species']


def _extract_field(input_text, prefix, convert):
    """Return convert(value) of the first line starting with prefix.

    The value is the text after the first ': ' of that line.  Returns None
    when no line starts with prefix.
    """
    for line in input_text.split('\n'):
        if line.startswith(prefix):
            return convert(line.split(': ')[1])
    return None


def extract_tree_score(input_text):
    """Return the integer "Tree penalty score", or None if not reported."""
    return _extract_field(input_text, "Tree penalty score:", int)


def extract_nmi_score(input_text):
    """Return the float "NMI score", or None if not reported."""
    return _extract_field(input_text, "NMI score:", float)


def extract_num_species(input_text):
    """Return the integer "Number of species in input file", or None."""
    return _extract_field(input_text, "Number of species in input file:", int)


def extract_num_real_species(input_text):
    """Return the integer "Number of real species", or None."""
    return _extract_field(input_text, "Number of real species:", int)


def extract_score_real_single(input_text):
    """Return the float "Score real single", or None if not reported."""
    return _extract_field(input_text, "Score real single:", float)


def extract_score_real_multi(input_text):
    """Return the float "Score real multi", or None if not reported."""
    return _extract_field(input_text, "Score real multi:", float)


def extract_score_input_single(input_text):
    """Return the float "Score input single", or None if not reported."""
    return _extract_field(input_text, "Score input single:", float)


def extract_score_input_multi(input_text):
    """Return the float "Score input multi", or None if not reported."""
    return _extract_field(input_text, "Score input multi:", float)


def grab_scorings(input_tree_file,
                  output_delimit_single_minbr_0,
                  output_delimit_multi_minbr_0,
                  output_delimit_single_minbr_default,
                  output_delimit_multi_minbr_default,
                  output_PTP_minbr_default,
                  output_gmyc_minbr_0):
    """Parse the six score reports belonging to one tree.

    The ``output_*`` arguments are the report texts printed by
    ``./delimit --score`` for the corresponding program.

    Returns a 10-tuple
    ``(tree_scores, nmi_scores, num_species, single_scores, multi_scores,
    score_real_single_minbr_0, score_real_multi_minbr_0,
    score_real_single_minbr_default, score_real_multi_minbr_default,
    num_real_species)`` whose first five items are dicts keyed by ``names``.
    Any value is None when the report does not contain the field.

    Returns None (after printing a message) when input_tree_file cannot be
    opened.
    """
    try:
        # Only a readability probe; close the handle right away.
        with open(input_tree_file):
            pass
    except IOError:
        print("File not found: " + input_tree_file)
        return None

    reports = dict(zip(names, (output_delimit_single_minbr_0,
                               output_delimit_multi_minbr_0,
                               output_delimit_single_minbr_default,
                               output_delimit_multi_minbr_default,
                               output_PTP_minbr_default,
                               output_gmyc_minbr_0)))

    tables = []
    for extract in (extract_tree_score,
                    extract_nmi_score,
                    extract_num_species,
                    extract_score_input_single,
                    extract_score_input_multi):
        tables.append(dict((name, extract(reports[name])) for name in names))
    tree_scores, nmi_scores, num_species, single_scores, multi_scores = tables

    # The "real" scores are read from the single-threshold delimit reports.
    report_minbr_0 = output_delimit_single_minbr_0
    report_minbr_default = output_delimit_single_minbr_default
    return (tree_scores, nmi_scores, num_species, single_scores, multi_scores,
            extract_score_real_single(report_minbr_0),
            extract_score_real_multi(report_minbr_0),
            extract_score_real_single(report_minbr_default),
            extract_score_real_multi(report_minbr_default),
            extract_num_real_species(report_minbr_0))


def _score_command(delimitation_file, tree_file, force_min_br_zero):
    """Build the ./delimit command line that scores one delimitation."""
    command = "./delimit --score " + delimitation_file
    if force_min_br_zero:
        command += " --min_br 0"
    return command + " --tree_file " + tree_file + " --output_file foo"


def create_scoring_results(input_tree_file,
                           input_delimit_single_minbr_0_file,
                           input_delimit_multi_minbr_0_file,
                           input_delimit_single_minbr_default_file,
                           input_delimit_multi_minbr_default_file,
                           input_PTP_minbr_default_file,
                           input_gmyc_minbr_0_file,
                           output_delimit_single_minbr_0_file,
                           output_delimit_multi_minbr_0_file,
                           output_delimit_single_minbr_default_file,
                           output_delimit_multi_minbr_default_file,
                           output_PTP_minbr_default_file,
                           output_gmyc_minbr_0_file):
    """Score the six delimitations of one tree and store the reports.

    Runs ``./delimit --score`` once per program, writes each report to the
    matching ``output_*_file`` (creating directories as needed) and returns
    the parsed scores in the format of grab_scorings().

    Returns None (after printing a message) when an IOError occurs, e.g.
    when input_tree_file cannot be opened.
    """
    input_files = dict(zip(names, (input_delimit_single_minbr_0_file,
                                   input_delimit_multi_minbr_0_file,
                                   input_delimit_single_minbr_default_file,
                                   input_delimit_multi_minbr_default_file,
                                   input_PTP_minbr_default_file,
                                   input_gmyc_minbr_0_file)))
    output_files = dict(zip(names, (output_delimit_single_minbr_0_file,
                                    output_delimit_multi_minbr_0_file,
                                    output_delimit_single_minbr_default_file,
                                    output_delimit_multi_minbr_default_file,
                                    output_PTP_minbr_default_file,
                                    output_gmyc_minbr_0_file)))
    reports = {}
    try:
        with open(input_tree_file):
            pass

        for name in names:
            directory = os.path.dirname(output_files[name])
            if directory and not os.path.exists(directory):
                os.makedirs(directory)

        for name in names:
            command = _score_command(input_files[name], input_tree_file,
                                     _MIN_BR_ZERO[name])
            # Only the printed report is needed; the exit status is ignored.
            reports[name] = getstatusoutput(command)[1]

        for name in names:
            with open(output_files[name], 'w') as handle:
                handle.write(reports[name])
    except IOError:
        print("File not found: " + input_tree_file)
        return None

    return grab_scorings(input_tree_file, *[reports[name] for name in names])


def _row(label, values):
    """Format one space-separated, newline-terminated data row."""
    return ' '.join([str(label)] + [str(value) for value in values]) + '\n'


def format_tree_rows(index, scoring):
    """Return the per-tree work-file rows for one complete scoring tuple.

    ``scoring`` is a tuple as returned by grab_scorings() with no None
    values.  The result maps each per-set work-file kind to its row.

    Score rows have the columns: index, the five non-GMYC programs, real
    score (min_br 0), GMYC, real score (default min_br).  The multi-score
    row uses exactly the same layout as the single-score row; the original
    script omitted the separator between its last two values, fusing them
    into one unusable column.
    """
    (tree_scores, nmi_scores, num_species, single_scores, multi_scores,
     real_single_0, real_multi_0, real_single_default, real_multi_default,
     num_real_species) = scoring

    def per_program(table):
        return [table[name] for name in names]

    def with_real(table, real_0, real_default):
        values = per_program(table)
        # GMYC is the last program; the real min_br-0 score precedes it.
        return values[:-1] + [real_0, values[-1], real_default]

    deltas = [num_species[name] - num_real_species for name in names]
    deltas.append(num_real_species - num_real_species)

    return {
        'tree_scores': _row(index, per_program(tree_scores)),
        'nmi_scores': _row(index, per_program(nmi_scores)),
        'single_scores': _row(index, with_real(single_scores, real_single_0,
                                               real_single_default)),
        'multi_scores': _row(index, with_real(multi_scores, real_multi_0,
                                              real_multi_default)),
        'num_species': _row(index,
                            per_program(num_species) + [num_real_species]),
        'delta_species': _row(index, deltas),
    }


def _is_complete(scoring):
    """True when no score in a grab_scorings() tuple is missing (None)."""
    for table in scoring[:5]:
        for name in names:
            if table[name] is None:
                return False
    return all(value is not None for value in scoring[5:])


def _open_workfiles(kinds, suffix):
    """Open 'workfile_<kind><suffix>' for writing for every kind."""
    return dict((kind, open('workfile_' + kind + suffix, 'w'))
                for kind in kinds)


def _close_all(handles):
    """Close every file handle in the dict."""
    for handle in handles.values():
        handle.close()


def _tree_paths(set_name, index):
    """Return (tree file, input result files, output score files) of a tree.

    Both file lists are ordered like ``names``.
    """
    tree_file = ("similar_to_GMYC/15-08-2015.16-40/set_BIRTH0.27_" + set_name
                 + "/rooted.RAxML_result.inferred.simulated_set_BIRTH0.27_"
                 + set_name + "_" + str(index) + ".phy")
    suffix = set_name + "." + str(index) + ".txt"
    score_path = "similar_to_GMYC_scoring_results/"
    inputs = ["similar_to_GMYC_" + name + "/set_" + set_name + "/"
              + _RESULT_PREFIX[name] + "_results_set_" + suffix
              for name in names]
    outputs = [score_path + name + "/set_" + set_name + "/"
               + _SCORE_PREFIX[name] + "_score_set_" + suffix
               for name in names]
    return tree_file, inputs, outputs


def _process_set(set_name, summary_files):
    """Score trees 1..100 of one set; write per-tree rows and the averages.

    Trees that are missing, cannot be scored, or whose reports lack a field
    are reported on stdout and excluded from both the per-tree files and
    the averages, so one bad tree can neither abort the run nor leave
    partially accumulated sums behind.
    """
    score_kinds = ['tree_scores', 'nmi_scores', 'num_species',
                   'single_scores', 'multi_scores']  # order of scoring[:5]
    sums = dict((kind, dict((name, 0) for name in names))
                for kind in score_kinds)
    # Order of scoring[5:]: real single/multi (min_br 0), real single/multi
    # (default min_br), number of real species.
    real_sums = [0, 0, 0, 0, 0]
    num_valid_indices = 0

    per_set_files = _open_workfiles(_PER_SET_KINDS, '_' + set_name)
    try:
        for i in range(1, 101):
            input_tree_file, inputs, outputs = _tree_paths(set_name, i)
            try:
                with open(input_tree_file):
                    pass
            except IOError:
                print("File not found: " + input_tree_file)
                continue

            scoring = create_scoring_results(input_tree_file,
                                             *(inputs + outputs))
            if scoring is None:
                # create_scoring_results already reported the problem.
                continue
            if not _is_complete(scoring):
                print("File is bad: " + input_tree_file)
                continue

            rows = format_tree_rows(i, scoring)
            for kind in _PER_SET_KINDS:
                per_set_files[kind].write(rows[kind])

            for kind, table in zip(score_kinds, scoring[:5]):
                for name in names:
                    sums[kind][name] += table[name]
            for position, value in enumerate(scoring[5:]):
                real_sums[position] += value
            num_valid_indices += 1
    finally:
        _close_all(per_set_files)

    def mean(total):
        # Without valid trees the totals stay at their initial 0.
        if num_valid_indices > 0:
            return float(total) / float(num_valid_indices)
        return total

    def means(kind):
        return [mean(sums[kind][name]) for name in names]

    label = set_name[2:]  # strip the leading "Ne"
    summary_files['tree_scores'].write(_row(label, means('tree_scores')))
    summary_files['nmi_scores'].write(_row(label, means('nmi_scores')))
    summary_files['single_scores'].write(
        _row(label, means('single_scores')
             + [mean(real_sums[0]), mean(real_sums[2])]))
    summary_files['multi_scores'].write(
        _row(label, means('multi_scores')
             + [mean(real_sums[1]), mean(real_sums[3])]))
    summary_files['num_species'].write(
        _row(label, means('num_species') + [mean(real_sums[4])]))


def main():
    """Score every set, write all work files and run gnuplot."""
    summary_files = _open_workfiles(_SUMMARY_KINDS, '')
    try:
        for set_name in set_names:
            _process_set(set_name, summary_files)
    finally:
        _close_all(summary_files)
    getstatusoutput('gnuplot plotscript')


if __name__ == '__main__':
    main()

# Archive member header that followed this script in the original file, kept verbatim:
# mptp-0.2.2/src/python/create_scoring_results_without_gmyc.py000077500000000000000000000515061304415103400245340ustar00rootroot00000000000000#!
/usr/bin/env python import os import commands def extract_tree_score(input_text): lines = input_text.split('\n') for line in lines: if line.startswith("Tree penalty score:"): return int(line.split(': ')[1]) break def extract_nmi_score(input_text): lines = input_text.split('\n') for line in lines: if line.startswith("NMI score:"): return float(line.split(': ')[1]) break def extract_num_species(input_text): lines = input_text.split('\n') for line in lines: if line.startswith("Number of species in input file:"): return int(line.split(': ')[1]) if (int(line.split(': ')[1]) == 1): print "Baaaaad data" break def extract_num_real_species(input_text): lines = input_text.split('\n') for line in lines: if line.startswith("Number of real species:"): return int(line.split(': ')[1]) break def extract_score_real_single(input_text): lines = input_text.split('\n') for line in lines: if line.startswith("Score real single:"): return float(line.split(': ')[1]) break def extract_score_real_multi(input_text): lines = input_text.split('\n') for line in lines: if line.startswith("Score real multi:"): return float(line.split(': ')[1]) break def extract_score_input_single(input_text): lines = input_text.split('\n') for line in lines: if line.startswith("Score input single:"): return float(line.split(': ')[1]) break def extract_score_input_multi(input_text): lines = input_text.split('\n') for line in lines: if line.startswith("Score input multi:"): return float(line.split(': ')[1]) break def grab_scorings(input_tree_file, output_delimit_single_minbr_0, output_delimit_multi_minbr_0, output_delimit_single_minbr_default, output_delimit_multi_minbr_default, output_PTP_minbr_default): try: open(input_tree_file) programNames = ['delimit_single_minbr_0', 'delimit_multi_minbr_0', 'delimit_single_minbr_default', 'delimit_multi_minbr_default', 'PTP_minbr_default'] tree_scores = {} nmi_scores = {} num_species = {} single_scores = {} multi_scores = {} num_real_species = 0 score_real_single_minbr_0 = 0 
score_real_multi_minbr_0 = 0 score_real_single_minbr_default = 0 score_real_multi_minbr_default = 0 tree_scores['delimit_single_minbr_0'] = extract_tree_score(output_delimit_single_minbr_0) tree_scores['delimit_multi_minbr_0'] = extract_tree_score(output_delimit_multi_minbr_0) tree_scores['delimit_single_minbr_default'] = extract_tree_score(output_delimit_single_minbr_default) tree_scores['delimit_multi_minbr_default'] = extract_tree_score(output_delimit_multi_minbr_default) tree_scores['PTP_minbr_default'] = extract_tree_score(output_PTP_minbr_default) nmi_scores['delimit_single_minbr_0'] = extract_nmi_score(output_delimit_single_minbr_0) nmi_scores['delimit_multi_minbr_0'] = extract_nmi_score(output_delimit_multi_minbr_0) nmi_scores['delimit_single_minbr_default'] = extract_nmi_score(output_delimit_single_minbr_default) nmi_scores['delimit_multi_minbr_default'] = extract_nmi_score(output_delimit_multi_minbr_default) nmi_scores['PTP_minbr_default'] = extract_nmi_score(output_PTP_minbr_default) num_species['delimit_single_minbr_0'] = extract_num_species(output_delimit_single_minbr_0) num_species['delimit_multi_minbr_0'] = extract_num_species(output_delimit_multi_minbr_0) num_species['delimit_single_minbr_default'] = extract_num_species(output_delimit_single_minbr_default) num_species['delimit_multi_minbr_default'] = extract_num_species(output_delimit_multi_minbr_default) num_species['PTP_minbr_default'] = extract_num_species(output_PTP_minbr_default) single_scores['delimit_single_minbr_0'] = extract_score_input_single(output_delimit_single_minbr_0) single_scores['delimit_multi_minbr_0'] = extract_score_input_single(output_delimit_multi_minbr_0) single_scores['delimit_single_minbr_default'] = extract_score_input_single(output_delimit_single_minbr_default) single_scores['delimit_multi_minbr_default'] = extract_score_input_single(output_delimit_multi_minbr_default) single_scores['PTP_minbr_default'] = extract_score_input_single(output_PTP_minbr_default) 
multi_scores['delimit_single_minbr_0'] = extract_score_input_multi(output_delimit_single_minbr_0) multi_scores['delimit_multi_minbr_0'] = extract_score_input_multi(output_delimit_multi_minbr_0) multi_scores['delimit_single_minbr_default'] = extract_score_input_multi(output_delimit_single_minbr_default) multi_scores['delimit_multi_minbr_default'] = extract_score_input_multi(output_delimit_multi_minbr_default) multi_scores['PTP_minbr_default'] = extract_score_input_multi(output_PTP_minbr_default) score_real_single_minbr_0 = extract_score_real_single(output_delimit_single_minbr_0) score_real_multi_minbr_0 = extract_score_real_multi(output_delimit_single_minbr_0) score_real_single_minbr_default = extract_score_real_single(output_delimit_single_minbr_default) score_real_multi_minbr_default = extract_score_real_multi(output_delimit_single_minbr_default) num_real_species = extract_num_real_species(output_delimit_single_minbr_0) return (tree_scores, nmi_scores, num_species, single_scores, multi_scores, score_real_single_minbr_0, score_real_multi_minbr_0, score_real_single_minbr_default, score_real_multi_minbr_default, num_real_species) except IOError: print "File not found: " + input_tree_file def create_scoring_results(input_tree_file, input_delimit_single_minbr_0_file, input_delimit_multi_minbr_0_file, input_delimit_single_minbr_default_file, input_delimit_multi_minbr_default_file, input_PTP_minbr_default_file, output_delimit_single_minbr_0_file, output_delimit_multi_minbr_0_file, output_delimit_single_minbr_default_file, output_delimit_multi_minbr_default_file, output_PTP_minbr_default_file): try: open(input_tree_file) if not os.path.exists(os.path.dirname(output_delimit_single_minbr_0_file)): os.makedirs(os.path.dirname(output_delimit_single_minbr_0_file)) if not os.path.exists(os.path.dirname(output_delimit_multi_minbr_0_file)): os.makedirs(os.path.dirname(output_delimit_multi_minbr_0_file)) if not 
os.path.exists(os.path.dirname(output_delimit_single_minbr_default_file)): os.makedirs(os.path.dirname(output_delimit_single_minbr_default_file)) if not os.path.exists(os.path.dirname(output_delimit_multi_minbr_default_file)): os.makedirs(os.path.dirname(output_delimit_multi_minbr_default_file)) if not os.path.exists(os.path.dirname(output_PTP_minbr_default_file)): os.makedirs(os.path.dirname(output_PTP_minbr_default_file)) call_delimit_single_minbr_0 = "./delimit --score " + input_delimit_single_minbr_0_file + " --min_br 0 --tree_file " + input_tree_file + " --output_file foo" call_delimit_multi_minbr_0 = "./delimit --score " + input_delimit_multi_minbr_0_file + " --min_br 0 --tree_file " + input_tree_file + " --output_file foo" call_delimit_single_minbr_default = "./delimit --score " + input_delimit_single_minbr_default_file + " --tree_file " + input_tree_file + " --output_file foo" call_delimit_multi_minbr_default = "./delimit --score " + input_delimit_multi_minbr_default_file + " --tree_file " + input_tree_file + " --output_file foo" call_PTP_minbr_default = "./delimit --score " + input_PTP_minbr_default_file + " --min_br 0 --tree_file " + input_tree_file + " --output_file foo" (stat_delimit_single_minbr_0, output_delimit_single_minbr_0) = commands.getstatusoutput(call_delimit_single_minbr_0) (stat_delimit_multi_minbr_0, output_delimit_multi_minbr_0) = commands.getstatusoutput(call_delimit_multi_minbr_0) (stat_delimit_single_minbr_default, output_delimit_single_minbr_default) = commands.getstatusoutput(call_delimit_single_minbr_default) (stat_delimit_multi_minbr_default, output_delimit_multi_minbr_default) = commands.getstatusoutput(call_delimit_multi_minbr_default) (stat_PTP_minbr_default, output_PTP_minbr_default) = commands.getstatusoutput(call_PTP_minbr_default) delimit_single_minbr_0_out = open(output_delimit_single_minbr_0_file, 'w') delimit_multi_minbr_0_out = open(output_delimit_multi_minbr_0_file, 'w') delimit_single_minbr_default_out = 
open(output_delimit_single_minbr_default_file, 'w') delimit_multi_minbr_default_out = open(output_delimit_multi_minbr_default_file, 'w') PTP_minbr_default_out = open(output_PTP_minbr_default_file, 'w') delimit_single_minbr_0_out.write(output_delimit_single_minbr_0) delimit_multi_minbr_0_out.write(output_delimit_multi_minbr_0) delimit_single_minbr_default_out.write(output_delimit_single_minbr_default) delimit_multi_minbr_default_out.write(output_delimit_multi_minbr_default) PTP_minbr_default_out.write(output_PTP_minbr_default) delimit_single_minbr_0_out.close() delimit_multi_minbr_0_out.close() delimit_single_minbr_default_out.close() delimit_multi_minbr_default_out.close() PTP_minbr_default_out.close() return grab_scorings(input_tree_file, output_delimit_single_minbr_0, output_delimit_multi_minbr_0, output_delimit_single_minbr_default, output_delimit_multi_minbr_default, output_PTP_minbr_default) except IOError: print "File not found: " + input_tree_file set_names = ["Ne10000", "Ne100000", "Ne500000", "Ne1000000"] names = ['delimit_single_minbr_0', 'delimit_multi_minbr_0', 'delimit_single_minbr_default', 'delimit_multi_minbr_default', 'PTP_minbr_default'] gnuplotOut_tree_scores = open('workfile_tree_scores', 'w') gnuplotOut_nmi_scores = open('workfile_nmi_scores', 'w') gnuplotOut_single_scores = open('workfile_single_scores', 'w') gnuplotOut_multi_scores = open('workfile_multi_scores', 'w') gnuplotOut_num_species = open('workfile_num_species', 'w') for set_name in set_names: gnuplotOut_tree_scores_current_set = open('workfile_tree_scores_' + set_name, 'w') gnuplotOut_nmi_scores_current_set = open('workfile_nmi_scores_' + set_name, 'w') gnuplotOut_single_scores_current_set = open('workfile_single_scores_' + set_name, 'w') gnuplotOut_multi_scores_current_set = open('workfile_multi_scores_' + set_name, 'w') gnuplotOut_num_species_current_set = open('workfile_num_species_' + set_name, 'w') gnuplotOut_delta_species_current_set = open('workfile_delta_species_' + 
set_name, 'w') num_valid_indices = 0 average_tree_scores = {} average_nmi_scores = {} average_num_species = {} average_single_scores = {} average_multi_scores = {} average_real_num_species = 0 average_real_score_single_minbr_0 = 0 average_real_score_multi_minbr_0 = 0 average_real_score_single_minbr_default = 0 average_real_score_multi_minbr_default = 0 for name in names: average_tree_scores[name] = 0 average_nmi_scores[name] = 0 average_num_species[name] = 0 average_single_scores[name] = 0 average_multi_scores[name] = 0 num_bad_guys = 0 for i in range(1,101): input_tree_file = "similar_to_GMYC/15-08-2015.16-40/set_BIRTH0.27_" + set_name + "/rooted.RAxML_result.inferred.simulated_set_BIRTH0.27_" + set_name + "_" + str(i) + ".phy" try: open(input_tree_file) input_delimit_single_minbr_0_file = "similar_to_GMYC_delimit_single_minbr_0/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt" input_delimit_multi_minbr_0_file = "similar_to_GMYC_delimit_multi_minbr_0/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt" input_delimit_single_minbr_default_file = "similar_to_GMYC_delimit_single_minbr_default/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt" input_delimit_multi_minbr_default_file = "similar_to_GMYC_delimit_multi_minbr_default/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt" input_PTP_minbr_default_file = "similar_to_GMYC_PTP_minbr_default/set_" + set_name + "/PTP_results_set_" + set_name + "." + str(i) + ".txt" input_gmyc_minbr_0_file = "similar_to_GMYC_gmyc_minbr_0/set_" + set_name + "/PTP_results_set_" + set_name + "." + str(i) + ".txt" score_path = "similar_to_GMYC_scoring_results/" output_delimit_single_minbr_0_file = score_path + "delimit_single_minbr_0/set_" + set_name + "/delimit_score_set_" + set_name + "." + str(i) + ".txt" output_delimit_multi_minbr_0_file = score_path + "delimit_multi_minbr_0/set_" + set_name + "/delimit_score_set_" + set_name + "." 
+ str(i) + ".txt" output_delimit_single_minbr_default_file = score_path + "delimit_single_minbr_default/set_" + set_name + "/delimit_score_set_" + set_name + "." + str(i) + ".txt" output_delimit_multi_minbr_default_file = score_path + "delimit_multi_minbr_default/set_" + set_name + "/delimit_score_set_" + set_name + "." + str(i) + ".txt" output_PTP_minbr_default_file = score_path + "PTP_minbr_default/set_" + set_name + "/PTP_score_set_" + set_name + "." + str(i) + ".txt" (tree_scores, nmi_scores, num_species, single_scores, multi_scores, score_real_single_minbr_0, score_real_multi_minbr_0, score_real_single_minbr_default, score_real_multi_minbr_default, num_real_species) = create_scoring_results(input_tree_file, input_delimit_single_minbr_0_file, input_delimit_multi_minbr_0_file, input_delimit_single_minbr_default_file, input_delimit_multi_minbr_default_file, input_PTP_minbr_default_file, output_delimit_single_minbr_0_file, output_delimit_multi_minbr_0_file, output_delimit_single_minbr_default_file, output_delimit_multi_minbr_default_file, output_PTP_minbr_default_file) gnuplotOut_tree_scores_current_set.write(str(i) + ' ' + str(tree_scores['delimit_single_minbr_0']) + ' ' + str(tree_scores['delimit_multi_minbr_0']) + ' ' + str(tree_scores['delimit_single_minbr_default']) + ' ' + str(tree_scores['delimit_multi_minbr_default']) + ' ' + str(tree_scores['PTP_minbr_default']) + '\n') gnuplotOut_nmi_scores_current_set.write(str(i) + ' ' + str(nmi_scores['delimit_single_minbr_0']) + ' ' + str(nmi_scores['delimit_multi_minbr_0']) + ' ' + str(nmi_scores['delimit_single_minbr_default']) + ' ' + str(nmi_scores['delimit_multi_minbr_default']) + ' ' + str(nmi_scores['PTP_minbr_default']) + '\n') gnuplotOut_single_scores_current_set.write(str(i) + ' ' + str(single_scores['delimit_single_minbr_0']) + ' ' + str(single_scores['delimit_multi_minbr_0']) + ' ' + str(single_scores['delimit_single_minbr_default']) + ' ' + str(single_scores['delimit_multi_minbr_default']) + ' ' + 
str(single_scores['PTP_minbr_default']) + ' ' + str(score_real_single_minbr_0) + ' ' + str(score_real_single_minbr_default) + '\n') gnuplotOut_multi_scores_current_set.write(str(i) + ' ' + str(multi_scores['delimit_single_minbr_0']) + ' ' + str(multi_scores['delimit_multi_minbr_0']) + ' ' + str(multi_scores['delimit_single_minbr_default']) + ' ' + str(multi_scores['delimit_multi_minbr_default']) + ' ' + str(multi_scores['PTP_minbr_default']) + ' ' + str(score_real_multi_minbr_0) + ' ' + str(score_real_multi_minbr_default) + '\n') gnuplotOut_num_species_current_set.write(str(i) + ' ' + str(num_species['delimit_single_minbr_0']) + ' ' + str(num_species['delimit_multi_minbr_0']) + ' ' + str(num_species['delimit_single_minbr_default']) + ' ' + str(num_species['delimit_multi_minbr_default']) + ' ' + str(num_species['PTP_minbr_default']) + ' ' + str(num_real_species) + '\n') gnuplotOut_delta_species_current_set.write(str(i) + ' ' + str(num_species['delimit_single_minbr_0'] - num_real_species) + ' ' + str(num_species['delimit_multi_minbr_0'] - num_real_species) + ' ' + str(num_species['delimit_single_minbr_default'] - num_real_species) + ' ' + str(num_species['delimit_multi_minbr_default'] - num_real_species) + ' ' + str(num_species['PTP_minbr_default'] - num_real_species) + ' ' + str(num_real_species - num_real_species) + '\n') try: for name in names: average_tree_scores[name] = average_tree_scores[name] + tree_scores[name] average_nmi_scores[name] = average_nmi_scores[name] + nmi_scores[name] average_num_species[name] = average_num_species[name] + num_species[name] average_single_scores[name] = average_single_scores[name] + single_scores[name] average_multi_scores[name] = average_multi_scores[name] + multi_scores[name] average_real_num_species = average_real_num_species + num_real_species average_real_score_single_minbr_0 = average_real_score_single_minbr_0 + score_real_single_minbr_0 average_real_score_multi_minbr_0 = average_real_score_multi_minbr_0 + 
score_real_multi_minbr_0 average_real_score_single_minbr_default = average_real_score_single_minbr_default + score_real_single_minbr_default average_real_score_multi_minbr_default = average_real_score_multi_minbr_default + score_real_multi_minbr_default except: print "File is bad: " + input_tree_file num_valid_indices = num_valid_indices - 1 num_bad_guys = num_bad_guys + 1 num_valid_indices = num_valid_indices + 1 except IOError: #1 print "File not found: " + input_tree_file #print "Set " + set_name + ": Num bad guys " + str(num_bad_guys) #print "Set " + set_name + ": Num good guys " + str(num_valid_indices) if (num_valid_indices > 0): for name in names: average_tree_scores[name] = float(average_tree_scores[name]) / float(num_valid_indices) average_nmi_scores[name] = float(average_nmi_scores[name]) / float(num_valid_indices) average_num_species[name] = float(average_num_species[name]) / float(num_valid_indices) average_single_scores[name] = float(average_single_scores[name]) / float(num_valid_indices) average_multi_scores[name] = float(average_multi_scores[name]) / float(num_valid_indices) #print "Set " + set_name + ": Average tree score " + name #print average_tree_scores[name] #print "Set " + set_name + ": Average NMI score " + name #print average_nmi_scores[name] #print "Set " + set_name + ": Average num species " + name #print average_num_species[name] #print "Set " + set_name + ": Average input score single " + name #print average_single_scores[name] #print "Set " + set_name + ": Average input score multi " + name #print average_multi_scores[name] average_real_num_species = float(average_real_num_species) / float(num_valid_indices) average_real_score_single_minbr_0 = float(average_real_score_single_minbr_0) / float(num_valid_indices) average_real_score_multi_minbr_0 = float(average_real_score_multi_minbr_0) / float(num_valid_indices) average_real_score_single_minbr_default = float(average_real_score_single_minbr_default) / float(num_valid_indices) 
average_real_score_multi_minbr_default = float(average_real_score_multi_minbr_default) / float(num_valid_indices) #print "Set " + set_name + ": Average real num species " #print average_real_num_species #print "Set " + set_name + ": Average real score single " #print average_real_score_single #print "Set " + set_name + ": Average real score multi " #print average_real_score_multi gnuplotOut_tree_scores.write(set_name[2:] + ' ' + str(average_tree_scores['delimit_single_minbr_0']) + ' ' + str(average_tree_scores['delimit_multi_minbr_0']) + ' ' + str(average_tree_scores['delimit_single_minbr_default']) + ' ' + str(average_tree_scores['delimit_multi_minbr_default']) + ' ' + str(average_tree_scores['PTP_minbr_default']) + '\n') gnuplotOut_nmi_scores.write(set_name[2:] + ' ' + str(average_nmi_scores['delimit_single_minbr_0']) + ' ' + str(average_nmi_scores['delimit_multi_minbr_0']) + ' ' + str(average_nmi_scores['delimit_single_minbr_default']) + ' ' + str(average_nmi_scores['delimit_multi_minbr_default']) + ' ' + str(average_nmi_scores['PTP_minbr_default']) + '\n') gnuplotOut_single_scores.write(set_name[2:] + ' ' + str(average_single_scores['delimit_single_minbr_0']) + ' ' + str(average_single_scores['delimit_multi_minbr_0']) + ' ' + str(average_single_scores['delimit_single_minbr_default']) + ' ' + str(average_single_scores['delimit_multi_minbr_default']) + ' ' + str(average_single_scores['PTP_minbr_default']) + ' ' + str(average_real_score_single_minbr_0) + ' ' + str(average_real_score_single_minbr_default) + '\n') gnuplotOut_multi_scores.write(set_name[2:] + ' ' + str(average_multi_scores['delimit_single_minbr_0']) + ' ' + str(average_multi_scores['delimit_multi_minbr_0']) + ' ' + str(average_multi_scores['delimit_single_minbr_default']) + ' ' + str(average_multi_scores['delimit_multi_minbr_default']) + ' ' + str(average_multi_scores['PTP_minbr_default']) + ' ' + str(average_real_score_multi_minbr_0) + ' ' + str(average_real_score_multi_minbr_default) + '\n') 
gnuplotOut_num_species.write(set_name[2:] + ' ' + str(average_num_species['delimit_single_minbr_0']) + ' ' + str(average_num_species['delimit_multi_minbr_0']) + ' ' + str(average_num_species['delimit_single_minbr_default']) + ' ' + str(average_num_species['delimit_multi_minbr_default']) + ' ' + str(average_num_species['PTP_minbr_default']) + ' ' + str(average_real_num_species) + '\n') gnuplotOut_tree_scores_current_set.close() gnuplotOut_nmi_scores_current_set.close() gnuplotOut_single_scores_current_set.close() gnuplotOut_multi_scores_current_set.close() gnuplotOut_num_species_current_set.close() gnuplotOut_delta_species_current_set.close() gnuplotOut_tree_scores.close() gnuplotOut_nmi_scores.close() gnuplotOut_single_scores.close() gnuplotOut_multi_scores.close() gnuplotOut_num_species.close() commands.getstatusoutput('gnuplot plotscript_without_gmyc') mptp-0.2.2/src/python/create_scoring_results_without_ptp.py000077500000000000000000000505711304415103400244010ustar00rootroot00000000000000#! 
/usr/bin/env python import os import commands def extract_tree_score(input_text): lines = input_text.split('\n') for line in lines: if line.startswith("Tree penalty score:"): return int(line.split(': ')[1]) break def extract_nmi_score(input_text): lines = input_text.split('\n') for line in lines: if line.startswith("NMI score:"): return float(line.split(': ')[1]) break def extract_num_species(input_text): lines = input_text.split('\n') for line in lines: if line.startswith("Number of species in input file:"): return int(line.split(': ')[1]) if (int(line.split(': ')[1]) == 1): print "Baaaaad data" break def extract_num_real_species(input_text): lines = input_text.split('\n') for line in lines: if line.startswith("Number of real species:"): return int(line.split(': ')[1]) break def extract_score_real_single(input_text): lines = input_text.split('\n') for line in lines: if line.startswith("Score real single:"): return float(line.split(': ')[1]) break def extract_score_real_multi(input_text): lines = input_text.split('\n') for line in lines: if line.startswith("Score real multi:"): return float(line.split(': ')[1]) break def extract_score_input_single(input_text): lines = input_text.split('\n') for line in lines: if line.startswith("Score input single:"): return float(line.split(': ')[1]) break def extract_score_input_multi(input_text): lines = input_text.split('\n') for line in lines: if line.startswith("Score input multi:"): return float(line.split(': ')[1]) break def grab_scorings(input_tree_file, output_delimit_single_minbr_0, output_delimit_multi_minbr_0, output_delimit_single_minbr_default, output_delimit_multi_minbr_default, output_gmyc_minbr_0): try: open(input_tree_file) programNames = ['delimit_single_minbr_0', 'delimit_multi_minbr_0', 'delimit_single_minbr_default', 'delimit_multi_minbr_default', 'gmyc_minbr_0'] tree_scores = {} nmi_scores = {} num_species = {} single_scores = {} multi_scores = {} num_real_species = 0 score_real_single_minbr_0 = 0 
score_real_multi_minbr_0 = 0 score_real_single_minbr_default = 0 score_real_multi_minbr_default = 0 tree_scores['delimit_single_minbr_0'] = extract_tree_score(output_delimit_single_minbr_0) tree_scores['delimit_multi_minbr_0'] = extract_tree_score(output_delimit_multi_minbr_0) tree_scores['delimit_single_minbr_default'] = extract_tree_score(output_delimit_single_minbr_default) tree_scores['delimit_multi_minbr_default'] = extract_tree_score(output_delimit_multi_minbr_default) tree_scores['gmyc_minbr_0'] = extract_tree_score(output_gmyc_minbr_0) nmi_scores['delimit_single_minbr_0'] = extract_nmi_score(output_delimit_single_minbr_0) nmi_scores['delimit_multi_minbr_0'] = extract_nmi_score(output_delimit_multi_minbr_0) nmi_scores['delimit_single_minbr_default'] = extract_nmi_score(output_delimit_single_minbr_default) nmi_scores['delimit_multi_minbr_default'] = extract_nmi_score(output_delimit_multi_minbr_default) nmi_scores['gmyc_minbr_0'] = extract_nmi_score(output_gmyc_minbr_0) num_species['delimit_single_minbr_0'] = extract_num_species(output_delimit_single_minbr_0) num_species['delimit_multi_minbr_0'] = extract_num_species(output_delimit_multi_minbr_0) num_species['delimit_single_minbr_default'] = extract_num_species(output_delimit_single_minbr_default) num_species['delimit_multi_minbr_default'] = extract_num_species(output_delimit_multi_minbr_default) num_species['gmyc_minbr_0'] = extract_num_species(output_gmyc_minbr_0) single_scores['delimit_single_minbr_0'] = extract_score_input_single(output_delimit_single_minbr_0) single_scores['delimit_multi_minbr_0'] = extract_score_input_single(output_delimit_multi_minbr_0) single_scores['delimit_single_minbr_default'] = extract_score_input_single(output_delimit_single_minbr_default) single_scores['delimit_multi_minbr_default'] = extract_score_input_single(output_delimit_multi_minbr_default) single_scores['gmyc_minbr_0'] = extract_score_input_single(output_gmyc_minbr_0) multi_scores['delimit_single_minbr_0'] = 
extract_score_input_multi(output_delimit_single_minbr_0) multi_scores['delimit_multi_minbr_0'] = extract_score_input_multi(output_delimit_multi_minbr_0) multi_scores['delimit_single_minbr_default'] = extract_score_input_multi(output_delimit_single_minbr_default) multi_scores['delimit_multi_minbr_default'] = extract_score_input_multi(output_delimit_multi_minbr_default) multi_scores['gmyc_minbr_0'] = extract_score_input_multi(output_gmyc_minbr_0) score_real_single_minbr_0 = extract_score_real_single(output_delimit_single_minbr_0) score_real_multi_minbr_0 = extract_score_real_multi(output_delimit_single_minbr_0) score_real_single_minbr_default = extract_score_real_single(output_delimit_single_minbr_default) score_real_multi_minbr_default = extract_score_real_multi(output_delimit_single_minbr_default) num_real_species = extract_num_real_species(output_delimit_single_minbr_0) return (tree_scores, nmi_scores, num_species, single_scores, multi_scores, score_real_single_minbr_0, score_real_multi_minbr_0, score_real_single_minbr_default, score_real_multi_minbr_default, num_real_species) except IOError: print "File not found: " + input_tree_file def create_scoring_results(input_tree_file, input_delimit_single_minbr_0_file, input_delimit_multi_minbr_0_file, input_delimit_single_minbr_default_file, input_delimit_multi_minbr_default_file, input_gmyc_minbr_0_file, output_delimit_single_minbr_0_file, output_delimit_multi_minbr_0_file, output_delimit_single_minbr_default_file, output_delimit_multi_minbr_default_file, output_gmyc_minbr_0_file): try: open(input_tree_file) if not os.path.exists(os.path.dirname(output_delimit_single_minbr_0_file)): os.makedirs(os.path.dirname(output_delimit_single_minbr_0_file)) if not os.path.exists(os.path.dirname(output_delimit_multi_minbr_0_file)): os.makedirs(os.path.dirname(output_delimit_multi_minbr_0_file)) if not os.path.exists(os.path.dirname(output_delimit_single_minbr_default_file)): 
os.makedirs(os.path.dirname(output_delimit_single_minbr_default_file)) if not os.path.exists(os.path.dirname(output_delimit_multi_minbr_default_file)): os.makedirs(os.path.dirname(output_delimit_multi_minbr_default_file)) if not os.path.exists(os.path.dirname(output_gmyc_minbr_0_file)): os.makedirs(os.path.dirname(output_gmyc_minbr_0_file)) call_delimit_single_minbr_0 = "./delimit --score " + input_delimit_single_minbr_0_file + " --min_br 0 --tree_file " + input_tree_file + " --output_file foo" call_delimit_multi_minbr_0 = "./delimit --score " + input_delimit_multi_minbr_0_file + " --min_br 0 --tree_file " + input_tree_file + " --output_file foo" call_delimit_single_minbr_default = "./delimit --score " + input_delimit_single_minbr_default_file + " --tree_file " + input_tree_file + " --output_file foo" call_delimit_multi_minbr_default = "./delimit --score " + input_delimit_multi_minbr_default_file + " --tree_file " + input_tree_file + " --output_file foo" call_gmyc_minbr_0 = "./delimit --score " + input_gmyc_minbr_0_file + " --min_br 0 --tree_file " + input_tree_file + " --output_file foo" (stat_delimit_single_minbr_0, output_delimit_single_minbr_0) = commands.getstatusoutput(call_delimit_single_minbr_0) (stat_delimit_multi_minbr_0, output_delimit_multi_minbr_0) = commands.getstatusoutput(call_delimit_multi_minbr_0) (stat_delimit_single_minbr_default, output_delimit_single_minbr_default) = commands.getstatusoutput(call_delimit_single_minbr_default) (stat_delimit_multi_minbr_default, output_delimit_multi_minbr_default) = commands.getstatusoutput(call_delimit_multi_minbr_default) (stat_gmyc_minbr_0, output_gmyc_minbr_0) = commands.getstatusoutput(call_gmyc_minbr_0) delimit_single_minbr_0_out = open(output_delimit_single_minbr_0_file, 'w') delimit_multi_minbr_0_out = open(output_delimit_multi_minbr_0_file, 'w') delimit_single_minbr_default_out = open(output_delimit_single_minbr_default_file, 'w') delimit_multi_minbr_default_out = 
open(output_delimit_multi_minbr_default_file, 'w') gmyc_minbr_0_out = open(output_gmyc_minbr_0_file, 'w') delimit_single_minbr_0_out.write(output_delimit_single_minbr_0) delimit_multi_minbr_0_out.write(output_delimit_multi_minbr_0) delimit_single_minbr_default_out.write(output_delimit_single_minbr_default) delimit_multi_minbr_default_out.write(output_delimit_multi_minbr_default) gmyc_minbr_0_out.write(output_gmyc_minbr_0) delimit_single_minbr_0_out.close() delimit_multi_minbr_0_out.close() delimit_single_minbr_default_out.close() delimit_multi_minbr_default_out.close() gmyc_minbr_0_out.close() return grab_scorings(input_tree_file, output_delimit_single_minbr_0, output_delimit_multi_minbr_0, output_delimit_single_minbr_default, output_delimit_multi_minbr_default, output_gmyc_minbr_0) except IOError: print "File not found: " + input_tree_file set_names = ["Ne1e+05", "Ne1e+06", "Ne5e+05", "Ne10000"] names = ['delimit_single_minbr_0', 'delimit_multi_minbr_0', 'delimit_single_minbr_default', 'delimit_multi_minbr_default', 'gmyc_minbr_0'] gnuplotOut_tree_scores = open('workfile_tree_scores', 'w') gnuplotOut_nmi_scores = open('workfile_nmi_scores', 'w') gnuplotOut_single_scores = open('workfile_single_scores', 'w') gnuplotOut_multi_scores = open('workfile_multi_scores', 'w') gnuplotOut_num_species = open('workfile_num_species', 'w') for set_name in set_names: gnuplotOut_tree_scores_current_set = open('workfile_tree_scores_' + set_name, 'w') gnuplotOut_nmi_scores_current_set = open('workfile_nmi_scores_' + set_name, 'w') gnuplotOut_single_scores_current_set = open('workfile_single_scores_' + set_name, 'w') gnuplotOut_multi_scores_current_set = open('workfile_multi_scores_' + set_name, 'w') gnuplotOut_num_species_current_set = open('workfile_num_species_' + set_name, 'w') gnuplotOut_delta_species_current_set = open('workfile_delta_species_' + set_name, 'w') num_valid_indices = 0 average_tree_scores = {} average_nmi_scores = {} average_num_species = {} average_single_scores 
= {} average_multi_scores = {} average_real_num_species = 0 average_real_score_single_minbr_0 = 0 average_real_score_multi_minbr_0 = 0 average_real_score_single_minbr_default = 0 average_real_score_multi_minbr_default = 0 for name in names: average_tree_scores[name] = 0 average_nmi_scores[name] = 0 average_num_species[name] = 0 average_single_scores[name] = 0 average_multi_scores[name] = 0 num_bad_guys = 0 for i in range(1,101): input_tree_file = "SimulB_C_trees/set_" + set_name + "/SimulB_C_tree_set_" + set_name + "." + str(i) + ".txt" try: open(input_tree_file) input_delimit_single_minbr_0_file = "SimulB_C_delimit_single_minbr_0/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt" input_delimit_multi_minbr_0_file = "SimulB_C_delimit_multi_minbr_0/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt" input_delimit_single_minbr_default_file = "SimulB_C_delimit_single_minbr_default/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt" input_delimit_multi_minbr_default_file = "SimulB_C_delimit_multi_minbr_default/set_" + set_name + "/delimit_results_set_" + set_name + "." + str(i) + ".txt" input_gmyc_minbr_0_file = "SimulB_C_gmyc_minbr_0/set_" + set_name + "/gmyc_results_set_" + set_name + "." + str(i) + ".txt" score_path = "SimulB_C_scoring_results/" output_delimit_single_minbr_0_file = score_path + "delimit_single_minbr_0/set_" + set_name + "/delimit_score_set_" + set_name + "." + str(i) + ".txt" output_delimit_multi_minbr_0_file = score_path + "delimit_multi_minbr_0/set_" + set_name + "/delimit_score_set_" + set_name + "." + str(i) + ".txt" output_delimit_single_minbr_default_file = score_path + "delimit_single_minbr_default/set_" + set_name + "/delimit_score_set_" + set_name + "." + str(i) + ".txt" output_delimit_multi_minbr_default_file = score_path + "delimit_multi_minbr_default/set_" + set_name + "/delimit_score_set_" + set_name + "." 
+ str(i) + ".txt" output_gmyc_minbr_0_file = score_path + "gmyc_minbr_0/set_" + set_name + "/gmyc_score_set_" + set_name + "." + str(i) + ".txt" (tree_scores, nmi_scores, num_species, single_scores, multi_scores, score_real_single_minbr_0, score_real_multi_minbr_0, score_real_single_minbr_default, score_real_multi_minbr_default, num_real_species) = create_scoring_results(input_tree_file, input_delimit_single_minbr_0_file, input_delimit_multi_minbr_0_file, input_delimit_single_minbr_default_file, input_delimit_multi_minbr_default_file, input_gmyc_minbr_0_file, output_delimit_single_minbr_0_file, output_delimit_multi_minbr_0_file, output_delimit_single_minbr_default_file, output_delimit_multi_minbr_default_file, output_gmyc_minbr_0_file) gnuplotOut_tree_scores_current_set.write(str(i) + ' ' + str(tree_scores['delimit_single_minbr_0']) + ' ' + str(tree_scores['delimit_multi_minbr_0']) + ' ' + str(tree_scores['delimit_single_minbr_default']) + ' ' + str(tree_scores['delimit_multi_minbr_default']) + ' ' + str(tree_scores['gmyc_minbr_0']) + '\n') gnuplotOut_nmi_scores_current_set.write(str(i) + ' ' + str(nmi_scores['delimit_single_minbr_0']) + ' ' + str(nmi_scores['delimit_multi_minbr_0']) + ' ' + str(nmi_scores['delimit_single_minbr_default']) + ' ' + str(nmi_scores['delimit_multi_minbr_default']) + ' ' + str(nmi_scores['gmyc_minbr_0']) + '\n') gnuplotOut_single_scores_current_set.write(str(i) + ' ' + str(single_scores['delimit_single_minbr_0']) + ' ' + str(single_scores['delimit_multi_minbr_0']) + ' ' + str(single_scores['delimit_single_minbr_default']) + ' ' + str(single_scores['delimit_multi_minbr_default']) + ' ' + str(score_real_single_minbr_0) + ' ' + str(single_scores['gmyc_minbr_0']) + ' ' + str(score_real_single_minbr_default) + '\n') gnuplotOut_multi_scores_current_set.write(str(i) + ' ' + str(multi_scores['delimit_single_minbr_0']) + ' ' + str(multi_scores['delimit_multi_minbr_0']) + ' ' + str(multi_scores['delimit_single_minbr_default']) + ' ' + 
str(multi_scores['delimit_multi_minbr_default']) + ' ' + str(score_real_multi_minbr_0) + ' ' + ' ' + str(multi_scores['gmyc_minbr_0']) + str(score_real_multi_minbr_default) + '\n') gnuplotOut_num_species_current_set.write(str(i) + ' ' + str(num_species['delimit_single_minbr_0']) + ' ' + str(num_species['delimit_multi_minbr_0']) + ' ' + str(num_species['delimit_single_minbr_default']) + ' ' + str(num_species['delimit_multi_minbr_default']) + ' ' + str(num_species['gmyc_minbr_0']) + ' ' + str(num_real_species) + '\n') gnuplotOut_delta_species_current_set.write(str(i) + ' ' + str(num_species['delimit_single_minbr_0'] - num_real_species) + ' ' + str(num_species['delimit_multi_minbr_0'] - num_real_species) + ' ' + str(num_species['delimit_single_minbr_default'] - num_real_species) + ' ' + str(num_species['delimit_multi_minbr_default'] - num_real_species) + ' ' + str(num_species['gmyc_minbr_0'] - num_real_species) + ' ' + str(num_real_species - num_real_species) + '\n') try: for name in names: average_tree_scores[name] = average_tree_scores[name] + tree_scores[name] average_nmi_scores[name] = average_nmi_scores[name] + nmi_scores[name] average_num_species[name] = average_num_species[name] + num_species[name] average_single_scores[name] = average_single_scores[name] + single_scores[name] average_multi_scores[name] = average_multi_scores[name] + multi_scores[name] average_real_num_species = average_real_num_species + num_real_species average_real_score_single_minbr_0 = average_real_score_single_minbr_0 + score_real_single_minbr_0 average_real_score_multi_minbr_0 = average_real_score_multi_minbr_0 + score_real_multi_minbr_0 average_real_score_single_minbr_default = average_real_score_single_minbr_default + score_real_single_minbr_default average_real_score_multi_minbr_default = average_real_score_multi_minbr_default + score_real_multi_minbr_default except: print "File is bad: " + input_tree_file num_valid_indices = num_valid_indices - 1 num_bad_guys = num_bad_guys + 1 
num_valid_indices = num_valid_indices + 1 except IOError: #1 print "File not found: " + input_tree_file #print "Set " + set_name + ": Num bad guys " + str(num_bad_guys) #print "Set " + set_name + ": Num good guys " + str(num_valid_indices) if (num_valid_indices > 0): for name in names: average_tree_scores[name] = float(average_tree_scores[name]) / float(num_valid_indices) average_nmi_scores[name] = float(average_nmi_scores[name]) / float(num_valid_indices) average_num_species[name] = float(average_num_species[name]) / float(num_valid_indices) average_single_scores[name] = float(average_single_scores[name]) / float(num_valid_indices) average_multi_scores[name] = float(average_multi_scores[name]) / float(num_valid_indices) #print "Set " + set_name + ": Average tree score " + name #print average_tree_scores[name] #print "Set " + set_name + ": Average NMI score " + name #print average_nmi_scores[name] #print "Set " + set_name + ": Average num species " + name #print average_num_species[name] #print "Set " + set_name + ": Average input score single " + name #print average_single_scores[name] #print "Set " + set_name + ": Average input score multi " + name #print average_multi_scores[name] average_real_num_species = float(average_real_num_species) / float(num_valid_indices) average_real_score_single_minbr_0 = float(average_real_score_single_minbr_0) / float(num_valid_indices) average_real_score_multi_minbr_0 = float(average_real_score_multi_minbr_0) / float(num_valid_indices) average_real_score_single_minbr_default = float(average_real_score_single_minbr_default) / float(num_valid_indices) average_real_score_multi_minbr_default = float(average_real_score_multi_minbr_default) / float(num_valid_indices) #print "Set " + set_name + ": Average real num species " #print average_real_num_species #print "Set " + set_name + ": Average real score single " #print average_real_score_single #print "Set " + set_name + ": Average real score multi " #print average_real_score_multi 
gnuplotOut_tree_scores.write(set_name[2:] + ' ' + str(average_tree_scores['delimit_single_minbr_0']) + ' ' + str(average_tree_scores['delimit_multi_minbr_0']) + ' ' + str(average_tree_scores['delimit_single_minbr_default']) + ' ' + str(average_tree_scores['delimit_multi_minbr_default']) + ' ' + str(average_tree_scores['gmyc_minbr_0']) + '\n') gnuplotOut_nmi_scores.write(set_name[2:] + ' ' + str(average_nmi_scores['delimit_single_minbr_0']) + ' ' + str(average_nmi_scores['delimit_multi_minbr_0']) + ' ' + str(average_nmi_scores['delimit_single_minbr_default']) + ' ' + str(average_nmi_scores['delimit_multi_minbr_default']) + ' ' + str(average_nmi_scores['gmyc_minbr_0']) + '\n') gnuplotOut_single_scores.write(set_name[2:] + ' ' + str(average_single_scores['delimit_single_minbr_0']) + ' ' + str(average_single_scores['delimit_multi_minbr_0']) + ' ' + str(average_single_scores['delimit_single_minbr_default']) + ' ' + str(average_single_scores['delimit_multi_minbr_default']) + ' ' + str(average_single_scores['gmyc_minbr_0']) + ' ' + str(average_real_score_single_minbr_0) + ' ' + str(average_real_score_single_minbr_default) + '\n') gnuplotOut_multi_scores.write(set_name[2:] + ' ' + str(average_multi_scores['delimit_single_minbr_0']) + ' ' + str(average_multi_scores['delimit_multi_minbr_0']) + ' ' + str(average_multi_scores['delimit_single_minbr_default']) + ' ' + str(average_multi_scores['delimit_multi_minbr_default']) + ' ' + str(average_multi_scores['gmyc_minbr_0']) + ' ' + str(average_real_score_multi_minbr_0) + ' ' + str(average_real_score_multi_minbr_default) + '\n') gnuplotOut_num_species.write(set_name[2:] + ' ' + str(average_num_species['delimit_single_minbr_0']) + ' ' + str(average_num_species['delimit_multi_minbr_0']) + ' ' + str(average_num_species['delimit_single_minbr_default']) + ' ' + str(average_num_species['delimit_multi_minbr_default']) + ' ' + str(average_num_species['gmyc_minbr_0']) + ' ' + str(average_real_num_species) + '\n') 
gnuplotOut_tree_scores_current_set.close() gnuplotOut_nmi_scores_current_set.close() gnuplotOut_single_scores_current_set.close() gnuplotOut_multi_scores_current_set.close() gnuplotOut_num_species_current_set.close() gnuplotOut_delta_species_current_set.close() gnuplotOut_tree_scores.close() gnuplotOut_nmi_scores.close() gnuplotOut_single_scores.close() gnuplotOut_multi_scores.close() gnuplotOut_num_species.close() commands.getstatusoutput('gnuplot plotscript_without_ptp') mptp-0.2.2/src/python/create_subsets.py000077500000000000000000000062651304415103400201770ustar00rootroot00000000000000#! /usr/bin/env python import os def create_subsets(alignmentFile, num_of_species, sum_of_species, num_basepairs, output_taxa_file, output_alignment_file, num_alignments): try: with open(alignmentFile) as f: content = f.read().splitlines() f.close() speciesList = [] for i in range(0,31): emptyList = [] speciesList.append(emptyList) alignments = {} for i in range(1, len(content)): # ignore first line contentSplitted = content[i].split(); taxonName = contentSplitted[0] alignments[taxonName] = contentSplitted[1][0:num_basepairs] species = taxonName.split('.')[0] speciesList[int(species)].append(taxonName) speciesListSorted = sorted(speciesList, key = len) currentIdx = 0 selectedTaxa = [] found = 0 for i in range(30,-1,-1): if currentIdx < len(num_of_species): if len(speciesListSorted[i]) >= sum_of_species[currentIdx]: found = found + 1 for j in range(1, sum_of_species[currentIdx]): selectedTaxa.append(speciesListSorted[i][j]) else: print "We had an error :(" if found == num_of_species[currentIdx]: currentIdx = currentIdx + 1 found = 0 # write the solutions into the files if not os.path.exists(os.path.dirname(output_taxa_file)): os.makedirs(os.path.dirname(output_taxa_file)) taxaOut = open(output_taxa_file, 'w') for taxon in selectedTaxa: taxaOut.write(taxon + "\n") taxaOut.close() if not os.path.exists(os.path.dirname(output_alignment_file)): 
os.makedirs(os.path.dirname(output_alignment_file)) alignmentOut = open(output_alignment_file, 'w') alignmentOut.write(str(num_alignments) + " " + str(num_basepairs) + "\n") for taxon in selectedTaxa: alignmentOut.write(taxon + " "+ alignments[taxon] + "\n") alignmentOut.close() return (currentIdx >= len(num_of_species)) except IOError: print "File not found: " + alignmentFile set_names = ["set_1", "set_5", "set_10", "set_20", "set_40", "set_80", "set_160"] num_of_species = [3, 6, 9, 12] size_of_species = [35, 25, 10, 2] uniform_num = [30] uniform_size = [12] base_pairs = [100, 250, 500, 1000] uniform_num_alignments = 360 nonuniform_num_alignments = 369 for set_name in set_names: for i in range(1,101): for bp in base_pairs: output_nonuniform_taxa_file = "nonuniform/taxa/"+str(bp)+"/taxa.simulated_" + set_name + "_" + str(i) output_nonuniform_alignment_file = "nonuniform/alignments/"+str(bp)+"/simulated_tree_" + set_name + "_" + str(i) output_uniform_taxa_file = "uniform/taxa/"+str(bp)+"/taxa.simulated_" + set_name + "_" + str(i) output_uniform_alignment_file = "uniform/alignments/"+str(bp)+"/simulated_tree_" + set_name + "_" + str(i) alignmentFile = "reduced_alignments/" + set_name + "/simulated_" + set_name + "_" + str(i) + ".phy.reduced" if create_subsets(alignmentFile, num_of_species, size_of_species, bp, output_nonuniform_taxa_file, output_nonuniform_alignment_file, nonuniform_num_alignments) == False: print "Found a file that does not fit our requirement :-(" if create_subsets(alignmentFile, uniform_num, uniform_size, bp, output_uniform_taxa_file, output_uniform_alignment_file, uniform_num_alignments) == False: print "Found a file that does not fit our requirement :-(" mptp-0.2.2/src/python/extract_trees.py000077500000000000000000000013161304415103400200300ustar00rootroot00000000000000#! /usr/bin/env python import os import commands set_names = ["Ne1e+05", "Ne1e+06", "Ne5e+05", "Ne10000"] for set_name in set_names: try: tree_path = "SimulB&C." 
+ set_name + "_nospec.phy" tree_file = open(tree_path) lines = tree_file.readlines() for i in range(1,101): # only the first 100 trees tree_destination = "SimulB_C_trees/set_" + set_name + "/SimulB_C_tree_set_" + set_name + "." + str(i) + ".txt" if not os.path.exists(os.path.dirname(tree_destination)): os.makedirs(os.path.dirname(tree_destination)) tree_destination_file = open(tree_destination, 'w') tree_destination_file.write(lines[i - 1]) tree_file.close() except IOError: print "File not found: " + tree_path mptp-0.2.2/src/python/plotscript000066400000000000000000000541271304415103400167350ustar00rootroot00000000000000set term pngcairo size 800,600 nocrop enhanced font 'Verdana,11'#define axis set style line 11 lc rgb '#808080' lt 1 set border 3 back ls 11 set tics nomirror #define key #set key opaque set key outside # define grid set style line 12 lc rgb '#808080' lt 0 lw 1 set grid back ls 12 # define linecolors set style line 1 lc rgb '#0060ad' pt 7 ps 0.5 pi -1 lt 0 lw 2 # --- blue set style line 2 lc rgb '#8b1a0e' pt 7 ps 0.5 pi -1 lt 0 lw 2 # --- red set style line 3 lc rgb '#5e9c36' pt 7 ps 0.5 pi -1 lt 0 lw 2 # --- green set style line 4 lc rgb '#ffa500' pt 7 ps 0.5 pi -1 lt 0 lw 2 # --- orange set style line 5 lc rgb '#40e0d0' pt 7 ps 0.5 pi -1 lt 0 lw 2 # --- turquoise set style line 6 lc rgb '#9400d3' pt 7 ps 0.5 pi -1 lt 0 lw 2 # --- darkviolet set style line 7 lc rgb '#ff00ff' pt 7 ps 0.5 pi -1 lt 0 lw 2 # --- magenta set style line 8 lc rgb '#c0c0c0' pt 7 ps 0.5 pi -1 lt 0 lw 2 # --- silver set style line 9 lc rgb '#e6e6Fa' pt 7 ps 0.5 pi -1 lt 0 lw 2 # --- lavender set pointintervalbox 1 set samples 300 #Start of user script #--------------------- single_0 = 2 multi_0 = 3 single_default = 4 multi_default = 5 ptp_default = 6 gmyc_0 = 7 real = 8 # Kassian Score set title "Average Kassian Score similar GMYC taxa" set xlabel "Set number" set ylabel "Average tree score" ExtData1 = 'workfile_tree_scores' set output 'plots/average_tree_scores.png' plot 
ExtData1 using 1:2 title 'delimit single minbr 0' with linespoints ls 1, ExtData1 using 1:3 title 'delimit multi minbr 0' with linespoints ls 2, ExtData1 using 1:4 title 'delimit single minbr default' with linespoints ls 3, ExtData1 using 1:5 title 'delimit multi minbr default' with linespoints ls 4, ExtData1 using 1:6 title 'PTP minbr default' with linespoints ls 5, ExtData1 using 1:7 title 'GMYC minbr 0' with linespoints ls 6 set xlabel "index" set ylabel "Tree Score" ExtData1_10000 = 'workfile_tree_scores_Ne10000' ExtData1_100000 = 'workfile_tree_scores_Ne100000' ExtData1_500000 = 'workfile_tree_scores_Ne500000' ExtData1_1000000 = 'workfile_tree_scores_Ne1000000' set title "Kassian Tree Score similar GMYC taxa delimit single minbr 0" set output 'plots/tree_scores_delimit_single_minbr_0.png' plot ExtData1_10000 using 1:single_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_0 title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:single_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_0 title 'Ne1000000' with linespoints ls 4 set title "Kassian Tree Score similar GMYC taxa delimit multi minbr 0" set output 'plots/tree_scores_delimit_multi_minbr_0.png' plot ExtData1_10000 using 1:multi_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_0 title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:multi_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_0 title 'Ne1000000' with linespoints ls 4 set title "Kassian Tree Score similar GMYC taxa delimit single minbr default" set output 'plots/tree_scores_delimit_single_minbr_default.png' plot ExtData1_10000 using 1:single_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_default title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:single_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_default title 'Ne1000000' with linespoints ls 4 set title "Kassian 
Tree Score similar GMYC taxa delimit multi minbr default" set output 'plots/tree_scores_delimit_multi_minbr_default.png' plot ExtData1_10000 using 1:multi_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_default title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:multi_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_default title 'Ne1000000' with linespoints ls 4 set title "Kassian Tree Score similar GMYC taxa PTP minbr default" set output 'plots/tree_scores_PTP_minbr_default.png' plot ExtData1_10000 using 1:ptp_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:ptp_default title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:ptp_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:ptp_default title 'Ne1000000' with linespoints ls 4 set title "Kassian Tree Score similar GMYC taxa GMYC minbr 0" set output 'plots/tree_scores_GMYC_minbr_0.png' plot ExtData1_10000 using 1:gmyc_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:gmyc_0 title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:gmyc_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:gmyc_0 title 'Ne1000000' with linespoints ls 4 # NMI score set title "Average NMI Score similar GMYC taxa" set xlabel "Set number" set ylabel "Average NMI score" ExtData1 = 'workfile_nmi_scores' set output 'plots/average_nmi_scores.png' plot ExtData1 using 1:2 title 'delimit single minbr 0' with linespoints ls 1, ExtData1 using 1:3 title 'delimit multi minbr 0' with linespoints ls 2, ExtData1 using 1:4 title 'delimit single minbr default' with linespoints ls 3, ExtData1 using 1:5 title 'delimit multi minbr default' with linespoints ls 4, ExtData1 using 1:6 title 'PTP minbr default' with linespoints ls 5, ExtData1 using 1:7 title 'GMYC minbr 0' with linespoints ls 6 set xlabel "index" set ylabel "NMI Score" ExtData1_10000 = 'workfile_nmi_scores_Ne10000' ExtData1_100000 = 
'workfile_nmi_scores_Ne100000' ExtData1_500000 = 'workfile_nmi_scores_Ne500000' ExtData1_1000000 = 'workfile_nmi_scores_Ne1000000' set title "NMI Score similar GMYC taxa delimit single minbr 0" set output 'plots/nmi_scores_delimit_single_minbr_0.png' plot ExtData1_10000 using 1:single_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_0 title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:single_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_0 title 'Ne1000000' with linespoints ls 4 set title "NMI Score similar GMYC taxa delimit multi minbr 0" set output 'plots/nmi_scores_delimit_multi_minbr_0.png' plot ExtData1_10000 using 1:multi_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_0 title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:multi_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_0 title 'Ne1000000' with linespoints ls 4 set title "NMI Score similar GMYC taxa delimit single minbr default" set output 'plots/nmi_scores_delimit_single_minbr_default.png' plot ExtData1_10000 using 1:single_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_default title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:single_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_default title 'Ne1000000' with linespoints ls 4 set title "NMI Score similar GMYC taxa delimit multi minbr default" set output 'plots/nmi_scores_delimit_multi_minbr_default.png' plot ExtData1_10000 using 1:multi_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_default title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:multi_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_default title 'Ne1000000' with linespoints ls 4 set title "NMI Score similar GMYC taxa PTP minbr default" set output 'plots/nmi_scores_PTP_minbr_default.png' plot ExtData1_10000 using 
1:ptp_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:ptp_default title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:ptp_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:ptp_default title 'Ne1000000' with linespoints ls 4 set title "NMI Score similar GMYC taxa GMYC minbr 0" set output 'plots/nmi_scores_GMYC_minbr_0.png' plot ExtData1_10000 using 1:gmyc_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:gmyc_0 title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:gmyc_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:gmyc_0 title 'Ne1000000' with linespoints ls 4 # number of species set title "Average Number of Species similar GMYC taxa" set xlabel "Set number" set ylabel "Average number of species" ExtData1 = 'workfile_num_species' set output 'plots/average_num_species.png' plot ExtData1 using 1:2 title 'delimit single minbr 0' with linespoints ls 1, ExtData1 using 1:3 title 'delimit multi minbr 0' with linespoints ls 2, ExtData1 using 1:4 title 'delimit single minbr default' with linespoints ls 3, ExtData1 using 1:5 title 'delimit multi minbr default' with linespoints ls 4, ExtData1 using 1:6 title 'PTP minbr default' with linespoints ls 5, ExtData1 using 1:7 title 'GMYC minbr 0' with linespoints ls 6, ExtData1 using 1:8 title 'real' with linespoints ls 7 set xlabel "index" set ylabel "Number of Species" ExtData1_10000 = 'workfile_num_species_Ne10000' ExtData1_100000 = 'workfile_num_species_Ne100000' ExtData1_500000 = 'workfile_num_species_Ne500000' ExtData1_1000000 = 'workfile_num_species_Ne1000000' set title "Number of Species similar GMYC taxa delimit single minbr 0" set output 'plots/num_species_delimit_single_minbr_0.png' plot ExtData1_10000 using 1:single_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_0 title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:single_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 
1:single_0 title 'Ne1000000' with linespoints ls 4 set title "Number of Species similar GMYC taxa delimit multi minbr 0" set output 'plots/num_species_delimit_multi_minbr_0.png' plot ExtData1_10000 using 1:multi_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_0 title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:multi_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_0 title 'Ne1000000' with linespoints ls 4 set title "Number of Species similar GMYC taxa delimit single minbr default" set output 'plots/num_species_delimit_single_minbr_default.png' plot ExtData1_10000 using 1:single_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_default title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:single_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_default title 'Ne1000000' with linespoints ls 4 set title "Number of Species similar GMYC taxa delimit multi minbr default" set output 'plots/num_species_delimit_multi_minbr_default.png' plot ExtData1_10000 using 1:multi_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_default title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:multi_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_default title 'Ne1000000' with linespoints ls 4 set title "Number of Species similar GMYC taxa PTP minbr default" set output 'plots/num_species_PTP_minbr_default.png' plot ExtData1_10000 using 1:ptp_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:ptp_default title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:ptp_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:ptp_default title 'Ne1000000' with linespoints ls 4 set title "Number of Species similar GMYC taxa real" set output 'plots/num_species_real.png' plot ExtData1_10000 using 1:real title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 
1:real title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:real title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:real title 'Ne1000000' with linespoints ls 4 set title "Number of Species similar GMYC taxa GMYC minbr 0" set output 'plots/num_species_GMYC_minbr_0.png' plot ExtData1_10000 using 1:gmyc_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:gmyc_0 title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:gmyc_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:gmyc_0 title 'Ne1000000' with linespoints ls 4 set xlabel "index" set ylabel "Delta Number of Species" ExtData1_10000 = 'workfile_delta_species_Ne10000' ExtData1_100000 = 'workfile_delta_species_Ne100000' ExtData1_500000 = 'workfile_delta_species_Ne500000' ExtData1_1000000 = 'workfile_delta_species_Ne1000000' set title "Delta Number of Species similar GMYC taxa delimit single minbr 0" set output 'plots/delta_species_delimit_single_minbr_0.png' plot ExtData1_10000 using 1:single_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_0 title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:single_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_0 title 'Ne1000000' with linespoints ls 4 set title "Delta Number of Species similar GMYC taxa delimit multi minbr 0" set output 'plots/delta_species_delimit_multi_minbr_0.png' plot ExtData1_10000 using 1:multi_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_0 title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:multi_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_0 title 'Ne1000000' with linespoints ls 4 set title "Delta Number of Species similar GMYC taxa delimit single minbr default" set output 'plots/delta_species_delimit_single_minbr_default.png' plot ExtData1_10000 using 1:single_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_default title 'Ne10000' with 
linespoints ls 2, ExtData1_500000 using 1:single_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_default title 'Ne1000000' with linespoints ls 4 set title "Delta Number of Species similar GMYC taxa delimit multi minbr default" set output 'plots/delta_species_delimit_multi_minbr_default.png' plot ExtData1_10000 using 1:multi_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_default title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:multi_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_default title 'Ne1000000' with linespoints ls 4 set title "Delta Number of Species similar GMYC taxa PTP minbr default" set output 'plots/delta_species_PTP_minbr_default.png' plot ExtData1_10000 using 1:ptp_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:ptp_default title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:ptp_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:ptp_default title 'Ne1000000' with linespoints ls 4 set title "Delta Number of Species similar GMYC taxa real" set output 'plots/delta_species_real.png' plot ExtData1_10000 using 1:real title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:real title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:real title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:real title 'Ne1000000' with linespoints ls 4 set title "Delta Number of Species similar GMYC taxa GMYC minbr 0" set output 'plots/delta_species_GMYC_minbr_0.png' plot ExtData1_10000 using 1:gmyc_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:gmyc_0 title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:gmyc_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:gmyc_0 title 'Ne1000000' with linespoints ls 4 # single lambda score set title "Average Single Lambda Score similar GMYC taxa" set xlabel "Set number" set ylabel "Average single lambda 
score" ExtData1 = 'workfile_single_scores' set output 'plots/average_single_scores.png' plot ExtData1 using 1:2 title 'delimit single minbr 0' with linespoints ls 1, ExtData1 using 1:3 title 'delimit multi minbr 0' with linespoints ls 2, ExtData1 using 1:4 title 'delimit single minbr default' with linespoints ls 3, ExtData1 using 1:5 title 'delimit multi minbr default' with linespoints ls 4, ExtData1 using 1:6 title 'PTP minbr default' with linespoints ls 5, ExtData1 using 1:7 title 'GMYC minbr 0' with linespoints ls 6, ExtData1 using 1:8 title 'real minbr 0' with linespoints ls 7, ExtData1 using 1:9 title 'real minbr default' with linespoints ls 8 set xlabel "index" set ylabel "Single Lambda Score" ExtData1_10000 = 'workfile_single_scores_Ne10000' ExtData1_100000 = 'workfile_single_scores_Ne100000' ExtData1_500000 = 'workfile_single_scores_Ne500000' ExtData1_1000000 = 'workfile_single_scores_Ne1000000' set title "Single Lambda Score similar GMYC taxa delimit single minbr 0" set output 'plots/single_scores_delimit_single_minbr_0.png' plot ExtData1_10000 using 1:single_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_0 title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:single_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_0 title 'Ne1000000' with linespoints ls 4 set title "Single Lambda Score similar GMYC taxa delimit multi minbr 0" set output 'plots/single_scores_delimit_multi_minbr_0.png' plot ExtData1_10000 using 1:multi_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_0 title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:multi_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_0 title 'Ne1000000' with linespoints ls 4 set title "Single Lambda Score similar GMYC taxa delimit single minbr default" set output 'plots/single_scores_delimit_single_minbr_default.png' plot ExtData1_10000 using 1:single_default title 'Ne10000' with linespoints ls 1, 
ExtData1_100000 using 1:single_default title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:single_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_default title 'Ne1000000' with linespoints ls 4 set title "Single Lambda Score similar GMYC taxa delimit multi minbr default" set output 'plots/single_scores_delimit_multi_minbr_default.png' plot ExtData1_10000 using 1:multi_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_default title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:multi_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_default title 'Ne1000000' with linespoints ls 4 set title "Single Lambda Score similar GMYC taxa PTP minbr default" set output 'plots/single_scores_PTP_minbr_default.png' plot ExtData1_10000 using 1:ptp_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:ptp_default title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:ptp_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:ptp_default title 'Ne1000000' with linespoints ls 4 set title "Single Lambda Score similar GMYC taxa GMYC minbr 0" set output 'plots/single_scores_GMYC_minbr_0.png' plot ExtData1_10000 using 1:gmyc_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:gmyc_0 title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:gmyc_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:gmyc_0 title 'Ne1000000' with linespoints ls 4 set title "Average Multi Lambda Score similar GMYC taxa" set xlabel "Set number" set ylabel "Average multi lambda score" ExtData1 = 'workfile_multi_scores' set output 'plots/average_multi_scores.png' plot ExtData1 using 1:2 title 'delimit single minbr 0' with linespoints ls 1, ExtData1 using 1:3 title 'delimit multi minbr 0' with linespoints ls 2, ExtData1 using 1:4 title 'delimit single minbr default' with linespoints ls 3, ExtData1 using 1:5 title 'delimit multi minbr 
default' with linespoints ls 4, ExtData1 using 1:6 title 'PTP minbr default' with linespoints ls 5, ExtData1 using 1:7 title 'GMYC minbr 0' with linespoints ls 6, ExtData1 using 1:8 title 'real minbr 0' with linespoints ls 7, ExtData1 using 1:9 title 'real minbr default' with linespoints ls 8 set xlabel "index" set ylabel "Multi Lambda Score" ExtData1_10000 = 'workfile_multi_scores_Ne10000' ExtData1_100000 = 'workfile_multi_scores_Ne100000' ExtData1_500000 = 'workfile_multi_scores_Ne500000' ExtData1_1000000 = 'workfile_multi_scores_Ne1000000' set title "Multi Lambda Score similar GMYC taxa delimit single minbr 0" set output 'plots/multi_scores_delimit_single_minbr_0.png' plot ExtData1_10000 using 1:single_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_0 title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:single_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_0 title 'Ne1000000' with linespoints ls 4 set title "Multi Lambda Score similar GMYC taxa delimit multi minbr 0" set output 'plots/multi_scores_delimit_multi_minbr_0.png' plot ExtData1_10000 using 1:multi_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_0 title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:multi_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_0 title 'Ne1000000' with linespoints ls 4 set title "Multi Lambda Score similar GMYC taxa delimit single minbr default" set output 'plots/multi_scores_delimit_single_minbr_default.png' plot ExtData1_10000 using 1:single_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_default title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:single_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_default title 'Ne1000000' with linespoints ls 4 set title "Multi Lambda Score similar GMYC taxa delimit multi minbr default" set output 'plots/multi_scores_delimit_multi_minbr_default.png' 
plot ExtData1_10000 using 1:multi_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_default title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:multi_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_default title 'Ne1000000' with linespoints ls 4 set title "Multi Lambda Score similar GMYC taxa PTP minbr default" set output 'plots/multi_scores_PTP_minbr_default.png' plot ExtData1_10000 using 1:ptp_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:ptp_default title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:ptp_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:ptp_default title 'Ne1000000' with linespoints ls 4 set title "Multi Lambda Score similar GMYC taxa GMYC minbr 0" set output 'plots/multi_scores_GMYC_minbr_0.png' plot ExtData1_10000 using 1:gmyc_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:gmyc_0 title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:gmyc_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:gmyc_0 title 'Ne1000000' with linespoints ls 4 reset; mptp-0.2.2/src/python/plotscript_without_gmyc000066400000000000000000000466611304415103400215430ustar00rootroot00000000000000set term pngcairo size 800,600 nocrop enhanced font 'Verdana,11'#define axis set style line 11 lc rgb '#808080' lt 1 set border 3 back ls 11 set tics nomirror #define key #set key opaque set key outside # define grid set style line 12 lc rgb '#808080' lt 0 lw 1 set grid back ls 12 # define linecolors set style line 1 lc rgb '#0060ad' pt 7 ps 0.5 pi -1 lt 0 lw 2 # --- blue set style line 2 lc rgb '#8b1a0e' pt 7 ps 0.5 pi -1 lt 0 lw 2 # --- red set style line 3 lc rgb '#5e9c36' pt 7 ps 0.5 pi -1 lt 0 lw 2 # --- green set style line 4 lc rgb '#ffa500' pt 7 ps 0.5 pi -1 lt 0 lw 2 # --- orange set style line 5 lc rgb '#40e0d0' pt 7 ps 0.5 pi -1 lt 0 lw 2 # --- turquoise set style line 6 lc rgb '#9400d3' pt 7 ps 0.5 pi -1 
lt 0 lw 2 # --- darkviolet set style line 7 lc rgb '#ff00ff' pt 7 ps 0.5 pi -1 lt 0 lw 2 # --- magenta set style line 8 lc rgb '#c0c0c0' pt 7 ps 0.5 pi -1 lt 0 lw 2 # --- silver set style line 9 lc rgb '#e6e6Fa' pt 7 ps 0.5 pi -1 lt 0 lw 2 # --- lavender set pointintervalbox 1 set samples 300 #Start of user script #--------------------- single_0 = 2 multi_0 = 3 single_default = 4 multi_default = 5 ptp_default = 6 real = 7 # Kassian Score set title "Average Kassian Score similar GMYC taxa" set xlabel "Set number" set ylabel "Average tree score" ExtData1 = 'workfile_tree_scores' set output 'plots/average_tree_scores.png' plot ExtData1 using 1:2 title 'delimit single minbr 0' with linespoints ls 1, ExtData1 using 1:3 title 'delimit multi minbr 0' with linespoints ls 2, ExtData1 using 1:4 title 'delimit single minbr default' with linespoints ls 3, ExtData1 using 1:5 title 'delimit multi minbr default' with linespoints ls 4, ExtData1 using 1:6 title 'PTP minbr default' with linespoints ls 5 set xlabel "index" set ylabel "Tree Score" ExtData1_10000 = 'workfile_tree_scores_Ne10000' ExtData1_100000 = 'workfile_tree_scores_Ne100000' ExtData1_500000 = 'workfile_tree_scores_Ne500000' ExtData1_1000000 = 'workfile_tree_scores_Ne1000000' set title "Kassian Tree Score similar GMYC taxa delimit single minbr 0" set output 'plots/tree_scores_delimit_single_minbr_0.png' plot ExtData1_10000 using 1:single_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_0 title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:single_0 title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_0 title 'Ne1000000' with linespoints ls 4 set title "Kassian Tree Score similar GMYC taxa delimit multi minbr 0" set output 'plots/tree_scores_delimit_multi_minbr_0.png' plot ExtData1_10000 using 1:multi_0 title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_0 title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:multi_0 title 'Ne500000' with 
linespoints ls 3, ExtData1_1000000 using 1:multi_0 title 'Ne1000000' with linespoints ls 4 set title "Kassian Tree Score similar GMYC taxa delimit single minbr default" set output 'plots/tree_scores_delimit_single_minbr_default.png' plot ExtData1_10000 using 1:single_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:single_default title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:single_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:single_default title 'Ne1000000' with linespoints ls 4 set title "Kassian Tree Score similar GMYC taxa delimit multi minbr default" set output 'plots/tree_scores_delimit_multi_minbr_default.png' plot ExtData1_10000 using 1:multi_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:multi_default title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:multi_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:multi_default title 'Ne1000000' with linespoints ls 4 set title "Kassian Tree Score similar GMYC taxa PTP minbr default" set output 'plots/tree_scores_PTP_minbr_default.png' plot ExtData1_10000 using 1:ptp_default title 'Ne10000' with linespoints ls 1, ExtData1_100000 using 1:ptp_default title 'Ne10000' with linespoints ls 2, ExtData1_500000 using 1:ptp_default title 'Ne500000' with linespoints ls 3, ExtData1_1000000 using 1:ptp_default title 'Ne1000000' with linespoints ls 4 # NMI score set title "Average NMI Score similar GMYC taxa" set xlabel "Set number" set ylabel "Average NMI score" ExtData1 = 'workfile_nmi_scores' set output 'plots/average_nmi_scores.png' plot ExtData1 using 1:2 title 'delimit single minbr 0' with linespoints ls 1, ExtData1 using 1:3 title 'delimit multi minbr 0' with linespoints ls 2, ExtData1 using 1:4 title 'delimit single minbr default' with linespoints ls 3, ExtData1 using 1:5 title 'delimit multi minbr default' with linespoints ls 4, ExtData1 using 1:6 title 'PTP minbr default' with linespoints ls 5 
# Build the "plot" command that draws data column `col` from the four per-Ne
# work files (ExtData1_10000 ... ExtData1_1000000), one series per effective
# population size. `col` is the NAME of a column-index variable (single_0,
# multi_0, single_default, multi_default, ptp_default, real); those variables
# are not set in this part of the script -- presumably defined earlier.
# Fix: the ExtData1_100000 series used to be titled 'Ne10000', so two curves
# carried the same legend label in every plot.
ne_plot(col) = sprintf("plot " . \
    "ExtData1_10000 using 1:%s title 'Ne10000' with linespoints ls 1, " . \
    "ExtData1_100000 using 1:%s title 'Ne100000' with linespoints ls 2, " . \
    "ExtData1_500000 using 1:%s title 'Ne500000' with linespoints ls 3, " . \
    "ExtData1_1000000 using 1:%s title 'Ne1000000' with linespoints ls 4", \
    col, col, col, col)

# NMI score, one plot per delimitation method
set xlabel "index"
set ylabel "NMI Score"
ExtData1_10000 = 'workfile_nmi_scores_Ne10000'
ExtData1_100000 = 'workfile_nmi_scores_Ne100000'
ExtData1_500000 = 'workfile_nmi_scores_Ne500000'
ExtData1_1000000 = 'workfile_nmi_scores_Ne1000000'

set title "NMI Score similar GMYC taxa delimit single minbr 0"
set output 'plots/nmi_scores_delimit_single_minbr_0.png'
eval ne_plot("single_0")

set title "NMI Score similar GMYC taxa delimit multi minbr 0"
set output 'plots/nmi_scores_delimit_multi_minbr_0.png'
eval ne_plot("multi_0")

set title "NMI Score similar GMYC taxa delimit single minbr default"
set output 'plots/nmi_scores_delimit_single_minbr_default.png'
eval ne_plot("single_default")

set title "NMI Score similar GMYC taxa delimit multi minbr default"
set output 'plots/nmi_scores_delimit_multi_minbr_default.png'
eval ne_plot("multi_default")

set title "NMI Score similar GMYC taxa PTP minbr default"
set output 'plots/nmi_scores_PTP_minbr_default.png'
eval ne_plot("ptp_default")

# number of species
set title "Average Number of Species similar GMYC taxa"
set xlabel "Set number"
set ylabel "Average number of species"
ExtData1 = 'workfile_num_species'
set output 'plots/average_num_species.png'
plot ExtData1 using 1:2 title 'delimit single minbr 0' with linespoints ls 1, \
     ExtData1 using 1:3 title 'delimit multi minbr 0' with linespoints ls 2, \
     ExtData1 using 1:4 title 'delimit single minbr default' with linespoints ls 3, \
     ExtData1 using 1:5 title 'delimit multi minbr default' with linespoints ls 4, \
     ExtData1 using 1:6 title 'PTP minbr default' with linespoints ls 5, \
     ExtData1 using 1:7 title 'real' with linespoints ls 7

set xlabel "index"
set ylabel "Number of Species"
ExtData1_10000 = 'workfile_num_species_Ne10000'
ExtData1_100000 = 'workfile_num_species_Ne100000'
ExtData1_500000 = 'workfile_num_species_Ne500000'
ExtData1_1000000 = 'workfile_num_species_Ne1000000'

set title "Number of Species similar GMYC taxa delimit single minbr 0"
set output 'plots/num_species_delimit_single_minbr_0.png'
eval ne_plot("single_0")

set title "Number of Species similar GMYC taxa delimit multi minbr 0"
set output 'plots/num_species_delimit_multi_minbr_0.png'
eval ne_plot("multi_0")

set title "Number of Species similar GMYC taxa delimit single minbr default"
set output 'plots/num_species_delimit_single_minbr_default.png'
eval ne_plot("single_default")

set title "Number of Species similar GMYC taxa delimit multi minbr default"
set output 'plots/num_species_delimit_multi_minbr_default.png'
eval ne_plot("multi_default")

set title "Number of Species similar GMYC taxa PTP minbr default"
set output 'plots/num_species_PTP_minbr_default.png'
eval ne_plot("ptp_default")

set title "Number of Species similar GMYC taxa real"
set output 'plots/num_species_real.png'
eval ne_plot("real")

# delta number of species
set xlabel "index"
set ylabel "Delta Number of Species"
ExtData1_10000 = 'workfile_delta_species_Ne10000'
ExtData1_100000 = 'workfile_delta_species_Ne100000'
ExtData1_500000 = 'workfile_delta_species_Ne500000'
ExtData1_1000000 = 'workfile_delta_species_Ne1000000'

set title "Delta Number of Species similar GMYC taxa delimit single minbr 0"
set output 'plots/delta_species_delimit_single_minbr_0.png'
eval ne_plot("single_0")

set title "Delta Number of Species similar GMYC taxa delimit multi minbr 0"
set output 'plots/delta_species_delimit_multi_minbr_0.png'
eval ne_plot("multi_0")

set title "Delta Number of Species similar GMYC taxa delimit single minbr default"
set output 'plots/delta_species_delimit_single_minbr_default.png'
eval ne_plot("single_default")

set title "Delta Number of Species similar GMYC taxa delimit multi minbr default"
set output 'plots/delta_species_delimit_multi_minbr_default.png'
eval ne_plot("multi_default")

set title "Delta Number of Species similar GMYC taxa PTP minbr default"
set output 'plots/delta_species_PTP_minbr_default.png'
eval ne_plot("ptp_default")

set title "Delta Number of Species similar GMYC taxa real"
set output 'plots/delta_species_real.png'
eval ne_plot("real")

# single lambda score
set title "Average Single Lambda Score similar GMYC taxa"
set xlabel "Set number"
set ylabel "Average single lambda score"
ExtData1 = 'workfile_single_scores'
set output 'plots/average_single_scores.png'
plot ExtData1 using 1:2 title 'delimit single minbr 0' with linespoints ls 1, \
     ExtData1 using 1:3 title 'delimit multi minbr 0' with linespoints ls 2, \
     ExtData1 using 1:4 title 'delimit single minbr default' with linespoints ls 3, \
     ExtData1 using 1:5 title 'delimit multi minbr default' with linespoints ls 4, \
     ExtData1 using 1:6 title 'PTP minbr default' with linespoints ls 5, \
     ExtData1 using 1:7 title 'real minbr 0' with linespoints ls 7, \
     ExtData1 using 1:8 title 'real minbr default' with linespoints ls 8

set xlabel "index"
set ylabel "Single Lambda Score"
ExtData1_10000 = 'workfile_single_scores_Ne10000'
ExtData1_100000 = 'workfile_single_scores_Ne100000'
ExtData1_500000 = 'workfile_single_scores_Ne500000'
ExtData1_1000000 = 'workfile_single_scores_Ne1000000'

set title "Single Lambda Score similar GMYC taxa delimit single minbr 0"
set output 'plots/single_scores_delimit_single_minbr_0.png'
eval ne_plot("single_0")

set title "Single Lambda Score similar GMYC taxa delimit multi minbr 0"
set output 'plots/single_scores_delimit_multi_minbr_0.png'
eval ne_plot("multi_0")

set title "Single Lambda Score similar GMYC taxa delimit single minbr default"
set output 'plots/single_scores_delimit_single_minbr_default.png'
eval ne_plot("single_default")

set title "Single Lambda Score similar GMYC taxa delimit multi minbr default"
set output 'plots/single_scores_delimit_multi_minbr_default.png'
eval ne_plot("multi_default")

set title "Single Lambda Score similar GMYC taxa PTP minbr default"
set output 'plots/single_scores_PTP_minbr_default.png'
eval ne_plot("ptp_default")

# multi lambda score
set title "Average Multi Lambda Score similar GMYC taxa"
set xlabel "Set number"
set ylabel "Average multi lambda score"
ExtData1 = 'workfile_multi_scores'
set output 'plots/average_multi_scores.png'
plot ExtData1 using 1:2 title 'delimit single minbr 0' with linespoints ls 1, \
     ExtData1 using 1:3 title 'delimit multi minbr 0' with linespoints ls 2, \
     ExtData1 using 1:4 title 'delimit single minbr default' with linespoints ls 3, \
     ExtData1 using 1:5 title 'delimit multi minbr default' with linespoints ls 4, \
     ExtData1 using 1:6 title 'PTP minbr default' with linespoints ls 5, \
     ExtData1 using 1:7 title 'real minbr 0' with linespoints ls 7, \
     ExtData1 using 1:8 title 'real minbr default' with linespoints ls 8

set xlabel "index"
set ylabel "Multi Lambda Score"
ExtData1_10000 = 'workfile_multi_scores_Ne10000'
ExtData1_100000 = 'workfile_multi_scores_Ne100000'
ExtData1_500000 = 'workfile_multi_scores_Ne500000'
ExtData1_1000000 = 'workfile_multi_scores_Ne1000000'

set title "Multi Lambda Score similar GMYC taxa delimit single minbr 0"
set output 'plots/multi_scores_delimit_single_minbr_0.png'
eval ne_plot("single_0")

set title "Multi Lambda Score similar GMYC taxa delimit multi minbr 0"
set output 'plots/multi_scores_delimit_multi_minbr_0.png'
eval ne_plot("multi_0")

set title "Multi Lambda Score similar GMYC taxa delimit single minbr default"
set output 'plots/multi_scores_delimit_single_minbr_default.png'
eval ne_plot("single_default")

set title "Multi Lambda Score similar GMYC taxa delimit multi minbr default"
set output 'plots/multi_scores_delimit_multi_minbr_default.png'
eval ne_plot("multi_default")

set title "Multi Lambda Score similar GMYC taxa PTP minbr default"
set output 'plots/multi_scores_PTP_minbr_default.png'
eval ne_plot("ptp_default")

reset;
mptp-0.2.2/src/python/rewrite_species_result_file_GMYC.py000077500000000000000000000032441304415103400235660ustar00rootroot00000000000000#! 
/usr/bin/env python import os def rewrite_species_result(input_species_file, output_species_file): try: with open(input_species_file) as f: content = f.read().splitlines() f.close() largestSpecies = 0 taxaList = [] assignments = {} for i in range(1,151): line = content[i] line = " ".join(line.split()) species_idx = int(line.split(' ')[1]) taxon_name = line.split(' ')[2] if (species_idx > largestSpecies): assignments[species_idx] = [] largestSpecies = species_idx assignments[species_idx].append(taxon_name) if not os.path.exists(os.path.dirname(output_species_file)): os.makedirs(os.path.dirname(output_species_file)) speciesOut = open(output_species_file, 'w') speciesOut.write("Species 1:\n") for j in range(0, len(assignments[1])): speciesOut.write(assignments[1][j] + "\n") for i in range(2, largestSpecies + 1): speciesOut.write("\nSpecies " + str(i) + ":\n") for j in range(0, len(assignments[i])): speciesOut.write(assignments[i][j] + "\n") speciesOut.close() except IOError: print "File not found: " + input_species_file set_names = ["Ne1e+05", "Ne1e+06", "Ne5e+05", "Ne10000"] #rewrite_species_result("gmyc_results_SimulB_C/set_Ne1e+05/gmyc_results_set_Ne1e+05.1.txt", "SimulB_C_gmyc_minbr_0/set_Ne1e+05/gmyc_results_set_Ne1e+05.1.txt") for set_name in set_names: for i in range(1,101): input_species_file = "gmyc_results_SimulB_C/set_" + set_name + "/gmyc_results_set_" + set_name + "." + str(i) + ".txt" output_species_file = "SimulB_C_gmyc_minbr_0/set_" + set_name + "/gmyc_results_set_" + set_name + "." + str(i) + ".txt" rewrite_species_result(input_species_file, output_species_file) mptp-0.2.2/src/python/rewrite_species_result_file_PTP.py000077500000000000000000000036701304415103400234750ustar00rootroot00000000000000#! 
/usr/bin/env python import os def rewrite_species_result(input_species_file, output_species_file): try: with open(input_species_file) as f: content = f.read().splitlines() f.close() taxaListString = content[0].split(':')[1] taxaList = taxaListString.split(',') speciesList = content[1].split(',') if not os.path.exists(os.path.dirname(output_species_file)): os.makedirs(os.path.dirname(output_species_file)) speciesOut = open(output_species_file, 'w') oldSpeciesIdx = speciesList[0] speciesOut.write("Species " + oldSpeciesIdx + ":\n") speciesOut.write(taxaList[0] + "\n") for i in range(1,len(speciesList)): if (speciesList[i] == oldSpeciesIdx): speciesOut.write(taxaList[i] + "\n") else: oldSpeciesIdx = speciesList[i] speciesOut.write("\nSpecies " + oldSpeciesIdx + ":\n") speciesOut.write(taxaList[i] + "\n") speciesOut.close() except IOError: print "File not found: " + input_species_file set_names = ["Ne10000", "Ne100000", "Ne500000", "Ne1000000"] for set_name in set_names: for i in range(1,101): if set_name == "Ne10000": input_species_file = "similar_to_GMYC/15-08-2015.16-40/set_BIRTH0.27_" + set_name + "/PTP_result_BIRTH0.27_" + set_name + "_" + str(i) + ".PTPPartitions.txt" elif set_name == "Ne500000": input_species_file = "similar_to_GMYC/15-08-2015.16-40/set_BIRTH0.27_" + set_name + "/PTP_BIRTH0.27_" + set_name + "_" + str(i) + ".PTPPartitions.txt" elif set_name == "Ne100000": input_species_file = "similar_to_GMYC/15-08-2015.16-40/set_BIRTH0.27_" + set_name + "/PTP_BIRTH0.27_" + set_name + "_" + str(i) + ".PTPPartitions.txt" else: input_species_file = "similar_to_GMYC/15-08-2015.16-40/set_BIRTH0.27_" + set_name + "/PTP_result." + str(i) + ".PTPPartitions.txt" output_species_file = "similar_to_GMYC_PTP_minbr_default/set_" + set_name + "/PTP_results_set_" + set_name + "." 
+ str(i) + ".txt" rewrite_species_result(input_species_file, output_species_file) mptp-0.2.2/src/random.c000066400000000000000000000073601304415103400147070ustar00rootroot00000000000000/* Copyright (C) 2015 Tomas Flouri, Sarah Lutteropp This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . Contact: Tomas Flouri , Heidelberg Institute for Theoretical Studies, Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany */ #include "mptp.h" static long min_species; static long max_species; static long species_count; static unsigned short * g_rstate; static int cb_node_select(rtree_t * node) { double rand_double = 0; if (!node->edge_count) return 0; /* check if not selecting node is possible */ if (min_species+1 > species_count) { /* we must select the node */ node->event = EVENT_COALESCENT; max_species = max_species - node->max_species_count + 1; return 0; } /* check if selecting the node is possible */ if (max_species - node->max_species_count + 1 < species_count) { /* we must NOT select the node */ node->event = EVENT_SPECIATION; min_species = min_species+1; return 1; } /* otherwise, we just throw a coin and select one of the two cases */ rand_double = erand48(g_rstate); if (rand_double >= 0.5) { /* don't select */ node->event = EVENT_SPECIATION; min_species = min_species+1; return 1; } /* otherwise select node */ node->event = EVENT_COALESCENT; max_species = max_species - node->max_species_count + 1; return 0; } double 
random_delimitation(rtree_t * root, long * delimited_species, long * coal_edge_count, double * coal_edgelen_sum, long * spec_edge_count, double * spec_edgelen_sum, double * coal_score, unsigned short * rstate) { int edge_count = 0; long i; long rand_long = 0; double logl = 0; double edgelen_sum = 0; /* initialize */ min_species = 1; max_species = root->max_species_count; g_rstate = rstate; rand_long = nrand48(rstate); if (!root->max_species_count) species_count = (rand_long % root->leaves) + 1; else species_count = (rand_long % root->max_species_count) + 1; rtree_t ** inner_node_list = (rtree_t **)xmalloc((size_t)species_count * sizeof(rtree_t *)); long count = rtree_traverse(root, cb_node_select, rstate, inner_node_list); for (i = 0; i < count; ++i) { logl += inner_node_list[i]->coal_logl; edge_count += inner_node_list[i]->edge_count; edgelen_sum += inner_node_list[i]->edgelen_sum; } *coal_score = logl; /* if we have PTP single logl is different */ if (opt_method == PTP_METHOD_SINGLE) logl = loglikelihood(edge_count, edgelen_sum); /* append speciation part log-likelihood */ logl += loglikelihood(root->edge_count - edge_count, root->edgelen_sum - edgelen_sum); free(inner_node_list); assert(count == species_count); *delimited_species = species_count; *coal_edge_count = edge_count; *coal_edgelen_sum = edgelen_sum; *spec_edge_count = root->edge_count - edge_count; *spec_edgelen_sum = root->edgelen_sum - edgelen_sum; return logl; } mptp-0.2.2/src/rtree.c000066400000000000000000000376441304415103400145600ustar00rootroot00000000000000/* Copyright (C) 2015 Tomas Flouri This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

    Contact: Tomas Flouri ,
    Heidelberg Institute for Theoretical Studies,
    Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
*/

#include "mptp.h"

/* number of columns used per indentation level in the ASCII rendering */
static int indend_space = 4;

/* print " <label> <length>" followed by a newline for one node */
static void print_node_info(rtree_t * tree)
{
  printf (" %s", tree->label);
  printf (" %f", tree->length);
  printf("\n");
}

/* Recursively print the subtree rooted at tree as ASCII art.

   indend_level is the depth of tree (callers start at 1 for the children of
   the root). active_node_order[d] records, for depth d, whether a vertical
   bar must still be drawn in that column: 0 = none, 1 = the left child is
   being printed, 2 = the right child is being printed. The array is updated
   in place while recursing, so the order of the statements below matters. */
static void print_tree_recurse(rtree_t * tree,
                               int indend_level,
                               int * active_node_order)
{
  int i,j;

  if (!tree) return;

  /* spacer line: bars for every ancestor column that is still open */
  for (i = 0; i < indend_level; ++i)
  {
    if (active_node_order[i])
      printf("|");
    else
      printf(" ");

    for (j = 0; j < indend_space-1; ++j)
      printf(" ");
  }
  printf("\n");

  /* node line: same prefix, one level shorter, then the "+---" connector */
  for (i = 0; i < indend_level-1; ++i)
  {
    if (active_node_order[i])
      printf("|");
    else
      printf(" ");

    for (j = 0; j < indend_space-1; ++j)
      printf(" ");
  }

  printf("+");
  for (j = 0; j < indend_space-1; ++j)
    printf ("-");
  if (tree->left || tree->right) printf("+");

  print_node_info(tree);

  /* this node was its parent's right child, so the parent's column closes */
  if (active_node_order[indend_level-1] == 2)
    active_node_order[indend_level-1] = 0;

  active_node_order[indend_level] = 1;
  print_tree_recurse(tree->left,
                     indend_level+1,
                     active_node_order);
  active_node_order[indend_level] = 2;
  print_tree_recurse(tree->right,
                     indend_level+1,
                     active_node_order);
}

/* Return the deepest indentation level reached below tree, counting the
   NULL children of tips as one extra level (a single node yields indend+1). */
static int tree_indend_level(rtree_t * tree, int indend)
{
  if (!tree) return indend;

  int a = tree_indend_level(tree->left, indend+1);
  int b = tree_indend_level(tree->right, indend+1);

  return (a > b ? 
a : b);
}

/* Print an ASCII rendering of the tree rooted at tree to stdout.
   Does nothing for a NULL tree. */
void rtree_show_ascii(rtree_t * tree)
{
  /* a NULL tree used to get a 1-element buffer that was then written at
     index 1, followed by a NULL dereference */
  if (!tree) return;

  /* >= 1 for any non-NULL tree, so indices 0 and 1 below are in range */
  int indend_max = tree_indend_level(tree,0);

  /* xmalloc instead of an unchecked malloc, as in the rest of this file */
  int * active_node_order = (int *)xmalloc((size_t)(indend_max+1) *
                                           sizeof(int));
  active_node_order[0] = 1;
  active_node_order[1] = 1;

  print_node_info(tree);
  print_tree_recurse(tree->left, 1, active_node_order);
  active_node_order[0] = 2;
  print_tree_recurse(tree->right, 1, active_node_order);

  free(active_node_order);
}

/* Serialise the subtree rooted at root in Newick notation, without the
   terminating ';'. A node missing either child is written as
   "label:length"; any other node as "(left,right)support:length", where the
   support value is emitted only when opt_mcmc is set.

   Returns a string allocated by asprintf (the caller frees it), or NULL when
   root is NULL. Calls fatal() if asprintf fails. */
static char * rtree_export_newick_recursive(rtree_t * root)
{
  char * newick;
  char * support = NULL;

  if (!root) return NULL;

  if (!(root->left) || !(root->right))
  {
    if (asprintf(&newick, "%s:%f", root->label, root->length) == -1)
      fatal("Unable to allocate enough memory.");
  }
  else
  {
    char * subtree1 = rtree_export_newick_recursive(root->left);
    char * subtree2 = rtree_export_newick_recursive(root->right);

    if (opt_mcmc)
      if (asprintf(&support, "%f", root->support) == -1)
        fatal("Unable to allocate enough memory.");

    if (asprintf(&newick,
                 "(%s,%s)%s:%f",
                 subtree1,
                 subtree2,
                 (opt_mcmc) ? support : "",
                 root->length) == -1)
      fatal("Unable to allocate enough memory.");

    if (opt_mcmc)
      free(support);

    free(subtree1);
    free(subtree2);
  }

  return newick;
}

/* Serialise the whole tree in Newick notation; same layout as
   rtree_export_newick_recursive, with a terminating ';' after the root.
   NOTE(review): a tree that is a single tip is written without the ';' --
   confirm whether callers rely on that. */
char * rtree_export_newick(rtree_t * root)
{
  char * newick;
  char * support = NULL;

  if (!root) return NULL;

  if (!(root->left) || !(root->right))
  {
    if (asprintf(&newick, "%s:%f", root->label, root->length) == -1)
      fatal("Unable to allocate enough memory.");
  }
  else
  {
    char * subtree1 = rtree_export_newick_recursive(root->left);
    char * subtree2 = rtree_export_newick_recursive(root->right);

    if (opt_mcmc)
      if (asprintf(&support, "%f", root->support) == -1)
        fatal("Unable to allocate enough memory.");

    if (asprintf(&newick, "(%s,%s)%s:%f;", subtree1, subtree2, (opt_mcmc) ? 
support : "", root->length) == -1)
      fatal("Unable to allocate enough memory.");

    if (opt_mcmc)
      free(support);

    free(subtree1);
    free(subtree2);
  }

  return newick;
}

/* Randomised pre-order traversal used by rtree_traverse.

   cbtrav is called once per visited node. When it returns 0 the node is
   appended to outbuffer and its subtree is not entered; otherwise the two
   children are visited in an order chosen by one erand48(rstate) draw.
   *index is the next free slot in outbuffer. */
static void rtree_traverse_recursive(rtree_t * node,
                                     int (*cbtrav)(rtree_t *),
                                     int * index,
                                     unsigned short * rstate,
                                     rtree_t ** outbuffer)
{
  double rand_double = 0;

  /* tip: nothing to descend into, just record it if the callback says so */
  if (!node->left)
  {
    if (!cbtrav(node))
    {
      outbuffer[*index] = node;
      *index = *index + 1;
    }
    return;
  }

  /* inner node for which the callback returned 0: record it and stop here */
  if (!cbtrav(node))
  {
    outbuffer[*index] = node;
    *index = *index + 1;
    return;
  }

  /* otherwise visit both children, in random order */
  rand_double = erand48(rstate);
  if (rand_double >= 0.5)
  {
    rtree_traverse_recursive(node->left, cbtrav, index, rstate, outbuffer);
    rtree_traverse_recursive(node->right, cbtrav, index, rstate, outbuffer);
  }
  else
  {
    rtree_traverse_recursive(node->right, cbtrav, index, rstate, outbuffer);
    rtree_traverse_recursive(node->left, cbtrav, index, rstate, outbuffer);
  }
}

/* Traverse the rooted tree from root, collecting into outbuffer every node
   for which cbtrav returns 0 (see rtree_traverse_recursive). Returns the
   number of nodes stored, or -1 when root has no left child. outbuffer must
   be large enough for every node the callback can select; no bound is
   checked here. */
int rtree_traverse(rtree_t * root,
                   int (*cbtrav)(rtree_t *),
                   unsigned short * rstate,
                   rtree_t ** outbuffer)
{
  int index = 0;

  if (!root->left) return -1;

  /* we will traverse a rooted tree in the following way

              root
               /\
              /  \
           left   right

     at each node the callback function is called to decide whether we
     are going to traverse the subtree rooted at the specific node */

  rtree_traverse_recursive(root, cbtrav, &index, rstate, outbuffer);

  return index;
}

/* Post-order traversal: both subtrees are visited first, then node is
   appended to outbuffer when cbtrav returns non-zero. Note the convention is
   the opposite of rtree_traverse_recursive, which stores on 0. */
static void rtree_traverse_postorder_recursive(rtree_t * node,
                                               int (*cbtrav)(rtree_t *),
                                               int * index,
                                               rtree_t ** outbuffer)
{
  if (!node) return;

  rtree_traverse_postorder_recursive(node->left, cbtrav, index, outbuffer);
  rtree_traverse_postorder_recursive(node->right, cbtrav, index, outbuffer);

  if (cbtrav(node))
  {
    outbuffer[*index] = node;
    *index = *index + 1;
  }
}

/* Collect, in post-order, every node of the tree for which cbtrav returns
   non-zero. Returns the number of nodes stored in outbuffer, or -1 when root
   has no left child. */
int rtree_traverse_postorder(rtree_t * root,
                             int (*cbtrav)(rtree_t *),
                             rtree_t ** outbuffer)
{
  int index = 0;

  if (!root->left) return -1;

  /* we will traverse a rooted tree in the following way

              root
               /\
              /  \
           left   right

     at each node the callback function is called to decide whether to
     place the node in the list */

rtree_traverse_postorder_recursive(root, cbtrav, &index, outbuffer);

  return index;
}

/* Height of the subtree at node, where an absent (NULL) child counts as 1;
   a single node therefore has height 2. */
static int rtree_height_recursive(rtree_t * node)
{
  int left_h;
  int right_h;

  if (node == NULL)
    return 1;

  left_h = rtree_height_recursive(node->left);
  right_h = rtree_height_recursive(node->right);

  return MAX(left_h,right_h)+1;
}

/* Height of the tree as computed by rtree_height_recursive. */
int rtree_height(rtree_t * root)
{
  int height = rtree_height_recursive(root);

  return height;
}

/* Append the tips below node to node_list, left subtree first; *index is
   the next free slot. */
static void rtree_query_tipnodes_recursive(rtree_t * node,
                                           rtree_t ** node_list,
                                           int * index)
{
  if (node == NULL)
    return;

  if (node->left)
  {
    rtree_query_tipnodes_recursive(node->left, node_list, index);
    rtree_query_tipnodes_recursive(node->right, node_list, index);
    return;
  }

  /* no left child: this is a tip */
  node_list[(*index)++] = node;
}

/* Fill node_list with the tips of the tree (left to right) and return how
   many were stored; 0 for a NULL root, 1 when root itself is a tip. */
int rtree_query_tipnodes(rtree_t * root, rtree_t ** node_list)
{
  int stored = 0;

  /* the helper already handles a NULL root and a root that is a tip */
  rtree_query_tipnodes_recursive(root, node_list, &stored);

  return stored;
}

/* Append the inner nodes below (and including) root to node_list in
   post-order; tips and NULL are ignored. */
static void rtree_query_innernodes_recursive(rtree_t * root,
                                             rtree_t ** node_list,
                                             int * index)
{
  if (root == NULL || root->left == NULL)
    return;

  /* children first, then the node itself */
  rtree_query_innernodes_recursive(root->left, node_list, index);
  rtree_query_innernodes_recursive(root->right, node_list, index);

  node_list[(*index)++] = root;
}

/* Fill node_list with the inner nodes of the tree in post-order (root last)
   and return how many were stored; 0 for a NULL root or a single tip. */
int rtree_query_innernodes(rtree_t * root, rtree_t ** node_list)
{
  int stored = 0;

  /* the helper visits left, right, then root itself -- same order as before */
  rtree_query_innernodes_recursive(root, node_list, &stored);

  return stored;
}

void rtree_reset_info(rtree_t * root)
{
  if (!root->left)
  {
    root->leaves = 1;
    root->edge_count = 0;
    root->edgelen_sum = 0;
    return;
  }

  rtree_reset_info(root->left);
  rtree_reset_info(root->right);

  root->leaves = root->left->leaves + root->right->leaves;
  root->edge_count = root->left->edge_count + root->right->edge_count;
  root->edgelen_sum = root->left->edgelen_sum + 
root->right->edgelen_sum; if (root->left->length > opt_minbr) { root->edge_count++; root->edgelen_sum += root->left->length; } if (root->right->length > opt_minbr) { root->edge_count++; root->edgelen_sum += root->right->length; } } void rtree_print_tips(rtree_t * node, FILE * out) { if (node->left) rtree_print_tips(node->left,out); if (node->right) rtree_print_tips(node->right,out); if (!node->left && !node->right) fprintf(out, "%s\n", node->label); } rtree_t * rtree_clone(rtree_t * node, rtree_t * parent) { if (!node) return NULL; /* clone node */ rtree_t * clone = (rtree_t *)xcalloc(1,sizeof(rtree_t)); memcpy(clone,node,sizeof(rtree_t)); clone->parent = parent; clone->data = NULL; if (node->label) clone->label = xstrdup(node->label); /* clone the two subtrees */ clone->left = rtree_clone(node->left, clone); clone->right = rtree_clone(node->right, clone); return clone; } rtree_t ** rtree_tipstring_nodes(rtree_t * root, char * tipstring, unsigned int * tiplist_count) { size_t i; unsigned int k; unsigned int commas_count = 0; char * taxon; unsigned long taxon_len; ENTRY * found = NULL; for (i = 0; i < strlen(tipstring); ++i) if (tipstring[i] == ',') commas_count++; rtree_t ** node_list = (rtree_t **)xmalloc((size_t)(root->leaves) * sizeof(rtree_t *)); rtree_query_tipnodes(root, node_list); rtree_t ** out_node_list = (rtree_t **)xmalloc((size_t)(commas_count+1) * sizeof(rtree_t *)); /* create a hashtable of tip labels */ hcreate(2 * (size_t)(root->leaves)); for (i = 0; i < (unsigned int)(root->leaves); ++i) { ENTRY entry; entry.key = node_list[i]->label; entry.data = node_list[i]; hsearch(entry,ENTER); } char * s = tipstring; k = 0; while (*s) { /* get next tip */ taxon_len = strcspn(s, ","); if (!taxon_len) fatal("Erroneous prune list format (double comma)/taxon missing"); taxon = xstrndup(s, taxon_len); /* search tip in hash table */ ENTRY query; query.key = taxon; found = NULL; found = hsearch(query,FIND); if (!found) fatal("Taxon %s in does not appear in the 
tree", taxon); /* store pointer in output list */ out_node_list[k++] = (rtree_t *)(found->data); /* free tip label, and move to the beginning of next tip if available */ free(taxon); s += taxon_len; if (*s == ',') s += 1; } /* kill the hash table */ hdestroy(); free(node_list); /* return number of tips in the list */ *tiplist_count = commas_count + 1; /* return tip node list */ return out_node_list; } /* fill path with nodes of the path tip to root */ static void fill_path(rtree_t ** path, int * path_len, rtree_t * tip) { int i = 0; while (tip) { path[i++] = tip; tip = tip->parent; } *path_len = i; } rtree_t * rtree_lca(rtree_t * root, rtree_t ** tip_nodes, unsigned int count) { unsigned int i; rtree_t *** path; assert(count >= 2); /* allocate path arrays for count tip nodes */ path = (rtree_t ***)xmalloc((size_t)count * sizeof(rtree_t **)); int * path_len = (int *)xmalloc((size_t)count * sizeof(int)); /* for each tip node fill corresponding path array with all nodes in the path to the root node and store the length of the path */ for (i = 0; i < count; ++i) { path[i] = (rtree_t **)xmalloc((size_t)(rtree_height(root)) * sizeof(rtree_t *)); fill_path(path[i], &(path_len[i]), tip_nodes[i]); } /* find the LCA using a breadth-first-search traversal starting from the root. 
Since all paths start at the root, the LCA is the parent of nodes that differ in the paths when encountered for the first time */ rtree_t * lca = NULL; while (!lca) { for (i = 0; i < count; ++i) --path_len[i]; for (i = 1; i < count; ++i) { if (path[i-1][path_len[i-1]] != path[i][path_len[i]]) { lca = path[i][path_len[i]+1]; break; } } } /* free allocated memory */ for (i = 0; i < count; ++i) free(path[i]); free(path); free(path_len); return lca; } rtree_t * get_outgroup_lca(rtree_t * root) { unsigned int og_tips_count; rtree_t * og_root; rtree_t ** og_tips; og_tips = rtree_tipstring_nodes(root, opt_outgroup, &og_tips_count); if (og_tips_count > 1) og_root = rtree_lca(root, og_tips, og_tips_count); else og_root = og_tips[0]; free(og_tips); return og_root; } rtree_t * rtree_crop(rtree_t * root, rtree_t * crop_root) { /* check if the selected subtree can be cropped */ if (root->leaves - crop_root->leaves < 2) return NULL; /* subtree can be cropped, distinguish between two cases: */ if (crop_root->parent == root) { /* Case 1: root * / \ A A * * crop_root ----> * / \ * * in this case the subtree rooted at crop_root is cropped, the root node is eliminated and subtree rooted at A becomes the new tree */ rtree_t * new_root; if (root->left == crop_root) { new_root = root->right; root->right = NULL; } else { new_root = root->left; root->left = NULL; } rtree_destroy(root); new_root->parent = NULL; rtree_reset_info(new_root); return new_root; } /* Case 2: root * / \ A * - \ root * B ----> * / \ / \ C * * crop_root A * - / \ \ * * * C in this case the subtree rooted at crop_root is cropped, the root node is eliminated and subtree rooted at A becomes the new tree */ rtree_t * b = crop_root->parent; rtree_t * c; /* get C and break the link between B and C */ if (b->left == crop_root) { c = b->right; b->right = NULL; } else { c = b->left; b->left = NULL; } /* link the parent of B with C from both directions */ c->parent = b->parent; if (b->parent->left == b) b->parent->left = c; 
else b->parent->right = c; c->length += b->length; rtree_destroy(b); rtree_reset_info(root); return root; } mptp-0.2.2/src/svg.c000066400000000000000000000250561304415103400142300ustar00rootroot00000000000000/* Copyright (C) 2015 Tomas Flouri This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . Contact: Tomas Flouri , Heidelberg Institute for Theoretical Studies, Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany */ #include "mptp.h" #define GRADIENT(x) (1-x)*100 static double scaler = 0; static long legend_spacing = 10; static FILE * svg_fp; static double max_font_len = 0; static double max_tree_len = 0; static double canvas_width; static char * const speciation_color = "#31a354"; static char * const coalesence_color = "#ff0000"; static int tip_occ = 0; typedef struct coord_s { double x; double y; } coord_t; static coord_t * create_coord(double x, double y) { coord_t * coord = (coord_t *)xmalloc(sizeof(coord_t)); coord->x = x; coord->y = y; return coord; } static void svg_line(double x1, double y1, double x2, double y2, const char * color, double stroke_width) { fprintf(svg_fp, "\n", x1, y1, x2, y2, color, stroke_width); } static void svg_circle(double cx, double cy, double r, const char * color) { fprintf(svg_fp, "\n", cx, cy, r, color, color); /* animation effect fprintf(svg_fp, "\n", (long)r, (long)r+5); fprintf(svg_fp, "\n\n", (long)r); */ } static void svg_text(double x, double y, long fontsize, const char * text) { 
fprintf(svg_fp, "" "%s\n", x,y,fontsize,text); } static void rtree_set_xcoord(rtree_t * node) { /* create the coordinate info of the node's scaled branch length (edge towards root) */ coord_t * coord = create_coord(node->length * scaler, 0); node->data = (void *)coord; /* if the node has a parent then add the x coord of the parent such that the branch is shifted towards right, otherwise, if the node is the root, align it with the left margin */ if (node->parent) coord->x += ((coord_t *)(node->parent->data))->x; else { coord->x = opt_svg_marginleft; } if (!node->left) return; /* recursively set coordinates of the other nodes in a pre-order fashion */ rtree_set_xcoord(node->left); rtree_set_xcoord(node->right); } static void svg_rtree_plot(rtree_t * node) { char * current_color; double y; double stroke_width = 3; /* traverse tree in post-order */ if (node->left) { svg_rtree_plot(node->left); svg_rtree_plot(node->right); } /* any node that has a parent, i.e. any node apart from the root */ if (node->parent) { double x,px; x = ((coord_t *)(node->data))->x; px = ((coord_t *)(node->parent->data))->x; if (!node->left) { y = tip_occ * opt_svg_tipspace + opt_svg_margintop + legend_spacing; tip_occ++; } else { double ly,ry; ly = ((coord_t *)(node->left->data))->y; ry = ((coord_t *)(node->right->data))->y; y = (ly + ry) / 2.0; /* decide the color */ if (opt_mcmc) { if (asprintf(¤t_color, "rgb(%f%%,%f%%,%f%%)", GRADIENT(node->support), 0.0, 0.0) == -1) fatal("Unable to allocate enough memory."); } else if (node->event == EVENT_COALESCENT) current_color = coalesence_color; else if (node->event == EVENT_SPECIATION) current_color = speciation_color; else assert(0); /* draw a vertical line and a circle in the middle */ svg_line(x, ly, x, ry, current_color, stroke_width); svg_circle(x, y, opt_svg_inner_radius, current_color); /* deallocate color if mcmc */ if (opt_mcmc) free(current_color); /* if support value greater than threshold output it */ if (opt_mcmc) { if (node->support > 
0.5) { char * support; if (asprintf(&support, "%.2f", node->support) == -1) fatal("Unable to allocate enough memory."); svg_text(x-5,y-5,opt_svg_fontsize,support); free(support); } } } /* decide the color based on the parent node */ if (opt_mcmc) { if (asprintf(¤t_color, "rgb(%f%%,%f%%,%f%%)", GRADIENT(node->parent->support), 0.0, 0.0) == -1) fatal("Unable to allocate enough memory."); } else if (node->parent->event == EVENT_COALESCENT) current_color = coalesence_color; else if (node->parent->event == EVENT_SPECIATION) current_color = speciation_color; else assert(0); /* draw horizontal line */ svg_line(px,y,x,y,current_color,stroke_width); ((coord_t *)(node->data))->y = y; if (opt_mcmc) free(current_color); /* if node is a tip then print its label */ if (!node->left) { fprintf(svg_fp, "%s\n", x+5, y+opt_svg_fontsize/3.0, opt_svg_fontsize, node->label); } else fprintf(svg_fp, "\n"); } else /* the root node case */ { double ly,ry,x; // lx = ((coord_t *)(node->left->data))->x; ly = ((coord_t *)(node->left->data))->y; // rx = ((coord_t *)(node->right->data))->x; ry = ((coord_t *)(node->right->data))->y; y = (ly + ry) / 2.0; x = opt_svg_marginleft; /* decide the color */ if (opt_mcmc) { if (asprintf(¤t_color, "rgb(%f%%,%f%%,%f%%)", GRADIENT(node->support), 0.0, 0.0) == -1) fatal("Unable to allocate enough memory."); } else if (node->event == EVENT_COALESCENT) current_color = coalesence_color; else if (node->event == EVENT_SPECIATION) current_color = speciation_color; else assert(0); svg_line(x,ly,x,ry,current_color,stroke_width); svg_circle(x,y,opt_svg_inner_radius,current_color); if (opt_mcmc) free(current_color); if (opt_mcmc) { if (node->support > 0.5) { char * support; if (asprintf(&support, "%.2f", node->support) == -1) fatal("Unable to allocate enough memory."); svg_text(x-5,y-5,opt_svg_fontsize,support); free(support); } } } } static void rtree_scaler_init(rtree_t * root) { double len = 0; double label_len; int i; rtree_t ** node_list = (rtree_t 
**)malloc((size_t)(2 * root->leaves - 1) * sizeof(rtree_t *)); rtree_query_tipnodes(root, node_list); /* find longest path to root */ for (i = 0; i < root->leaves; ++i) { rtree_t * node = node_list[i]; len = 0; while(node) { len += node->length; node = node->parent; } /* subtract root length */ len -= root->length; if (len > max_tree_len) max_tree_len = len; label_len = (opt_svg_fontsize / 1.5) * (node_list[i]->label ? strlen(node_list[i]->label) : 0); len = (canvas_width - label_len) / len; if (i == 0) { scaler = len; max_font_len = label_len; } else if (len < scaler) { scaler = len; max_font_len = label_len; } } free(node_list); } static void svg_rtree_init(rtree_t * root) { long svg_height; canvas_width = opt_svg_width - opt_svg_marginleft - opt_svg_marginright; /* initialize pixel scaler (scaler) and compute max tree length (max_tree_len) */ rtree_scaler_init(root); svg_height = opt_svg_margintop + legend_spacing + opt_svg_marginbottom + opt_svg_tipspace * root->leaves; /* print svg header tag with dimensions and grey border */ fprintf(svg_fp, "\n", opt_svg_width, svg_height); /* draw legend */ if (opt_svg_showlegend) { svg_line(opt_svg_marginleft, 10, (canvas_width - max_font_len)*opt_svg_legend_ratio + opt_svg_marginleft, 10, speciation_color, 3); fprintf(svg_fp, "%.*f\n", (canvas_width - max_font_len)*opt_svg_legend_ratio + opt_svg_marginleft + 5, 20-opt_svg_fontsize/3.0, (long)opt_svg_fontsize, opt_precision, max_tree_len * opt_svg_legend_ratio); } /* uncomment to print a dashed border to indicate margins */ /* fprintf(svg_fp, "\n", opt_svg_marginleft, opt_svg_margintop + legend_spacing, opt_svg_width - opt_svg_marginleft - opt_svg_marginright, svg_height - opt_svg_margintop - legend_spacing - opt_svg_marginbottom); */ rtree_set_xcoord(root); svg_rtree_plot(root); fprintf(svg_fp, "\n"); } void cmd_svg(rtree_t * root, long seed, const char * ext) { /* reset tip occurrence */ tip_occ = 0; if (!opt_quiet) { if (opt_mcmc) fprintf(stdout, "Creating SVG 
delimitation file %s.%ld.svg ...\n", opt_outfile, seed); else fprintf(stdout, "Creating SVG delimitation file %s.svg ...\n", opt_outfile); } svg_fp = open_file_ext(ext, seed); svg_rtree_init(root); fclose(svg_fp); } mptp-0.2.2/src/svg_landscape.c000066400000000000000000000215311304415103400162340ustar00rootroot00000000000000/* Copyright (C) 2015 Tomas Flouri, Sarah Lutteropp This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . Contact: Tomas Flouri , Heidelberg Institute for Theoretical Studies, Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany */ #include "mptp.h" static char line[LINEALLOC]; static double originx = 133; static int xtics = 10; static long canvas_x1 = 130; static long canvas_x2 = 730; static long canvas_y1 = 10; static long canvas_y2 = 360; static int radius = 4; static int radius_mouseover = 10; static int color_index = 2; static char * const color10[] = { "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf" }; static void svg_header(FILE * svg_fp) { fprintf(svg_fp,"\n"); fprintf(svg_fp,"\n"); /* print axes */ fprintf(svg_fp, "\n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" "\n"); fprintf(svg_fp, "\n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" "\n"); fprintf(svg_fp, "\n"); } static void out_svg(FILE * svg_fp, double min_logl, double max_logl, long seed) { double scale = (max_logl - min_logl) * 1.1; /* open data points file */ char 
* filename; if (asprintf(&filename, "%s.%ld.%s", opt_outfile, seed, "log") == -1) fatal("Unable to allocate enough memory."); FILE * fp = xopen(filename,"r"); free(filename); /* read and print data points to svg */ int i = 0; while (fgets(line,LINEALLOC,fp)) { double x,y; double logl; int species; sscanf(line,"%lf,%d\n",&logl,&species); /* compute x point */ x = ((i*opt_mcmc_sample)/(double)(opt_mcmc_steps-opt_mcmc_burnin)) * (canvas_x2 - canvas_x1) + canvas_x1; /* compute y point */ y = (1 - (logl-min_logl)/scale) * (canvas_y2-canvas_y1) + canvas_y1; /* print point */ fprintf(svg_fp, "\n" "\n" "\n" "\n" "\n" "\n", x, y, radius, color10[color_index], color10[color_index], radius, radius_mouseover, radius); ++i; } fclose(fp); } static void svg_footer(FILE * svg_fp, double min_logl, double max_logl) { double scale = (max_logl - min_logl) * 1.1; int i; fprintf(svg_fp, "\n"); /* bring gridlines to front */ fprintf(svg_fp,"\n"); fprintf(svg_fp,"\n"); /* x labels */ fprintf(svg_fp, "\n"); fprintf(svg_fp, "%ld\n", originx, opt_mcmc_burnin); for (i = 0; i < xtics; ++i) { fprintf(svg_fp, "%ld\n", originx + (i+1)*((canvas_x2 - canvas_x1)/(double)xtics), (long)((i+1)*((opt_mcmc_steps-opt_mcmc_burnin)/(double)xtics)) + opt_mcmc_burnin); } fprintf(svg_fp, "\n"); /* y labels */ fprintf(svg_fp, "\n"); fprintf(svg_fp, " %.3f\n", min_logl + scale); fprintf(svg_fp, " %.3f\n", min_logl + 5*(scale)/6); fprintf(svg_fp, " %.3f\n", min_logl + 4*(scale)/6); fprintf(svg_fp, " %.3f\n", min_logl + 3*(scale)/6); fprintf(svg_fp, " %.3f\n", min_logl + 2*(scale)/6); fprintf(svg_fp, " %.3f\n", min_logl + scale/6); fprintf(svg_fp, " %.3f\n", min_logl); fprintf(svg_fp, "\n"); fprintf(svg_fp,"\n"); } void svg_landscape(double mcmc_min_logl, double mcmc_max_logl, long seed) { FILE * svg_fp = open_file_ext("logl.svg", seed); if (!opt_quiet) fprintf(stdout, "Creating log-likelihood visualization in %s.%ld.logl.svg ...\n", opt_outfile, seed); svg_header(svg_fp); out_svg(svg_fp, mcmc_min_logl, 
mcmc_max_logl, seed); svg_footer(svg_fp, mcmc_min_logl, mcmc_max_logl); fclose(svg_fp); } void svg_landscape_combined(double mcmc_min_logl, double mcmc_max_logl, long runs, long *seed) { long i; FILE * svg_fp = open_file_ext("logl.svg", opt_seed); if (!opt_quiet) fprintf(stdout, "Overall log-likelihood visualization in %s.%ld.logl.svg ...\n", opt_outfile, opt_seed); svg_header(svg_fp); for (i = 0; i < runs; ++i) { color_index = i % 10; out_svg(svg_fp, mcmc_min_logl, mcmc_max_logl, seed[i]); } svg_footer(svg_fp, mcmc_min_logl, mcmc_max_logl); fclose(svg_fp); } mptp-0.2.2/src/util.c000066400000000000000000000100251304415103400143740ustar00rootroot00000000000000/* Copyright (C) 2015 Tomas Flouri This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . Contact: Tomas Flouri , Heidelberg Institute for Theoretical Studies, Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany */ #include "mptp.h" static const char * progress_prompt; static unsigned long progress_next; static unsigned long progress_size; static unsigned long progress_chunk; static const unsigned long progress_granularity = 200; void fatal(const char * format, ...) { va_list argptr; va_start(argptr, format); vfprintf(stderr, format, argptr); va_end(argptr); fprintf(stderr, "\n"); exit(1); } void progress_init(const char * prompt, unsigned long size) { if (!opt_quiet) { progress_prompt = prompt; progress_size = size; progress_chunk = size < progress_granularity ? 
1 : size / progress_granularity; progress_next = 0; fprintf(stderr, "%s %.0f%%", prompt, 0.0); } } void progress_update(unsigned int progress) { if (!opt_quiet) { if (progress >= progress_next) { fprintf(stderr, " \r%s %.0f%%", progress_prompt, 100.0 * progress / progress_size); progress_next = progress + progress_chunk; } } } void progress_done() { if (!opt_quiet) fprintf(stderr, " \r%s %.0f%%\n", progress_prompt, 100.0); } #if 0 void * xmalloc(size_t size) { const size_t alignment = 16; void * t = NULL; if (posix_memalign(& t, alignment, size) == -1) fatal("Unable to allocate enough memory."); if (!t) fatal("Unable to allocate enough memory."); return t; } #else void * xmalloc(size_t size) { void * t; t = malloc(size); if (!t) fatal("Unable to allocate enough memory."); return t; } #endif void * xcalloc(size_t nmemb, size_t size) { void * t; t = calloc(nmemb,size); if (!t) fatal("Unable to allocate enough memory."); return t; } void * xrealloc(void *ptr, size_t size) { void * t = realloc(ptr, size); if (!t) fatal("Unable to allocate enough memory."); return t; } char * xstrchrnul(char *s, int c) { char * r = strchr(s, c); if (r) return r; else return (char *)s + strlen(s); } char * xstrdup(const char * s) { size_t len = strlen(s); char * p = (char *)xmalloc(len+1); return strcpy(p,s); } char * xstrndup(const char * s, size_t len) { char * p = (char *)xmalloc(len+1); strncpy(p,s,len); p[len] = 0; return p; } long getusec(void) { struct timeval tv; if(gettimeofday(&tv,0) != 0) return 0; return tv.tv_sec * 1000000 + tv.tv_usec; } void show_rusage() { struct rusage r_usage; getrusage(RUSAGE_SELF, & r_usage); fprintf(stderr, "Time: %.3fs (user)", r_usage.ru_utime.tv_sec * 1.0 + (double) r_usage.ru_utime.tv_usec * 1.0e-6); fprintf(stderr, " %.3fs (sys)", r_usage.ru_stime.tv_sec * 1.0 + r_usage.ru_stime.tv_usec * 1.0e-6); #if defined __APPLE__ /* Mac: ru_maxrss gives the size in bytes */ fprintf(stderr, " Memory: %.0fMB\n", r_usage.ru_maxrss * 1.0e-6); #else /* Linux: 
ru_maxrss gives the size in kilobytes */ fprintf(stderr, " Memory: %.0fMB\n", r_usage.ru_maxrss * 1.0e-3); #endif } FILE * xopen(const char * filename, const char * mode) { FILE * out = fopen(filename, mode); if (!out) fatal("Cannot open file %s", opt_outfile); return out; } void random_init(unsigned short * rstate, long seedval) { /* emulate drand48() */ rstate[0] = 0x330e; rstate[1] = seedval & 0xffffl; rstate[2] = seedval >> 16; } mptp-0.2.2/src/utree.c000066400000000000000000000361341304415103400145540ustar00rootroot00000000000000/* Copyright (C) 2015 Tomas Flouri This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . 
Contact: Tomas Flouri , Heidelberg Institute for Theoretical Studies, Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany */ #include "mptp.h" static int indend_space = 4; static void print_node_info(utree_t * tree) { printf (" %s", tree->label); printf (" %f", tree->length); printf("\n"); } static void print_tree_recurse(utree_t * tree, int indend_level, int * active_node_order) { int i,j; if (!tree) return; for (i = 0; i < indend_level; ++i) { if (active_node_order[i]) printf("|"); else printf(" "); for (j = 0; j < indend_space-1; ++j) printf(" "); } printf("\n"); for (i = 0; i < indend_level-1; ++i) { if (active_node_order[i]) printf("|"); else printf(" "); for (j = 0; j < indend_space-1; ++j) printf(" "); } printf("+"); for (j = 0; j < indend_space-1; ++j) printf ("-"); if (tree->next) printf("+"); print_node_info(tree); if (active_node_order[indend_level-1] == 2) active_node_order[indend_level-1] = 0; if (tree->next) { active_node_order[indend_level] = 1; print_tree_recurse(tree->next->back, indend_level+1, active_node_order); active_node_order[indend_level] = 2; print_tree_recurse(tree->next->next->back, indend_level+1, active_node_order); } } static int tree_indend_level(utree_t * tree, int indend) { if (!tree->next) return indend+1; int a = tree_indend_level(tree->next->back, indend+1); int b = tree_indend_level(tree->next->next->back, indend+1); return (a > b ? a : b); } void utree_show_ascii(utree_t * tree) { int a, b; a = tree_indend_level(tree->back,1); b = tree_indend_level(tree,0); int max_indend_level = (a > b ? 
a : b); int * active_node_order = (int *)malloc((size_t)(max_indend_level+1) * sizeof(int)); active_node_order[0] = 1; active_node_order[1] = 1; print_tree_recurse(tree->back, 1, active_node_order); print_tree_recurse(tree->next->back, 1, active_node_order); active_node_order[0] = 2; print_tree_recurse(tree->next->next->back, 1, active_node_order); free(active_node_order); } static char * newick_utree_recurse(utree_t * root) { char * newick; if (!root->next) { if (asprintf(&newick, "%s:%f", root->label, root->length) == -1) fatal("Unable to allocate enough memory."); } else { char * subtree1 = newick_utree_recurse(root->next->back); char * subtree2 = newick_utree_recurse(root->next->next->back); if (asprintf(&newick, "(%s,%s)%s:%f", subtree1, subtree2, root->label ? root->label : "", root->length) == -1) fatal("Unable to allocate enough memory."); free(subtree1); free(subtree2); } return newick; } char * utree_export_newick(utree_t * root) { char * newick; if (!root) return NULL; char * subtree1 = newick_utree_recurse(root->back); char * subtree2 = newick_utree_recurse(root->next->back); char * subtree3 = newick_utree_recurse(root->next->next->back); if (asprintf(&newick, "(%s,%s,%s)%s:%f;", subtree1, subtree2, subtree3, root->label ? 
root->label : "", root->length) == -1) fatal("Unable to allocate enough memory."); free(subtree1); free(subtree2); free(subtree3); return (newick); } static void utree_traverse_recursive(utree_t * node, int (*cbtrav)(utree_t *), int * index, utree_t ** outbuffer) { if (!node->next) { if (cbtrav(node)) { outbuffer[*index] = node; *index = *index + 1; } return; } if (!cbtrav(node)) return; utree_traverse_recursive(node->next->back, cbtrav, index, outbuffer); utree_traverse_recursive(node->next->next->back, cbtrav, index, outbuffer); outbuffer[*index] = node; *index = *index + 1; } int utree_traverse(utree_t * root, int (*cbtrav)(utree_t *), utree_t ** outbuffer) { int index = 0; if (!root->next) return -1; /* we will traverse an unrooted tree in the following way 2 / 1 --* \ 3 at each node the callback function is called to decide whether we are going to traversing the subtree rooted at the specific node */ utree_traverse_recursive(root->back, cbtrav, &index, outbuffer); utree_traverse_recursive(root, cbtrav, &index, outbuffer); return index; } static void utree_traverse_postorder_recursive(utree_t * node, int (*cbtrav)(utree_t *), int * index, utree_t ** outbuffer) { if (!node->next) { if (cbtrav(node)) { outbuffer[*index] = node; *index = *index + 1; } return; } utree_traverse_postorder_recursive(node->next->back, cbtrav, index, outbuffer); utree_traverse_postorder_recursive(node->next->next->back, cbtrav, index, outbuffer); if (cbtrav(node)) { outbuffer[*index] = node; *index = *index + 1; } } static int cb_outgroup(utree_t * node) { /* if it's a tip */ if (!node->next) return 0; /* if inner node */ if (node->next->back->mark == 1 || node->next->next->back->mark == 1) node->mark = 1; else node->mark = 0; node->next->mark = node->next->back->mark; node->next->next->mark = node->next->next->back->mark; return node->mark; } static int utree_traverse_postorder(utree_t * root, int (*cbtrav)(utree_t *), utree_t ** outbuffer) { int index = 0; if (!root->next) return -1; 
/* we will traverse an unrooted tree in the following way 2 / 1 --* \ 3 at each node the callback function is called to decide whether we are going to traversing the subtree rooted at the specific node */ utree_traverse_postorder_recursive(root->back, cbtrav, &index, outbuffer); utree_traverse_postorder_recursive(root, cbtrav, &index, outbuffer); return index; } static void utree_query_tipnodes_recursive(utree_t * node, utree_t ** node_list, int * index) { if (!node->next) { node_list[*index] = node; *index = *index + 1; return; } utree_query_tipnodes_recursive(node->next->back, node_list, index); utree_query_tipnodes_recursive(node->next->next->back, node_list, index); } int utree_query_tipnodes(utree_t * root, utree_t ** node_list) { int index = 0; if (!root) return 0; if (!root->next) root = root->back; utree_query_tipnodes_recursive(root->back, node_list, &index); utree_query_tipnodes_recursive(root->next->back, node_list, &index); utree_query_tipnodes_recursive(root->next->next->back, node_list, &index); return index; } static void utree_query_innernodes_recursive(utree_t * node, utree_t ** node_list, int * index) { if (!node->next) return; /* postorder traversal */ utree_query_innernodes_recursive(node->next->back, node_list, index); utree_query_innernodes_recursive(node->next->next->back, node_list, index); node_list[*index] = node; *index = *index + 1; return; } int utree_query_innernodes(utree_t * root, utree_t ** node_list) { int index = 0; if (!root) return 0; if (!root->next) root = root->back; utree_query_innernodes_recursive(root->back, node_list, &index); utree_query_innernodes_recursive(root->next->back, node_list, &index); utree_query_innernodes_recursive(root->next->next->back, node_list, &index); node_list[index++] = root; return index; } static rtree_t * utree_rtree(utree_t * unode) { rtree_t * rnode = (rtree_t *)xcalloc(1,sizeof(rtree_t)); rnode->event = EVENT_COALESCENT; if (unode->label) rnode->label = xstrdup(unode->label); else rnode->label 
= NULL; rnode->length = unode->length; rnode->data = NULL; rnode->mark = 0; if (!unode->next) { rnode->left = NULL; rnode->right = NULL; return rnode; } rnode->left = utree_rtree(unode->next->back); rnode->right = utree_rtree(unode->next->next->back); rnode->left->parent = rnode; rnode->right->parent = rnode; return rnode; } utree_t * utree_longest_branchtip(utree_t * node, unsigned int tip_count) { unsigned int index = 0; unsigned int i; double branch_length = 0; utree_t * outgroup = NULL; /* query tip nodes */ utree_t ** tip_nodes_list = (utree_t **)xcalloc(1,(size_t)tip_count * sizeof(utree_t *)); utree_query_tipnodes(node, tip_nodes_list); for (i = 0; i < tip_count; ++i) if (tip_nodes_list[i]->length > branch_length) { index = i; branch_length = tip_nodes_list[i]->length; } outgroup = tip_nodes_list[index]; free(tip_nodes_list); return outgroup; } rtree_t * utree_crop(utree_t * lca) { /* is the back of the lca a tip? */ if (!lca->back->next) return NULL; rtree_t * root = (rtree_t *)xcalloc(1,sizeof(rtree_t)); /* clone the two subtrees */ root->left = utree_rtree(lca->back->next->back); root->right = utree_rtree(lca->back->next->next->back); root->parent = NULL; root->length = 0; root->label = NULL; root->data = NULL; root->mark = 0; root->left->parent = root; root->right->parent = root; rtree_reset_info(root); return root; } rtree_t * utree_convert_rtree(utree_t * outgroup) { rtree_t * root = (rtree_t *)xcalloc(1,sizeof(rtree_t)); root->left = utree_rtree(outgroup); root->right = utree_rtree(outgroup->back); root->left->parent = root; root->right->parent = root; root->left->length /= 2; root->right->length /= 2; root->label = NULL; root->length = 0; root->parent = NULL; root->event = EVENT_COALESCENT; root->data = NULL; root->mark = 0; /* reset per-node leaves and valid edges */ rtree_reset_info(root); return root; } static utree_t ** utree_tipstring_nodes(utree_t * root, char * tipstring, unsigned int utree_tip_count, unsigned int * tiplist_count) { unsigned 
int i; unsigned int k; unsigned int commas_count = 0; char * taxon; size_t taxon_len; ENTRY * found = NULL; for (i = 0; i < strlen(tipstring); ++i) if (tipstring[i] == ',') commas_count++; utree_t ** node_list = (utree_t **)xcalloc(1,(size_t)utree_tip_count * sizeof(utree_t *)); utree_query_tipnodes(root, node_list); utree_t ** out_node_list = (utree_t **)xcalloc(1,(commas_count+1) * sizeof(utree_t *)); /* create a hashtable of tip labels */ hcreate(2 * (size_t)utree_tip_count); for (i = 0; i < (unsigned int)utree_tip_count; ++i) { ENTRY entry; entry.key = node_list[i]->label; entry.data = node_list[i]; hsearch(entry,ENTER); } char * s = tipstring; k = 0; while (*s) { /* get next tip */ taxon_len = strcspn(s, ","); if (!taxon_len) fatal("Erroneous prune list format (double comma)/taxon missing"); taxon = xstrndup(s, taxon_len); /* search tip in hash table */ ENTRY query; query.key = taxon; found = NULL; found = hsearch(query,FIND); if (!found) fatal("Taxon %s does not appear in the tree", taxon); /* store pointer in output list */ out_node_list[k++] = (utree_t *)(found->data); /* free tip label, and move to the beginning of next tip if available */ free(taxon); s += taxon_len; if (*s == ',') s += 1; } /* kill the hash table */ hdestroy(); free(node_list); /* return number of tips in the list */ *tiplist_count = commas_count + 1; /* return tip node list */ return out_node_list; } static utree_t * utree_lca(utree_t ** tip_nodes, unsigned int count, unsigned int utree_tip_count) { long i; utree_t * lca = NULL; utree_t ** path; /* allocate a path */ path = (utree_t **)xcalloc(1,(size_t)utree_tip_count * sizeof(utree_t **)); /* mark all tip nodes */ for (i = 0; i < count; ++i) tip_nodes[i]->mark = 1; /* traverse the tree with the cb_outgroup callback to get the inner nodes of the subtree formed by the outgroup */ int path_len = utree_traverse_postorder(tip_nodes[0]->back, cb_outgroup, path); /* there must be exactly one inner node that does not have all three directions 
mark. That one will be the root of the outgroup subtree */ int root_count = 0; for (i = 0; i < path_len; ++i) if (!(path[i]->mark && path[i]->next->mark && path[i]->next->next->mark)) { root_count++; lca = path[i]; } /* deallocate path */ free(path); /* if we had more than one inner nodes with less than three directions marked then not all tips of a subtree were specified (invalid outgroup) */ if (root_count != 1) return NULL; while (lca->mark == 1) lca = lca->next; /* return the LCA */ return lca; } utree_t * utree_outgroup_lca(utree_t * root, unsigned int tip_count) { unsigned int og_tips_count; utree_t * og_root; utree_t ** og_tips; /* get all nodes that have labels equal to the comma separated string in opt_outgroup */ og_tips = utree_tipstring_nodes(root, opt_outgroup, tip_count, &og_tips_count); if (og_tips_count == 1) { og_root = og_tips[0]; } else { /* find the LCA of the tips in og_tips. Note that, *all* tips of the desired subtree *must* be specified */ og_root = utree_lca(og_tips, og_tips_count, tip_count); } free(og_tips); /* return the LCA (root of the outgroup subtree */ return og_root; }