pax_global_header00006660000000000000000000000064135341374030014515gustar00rootroot0000000000000052 comment=68bb95dd38cb5dc64259f6f5ac43e3ddcad07dca simka-1.5.1/000077500000000000000000000000001353413740300126255ustar00rootroot00000000000000simka-1.5.1/.gitignore000066400000000000000000000001131353413740300146100ustar00rootroot00000000000000/build/ .DS_Store /docker/simka_results/ /docker/simka_temp_output/ *.pyc simka-1.5.1/.gitmodules000066400000000000000000000001551353413740300150030ustar00rootroot00000000000000[submodule "thirdparty/gatb-core"] path = thirdparty/gatb-core url = https://github.com/GATB/gatb-core.git simka-1.5.1/CMakeLists.txt000077500000000000000000000125231353413740300153730ustar00rootroot00000000000000project(simka) cmake_minimum_required(VERSION 2.6) ################################################################################ # The version number. ################################################################################ SET (gatb-tool_VERSION_MAJOR 1) SET (gatb-tool_VERSION_MINOR 5) SET (gatb-tool_VERSION_PATCH 1) IF (DEFINED MAJOR) SET (gatb-tool_VERSION_MAJOR ${MAJOR}) ENDIF() IF (DEFINED MINOR) SET (gatb-tool_VERSION_MINOR ${MINOR}) ENDIF() IF (DEFINED PATCH) SET (gatb-tool_VERSION_PATCH ${PATCH}) ENDIF() set (gatb-tool-version ${gatb-tool_VERSION_MAJOR}.${gatb-tool_VERSION_MINOR}.${gatb-tool_VERSION_PATCH}) # However, continuous integration has priority over local compilation IF (DEFINED JENKINS_TAG) SET (gatb-tool-version ${JENKINS_TAG}) ENDIF() ################################################################################ # Define cmake modules directory ################################################################################ SET (GATB_CORE_HOME ${PROJECT_SOURCE_DIR}/thirdparty/gatb-core/gatb-core) SET (CMAKE_MODULE_PATH ${GATB_CORE_HOME}/cmake) ################################################################################ # THIRD PARTIES 
################################################################################ # We don't want to install some GATB-CORE artifacts SET (GATB_CORE_EXCLUDE_TOOLS 1) SET (GATB_CORE_EXCLUDE_TESTS 1) SET (GATB_CORE_EXCLUDE_EXAMPLES 1) # GATB CORE include (GatbCore) ################################################################################ # TOOL ################################################################################ # we get compilation definitions from the gatb-core part add_definitions (${gatb-core-flags}) # we add a new compilation variable if (PRINTALL) SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DPRINTALL" ) endif() # we give the headers directories from : # - from project source # - from GATB-CORE source # - from simka source include_directories (include ${gatb-core-includes} ${PROJECT_SOURCE_DIR}/src/core ${PROJECT_SOURCE_DIR}/src/minikc ${PROJECT_SOURCE_DIR}/src) # we generate one file per template specialization FOREACH (KSIZE ${gatb-core-klist}) configure_file ( ${PROJECT_SOURCE_DIR}/src/core/SimkaAlgorithmTemplate.cpp.in ${PROJECT_BINARY_DIR}/src/core/template/SimkaAlgorithmTemplate_${KSIZE}.cpp ) ENDFOREACH () # we define the files to be compiled file (GLOB_RECURSE ProjectFiles src/core/Simka* ${PROJECT_BINARY_DIR}/src/core/template/*.cpp) file (GLOB_RECURSE SimkaMinFiles src/simkaMin/MurmurHash3.h src/simkaMin/MurmurHash3.cpp src/simkaMin/*.hpp)# ${PROJECT_BINARY_DIR}/src/core/template/*.cpp) SET(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin) set(PROJECT_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/bin) add_executable (simka src/SimkaPotara.cpp ${ProjectFiles}) target_link_libraries (simka ${gatb-core-libraries}) add_executable (simkaCountProcess src/minikc/SimkaCountProcess.cpp ${ProjectFiles}) target_link_libraries (simkaCountProcess ${gatb-core-libraries}) add_executable (simkaCount src/SimkaCount.cpp ${ProjectFiles}) target_link_libraries (simkaCount ${gatb-core-libraries}) add_executable (simkaMerge src/SimkaMerge.cpp 
${ProjectFiles}) target_link_libraries (simkaMerge ${gatb-core-libraries}) add_executable (simkaMinCore src/simkaMin/SimkaMin.cpp ${SimkaMinFiles}) target_link_libraries (simkaMinCore ${gatb-core-libraries}) ################################################################################ # PACKAGING ################################################################################ SET (CPACK_PACKAGE_DESCRIPTION_SUMMARY "gatb-tool ${PROJECT_NAME}") SET (CPACK_PACKAGE_VENDOR "Genscale team (INRIA)") SET (CPACK_PACKAGE_VERSION_MAJOR "${gatb-tool_VERSION_MAJOR}") SET (CPACK_PACKAGE_VERSION_MINOR "${gatb-tool_VERSION_MINOR}") SET (CPACK_PACKAGE_VERSION_PATCH "${gatb-tool_VERSION_PATCH}") SET (CPACK_PACKAGE_VERSION "${gatb-tool-version}") # We chose the kind of archive we want to generate SET (CPACK_GENERATOR "TGZ") SET (CPACK_SOURCE_GENERATOR "TGZ") # We ignore unwanted files for the source archive SET (CPACK_SOURCE_IGNORE_FILES "^${PROJECT_SOURCE_DIR}/\\.git/" ; "^${PROJECT_SOURCE_DIR}/\\.gitmodules" ; "^${PROJECT_SOURCE_DIR}/\\.gitignore" ; "^${PROJECT_SOURCE_DIR}/build/" ; "^${PROJECT_SOURCE_DIR}/dependency/" ; "^${GATB_CORE_HOME}/\\.cproject" ; "^${GATB_CORE_HOME}/\\.git/" ; "^${GATB_CORE_HOME}/\\.project" ; "^${GATB_CORE_HOME}/\\.gitignore" ) # For creating the BINARY package we include the files we want INSTALL (DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin DESTINATION .) INSTALL (DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/example DESTINATION .) INSTALL (DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/simkaMin DESTINATION .) INSTALL (DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/scripts DESTINATION . FILES_MATCHING REGEX ".*\\.(py|r)$" PATTERN "jenkins" EXCLUDE) INSTALL (FILES ${CMAKE_CURRENT_SOURCE_DIR}/README.md DESTINATION .) INSTALL (FILES ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE DESTINATION .) # We include the "bin" tag into binary archive file name set (CPACK_PACKAGE_FILE_NAME ${PROJECT_NAME}-${CPACK_PACKAGE_VERSION}-bin-${CMAKE_SYSTEM_NAME}) # To be done at the end. 
INCLUDE (CPack) simka-1.5.1/INSTALL000077500000000000000000000006051353413740300136620ustar00rootroot00000000000000# CMake is required to compile simka (http://www.cmake.org/cmake/resources/software.html) # # you can install simka by executing this file: sh INSTALL # # Prepare GATB sub-module git submodule init git submodule update # Prepare directories: rm -rf build mkdir build # Go in the 'build' directory cd build # Prepare the makefile cmake .. # Run the newly created makefile: make -j8 simka-1.5.1/LICENSE000077500000000000000000001033301353413740300136350ustar00rootroot00000000000000 GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU Affero General Public License is a free, copyleft license for software and other kinds of works, specifically designed to ensure cooperation with the community in the case of network server software. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, our General Public Licenses are intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. Developers that use our General Public Licenses protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License which gives you legal permission to copy, distribute and/or modify the software. 
A secondary benefit of defending all users' freedom is that improvements made in alternate versions of the program, if they receive widespread use, become available for other developers to incorporate. Many developers of free software are heartened and encouraged by the resulting cooperation. However, in the case of software used on network servers, this result may fail to come about. The GNU General Public License permits making a modified version and letting the public access it on a server without ever releasing its source code to the public. The GNU Affero General Public License is designed specifically to ensure that, in such cases, the modified source code becomes available to the community. It requires the operator of a network server to provide the source code of the modified version running there to the users of that server. Therefore, public use of a modified version, on a publicly accessible server, gives the public access to the source code of the modified version. An older license, called the Affero General Public License and published by Affero, was designed to accomplish similar goals. This is a different license, not a version of the Affero GPL, but Affero has released a new version of the Affero GPL which permits relicensing under this license. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU Affero General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. 
The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. 
The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. 
The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. 
You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. 
A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. 
d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. 
A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. 
Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Remote Network Interaction; Use with the GNU General Public License. Notwithstanding any other provision of this License, if you modify the Program, your modified version must prominently offer all users interacting with it remotely through a computer network (if your version supports such interaction) an opportunity to receive the Corresponding Source of your version by providing access to the Corresponding Source from a network server at no charge, through some standard or customary means of facilitating copying of software. This Corresponding Source shall include the Corresponding Source for any work covered by version 3 of the GNU General Public License that is incorporated pursuant to the following paragraph. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the work with which it is combined will remain governed by version 3 of the GNU General Public License. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU Affero General Public License from time to time. 
Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU Affero General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU Affero General Public License, you may choose any version ever published by the Free Software Foundation. If the Program specifies that a proxy can decide which future versions of the GNU Affero General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. <one line to give the program's name and a brief idea of what it does.> Copyright (C) <year> <name of author> This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. Also add information on how to contact you by electronic and paper mail. If your software can interact with users remotely through a computer network, you should also make sure that it provides a way for users to get its source. For example, if your program is a web application, its interface could display a "Source" link that leads users to an archive of the code. There are many ways you could offer source, and different solutions will be better for different programs; see section 13 for the specific requirements. You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU AGPL, see <https://www.gnu.org/licenses/>. simka-1.5.1/README.md000077500000000000000000000302331353413740300141100ustar00rootroot00000000000000# Simka & SimkaMin | **Linux** | **Mac OSX** | |-----------|-------------| [![Build Status](https://ci.inria.fr/gatb-core/view/Simka/job/tool-simka-build-debian7-64bits-gcc-4.7/badge/icon)](https://ci.inria.fr/gatb-core/view/Simka/job/tool-simka-build-debian7-64bits-gcc-4.7/) | [![Build Status](https://ci.inria.fr/gatb-core/view/Simka/job/tool-simka-build-macos-10.9.5-gcc-4.2.1/badge/icon)](https://ci.inria.fr/gatb-core/view/Simka/job/tool-simka-build-macos-10.9.5-gcc-4.2.1/) [![License](http://img.shields.io/:license-affero-blue.svg)](http://www.gnu.org/licenses/agpl-3.0.en.html) This GitHub repository hosts the Simka and SimkaMin software. This readme focuses on Simka features. All information about SimkaMin is located in the [simkaMin](simkaMin/) directory. # What is Simka? 
Simka is a de novo comparative metagenomics tool. Simka represents each dataset as a k-mer spectrum and computes several classical ecological distances between them. Developer: [Gaëtan Benoit](http://people.rennes.inria.fr/Gaetan.Benoit/), PhD, former member of the [Genscale](http://team.inria.fr/genscale/) team at Inria. Contact: claire dot lemaitre at inria dot fr # References Benoit G, Peterlongo P, Mariadassou M, Drezen E, Schbath S, Lavenier D, Lemaitre C. (2016) [Multiple comparative metagenomics using multiset k-mer counting](https://doi.org/10.7717/peerj-cs.94). PeerJ Computer Science 2:e94 Benoit G (2017) [Large scale de novo comparative metagenomics (PhD thesis in French)](https://tel.archives-ouvertes.fr/tel-01659395v2/). # Install a binary release of simka Retrieve the binary archive file from one of the official simka releases (see "Releases" tab on the Github web page of the simka project); file name is "simka-xyz-bin-Darwin.tar.gz" or "simka-xyz-bin-Linux.tar.gz" (where xyz is a release version). Then, from the command-line: gunzip simka-xyz-bin-Darwin.tar.gz tar -xf simka-xyz-bin-Darwin.tar cd simka-xyz-bin-Darwin chmod +x bin/* example/*.sh The simka binary is in the "bin" folder. You can try the software on your computer, as follows: cd example ./simple_test.sh In case the software does not run appropriately on your system, you should consider installing it from its source code, as explained below. For further instructions on using simka, see User Manual, below. # Install simka from source code: git clone Requirements: cmake 2.6+ and gcc 4.4.7+ (Linux) or clang 4.1+ (Mac OSX). From the command-line: git clone https://github.com/GATB/simka.git cd simka sh INSTALL See the INSTALL file for more information. 
Then, you can try the software on your computer, as follows: cd example ./simple_test.sh The installation creates 4 executables (./build/bin directory): simka: main software to be used for your analysis simkaCount, simkaMerge and simkaCountProcess: not to be used directly, called by 'simka' All executables must stay in the same folder; if you want to move them elsewhere on your system, move them all together. For further instructions on using simka, see User Manual, below. # Install simka from source code: using a source release archive Requirements: cmake 2.6+ and gcc 4.5+ (Linux) or clang 4.1+ (Mac OSX). Retrieve the source code archive file from one of the official simka releases (see "Releases" tab on the Github web page of the simka project); file name is "simka-xyz-Source.tar.gz" (where xyz is a release version). Then, from the command-line: gunzip simka-xyz-Source.tar.gz tar -xf simka-xyz-Source.tar cd simka-xyz-Source sh INSTALL Then, you can try the software on your computer, as follows: cd example ./simple_test.sh For further instructions on using simka, see User Manual, below. # Changelog * version 1.5.1 Sept 05, 2019: - simkaMin: easier usage of simkaMin, useful for conda packaging * version 1.5 Jun 07, 2019: - simkaMin software: faster results by subsampling the kmer space * version 1.4 Jun 21, 2017: - update gatb-core to version 1.2.2 - simka now provides gz compressed results - new scripts for result visualization * version 1.3.2 Oct 25, 2016: - improve memory usage of symmetrical distances - option -data-info to compute information on the input data (nb reads per dataset...) 
- intermediate merge sort passes to handle large number of datasets - prevent distances from producing nan value - fix bug that occurs during k-mer counting * version 1.3.0 July 29, 2016: - Bray-Curtis computed by default - Better k-mer statistics - Fix bug in script for creating heatmaps - Add "all in memory" k-mer counter when k <= 15 - Fine grain parallelization for computing distances - Clean all memory leaks with valgrind - Update help messages - Redirect stdout and stderr of parallel processes in specific log files * version 1.0.1 March 16, 2016: minor updates and bug fixes, first release on Github * version 1 Feb 16, 2016: stable version * version 0.1 May 28, 2015: initial public release # User manual ## Description Simka computes several classical ecological distances between N (metagenomic) read sets based on k-mer counts. Simka is implemented with the GATB library (http://gatb.inria.fr/). ## Input The input file (-in) lists the datasets. These datasets can be in fasta, fastq and in gzip compressed format (.gz). One dataset per line with the following syntax (you can put any number of spaces and/or tabs between syntax elements): ID1: filename.fasta ID2: filename.fasta ID3: filename.fasta The dataset ID is the name that will appear in the headers of the distance matrices. You can find a simka input file in the example directory: ./example/simka_input.txt If a given dataset has been split into several parts, Simka can automatically concatenate them. ID1: filename_part1.fasta , filename_part2.fasta , ... If you have paired files, you can list them separated by a ‘;’: ID1: filename_pair1.fasta ; filename_pair2.fasta You can combine concatenated and paired operations: ID1: filename_part1_pair1.fasta , filename_part2_pair1.fasta ; filename_part1_pair2.fasta , filename_part2_pair2.fasta Paired syntax is only useful if the -max-reads option of Simka is set. 
Example: If -max-reads is set to 100, then Simka will consider the first 100 reads of the first paired files and the first 100 reads of the second paired files… ## Output ### Temporary output The option -out-tmp controls where the temporary files of Simka will be stored. This option is mandatory since the disk usage of Simka can be high depending on the input size. This option must target a directory on your fastest disk with some free space. One may want to add new datasets to existing Simka results without recomputing everything again (for instance, if your metagenomic project is incomplete). This can only be achieved by keeping those temporary files on the disk using the option -keep-tmp of Simka. ### Result output Simka results are distance matrices. A distance matrix is a square matrix of size N (where N is the number of input datasets). Each value in the matrix gives you the distance between a pair of datasets. These values are usually in the range [0, 1]. A distance value of 0 means that the pair of datasets is perfectly similar. The higher the distance value is, the more dissimilar the pair of datasets is. Simka results will be stored in the directory indicated by the -out option. By default, Simka computes an abundance-based Bray-Curtis distance matrix and a presence-absence-based Jaccard distance matrix. The option -simple-dist computes additional ecological distances which are fast to compute (Chord, Hellinger, Kulczynski...). The option -complex-dist computes other ecological distances which can be very long to compute (Jensen-Shannon, Canberra, Whittaker...). The matrix names follow this template: mat_[abundance|presenceAbsence]_[distanceName].csv.gz The distance matrices containing ‘simka’ are distances introduced by the Compareads method. These distances have the advantage of having a symmetrical and an asymmetrical version. 
## Visualize simka results Simka results can be visualized through heatmaps, hierarchical clustering and PCA (MDS or PCoA to be exact). Requirements: R, gplots package (only for heatmap) Use the script run-visualization.py (located in "scripts/visualization" folder). Example: ```bash python run-visualization.py -in simka_results_dir -out output_figures_dir -pca -heatmap -tree ``` where simka_results_dir is the folder containing the distance matrices of Simka (-out) Figures can be annotated by providing a metadata table in standard CSV format: ```bash DATASET_ID;VARIABLE_NAME_1;VARIABLE_NAME_2 A;1;aquatic B;1;human C;2;human D;2;soil E;3;soil ``` An example of this table is given at ./example/dataset_metadata.csv Dataset IDs in the metadata table must match the dataset IDs in the simka distance matrices. Add the following options to activate annotations: ```bash -metadata-in: filename of a metadata table -metadata-variable: the name of the variable that you want to display in figures (the name of the column), for instance VARIABLE_NAME_1 in the example above ``` Visualization example commands are given when running the simka example (./example/simple_test.sh). ## Usage for simka To see simka in-line help: ```bash ./bin/simka ``` ## Simka command examples Run the toy example: ```bash ./bin/simka -in example/simka_input.txt -out results -out-tmp temp_output ``` Compute all the distances that Simka can provide (Bray-Curtis, Jensen-Shannon…): ```bash ./bin/simka … -simple-dist -complex-dist ``` Change the kmer size ```bash ./bin/simka … -kmer-size 31 ``` Filter kmers seen one time (potentially erroneous) and very high abundance kmers (potentially contaminants): ```bash ./bin/simka … -abundance-min 2 -abundance-max 200 ``` Filter over the sequences of the reads and k-mers: Minimum read size of 90. 
Discards low complexity reads and k-mers (shannon index < 1.5) ```bash ./bin/simka … -min-read-size 90 -read-shannon-index 1.5 -kmer-shannon-index 1.5 ``` Consider a subset of the reads of the input dataset (for datasets with non-uniform reads per sample): Consider all the reads of each sample (default) ```bash ./bin/simka … -max-reads -1 ``` Let Simka automatically compute the maximum number of reads per sample (normalization) ```bash ./bin/simka … -max-reads 0 ``` Use only the first 1000 reads of each sample: ```bash ./bin/simka … -max-reads 1000 ``` Allowing more memory and cores improves the execution time: ```bash ./bin/simka … -max-memory 20000 -nb-cores 8 ``` ## Computer cluster options Simka can be run on a computer cluster equipped with a job scheduling system such as SGE. Given a job file template and a submission command, Simka will take care of creating and synchronizing the jobs until the end of the execution. You must provide the filenames of two job templates, one for counting and one for merging (-count-file -merge-file). There are examples of file templates in the folder ‘example/potara_job’. And you must provide a submission command for both jobs (-count-cmd -merge-cmd) Example for SGE: ```bash -count-cmd 'qsub -pe make 8' -merge-cmd qsub ``` The options -max-count and -max-merge control the maximum number of simultaneous jobs. They have to be set if your system restricts the number of jobs. Command example: ```bash ./bin/simka … -count-file example/potara_job/sge/job_count -merge-file example/potara_job/sge/job_merge \ -count-cmd 'qsub -pe make 34' -merge-cmd qsub \ -max-count 6 -max-merge 18 -nb-cores 200 -max-memory 500000 ``` Simka will run a maximum of 6 simultaneous counting jobs, each using 200/6 cores and 500000/6 MB of memory. Simka will run a maximum of 18 merging jobs. A merging job cannot be run on more than 1 core and uses very little memory. By default Simka uses -nb-cores/2 counting jobs simultaneously and -nb-cores merging jobs simultaneously. 
## Possible issues with Simka ### TOO MANY OPEN FILES Simka is a disk-based method. Depending on the chosen options (-nb-cores -max-memory), it is possible that Simka requires a lot of open files. You can fix this issue in two ways: * increasing the maximum open files limit imposed by your system: ulimit -n maxFiles * reducing the number of files opened by Simka by using the options -max-count and -max-merge simka-1.5.1/docker/000077500000000000000000000000001353413740300140745ustar00rootroot00000000000000simka-1.5.1/docker/Dockerfile000066400000000000000000000124341353413740300160720ustar00rootroot00000000000000######################################################################################### # # Docker file for Simka project. # # It prepares a Docker container to run Simka jobs: # # - bin/simka: computing simka results from sequencing data # - scripts/visualization/run-visualization.py: making images from results of # bin/simka. # ######################################################################################### # # == Docker build command: # # docker build -f Dockerfile -t simka_machine . # # == Docker test command: # # docker run --rm -i -t simka_machine -c test # # -> you should see a simka test with some provided data. # # == Running a Simka job: # # docker run --rm -i -t simka_machine -c <command> -- <arguments> # # where: # <command>: MUST BE one of: simka, visu, test # <arguments>: remaining arguments passed in after <command> are passed # to the appropriate simka program: # - simka: will run 'bin/simka' within the container # - visu: will run 'scripts/visualization/run-visualization.py' # within the container # Please refer to these programs to review their expected arguments. 
# See https://github.com/GATB/simka # # == Sample Simka job with provided data: # # docker run --rm -i -t -v $PWD:/tmp simka_machine -c simka -- -in /opt/simka/example/simka_input.txt -out /tmp/simka_results/ -out-tmp /tmp/simka_temp_output # # -> you should have results in $PWD/simka_results directory when Simka job is done. # # This command line explained: # # docker run [1] # --rm [2] # -i -t [3] # -v $PWD:/tmp [4] # simka_machine [5] # -c simka [6] # -- [7] # -in /opt/simka/example/simka_input.txt [8] # -out /tmp/simka_results/ [9] # -out-tmp /tmp/simka_temp_output [10] # # [1]-[5]: Docker arguments # [6]-[7]: simka container's invoker program # [8]-[10]: 'bin/simka' arguments # # [1]: start Docker container # [2]: destroy container when Docker finishes # (it does NOT delete the 'simka_machine' image) # [3]: start an interactive job # (for instance, you'll see messages on stdout, if any) # [4]: mount a volume. This is required to get the results from Simka. # Here, we say that current local directory will be viewed as '/tmp' # from the inside of the container. # [5]: tell Docker which image to start: the 'simka_machine' of course. # [6]: ask to start the simka program. Another option is to start the # 'visu' task (see below). See companion file 'run_simka.sh' for # more information. # [7]: '--' is required to separate arguments [6] from the rest of the # command line # [8]: the data file to process with simka. Here we use a data file # provided with the simka software to test it. # [9]: tells simka where to put results. Of course, simka will write # within /tmp directory inside the container. However, since we # have directive [4], data writing is actually done in $PWD, i.e. # a local directory. # [10]: tells simka where to put temporary files. 
#
# == Sample Simka Visualization job with provided data
#
# After running the previous command, you can do this:
#
# docker run --rm -i -t -v $PWD:/tmp simka_machine -c visu -- -in /tmp/simka_results/ -out /tmp/simka_results/ -pca -heatmap -tree
#
# -> you should have PNG files in $PWD/simka_results directory.
#
# == Additional notes
#
# Root access inside the container:
#
# - if running: docker exec -it <container name or ID> bash
#   ('docker exec' takes a container, not the 'simka_machine' image name)
#
# - if not yet running: docker run --rm -i -t --entrypoint bash simka_machine
#   (the default entrypoint is run_simka.sh, which only understands
#    '-c simka|visu|test', so the entrypoint must be overridden to get a shell)
#
#########################################################################################

# Simka binary available on Github (see below) is built using a
# Debian 8 (jessie) based system on Inria Jenkins CI platform
# NOTE(review): debian:jessie is presumably end-of-life by now; confirm that
# 'apt-get update' still works against it before rebuilding this image.
FROM debian:jessie

# who to blame?
# (LABEL replaces the deprecated MAINTAINER instruction)
LABEL maintainer="Patrick Durand patrick.durand@inria.fr"

# ###
# SIMKA release to install: this is the version of the binary archive that is
# downloaded from the Github releases page below.
#
ENV SIMKA_VERSION=1.4.0

# ###
# Package installation and configuration
#
# ca-certificates is installed explicitly: with --no-install-recommends it is
# not pulled in by curl, and it is needed to verify the HTTPS download below.
# The apt package lists are removed to keep the image layer small.
#
RUN apt-get update && apt-get -y dist-upgrade \
    && apt-get install -y --no-install-recommends ca-certificates curl python2.7 r-base \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# ###
# SIMKA installation: get the binary release from Github mirror.
#
# The archive is first saved to disk, then extracted, then removed:
#  - 'curl -f' makes the build fail on an HTTP error instead of feeding an
#    error page to tar;
#  - TLS certificate verification is kept enabled (no 'curl -k');
#  - the 'rm -f' really removes the downloaded archive.
# The build ends by running the bundled example as a smoke test.
#
RUN cd /opt \
    && export SIMKA_TGZ=simka-v${SIMKA_VERSION}-bin-Linux.tar.gz \
    && export GIT_URL=https://github.com/GATB/simka/releases/download \
    && export SIMKA_URL=${GIT_URL}/v${SIMKA_VERSION}/${SIMKA_TGZ} \
    && curl -fsSL -o ${SIMKA_TGZ} ${SIMKA_URL} \
    && tar xzf ${SIMKA_TGZ} \
    && rm -f ${SIMKA_TGZ} \
    && mv simka-v${SIMKA_VERSION}-bin-Linux simka \
    && cd simka/bin \
    && chmod +x simka* \
    && cd ../example \
    && chmod +x *.sh \
    && ./simple_test.sh

# Install the container's invoker script next to the simka release.
COPY run_simka.sh /opt/simka/

# Fix: ensure script has exec permission
RUN chmod +x /opt/simka/run_simka.sh

# ###
# Start simka.
#
ENTRYPOINT ["/opt/simka/run_simka.sh"]
simka-1.5.1/docker/README.md000066400000000000000000000123301353413740300153520ustar00rootroot00000000000000# *SIMKA* and *Docker* This document explains how you can setup and use *SIMKA* within a Docker container.
## Requirements Of course, you need to have [Docker](https://docs.docker.com/engine/installation/) installed on your system. We also suppose that you are familiar with [docker build](https://docs.docker.com/engine/reference/commandline/build/) and [docker run](https://docs.docker.com/engine/reference/commandline/run/) commands. Note: this SIMKA's *Dockerfile* was made and tested using *Docker version 17* on *Mac OSX Yosemite*. However, it should work on other releases of Docker and OS (Linux, Windows or OSX). # How to build and run using the command-line? ## Build the container docker build -f Dockerfile -t simka_machine . ## Run a Simka job with sample data docker run --rm -i -t -v $PWD:/tmp simka_machine -c simka -- -in /opt/simka/example/simka_input.txt -out /tmp/simka_results/ -out-tmp /tmp/simka_temp_output You should have results in ```$PWD/simka_results``` directory when Simka job is done. ## Run Simka Visualization job with provided data docker run --rm -i -t -v $PWD:/tmp simka_machine -c visu -- -in /tmp/simka_results/ -out /tmp/simka_results/ -pca -heatmap -tree You should have PNG files in ```$PWD/simka_results``` directory. ## More documentation Please refer to the documented header of the ```Dockerfile``` located in this directory. # How to run Simka using the GoDocker platform? ## What is GoDocker? [GoDocker](http://www.genouest.org/godocker/) is a front-end to execute Docker containers on the [Genouest](http://www.genouest.org) bioinformatics platform. An account is required to access this service. 
## How to prepare a Simka job * Log in to the GoDocker platform [here](https://godocker.genouest.org/) using your GenOuest credentials * Click on ```Create Job``` (top-left toolbar) * Then fill in the new job as follows: * Name: ```simka``` *(adapt to your needs)* * Description: ```simka job``` *(adapt to your needs)* * Tags *(leave empty)* * Projects *(leave value to 'default')* * Container image: ```pgdurand56/simka140``` *(see ```Comment 1``` below)* * Command: *(see ```Comment 2``` below)* * CPU: ```4``` * GPU: *(leave value to '0')* * RAM: ```8``` * Mount volumes: select ```home``` and/or ```omaha``` *(see ```Comment 2``` below)* * Advanced options: *(do not modify)* * Click on [Submit] ### Comment 1: the Simka Docker Image pgdurand56/simka140 In this tutorial you'll use the [pgdurand56/simka140](https://hub.docker.com/r/pgdurand56/simka140/) Docker Image: this is an official Simka 1.4.0 runtime made by a Genscale team member. If you want to use your own, see below. ### Comment 2: the Simka command to use In order to use the Simka Docker Image, you'll have to know that: * GoDocker won't use the default entrypoint defined in [Simka Dockerfile](https://github.com/GATB/simka/blob/master/docker/Dockerfile). As a consequence, you do not start Simka on GoDocker as you do on the command-line. * GoDocker enables you to access YOUR data located either in your *home directory* or in the *Omaha* storage on Genocluster machine #### Start a Simka data processing Job So, here is an example of a command to use while setting up a Simka job for GoDocker: #!/bin/bash /opt/simka/bin/simka -in $GODOCKER_HOME/simka/example/simka_input.txt -out $GODOCKER_HOME/simka/example/simka_results/ -out-tmp $GODOCKER_HOME/simka/example/simka_temp_output In the above short script, we suppose that the data are located in the home directory of the user (denoted by variable $GODOCKER\_HOME). Simply adapt paths to your needs. If you want to use data located in Omaha, use '/omaha-beach' instead. 
In this script, please DO NOT modify path: ```/opt/simka/bin/simka```. It targets the simka binary within the Simka Docker image. #### Start a Simka visualization Job After running a Simka data processing job, you can prepare PNG images using: #!/bin/bash python2.7 /opt/simka/scripts/visualization/run-visualization.py -in $GODOCKER_HOME/simka/example/simka_results/ -out $GODOCKER_HOME/simka/example/simka_results/ -pca -heatmap -tree In this script: * DO NOT modify path: "python2.7 /opt/simka/scripts/visualization/run-visualization.py". It targets a simka python script within the Simka Docker container. * adapt the use of $GODOCKER\_HOME to your needs; you can also target data located in Omaha using '/omaha-beach' ### Making your own Simka image for GoDocker On your local computer: [1] cd /tmp git clone https://github.com/GATB/simka.git [2] cd simka/docker docker build -f Dockerfile -t simka_machine . [3] docker login -u <DockerHub login> -p <DockerHub password> (e.g. docker login -u pgdurand56 -p xxxx) [4] docker tag <image ID> <DockerHub login>/<image name> (e.g. docker tag 2520e066828a pgdurand56/simka140) [5] docker push <DockerHub login>/<image name> (e.g. docker push pgdurand56/simka140) Steps are as follows: [1] get a copy of simka project [2] build the Simka Docker image [3] login to your DockerHub account [4] give a name to your Simka Docker Image suitable for DockerHub publication [5] push the image to DockerHub Now, on GoDocker use "<DockerHub login>/<image name>" (e.g. pgdurand56/simka140) to access your own Simka Image. simka-1.5.1/docker/run_simka.sh000066400000000000000000000030431353413740300164200ustar00rootroot00000000000000#!/usr/bin/env bash # # A script to be used within a Docker container: it aims at starting a simka # task given some parameters. # # Use: ./run_simka.sh -c <command> -- <arguments> # # <command>: MUST BE one of: simka, visu, test # <arguments>: remaining arguments passed in after <command> are passed # to the appropriate simka program: # bin/simka # scripts/visualization/run-visualization.py # Please refer to these programs to review their expected arguments. # # Author: Patrick G. 
Durand, Inria, June 2017

# ========================================================================================
# Section: utility function declarations

# --------
# FUNCTION: display help message
#   Prints the usage text on stdout, then terminates the script with status 1.
#   "$0" is passed as a printf argument (not embedded in the format string) so
#   that a '%' or '\' in the script path cannot corrupt the output.
function help(){
  printf '\n%s: a tool to invoke simka within a Docker container.\n\n' "$0"
  printf 'usage: %s -c <command> [arguments]\n\n' "$0"
  exit 1
}

# ========================================================================================
# Section: Main

# Prepare arguments for processing:
#   -c <command> : task to start (test, simka or visu)
#   -h           : display help and exit
while getopts hc: opt
do
    case "$opt" in
      c)  COMMAND="$OPTARG";;
      h)  help;;
      \?) help;;
    esac
done
# Drop the options consumed by getopts (including the '--' separator).
shift $((OPTIND - 1))

# Remaining arguments, if any, are forwarded untouched to the selected program.
# "$@" is used directly so that each argument is passed as-is: arguments that
# contain spaces or glob characters are neither split nor expanded.

# Execute command; the exit status of the started program becomes the exit
# status of this script, so that callers can detect a failed job.
case "$COMMAND" in
  test)
    # stop here if the example directory is missing
    cd /opt/simka/example || exit 1
    ./simple_test.sh
    ;;
  simka)
    /opt/simka/bin/simka "$@"
    ;;
  visu)
    python2.7 /opt/simka/scripts/visualization/run-visualization.py "$@"
    ;;
  *)
    # missing or unknown command: show usage and exit with status 1
    help
    ;;
esac

exit $?
simka-1.5.1/example/000077500000000000000000000000001353413740300142605ustar00rootroot00000000000000simka-1.5.1/example/A.fasta000077500000000000000000000261631353413740300154730ustar00rootroot00000000000000>1 AAAAAACATTAGTTACGGAAGGTGGGTGGAGCGGGGGCCGCCAGTCTATATTCATACTAGAAAGGGGCTAAGGGCATCGCGCTCATGAAGTGGCACTTGCAGAGGTGAGT >4 AAAAAATTGTCGTTAAGATGAGGAGCTCTTTCGCATTTGACCCATCAAATCTCGGAATGCACTTGAGATCGACCCGTTTGATACAAGCCTTCATCGTCGATAATATATCG >6 AAAAACCGATGGGGCCGAGCTGTTCTTTGGCCGGGTTACTCTACGCCCACACGGGTACACAGCCGCGAAACGGGAGGCTCGTGCGCGGTCACCTAAGTCCCTGTGGCGGG >7 AAAAACGGATGAGAAATAAAAGGGGAAATAGCGACATGTCAAATGGCCTCTTGGCTGGCGGTGTCTGGCTGGACTAACCCTCTTAAGGACTTAAAGCGTAGGCAAGGTTA >10 AAAAAGCTTACAGTGTTCGTAGTTCTGCTCGTGTCGGTATTCTCATACTCACTCCAGGACTTCGGAAACATTAGTGAAAGTGTCACCGGCCGTGCATTTTCCGGAGTAAG >12 AAAAAGTCTCACTTATTTACGCGCTTGATAGCCCAAACCGTTCACAGCATCTCCAGTAGTCGATTCCGGCTCCTCTCCATGTTGGAGATGTCAGCACGGCAGGTATATGA >14 AAAAATATTTGAATCCGCTGCTGACTTTTTCAAAATTCGATTATCACTACGGCGGTCAGCATATTTCCTTGAACTTATAACGTATCTCAGTATTGCTGGTTTGGAAACGT >15
AAAAATGCCCAGTTCGGCGATCCAACTCGTTAGTCTCAGGGTTCCCGCGGCGAACTCCCTTCGGTTGACTTTACGGCCTACACGCTCAGCCCTGTACCCGCATTGATGTC >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >19 AAAACACGCATGTCCGACGGGCGCATTAAACGGCATTGTTGTTCTTGAACCGGAAGCAATAGTCTAGGACCGTTCATATTGACTTACTTTACGCGTGGGCCTTGGATGAA >22 AAAACCATCTGTTTCCCAGAATGCTCCCGATACTTAATCACCTGCGCTTTGATCGTAGGCACTCACCCTCTCAAGTACCCTTGGACAGATTTAAGATCGATAGTTCGTAT >26 AAAACGAACCGGACTACATTCCTCATAGGCTTGAGGGGCAAAAGTTACAGTACAGATTTCGGGACCCGTCGCTTTACAGTGTAGACTGTTTTCCGAGAGTGCCTAGTCCA >28 AAAACGCTCGTGCCCAGTTCGAGCGCCCTGGAGTTTACCGCCCAAGTAAACGCTATTTTTTAAATGACATAGCTCCTTACAGGCGTGGGGGGACGCCTATAGGGCCGTCG >29 AAAACGGTGCCAACCATTGAAGATAGAAGATTCAATACATTGCACAAACGAATATAACCGGTAGGTAAAGTCTCGTTTCAAAGCGGCTCTATTTGTGCACCTTTTGCTGA >31 AAAACTCACCACCAGGACAGAAGCTAAAAGACAGGCCGATTCACGGGCGAGCGGTCGAAGCATACTTTTTAGGGCATCATGTGACCACACGTGTTGACCGTCCATCGTTT >34 AAAACTTTACCGATTTGCAAGATGTAAACAGCACGGGGACGCTATATCGACTGAGTGTTGTAGTGGAATCTACCACCCCGGACATGGGGCTGGACTGATTATTGAACGCT >38 AAAAGAGTTTGCAGGGTCACTAGCCCGGCTCTGCATACATTGGCAAAGAGGCGATAAGCTGTGAATTCCAGGCTGGGGACGAGCTTGAACCTACTGTGCGAAGTTACCAC >40 AAAAGCCAGAAGGTATGCAGATCGATATCGCTCCAATGTGTCAGTAGCCCGGACGAAGGCCTGCTCGATGATGCGTCATATTAATGTCAGAGAACCTAGTAGCCATCGGT >44 AAAAGGACAACGATGTGTGTGCTTCAAATATTCCTCCAATTCGTTACTGTCGCGGATGTTATGTCGGGCACTCTTAGCTTCCAAGGTGGGTGGATGTTAACGCGCATAGA >46 AAAAGGGGTTTGCTGACCCACAGACAATAGATAGCGGATTACTTACTCCACTTACGAGACTAACGCAATGCTAATAACTTGACGTCTAAACGGGACCACATAAGCCTTTA >47 AAAAGGTATTATATTGATGATTAGCCATTTAGGGAAGATCAACAAGATGAAGCATGACGCGAGAGGTAAGGGATCGACGAACGCTCCTGCCATGGGCAATCCAGGAGGGC >49 AAAAGTCCCTGTCGTACCTCAATCCAGGCCACGTATACTCAGGGGGTACCGAAACCCTAATAACTCTCCCGTTGGTGACCAGATCTGAATCTGCACCGCACACAACTACA >51 AAAAGTGGAAAATACCCTCACTAACAGGGTATCTTGCGTTGAGCTGGTAAGTGCACCAGGGAAAACTTGTTATCCTCGAGTCTGATCGATGCTATCTCATATTTCTGAGT >54 AAAATAGAAATCGCAGCTTGAATGCTGTTACTCAATGTTTATGAACATAGCTCCCGCGTTACTCTGTTTCGACATAGGATGCGCGCACCGGAAGCCTGCTACCACAACAT >57 
AAAATCAAACTAGCTACTCCGAAGGCGGGATATTTGCCTGGTAGAGGGAAATGTAGCTCACGGGCCGTTTACTCTTCTTCAGAGCAACTAAGTATTCCGGAAAACCTCAG >60 AAAATCGTGTTCAAGAAAGAGAGGGCCGGGCGCTGAATTGGGCCCGAATCACAAAAAAGTGAGTCGCGCTCCTACAAAGTCCTAATCTAATAGTGGATGATGTGTCTGGT >62 AAAATGCCATACTAAATTAACGGTGTCTCATAGCGACATTGTTATTGTCACCTGACATAGCCAGAAGGTTAAAAGTAGTGCGCGACGCGAAATACCCATGCTGCGGAGTC >67 AAAATTCAGCCTTTCTTACCCAAGGCCTCTCTCGGACAAGAACTATGAAAGCATGCCCTACAGCATACTTTCGCTGACATATGGAGCAGGGAGCCTAAAGGCCGTTTATC >71 AAACAACCACAGATGGCCACTATGTGAAGTTTTGGACGAGTCCATACATTTTTCACTAAGTAAGAAGCTACCTTAATACGTGCACGCATCACATCCTAGACGCTCTGGCT >72 AAACAAGCCGAGAATCTGGCGTGACAAATCCTCCGGAACGGGCTGACCCCACTGTACAAACAATGTAAATCACCGCAGTTTCACTGTACGTTTGGTCTTTTTGATAGACA >74 AAACAATGGACGGAACCATGTTCTGTTACAAGCGCTGACCACACAAGCCGAGAGTACCCCAAGATGATGTCTTAGGATCGTATATACCCTCCATACCCGAGCTTTCCCCG >77 AAACACGTCACCGAGCGCTTAGTGGATCGTACTCAACATGTTGAACAGACATTATCTACATTCGATTCTTCCCATTATGTATCATCGCAGTACACGCCGCTTTCCATTTT >78 AAACACTCGCACAGACCGGTAACCGAGGGAATACAGAATTATAGCCCATATTCGCTGTCCCAAACTGCACCCATCGTTGGCAATTCCGAGACCTCTATTTCCGGTATGCG >79 AAACAGAGACATGTACGTTTTGCGCGGTGGTAGCTCTGGAGTCGGAGGCAGGGTTTTTTGGCCGGCAAAATCAGTATCCGACCTCGTTGGATGACTCCGGAAAACCTTTT >80 AAACAGATCGCTATTAGCACGCGTATGCTTTCACTAGCGAACAAAAGTGCCCCTTTGAGTCCTAGCAGCTACAGTGCCCGTAACTGATATTCTTAAGGCTATTTACAGTT >81 AAACAGGGCACTGGAGGGCAGCCCTTGAACCGCATAGATGGTGGAATTTCATACGGACTGGCGGGCATTATCGGGGTCGTATTGCCTTTGGGGGCATAGCCCACGAGTGC >82 AAACAGTAGAGTTTCATGTCCCTTGTATCGGAGGCAGCGACTCGCTTGAGCAGACCAAGTCCCGTCACTGAGGGTTATCAGTGAGGATACCTTGGTTCAGACAAAAAGAT >83 AAACATACGATCAAGTGTCGAAATTATATCACCGGCATTTGGTCTTTAGATATCTAAAGAAATGGCGCTAGGCCATCTCCCGGGTTTTTTCTGCTTCATGGCTAGATTCG >85 AAACATGATTTCGTACCCCGTGTAGGGCATGTTACCCACGTGAGGCGAGGTATGCGTGGGTCGATGTAGTACCTGTTGACCCGCATTTAGCCTCGACTCAATCTGCTGGA >86 AAACATGCGTTAAATGCACTGCGAGTTGTCCGGTAGCCGTCCAACCTCCTCTAGTACCAAGTAATGGCATTAGCACGCGACAACATGGCTGTAAGGGCCCGTGCGACTAA >87 AAACATTTGCACCTAGGCTTCCTAGGTGTTTGGCTGGAAACGTAGGCAAGGTCAGGTATTCGACACATCGCCCATTATCCGTTACACGAATATACAAGACGAGAGACCGG >90 
AAACCAGACGCCTAAGTTGCACTTCGTGTGGACAGTTCACCTGAAAAGGCAGAAAGTTCTGAGATGAGTGCGCGGAGTTACTAACCTAGGCCCTGTACGAGAGCAACATA >91 AAACCATAGATCACAAGTCCACCTCGAGGCGATTATGCATGCCCTTCACTTCTCACGGCTGGATGGGCTTGCCTTAGTCACTTGCGATTGAGTCGTACGATTATAAAGCG >93 AAACCCCCCCGTTACCCTCACTCCGCTCGGCCTAGCGGTGGTCAGGTCAGGAGTTGCAATCGGAGTCACACGATCATACTTTCTCACTGCGCACAACATATCTGCTTGCA >95 AAACCCTAAAGAGGTTACAGATACCATTTTAAAGTCTAGATTCTATGTGGGTATTTGCGGTCGGGACCCGTTCGCGTCCGCGTCAGACTTGCAATCGTGAGCCCGTCACA >97 AAACCGCAGTGCAATCCTCTGGCAACACGGATAGATTCCTTGCTGTAGCAAGACCGACCGCTGTCCCGGGTGCCGCGATGCGCGAGCATGCCCTGCAGGATCCCACACAT >98 AAACCGCATCGGGCTGGGTACCGGACGGTGCTAAGAGTGCCAGAATGAAGGTAAATAAGGTGGATTGAACATTTTATTAGCTCGTCTCGTGGTGCCATTGCCCAGCATCG >99 AAACCGGGTCAATGTGATTCGTATTACTTGTCAAACAGTACTATCAAACCACCGTTCAGTCGCCCGCTTGATCCCTTGATTCTAGAGGCCATACGGCGCGCCTACTTTTT >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >1 AAAAAACATTAGTTACGGAAGGTGGGTGGAGCGGGGGCCGCCAGTCTATATTCATACTAGAAAGGGGCTAAGGGCATCGCGCTCATGAAGTGGCACTTGCAGAGGTGAGT >4 AAAAAATTGTCGTTAAGATGAGGAGCTCTTTCGCATTTGACCCATCAAATCTCGGAATGCACTTGAGATCGACCCGTTTGATACAAGCCTTCATCGTCGATAATATATCG >6 AAAAACCGATGGGGCCGAGCTGTTCTTTGGCCGGGTTACTCTACGCCCACACGGGTACACAGCCGCGAAACGGGAGGCTCGTGCGCGGTCACCTAAGTCCCTGTGGCGGG >7 AAAAACGGATGAGAAATAAAAGGGGAAATAGCGACATGTCAAATGGCCTCTTGGCTGGCGGTGTCTGGCTGGACTAACCCTCTTAAGGACTTAAAGCGTAGGCAAGGTTA >10 AAAAAGCTTACAGTGTTCGTAGTTCTGCTCGTGTCGGTATTCTCATACTCACTCCAGGACTTCGGAAACATTAGTGAAAGTGTCACCGGCCGTGCATTTTCCGGAGTAAG >12 AAAAAGTCTCACTTATTTACGCGCTTGATAGCCCAAACCGTTCACAGCATCTCCAGTAGTCGATTCCGGCTCCTCTCCATGTTGGAGATGTCAGCACGGCAGGTATATGA >14 AAAAATATTTGAATCCGCTGCTGACTTTTTCAAAATTCGATTATCACTACGGCGGTCAGCATATTTCCTTGAACTTATAACGTATCTCAGTATTGCTGGTTTGGAAACGT >15 AAAAATGCCCAGTTCGGCGATCCAACTCGTTAGTCTCAGGGTTCCCGCGGCGAACTCCCTTCGGTTGACTTTACGGCCTACACGCTCAGCCCTGTACCCGCATTGATGTC >19 AAAACACGCATGTCCGACGGGCGCATTAAACGGCATTGTTGTTCTTGAACCGGAAGCAATAGTCTAGGACCGTTCATATTGACTTACTTTACGCGTGGGCCTTGGATGAA >22 
AAAACCATCTGTTTCCCAGAATGCTCCCGATACTTAATCACCTGCGCTTTGATCGTAGGCACTCACCCTCTCAAGTACCCTTGGACAGATTTAAGATCGATAGTTCGTAT >26 AAAACGAACCGGACTACATTCCTCATAGGCTTGAGGGGCAAAAGTTACAGTACAGATTTCGGGACCCGTCGCTTTACAGTGTAGACTGTTTTCCGAGAGTGCCTAGTCCA >28 AAAACGCTCGTGCCCAGTTCGAGCGCCCTGGAGTTTACCGCCCAAGTAAACGCTATTTTTTAAATGACATAGCTCCTTACAGGCGTGGGGGGACGCCTATAGGGCCGTCG >29 AAAACGGTGCCAACCATTGAAGATAGAAGATTCAATACATTGCACAAACGAATATAACCGGTAGGTAAAGTCTCGTTTCAAAGCGGCTCTATTTGTGCACCTTTTGCTGA >31 AAAACTCACCACCAGGACAGAAGCTAAAAGACAGGCCGATTCACGGGCGAGCGGTCGAAGCATACTTTTTAGGGCATCATGTGACCACACGTGTTGACCGTCCATCGTTT >34 AAAACTTTACCGATTTGCAAGATGTAAACAGCACGGGGACGCTATATCGACTGAGTGTTGTAGTGGAATCTACCACCCCGGACATGGGGCTGGACTGATTATTGAACGCT >38 AAAAGAGTTTGCAGGGTCACTAGCCCGGCTCTGCATACATTGGCAAAGAGGCGATAAGCTGTGAATTCCAGGCTGGGGACGAGCTTGAACCTACTGTGCGAAGTTACCAC >40 AAAAGCCAGAAGGTATGCAGATCGATATCGCTCCAATGTGTCAGTAGCCCGGACGAAGGCCTGCTCGATGATGCGTCATATTAATGTCAGAGAACCTAGTAGCCATCGGT >44 AAAAGGACAACGATGTGTGTGCTTCAAATATTCCTCCAATTCGTTACTGTCGCGGATGTTATGTCGGGCACTCTTAGCTTCCAAGGTGGGTGGATGTTAACGCGCATAGA >46 AAAAGGGGTTTGCTGACCCACAGACAATAGATAGCGGATTACTTACTCCACTTACGAGACTAACGCAATGCTAATAACTTGACGTCTAAACGGGACCACATAAGCCTTTA >47 AAAAGGTATTATATTGATGATTAGCCATTTAGGGAAGATCAACAAGATGAAGCATGACGCGAGAGGTAAGGGATCGACGAACGCTCCTGCCATGGGCAATCCAGGAGGGC >49 AAAAGTCCCTGTCGTACCTCAATCCAGGCCACGTATACTCAGGGGGTACCGAAACCCTAATAACTCTCCCGTTGGTGACCAGATCTGAATCTGCACCGCACACAACTACA >51 AAAAGTGGAAAATACCCTCACTAACAGGGTATCTTGCGTTGAGCTGGTAAGTGCACCAGGGAAAACTTGTTATCCTCGAGTCTGATCGATGCTATCTCATATTTCTGAGT >54 AAAATAGAAATCGCAGCTTGAATGCTGTTACTCAATGTTTATGAACATAGCTCCCGCGTTACTCTGTTTCGACATAGGATGCGCGCACCGGAAGCCTGCTACCACAACAT >57 AAAATCAAACTAGCTACTCCGAAGGCGGGATATTTGCCTGGTAGAGGGAAATGTAGCTCACGGGCCGTTTACTCTTCTTCAGAGCAACTAAGTATTCCGGAAAACCTCAG >60 AAAATCGTGTTCAAGAAAGAGAGGGCCGGGCGCTGAATTGGGCCCGAATCACAAAAAAGTGAGTCGCGCTCCTACAAAGTCCTAATCTAATAGTGGATGATGTGTCTGGT >62 AAAATGCCATACTAAATTAACGGTGTCTCATAGCGACATTGTTATTGTCACCTGACATAGCCAGAAGGTTAAAAGTAGTGCGCGACGCGAAATACCCATGCTGCGGAGTC >67 
AAAATTCAGCCTTTCTTACCCAAGGCCTCTCTCGGACAAGAACTATGAAAGCATGCCCTACAGCATACTTTCGCTGACATATGGAGCAGGGAGCCTAAAGGCCGTTTATC >71 AAACAACCACAGATGGCCACTATGTGAAGTTTTGGACGAGTCCATACATTTTTCACTAAGTAAGAAGCTACCTTAATACGTGCACGCATCACATCCTAGACGCTCTGGCT >72 AAACAAGCCGAGAATCTGGCGTGACAAATCCTCCGGAACGGGCTGACCCCACTGTACAAACAATGTAAATCACCGCAGTTTCACTGTACGTTTGGTCTTTTTGATAGACA >74 AAACAATGGACGGAACCATGTTCTGTTACAAGCGCTGACCACACAAGCCGAGAGTACCCCAAGATGATGTCTTAGGATCGTATATACCCTCCATACCCGAGCTTTCCCCG >77 AAACACGTCACCGAGCGCTTAGTGGATCGTACTCAACATGTTGAACAGACATTATCTACATTCGATTCTTCCCATTATGTATCATCGCAGTACACGCCGCTTTCCATTTT >78 AAACACTCGCACAGACCGGTAACCGAGGGAATACAGAATTATAGCCCATATTCGCTGTCCCAAACTGCACCCATCGTTGGCAATTCCGAGACCTCTATTTCCGGTATGCG >79 AAACAGAGACATGTACGTTTTGCGCGGTGGTAGCTCTGGAGTCGGAGGCAGGGTTTTTTGGCCGGCAAAATCAGTATCCGACCTCGTTGGATGACTCCGGAAAACCTTTT >80 AAACAGATCGCTATTAGCACGCGTATGCTTTCACTAGCGAACAAAAGTGCCCCTTTGAGTCCTAGCAGCTACAGTGCCCGTAACTGATATTCTTAAGGCTATTTACAGTT >81 AAACAGGGCACTGGAGGGCAGCCCTTGAACCGCATAGATGGTGGAATTTCATACGGACTGGCGGGCATTATCGGGGTCGTATTGCCTTTGGGGGCATAGCCCACGAGTGC >82 AAACAGTAGAGTTTCATGTCCCTTGTATCGGAGGCAGCGACTCGCTTGAGCAGACCAAGTCCCGTCACTGAGGGTTATCAGTGAGGATACCTTGGTTCAGACAAAAAGAT >83 AAACATACGATCAAGTGTCGAAATTATATCACCGGCATTTGGTCTTTAGATATCTAAAGAAATGGCGCTAGGCCATCTCCCGGGTTTTTTCTGCTTCATGGCTAGATTCG >85 AAACATGATTTCGTACCCCGTGTAGGGCATGTTACCCACGTGAGGCGAGGTATGCGTGGGTCGATGTAGTACCTGTTGACCCGCATTTAGCCTCGACTCAATCTGCTGGA >86 AAACATGCGTTAAATGCACTGCGAGTTGTCCGGTAGCCGTCCAACCTCCTCTAGTACCAAGTAATGGCATTAGCACGCGACAACATGGCTGTAAGGGCCCGTGCGACTAA >87 AAACATTTGCACCTAGGCTTCCTAGGTGTTTGGCTGGAAACGTAGGCAAGGTCAGGTATTCGACACATCGCCCATTATCCGTTACACGAATATACAAGACGAGAGACCGG >90 AAACCAGACGCCTAAGTTGCACTTCGTGTGGACAGTTCACCTGAAAAGGCAGAAAGTTCTGAGATGAGTGCGCGGAGTTACTAACCTAGGCCCTGTACGAGAGCAACATA >91 AAACCATAGATCACAAGTCCACCTCGAGGCGATTATGCATGCCCTTCACTTCTCACGGCTGGATGGGCTTGCCTTAGTCACTTGCGATTGAGTCGTACGATTATAAAGCG >93 AAACCCCCCCGTTACCCTCACTCCGCTCGGCCTAGCGGTGGTCAGGTCAGGAGTTGCAATCGGAGTCACACGATCATACTTTCTCACTGCGCACAACATATCTGCTTGCA >95 
AAACCCTAAAGAGGTTACAGATACCATTTTAAAGTCTAGATTCTATGTGGGTATTTGCGGTCGGGACCCGTTCGCGTCCGCGTCAGACTTGCAATCGTGAGCCCGTCACA >97 AAACCGCAGTGCAATCCTCTGGCAACACGGATAGATTCCTTGCTGTAGCAAGACCGACCGCTGTCCCGGGTGCCGCGATGCGCGAGCATGCCCTGCAGGATCCCACACAT >98 AAACCGCATCGGGCTGGGTACCGGACGGTGCTAAGAGTGCCAGAATGAAGGTAAATAAGGTGGATTGAACATTTTATTAGCTCGTCTCGTGGTGCCATTGCCCAGCATCG >99 AAACCGGGTCAATGTGATTCGTATTACTTGTCAAACAGTACTATCAAACCACCGTTCAGTCGCCCGCTTGATCCCTTGATTCTAGAGGCCATACGGCGCGCCTACTTTTT >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >91 AAACCATAGATCACAAGTCCACCTCGAGGCGATTATGCATGCCCTTCACTTCTCACGGCTGGATGGGCTTGCCTTAGTCACTTGCGATTGAGTCGTACGATTATAAAGCG >93 AAACCCCCCCGTTACCCTCACTCCGCTCGGCCTAGCGGTGGTCAGGTCAGGAGTTGCAATCGGAGTCACACGATCATACTTTCTCACTGCGCACAACATATCTGCTTGCAsimka-1.5.1/example/B.fasta000077500000000000000000000267071353413740300155000ustar00rootroot00000000000000>0 AAAAAAACTCTACAGCGAGCAGTGTTAAAATCGTGGCGCACCCAGACAGCCACTTCGCCGTTCTAAGTGGCCGATCGTAGCACATCGGATGACCTTGGTTGGTACGACAT >2 AAAAAAGAGACGAGCCACGCGGTGCGCCTGAACGTTGGGTCCAGACCACACTTATGGATTCGACGGGGCACCTATCAGGTTCTCCATCGTATAGTCGTCTGTAGGTCTGA >3 AAAAAATCGCTAGGGGGGATGGCCATCAACCCCCCCTCCCGTACCTATGATAGTGGGATCAGATTTAAGCACGGGCCCTACGACTCCCCTTCATGGAATAGGCTAAGGTG >4 AAAAAATTGTCGTTAAGATGAGGAGCTCTTTCGCATTTGACCCATCAAATCTCGGAATGCACTTGAGATCGACCCGTTTGATACAAGCCTTCATCGTCGATAATATATCG >5 AAAAACATTGCGGACTACCGTCGTTGCAGTGGGTCGCCCATTCTAGGCTGCGAGTTCATATGTGTGCCTGTCGCTTAGGGCAATCCTCGGATTGGCTGTTTAACAGGGGT >6 AAAAACCGATGGGGCCGAGCTGTTCTTTGGCCGGGTTACTCTACGCCCACACGGGTACACAGCCGCGAAACGGGAGGCTCGTGCGCGGTCACCTAAGTCCCTGTGGCGGG >7 AAAAACGGATGAGAAATAAAAGGGGAAATAGCGACATGTCAAATGGCCTCTTGGCTGGCGGTGTCTGGCTGGACTAACCCTCTTAAGGACTTAAAGCGTAGGCAAGGTTA >8 AAAAACTTTGACTTTTTCAAGACATGAAAGGATGCGGGCTCATACTGGACGGGTTCATTCCTACCGCGGAACGAAGGGCTATTTTTTGTTTGGGCGAGAGTACATCCGTC >9 AAAAAGACTCAGCTTGACATGGCGGTCTGAGCTTTGCTTGGGCTCTTACTATGTCAGGGTTGGAAACTATGGCAGAAGGGCTTCTCGCATCCTCACGGCTCGAATTAGCT >100 
AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >11 AAAAAGGCAAGGATCTTTGACCACGGCAGTTTGCAATAGTCAATTACGCTTCCCTGAGTACAAACAATGGTCTAATGCTGTTCAGTTGGGGTTAACTGGCCTGACGCTCT >13 AAAAATAAAAATGGCCTAGGTCCAACTCGTCCCCGGGGTAAGTTAGTGTAAGAGCTTGGAGCAAGTCTGTTCCTCGTCTGTCCCAGTAACACTCGGGTCTACGGTCGCAG >16 AAAAATTAGTCAGGTTACCCCCAATTAGGTGAAATACGTCGAAGGGTCGCGTCCAAGAAAGAATGATAGCTGACAGTTCTCTAGGTATTTATATTTGTTTGCATCGACTG >17 AAAACAAATATCCTGAATTCATAGAGCCGTGACTTACAGTTCTATGAAAAGTGTCTGGCAAGGGAGATTTCACGTTTCCCTGTATAGGGTCGTCGTATTGCCCACCATTT >18 AAAACAATGCTGAAGACCCTAATGCGTAACCAACAATGTAAGACTGGCACGTATTCTATGATATCTTATTGGCACTCCATCGCGAGGATACTAATAGACACCTAAAAGGA >19 AAAACACGCATGTCCGACGGGCGCATTAAACGGCATTGTTGTTCTTGAACCGGAAGCAATAGTCTAGGACCGTTCATATTGACTTACTTTACGCGTGGGCCTTGGATGAA >20 AAAACAGTCATTGTCTGATTTAGCATCGGGTGCACCGAAACGCTATCGCCTTCTCGTGAATTCGCAATTCAGCTCCAAGCATCAATCAGAATCACATGCCGCACACAGAA >21 AAAACATAGTTATAATGTTTTGAGTGTAAATGCTTACATCGAAATATTCGACGTACTACACCGTGAGCCAAGACCCTGACTTCGAACCATGCGCTCCTGTGAGTGTGACG >23 AAAACCCACCGTGCCGGTTTAAGAGCAGTAGCTGTTGGTGTTGTGTCCGCGTGGCATCGGAAAGTCGCTACAACGCCGGAAGCCGGGGATTTACAGAATACGTTATACGA >24 AAAACCGGTTCGTGAAGTCCGAAGGAATTCCACAACGCACTGCATGACACCTGGAAGAAGAAGGCCTTTTCCACGCCCTGAACGTAATTTCTGGTAAAGCAAGTGCTCCA >25 AAAACCTGATCGTGTTCATGAGTCTGGTATAGACGGATCCTTGGGCCAAAGTCTTCCGGTCTTCTGGCCGCCTTCAGGAGTCTAATTACCTGAACCTCATCGTAATTGCT >27 AAAACGATTATTTATCATTTACCGCCTTAGAGTGTGGCTTATATAGCATGGGTTTGATTTGAGTGGGACAACAGATCCATTTGATGCAGTATGTATTAGCGGATCATGAC >28 AAAACGCTCGTGCCCAGTTCGAGCGCCCTGGAGTTTACCGCCCAAGTAAACGCTATTTTTTAAATGACATAGCTCCTTACAGGCGTGGGGGGACGCCTATAGGGCCGTCG >30 AAAACGTGATGTAATCGGACGCATTTGACCAGACGTAGCCACCTTATTCGGTGCGTGCCATGACCCGAGAGCGCCAGGGATGCTTCTCGTTCCGGGTCACTGATAAATAG >31 AAAACTCACCACCAGGACAGAAGCTAAAAGACAGGCCGATTCACGGGCGAGCGGTCGAAGCATACTTTTTAGGGCATCATGTGACCACACGTGTTGACCGTCCATCGTTT >32 
AAAACTCCTCCGCCGAGGAGGCACTTAGCCTCGTATGGATGCTTAAGGCATGATCGAGCCGGCCGGAAATCTCCTACCGTCTAATTAGGGGCATTGAAGTCCGGTTCCGC >33 AAAACTGTCAGCTCTAATCGAATGCTTGGAAGTTCTGTCCAAAGTGTTGCGAAGCCGAGCTTGAACGTATATAATAACTGCGGTCCTCATACCGGAACAAGTTTACTGCC >35 AAAACTTTTGGATCGCCATGTGACTAATTCCTATAATTACAATCTGTCATTAGTCGGAGCGGTGCGAGATGTGAGTAGTATAGTCGACGCGGCTAATCGAGGCAGATTCC >36 AAAAGACATCGCTAAGTAGTCGATAACTTTAGGTCTGGCTCAGCGAAGTCCGCGCACCGAGGTACGCGATGAACGTGTAGTAGCTGTGCTGCCGACTCTGAGGCGGTAGC >37 AAAAGACGTGACAGAGGCGATGATACCGCAGACGATACGCCACTACAGCTAAAGAGTCTGTCTAGAAATGCCTAGCGGCACCTGGCGCCGCCGTCTAATGGAGTGCAAGC >38 AAAAGAGTTTGCAGGGTCACTAGCCCGGCTCTGCATACATTGGCAAAGAGGCGATAAGCTGTGAATTCCAGGCTGGGGACGAGCTTGAACCTACTGTGCGAAGTTACCAC >39 AAAAGATGCACCGGCTATCTTAGTTCGTTCCAGGCCAACGAGTGTGACTATAGACGAGTTCTGCTCAGAACGGACTAGGCTTCGGGTGTCACGCGGGATCATATTATCTT >41 AAAAGCCTGGCCTTAACAAAATCTAGGTGCGTCTCGAATCGAACGGAAAACAACGTCTGGTTTACTGAGAAATCCTAGGATGCTGCTGGCTATTTGACCTCACGGGGGTT >42 AAAAGCGTGCCGAAGATGCCGTAATTTCCGAGGATGCACTCTCGTGACATCTCTTTTTAACGAACAAATTGCAGAGGTCAAGGTGATCGAGGCACGCTATGCTAAGCACT >43 AAAAGGAAAGGGAGAACGAATGATTGTTTCCAGGTATCAGGAAGCAACAAAATATAATCGATTCGTCACTGTGAGCCAACAGGCGTGTATGTCTGCGTCAAGCGTGCATC >45 AAAAGGCACGCATCGTCATCTGAACAGCAAAGTTGGGCGTTTCCGCCAATAAAGCGTTTCCCTTCCATTTTATTGTACTAGGAAGAAACCACTCCTATAAGCAAACAAGT >48 AAAAGTACTAACTTATCACGAACCGCTTTTGACGTCTTAATTACAGGTTGGGTGACGCGGCATGTCAGGGGCAAACTAACTATGATATCCACGGAACTGCCGACGACTAA >50 AAAAGTCGGAGTCGATAATGATATAGCGGCACGACTCGAACCCGCTTCGCAGCTCATCTCAGGAGATAGGCCTCGAACCTTCCCTGGTATGTACTCGGAGGCTCTCACCC >51 AAAAGTGGAAAATACCCTCACTAACAGGGTATCTTGCGTTGAGCTGGTAAGTGCACCAGGGAAAACTTGTTATCCTCGAGTCTGATCGATGCTATCTCATATTTCTGAGT >52 AAAAGTTTAGTAAGTTGGGTACGAGGCGTTATGGAGGGTTGCGTCGCTTTCACGAGCCTCATCGATAGCATACCTGTCGCAGATGTATTCAATGGTAACATGACGGTTTT >53 AAAATAATTTTCTGCAGGACCTCTGCTTCGGGAGAAAATATATTTAGATCTCCACCCGGAACCGCTCGCGACTTCACGAGCATCGGGTAGGACTTCGGCCGCTTGGATGC >55 AAAATAGACCTTTCCCCGAAACTGTATGCTAAGACTGTGAGGCGGCGAACGGTCTTTGTTCCTCAGTTAACTGACAACTCACAACAGCGCATAACGAATCACATGCCAGG >56 
AAAATATTATTGATCTGACACTACAGAACTTTCTCGTTGACATCGTGCATTGAACTATCAGATGCCCAACCGAGTGGCGGCGATCGCGTTTCGAGTATCACGCGGGTTCC >58 AAAATCCCACCGACGCACTCAGCTAAGTTGATGCATAACAACGTTTGAGCGCTACCTGAGTTAGTTGCAAGCTCGGTCAACGTGTAAAATGTCCATCAGAGTCACCTCAT >59 AAAATCCTAATCCAGCACACGAGCCTCAGTCAGGTTCAAGGTGCGGCTTACTCTGCCGACACCAGCAAAGATATACTCGGGCAGGGAGATTAAATTGGGTTTGTCGACCT >61 AAAATGACACTAACAACGTTGACCGAGAATGAACGTCTAAACCCTTAGTGTGAATTCGTTTCAATATGTACAGGGCCCTCTGGCATATCCCGCTGCCCGGGCTAATGTCA >63 AAAATGCGAAGGTATTCTGTAGAGGGGAATAACTGGGCTTCCATCCCCAGAGCTAACACAGCCGACTACACACTACATAGATGGTCGGGGGTGGTCCGCCGGAAGACGCT >64 AAAATGGCTTTAGGCTAGTAGTAATCTAATGTGTAACAAAGTCTTGTGGCCCGATCGTTATATCTCTGGCACGATCGGTTGGCGGTTTTTCTAGATTACCTTACGCGATA >65 AAAATGTGGTCGGAGCCGCGTACATTATGTGTGGCTTCACCTATATCTAGGGGAGTTCCCGGCCTAGCACACCAGCGGTCCGTAGGAACCGCGCCCGCCAAACCGAGCAC >66 AAAATTACGGCGCAACTGTTGGCTTCTTCATTCCCTGTTAGGTCCAAGAGCTGACAGGTCATATCTAATTCGACAGTTGCTAGATCGTAGTGAGTTAAGTATTCGTGGAG >68 AAAATTGCGTAGTTAGAACGACGAGCATTCTAATGTACAACCTATAATAAATAACGGGCCCTTGTTGCCTAACCAACAACAGTACCGCCAGGCCACTCCGCTAAGGTCAG >1 AAAAAACATTAGTTACGGAAGGTGGGTGGAGCGGGGGCCGCCAGTCTATATTCATACTAGAAAGGGGCTAAGGGCATCGCGCTCATGAAGTGGCACTTGCAGAGGTGAGT >4 AAAAAATTGTCGTTAAGATGAGGAGCTCTTTCGCATTTGACCCATCAAATCTCGGAATGCACTTGAGATCGACCCGTTTGATACAAGCCTTCATCGTCGATAATATATCG >6 AAAAACCGATGGGGCCGAGCTGTTCTTTGGCCGGGTTACTCTACGCCCACACGGGTACACAGCCGCGAAACGGGAGGCTCGTGCGCGGTCACCTAAGTCCCTGTGGCGGG >7 AAAAACGGATGAGAAATAAAAGGGGAAATAGCGACATGTCAAATGGCCTCTTGGCTGGCGGTGTCTGGCTGGACTAACCCTCTTAAGGACTTAAAGCGTAGGCAAGGTTA >10 AAAAAGCTTACAGTGTTCGTAGTTCTGCTCGTGTCGGTATTCTCATACTCACTCCAGGACTTCGGAAACATTAGTGAAAGTGTCACCGGCCGTGCATTTTCCGGAGTAAG >12 AAAAAGTCTCACTTATTTACGCGCTTGATAGCCCAAACCGTTCACAGCATCTCCAGTAGTCGATTCCGGCTCCTCTCCATGTTGGAGATGTCAGCACGGCAGGTATATGA >14 AAAAATATTTGAATCCGCTGCTGACTTTTTCAAAATTCGATTATCACTACGGCGGTCAGCATATTTCCTTGAACTTATAACGTATCTCAGTATTGCTGGTTTGGAAACGT >15 AAAAATGCCCAGTTCGGCGATCCAACTCGTTAGTCTCAGGGTTCCCGCGGCGAACTCCCTTCGGTTGACTTTACGGCCTACACGCTCAGCCCTGTACCCGCATTGATGTC >19 
AAAACACGCATGTCCGACGGGCGCATTAAACGGCATTGTTGTTCTTGAACCGGAAGCAATAGTCTAGGACCGTTCATATTGACTTACTTTACGCGTGGGCCTTGGATGAA >22 AAAACCATCTGTTTCCCAGAATGCTCCCGATACTTAATCACCTGCGCTTTGATCGTAGGCACTCACCCTCTCAAGTACCCTTGGACAGATTTAAGATCGATAGTTCGTAT >26 AAAACGAACCGGACTACATTCCTCATAGGCTTGAGGGGCAAAAGTTACAGTACAGATTTCGGGACCCGTCGCTTTACAGTGTAGACTGTTTTCCGAGAGTGCCTAGTCCA >28 AAAACGCTCGTGCCCAGTTCGAGCGCCCTGGAGTTTACCGCCCAAGTAAACGCTATTTTTTAAATGACATAGCTCCTTACAGGCGTGGGGGGACGCCTATAGGGCCGTCG >29 AAAACGGTGCCAACCATTGAAGATAGAAGATTCAATACATTGCACAAACGAATATAACCGGTAGGTAAAGTCTCGTTTCAAAGCGGCTCTATTTGTGCACCTTTTGCTGA >31 AAAACTCACCACCAGGACAGAAGCTAAAAGACAGGCCGATTCACGGGCGAGCGGTCGAAGCATACTTTTTAGGGCATCATGTGACCACACGTGTTGACCGTCCATCGTTT >34 AAAACTTTACCGATTTGCAAGATGTAAACAGCACGGGGACGCTATATCGACTGAGTGTTGTAGTGGAATCTACCACCCCGGACATGGGGCTGGACTGATTATTGAACGCT >38 AAAAGAGTTTGCAGGGTCACTAGCCCGGCTCTGCATACATTGGCAAAGAGGCGATAAGCTGTGAATTCCAGGCTGGGGACGAGCTTGAACCTACTGTGCGAAGTTACCAC >40 AAAAGCCAGAAGGTATGCAGATCGATATCGCTCCAATGTGTCAGTAGCCCGGACGAAGGCCTGCTCGATGATGCGTCATATTAATGTCAGAGAACCTAGTAGCCATCGGT >44 AAAAGGACAACGATGTGTGTGCTTCAAATATTCCTCCAATTCGTTACTGTCGCGGATGTTATGTCGGGCACTCTTAGCTTCCAAGGTGGGTGGATGTTAACGCGCATAGA >46 AAAAGGGGTTTGCTGACCCACAGACAATAGATAGCGGATTACTTACTCCACTTACGAGACTAACGCAATGCTAATAACTTGACGTCTAAACGGGACCACATAAGCCTTTA >47 AAAAGGTATTATATTGATGATTAGCCATTTAGGGAAGATCAACAAGATGAAGCATGACGCGAGAGGTAAGGGATCGACGAACGCTCCTGCCATGGGCAATCCAGGAGGGC >49 AAAAGTCCCTGTCGTACCTCAATCCAGGCCACGTATACTCAGGGGGTACCGAAACCCTAATAACTCTCCCGTTGGTGACCAGATCTGAATCTGCACCGCACACAACTACA >51 AAAAGTGGAAAATACCCTCACTAACAGGGTATCTTGCGTTGAGCTGGTAAGTGCACCAGGGAAAACTTGTTATCCTCGAGTCTGATCGATGCTATCTCATATTTCTGAGT >54 AAAATAGAAATCGCAGCTTGAATGCTGTTACTCAATGTTTATGAACATAGCTCCCGCGTTACTCTGTTTCGACATAGGATGCGCGCACCGGAAGCCTGCTACCACAACAT >57 AAAATCAAACTAGCTACTCCGAAGGCGGGATATTTGCCTGGTAGAGGGAAATGTAGCTCACGGGCCGTTTACTCTTCTTCAGAGCAACTAAGTATTCCGGAAAACCTCAG >60 AAAATCGTGTTCAAGAAAGAGAGGGCCGGGCGCTGAATTGGGCCCGAATCACAAAAAAGTGAGTCGCGCTCCTACAAAGTCCTAATCTAATAGTGGATGATGTGTCTGGT >62 
AAAATGCCATACTAAATTAACGGTGTCTCATAGCGACATTGTTATTGTCACCTGACATAGCCAGAAGGTTAAAAGTAGTGCGCGACGCGAAATACCCATGCTGCGGAGTC >67 AAAATTCAGCCTTTCTTACCCAAGGCCTCTCTCGGACAAGAACTATGAAAGCATGCCCTACAGCATACTTTCGCTGACATATGGAGCAGGGAGCCTAAAGGCCGTTTATC >71 AAACAACCACAGATGGCCACTATGTGAAGTTTTGGACGAGTCCATACATTTTTCACTAAGTAAGAAGCTACCTTAATACGTGCACGCATCACATCCTAGACGCTCTGGCT >72 AAACAAGCCGAGAATCTGGCGTGACAAATCCTCCGGAACGGGCTGACCCCACTGTACAAACAATGTAAATCACCGCAGTTTCACTGTACGTTTGGTCTTTTTGATAGACA >74 AAACAATGGACGGAACCATGTTCTGTTACAAGCGCTGACCACACAAGCCGAGAGTACCCCAAGATGATGTCTTAGGATCGTATATACCCTCCATACCCGAGCTTTCCCCG >77 AAACACGTCACCGAGCGCTTAGTGGATCGTACTCAACATGTTGAACAGACATTATCTACATTCGATTCTTCCCATTATGTATCATCGCAGTACACGCCGCTTTCCATTTT >78 AAACACTCGCACAGACCGGTAACCGAGGGAATACAGAATTATAGCCCATATTCGCTGTCCCAAACTGCACCCATCGTTGGCAATTCCGAGACCTCTATTTCCGGTATGCG >79 AAACAGAGACATGTACGTTTTGCGCGGTGGTAGCTCTGGAGTCGGAGGCAGGGTTTTTTGGCCGGCAAAATCAGTATCCGACCTCGTTGGATGACTCCGGAAAACCTTTT >80 AAACAGATCGCTATTAGCACGCGTATGCTTTCACTAGCGAACAAAAGTGCCCCTTTGAGTCCTAGCAGCTACAGTGCCCGTAACTGATATTCTTAAGGCTATTTACAGTT >81 AAACAGGGCACTGGAGGGCAGCCCTTGAACCGCATAGATGGTGGAATTTCATACGGACTGGCGGGCATTATCGGGGTCGTATTGCCTTTGGGGGCATAGCCCACGAGTGC >82 AAACAGTAGAGTTTCATGTCCCTTGTATCGGAGGCAGCGACTCGCTTGAGCAGACCAAGTCCCGTCACTGAGGGTTATCAGTGAGGATACCTTGGTTCAGACAAAAAGAT >83 AAACATACGATCAAGTGTCGAAATTATATCACCGGCATTTGGTCTTTAGATATCTAAAGAAATGGCGCTAGGCCATCTCCCGGGTTTTTTCTGCTTCATGGCTAGATTCG >85 AAACATGATTTCGTACCCCGTGTAGGGCATGTTACCCACGTGAGGCGAGGTATGCGTGGGTCGATGTAGTACCTGTTGACCCGCATTTAGCCTCGACTCAATCTGCTGGA >86 AAACATGCGTTAAATGCACTGCGAGTTGTCCGGTAGCCGTCCAACCTCCTCTAGTACCAAGTAATGGCATTAGCACGCGACAACATGGCTGTAAGGGCCCGTGCGACTAA >87 AAACATTTGCACCTAGGCTTCCTAGGTGTTTGGCTGGAAACGTAGGCAAGGTCAGGTATTCGACACATCGCCCATTATCCGTTACACGAATATACAAGACGAGAGACCGG >90 AAACCAGACGCCTAAGTTGCACTTCGTGTGGACAGTTCACCTGAAAAGGCAGAAAGTTCTGAGATGAGTGCGCGGAGTTACTAACCTAGGCCCTGTACGAGAGCAACATA >91 AAACCATAGATCACAAGTCCACCTCGAGGCGATTATGCATGCCCTTCACTTCTCACGGCTGGATGGGCTTGCCTTAGTCACTTGCGATTGAGTCGTACGATTATAAAGCG >93 
AAACCCCCCCGTTACCCTCACTCCGCTCGGCCTAGCGGTGGTCAGGTCAGGAGTTGCAATCGGAGTCACACGATCATACTTTCTCACTGCGCACAACATATCTGCTTGCA >95 AAACCCTAAAGAGGTTACAGATACCATTTTAAAGTCTAGATTCTATGTGGGTATTTGCGGTCGGGACCCGTTCGCGTCCGCGTCAGACTTGCAATCGTGAGCCCGTCACA >97 AAACCGCAGTGCAATCCTCTGGCAACACGGATAGATTCCTTGCTGTAGCAAGACCGACCGCTGTCCCGGGTGCCGCGATGCGCGAGCATGCCCTGCAGGATCCCACACAT >98 AAACCGCATCGGGCTGGGTACCGGACGGTGCTAAGAGTGCCAGAATGAAGGTAAATAAGGTGGATTGAACATTTTATTAGCTCGTCTCGTGGTGCCATTGCCCAGCATCG >99 AAACCGGGTCAATGTGATTCGTATTACTTGTCAAACAGTACTATCAAACCACCGTTCAGTCGCCCGCTTGATCCCTTGATTCTAGAGGCCATACGGCGCGCCTACTTTTT >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >91 AAACCATAGATCACAAGTCCACCTCGAGGCGATTATGCATGCCCTTCACTTCTCACGGCTGGATGGGCTTGCCTTAGTCACTTGCGATTGAGTCGTACGATTATAAAGCG >93 AAACCCCCCCGTTACCCTCACTCCGCTCGGCCTAGCGGTGGTCAGGTCAGGAGTTGCAATCGGAGTCACACGATCATACTTTCTCACTGCGCACAACATATCTGCTTGCAsimka-1.5.1/example/C.fasta000077500000000000000000000257761353413740300155060ustar00rootroot00000000000000>0 AAAAAAACCGAAGTCTCTTGCAGTATGCACAGGAGATCGCGGAGAACCGGAATCCCCTGGTTCGACTGTGACTCTTGGAGCGTCGCTAATCGCGTCGGTAACTTTAATTT >2 AAAAAAGCGGAATCACTGAAGACGTCATTTCCTCTTCAGAAAAAGTTTACCATGTTCTATTCTTGACCCTACGAATCAAGGTTCCGTTTAGCGTGTCGTTATAGAACAAT >3 AAAAAATACAGTATATGCCCCGAACTAGCCATCCGGATCCCAATAATCAAATCACCGTGACTGATGGTCAGATATCTTCTCCGGGATTGTGGGGAACGCCTACTTCGTGG >4 AAAAACAAGGGCTGAGCATCTGCGACATAAGTATCCCGTACATTATGTTCGATTGTAGTTAACTAAATTGTCCTACACTAGCGACCCTAAACCACCGTTACTCGAGGGCA >7 AAAAACGGTGGTTTGCATGTTAGGCGGAAGATCCCAATCCCTTGAAGAATCACGGAAGTAAGTTACTAGTGAAGGTTACTTTGCGATCTAAGGCGCTCGTGGGTGTCACA >8 AAAAACTGGCTACATTTAGACTTTTCCCAGCCTTGCCTACATGCCTAGCACTGAATCAAATACCCGGCTACCGGGTAGCGACCCCACAATATCAAGTTGCTTCTCTGAGT >10 AAAAAGCAGTCGGAAGACCTTTAGCTGTCCTTTAGCAGTGACTCTCCGCCCGTCTCAATGCAAAGAGATTCTGCGCATTGCTCCTTGTCCTAAGACACCATTAAGCGAAT >12 AAAAAGTTAGTCCTAAATTGTGGGCATCGGGTTGGCTGAATTAGACTGTGTGATCATATTCTACTTTTCGTCGACAGGATCTGGGCCGATTAGGGACATGTAAAGTGTCC >13 
AAAAATAAGAAGCCTTACGAGCGTTGCGGGTTCGTTCTATATAAGGGCAGTACGTTTAGTTTACCCATCTGGGGTTCTTGTTACAACATGGAGTCGACTGTGTCTTTTAC >14 AAAAATCCGTTTAAGTCCACACCCCATATCTTTTGCAAGTTACGTCTTCACTTGTGTACTGTAGACTGCTGGGGACTAGAGTCCATCGTTGTGACTTAAAGGACGCTTTC >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >15 AAAAATGAATGATCCACGCCCGGTCAGTGTAGATCACTGAGCCTCGGCTATTCAGCCCATAAAACACAAAGCCTGCGGAGCGGCGGCCTACTAGAAATGCGATCCATCCT >17 AAAACAAAAGCAATTAGCAACGGACGCGAACTCTCGATCTTACAAGGATACCAGAATAGTTCACCCTGGCTGCGCAATGCCGATATTAAGTGTAGGCTTCGGCCCATTGT >18 AAAACAAGCGATTGAGATTCTAATCCAGCTTGCGACATACTTTGTCAGCGATCTGCGCTGTCTAATTTGCATCAAGTGTCTATACACGGTCCCTCCTCAGGGCGGGCACG >19 AAAACACTCCCGTATTCATGTCTACGTACCGTTATCCACTATTAAATGTACACTAGCTAACAGTGGTCATTGAGTAATCAAGATCAAGCGCGAGATGGGGGAGGATTGCT >23 AAAACCCAAGATGACGAATATCTGGTTACTGGGTTGACGGGATATAATCATTGCAAAACCTTTTGGGAAGGGACCTTAAAGCCTAGTCTGTCTTTTAGTTGCCCTCGTGG >25 AAAACCTGACGTGTAGATTAGTTACGTCCCCCGGCGCATGCAATCGTCCCACACGGCAGATGTGTTCGGACTGGAGACAATAGTGCTAGTTGGACTACGGCCTATTGGCG >26 AAAACGAAGTAGCGGGTTACGTGAATGGTGCCCACTTACGATAGGTTCGAGGCTGAGGGCTGTACCACGCCGGACGATTCGTGTTCGCCTCCTTGAATCTGATTCCCCGT >29 AAAACGGGACCTGCTTTTTTGCTTTGCGGAGGTCAATGGTTAAGTACGAACCAGCATGCGGGATGTGGCCCCGCCAATGTGGTACCGTTTGGTACTGGATCAGTCCTTTT >38 AAAAGATACCTACGCTCCATCTAGAGGTTAATAGCGAACCTCTCGCTTCTAGAGCGAGGTGGCATTCAGCGAGATGAACCTCTAATCTAAGCCCCGAATATCTGACGCAG >39 AAAAGATTGCGTCTCAGGCGGGCTCTAAACACGCTTCGCCGGACCAACTTTCTGACATTCTGGGACTGTCACTTGCAAGGCCTTATAGCTTTACGGCATTCTCCTTCGCC >40 AAAAGCACTACTCCGTTTGGACCATTTGCGCAAACATGATCCCCGCGCGGTACGGAACTTTTTTACAACGCAGGATCTTCTGACCCGGTAACATGCTACCTGGCGCCTGT >41 AAAAGCCGGGATGGGTAACTCTTTTACTATTTGGGGACAAAGCTGCATAGTGACCGGCACTCAAACATACATTGTAATAGTGAAATCGGAGAGCACCATGCGCCAGTCGG >42 AAAAGCGATTTGAGCGGAGGAGGCGTTGAGGAGATCAGAAGGGGGCACACCGCAACGGTCTTCAACACACACTGGCAGTCTTTAAGACGTTTGAAGTTTAGCTTAGTTAT >43 
AAAAGCTGGTCCTGCCATTGATGTCCGATGAGCGACTCTTACCCTTGCAAAGAGCACAAAAGTTATATTCTGATCAATGTCATGCATTTAACATTACTGACAGGGTGGCG >44 AAAAGGAAGATCACAGGAGAAGTAGACACCTCATCGATGGCACGGACCCAGTTCACATATAGGGTGGGGAGCTATCCAGAAGTGGCCATGGTTTTGTAACTTCGCGTTAG >46 AAAAGGGGGAAACAACTGAAGGTCGGTGCCGCGATAAGCCCGTTAACAGGGACGGGCCAAAGCATTAGCTTGTTAAGAACCGATGCTATGTTTATCCGATTGGATGTTGT >50 AAAAGTGACGAATTCCTGCATCGAGAGGATGGCGTCTCTCGCATCGCCGGTCCAAGAAGACCACAGGAAACAGATCGGAGAAGGCCGCAGGTATTCAGGAAGTTCATATA >51 AAAAGTTCCATTGGGGGCACGCCTAAGTTACGGCACCCGAGTTTCGCCAGGAAGTGGAAATTTATTCTTTTGAATCCGCAGAAGTGTAAAATGCCGTCCAATAAAATTAC >53 AAAATAAGGCTTTGGTGCCAAGACCAAACTCGCTTTGATGTCGTCTTGGCCAAAAAGATACCTTCGGATGGGCCCACCCCTATGCTTCCTCATGCTTTCACTAGGGAGAC >55 AAAATAGTAGGCCGAATGGATTGGGTGACGTGTGGCACAGTAAGGGAGGACTATCAGGTGATTCTCTACCTGGAGCCACCATGAACCTCATGAGTAGAGGCGGAACAAGG >58 AAAATCCGTACCTCGAGCGGTTGGAGAACGCTCGCGCTGAATGCCCGTAAGATGTTGACAGTGAAGTGATTTTGCAATCGATGTATTCGTGTCGAATCATCATGCCCGCT >60 AAAATCTATTAGGAGATCAACTGTCCGAGTATTGTGGGGTTGGCTCTACTTACGCAACCCGCGATACAGCAATACGATCCTCGAGCTCTCCTCAACCCCGATTGCGTATG >62 AAAATGCACGAAAATAGGAGCATTCGTCCCAGTAGTGATTGAAAGTCCTTAGGCATAATTCAATACATTCGTTTGGACCCCAAGTGTTGGGCGTTCAACGCGCGAGATTG >64 AAAATGGCGGAATCTGTTGGATCCTGGCCGGTAGAGTGTGCCTACAGATTTGTCGAGGCGGGTAGTCTGCCTGCGGCCTGCACGTTAGAGTACTACCTCATAGTGTTAAG >65 AAAATGTCAACTCACGTTCTTTCGCACTTATGTTTCAGCCTCAGATTCAATTTGACATCCTACAAATATGAGAAAGGCAACCAGGAATGGGGCTGAACCCGTTCAGCCGT >66 AAAATTATGAGAGGGGCGTTCTCGCAATGGAGATTCTTCTCGTCGACTCACCAGGGGACCAGTGCACGCAGCTCCATAGGTGCACGCTCTCGGACGTGGCAACGGAGGAC >68 AAAATTGGCCGCTGAGACATGGGACATGGATTATCGTGTCATATAGACGGCGGCATTTTGCTACTAGCGAACACTCAATGGGGCTTTCCGTGGACTACTATCAATACTGA >70 AAACAAAATTTTGTTCTATCGAGTCTACCAGAGCCGAACACGGCCAAGCCCAATATGCCAGTGGTGTGCTGCTTGAGCAATTCGCAGGTATCTCTCCAACTACATCGCCG >73 AAACAAGGTCCGTACGAAAACCTAGTGACCTCAAATCAGTTGTAGGTGTACTGGCTTGCAACGTTGCCGGTGAACGAAAGAACCGCTAGGGGGCCGTGATGCATTCTACC >79 AAACAGATAGCCACCAACTATACCCTTCTTCGATGTCCATGCGGGTCGTTAACGTCGCTAGGCGTGAACGGACGCTCGTGGATGTCCGTCTACGCAATGTTACGAGTCAA >80 
AAACAGCCTCGATGGGTGTACAGTGCACCCTTGCTCGACTACGGCTTCAATTCTGATGTAAAACCTGTACGTGAGACTGCCAAGGCAGATGCAACCAGATCTCCTGGATT >81 AAACAGCTAGGGGTCGACCACTTGCCGGCAGACACTGAGGTAGATATTAAGCAGAACACCGGGTGGTAATTGCATGTCGTATTAGTTCCCGTTGCTTAGCATGCCTAAGG >84 AAACATATGTCATAAGGCGCTGAAGACACCGCACGGGGACTAACACAACAGCACCAGATTGTCGACGTAAGTGCTTTTCCTATTTCTTAGCCCATCTCTAATCAGCCCGG >85 AAACATCGAGCTTGGACGCGTGCAGGTATTAAATTTGAGCCCCAAGGCTTATAATGCATCCTCCCACAAGAAGGCATAGATGTACTCGTCTTAGTACAAGGCTGCTAGAG >87 AAACATTTGTACGCGAATAAATTATTTCGGTGTCAGAGGCGACACCCGTAAACGGGAGCAAGGCTAAGTCAAGGTGTTGAAGAGAATTTTCTGTGGTCATTTACTGTCCT >89 AAACCACGAATACCAAAATAAAGTCACCCTGTGCCTTAGTGTTTAAGATGTACTGACAATTTCCTGTGGATCGTTGTGCGGTTGCTGTGGGGGCCCTATCAGCGAACGGG >90 AAACCAGTATGCTTTTAAGGGAACCGAGGAATCGCATGATCTTCCGGTGATTATGCCATCTCTAACAGGGAGGCGCCTTGCTTTAACGCTGTACCCGTTTTGTACTCGAA >91 AAACCATCTTGATAATTCTAAGGTCAGTACGAAAGGCCTCTAGTCAACCGTCTCGTGGATCGGGACTCAGCCGTGGAATGATCATCATTAGCAGACAGACAGTCGATATC >94 AAACCCGCCCCATGCATAGTAAACGAAGAAGTCCACTCTTAATGTCAAACTAACTTTTTAGGGCATCCGTTGAAGGGCATCGATACCGTCCAACCGGTCGGTGGAGGACG >99 AAACCGTAAATGCCGCCCCCCCCACCAGGCTGGAAGGGAAGGGATCTAGTAGCAAACCTACATCCATGAATGGAGAAGAACTGGTTCGAACACCATGCGCATGTTGACCA >1 AAAAAACATTAGTTACGGAAGGTGGGTGGAGCGGGGGCCGCCAGTCTATATTCATACTAGAAAGGGGCTAAGGGCATCGCGCTCATGAAGTGGCACTTGCAGAGGTGAGT >4 AAAAAATTGTCGTTAAGATGAGGAGCTCTTTCGCATTTGACCCATCAAATCTCGGAATGCACTTGAGATCGACCCGTTTGATACAAGCCTTCATCGTCGATAATATATCG >6 AAAAACCGATGGGGCCGAGCTGTTCTTTGGCCGGGTTACTCTACGCCCACACGGGTACACAGCCGCGAAACGGGAGGCTCGTGCGCGGTCACCTAAGTCCCTGTGGCGGG >7 AAAAACGGATGAGAAATAAAAGGGGAAATAGCGACATGTCAAATGGCCTCTTGGCTGGCGGTGTCTGGCTGGACTAACCCTCTTAAGGACTTAAAGCGTAGGCAAGGTTA >10 AAAAAGCTTACAGTGTTCGTAGTTCTGCTCGTGTCGGTATTCTCATACTCACTCCAGGACTTCGGAAACATTAGTGAAAGTGTCACCGGCCGTGCATTTTCCGGAGTAAG >12 AAAAAGTCTCACTTATTTACGCGCTTGATAGCCCAAACCGTTCACAGCATCTCCAGTAGTCGATTCCGGCTCCTCTCCATGTTGGAGATGTCAGCACGGCAGGTATATGA >14 AAAAATATTTGAATCCGCTGCTGACTTTTTCAAAATTCGATTATCACTACGGCGGTCAGCATATTTCCTTGAACTTATAACGTATCTCAGTATTGCTGGTTTGGAAACGT >15 
AAAAATGCCCAGTTCGGCGATCCAACTCGTTAGTCTCAGGGTTCCCGCGGCGAACTCCCTTCGGTTGACTTTACGGCCTACACGCTCAGCCCTGTACCCGCATTGATGTC >19 AAAACACGCATGTCCGACGGGCGCATTAAACGGCATTGTTGTTCTTGAACCGGAAGCAATAGTCTAGGACCGTTCATATTGACTTACTTTACGCGTGGGCCTTGGATGAA >22 AAAACCATCTGTTTCCCAGAATGCTCCCGATACTTAATCACCTGCGCTTTGATCGTAGGCACTCACCCTCTCAAGTACCCTTGGACAGATTTAAGATCGATAGTTCGTAT >26 AAAACGAACCGGACTACATTCCTCATAGGCTTGAGGGGCAAAAGTTACAGTACAGATTTCGGGACCCGTCGCTTTACAGTGTAGACTGTTTTCCGAGAGTGCCTAGTCCA >28 AAAACGCTCGTGCCCAGTTCGAGCGCCCTGGAGTTTACCGCCCAAGTAAACGCTATTTTTTAAATGACATAGCTCCTTACAGGCGTGGGGGGACGCCTATAGGGCCGTCG >29 AAAACGGTGCCAACCATTGAAGATAGAAGATTCAATACATTGCACAAACGAATATAACCGGTAGGTAAAGTCTCGTTTCAAAGCGGCTCTATTTGTGCACCTTTTGCTGA >31 AAAACTCACCACCAGGACAGAAGCTAAAAGACAGGCCGATTCACGGGCGAGCGGTCGAAGCATACTTTTTAGGGCATCATGTGACCACACGTGTTGACCGTCCATCGTTT >34 AAAACTTTACCGATTTGCAAGATGTAAACAGCACGGGGACGCTATATCGACTGAGTGTTGTAGTGGAATCTACCACCCCGGACATGGGGCTGGACTGATTATTGAACGCT >38 AAAAGAGTTTGCAGGGTCACTAGCCCGGCTCTGCATACATTGGCAAAGAGGCGATAAGCTGTGAATTCCAGGCTGGGGACGAGCTTGAACCTACTGTGCGAAGTTACCAC >40 AAAAGCCAGAAGGTATGCAGATCGATATCGCTCCAATGTGTCAGTAGCCCGGACGAAGGCCTGCTCGATGATGCGTCATATTAATGTCAGAGAACCTAGTAGCCATCGGT >44 AAAAGGACAACGATGTGTGTGCTTCAAATATTCCTCCAATTCGTTACTGTCGCGGATGTTATGTCGGGCACTCTTAGCTTCCAAGGTGGGTGGATGTTAACGCGCATAGA >46 AAAAGGGGTTTGCTGACCCACAGACAATAGATAGCGGATTACTTACTCCACTTACGAGACTAACGCAATGCTAATAACTTGACGTCTAAACGGGACCACATAAGCCTTTA >47 AAAAGGTATTATATTGATGATTAGCCATTTAGGGAAGATCAACAAGATGAAGCATGACGCGAGAGGTAAGGGATCGACGAACGCTCCTGCCATGGGCAATCCAGGAGGGC >49 AAAAGTCCCTGTCGTACCTCAATCCAGGCCACGTATACTCAGGGGGTACCGAAACCCTAATAACTCTCCCGTTGGTGACCAGATCTGAATCTGCACCGCACACAACTACA >51 AAAAGTGGAAAATACCCTCACTAACAGGGTATCTTGCGTTGAGCTGGTAAGTGCACCAGGGAAAACTTGTTATCCTCGAGTCTGATCGATGCTATCTCATATTTCTGAGT >54 AAAATAGAAATCGCAGCTTGAATGCTGTTACTCAATGTTTATGAACATAGCTCCCGCGTTACTCTGTTTCGACATAGGATGCGCGCACCGGAAGCCTGCTACCACAACAT >57 AAAATCAAACTAGCTACTCCGAAGGCGGGATATTTGCCTGGTAGAGGGAAATGTAGCTCACGGGCCGTTTACTCTTCTTCAGAGCAACTAAGTATTCCGGAAAACCTCAG >60 
AAAATCGTGTTCAAGAAAGAGAGGGCCGGGCGCTGAATTGGGCCCGAATCACAAAAAAGTGAGTCGCGCTCCTACAAAGTCCTAATCTAATAGTGGATGATGTGTCTGGT >62 AAAATGCCATACTAAATTAACGGTGTCTCATAGCGACATTGTTATTGTCACCTGACATAGCCAGAAGGTTAAAAGTAGTGCGCGACGCGAAATACCCATGCTGCGGAGTC >67 AAAATTCAGCCTTTCTTACCCAAGGCCTCTCTCGGACAAGAACTATGAAAGCATGCCCTACAGCATACTTTCGCTGACATATGGAGCAGGGAGCCTAAAGGCCGTTTATC >71 AAACAACCACAGATGGCCACTATGTGAAGTTTTGGACGAGTCCATACATTTTTCACTAAGTAAGAAGCTACCTTAATACGTGCACGCATCACATCCTAGACGCTCTGGCT >72 AAACAAGCCGAGAATCTGGCGTGACAAATCCTCCGGAACGGGCTGACCCCACTGTACAAACAATGTAAATCACCGCAGTTTCACTGTACGTTTGGTCTTTTTGATAGACA >74 AAACAATGGACGGAACCATGTTCTGTTACAAGCGCTGACCACACAAGCCGAGAGTACCCCAAGATGATGTCTTAGGATCGTATATACCCTCCATACCCGAGCTTTCCCCG >77 AAACACGTCACCGAGCGCTTAGTGGATCGTACTCAACATGTTGAACAGACATTATCTACATTCGATTCTTCCCATTATGTATCATCGCAGTACACGCCGCTTTCCATTTT >78 AAACACTCGCACAGACCGGTAACCGAGGGAATACAGAATTATAGCCCATATTCGCTGTCCCAAACTGCACCCATCGTTGGCAATTCCGAGACCTCTATTTCCGGTATGCG >79 AAACAGAGACATGTACGTTTTGCGCGGTGGTAGCTCTGGAGTCGGAGGCAGGGTTTTTTGGCCGGCAAAATCAGTATCCGACCTCGTTGGATGACTCCGGAAAACCTTTT >80 AAACAGATCGCTATTAGCACGCGTATGCTTTCACTAGCGAACAAAAGTGCCCCTTTGAGTCCTAGCAGCTACAGTGCCCGTAACTGATATTCTTAAGGCTATTTACAGTT >81 AAACAGGGCACTGGAGGGCAGCCCTTGAACCGCATAGATGGTGGAATTTCATACGGACTGGCGGGCATTATCGGGGTCGTATTGCCTTTGGGGGCATAGCCCACGAGTGC >82 AAACAGTAGAGTTTCATGTCCCTTGTATCGGAGGCAGCGACTCGCTTGAGCAGACCAAGTCCCGTCACTGAGGGTTATCAGTGAGGATACCTTGGTTCAGACAAAAAGAT >83 AAACATACGATCAAGTGTCGAAATTATATCACCGGCATTTGGTCTTTAGATATCTAAAGAAATGGCGCTAGGCCATCTCCCGGGTTTTTTCTGCTTCATGGCTAGATTCG >85 AAACATGATTTCGTACCCCGTGTAGGGCATGTTACCCACGTGAGGCGAGGTATGCGTGGGTCGATGTAGTACCTGTTGACCCGCATTTAGCCTCGACTCAATCTGCTGGA >86 AAACATGCGTTAAATGCACTGCGAGTTGTCCGGTAGCCGTCCAACCTCCTCTAGTACCAAGTAATGGCATTAGCACGCGACAACATGGCTGTAAGGGCCCGTGCGACTAA >87 AAACATTTGCACCTAGGCTTCCTAGGTGTTTGGCTGGAAACGTAGGCAAGGTCAGGTATTCGACACATCGCCCATTATCCGTTACACGAATATACAAGACGAGAGACCGG >90 AAACCAGACGCCTAAGTTGCACTTCGTGTGGACAGTTCACCTGAAAAGGCAGAAAGTTCTGAGATGAGTGCGCGGAGTTACTAACCTAGGCCCTGTACGAGAGCAACATA >91 
AAACCATAGATCACAAGTCCACCTCGAGGCGATTATGCATGCCCTTCACTTCTCACGGCTGGATGGGCTTGCCTTAGTCACTTGCGATTGAGTCGTACGATTATAAAGCG >93 AAACCCCCCCGTTACCCTCACTCCGCTCGGCCTAGCGGTGGTCAGGTCAGGAGTTGCAATCGGAGTCACACGATCATACTTTCTCACTGCGCACAACATATCTGCTTGCA >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >91 AAACCATAGATCACAAGTCCACCTCGAGGCGATTATGCATGCCCTTCACTTCTCACGGCTGGATGGGCTTGCCTTAGTCACTTGCGATTGAGTCGTACGATTATAAAGCG >93 AAACCCCCCCGTTACCCTCACTCCGCTCGGCCTAGCGGTGGTCAGGTCAGGAGTTGCAATCGGAGTCACACGATCATACTTTCTCACTGCGCACAACATATCTGCTTGCAsimka-1.5.1/example/D_paired_1.fasta000077500000000000000000000246751353413740300172500ustar00rootroot00000000000000>0 AAAAAAACTCTACAGCGAGCAGTGTTAAAATCGTGGCGCACCCAGACAGCCACTTCGCCGTTCTAAGTGGCCGATCGTAGCACATCGGATGACCTTGGTTGGTACGACAT >2 AAAAAAGAGACGAGCCACGCGGTGCGCCTGAACGTTGGGTCCAGACCACACTTATGGATTCGACGGGGCACCTATCAGGTTCTCCATCGTATAGTCGTCTGTAGGTCTGA >3 AAAAAATCGCTAGGGGGGATGGCCATCAACCCCCCCTCCCGTACCTATGATAGTGGGATCAGATTTAAGCACGGGCCCTACGACTCCCCTTCATGGAATAGGCTAAGGTG >4 AAAAAATTGTCGTTAAGATGAGGAGCTCTTTCGCATTTGACCCATCAAATCTCGGAATGCACTTGAGATCGACCCGTTTGATACAAGCCTTCATCGTCGATAATATATCG >5 AAAAACATTGCGGACTACCGTCGTTGCAGTGGGTCGCCCATTCTAGGCTGCGAGTTCATATGTGTGCCTGTCGCTTAGGGCAATCCTCGGATTGGCTGTTTAACAGGGGT >6 AAAAACCGATGGGGCCGAGCTGTTCTTTGGCCGGGTTACTCTACGCCCACACGGGTACACAGCCGCGAAACGGGAGGCTCGTGCGCGGTCACCTAAGTCCCTGTGGCGGG >7 AAAAACGGATGAGAAATAAAAGGGGAAATAGCGACATGTCAAATGGCCTCTTGGCTGGCGGTGTCTGGCTGGACTAACCCTCTTAAGGACTTAAAGCGTAGGCAAGGTTA >8 AAAAACTTTGACTTTTTCAAGACATGAAAGGATGCGGGCTCATACTGGACGGGTTCATTCCTACCGCGGAACGAAGGGCTATTTTTTGTTTGGGCGAGAGTACATCCGTC >9 AAAAAGACTCAGCTTGACATGGCGGTCTGAGCTTTGCTTGGGCTCTTACTATGTCAGGGTTGGAAACTATGGCAGAAGGGCTTCTCGCATCCTCACGGCTCGAATTAGCT >11 AAAAAGGCAAGGATCTTTGACCACGGCAGTTTGCAATAGTCAATTACGCTTCCCTGAGTACAAACAATGGTCTAATGCTGTTCAGTTGGGGTTAACTGGCCTGACGCTCT >13 AAAAATAAAAATGGCCTAGGTCCAACTCGTCCCCGGGGTAAGTTAGTGTAAGAGCTTGGAGCAAGTCTGTTCCTCGTCTGTCCCAGTAACACTCGGGTCTACGGTCGCAG >16 
AAAAATTAGTCAGGTTACCCCCAATTAGGTGAAATACGTCGAAGGGTCGCGTCCAAGAAAGAATGATAGCTGACAGTTCTCTAGGTATTTATATTTGTTTGCATCGACTG >17 AAAACAAATATCCTGAATTCATAGAGCCGTGACTTACAGTTCTATGAAAAGTGTCTGGCAAGGGAGATTTCACGTTTCCCTGTATAGGGTCGTCGTATTGCCCACCATTT >18 AAAACAATGCTGAAGACCCTAATGCGTAACCAACAATGTAAGACTGGCACGTATTCTATGATATCTTATTGGCACTCCATCGCGAGGATACTAATAGACACCTAAAAGGA >19 AAAACACGCATGTCCGACGGGCGCATTAAACGGCATTGTTGTTCTTGAACCGGAAGCAATAGTCTAGGACCGTTCATATTGACTTACTTTACGCGTGGGCCTTGGATGAA >20 AAAACAGTCATTGTCTGATTTAGCATCGGGTGCACCGAAACGCTATCGCCTTCTCGTGAATTCGCAATTCAGCTCCAAGCATCAATCAGAATCACATGCCGCACACAGAA >21 AAAACATAGTTATAATGTTTTGAGTGTAAATGCTTACATCGAAATATTCGACGTACTACACCGTGAGCCAAGACCCTGACTTCGAACCATGCGCTCCTGTGAGTGTGACG >23 AAAACCCACCGTGCCGGTTTAAGAGCAGTAGCTGTTGGTGTTGTGTCCGCGTGGCATCGGAAAGTCGCTACAACGCCGGAAGCCGGGGATTTACAGAATACGTTATACGA >24 AAAACCGGTTCGTGAAGTCCGAAGGAATTCCACAACGCACTGCATGACACCTGGAAGAAGAAGGCCTTTTCCACGCCCTGAACGTAATTTCTGGTAAAGCAAGTGCTCCA >25 AAAACCTGATCGTGTTCATGAGTCTGGTATAGACGGATCCTTGGGCCAAAGTCTTCCGGTCTTCTGGCCGCCTTCAGGAGTCTAATTACCTGAACCTCATCGTAATTGCT >27 AAAACGATTATTTATCATTTACCGCCTTAGAGTGTGGCTTATATAGCATGGGTTTGATTTGAGTGGGACAACAGATCCATTTGATGCAGTATGTATTAGCGGATCATGAC >28 AAAACGCTCGTGCCCAGTTCGAGCGCCCTGGAGTTTACCGCCCAAGTAAACGCTATTTTTTAAATGACATAGCTCCTTACAGGCGTGGGGGGACGCCTATAGGGCCGTCG >30 AAAACGTGATGTAATCGGACGCATTTGACCAGACGTAGCCACCTTATTCGGTGCGTGCCATGACCCGAGAGCGCCAGGGATGCTTCTCGTTCCGGGTCACTGATAAATAG >31 AAAACTCACCACCAGGACAGAAGCTAAAAGACAGGCCGATTCACGGGCGAGCGGTCGAAGCATACTTTTTAGGGCATCATGTGACCACACGTGTTGACCGTCCATCGTTT >32 AAAACTCCTCCGCCGAGGAGGCACTTAGCCTCGTATGGATGCTTAAGGCATGATCGAGCCGGCCGGAAATCTCCTACCGTCTAATTAGGGGCATTGAAGTCCGGTTCCGC >33 AAAACTGTCAGCTCTAATCGAATGCTTGGAAGTTCTGTCCAAAGTGTTGCGAAGCCGAGCTTGAACGTATATAATAACTGCGGTCCTCATACCGGAACAAGTTTACTGCC >35 AAAACTTTTGGATCGCCATGTGACTAATTCCTATAATTACAATCTGTCATTAGTCGGAGCGGTGCGAGATGTGAGTAGTATAGTCGACGCGGCTAATCGAGGCAGATTCC >36 AAAAGACATCGCTAAGTAGTCGATAACTTTAGGTCTGGCTCAGCGAAGTCCGCGCACCGAGGTACGCGATGAACGTGTAGTAGCTGTGCTGCCGACTCTGAGGCGGTAGC >37 
AAAAGACGTGACAGAGGCGATGATACCGCAGACGATACGCCACTACAGCTAAAGAGTCTGTCTAGAAATGCCTAGCGGCACCTGGCGCCGCCGTCTAATGGAGTGCAAGC >38 AAAAGAGTTTGCAGGGTCACTAGCCCGGCTCTGCATACATTGGCAAAGAGGCGATAAGCTGTGAATTCCAGGCTGGGGACGAGCTTGAACCTACTGTGCGAAGTTACCAC >39 AAAAGATGCACCGGCTATCTTAGTTCGTTCCAGGCCAACGAGTGTGACTATAGACGAGTTCTGCTCAGAACGGACTAGGCTTCGGGTGTCACGCGGGATCATATTATCTT >41 AAAAGCCTGGCCTTAACAAAATCTAGGTGCGTCTCGAATCGAACGGAAAACAACGTCTGGTTTACTGAGAAATCCTAGGATGCTGCTGGCTATTTGACCTCACGGGGGTT >42 AAAAGCGTGCCGAAGATGCCGTAATTTCCGAGGATGCACTCTCGTGACATCTCTTTTTAACGAACAAATTGCAGAGGTCAAGGTGATCGAGGCACGCTATGCTAAGCACT >43 AAAAGGAAAGGGAGAACGAATGATTGTTTCCAGGTATCAGGAAGCAACAAAATATAATCGATTCGTCACTGTGAGCCAACAGGCGTGTATGTCTGCGTCAAGCGTGCATC >45 AAAAGGCACGCATCGTCATCTGAACAGCAAAGTTGGGCGTTTCCGCCAATAAAGCGTTTCCCTTCCATTTTATTGTACTAGGAAGAAACCACTCCTATAAGCAAACAAGT >48 AAAAGTACTAACTTATCACGAACCGCTTTTGACGTCTTAATTACAGGTTGGGTGACGCGGCATGTCAGGGGCAAACTAACTATGATATCCACGGAACTGCCGACGACTAA >50 AAAAGTCGGAGTCGATAATGATATAGCGGCACGACTCGAACCCGCTTCGCAGCTCATCTCAGGAGATAGGCCTCGAACCTTCCCTGGTATGTACTCGGAGGCTCTCACCC >51 AAAAGTGGAAAATACCCTCACTAACAGGGTATCTTGCGTTGAGCTGGTAAGTGCACCAGGGAAAACTTGTTATCCTCGAGTCTGATCGATGCTATCTCATATTTCTGAGT >52 AAAAGTTTAGTAAGTTGGGTACGAGGCGTTATGGAGGGTTGCGTCGCTTTCACGAGCCTCATCGATAGCATACCTGTCGCAGATGTATTCAATGGTAACATGACGGTTTT >53 AAAATAATTTTCTGCAGGACCTCTGCTTCGGGAGAAAATATATTTAGATCTCCACCCGGAACCGCTCGCGACTTCACGAGCATCGGGTAGGACTTCGGCCGCTTGGATGC >55 AAAATAGACCTTTCCCCGAAACTGTATGCTAAGACTGTGAGGCGGCGAACGGTCTTTGTTCCTCAGTTAACTGACAACTCACAACAGCGCATAACGAATCACATGCCAGG >56 AAAATATTATTGATCTGACACTACAGAACTTTCTCGTTGACATCGTGCATTGAACTATCAGATGCCCAACCGAGTGGCGGCGATCGCGTTTCGAGTATCACGCGGGTTCC >58 AAAATCCCACCGACGCACTCAGCTAAGTTGATGCATAACAACGTTTGAGCGCTACCTGAGTTAGTTGCAAGCTCGGTCAACGTGTAAAATGTCCATCAGAGTCACCTCAT >59 AAAATCCTAATCCAGCACACGAGCCTCAGTCAGGTTCAAGGTGCGGCTTACTCTGCCGACACCAGCAAAGATATACTCGGGCAGGGAGATTAAATTGGGTTTGTCGACCT >61 AAAATGACACTAACAACGTTGACCGAGAATGAACGTCTAAACCCTTAGTGTGAATTCGTTTCAATATGTACAGGGCCCTCTGGCATATCCCGCTGCCCGGGCTAATGTCA >63 
AAAATGCGAAGGTATTCTGTAGAGGGGAATAACTGGGCTTCCATCCCCAGAGCTAACACAGCCGACTACACACTACATAGATGGTCGGGGGTGGTCCGCCGGAAGACGCT >64 AAAATGGCTTTAGGCTAGTAGTAATCTAATGTGTAACAAAGTCTTGTGGCCCGATCGTTATATCTCTGGCACGATCGGTTGGCGGTTTTTCTAGATTACCTTACGCGATA >65 AAAATGTGGTCGGAGCCGCGTACATTATGTGTGGCTTCACCTATATCTAGGGGAGTTCCCGGCCTAGCACACCAGCGGTCCGTAGGAACCGCGCCCGCCAAACCGAGCAC >66 AAAATTACGGCGCAACTGTTGGCTTCTTCATTCCCTGTTAGGTCCAAGAGCTGACAGGTCATATCTAATTCGACAGTTGCTAGATCGTAGTGAGTTAAGTATTCGTGGAG >68 AAAATTGCGTAGTTAGAACGACGAGCATTCTAATGTACAACCTATAATAAATAACGGGCCCTTGTTGCCTAACCAACAACAGTACCGCCAGGCCACTCCGCTAAGGTCAG >1 AAAAAACATTAGTTACGGAAGGTGGGTGGAGCGGGGGCCGCCAGTCTATATTCATACTAGAAAGGGGCTAAGGGCATCGCGCTCATGAAGTGGCACTTGCAGAGGTGAGT >4 AAAAAATTGTCGTTAAGATGAGGAGCTCTTTCGCATTTGACCCATCAAATCTCGGAATGCACTTGAGATCGACCCGTTTGATACAAGCCTTCATCGTCGATAATATATCG >6 AAAAACCGATGGGGCCGAGCTGTTCTTTGGCCGGGTTACTCTACGCCCACACGGGTACACAGCCGCGAAACGGGAGGCTCGTGCGCGGTCACCTAAGTCCCTGTGGCGGG >7 AAAAACGGATGAGAAATAAAAGGGGAAATAGCGACATGTCAAATGGCCTCTTGGCTGGCGGTGTCTGGCTGGACTAACCCTCTTAAGGACTTAAAGCGTAGGCAAGGTTA >10 AAAAAGCTTACAGTGTTCGTAGTTCTGCTCGTGTCGGTATTCTCATACTCACTCCAGGACTTCGGAAACATTAGTGAAAGTGTCACCGGCCGTGCATTTTCCGGAGTAAG >12 AAAAAGTCTCACTTATTTACGCGCTTGATAGCCCAAACCGTTCACAGCATCTCCAGTAGTCGATTCCGGCTCCTCTCCATGTTGGAGATGTCAGCACGGCAGGTATATGA >14 AAAAATATTTGAATCCGCTGCTGACTTTTTCAAAATTCGATTATCACTACGGCGGTCAGCATATTTCCTTGAACTTATAACGTATCTCAGTATTGCTGGTTTGGAAACGT >15 AAAAATGCCCAGTTCGGCGATCCAACTCGTTAGTCTCAGGGTTCCCGCGGCGAACTCCCTTCGGTTGACTTTACGGCCTACACGCTCAGCCCTGTACCCGCATTGATGTC >19 AAAACACGCATGTCCGACGGGCGCATTAAACGGCATTGTTGTTCTTGAACCGGAAGCAATAGTCTAGGACCGTTCATATTGACTTACTTTACGCGTGGGCCTTGGATGAA >22 AAAACCATCTGTTTCCCAGAATGCTCCCGATACTTAATCACCTGCGCTTTGATCGTAGGCACTCACCCTCTCAAGTACCCTTGGACAGATTTAAGATCGATAGTTCGTAT >26 AAAACGAACCGGACTACATTCCTCATAGGCTTGAGGGGCAAAAGTTACAGTACAGATTTCGGGACCCGTCGCTTTACAGTGTAGACTGTTTTCCGAGAGTGCCTAGTCCA >28 AAAACGCTCGTGCCCAGTTCGAGCGCCCTGGAGTTTACCGCCCAAGTAAACGCTATTTTTTAAATGACATAGCTCCTTACAGGCGTGGGGGGACGCCTATAGGGCCGTCG >29 
AAAACGGTGCCAACCATTGAAGATAGAAGATTCAATACATTGCACAAACGAATATAACCGGTAGGTAAAGTCTCGTTTCAAAGCGGCTCTATTTGTGCACCTTTTGCTGA >31 AAAACTCACCACCAGGACAGAAGCTAAAAGACAGGCCGATTCACGGGCGAGCGGTCGAAGCATACTTTTTAGGGCATCATGTGACCACACGTGTTGACCGTCCATCGTTT >34 AAAACTTTACCGATTTGCAAGATGTAAACAGCACGGGGACGCTATATCGACTGAGTGTTGTAGTGGAATCTACCACCCCGGACATGGGGCTGGACTGATTATTGAACGCT >38 AAAAGAGTTTGCAGGGTCACTAGCCCGGCTCTGCATACATTGGCAAAGAGGCGATAAGCTGTGAATTCCAGGCTGGGGACGAGCTTGAACCTACTGTGCGAAGTTACCAC >40 AAAAGCCAGAAGGTATGCAGATCGATATCGCTCCAATGTGTCAGTAGCCCGGACGAAGGCCTGCTCGATGATGCGTCATATTAATGTCAGAGAACCTAGTAGCCATCGGT >44 AAAAGGACAACGATGTGTGTGCTTCAAATATTCCTCCAATTCGTTACTGTCGCGGATGTTATGTCGGGCACTCTTAGCTTCCAAGGTGGGTGGATGTTAACGCGCATAGA >46 AAAAGGGGTTTGCTGACCCACAGACAATAGATAGCGGATTACTTACTCCACTTACGAGACTAACGCAATGCTAATAACTTGACGTCTAAACGGGACCACATAAGCCTTTA >47 AAAAGGTATTATATTGATGATTAGCCATTTAGGGAAGATCAACAAGATGAAGCATGACGCGAGAGGTAAGGGATCGACGAACGCTCCTGCCATGGGCAATCCAGGAGGGC >49 AAAAGTCCCTGTCGTACCTCAATCCAGGCCACGTATACTCAGGGGGTACCGAAACCCTAATAACTCTCCCGTTGGTGACCAGATCTGAATCTGCACCGCACACAACTACA >51 AAAAGTGGAAAATACCCTCACTAACAGGGTATCTTGCGTTGAGCTGGTAAGTGCACCAGGGAAAACTTGTTATCCTCGAGTCTGATCGATGCTATCTCATATTTCTGAGT >54 AAAATAGAAATCGCAGCTTGAATGCTGTTACTCAATGTTTATGAACATAGCTCCCGCGTTACTCTGTTTCGACATAGGATGCGCGCACCGGAAGCCTGCTACCACAACAT >57 AAAATCAAACTAGCTACTCCGAAGGCGGGATATTTGCCTGGTAGAGGGAAATGTAGCTCACGGGCCGTTTACTCTTCTTCAGAGCAACTAAGTATTCCGGAAAACCTCAG >60 AAAATCGTGTTCAAGAAAGAGAGGGCCGGGCGCTGAATTGGGCCCGAATCACAAAAAAGTGAGTCGCGCTCCTACAAAGTCCTAATCTAATAGTGGATGATGTGTCTGGT >62 AAAATGCCATACTAAATTAACGGTGTCTCATAGCGACATTGTTATTGTCACCTGACATAGCCAGAAGGTTAAAAGTAGTGCGCGACGCGAAATACCCATGCTGCGGAGTC >67 AAAATTCAGCCTTTCTTACCCAAGGCCTCTCTCGGACAAGAACTATGAAAGCATGCCCTACAGCATACTTTCGCTGACATATGGAGCAGGGAGCCTAAAGGCCGTTTATC >71 AAACAACCACAGATGGCCACTATGTGAAGTTTTGGACGAGTCCATACATTTTTCACTAAGTAAGAAGCTACCTTAATACGTGCACGCATCACATCCTAGACGCTCTGGCT >72 AAACAAGCCGAGAATCTGGCGTGACAAATCCTCCGGAACGGGCTGACCCCACTGTACAAACAATGTAAATCACCGCAGTTTCACTGTACGTTTGGTCTTTTTGATAGACA >74 
AAACAATGGACGGAACCATGTTCTGTTACAAGCGCTGACCACACAAGCCGAGAGTACCCCAAGATGATGTCTTAGGATCGTATATACCCTCCATACCCGAGCTTTCCCCG >77 AAACACGTCACCGAGCGCTTAGTGGATCGTACTCAACATGTTGAACAGACATTATCTACATTCGATTCTTCCCATTATGTATCATCGCAGTACACGCCGCTTTCCATTTT >78 AAACACTCGCACAGACCGGTAACCGAGGGAATACAGAATTATAGCCCATATTCGCTGTCCCAAACTGCACCCATCGTTGGCAATTCCGAGACCTCTATTTCCGGTATGCG >79 AAACAGAGACATGTACGTTTTGCGCGGTGGTAGCTCTGGAGTCGGAGGCAGGGTTTTTTGGCCGGCAAAATCAGTATCCGACCTCGTTGGATGACTCCGGAAAACCTTTT >80 AAACAGATCGCTATTAGCACGCGTATGCTTTCACTAGCGAACAAAAGTGCCCCTTTGAGTCCTAGCAGCTACAGTGCCCGTAACTGATATTCTTAAGGCTATTTACAGTT >81 AAACAGGGCACTGGAGGGCAGCCCTTGAACCGCATAGATGGTGGAATTTCATACGGACTGGCGGGCATTATCGGGGTCGTATTGCCTTTGGGGGCATAGCCCACGAGTGC >82 AAACAGTAGAGTTTCATGTCCCTTGTATCGGAGGCAGCGACTCGCTTGAGCAGACCAAGTCCCGTCACTGAGGGTTATCAGTGAGGATACCTTGGTTCAGACAAAAAGAT >83 AAACATACGATCAAGTGTCGAAATTATATCACCGGCATTTGGTCTTTAGATATCTAAAGAAATGGCGCTAGGCCATCTCCCGGGTTTTTTCTGCTTCATGGCTAGATTCG >85 AAACATGATTTCGTACCCCGTGTAGGGCATGTTACCCACGTGAGGCGAGGTATGCGTGGGTCGATGTAGTACCTGTTGACCCGCATTTAGCCTCGACTCAATCTGCTGGA >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >91 AAACCATAGATCACAAGTCCACCTCGAGGCGATTATGCATGCCCTTCACTTCTCACGGCTGGATGGGCTTGCCTTAGTCACTTGCGATTGAGTCGTACGATTATAAAGCG >93 AAACCCCCCCGTTACCCTCACTCCGCTCGGCCTAGCGGTGGTCAGGTCAGGAGTTGCAATCGGAGTCACACGATCATACTTTCTCACTGCGCACAACATATCTGCTTGCA simka-1.5.1/example/D_paired_2.fasta000077500000000000000000000137141353413740300172410ustar00rootroot00000000000000>0 AAAAAAACCGAAGTCTCTTGCAGTATGCACAGGAGATCGCGGAGAACCGGAATCCCCTGGTTCGACTGTGACTCTTGGAGCGTCGCTAATCGCGTCGGTAACTTTAATTT >2 AAAAAAGCGGAATCACTGAAGACGTCATTTCCTCTTCAGAAAAAGTTTACCATGTTCTATTCTTGACCCTACGAATCAAGGTTCCGTTTAGCGTGTCGTTATAGAACAAT >3 
AAAAAATACAGTATATGCCCCGAACTAGCCATCCGGATCCCAATAATCAAATCACCGTGACTGATGGTCAGATATCTTCTCCGGGATTGTGGGGAACGCCTACTTCGTGG >4 AAAAACAAGGGCTGAGCATCTGCGACATAAGTATCCCGTACATTATGTTCGATTGTAGTTAACTAAATTGTCCTACACTAGCGACCCTAAACCACCGTTACTCGAGGGCA >7 AAAAACGGTGGTTTGCATGTTAGGCGGAAGATCCCAATCCCTTGAAGAATCACGGAAGTAAGTTACTAGTGAAGGTTACTTTGCGATCTAAGGCGCTCGTGGGTGTCACA >8 AAAAACTGGCTACATTTAGACTTTTCCCAGCCTTGCCTACATGCCTAGCACTGAATCAAATACCCGGCTACCGGGTAGCGACCCCACAATATCAAGTTGCTTCTCTGAGT >10 AAAAAGCAGTCGGAAGACCTTTAGCTGTCCTTTAGCAGTGACTCTCCGCCCGTCTCAATGCAAAGAGATTCTGCGCATTGCTCCTTGTCCTAAGACACCATTAAGCGAAT >12 AAAAAGTTAGTCCTAAATTGTGGGCATCGGGTTGGCTGAATTAGACTGTGTGATCATATTCTACTTTTCGTCGACAGGATCTGGGCCGATTAGGGACATGTAAAGTGTCC >13 AAAAATAAGAAGCCTTACGAGCGTTGCGGGTTCGTTCTATATAAGGGCAGTACGTTTAGTTTACCCATCTGGGGTTCTTGTTACAACATGGAGTCGACTGTGTCTTTTAC >14 AAAAATCCGTTTAAGTCCACACCCCATATCTTTTGCAAGTTACGTCTTCACTTGTGTACTGTAGACTGCTGGGGACTAGAGTCCATCGTTGTGACTTAAAGGACGCTTTC >15 AAAAATGAATGATCCACGCCCGGTCAGTGTAGATCACTGAGCCTCGGCTATTCAGCCCATAAAACACAAAGCCTGCGGAGCGGCGGCCTACTAGAAATGCGATCCATCCT >17 AAAACAAAAGCAATTAGCAACGGACGCGAACTCTCGATCTTACAAGGATACCAGAATAGTTCACCCTGGCTGCGCAATGCCGATATTAAGTGTAGGCTTCGGCCCATTGT >18 AAAACAAGCGATTGAGATTCTAATCCAGCTTGCGACATACTTTGTCAGCGATCTGCGCTGTCTAATTTGCATCAAGTGTCTATACACGGTCCCTCCTCAGGGCGGGCACG >19 AAAACACTCCCGTATTCATGTCTACGTACCGTTATCCACTATTAAATGTACACTAGCTAACAGTGGTCATTGAGTAATCAAGATCAAGCGCGAGATGGGGGAGGATTGCT >23 AAAACCCAAGATGACGAATATCTGGTTACTGGGTTGACGGGATATAATCATTGCAAAACCTTTTGGGAAGGGACCTTAAAGCCTAGTCTGTCTTTTAGTTGCCCTCGTGG >25 AAAACCTGACGTGTAGATTAGTTACGTCCCCCGGCGCATGCAATCGTCCCACACGGCAGATGTGTTCGGACTGGAGACAATAGTGCTAGTTGGACTACGGCCTATTGGCG >26 AAAACGAAGTAGCGGGTTACGTGAATGGTGCCCACTTACGATAGGTTCGAGGCTGAGGGCTGTACCACGCCGGACGATTCGTGTTCGCCTCCTTGAATCTGATTCCCCGT >29 AAAACGGGACCTGCTTTTTTGCTTTGCGGAGGTCAATGGTTAAGTACGAACCAGCATGCGGGATGTGGCCCCGCCAATGTGGTACCGTTTGGTACTGGATCAGTCCTTTT >38 AAAAGATACCTACGCTCCATCTAGAGGTTAATAGCGAACCTCTCGCTTCTAGAGCGAGGTGGCATTCAGCGAGATGAACCTCTAATCTAAGCCCCGAATATCTGACGCAG >39 
AAAAGATTGCGTCTCAGGCGGGCTCTAAACACGCTTCGCCGGACCAACTTTCTGACATTCTGGGACTGTCACTTGCAAGGCCTTATAGCTTTACGGCATTCTCCTTCGCC >40 AAAAGCACTACTCCGTTTGGACCATTTGCGCAAACATGATCCCCGCGCGGTACGGAACTTTTTTACAACGCAGGATCTTCTGACCCGGTAACATGCTACCTGGCGCCTGT >41 AAAAGCCGGGATGGGTAACTCTTTTACTATTTGGGGACAAAGCTGCATAGTGACCGGCACTCAAACATACATTGTAATAGTGAAATCGGAGAGCACCATGCGCCAGTCGG >42 AAAAGCGATTTGAGCGGAGGAGGCGTTGAGGAGATCAGAAGGGGGCACACCGCAACGGTCTTCAACACACACTGGCAGTCTTTAAGACGTTTGAAGTTTAGCTTAGTTAT >43 AAAAGCTGGTCCTGCCATTGATGTCCGATGAGCGACTCTTACCCTTGCAAAGAGCACAAAAGTTATATTCTGATCAATGTCATGCATTTAACATTACTGACAGGGTGGCG >44 AAAAGGAAGATCACAGGAGAAGTAGACACCTCATCGATGGCACGGACCCAGTTCACATATAGGGTGGGGAGCTATCCAGAAGTGGCCATGGTTTTGTAACTTCGCGTTAG >46 AAAAGGGGGAAACAACTGAAGGTCGGTGCCGCGATAAGCCCGTTAACAGGGACGGGCCAAAGCATTAGCTTGTTAAGAACCGATGCTATGTTTATCCGATTGGATGTTGT >50 AAAAGTGACGAATTCCTGCATCGAGAGGATGGCGTCTCTCGCATCGCCGGTCCAAGAAGACCACAGGAAACAGATCGGAGAAGGCCGCAGGTATTCAGGAAGTTCATATA >51 AAAAGTTCCATTGGGGGCACGCCTAAGTTACGGCACCCGAGTTTCGCCAGGAAGTGGAAATTTATTCTTTTGAATCCGCAGAAGTGTAAAATGCCGTCCAATAAAATTAC >53 AAAATAAGGCTTTGGTGCCAAGACCAAACTCGCTTTGATGTCGTCTTGGCCAAAAAGATACCTTCGGATGGGCCCACCCCTATGCTTCCTCATGCTTTCACTAGGGAGAC >55 AAAATAGTAGGCCGAATGGATTGGGTGACGTGTGGCACAGTAAGGGAGGACTATCAGGTGATTCTCTACCTGGAGCCACCATGAACCTCATGAGTAGAGGCGGAACAAGG >58 AAAATCCGTACCTCGAGCGGTTGGAGAACGCTCGCGCTGAATGCCCGTAAGATGTTGACAGTGAAGTGATTTTGCAATCGATGTATTCGTGTCGAATCATCATGCCCGCT >60 AAAATCTATTAGGAGATCAACTGTCCGAGTATTGTGGGGTTGGCTCTACTTACGCAACCCGCGATACAGCAATACGATCCTCGAGCTCTCCTCAACCCCGATTGCGTATG >62 AAAATGCACGAAAATAGGAGCATTCGTCCCAGTAGTGATTGAAAGTCCTTAGGCATAATTCAATACATTCGTTTGGACCCCAAGTGTTGGGCGTTCAACGCGCGAGATTG >64 AAAATGGCGGAATCTGTTGGATCCTGGCCGGTAGAGTGTGCCTACAGATTTGTCGAGGCGGGTAGTCTGCCTGCGGCCTGCACGTTAGAGTACTACCTCATAGTGTTAAG >65 AAAATGTCAACTCACGTTCTTTCGCACTTATGTTTCAGCCTCAGATTCAATTTGACATCCTACAAATATGAGAAAGGCAACCAGGAATGGGGCTGAACCCGTTCAGCCGT >66 AAAATTATGAGAGGGGCGTTCTCGCAATGGAGATTCTTCTCGTCGACTCACCAGGGGACCAGTGCACGCAGCTCCATAGGTGCACGCTCTCGGACGTGGCAACGGAGGAC >68 
AAAATTGGCCGCTGAGACATGGGACATGGATTATCGTGTCATATAGACGGCGGCATTTTGCTACTAGCGAACACTCAATGGGGCTTTCCGTGGACTACTATCAATACTGA >70 AAACAAAATTTTGTTCTATCGAGTCTACCAGAGCCGAACACGGCCAAGCCCAATATGCCAGTGGTGTGCTGCTTGAGCAATTCGCAGGTATCTCTCCAACTACATCGCCG >73 AAACAAGGTCCGTACGAAAACCTAGTGACCTCAAATCAGTTGTAGGTGTACTGGCTTGCAACGTTGCCGGTGAACGAAAGAACCGCTAGGGGGCCGTGATGCATTCTACC >79 AAACAGATAGCCACCAACTATACCCTTCTTCGATGTCCATGCGGGTCGTTAACGTCGCTAGGCGTGAACGGACGCTCGTGGATGTCCGTCTACGCAATGTTACGAGTCAA >80 AAACAGCCTCGATGGGTGTACAGTGCACCCTTGCTCGACTACGGCTTCAATTCTGATGTAAAACCTGTACGTGAGACTGCCAAGGCAGATGCAACCAGATCTCCTGGATT >81 AAACAGCTAGGGGTCGACCACTTGCCGGCAGACACTGAGGTAGATATTAAGCAGAACACCGGGTGGTAATTGCATGTCGTATTAGTTCCCGTTGCTTAGCATGCCTAAGG >84 AAACATATGTCATAAGGCGCTGAAGACACCGCACGGGGACTAACACAACAGCACCAGATTGTCGACGTAAGTGCTTTTCCTATTTCTTAGCCCATCTCTAATCAGCCCGG >85 AAACATCGAGCTTGGACGCGTGCAGGTATTAAATTTGAGCCCCAAGGCTTATAATGCATCCTCCCACAAGAAGGCATAGATGTACTCGTCTTAGTACAAGGCTGCTAGAG >87 AAACATTTGTACGCGAATAAATTATTTCGGTGTCAGAGGCGACACCCGTAAACGGGAGCAAGGCTAAGTCAAGGTGTTGAAGAGAATTTTCTGTGGTCATTTACTGTCCT >89 AAACCACGAATACCAAAATAAAGTCACCCTGTGCCTTAGTGTTTAAGATGTACTGACAATTTCCTGTGGATCGTTGTGCGGTTGCTGTGGGGGCCCTATCAGCGAACGGG >90 AAACCAGTATGCTTTTAAGGGAACCGAGGAATCGCATGATCTTCCGGTGATTATGCCATCTCTAACAGGGAGGCGCCTTGCTTTAACGCTGTACCCGTTTTGTACTCGAA >91 AAACCATCTTGATAATTCTAAGGTCAGTACGAAAGGCCTCTAGTCAACCGTCTCGTGGATCGGGACTCAGCCGTGGAATGATCATCATTAGCAGACAGACAGTCGATATC >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >91 AAACCATAGATCACAAGTCCACCTCGAGGCGATTATGCATGCCCTTCACTTCTCACGGCTGGATGGGCTTGCCTTAGTCACTTGCGATTGAGTCGTACGATTATAAAGCG >93 AAACCCCCCCGTTACCCTCACTCCGCTCGGCCTAGCGGTGGTCAGGTCAGGAGTTGCAATCGGAGTCACACGATCATACTTTCTCACTGCGCACAACATATCTGCTTGCA 
simka-1.5.1/example/dataset_metadata.csv000077500000000000000000000001221353413740300202600ustar00rootroot00000000000000DATASET_ID;VARIABLE_1;VARIABLE_2 A;1;aquatic B;1;human C;2;human D;2;soil E;3;soilsimka-1.5.1/example/potara_job/000077500000000000000000000000001353413740300164005ustar00rootroot00000000000000simka-1.5.1/example/potara_job/sge/000077500000000000000000000000001353413740300171565ustar00rootroot00000000000000simka-1.5.1/example/potara_job/sge/job_count.bash000077500000000000000000000001231353413740300217760ustar00rootroot00000000000000#!/bin/bash #$ -S /bin/bash #$ -m bea #$ -cwd source /local/env/envgcc-4.9.1.shsimka-1.5.1/example/potara_job/sge/job_merge.bash000077500000000000000000000001231353413740300217450ustar00rootroot00000000000000#!/bin/bash #$ -S /bin/bash #$ -m bea #$ -cwd source /local/env/envgcc-4.9.1.shsimka-1.5.1/example/potara_job/tgcc/000077500000000000000000000000001353413740300173205ustar00rootroot00000000000000simka-1.5.1/example/potara_job/tgcc/job_count.bash000077500000000000000000000001461353413740300221450ustar00rootroot00000000000000#!/bin/bash #MSUB -r Counting #MSUB -T 86400 #MSUB -q large #MSUB -A fg0001 #MSUB -n 1 #MSUB -Q normalsimka-1.5.1/example/potara_job/tgcc/job_merge.bash000077500000000000000000000001461353413740300221140ustar00rootroot00000000000000#!/bin/bash #MSUB -r Counting #MSUB -T 86400 #MSUB -q large #MSUB -A fg0001 #MSUB -n 1 #MSUB -Q normalsimka-1.5.1/example/simka_input.txt000077500000000000000000000001601353413740300173440ustar00rootroot00000000000000A: A.fasta B: B.fasta C: C.fasta D: D_paired_1.fasta ; D_paired_2.fasta E: A.fasta , A.fasta ; B.fasta , B.fastasimka-1.5.1/example/simple_test.sh000077500000000000000000000023421353413740300171500ustar00rootroot00000000000000#!/bin/bash #simple test with real data # look for simka binary. In devel mode, it's in ../build/bin directory. # In production mode, it's in ../bin directory. 
if [ -f "../bin/simka" ] then bindir="../bin" elif [ -f "../build/bin/simka" ] then bindir="../build/bin" else echo "could not find a compiled simka binary" exit 1 fi # run simka command="$bindir/simka -in ../example/simka_input.txt -out ./simka_results/ -out-tmp ./simka_temp_output" #printf "$command\n\n" # DO NOT add lines between '$command' exec and 'var...' ! $command var=$? printf "\n\n\n" if [ $var -eq 0 ] then echo "*** Test: PASSED" else echo "*** Test: FAILED" exit 1 fi #printf "\nremoving all created dirs\n" # clean temp files rm -rf temp_output printf "\nCommand used:\n" printf "\t$command\n" printf "\nCommand for visualizing results:\n" printf "\tpython ../scripts/visualization/run-visualization.py -in ./simka_results/ -out ./simka_results/ -pca -heatmap -tree\n" printf "\nCommand for visualizing results with metadata annotations:\n" printf "\tpython ../scripts/visualization/run-visualization.py -in ./simka_results/ -out ./simka_results/ -pca -heatmap -tree -metadata-in ../example/dataset_metadata.csv -metadata-variable VARIABLE_1\n" simka-1.5.1/scripts/000077500000000000000000000000001353413740300143145ustar00rootroot00000000000000simka-1.5.1/scripts/jenkins/000077500000000000000000000000001353413740300157555ustar00rootroot00000000000000simka-1.5.1/scripts/jenkins/README000066400000000000000000000003361353413740300166370ustar00rootroot00000000000000These scripts are intended to be used with the Jenkins CI Platform available at Inria. 
They can be called from a Jenkins Task / Build / Execute script, as follows: /bin/bash -xv gatb-${TOOL_NAME}/scripts/jenkins/xxx.sh simka-1.5.1/scripts/jenkins/tool-simka-build-debian7-64bits-gcc-4.7.sh000077500000000000000000000070551353413740300251570ustar00rootroot00000000000000#!/bin/bash #--------------------------------------------------------------# # Continuous integration script for Jenkins # #--------------------------------------------------------------# # # Default mode : # This script will exit with error (exit code 1) if any of its steps fails. # To change this behaviour, choose DO_NOT_STOP_AT_ERROR in Jenkins (see below). #--------------------------------------------------------------# set +xv echo " ----------------------------------------- Miscellaneous information ----------------------------------------- date : `date` hostname : `hostname` pwd : `pwd` ----------------------------------------- Jenkins build parameters (user defined) ----------------------------------------- BRANCH_TO_BUILD : ${BRANCH_TO_BUILD} INRIA_FORGE_LOGIN : ${INRIA_FORGE_LOGIN} DO_NOT_STOP_AT_ERROR : ${DO_NOT_STOP_AT_ERROR} ----------------------------------------- Jenkins build parameters (built in) ----------------------------------------- BUILD_NUMBER : ${BUILD_NUMBER} " error_code () { [ "$DO_NOT_STOP_AT_ERROR" = "true" ] && { return 0 ; } } [ "$DO_NOT_STOP_AT_ERROR" != "true" ] && { set -e ; } || { echo "(!) DEBUG mode, the script will NOT stop..." 
; echo; } set -xv # quick look at resources #----------------------------------------------- free -h #----------------------------------------------- lstopo #----------------------------------------------- df -kh #----------------------------------------------- ################################################################ # COMPILATION # ################################################################ gcc --version g++ --version [ `gcc -dumpversion` = 4.7 ] && { echo "GCC 4.7"; } || { echo "GCC version is not 4.7, we exit"; exit 1; } JENKINS_TASK=tool-${TOOL_NAME}-build-debian7-64bits-gcc-4.7 GIT_DIR=/scratchdir/builds/workspace/gatb-${TOOL_NAME} BUILD_DIR=/scratchdir/$JENKINS_TASK/gatb-${TOOL_NAME}/build rm -rf $BUILD_DIR mkdir -p $BUILD_DIR #----------------------------------------------- # we need gatb-core submodule to be initialized cd $GIT_DIR git submodule init git submodule update #----------------------------------------------- cd $BUILD_DIR #----------------------------------------------- cmake -Wno-dev -DJENKINS_TAG=${BRANCH_TO_BUILD} $GIT_DIR #----------------------------------------------- make -j 2 || error_code ################################################################ # TEST # ################################################################ # prepare data and scripts cp -R $GIT_DIR/example/ .. # 'tests' directory does not exist on older releases of simka if [ -d "$GIT_DIR/tests" ]; then cp -R $GIT_DIR/tests/ .. fi # run tests cd ../example ./simple_test.sh || error_code if [ -d "../tests" ]; then cd ../tests python simple_test.py || error_code fi # cleanup disk space cd .. rm -rf example if [ -d "tests" ]; then rm -rf tests fi # go bask to build for packaging step cd build ################################################################ # PACKAGING # ################################################################ # Upload bin bundle to the forge if [ $? 
-eq 0 ] && [ "$INRIA_FORGE_LOGIN" != none ] && [ "$DO_NOT_STOP_AT_ERROR" != true ]; then make package scp ${ARCHIVE_NAME}-${BRANCH_TO_BUILD}-bin-Linux.tar.gz ${INRIA_FORGE_LOGIN}@scm.gforge.inria.fr:/home/groups/gatb-tools/htdocs/ci-inria # source package is handled by the osx task fi simka-1.5.1/scripts/jenkins/tool-simka-build-macos-10.9.5-gcc-4.2.1.sh000077500000000000000000000073341353413740300245210ustar00rootroot00000000000000#!/bin/bash #--------------------------------------------------------------# # Continuous integration script for Jenkins # #--------------------------------------------------------------# # # Default mode : # This script will exit with error (exit code 1) if any of its steps fails. # To change this behaviour, choose DO_NOT_STOP_AT_ERROR in Jenkins (see below). #--------------------------------------------------------------# set +xv echo " ----------------------------------------- Miscellaneous information ----------------------------------------- date : `date` hostname : `hostname` pwd : `pwd` ----------------------------------------- Jenkins build parameters (user defined) ----------------------------------------- BRANCH_TO_BUILD : ${BRANCH_TO_BUILD} INRIA_FORGE_LOGIN : ${INRIA_FORGE_LOGIN} DO_NOT_STOP_AT_ERROR : ${DO_NOT_STOP_AT_ERROR} ----------------------------------------- Jenkins build parameters (built in) ----------------------------------------- BUILD_NUMBER : ${BUILD_NUMBER} " error_code () { [ "$DO_NOT_STOP_AT_ERROR" = "true" ] && { return 0 ; } } [ "$DO_NOT_STOP_AT_ERROR" != "true" ] && { set -e ; } || { echo "(!) DEBUG mode, the script will NOT stop..." 
; echo; } set -xv # quick look at resources #----------------------------------------------- sw_vers -productVersion #----------------------------------------------- system_profiler SPSoftwareDataType #----------------------------------------------- lstopo #----------------------------------------------- top -l 1|head -15 #----------------------------------------------- ################################################################ # COMPILATION # ################################################################ gcc --version g++ --version [ `gcc -dumpversion` = 4.2.1 ] && { echo "GCC 4.2.1"; } || { echo "GCC version is not 4.2.1, we exit"; exit 1; } JENKINS_TASK=tool-${TOOL_NAME}-build-macos-10.9.5-gcc-4.2.1 GIT_DIR=/builds/workspace/$JENKINS_TASK/gatb-${TOOL_NAME} #N.B. /scratchdir not yet mounted on the osx slave (ciosx). # as soon as /scratchdir is created, one has to update TEST procedure, below. # refer to linux build target to see how to do that BUILD_DIR=$GIT_DIR/build rm -rf $BUILD_DIR mkdir -p $BUILD_DIR #----------------------------------------------- # we need gatb-core submodule to be initialized cd $GIT_DIR git submodule init git submodule update #----------------------------------------------- cd $BUILD_DIR #----------------------------------------------- cmake -Wno-dev -DJENKINS_TAG=${BRANCH_TO_BUILD} $GIT_DIR #----------------------------------------------- make -j 2 || error_code ################################################################ # TEST # ################################################################ cd ../example ./simple_test.sh || error_code # 'tests' directory does not exist on older releases of simka if [ -d "../tests" ]; then cd ../tests python simple_test.py || error_code fi cd ../build ################################################################ # PACKAGING # ################################################################ # Prepare and upload bin and source bundle to the forge if [ $? 
-eq 0 ] && [ "$INRIA_FORGE_LOGIN" != none ] && [ "$DO_NOT_STOP_AT_ERROR" != true ]; then make package make package_source scp ${ARCHIVE_NAME}-${BRANCH_TO_BUILD}-bin-Darwin.tar.gz ${INRIA_FORGE_LOGIN}@scm.gforge.inria.fr:/home/groups/gatb-tools/htdocs/ci-inria scp ${ARCHIVE_NAME}-${BRANCH_TO_BUILD}-Source.tar.gz ${INRIA_FORGE_LOGIN}@scm.gforge.inria.fr:/home/groups/gatb-tools/htdocs/ci-inria fi simka-1.5.1/scripts/jenkins/tool-simka-release-debian.sh000077500000000000000000000106111353413740300232300ustar00rootroot00000000000000#!/bin/bash #--------------------------------------------------------------# # Continuous integration script for Jenkins # #--------------------------------------------------------------# # # Default mode : # This script will exit with error (exit code 1) if any of its steps fails. # To change this behaviour, choose DO_NOT_STOP_AT_ERROR in Jenkins (see below). #--------------------------------------------------------------# set +xv echo " ----------------------------------------- Miscellaneous information ----------------------------------------- date : `date` hostname : `hostname` pwd : `pwd` ----------------------------------------- Jenkins build parameters (user defined) ----------------------------------------- BRANCH_TO_BUILD : ${BRANCH_TO_BUILD} RELEASE_TO_BUILD : ${RELEASE_TO_BUILD} INRIA_FORGE_LOGIN : ${INRIA_FORGE_LOGIN} TEST_VARIABLE : ${TEST_VARIABLE} DO_NOT_STOP_AT_ERROR : ${DO_NOT_STOP_AT_ERROR} ----------------------------------------- Jenkins build parameters (built in) ----------------------------------------- BUILD_NUMBER : ${BUILD_NUMBER} " set -xv # quick look at resources #----------------------------------------------- free -h #----------------------------------------------- lstopo #----------------------------------------------- df -kh #----------------------------------------------- ################################################################ # PREPARE RELEASE # 
################################################################ # paths to access tool source code and build JENKINS_TASK=tool-${TOOL_NAME}-build-debian7-64bits-gcc-4.7 BUILD_DIR=/scratchdir/$JENKINS_TASK/gatb-${TOOL_NAME}-release TOOL_GIT_HOME="/scratchdir/builds/workspace/gatb-${TOOL_NAME}" # path to 'github_release_manager.sh' script GRM_PATH="${BUILD_DIR}/github-release-api" GRM_CMD="${GRM_PATH}/github_release_manager.sh" # github credentials and repository GITHUB_REPO=${TOOL_NAME} GITHUB_OWNER=GATB GRM_CREDENTIALS="-l $GITHUB_ADMIN -t $GITHUB_TOKEN -o ${GITHUB_OWNER} -r ${GITHUB_REPO}" # Prepare build dir rm -rf $BUILD_DIR mkdir -p $BUILD_DIR #----------------------------------------------- # check tag version; 'master' is not allowed if [ ! "${BRANCH_TO_BUILD}" == "master" ] ; then cd ${TOOL_GIT_HOME} DOES_TAG_EXIST=`git tag -l | grep "^${BRANCH_TO_BUILD}$"` if [ -z ${DOES_TAG_EXIST} ] ; then echo "/!\ Error: tag '${BRANCH_TO_BUILD}' does not exist on 'gatb-tool-${TOOL_NAME}' repository" exit 1 fi else echo "/!\ Error: cannot make an official release on 'master' branch" exit 1 fi #----------------------------------------------- if [ "$INRIA_FORGE_LOGIN" == none ]; then echo "/!\ Error: No login name to connect to Inria Forge" exit 1 fi cd $BUILD_DIR git clone https://github.com/pgdurand/github-release-api.git ################################################################ # RETRIEVE ARCHIVES FROM INRIA FORGE # ################################################################ #retrieve last build from ci-inria (see tool-lean-build-XXX tasks) scp ${INRIA_FORGE_LOGIN}@scm.gforge.inria.fr:/home/groups/gatb-tools/htdocs/ci-inria/${TOOL_NAME}-${BRANCH_TO_BUILD}-bin-Linux.tar.gz . [ $? != 0 ] && exit 1 scp ${INRIA_FORGE_LOGIN}@scm.gforge.inria.fr:/home/groups/gatb-tools/htdocs/ci-inria/${TOOL_NAME}-${BRANCH_TO_BUILD}-bin-Darwin.tar.gz . [ $? 
!= 0 ] && exit 1 scp ${INRIA_FORGE_LOGIN}@scm.gforge.inria.fr:/home/groups/gatb-tools/htdocs/ci-inria/${TOOL_NAME}-${BRANCH_TO_BUILD}-Source.tar.gz . [ $? != 0 ] && exit 1 ################################################################ # INTERACT WITH GITHUB # ################################################################ # create Github release ${GRM_CMD} ${GRM_CREDENTIALS} -d ${BRANCH_TO_BUILD} -c create if [ $? != 0 ] ; then echo "/!\ Error: unable to create release, check above error" exit 1 fi #upload files function uploadFile(){ local FILE_TO_LOAD=$1 echo "Uploading: ${FILE_TO_LOAD}" ${GRM_CMD} ${GRM_CREDENTIALS} -d ${BRANCH_TO_BUILD} -c upload ${FILE_TO_LOAD} if [ $? != 0 ] ; then echo "/!\ Error: unable to upload file, check above error" exit 1 fi } uploadFile ${TOOL_NAME}-${BRANCH_TO_BUILD}-bin-Linux.tar.gz uploadFile ${TOOL_NAME}-${BRANCH_TO_BUILD}-bin-Darwin.tar.gz uploadFile ${TOOL_NAME}-${BRANCH_TO_BUILD}-Source.tar.gz simka-1.5.1/scripts/visualization/000077500000000000000000000000001353413740300172155ustar00rootroot00000000000000simka-1.5.1/scripts/visualization/dendro.r000077500000000000000000000047641353413740300206710ustar00rootroot00000000000000#Author: Gaetan Benoit #Contact: gaetan.benoit@inria.fr args <- commandArgs(trailingOnly = TRUE) distanceMatrixFilename = args[1] distance_name = basename(distanceMatrixFilename) distance_name = unlist(strsplit(distance_name, "[.]"))[1] distance_name = gsub("mat_", "", distance_name) distanceMatrix = as.matrix(read.table(file=distanceMatrixFilename, sep=";", header=TRUE, row.names=1)) distanceMatrix[lower.tri(distanceMatrix)] <- t(distanceMatrix)[lower.tri(distanceMatrix)] #symmetrize matrix width = as.numeric(args[3]) height = as.numeric(args[4]) format = args[5] if(format == "png"){ png(file=paste0(args[2], ".png"), width=width, height=height, units="in",res=72) } else{ pdf(file=paste0(args[2], ".pdf"), width=width, height=height) } use_metadata = F if(length(args) == 7){ 
suppressPackageStartupMessages(library(dendextend)) use_metadata = T metadata_table = as.matrix(read.table(file=args[6], sep=";", header=TRUE, row.names=1)) metadata_variable = args[7] #print(metadata_table) variables = metadata_table[,metadata_variable] #print(variables) meatadata_index = list() dataset_ids = rownames(metadata_table) for(i in 1:length(dataset_ids)){ dataset_id = dataset_ids[i] #print(dataset_id) #print(variables[[i]]) meatadata_index[[dataset_id]] = variables[[i]] print(paste0(dataset_id, " ", variables[[i]])) #print(meatadata_index[[dataset_id]]) } colors = c() dataset_ids = rownames(distanceMatrix) for(i in 1:dim(distanceMatrix)[1]){ dataset_id = dataset_ids[i] colors = c(colors, meatadata_index[[dataset_id]]) } colors_numeric_temp = c() colors_numeric = as.numeric(as.factor(colors)) for(i in 1:length(colors_numeric)){ colors_numeric_temp = c(colors_numeric_temp, colors_numeric[i]+1) } colors_numeric = colors_numeric_temp #print(colors) } distanceMatrix = distanceMatrix*100 #inv_cr3 = matrix(100, ncol=dim(cr3)[1], nrow=dim(cr3)[1]) - cr3 Commet_distance = as.dist(distanceMatrix) hc = hclust(Commet_distance, method="average") dendo_cr3 = as.dendrogram(hc) if(use_metadata){ colors_numeric_hc = colors_numeric[hc$order] dendo_cr3 %>% set("labels_col", colors_numeric_hc) %>% set("branches_k_color", colors_numeric_hc) %>% # change color plot(main=paste0("Simka hierarchical clustering\n", distance_name), cex = 0.3, xlab="", sub="") legend("topright", title=metadata_variable, legend=unique(colors), col=unique(colors_numeric), pch=16) } else{ plot(dendo_cr3, main=paste0("Simka hierarchical clustering\n", distance_name), cex = 0.3, xlab="", sub="") } simka-1.5.1/scripts/visualization/heatmap.r000077500000000000000000000145141353413740300210270ustar00rootroot00000000000000# Contributors : # Pierre PETERLONGO, pierre.peterlongo@inria.fr [12/06/13] # Nicolas MAILLET, nicolas.maillet@inria.fr [12/06/13] # Guillaume Collet, guillaume@gcollet.fr [27/05/14] # 
Gaetan BENOIT, gaetan.benoit@inria.fr [08/10/15] # Claire LEMAITRE, claire.lemaitre@inria.fr [06/07/16] # # This software is a computer program whose purpose is to find all the # similar reads between sets of NGS reads. It also provide a similarity # score between the two samples. # # Copyright (C) 2014 INRIA # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . ## Usage : Rscript heatmap.r matrix_asym.csv matrix_sym.csv output_file.pdf title if (!require("gplots")) { install.packages("gplots", dependencies = TRUE) library(gplots) } #options(echo=TRUE) # if you want see commands in output file args <- commandArgs(trailingOnly = TRUE) #png(file=args[3],width=800,height=800,res=65) width = as.numeric(args[4]) height = as.numeric(args[5]) format = args[6] if(format == "png"){ png(file=paste0(args[3], ".png"), width=width, height=height, units="in",res=72) } else{ pdf(file=paste0(args[3], ".pdf"), width=width, height=height) } cr3 = as.matrix(read.table(file=args[1], sep=";", header=TRUE, row.names=1)) # can be symetric matrix cr3_norm = as.matrix(read.table(file=args[2], sep=";", header=TRUE, row.names=1)) # must be a symetric matrix cr3[lower.tri(cr3)] <- t(cr3)[lower.tri(cr3)] #symmetrize matrix cr3_norm[lower.tri(cr3_norm)] <- t(cr3_norm)[lower.tri(cr3_norm)] #symmetrize matrix distance_name = basename(args[1]) distance_name = unlist(strsplit(distance_name, "[.]"))[1] distance_name = gsub("mat_", "", distance_name) 
use_metadata = F if(length(args) == 8){ use_metadata = T metadata_table = as.matrix(read.table(file=args[7], sep=";", header=TRUE, row.names=1)) metadata_variable = args[8] #print(metadata_table) variables = metadata_table[,metadata_variable] #print(variables) meatadata_index = list() dataset_ids = rownames(metadata_table) for(i in 1:length(dataset_ids)){ dataset_id = dataset_ids[i] #print(dataset_id) #print(variables[[i]]) meatadata_index[[dataset_id]] = variables[[i]] #print(meatadata_index[[dataset_id]]) } colors = c() dataset_ids = rownames(cr3_norm) for(i in 1:dim(cr3_norm)[1]){ dataset_id = dataset_ids[i] colors = c(colors, meatadata_index[[dataset_id]]) } colors_numeric_temp = c() colors_numeric = as.numeric(as.factor(colors)) for(i in 1:length(colors_numeric)){ colors_numeric_temp = c(colors_numeric_temp, colors_numeric[i]+1) } colors_numeric = colors_numeric_temp #print(colors) } n=100 # number of steps between 2 colors ## Transforming 0-1 distances in 0-100 similarity measure if(grepl("chord",args[1]) || grepl("hellinger",args[1])){ cr3 = (sqrt(2) - cr3) * 100 } else { cr3 = (1 - cr3) * 100 } ## Computing mini-maxi for colour palette mini=min(cr3[]) maxi=max(cr3[row(cr3)!=col(cr3)]) # ignoring the diagonal trueMax=max(cr3[]) # typically the value in the diagonal = 100 q25=quantile(cr3[row(cr3)!=col(cr3)],0.25,1) q50=quantile(cr3[row(cr3)!=col(cr3)],0.5,1) q75=quantile(cr3[row(cr3)!=col(cr3)],0.75,1) ## We use the quantiles to ignore some outlier values in the matrix (valuesmaxi will have a colour between brown and grey23) mini=max(q25-1.5*(q75-q25),0) maxi=min(q75+1.5*(q75-q25),trueMax) palette=colorRampPalette(c("green", "yellow", "red", "brown", "grey23"))(n = 5*n-1) ## Checking if maxi = trueMax trueMax.needed=ifelse(maxi /dev/null 2>&1 ") def outputHclust(outputFilename, matrixNormFilename): if not args.want_tree: return command = "Rscript " + hclust_script_filename + " " + join(args.input_dir, matrixNormFilename) + " " + join(args.output_dir, 
outputFilename) command = add_metadata_args(command) print("\t"+command) #print command os.system(command)# + " > /dev/null 2>&1 ") def outputPca(outputFilename, matrixNormFilename): if not args.want_pca: return command = "Rscript " + pca_script_filename + " " + join(args.input_dir, matrixNormFilename) + " " + join(args.output_dir, outputFilename) + " " + args.pca_axis_1 + " " + args.pca_axis_2 command = add_metadata_args(command) print("\t"+command) #print(args.metadata_filename) #print(args.metadata_variable) #print command os.system(command)# + " > /dev/null 2>&1 ") def execute(): files = [ f for f in listdir(args.input_dir) if isfile(join(args.input_dir,f))] for filename in files: asym = False if not ".csv.gz" in filename: continue if "asym" in filename: asym = True asym_filename = filename filename = filename.replace("_asym", "") method_name = filename.split(".")[0] method_name = method_name.replace("mat_", "") try: if asym: matrix[method_name].append(asym_filename) else: matrix[method_name].append(filename) except: matrix[method_name] = [] if asym: matrix[method_name].append(asym_filename) else: matrix[method_name].append(filename) #for method_name in matrix.keys(): #print(filename, method_name) #if method_name in filename: #matrix[method_name].append(filename) #break for method_name, matrix_filenames in matrix.items(): print("") print(method_name) #one version of the similairty function (sym) if len(matrix_filenames) == 1: #print("lala") outputHeatmap("heatmap_" + method_name, matrix_filenames[0], matrix_filenames[0]) outputHclust("hclust_" + method_name, matrix_filenames[0]) outputPca("pca_" + method_name, matrix_filenames[0]) #two version of the similarity function (sym and asym) else: sym = "" asym = "" for filename in matrix_filenames: if "asym" in filename: asym = filename else: sym = filename outputHeatmap("heatmap_" + method_name, asym, sym) outputHclust("hclust_" + method_name, sym) outputPca("pca_" + method_name, sym) #args = sys.argv #mat_input_dir 
= args[1] #try: # rscript_dir = args[2] #except: # rscript_dir = os.path.dirname(os.path.realpath(__file__)) rscript_dir = os.path.dirname(os.path.realpath(__file__)) heatmap_script_filename = join(rscript_dir, "heatmap.r") hclust_script_filename = join(rscript_dir, "dendro.r") pca_script_filename = join(rscript_dir, "pca.r") if not args.want_heatmap and not args.want_pca and not args.want_tree: print("Please, choose at least one option among: -heatmap -tree -pca") exit(1) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) execute() simka-1.5.1/simkaMin/000077500000000000000000000000001353413740300143755ustar00rootroot00000000000000simka-1.5.1/simkaMin/README.md000066400000000000000000000123331353413740300156560ustar00rootroot00000000000000# SimkaMin [![License](http://img.shields.io/:license-affero-blue.svg)](http://www.gnu.org/licenses/agpl-3.0.en.html) ## What is SimkaMin? As in the case of Simka, SimkaMin is a *de novo* comparative metagenomics tool. The difference with Simka stands in the fact that SimkaMin outputs approximate (but very similar) results by subsampling the kmer space. With this strategy, and with default parameters, SimkaMin is an order of magnitude faster, uses 10 times less memory and 70 times less disk than Simka. Developper: [Gaëtan Benoit](http://people.rennes.inria.fr/Gaetan.Benoit/), PhD, former member of the [Genscale](http://team.inria.fr/genscale/) team at Inria. Contact: claire dot lemaitre at inria dot fr ## References Benoit G, Mariadassou M, Robin S, Schbath S, Peterlongo P and Lemaitre C. (2019) [SimkaMin: fast and resource frugal *de novo* comparative metagenomics](https://doi.org/10.1093/bioinformatics/btz685). Bioinformatics Benoit G, Peterlongo P, Mariadassou M, Drezen E, Schbath S, Lavenier D, Lemaitre C. (2016) [Multiple comparative metagenomics using multiset k-mer counting](https://doi.org/10.7717/peerj-cs.94). 
PeerJ Computer Science 2:e94 Benoit G (2017) [Large scale de novo comparative metagenomics (PhD thesis in French)](https://tel.archives-ouvertes.fr/tel-01659395v2/). ## Install simkaMin SimkaMin comes with Simka installation. Refer to [Simka install instructions](../README.md). ## User manual ### Description SimkaMin computes Bray-Curtis (abundance based) and Jaccard (presence/absence based) distances between N (metagenomic) read sets based on subsamples of k-mer counts. Basically it takes as input the N metagenomic read sets and it outputs two matrices respectively providing the pairwise Bray-Curtis and the Jaccard distances between each pair of datasets. ### A simple command example Run the toy example: ```bash ./simkaMin/simkaMin.py -in example/simka_input.txt -out results ``` ### Input The input file (`-in`) lists the datasets. These datasets can be in fasta, fastq and in gzip compressed format (.gz). One dataset per line with the following syntax (you can put any number of spaces and/or tabs between syntax): ID1: filename.fasta ID2: filename.fasta ID3: filename.fasta The dataset ID is the name that will appear in the headers of the distance matrices. You can find a simka input file in the example directory: ./example/data/simka_input.txt If a given dataset has been split in several parts, Simka can automatically concatenate them. ID1: filename_part1.fasta , filename_part2.fasta , ... If you have paired files, you can list them separated by a ‘;’: ID1: filename_pair1.fasta ; filename_pair2.fasta You can combine concatenated and paired operations: ID1: filename_part1_pair1.fasta , filename_part2_pair1.fasta ; filename_part1_pair2.fasta , filename_part2_pair2.fasta Paired syntax is only useful if the `-max-reads` option of SimkaMin is set.
Example: If `-max-reads` is set to 100, then Simka will consider the first 100 reads of the first paired files and the first 100 reads of the second paired files… ### Output SimkaMin results are an abundance-based Bray-Curtis distance matrix `mat_abundance_braycurtis.csv.gz` and a presence-absence-based Jaccard distance matrix `mat_presenceAbsence_jaccard.csv.gz`. A distance matrix is a square matrix of size N (where N is the number of input datasets). Each value in the matrix gives the distance between a pair of datasets. These values are in the range [0, 1]. A distance value of 0 means that the pair of datasets is perfectly similar. The greater the distance value is, the more dissimilar is the pair of datasets. SimkaMin results will be stored in the directory indicated by the `-out` option. #### Visualize SimkaMin results SimkaMin results can be visualised through heatmaps, hierarchical clustering and PCA. This module is common with the Simka visualisation script `../scripts/visualization/run-visualization.py`. Please refer to the documentation provided in the [Simka Readme file](../README.md).
## Usage To see simka in-line help: ```bash ./simkaMin/simkaMin.py ``` ## Simka command examples Run the toy example: ```bash ./simkaMin/simkaMin.py -in example/simka_input.txt -out results ``` Change the kmer size ```bash ./simkaMin/simkaMin.py … -kmer-size 31 ``` Change the sub-sampling effort (default 1 million kmers are used per read set) ```bash ./simkaMin/simkaMin.py … -nb-kmers 10000 ``` Filter kmers seen one time (potentially erroneous): ```bash ./simkaMin/simkaMin.py … -filter ``` Consider all the reads of each samples (set 0 to use all reads) ```bash ./simkaMin/simkaMin.py … -max-reads 0 ``` Use only the first 1000 reads of each sample: ```bash ./simkaMin/simkaMin.py … -max-reads 1000 ``` Allow more memory and cores to improve the execution time: ```bash ./simkaMin/simkaMin.py … -max-memory 20000 -nb-cores 8 ``` Filter low complexity reads ```bash ./simkaMin/simkaMin.py … -min-shannon-index 1 ``` Filter small reads ```bash ./simkaMin/simkaMin.py … -min-read-size 80 ``` Update existing results with additional datasets ```bash ./simkaMin/simkaMin_update.py -in another_simka_input.txt -in-to-update results/simkamin # updated matrices will be in dir results/simkamin/ ``` simka-1.5.1/simkaMin/simkaMin.py000077500000000000000000000213011353413740300165170ustar00rootroot00000000000000#!/usr/bin/env python #***************************************************************************** # SimkaMin: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets # A tool from the GATB (Genome Assembly Tool Box) # Copyright (C) 2019 INRIA # Authors: G.Benoit, C.Lemaitre, P.Peterlongo # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. 
# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . #***************************************************************************** import os, math, subprocess from os import listdir from os.path import isfile, join, splitext import sys, argparse from simkaMin_utils import SimkaParser, ArgumentFormatterSimka, read_sketch_header, ProgressBar, is_executable #------------------------------------------------------------------------------------------------------------- # Arg parser #------------------------------------------------------------------------------------------------------------- parser = SimkaParser(formatter_class=ArgumentFormatterSimka) parserMain = parser.add_argument_group("[main options]") parserCore = parser.add_argument_group("[core options]") parserDistance = parser.add_argument_group("[distance options]") parserKmer = parser.add_argument_group("[k-mer options]") parserRead = parser.add_argument_group("[read options]") parserDev = parser.add_argument_group("[advanced (developer) options]") parserMain.add_argument('-in', action="store", dest="input_filename", help="input file of datasets. 
One sample per line: id1: filename1...", required=True) parserMain.add_argument('-out', action="store", dest="out", default="./simka_results", help="output directory for result files (distance matrices)") parserMain.add_argument('-seed', action="store", dest="seed", default="100", help="seed used for random k-mer selection") parserMain.add_argument('-bin', action="store", dest="bin", help="path to simkaMinCore program (to be specified if not in PATH, or not in standard installation directory /build/bin/simkaMinCore)") parserKmer.add_argument('-kmer-size', action="store", dest="kmer_size", help="size of a kmer", default="21") parserKmer.add_argument('-nb-kmers', action="store", dest="nb_kmers", help="number of kmers used to compute distances", default="1000000") parserKmer.add_argument('-filter', action="store_true", dest="filter", help="filter out k-mer seen one time (potentially erroneous)") parserRead.add_argument('-max-reads', action="store", dest="max_reads", default="0", help="maximum number of reads per sample to process") parserRead.add_argument('-min-read-size', action="store", dest="min_read_size", default="0", help="minimal size a read should have to be kept") parserRead.add_argument('-min-shannon-index', action="store", dest="min_shannon_index", default="0", help="minimal Shannon index a read should have to be kept. 
Float in [0,2]") parserCore.add_argument('-nb-cores', action="store", dest="nb_cores", help="number of cores", default="0") parserCore.add_argument('-max-memory', action="store", dest="max_memory", help="max memory (MB)", default="8000") args = parser.parse_args() # Check SimkaMinCore executable # ----------------------------- simkaMinCoreBin=args.bin if args.bin is not None: # given by the user if not is_executable(simkaMinCoreBin): print("Error: "+simkaMinCoreBin+" not found or not executable, should be /build/bin/simkaMinCore") exit(1) else: # Check if is in the PATH simkaMinCoreBin="simkaMinCore" if not is_executable(simkaMinCoreBin): # not in PATH, checking "../build/bin/simkaMinCore" simkaMinCoreBin=os.path.join(os.path.split(os.path.realpath(__file__))[0],"../build/bin/simkaMinCore") if not is_executable(simkaMinCoreBin): print("Error: simkaMinCore executable not found, please give the executable path with option -bin (should be /build/bin/simkaMinCore)") exit(1) #------------------------------------------------------------------------------------------------------------- # SimkaMin pipeline #------------------------------------------------------------------------------------------------------------- #Create some dirs and filenames if not os.path.exists(args.out): os.makedirs(args.out) outDir = os.path.join(args.out, "simkamin") if not os.path.exists(outDir): os.makedirs(outDir) sketchDir = os.path.join(outDir, "sketch") if not os.path.exists(sketchDir): os.makedirs(sketchDir) sketchFilename = os.path.join(sketchDir, "sketch.bin") distanceOutputDir = os.path.join(outDir, "distance") if not os.path.exists(distanceOutputDir): os.makedirs(distanceOutputDir) logsDir = os.path.join(outDir, "logs") if not os.path.exists(logsDir): os.makedirs(logsDir) #Create commands sketchCommand = simkaMinCoreBin + " sketch " sketchCommand += " -in " + args.input_filename sketchCommand += " -out " + sketchFilename sketchCommand += " -seed " + args.seed sketchCommand += " 
-kmer-size " + args.kmer_size sketchCommand += " -nb-kmers " + args.nb_kmers if args.filter: sketchCommand += " -filter " sketchCommand += " -max-reads " + args.max_reads sketchCommand += " -min-read-size " + args.min_read_size sketchCommand += " -min-shannon-index " + args.min_shannon_index sketchCommand += " -nb-cores " + args.nb_cores sketchCommand += " -max-memory " + args.max_memory exportCommand = simkaMinCoreBin + " export " exportCommand += " -in " + distanceOutputDir exportCommand += " -in1 " + sketchFilename exportCommand += " -in2 " + sketchFilename #exportCommand += " -in-ids " + distanceOutputDir #not applicable here exportCommand += " -out " + args.out exportCommand += " -nb-cores " + args.nb_cores print("\n\n#-----------------------------") print("# Sketching") print(sketchCommand) print("#-----------------------------\n") print("\n\n") ret = os.system(sketchCommand) if ret != 0: print("ERROR"); exit(1) print("\n\n#-----------------------------") print("# Computing distances") print("#-----------------------------\n") print("\n\n") #Create binary matrix file (required in case the following distance commands are run in parallel if os.path.exists(distanceOutputDir + "/mat_presenceAbsence_jaccard.bin"): os.remove(distanceOutputDir + "/mat_presenceAbsence_jaccard.bin") if os.path.exists(distanceOutputDir + "/mat_abundance_braycurtis.bin"): os.remove(distanceOutputDir + "/mat_abundance_braycurtis.bin") open(distanceOutputDir + "/mat_presenceAbsence_jaccard.bin", "wb").close() open(distanceOutputDir + "/mat_abundance_braycurtis.bin", "wb").close() sketch_header = read_sketch_header(sketchFilename) nbDatasetToProcess = sketch_header["nbDatasets"] MAX_DATASETS_PROCESS = 100 def create_distance_command(i, j, n1, n2): distanceCommand = simkaMinCoreBin + " distance " distanceCommand += " -in1 " + sketchFilename distanceCommand += " -in2 " + sketchFilename distanceCommand += " -out " + distanceOutputDir distanceCommand += " -nb-cores " + args.nb_cores 
distanceCommand += " -start-i " + str(i*MAX_DATASETS_PROCESS) distanceCommand += " -start-j " + str(j*MAX_DATASETS_PROCESS) distanceCommand += " -n-i " + str(n1) distanceCommand += " -n-j " + str(n2) distanceCommand += " > " + os.path.join(logsDir, "log_distance_" + str(i) + "-" + str(j)) + " 2>&1 " return distanceCommand step = int(math.ceil( float(nbDatasetToProcess) / float(MAX_DATASETS_PROCESS))) nbCommands = int(math.ceil( float(step * step) / float(2))) progressBar = ProgressBar("Computing distances", nbCommands) progressBar.start() done = False for i in range(0, step): n1 = min(MAX_DATASETS_PROCESS, nbDatasetToProcess-i*MAX_DATASETS_PROCESS) for j in range(i, step): n2 = min(MAX_DATASETS_PROCESS, nbDatasetToProcess-j*MAX_DATASETS_PROCESS) distanceCommand = create_distance_command(i, j, n1, n2) #print distanceCommand ret = os.system(distanceCommand) if ret != 0: print("ERROR"); exit(1) progressBar.step(1) #print("\n\n#-----------------------------") #print("# Exporting distances") #print("#-----------------------------\n") print("\n\nExporting distance matrices in csv.gz format...") ret = os.system(exportCommand) if ret != 0: print("ERROR"); exit(1) print("\n\n") print("Result dir: " + args.out) simka-1.5.1/simkaMin/simkaMin_update.py000077500000000000000000000233501353413740300200670ustar00rootroot00000000000000#!/usr/bin/env python #***************************************************************************** # SimkaMin: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets # A tool from the GATB (Genome Assembly Tool Box) # Copyright (C) 2019 INRIA # Authors: G.Benoit, C.Lemaitre, P.Peterlongo # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. 
# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . #***************************************************************************** import os, struct, shutil from os import listdir from os.path import isfile, join, splitext import sys, argparse from simkaMin_utils import SimkaParser, ArgumentFormatterSimka, read_sketch_header, is_executable #------------------------------------------------------------------------------------------------------------- # Arg parser #------------------------------------------------------------------------------------------------------------- parser = SimkaParser(formatter_class=ArgumentFormatterSimka) parserMain = parser.add_argument_group("[main options]") parserCore = parser.add_argument_group("[core options]") parserDistance = parser.add_argument_group("[distance options]") parserKmer = parser.add_argument_group("[k-mer options]") parserRead = parser.add_argument_group("[read options]") parserDev = parser.add_argument_group("[advanced (developer) options]") parserMain.add_argument('-in', action="store", dest="input_filename", help="input file of datasets (datasets to add to existing simka results", required=True) parserMain.add_argument('-in-to-update', action="store", dest="input_existingResults", help="path to existing simka results to update (existing results will be overwritten)", required=True) parserMain.add_argument('-bin', action="store", dest="bin", help="path to simkaMinCore program (to be specified if not in PATH, or not in standard installation directory /build/bin/simkaMinCore)") parserKmer.add_argument('-filter', action="store_true", dest="filter", help="filter out k-mer seen one time (potentially erroneous)") 
parserRead.add_argument('-max-reads', action="store", dest="max_reads", default="0", help="maximum number of reads per sample to process") parserRead.add_argument('-min-read-size', action="store", dest="min_read_size", default="0", help="minimal size a read should have to be kept") parserRead.add_argument('-min-shannon-index', action="store", dest="min_shannon_index", default="0", help="minimal Shannon index a read should have to be kept. Float in [0,2]") parserCore.add_argument('-nb-cores', action="store", dest="nb_cores", help="number of cores", default="0") parserCore.add_argument('-max-memory', action="store", dest="max_memory", help="max memory (MB)", default="8000") args = parser.parse_args() # Check SimkaMinCore executable # ----------------------------- simkaMinCoreBin=args.bin if args.bin is not None: # given by the user if not is_executable(simkaMinCoreBin): print("Error: "+simkaMinCoreBin+" not found or not executable, should be /build/bin/simkaMinCore") exit(1) else: # Check if is in the PATH simkaMinCoreBin="simkaMinCore" if not is_executable(simkaMinCoreBin): # not in PATH, checking "../build/bin/simkaMinCore" simkaMinCoreBin=os.path.join(os.path.split(os.path.realpath(__file__))[0],"../build/bin/simkaMinCore") if not is_executable(simkaMinCoreBin): print("Error: simkaMinCore executable not found, please give the executable path with option -bin (should be /build/bin/simkaMinCore)") exit(1) #------------------------------------------------------------------------------------------------------------- # SimkaMin pipeline #------------------------------------------------------------------------------------------------------------- #Create some dirs and filenames #if not os.path.exists(args.out): os.makedirs(args.out) existingDir = args.input_existingResults sketchDir = os.path.join(existingDir, "sketch") #if not os.path.exists(sketchDir): os.makedirs(sketchDir) sketchFilename_existing = os.path.join(sketchDir, "sketch.bin") sketchFilename_new = 
os.path.join(sketchDir, "sketch_new.bin") distanceOutputDir = os.path.join(existingDir, "distance") distanceDir_existingVsNew = os.path.join(distanceOutputDir, "existingVsNew") if not os.path.exists(distanceDir_existingVsNew): os.makedirs(distanceDir_existingVsNew) distanceDir_newVsNew = os.path.join(distanceOutputDir, "newVsNew") if not os.path.exists(distanceDir_newVsNew): os.makedirs(distanceDir_newVsNew) #simkaMin_pipeline_filename = "./simkaMin_pipeline.py" #Existing datasets: datasets that have already been processed by SimkaMin # - ids and k-mers are contained in (-in-existing)/sketch/sketch.bin # - distances are contained in (-in-existing)/distance/mat_*.bin #New datasets: datasets to add contains in -in file existing_sketch_header = read_sketch_header(sketchFilename_existing) print(existing_sketch_header) #Sketch new datasets command_sketchNewDatasets = simkaMinCoreBin + " sketch " command_sketchNewDatasets += " -in " + args.input_filename command_sketchNewDatasets += " -out " + sketchFilename_new command_sketchNewDatasets += " -seed " + str(existing_sketch_header["seed"]) command_sketchNewDatasets += " -kmer-size " + str(existing_sketch_header["kmerSize"]) command_sketchNewDatasets += " -nb-kmers " + str(existing_sketch_header["sketchSize"]) if args.filter: command_sketchNewDatasets += " -filter " command_sketchNewDatasets += " -max-reads " + args.max_reads command_sketchNewDatasets += " -min-read-size " + args.min_read_size command_sketchNewDatasets += " -min-shannon-index " + args.min_shannon_index command_sketchNewDatasets += " -nb-cores " + args.nb_cores command_sketchNewDatasets += " -max-memory " + args.max_memory #Compute distance between existing datasets and new datasets command_distance_existingVsNew = simkaMinCoreBin + " distance " command_distance_existingVsNew += " -in1 " + sketchFilename_existing command_distance_existingVsNew += " -in2 " + sketchFilename_new command_distance_existingVsNew += " -out " + distanceDir_existingVsNew 
command_distance_existingVsNew += " -nb-cores " + args.nb_cores #Compute distance between new datasets and new datasets command_distance_newVsNew = simkaMinCoreBin + " distance " command_distance_newVsNew += " -in1 " + sketchFilename_new command_distance_newVsNew += " -in2 " + sketchFilename_new command_distance_newVsNew += " -out " + distanceDir_newVsNew command_distance_newVsNew += " -nb-cores " + args.nb_cores #Update existing distance matrix command_distanceMatrix_update = simkaMinCoreBin + " matrix-update " command_distanceMatrix_update += " -in " + distanceOutputDir command_distanceMatrix_update += " -in1 " + sketchFilename_existing command_distanceMatrix_update += " -in2 " + sketchFilename_new #Append new sketch to existing sketch command_sketch_append = simkaMinCoreBin + " append " command_sketch_append += " -in1 " + sketchFilename_existing command_sketch_append += " -in2 " + sketchFilename_new exportCommand = simkaMinCoreBin + " export " exportCommand += " -in " + distanceOutputDir exportCommand += " -in1 " + sketchFilename_existing exportCommand += " -in2 " + sketchFilename_existing #exportCommand += " -in-ids " + distanceOutputDir #not applicable here exportCommand += " -out " + args.input_existingResults print("\n\n#-----------------------------") print("# Sketching new datasets") print("#-----------------------------\n") ret = os.system(command_sketchNewDatasets) if ret != 0: print("ERROR"); exit(1) print("\n\n#-----------------------------") print("# Computing distances between existing datasets and new datasets") print("#-----------------------------\n") ret = os.system(command_distance_existingVsNew) if ret != 0: print("ERROR"); exit(1) ######################## #exportCommand = args.bin + " export " #exportCommand += " -in " + distanceDir_existingVsNew #exportCommand += " -in1 " + sketchFilename_existing #exportCommand += " -in2 " + sketchFilename_new #exportCommand += " -in-ids " + distanceOutputDir #not applicable here #exportCommand += " -out " + 
distanceDir_existingVsNew #os.system(exportCommand) #os.system("gzip -cd "+ distanceDir_existingVsNew +"/mat_abundance_braycurtis.csv.gz") ######################## print("\n\n#-----------------------------") print("# Computing distances between new datasets") print("#-----------------------------\n") ret = os.system(command_distance_newVsNew) if ret != 0: print("ERROR"); exit(1) print("\n\n#-----------------------------") print("# Update existing distance matrices") print("#-----------------------------\n") ret = os.system(command_distanceMatrix_update) if ret != 0: print("ERROR"); exit(1) print("\n\n#-----------------------------") print("# Append new sketch to existing sketch") print("#-----------------------------\n") ret = os.system(command_sketch_append) if ret != 0: print("ERROR"); exit(1) print("\n\n#-----------------------------") print("# Exporting distances") print("#-----------------------------\n") ret = os.system(exportCommand) if ret != 0: print("ERROR"); exit(1) #Clear temp dir shutil.rmtree(distanceDir_existingVsNew) shutil.rmtree(distanceDir_newVsNew) os.remove(sketchFilename_new) print("\n\n") print("Result dir: " + existingDir) simka-1.5.1/simkaMin/simkaMin_utils.py000077500000000000000000000123521353413740300177450ustar00rootroot00000000000000#***************************************************************************** # SimkaMin: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets # A tool from the GATB (Genome Assembly Tool Box) # Copyright (C) 2019 INRIA # Authors: G.Benoit, C.Lemaitre, P.Peterlongo # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. 
# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . #***************************************************************************** import argparse, struct, time, datetime, sys, os, subprocess def is_executable(bin): try: subprocess.call([bin, "-h"],stdout=open(os.devnull, 'wb'), stderr=open(os.devnull, 'wb')) except OSError as e: return(0) return(1) #------------------------------------------------------------------------------------------------------------- # ProgressBar #------------------------------------------------------------------------------------------------------------- class ProgressBar(): def __init__(self, text, max): self.text = text self.max = max self.progress = 0 self.start_time = 0 def start(self): self.progress = 0 self.start_time = time.time() self.display() def step(self, value): self.progress += value self.display() def display(self): progress_percent = float(self.progress) / float(self.max) * 100 duration = int(time.time() - self.start_time) duration_str = str(datetime.timedelta(seconds=duration)) #--- sys.stdout.write('\r') sys.stdout.write("[" + str(round(progress_percent, 1)) + "%] " + self.text + " [Time: " + duration_str + "]") if self.progress == self.max: sys.stdout.write("\n") sys.stdout.flush() #------------------------------------------------------------------------------------------------------------- # ArgumentFormatterSimka #------------------------------------------------------------------------------------------------------------- class SimkaParser(argparse.ArgumentParser): def error(self, message): print("") sys.stderr.write('error: %s\n' % message) print("") self.print_help() sys.exit(2) class 
def read_sketch_header(sketchFilename):
	"""Read the fixed-size header stored at the start of a sketch file.

	The header is made of four consecutive fields, each read with the
	platform's native byte order:
	  - kmerSize   : unsigned char (1 byte)
	  - sketchSize : unsigned int
	  - seed       : unsigned int
	  - nbDatasets : unsigned int

	Args:
		sketchFilename: path of the sketch file to inspect.

	Returns:
		A dict with the keys "kmerSize", "sketchSize", "seed" and "nbDatasets".

	Raises:
		IOError / OSError: if the file cannot be opened.
		struct.error: if the file is too short to contain a complete header.
	"""
	# (result key, struct format) in on-disk order. Every field is unpacked
	# separately so that struct never inserts alignment padding between the
	# 1-byte k-mer size and the following unsigned ints.
	header_layout = (
		("kmerSize", "B"),
		("sketchSize", "I"),
		("seed", "I"),
		("nbDatasets", "I"),
	)

	header = {}
	# The with-block guarantees the file is closed even when a read or an
	# unpack fails (the handle used to leak in that case).
	with open(sketchFilename, mode='rb') as sketch_file:
		for field_name, field_format in header_layout:
			field_size = struct.calcsize(field_format)
			raw = sketch_file.read(field_size)
			if len(raw) != field_size:
				# Same exception type struct.unpack would raise, but with a
				# message that names the file and the missing field.
				raise struct.error(
					"truncated sketch header in '%s': expected %d byte(s) for %s, got %d"
					% (sketchFilename, field_size, field_name, len(raw)))
			header[field_name] = struct.unpack(field_format, raw)[0]

	return header
*****************************************************************************/ #include "SimkaPotara.hpp" #include "minikc/MiniKC.hpp" //#include // We use the required packages using namespace std; //#define NB_COUNT_CACHE 1 //#define TRACK_DISK_USAGE /* template class SimkaPotaraBankFiltered : public BankDelegate { public: Iterator* _it; SimkaPotaraBankFiltered (IBank* ref, const Filter& filter, u_int64_t maxReads, size_t nbDatasets) : BankDelegate (ref), _filter(filter) { //_nbReadsPerDataset = nbReadsPerDataset; _maxReads = maxReads; _nbDatasets = nbDatasets; } ~SimkaPotaraBankFiltered(){ delete _it; } Iterator* iterator () { _it = _ref->iterator (); //std::vector*> iterators = it->getComposition(); return new SimkaInputIterator (_it, _nbDatasets, _maxReads, _filter); //return filterIt; } private: //vector _nbReadsPerDataset; u_int64_t _maxReads; Filter _filter; u_int64_t _nbReadToProcess; size_t _datasetId; size_t _nbDatasets; }; */ class SimkaCount : public Tool { public: SimkaCount () : Tool ("SimkaCount") { //getParser()->push_front (new OptionOneParam (STR_URI_OUTPUT, "output file", true)); //getParser()->push_back (new OptionOneParam (STR_ID, "dataset id", true)); //getParser()->push_back (new OptionOneParam (STR_KMER_SIZE, "kmer size", true)); getParser()->push_back (new OptionOneParam ("-out-tmp-simka", "tmp output", true)); getParser()->push_back (new OptionOneParam ("-bank-name", "bank name", true)); getParser()->push_back (new OptionOneParam ("-bank-index", "bank name", true)); getParser()->push_back (new OptionOneParam (STR_SIMKA_MIN_READ_SIZE, "bank name", true)); getParser()->push_back (new OptionOneParam (STR_SIMKA_MIN_READ_SHANNON_INDEX, "bank name", true)); getParser()->push_back (new OptionOneParam (STR_SIMKA_MAX_READS, "bank name", true)); getParser()->push_back (new OptionOneParam ("-nb-datasets", "bank name", true)); getParser()->push_back (new OptionOneParam ("-nb-partitions", "bank name", true)); //getParser()->push_back (new 
OptionOneParam ("-nb-cores", "bank name", true)); //getParser()->push_back (new OptionOneParam ("-max-memory", "bank name", true)); getParser()->push_back (SortingCountAlgorithm<>::getOptionsParser(), 1); if (Option* p = dynamic_cast (getParser()->getParser(STR_KMER_ABUNDANCE_MIN))) { p->setDefaultValue ("0"); } } void execute () { //size_t datasetId = getInput()->getInt(STR_ID); size_t kmerSize = getInput()->getInt(STR_KMER_SIZE); //cout << kmerSize << endl; string outputDir = getInput()->getStr("-out-tmp-simka"); string bankName = getInput()->getStr("-bank-name"); size_t bankIndex = getInput()->getInt("-bank-index"); size_t minReadSize = getInput()->getInt(STR_SIMKA_MIN_READ_SIZE); double minReadShannonIndex = getInput()->getDouble(STR_SIMKA_MIN_READ_SHANNON_INDEX); u_int64_t maxReads = getInput()->getInt(STR_SIMKA_MAX_READS); size_t nbDatasets = getInput()->getInt("-nb-datasets"); size_t nbPartitions = getInput()->getInt("-nb-partitions"); CountNumber abundanceMin = getInput()->getInt(STR_KMER_ABUNDANCE_MIN); CountNumber abundanceMax = getInput()->getInt(STR_KMER_ABUNDANCE_MAX); Parameter params(*this, kmerSize, outputDir, bankName, minReadSize, minReadShannonIndex, maxReads, nbDatasets, nbPartitions, abundanceMin, abundanceMax, bankIndex); Integer::apply (kmerSize, params); //SimkaBankId* bank = new SimkaBankId(_banks, i); //cout << config._nb_partitions << endl; //KmerCountCompressor* kmerCountCompressor = new KmerCountCompressor(outputDir, config._nb_partitions, 1); //SimkaCompProcessor* processor = new SimkaCompProcessor(kmerCountCompressor); //vector*> procs; //procs.push_back(processor); //algo.addProcessor(processor); //algo.execute(); //delete kmerCountCompressor; //itBanks[i]-> // We get a handle on the HDF5 storage object. 
// Note that we use an auto pointer since the StorageFactory dynamically allocates an instance //Storage* storage = StorageFactory(DSK::getStorageMode()).load (getInput()->getStr(STR_URI_FILE)); //LOCAL (storage); //string kmerSizeStr = storage->getGroup("params").getProperty ("kmer_size"); //if (kmerSizeStr.empty()) { throw Exception ("unable to get the kmer size"); } //size_t kmerSize = atoi (kmerSizeStr.c_str()); } struct Parameter { Parameter (SimkaCount& tool, size_t kmerSize, string outputDir, string bankName, size_t minReadSize, double minReadShannonIndex, u_int64_t maxReads, size_t nbDatasets, size_t nbPartitions, CountNumber abundanceMin, CountNumber abundanceMax, size_t bankIndex) : tool(tool), kmerSize(kmerSize), outputDir(outputDir), bankName(bankName), minReadSize(minReadSize), minReadShannonIndex(minReadShannonIndex), maxReads(maxReads), nbDatasets(nbDatasets), nbPartitions(nbPartitions), abundanceMin(abundanceMin), abundanceMax(abundanceMax), bankIndex(bankIndex) {} SimkaCount& tool; //size_t datasetId; size_t kmerSize; string outputDir; string bankName; size_t minReadSize; double minReadShannonIndex; u_int64_t maxReads; size_t nbDatasets; size_t nbPartitions; CountNumber abundanceMin; CountNumber abundanceMax; size_t bankIndex; }; template struct Functor { typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; typedef typename SimkaCompressedProcessor::Kmer_BankId_Count Kmer_BankId_Count; void operator () (Parameter p){ IProperties* props = p.tool.getInput(); vector outInfo; IBank* bank = Bank::open(p.outputDir + "/input/" + p.bankName); LOCAL(bank); /* u_int64_t nbSeqs = 1; IBank* sampleBank = new SimkaBankSample(bank, nbSeqs); SortingCountAlgorithm sortingCount (sampleBank, props); SimkaNullProcessor* proc = new SimkaNullProcessor(); sortingCount.addProcessor (proc); sortingCount.execute(); Configuration config = sortingCount.getConfig(); //_nbPartitions = _maxJobMerge; config._nb_partitions = p.nbPartitions; uint64_t 
memoryUsageCachedItems; config._nb_cached_items_per_core_per_part = 1 << 8; // cache at least 256 items (128 here, then * 2 in the next while loop) do { config._nb_cached_items_per_core_per_part *= 2; memoryUsageCachedItems = 1LL * config._nb_cached_items_per_core_per_part *config._nb_partitions * config._nbCores * sizeof(Type); } while (memoryUsageCachedItems < config._max_memory * MBYTE / 10); */ vector nbKmerPerParts(p.nbPartitions, 0); vector nbDistinctKmerPerParts(p.nbPartitions, 0); vector chordNiPerParts(p.nbPartitions, 0); Configuration config; { Repartitor* repartitor = new Repartitor(); LOCAL(repartitor); { Storage* storage = StorageFactory(STORAGE_HDF5).load (p.outputDir + "/" + "config.h5"); LOCAL (storage); config.load(storage->getGroup("")); repartitor->load(storage->getGroup("")); } //config._abundanceUserNb = 1; //config._abundance.clear(); //CountRange range(props->getInt(STR_KMER_ABUNDANCE_MIN), 100000); //config._abundance.push_back(range); /* vector cacheIndexes; cacheIndexes.resize(p.nbPartitions); vector > caches; caches.resize(p.nbPartitions); for(size_t i=0; i* > bags; vector* > cachedBags; for(size_t i=0; i* bag = new BagGzFile(outputFilename); Bag* cachedBag = new BagCache(bag, 10000); cachedBags.push_back(cachedBag); //BagCache bagCache(*bag, 10000); bags.push_back(bag); } string tempDir = p.outputDir + "/temp/" + p.bankName; System::file().mkdir(tempDir, -1); //cout << i << endl; //string outputDir = p.outputDir + "/comp_part" + to_string(p.datasetId) + "/"; //cout << "\tinput: " << p.outputDir + "/input/" + p.bankName << endl; SimkaSequenceFilter sequenceFilter(p.minReadSize, p.minReadShannonIndex); IBank* filteredBank = new SimkaPotaraBankFiltered(bank, sequenceFilter, p.maxReads, p.nbDatasets); // = new SimkaPotaraBankFiltered(bank) LOCAL(filteredBank); //LOCAL(bank); //Storage* solidStorage = 0: //string solidsName = p.outputDir + "/solid/" + p.bankName + ".h5"; //bool autoDelete = false; // (solidsName == "none") || (solidsName == 
"null"); //solidStorage = StorageFactory(STORAGE_HDF5).create (solidsName, true, autoDelete); //LOCAL(solidStorage); SimkaCompressedProcessor* proc = new SimkaCompressedProcessor(cachedBags, nbKmerPerParts, nbDistinctKmerPerParts, chordNiPerParts, p.abundanceMin, p.abundanceMax, p.bankIndex); u_int64_t nbReads = 0; if(p.kmerSize <= 15){ MiniKC miniKc(p.tool.getInput(), p.kmerSize, filteredBank, *repartitor, proc); miniKc.execute(); nbReads = miniKc._nbReads; } else{ //SimkaCompressedProcessor* proc = new SimkaCompressedProcessor(bags, caches, cacheIndexes, p.abundanceMin, p.abundanceMax); std::vector* > procs; procs.push_back(proc); SortingCountAlgorithm algo (filteredBank, config, repartitor, procs, props); algo.execute(); nbReads = algo.getInfo()->getInt("seq_number"); } u_int64_t nbDistinctKmers = 0; u_int64_t nbKmers = 0; u_int64_t chord_N2 = 0; for(size_t i=0; iflush(); //cachedBags[i]->flush(); delete cachedBags[i]; //delete bags[i]; } //delete proc; } string contents = ""; for(size_t i=0; ifwrite(contents.c_str(), contents.size(), 1); nbKmerPerPartFile->flush(); delete nbKmerPerPartFile; //cout << "heo" << endl; //delete config; //cout << "heo" << endl; writeFinishSignal(p, outInfo); //cout << "heo" << endl; } void writeFinishSignal(Parameter& p, const vector& outInfo){ string finishFilename = p.outputDir + "/count_synchro/" + p.bankName + ".ok"; IFile* file = System::file().newFile(finishFilename, "w"); string contents = ""; for(size_t i=0; ifwrite(contents.c_str(), contents.size(), 1); file->flush(); delete file; } }; }; /********************************************************************************/ /* Dump solid kmers in ASCII format */ /********************************************************************************/ int main (int argc, char* argv[]) { try { SimkaCount().run (argc, argv); } catch (Exception& e) { std::cout << "EXCEPTION: " << e.getMessage() << std::endl; return EXIT_FAILURE; } } //! 
/** Associates the on-disk size of a partition file with the id of the dataset
 * it belongs to, so that files can be ordered by size before being merged.
 */
struct sortItem_Size_Filename_ID{

	u_int64_t _size;
	size_t _datasetID;

	/** Default constructor. Both fields are zero-initialised so that a
	 * default-constructed item never exposes indeterminate values. */
	sortItem_Size_Filename_ID() : _size(0), _datasetID(0) {}

	/** \param[in] size : size of the file, in bytes
	 *  \param[in] datasetID : id of the dataset stored in that file */
	sortItem_Size_Filename_ID(u_int64_t size, size_t datasetID) : _size(size), _datasetID(datasetID) {}
};

/** Ordering predicate for std::sort: smallest files first.
 * \return true if i is strictly smaller than j.
 */
bool sortFileBySize (sortItem_Size_Filename_ID i, sortItem_Size_Filename_ID j){
	return i._size < j._size;
}

/** Get the size of a file.
 * \param[in] filename : path of the file
 * \return the size of the file in bytes, or 0 if the file cannot be opened
 *         or its size cannot be determined.
 */
u_int64_t getFileSize(const std::string& filename){
	// Opening "at end" makes the initial read position equal to the file size.
	std::ifstream in(filename.c_str(), std::ifstream::ate | std::ifstream::binary);
	if(!in.is_open()){
		return 0;
	}

	// tellg() reports failure as -1; converting that straight to an unsigned
	// type would yield a gigantic bogus size, so it is checked while signed.
	const std::streamoff endOffset = in.tellg();
	if(endOffset < 0){
		return 0;
	}

	return static_cast<u_int64_t>(endOffset);
	// The stream is closed by its destructor.
}
*/ typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; size_t _bufferIndex; size_t _partitionId; SimkaStatistics* _stats; SimkaCountProcessorSimple* _processor; vector _bufferKmers; vector _bufferCounts; /** Constructor. */ DistanceCommand ( const string& tmpDir, const vector& datasetIds, size_t partitionId, size_t nbBanks, bool computeSimpleDistances, bool computeComplexDistances, size_t kmerSize, pair& abundanceThreshold, float minShannonIndex ) { _partitionId = partitionId; _stats = new SimkaStatistics(nbBanks, computeSimpleDistances, computeComplexDistances, tmpDir, datasetIds); _processor = new SimkaCountProcessorSimple (_stats, nbBanks, kmerSize, abundanceThreshold, SUM, false, minShannonIndex); _bufferKmers.resize(MERGE_BUFFER_SIZE); _bufferCounts.resize(MERGE_BUFFER_SIZE); _bufferIndex = 0; } ~DistanceCommand(){ delete _processor; delete _stats; } //void add(Type& kmer, CountVector& counts){ // _bufferIndex += //} void setup(size_t bufferIndex, vector& bufferKmers, vector& bufferCounts){ //cout << "hey " << bufferIndex << endl; _bufferIndex = bufferIndex; for(size_t i=0; i<_bufferIndex; i++){ _bufferKmers[i] = bufferKmers[i]; _bufferCounts[i] = bufferCounts[i]; } } void execute (){ for(size_t i=0; i<_bufferIndex; i++){ _processor->process(_partitionId, _bufferKmers[i], _bufferCounts[i]); } } void use () {} void forget () {} }; struct Parameter { Parameter (IProperties* props, string inputFilename, string outputDir, size_t partitionId, size_t kmerSize, double minShannonIndex, bool computeSimpleDistances, bool computeComplexDistances, size_t nbCores) : props(props), inputFilename(inputFilename), outputDir(outputDir), partitionId(partitionId), kmerSize(kmerSize), minShannonIndex(minShannonIndex), computeSimpleDistances(computeSimpleDistances), computeComplexDistances(computeComplexDistances), nbCores(nbCores) {} IProperties* props; string inputFilename; string outputDir; size_t partitionId; size_t kmerSize; double minShannonIndex; bool 
computeSimpleDistances; bool computeComplexDistances; size_t nbCores; }; template class StorageIt { public: typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; struct Kmer_BankId_Count{ Type _type; u_int32_t _bankId; u_int64_t _count; Kmer_BankId_Count(){ } Kmer_BankId_Count(Type type, u_int64_t bankId, u_int64_t count){ _type = type; _bankId = bankId; _count = count; } }; //typedef tuple Kmer_BankId_Count; //typedef typename Kmer::ModelCanonical ModelCanonical; //typedef typename ModelCanonical::Kmer KmerType; StorageIt(Iterator* it, size_t bankId, size_t partitionId){ _it = it; //cout << h5filename << endl; _bankId = bankId; _partitionId = partitionId; //Iterator* it2 = partition1.iterator(); //Collection& kmers1 = (*partition1)[_partitionId]; //collections.push_back(&kmers1); //_it = kmers1.iterator(); //_nbKmers = it->estimateNbItems(); //it2->first(); //while(!it2->isDone()){ // cout << it2->item().value.toString(31) << endl; // it2->next(); //} } ~StorageIt(){ delete _it; } //void setPartitionId(size_t partitionId){ // _partitionId = partitionId; //} bool next(){ _it->next(); //cout << "is done?" << _it->isDone() << endl; return !_it->isDone(); } Type& value(){ return _it->item()._type; } u_int16_t getBankId(){ return _it->item()._bankId; } u_int64_t& abundance(){ return _it->item()._count; } //u_int64_t getNbKmers(){ // return _nbKmers; //} u_int16_t _bankId; u_int16_t _partitionId; Iterator* _it; //u_int64_t _nbKmers; }; class SimkaCounterBuilderMerge { public: /** Constructor. * \param[in] nbBanks : number of banks parsed during kmer counting. */ SimkaCounterBuilderMerge (CountVector& abundancePerBank) : _abundancePerBank(abundancePerBank) {} /** Get the number of banks. * \return the number of banks. */ size_t size() const { return _abundancePerBank.size(); } /** Initialization of the counting for the current kmer. This method should be called * when a kmer is seen for the first time. 
* \param[in] idxBank : bank index where the new current kmer has been found. */ void init (size_t idxBank, CountNumber abundance) { for (size_t k=0; k<_abundancePerBank.size(); k++) { _abundancePerBank[k]=0; } _abundancePerBank [idxBank]= abundance; } /** Increase the abundance of the current kmer for the provided bank index. * \param[in] idxBank : index of the bank */ void increase (size_t idxBank, CountNumber abundance) { _abundancePerBank [idxBank] += abundance; } /** Set the abundance of the current kmer for the provided bank index. * \param[in] idxBank : index of the bank */ //void set (CountNumber val, size_t idxBank=0) { _abundancePerBank [idxBank] = val; } /** Get the abundance of the current kmer for the provided bank index. * \param[in] idxBank : index of the bank * \return the abundance of the current kmer for the given bank. */ //CountNumber operator[] (size_t idxBank) const { return _abundancePerBank[idxBank]; } /** */ //const CountVector& get () const { return _abundancePerBank; } void print(const string& kmer){ cout << kmer << ": "; for(size_t i=0; i class MergeCommand : public gatb::core::tools::dp::ICommand //, public gatb::core::system::SmartPointer { public: void use () {} void forget () {} typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; typedef std::pair kxp; //id pointer in vec_pointer , value struct kxpcomp { bool operator() (kxp l,kxp r) { return ((r.second) < (l.second)); } } ; size_t _currentBuffer; u_int64_t _progressStep; vector > _bufferKmers; vector > _bufferCounts; vector _bufferIndex; u_int64_t _nbDistinctKmers; u_int64_t _nbSharedDistinctKmers; MergeCommand ( size_t partitionId, size_t nbBanks, IteratorListener* progress, vector*>& its, u_int64_t progressStep, size_t nbCores, bool computeComplexDistances ) : its(its) { _nbBanks = nbBanks; _partitionId = partitionId; _progress = progress; _progressStep = progressStep; _nbCores = nbCores; _computeComplexDistances = computeComplexDistances; _nbDistinctKmers = 0; 
_nbSharedDistinctKmers = 0; init(); } ~MergeCommand(){ delete solidCounter; } //void add(Type& kmer, CountVector& counts){ // _bufferIndex += //} //void setup(vector& bufferKmers, vector& bufferCounts){ // _bufferKmers = bufferKmers; // _bufferCounts = bufferCounts; //} size_t _nbCores; size_t _partitionId; size_t _nbBanks; vector*>& its; std::priority_queue< kxp, vector,kxpcomp > pq; u_int64_t nbKmersProcessed; IteratorListener* _progress; bool _computeComplexDistances; u_int16_t best_p; Type previous_kmer; CountVector abundancePerBank; size_t nbBankThatHaveKmer; SimkaCounterBuilderMerge* solidCounter; bool _isDone; void init(){ _isDone = false; solidCounter = new SimkaCounterBuilderMerge(abundancePerBank); for(size_t i=0; i<_nbCores; i++){ vector vec = vector(MERGE_BUFFER_SIZE); _bufferKmers.push_back(vec); vector vec2 = vector(MERGE_BUFFER_SIZE); _bufferCounts.push_back(vec2); _bufferIndex.push_back(0); } nbBankThatHaveKmer = 0; abundancePerBank.resize(_nbBanks, 0); _currentBuffer = 0; //_bufferIndex = 0; //_bufferSize = 1000; nbKmersProcessed = 0; //vector*> partitions; //vector*> collections; //vector*> its; //vector storages; //size_t nbPartitions; for(size_t i=0; i<_nbBanks; i++){ StorageIt* it = its[i]; it->_it->first(); //partitionIts[i]->first(); //while(!it->_it->isDone()){ // it->_it->next(); // cout << it->_it->item().value.toString(_kmerSize) << " " << it->_it->item().abundance << endl; //} } //fill the priority queue with the first elems for (size_t ii=0; ii<_nbBanks; ii++) { //if(its[ii]->next()) { pq.push(kxp(ii,its[ii]->value())); } pq.push(kxp(ii,its[ii]->value())); } if (pq.size() != 0) // everything empty, no kmer at all { //get first pointer best_p = pq.top().first ; pq.pop(); previous_kmer = its[best_p]->value(); solidCounter->init (its[best_p]->getBankId(), its[best_p]->abundance()); nbBankThatHaveKmer = 1; } } void reset(){ for(size_t i=0; i<_bufferIndex.size(); i++){ _bufferIndex[i] = 0; } } void execute (){ //cout << "lala " << pq.size() 
<< endl; //merge-scan all 'virtual' arrays and output counts while (_currentBuffer < _nbCores) { //cout << _currentBuffer << endl; //go forward in this array or in new array of reaches end of this one if (! its[best_p]->next()) { //reaches end of one array if(pq.size() == 0){ _isDone = true; break; } //otherwise get new best best_p = pq.top().first ; pq.pop(); } if (its[best_p]->value() != previous_kmer ) { //if diff, changes to new array, get new min pointer pq.push(kxp(best_p,its[best_p]->value())); //push new val of this pointer in pq, will be counted later best_p = pq.top().first ; pq.pop(); //if new best is diff, this is the end of this kmer if(its[best_p]->value()!=previous_kmer ) { nbKmersProcessed += nbBankThatHaveKmer; if(nbKmersProcessed > _progressStep){ //cout << "queue size: " << pq.size() << endl; //cout << nbKmersProcessed << endl; _progress->inc(nbKmersProcessed); nbKmersProcessed = 0; } //cout << previous_kmer.toString(p.kmerSize) << endl; //for(size_t i=0; i 1) // _processor->process (_partitionId, previous_kmer, abundancePerBank); //this->insert (previous_kmer, solidCounter); solidCounter->init (its[best_p]->getBankId(), its[best_p]->abundance()); nbBankThatHaveKmer = 1; previous_kmer = its[best_p]->value(); } else { solidCounter->increase (its[best_p]->getBankId(), its[best_p]->abundance()); nbBankThatHaveKmer += 1; } } else { solidCounter->increase (its[best_p]->getBankId(), its[best_p]->abundance()); nbBankThatHaveKmer += 1; } } if(_isDone){ insert(previous_kmer, abundancePerBank, nbBankThatHaveKmer); } else{ } _currentBuffer = 0; //_bufferIndex = 0; //cout << nbBankThatHaveKmer << endl; //cout << previous_kmer.toString(p.kmerSize) << endl; //for(size_t i=0; iinsert (previous_kmer, solidCounter); // } //cout << "end " << endl; } void insert(const Type& kmer, const CountVector& counts, size_t nbBankThatHaveKmer){ _nbDistinctKmers += 1; if(_computeComplexDistances || nbBankThatHaveKmer > 1){ if(nbBankThatHaveKmer > 1){ _nbSharedDistinctKmers += 
1; } //DistanceCommand* cmd = dynamic_cast*>(_cmds[_currentBuffer]); //cmd->_bufferKmers[cmd->_bufferIndex] = kmer; //cmd->_bufferCounts[cmd->_bufferIndex] = counts; _bufferKmers[_currentBuffer][_bufferIndex[_currentBuffer]] = kmer; _bufferCounts[_currentBuffer][_bufferIndex[_currentBuffer]] = counts; _bufferIndex[_currentBuffer] += 1; if(_bufferIndex[_currentBuffer] >= MERGE_BUFFER_SIZE){ //DistanceCommand* cmd = dynamic_cast*>(_cmds[_currentBuffer]); //cmd->setup(_bufferKmers[_currentBuffer], _bufferCounts[_currentBuffer]); _currentBuffer += 1; if(_currentBuffer >= _nbCores){ //dispatch(); } else{ //_bufferIndex = 0; } } //_processor->process (_partitionId, kmer, counts); } //_processor->process (_partitionId, kmer, counts); //cout <<_partitiontId << " "<< kmer.toString(31) << endl; //_processor->process (_partitionId, kmer, counter.get()); } }; */ template class DiskBasedMergeSort { public: typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; //typedef tuple Kmer_BankId_Count; //typedef tuple*> kxp; typedef typename StorageIt::Kmer_BankId_Count Kmer_BankId_Count; struct kxp{ Type _type; u_int32_t _bankId; u_int64_t _count; StorageIt* _it; kxp(){ } kxp(Type type, u_int64_t bankId, u_int64_t count, StorageIt* it){ _type = type; _bankId = bankId; _count = count; _it = it; } }; struct kxpcomp { bool operator() (kxp& l, kxp& r) { return (r._type < l._type); } } ; string _outputDir; string _outputFilename; vector& _datasetIds; size_t _partitionId; Bag* _outputGzFile; Bag* _cachedBag; DiskBasedMergeSort(size_t mergeId, const string& outputDir, vector& datasetIds, size_t partitionId): _datasetIds(datasetIds) { _outputDir = outputDir; _partitionId = partitionId; _outputFilename = _outputDir + "/solid/part_" + Stringify::format("%i", partitionId) + "/__p__" + Stringify::format("%i", mergeId) + ".gz.temp"; _outputGzFile = new BagGzFile(_outputFilename); _cachedBag = new BagCache(_outputGzFile, 10000); } ~DiskBasedMergeSort(){ } void execute(){ vector* > 
partitions; vector*> its; size_t _nbBanks = _datasetIds.size(); for(size_t i=0; i<_nbBanks; i++){ //cout << _datasetIds[i] << endl; string filename = _outputDir + "/solid/part_" + Stringify::format("%i", _partitionId) + "/__p__" + Stringify::format("%i", _datasetIds[i]) + ".gz"; //cout << "\t\t" << filename << endl; IterableGzFile* partition = new IterableGzFile(filename, 10000); partitions.push_back(partition); its.push_back(new StorageIt(partition->iterator(), i, _partitionId)); //nbKmers += partition->estimateNbItems(); //size_t currentPart = 0; //ifstream file((_outputDir + "/kmercount_per_partition/" + _datasetIds[i] + ".txt").c_str()); //while(getline(file, line)){ // if(line == "") continue; // if(currentPart == _partitionId){ // //cout << stoull(line) << endl; // nbKmers += strtoull(line.c_str(), NULL, 10); // break; // } // currentPart += 1; //} //file.close(); } //u_int64_t progressStep = nbKmers / 1000; //_progress = new ProgressSynchro ( // createIteratorListener (nbKmers, "Merging kmers"), // System::thread().newSynchronizer()); //_progress->init (); //_nbDistinctKmers = 0; //_nbSharedDistinctKmers = 0; //u_int64_t nbKmersProcessed = 0; //size_t nbBankThatHaveKmer = 0; //u_int16_t best_p = 0; Type previous_kmer; //CountVector abundancePerBank; //abundancePerBank.resize(_nbBanks, 0); //SimkaCounterBuilderMerge* solidCounter = new SimkaCounterBuilderMerge(abundancePerBank);; std::priority_queue< kxp, vector,kxpcomp > pq; StorageIt* bestIt; for(size_t i=0; i<_nbBanks; i++){ StorageIt* it = its[i]; it->_it->first(); } //fill the priority queue with the first elems for (size_t ii=0; ii<_nbBanks; ii++) { //pq.push(Kmer_BankId_Count(ii,its[ii]->value())); pq.push(kxp(its[ii]->value(), its[ii]->getBankId(), its[ii]->abundance(), its[ii])); } if (pq.size() != 0) // everything empty, no kmer at all { //get first pointer bestIt = pq.top()._it; pq.pop(); _cachedBag->insert(Kmer_BankId_Count(bestIt->value(), bestIt->getBankId(), bestIt->abundance())); //best_p = 
get<1>(pq.top()) ; pq.pop(); //previous_kmer = bestIt->value(); //solidCounter->init (bestIt->getBankId(), bestIt->abundance()); //nbBankThatHaveKmer = 1; while(1){ if (! bestIt->next()) { //reaches end of one array if(pq.size() == 0){ break; } //otherwise get new best //best_p = get<1>(pq.top()) ; pq.pop(); bestIt = pq.top()._it; pq.pop(); } pq.push(kxp(bestIt->value(), bestIt->getBankId(), bestIt->abundance(), bestIt)); //push new val of this pointer in pq, will be counted later bestIt = pq.top()._it; pq.pop(); _cachedBag->insert(Kmer_BankId_Count(bestIt->value(), bestIt->getBankId(), bestIt->abundance())); //cout << bestIt->value().toString(31) << " " << bestIt->getBankId() << " "<< bestIt->abundance() << endl; //bestIt = get<3>(pq.top()); pq.pop(); //pq.push(kxp(bestIt->value(), bestIt->getBankId(), bestIt->abundance(), bestIt)); } //_outputGzFile->insert(Kmer_BankId_Count(bestIt->value(), bestIt->getBankId(), bestIt->abundance())); //cout << bestIt->value().toString(31) << " " << bestIt->getBankId() << " "<< bestIt->abundance() << endl; } for(size_t i=0; iflush(); delete _cachedBag; for(size_t i=0; i<_nbBanks; i++){ //cout << _datasetIds[i] << endl; string filename = _outputDir + "/solid/part_" + Stringify::format("%i", _partitionId) + "/__p__" + Stringify::format("%i", _datasetIds[i]) + ".gz"; System::file().remove(filename); } string newOutputFilename = _outputFilename; newOutputFilename.erase(_outputFilename.size()-5, 5); System::file().rename(_outputFilename, newOutputFilename); //remove .temp at the end of new merged file //_outputFilename = newOutputFilename; } }; template class SimkaMergeAlgorithm : public Algorithm { public: typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; //typedef tuple Kmer_BankId_Count; //typedef tuple*> kxp; typedef typename DiskBasedMergeSort::Kmer_BankId_Count Kmer_BankId_Count; typedef typename DiskBasedMergeSort::kxp kxp; /* struct Kmer_BankId_Count{ Type _type; u_int64_t _bankId; u_int64_t _count; 
Kmer_BankId_Count(){ } Kmer_BankId_Count(Type type, u_int64_t bankId, u_int64_t count){ _type = type; _bankId = bankId; _count = count; } }; struct kxp{ Type _type; u_int32_t _bankId; u_int64_t _count; StorageIt* _it; kxp(){ } kxp(Type type, u_int64_t bankId, u_int64_t count, StorageIt* it){ _type = type; _bankId = bankId; _count = count; _it = it; } };*/ //typedef std::pair kxp; //id pointer in vec_pointer , value //typedef std::pair kxp; //id pointer in vec_pointer , value //struct kxpcomp { bool operator() (Kmer_BankId_Count l,Kmer_BankId_Count r) { return ((r.second) < (l.second)); } } ; struct kxpcomp { bool operator() (kxp& l,kxp& r) { return (r._type < l._type); } } ; Parameter& p; SimkaMergeAlgorithm(Parameter& p) : Algorithm("SimkaMergeAlgorithm", p.nbCores, p.props), p(p) { _abundanceThreshold.first = 0; _abundanceThreshold.second = 999999999; _computeSimpleDistances = p.computeSimpleDistances; _computeComplexDistances = p.computeComplexDistances; _kmerSize = p.kmerSize; _minShannonIndex = p.minShannonIndex; } ~SimkaMergeAlgorithm(){ //delete _progress; } //pthread_t statThread;_datasetNbReads /* void createInfo(Parameter& p){ } void loadCountInfo(){ for(size_t i=0; i<_nbBanks; i++){ string name = _datasetIds[i]; string countFilename = p.outputDir + "/count_synchro/" + name + ".ok"; string line; ifstream file(countFilename.c_str()); vector lines; while(getline(file, line)){ if(line == "") continue; lines.push_back(line); } file.close(); u_int64_t nbReads = strtoull(lines[0].c_str(), NULL, 10); _stats->_datasetNbReads[i] = nbReads; _stats->_nbSolidDistinctKmersPerBank[i] = strtoull(lines[1].c_str(), NULL, 10); _stats->_nbSolidKmersPerBank[i] = strtoull(lines[2].c_str(), NULL, 10); _stats->_chord_sqrt_N2[i] = sqrt(strtoull(lines[3].c_str(), NULL, 10)); //cout << _stats->_chord_sqrt_N2[i] << endl; } }*/ //struct sortFileBySize { bool operator() (sortItem_Size_Filename_ID& l,sortItem_Size_Filename_ID& r) { return (r._size < l._size); } } ; void execute(){ 
_nbCores = p.nbCores; removeStorage(p); _partitionId = p.partitionId; createDatasetIdList(p); _nbBanks = _datasetIds.size(); string partDir = p.outputDir + "/solid/part_" + Stringify::format("%i", _partitionId) + "/"; vector filenames = System::file().listdir(partDir); //cout << filenames.size() << endl; vector partFilenames; vector filenameSizes; for(size_t i=0; i SIMKA_MERGE_MAX_FILE_USED){ //cout << "Start merging pass" << endl; sort(filenameSizes.begin(),filenameSizes.end(),sortFileBySize); vector mergeDatasetIds; vector toRemoveItem; for(size_t i=0; i= _nbBanks) break; //cout << mergeDatasetIds[i] << endl; //cout << "First val must never be greater than second: " << i << " " << _nbBanks << endl; //cout << "\t" << get<1>(sfi) << endl; } for(size_t i=0; i diskBasedMergeSort(mergedId, p.outputDir, mergeDatasetIds, _partitionId); diskBasedMergeSort.execute(); filenameSizes.push_back(sortItem_Size_Filename_ID(getFileSize(diskBasedMergeSort._outputFilename), mergedId)); //cout << "\tmerged id: " << mergedId << endl; //cout << "\tremainging files: " << filenameSizes.size() << endl; } //cout << filenameSizes.size() << endl; //for(size_t i=0; i mergeDatasetIds; for(size_t j=0; j= _nbBanks) break; } cout << "doivent etre égaux a la dernière passe: " << _nbBanks << " " << mergeDatasetIds.size() << " " << datasetIndex << endl; DiskBasedMergeSort diskBasedMergeSort(i, p.outputDir, mergeDatasetIds, _partitionId); diskBasedMergeSort.execute(); }*/ //exit(1); /* PARALLEL for (size_t i=0; i<_nbCores; i++) { //cout << i << endl; ICommand* cmd = 0; cmd = new DistanceCommand(p.outputDir, _datasetIds, _partitionId, _nbBanks, _computeSimpleDistances, _computeComplexDistances, _kmerSize, _abundanceThreshold, _minShannonIndex); //cmd->use(); _cmds.push_back (cmd); //cout << _cmds[i] << endl; } resetCommands(); */ //SimkaDistanceParam distanceParams(p.props); //createInfo(p); //createProcessor(p); //PARALLEL line to remove _stats = new SimkaStatistics(_nbBanks, 
p.computeSimpleDistances, p.computeComplexDistances, p.outputDir, _datasetIds); _processor = new SimkaCountProcessorSimple (_stats, _nbBanks, p.kmerSize, _abundanceThreshold, SUM, false, p.minShannonIndex); //_processor->use(); string line; vector* > partitions; vector*> its; u_int64_t nbKmers = 0; for(size_t i=0; i* partition = new IterableGzFile(filename, 10000); partitions.push_back(partition); its.push_back(new StorageIt(partition->iterator(), i, _partitionId)); //nbKmers += partition->estimateNbItems(); size_t currentPart = 0; ifstream file((p.outputDir + "/kmercount_per_partition/" + _datasetIds[i] + ".txt").c_str()); while(getline(file, line)){ if(line == "") continue; if(currentPart == _partitionId){ //cout << stoull(line) << endl; nbKmers += strtoull(line.c_str(), NULL, 10); break; } currentPart += 1; } file.close(); } /* //vector* > partitionIts; for(size_t i=0; i<_nbBanks; i++){ string filename = p.outputDir + "/solid/" + _datasetIds[i] + "/" + "part" + Stringify::format("%i", _partitionId); //cout << filename << endl; IterableGzFile* partition = new IterableGzFile(filename, 1000); partitions.push_back(partition); its.push_back(new StorageIt(partition->iterator(), i, _partitionId)); //nbKmers += partition->estimateNbItems(); size_t currentPart = 0; ifstream file((p.outputDir + "/kmercount_per_partition/" + _datasetIds[i] + ".txt").c_str()); while(getline(file, line)){ if(line == "") continue; if(currentPart == _partitionId){ //cout << stoull(line) << endl; nbKmers += strtoull(line.c_str(), NULL, 10); break; } currentPart += 1; } file.close(); }*/ //u_int64_t progressStep = nbKmers / 1000; //_progress = new ProgressSynchro ( // createIteratorListener (nbKmers, "Merging kmers"), // System::thread().newSynchronizer()); //_progress->init (); /* PARALLEL _mergeCommand = new MergeCommand( _partitionId, _nbBanks, _progress, its, progressStep, _nbCores, p.computeComplexDistances); //_mergeCommand->use(); _cmds.push_back(_mergeCommand); //cout << "CMDS SIZE:" << 
_cmds.size() << endl; MergeCommand* mergeCmd = dynamic_cast*>(_mergeCommand); mergeCmd->execute(); while(!mergeCmd->_isDone){ //cout << mergeCmd->_isDone << endl; //mergeCmd->execute(); dispatch(); } dispatch();*/ _nbDistinctKmers = 0; _nbSharedDistinctKmers = 0; u_int64_t nbKmersProcessed = 0; size_t nbBankThatHaveKmer = 0; u_int16_t best_p = 0; Type previous_kmer; CountVector abundancePerBank; abundancePerBank.resize(_nbBanks, 0); SimkaCounterBuilderMerge* solidCounter = new SimkaCounterBuilderMerge(abundancePerBank);; std::priority_queue< kxp, vector,kxpcomp > pq; StorageIt* bestIt; for(size_t i=0; i* it = its[i]; it->_it->first(); } //fill the priority queue with the first elems for (size_t ii=0; iivalue())); pq.push(kxp(its[ii]->value(), its[ii]->getBankId(), its[ii]->abundance(), its[ii])); } if (pq.size() != 0) // everything empty, no kmer at all { //get first pointer bestIt = pq.top()._it; pq.pop(); //best_p = get<1>(pq.top()) ; pq.pop(); previous_kmer = bestIt->value(); solidCounter->init (bestIt->getBankId(), bestIt->abundance()); nbBankThatHaveKmer = 1; while(1){ if (! 
bestIt->next()) { //reaches end of one array if(pq.size() == 0){ break; } //otherwise get new best //best_p = get<1>(pq.top()) ; pq.pop(); bestIt = pq.top()._it; pq.pop(); } //cout << bestIt->value().toString(31) << " " << bestIt->getBankId() << " "<< bestIt->abundance() << endl; if (bestIt->value() != previous_kmer ) { //if diff, changes to new array, get new min pointer pq.push(kxp(bestIt->value(), bestIt->getBankId(), bestIt->abundance(), bestIt)); //push new val of this pointer in pq, will be counted later bestIt = pq.top()._it; pq.pop(); //best_p = get<1>(pq.top()) ; pq.pop(); //if new best is diff, this is the end of this kmer if(bestIt->value()!=previous_kmer ) { //nbKmersProcessed += nbBankThatHaveKmer; //if(nbKmersProcessed > progressStep){ //cout << "queue size: " << pq.size() << endl; //cout << nbKmersProcessed << endl; //_progress->inc(nbKmersProcessed); //nbKmersProcessed = 0; //} //cout << previous_kmer.toString(p.kmerSize) << endl; //for(size_t i=0; i 1) // _processor->process (_partitionId, previous_kmer, abundancePerBank); //this->insert (previous_kmer, solidCounter); solidCounter->init (bestIt->getBankId(), bestIt->abundance()); nbBankThatHaveKmer = 1; previous_kmer = bestIt->value(); } else { solidCounter->increase (bestIt->getBankId(), bestIt->abundance()); nbBankThatHaveKmer += 1; } } else { //cout << "increase" << endl; solidCounter->increase (bestIt->getBankId(), bestIt->abundance()); nbBankThatHaveKmer += 1; } } insert(previous_kmer, abundancePerBank, nbBankThatHaveKmer); } _processor->end(); //cout << "lala" << endl; for(size_t i=0; i_nbDistinctKmers, mergeCmd->_nbSharedDistinctKmers); //cout << _cmds.size() << endl; for(size_t i=0; i<_cmds.size(); i++){ //cout << _cmds[i] << endl; //_cmds[i]->forget(); delete _cmds[i]; } //_cmds.clear(); //delete _mergeCommand; */ delete solidCounter; for(size_t i=0; ifinish(); } void insert(const Type& kmer, const CountVector& counts, size_t nbBankThatHaveKmer){ //cout << kmer.toString(31) << endl; 
//for(size_t i=0; i_nbDistinctKmers += 1; if(_computeComplexDistances || nbBankThatHaveKmer > 1){ if(nbBankThatHaveKmer > 1){ _stats->_nbSharedKmers += 1; } _processor->process(_partitionId, kmer, counts); } } void createDatasetIdList(Parameter& p){ string datasetIdFilename = p.outputDir + "/" + "datasetIds"; IFile* inputFile = System::file().newFile(datasetIdFilename, "rb"); //IFile* bankFile = System::file().newFile(_banksInputFilename, "wb"); inputFile->seeko(0, SEEK_END); u_int64_t size = inputFile->tell(); inputFile->seeko(0, SEEK_SET); char buffer2[size]; inputFile->fread(buffer2, size, size); string fileContents(buffer2, size); string line; string linePart; vector linePartList; stringstream fileContentsStream(fileContents); //string bankFileContents = ""; //u_int64_t lineIndex = 0; while(getline(fileContentsStream, line)){ if(line == "") continue; _datasetIds.push_back(line); } //bankFileContents.erase(bankFileContents.size()-1); //bankFileContents.pop_back(); // "remove last /n //bankFile->fwrite(bankFileContents.c_str(), bankFileContents.size(), 1); delete inputFile; } void createProcessor(Parameter& p){ //ICountProcessor* proc = _processor->clone(); //proc->use(); //_processors.push_back(proc); } void resetCommands(){ for (size_t i=0; i<_nbCores; i++){ DistanceCommand* cmd = dynamic_cast*>(_cmds[i]); cmd->_bufferIndex = 0; } } /* void dispatch(){ MergeCommand* mergeCommand = dynamic_cast*>(_mergeCommand); for (size_t i=0; i<_nbCores; i++){ //cout << mergeCommand->_bufferKmers.size() << endl; //cout << i << endl; DistanceCommand* cmd = dynamic_cast*>(_cmds[i]); cmd->setup(mergeCommand->_bufferIndex[i], mergeCommand->_bufferKmers[i], mergeCommand->_bufferCounts[i]); } //MergeCommand* mergeCommand = dynamic_cast*>(_mergeCommand); mergeCommand->reset(); //cout << "start dispatch" << endl; getDispatcher()->dispatchCommands(_cmds, 0); //cout << "end dispatch" << endl; resetCommands(); }*/ void removeStorage(Parameter& p){ //Storage* storage = 0; //storage = 
StorageFactory(STORAGE_HDF5).create (p.outputDir + "/stats/part_" + SimkaAlgorithm<>::toString(p.partitionId) + ".stats", true, true); //LOCAL (storage); } /* PARALLEL void saveStats(Parameter& p, const u_int64_t nbDistinctKmers, const u_int64_t nbSharedDistinctKmers){ _stats = new SimkaStatistics(_nbBanks, p.computeSimpleDistances, p.computeComplexDistances, p.outputDir, _datasetIds); for (size_t i=0; i<_nbCores; i++){ DistanceCommand* cmd = dynamic_cast*>(_cmds[i]); cmd->_processor->end(); (*_stats) += (*cmd->_stats); } //loadCountInfo(); string filename = p.outputDir + "/stats/part_" + SimkaAlgorithm<>::toString(p.partitionId) + ".gz"; _stats->_nbDistinctKmers = nbDistinctKmers; _stats->_nbSharedKmers = nbSharedDistinctKmers; _stats->save(filename); //storage->getGroup("")); delete _stats; //string filename = p.outputDir + "/stats/part_" + SimkaAlgorithm<>::toString(p.partitionId) + ".gz"; //_processor->finishClones(_processors); //Storage* storage = 0; //storage = StorageFactory(STORAGE_HDF5).create (p.outputDir + "/stats/part_" + SimkaAlgorithm<>::toString(p.partitionId) + ".stats", true, false); //LOCAL (storage); //_stats->save(filename); //storage->getGroup("")); //cout << _stats->_nbKmers << endl; //_processors[0]->forget(); //_processor->forget(); }*/ void saveStats(Parameter& p){ string filename = p.outputDir + "/stats/part_" + SimkaAlgorithm<>::toString(p.partitionId) + ".gz"; _stats->save(filename); //storage->getGroup("")); //string filename = p.outputDir + "/stats/part_" + SimkaAlgorithm<>::toString(p.partitionId) + ".gz"; //_processor->finishClones(_processors); //Storage* storage = 0; //storage = StorageFactory(STORAGE_HDF5).create (p.outputDir + "/stats/part_" + SimkaAlgorithm<>::toString(p.partitionId) + ".stats", true, false); //LOCAL (storage); //_stats->save(filename); //storage->getGroup("")); //cout << _stats->_nbKmers << endl; //_processors[0]->forget(); //_processor->forget(); } void writeFinishSignal(Parameter& p){ string finishFilename = 
p.outputDir + "/merge_synchro/" + SimkaAlgorithm<>::toString(p.partitionId) + ".ok"; IFile* file = System::file().newFile(finishFilename, "w"); delete file; } private: size_t _nbBanks; bool _computeSimpleDistances; bool _computeComplexDistances; size_t _kmerSize; float _minShannonIndex; pair _abundanceThreshold; vector _datasetIds; size_t _partitionId; //vector*> _processors; IteratorListener* _progress; vector _cmds; ICommand* _mergeCommand; size_t _nbCores; SimkaStatistics* _stats; SimkaCountProcessorSimple* _processor; u_int64_t _nbDistinctKmers; u_int64_t _nbSharedDistinctKmers; }; class SimkaMerge : public Tool { public: SimkaMerge () : Tool ("SimkaMerge") { //Original input filename given to simka. Used to recreate dataset id list getParser()->push_back (new OptionOneParam (STR_NB_CORES, "nb cores", true)); getParser()->push_back (new OptionOneParam (STR_KMER_SIZE, "kmer size", true)); getParser()->push_back (new OptionOneParam (STR_URI_INPUT, "input filename", true)); getParser()->push_back (new OptionOneParam ("-out-tmp-simka", "tmp output", true)); getParser()->push_back (new OptionOneParam ("-partition-id", "bank name", true)); getParser()->push_back (new OptionOneParam ("-nb-cores", "bank name", true)); getParser()->push_back (new OptionOneParam ("-max-memory", "bank name", true)); getParser()->push_back (new OptionOneParam (STR_SIMKA_MIN_KMER_SHANNON_INDEX, "bank name", true)); getParser()->push_back (new OptionNoParam (STR_SIMKA_COMPUTE_ALL_SIMPLE_DISTANCES.c_str(), "compute simple distances")); getParser()->push_back (new OptionNoParam (STR_SIMKA_COMPUTE_ALL_COMPLEX_DISTANCES.c_str(), "compute complex distances")); } void execute () { size_t nbCores = getInput()->getInt(STR_NB_CORES); size_t kmerSize = getInput()->getInt(STR_KMER_SIZE); size_t partitionId = getInput()->getInt("-partition-id"); string inputFilename = getInput()->getStr(STR_URI_INPUT); string outputDir = getInput()->getStr("-out-tmp-simka"); double minShannonIndex = 
getInput()->getDouble(STR_SIMKA_MIN_KMER_SHANNON_INDEX); bool computeSimpleDistances = getInput()->get(STR_SIMKA_COMPUTE_ALL_SIMPLE_DISTANCES); bool computeComplexDistances = getInput()->get(STR_SIMKA_COMPUTE_ALL_COMPLEX_DISTANCES); Parameter params(getInput(), inputFilename, outputDir, partitionId, kmerSize, minShannonIndex, computeSimpleDistances, computeComplexDistances, nbCores); Integer::apply (kmerSize, params); } template struct Functor { void operator () (Parameter& p) { SimkaMergeAlgorithm(p).execute(); } }; }; int main (int argc, char* argv[]) { try { SimkaMerge().run (argc, argv); } catch (Exception& e) { std::cout << "EXCEPTION: " << e.getMessage() << std::endl; return EXIT_FAILURE; } } //! [snippet1] simka-1.5.1/src/SimkaPotara.cpp000077500000000000000000000157721353413740300163520ustar00rootroot00000000000000/***************************************************************************** * Simka: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2015 INRIA * Authors: G.Benoit, C.Lemaitre, P.Peterlongo * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . 
*****************************************************************************/ #include "SimkaPotara.hpp" /* TODO: - Faire la config dans un job a part (job_count.bash) pour avoir la même config pour les job de comptage et la config - Verifier les paramètre passer au jobs généré (nbcores, maxmemory...) */ SimkaPotara::SimkaPotara(const string& execFilename) : Tool ("Simka") { _execFilename = execFilename; Simka::createOptionsParser(getParser()); //Kmer parser IOptionsParser* coreParser = getParser()->getParser("core"); //clusterParser->push_back (new OptionNoParam (STR_SIMKA_CLUSTER_MODE, "enable cluster mode. All cluster args below must be set", false)); coreParser->push_back (new OptionOneParam (STR_SIMKA_NB_JOB_COUNT, "maximum number of simultaneous counting jobs (a higher value improve execution time but increase temporary disk usage)", false)); coreParser->push_back (new OptionOneParam (STR_SIMKA_NB_JOB_MERGE, "maximum number of simultaneous merging jobs (1 job = 1 core)", false)); IOptionsParser* clusterParser = new OptionsParser ("cluster"); //clusterParser->push_back (new OptionOneParam (STR_SIMKA_NB_PARTITIONS, "nb partitions", false, "0" )); clusterParser->push_back (new OptionOneParam (STR_SIMKA_JOB_COUNT_COMMAND, "command to submit counting job", false )); clusterParser->push_back (new OptionOneParam (STR_SIMKA_JOB_MERGE_COMMAND, "command to submit merging job", false )); clusterParser->push_back (new OptionOneParam (STR_SIMKA_JOB_COUNT_FILENAME, "filename to the couting job template", false )); clusterParser->push_back (new OptionOneParam (STR_SIMKA_JOB_MERGE_FILENAME, "filename to the merging job template", false )); //getParser()->push_back(coreParser); getParser()->push_back(clusterParser); //getParser()->getParser("core")->getParser(STR_NB_CORES)->setHelp("number of cores per counting job"); //if (Option* p = dynamic_cast (getParser()->getParser(STR_MAX_MEMORY))) { p->setHelp("max memory per counting job (in MBytes) "); } //if (Option* p = 
dynamic_cast (getParser()->getParser(STR_NB_CORES))) { p->setHelp("number of cores per job"); } //coreParser->push_back(new OptionOneParam(parser->getParser(STR_NB_CORES)->getName(), parser->getParser(STR_NB_CORES)->getHelp(), false, "0")); //if (IOptionsParser* input = dskParser->getParser (STR_KMER_ABUNDANCE_MIN_THRESHOLD)) { input->setVisible (false); } /* IOptionsParser* parser = getParser(); IOptionsParser* dskParser = SortingCountAlgorithm<>::getOptionsParser(); parser->push_back (dskParser, 1); parser->push_back(dskParser); parser->getParser (STR_URI_INPUT)->setHelp("input file of datasets and their id. One dataset per line: dataset_id dataset_filename"); parser->getParser (STR_KMER_ABUNDANCE_MIN_THRESHOLD)->setVisible (false); parser->getParser (STR_HISTOGRAM_MAX)->setVisible (false); parser->getParser (STR_URI_SOLID_KMERS)->setVisible (false); parser->getParser (STR_URI_OUTPUT_DIR)->setHelp("output directory for temporary files"); parser->getParser (STR_URI_OUTPUT)->setHelp("output directory for result files"); parser->getParser (STR_SOLIDITY_KIND)->setHelp("TODO"); parser->getParser (STR_MINIMIZER_TYPE)->setVisible (false); parser->getParser (STR_MINIMIZER_SIZE)->setVisible (false); parser->getParser (STR_REPARTITION_TYPE)->setVisible (false); if (Option* p = dynamic_cast (parser->getParser(STR_KMER_ABUNDANCE_MIN))) { p->setDefaultValue ("0"); } parser->push_back (new OptionNoParam (STR_SOLIDITY_PER_DATASET.c_str(), "Do not take into consideration multi-counting when determining solidity of kmers", false )); */ /* parser->push_back (new OptionOneParam (STR_URI_INPUT, "reads file", true )); parser->push_back (new OptionOneParam (STR_KMER_SIZE, "size of a kmer", false, "31" )); parser->push_back (new OptionOneParam (STR_KMER_ABUNDANCE_MIN,"min abundance threshold for solid kmers", false, "3" )); parser->push_back (new OptionOneParam (STR_KMER_ABUNDANCE_MAX,"min abundance threshold for solid kmers", false, "3" )); parser->push_back (new OptionOneParam 
(STR_MAX_MEMORY, "max memory (in MBytes)", false, "2000")); parser->push_back (new OptionOneParam (STR_URI_OUTPUT_DIR, "output folder for solid kmers", false)); parser->push_back (new OptionOneParam (STR_URI_OUTPUT, "output file", false)); */ //setParser (parser); } struct Parameter { //Parameter (Simka& simka, IProperties* props) : props(props) {} Parameter (IProperties* props, const string& execFilename) : _props(props), _execFilename(execFilename) {} //Simka& _simka; IProperties* _props; string _execFilename; /* string _inputFilename; string _outputDir; size_t _kmerSize; pair _abundanceThreshold; bool _soliditySingle;*/ }; template struct Functor { void operator () (Parameter p) { /* cout << "SimkaAlgo.cpp 1" << endl; clear(); delete _banks; cout << "SimkaAlgo.cpp 2" << endl; SimkaFusion* simkaFusion = new SimkaFusion(_options, _inputFilename, _outputDir, _outputDirTemp, _nbReadsPerDataset, _maxNbReads); simkaFusion->execute(); return;*/ SimkaPotaraAlgorithm simkaAlgorithm (p._props, p._execFilename); simkaAlgorithm.execute(); /* #ifdef SIMKA_MIN simkaAlgorithm.executeSimkamin(); #else #endif*/ }}; void SimkaPotara::execute () { IProperties* input = getInput(); //Parameter params(*this, getInput()); Parameter params(input, _execFilename); size_t kmerSize = getInput()->getInt (STR_KMER_SIZE); Integer::apply (kmerSize, params); } int main (int argc, char* argv[]) { try { // We run the tool with the provided command line arguments. 
//cout << argv[0] << endl; SimkaPotara(string(argv[0])).run (argc, argv); } catch (Exception& e) { std::cout << "EXCEPTION: " << e.getMessage() << std::endl; return EXIT_FAILURE; } return EXIT_SUCCESS; } simka-1.5.1/src/SimkaPotara.hpp000077500000000000000000001105111353413740300163420ustar00rootroot00000000000000/***************************************************************************** * Simka: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2015 INRIA * Authors: G.Benoit, C.Lemaitre, P.Peterlongo * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . 
*****************************************************************************/ #ifndef TOOLS_SIMKA_SRC_SIMKAFUSION_HPP_ #define TOOLS_SIMKA_SRC_SIMKAFUSION_HPP_ #include #include #include #include #include #include //#include //#include #include //#define CLUSTER //#define SERIAL #define SLEEP_TIME_SEC 1 const string STR_SIMKA_CLUSTER_MODE = "-cluster"; const string STR_SIMKA_NB_JOB_COUNT = "-max-count"; const string STR_SIMKA_NB_JOB_MERGE = "-max-merge"; const string STR_SIMKA_JOB_COUNT_COMMAND = "-count-cmd"; const string STR_SIMKA_JOB_MERGE_COMMAND = "-merge-cmd"; const string STR_SIMKA_JOB_COUNT_FILENAME = "-count-file"; const string STR_SIMKA_JOB_MERGE_FILENAME = "-merge-file"; class SimkaBankSample : public BankDelegate { public: SimkaBankSample (IBank* ref, u_int64_t nbRead) : BankDelegate (ref) { _nbRead = nbRead; } /** \copydoc tools::collections::Iterable::iterator */ Iterator* iterator () { Iterator* it = _ref->iterator (); std::vector*> iterators = it->getComposition(); TruncateIterator* truncIt = new TruncateIterator(*iterators[0], _nbRead); return truncIt; } private: u_int64_t _nbRead; }; template class SimkaNullProcessor : public CountProcessorAbstract{ public: typedef typename Kmer::Type Type; //typedef typename Kmer::Count Count; SimkaNullProcessor(){} ~SimkaNullProcessor(){} CountProcessorAbstract* clone () { return new SimkaNullProcessor (); } void finishClones (vector*>& clones){} bool process (size_t partId, const typename Kmer::Type& kmer, const CountVector& count, CountNumber sum){return false;} }; template class SimkaCompProcessor : public CountProcessorAbstract{ public: SimkaCompProcessor(KmerCountCompressor* comp){ _comp = comp; } ~SimkaCompProcessor(){} CountProcessorAbstract* clone () { return new SimkaCompProcessor (_comp); } void finishClones (vector*>& clones){} bool process (size_t partId, const typename Kmer::Type& kmer, const CountVector& count, CountNumber sum){ _comp->insert(partId, kmer, count); return true; } private: 
KmerCountCompressor* _comp; }; class SimkaBankTemp : public BankDelegate { public: u_int64_t _refNbReads; u_int64_t _refTotalSeqSize; u_int64_t _refMaxReadSize; /** Constructor. * \param[in] ref : referred bank. * \param[in] filter : functor that filters sequence. */ SimkaBankTemp (IBank* ref, u_int64_t maxReads) : BankDelegate (ref) { _maxReads = maxReads; //_nbBanks = ref->getCompositionNb(); ref->estimate(_refNbReads, _refTotalSeqSize, _refMaxReadSize); //cout << _refNbReads << endl; //cout << _refTotalSeqSize << endl; //cout << _refMaxReadSize << endl; } void estimate (u_int64_t& number, u_int64_t& totalSize, u_int64_t& maxSize){ if(_maxReads == 0){ number = _refNbReads; totalSize = _refTotalSeqSize; maxSize = _refMaxReadSize; } else{ u_int64_t maxReads = _maxReads; //u_int64_t maxReads = 0; //for(size_t i=0; i<_nbBanks; i++){ // maxReads += _maxReads * _nbPaireds[i]; //} //cout << _refNbReads << endl; //cout << _maxReads*_nbBanks << endl; maxReads = min (maxReads, _refNbReads); //cout << "ha " << maxReads << endl; if(maxReads == _refNbReads){ number = _refNbReads; totalSize = _refTotalSeqSize; maxSize = _refMaxReadSize; } else{ number = maxReads; double factor = (double)maxReads / (double)_refNbReads; totalSize = _refTotalSeqSize * factor; maxSize = _refMaxReadSize; } } //number = _maxReads; //totalSize = (_totalSizeRef*_nbReadToProcess)/_numberRef; //maxSize = _maxSizeRef; //cout << number2 << endl; //u_int64_t readSize = totalSize2 / number2; //cout << "lal:" << number2 << endl; //number = _maxReads; //number = _nbReadToProcess; //totalSize = _nbReadToProcess*readSize; //maxSize = readSize; //cout << number << endl; //cout << totalSize << endl; //cout << maxSize << endl; } u_int64_t _maxReads; }; template class SimkaPotaraAlgorithm : public SimkaAlgorithm{ public: typedef typename Kmer::Type Type; SimkaPotaraAlgorithm(IProperties* options, const string& execFilename): SimkaAlgorithm(options) { _isClusterMode = false; //cout << "lala" << endl; //cout << 
_execDir << endl; _execDir = System::file().getRealPath(execFilename); _execDir = System::file().getDirectory(_execDir) + "/"; //_options = options; //_inputFilename = _options->getStr(STR_URI_INPUT); //_outputDir = _options->getStr(STR_URI_OUTPUT); //_outputDirTemp = _options->getStr(STR_URI_OUTPUT_TMP); //_maxNbReads = _options->getInt(STR_SIMKA_MAX_READS); //_maxJobCount = _options->getInt(STR_SIMKA_NB_JOB_COUNT); //_maxJobMerge = _options->getInt(STR_SIMKA_NB_JOB_MERGE); //_jobCountFilename = _options->getStr(STR_SIMKA_JOB_COUNT_FILENAME); //_jobMergeFilename = _options->getStr(STR_SIMKA_JOB_MERGE_FILENAME); //_jobCountCommand = _options->getStr(STR_SIMKA_JOB_COUNT_COMMAND); //_jobMergeCommand = _options->getStr(STR_SIMKA_JOB_MERGE_COMMAND); //string solidFilename = _outputDir + "/solid/" + p.bankName + suffix + ".h5"; //cout << "SimkaFusion constructor " << _outputDirTemp << endl; } ~SimkaPotaraAlgorithm(){ } void execute(){ parseArgs(); setup(); if(!SimkaAlgorithm::isInputValid()) exit(1); SimkaAlgorithm::computeMaxReads(); createConfig(); count(); printCountInfo(); merge(); stats(); if(this->_options->getInt(STR_VERBOSE) != 0){ cout << endl; cout << "Output dir: " << this->_outputDir << endl; cout << endl; } //bool keepTempFiles = false; if(!this->_keepTmpFiles){ string command = "rm -rf " + this->_outputDirTemp + "/solid/"; system(command.c_str()); command = "rm -rf " + this->_outputDirTemp + "/temp/"; system(command.c_str()); command = "rm -rf " + this->_outputDirTemp + "/count_synchro/"; system(command.c_str()); command = "rm -rf " + this->_outputDirTemp + "/merge_synchro/"; system(command.c_str()); command = "rm -rf " + this->_outputDirTemp + "/stats/"; system(command.c_str()); command = "rm -rf " + this->_outputDirTemp + "/job_count/"; system(command.c_str()); command = "rm -rf " + this->_outputDirTemp + "/job_merge/"; system(command.c_str()); command = "rm -rf " + this->_outputDirTemp + "/kmercount_per_partition/"; system(command.c_str()); command = 
"rm -rf " + this->_outputDirTemp + "/input/"; system(command.c_str()); command = "rm " + this->_outputDirTemp + "/config.h5"; system(command.c_str()); command = "rm " + this->_outputDirTemp + "/datasetIds"; system(command.c_str()); //cout << command << endl; //System::file().rmdir(this->_outputDirTemp); //System::file().mkdir(this->_outputDirTemp + "/solid/", -1); //System::file().mkdir(this->_outputDirTemp + "/temp/", -1); //System::file().mkdir(this->_outputDirTemp + "/log/", -1); //System::file().mkdir(this->_outputDirTemp + "/count_synchro/", -1); //System::file().mkdir(this->_outputDirTemp + "/merge_synchro/", -1); //System::file().mkdir(this->_outputDirTemp + "/stats/", -1); //System::file().mkdir(this->_outputDirTemp + "/job_count/", -1); //System::file().mkdir(this->_outputDirTemp + "/job_merge/", -1); //System::file().mkdir(this->_outputDirTemp + "/kmercount_per_partition/", -1); } } void parseArgs() { SimkaAlgorithm::parseArgs(); if(this->_options->get(STR_SIMKA_JOB_COUNT_FILENAME) || this->_options->get(STR_SIMKA_JOB_MERGE_FILENAME) || this->_options->get(STR_SIMKA_JOB_COUNT_COMMAND) || this->_options->get(STR_SIMKA_JOB_MERGE_COMMAND)){ _isClusterMode = true; _jobCountFilename = this->_options->getStr(STR_SIMKA_JOB_COUNT_FILENAME); _jobMergeFilename = this->_options->getStr(STR_SIMKA_JOB_MERGE_FILENAME); _jobCountCommand = this->_options->getStr(STR_SIMKA_JOB_COUNT_COMMAND); _jobMergeCommand = this->_options->getStr(STR_SIMKA_JOB_MERGE_COMMAND); if(! this->_options->get(STR_SIMKA_NB_JOB_COUNT) || this->_options->get(STR_SIMKA_NB_JOB_MERGE)){ //cout << endl; cout << "Cluster mode enable. 
Be sure to set correctly the following arguments if you have any job submission constraints:" << endl; cout << "\t" << STR_SIMKA_NB_JOB_COUNT << " : the maximum number of simultaneous couting" << endl; //job (each job will use up to " << STR_NB_CORES << " cores and " << STR_MAX_MEMORY << " MB memory)" << endl; cout << "\t" << STR_SIMKA_NB_JOB_MERGE << " : the maximum number of simultaneous merging job" << endl; // (each job will use up to 1 core and " << STR_MAX_MEMORY << " MB memory)" << endl; //cout << endl; } IFile* inputFile = System::file().newFile(_jobCountFilename, "rb"); inputFile->seeko(0, SEEK_END); u_int64_t size = inputFile->tell(); inputFile->seeko(0, SEEK_SET); char buffer2[size]; inputFile->fread(buffer2, size, size); string fileContents(buffer2, size); _jobCountContents = fileContents; delete inputFile; inputFile = System::file().newFile(_jobMergeFilename, "rb"); inputFile->seeko(0, SEEK_END); size = inputFile->tell(); inputFile->seeko(0, SEEK_SET); char buffer3[size]; inputFile->fread(buffer3, size, size); string fileContents2(buffer3, size); _jobMergeContents = fileContents2; delete inputFile; } else{ _isClusterMode = false; } //_isClusterMode = true; //if(this->_options->get(STR_SIMKA_CLUSTER_MODE)){ //cout << "cluster mode activated" << endl; //cout << "\t-max-memory = memory per job" << endl; //cout << "\t-nb-cores = cores per job" << endl; //cout << endl; //if(_isClusterMode) // _maxJobMerge = max((int)_maxJobMerge, (int)30); //else{ // _maxJobMerge = max((int)_maxJobMerge, (int)30); //} //} /* _maxJobMerge = maxCores-1; size_t maxCoreCount = maxCores-1; size_t nbCoresCount = min(maxCoreCount, this->_nbCores); u_int64_t minMemory = 2000; size_t maxJobCountTemp = this->_maxMemory/minMemory; _maxJobCount = min(nbCoresCount, maxJobCountTemp); _memoryPerJob = this->_maxMemory / _maxJobCount; _coresPerJob = ceil(nbCoresCount / (float)_maxJobCount); cout << "Nb jobs count in parallel: " << _maxJobCount << endl; cout << "\tCores per jobs: " << 
_coresPerJob << endl; cout << "\tMemory per jobs: " << _memoryPerJob << endl; cout << "Nb jobs merge in parallel: " << _maxJobMerge << endl; cout << endl; */ } void setup(){ SimkaAlgorithm::setup(); createDirs(); layoutInputFilename(); } void layoutInputFilename(){ //SimkaAlgorithm::layoutInputFilename(); string datasetIdFilename = this->_outputDirTemp + "/" + "datasetIds"; IFile* datasetIdFile = System::file().newFile(datasetIdFilename, "wb"); for(size_t i=0; i_bankNames.size(); i++){ string bankName = this->_bankNames[i]; string bankIdLine = bankName + '\n'; datasetIdFile->fwrite(bankIdLine.c_str(), bankIdLine.size(), 1); } datasetIdFile->flush(); delete datasetIdFile; } void createDirs(){ /* string suffix = ""; suffix += "m" + _options->getStr(STR_SIMKA_MIN_READ_SIZE); suffix += "_s" + _options->getStr(STR_SIMKA_MIN_READ_SHANNON_INDEX); suffix += "_n" + SimkaAlgorithm<>::toString(_maxNbReads); suffix += "_p" + SimkaAlgorithm<>::toString(_nbAskedPartitions);*/ //_outputDirTemp = _outputDirTemp; // + "/" + suffix + "/"; //System::file().mkdir(_outputDirTemp, -1); System::file().mkdir(this->_outputDirTemp + "/solid/", -1); //System::file().mkdir(this->_outputDirTemp + "/solid/merged/", -1); System::file().mkdir(this->_outputDirTemp + "/temp/", -1); System::file().mkdir(this->_outputDirTemp + "/log/", -1); System::file().mkdir(this->_outputDirTemp + "/count_synchro/", -1); System::file().mkdir(this->_outputDirTemp + "/merge_synchro/", -1); System::file().mkdir(this->_outputDirTemp + "/stats/", -1); System::file().mkdir(this->_outputDirTemp + "/job_count/", -1); System::file().mkdir(this->_outputDirTemp + "/job_merge/", -1); System::file().mkdir(this->_outputDirTemp + "/kmercount_per_partition/", -1); } void createConfig(){ size_t maxCores = this->_nbCores; size_t maxMemory = this->_maxMemory; size_t minMemoryPerJobMB = 500; if(this->_options->get(STR_SIMKA_NB_JOB_COUNT)){ _maxJobCount = this->_options->getInt(STR_SIMKA_NB_JOB_COUNT); //maxCores = _maxJobCount; //TO 
REMOVE WHEN BUG IN DISPATCHER IS RESOLVED } else{ size_t maxjob_byCore = min(maxCores/2, this->_nbBanks); //size_t maxjob_byCore = min(maxCores, this->_nbBanks); //TO REMOVE WHEN BUG IN DISPATCHER IS RESOLVED maxjob_byCore = max(maxjob_byCore, (size_t)1); size_t maxjob_byMemory = maxMemory/minMemoryPerJobMB; maxjob_byMemory = max(maxjob_byMemory, (size_t) 1); size_t maxJobs = min(maxjob_byCore, maxjob_byMemory); _maxJobCount = maxJobs; //maxCores = _maxJobCount; //TO REMOVE WHEN BUG IN DISPATCHER IS RESOLVED } //_maxJobCount = 1; if(this->_options->get(STR_SIMKA_NB_JOB_MERGE)){ _maxJobMerge = this->_options->getInt(STR_SIMKA_NB_JOB_MERGE); } else{ _maxJobMerge = maxCores; /* if(this->_computeComplexDistances && this->_computeSimpleDistances){ _maxJobMerge = max((size_t)maxCores/4, (size_t)1); } else if(this->_computeSimpleDistances){ _maxJobMerge = max((size_t)maxCores/2, (size_t)1); } else if(this->_computeComplexDistances){ _maxJobMerge = max((size_t)maxCores/3, (size_t)1); } else{ _maxJobMerge = maxCores; }*/ } _maxJobCount = min(_maxJobCount, maxCores); _maxJobMerge = min(_maxJobMerge, maxCores); _coresPerJob = maxCores / _maxJobCount; _coresPerJob = max((size_t)1, _coresPerJob); _memoryPerJob = maxMemory / _maxJobCount; _memoryPerJob = max(_memoryPerJob, (size_t)minMemoryPerJobMB); _coresPerMergeJob = maxCores / _maxJobMerge; _coresPerMergeJob = max((size_t)1, _coresPerMergeJob); cout << endl; cout << "Maximum ressources used by Simka: " << endl; cout << "\t - " << _maxJobCount << " simultaneous processes for counting the kmers (per job: " << _coresPerJob << " cores, " << _memoryPerJob << " MB memory)" << endl; cout << "\t - " << _maxJobMerge << " simultaneous processes for merging the kmer counts (per job: " << _coresPerMergeJob << " cores, memory undefined)" << endl; cout << endl; //_coresPerJob = this->_nbCores; //_memoryPerJob = this->_maxMemory; string filename = this->_outputDirTemp + "/" + "config.h5"; if(System::file().doesExist(filename)){ try{ cout 
<< "\tconfig already exists (remove file " << filename << " to config again)" << endl; Storage* storage = StorageFactory(STORAGE_HDF5).load (filename); LOCAL (storage); Configuration* config = new Configuration(); config->load(storage->getGroup("")); _nbPartitions = config->_nb_partitions; delete config; Repartitor* repartitor = new Repartitor(); //LOCAL(repartitor); repartitor->load(storage->getGroup("")); delete repartitor; return; } catch (Exception& e) { cout << "\tcan't open config, computing it again" << endl; System::file().remove(filename); createConfig(); return; } } this->_options->setInt(STR_MAX_MEMORY, _memoryPerJob - _memoryPerJob/3); this->_options->setInt(STR_NB_CORES, _coresPerJob); Storage* storage = 0; storage = StorageFactory(STORAGE_HDF5).create (filename, true, false); LOCAL (storage); size_t chosenBankId = 0; SimkaSequenceFilter dummyFilter(0, 0); //vector*> banksToDelete; string inputDir = this->_outputDirTemp + "/input/"; u_int64_t maxPart = 0; for (size_t i=0; i_nbBanks; i++){ IBank* bank = Bank::open(inputDir + this->_bankNames[i]); LOCAL(bank); //size_t nbBank_ = bank->getCompositionNb(); SimkaBankTemp* simkaBank = new SimkaBankTemp(bank, this->_maxNbReads*this->_nbBankPerDataset[i]); //banksToDelete.push_back(simkaBank); ConfigurationAlgorithm testConfig(simkaBank, this->_options); testConfig.execute(); size_t part = testConfig.getConfiguration()._nb_partitions; if(part > maxPart){ maxPart = part; chosenBankId = i; } //delete simkaBank; /* u_int64_t nbReads = bank->estimateNbItems(); nbReads /= _nbBankPerDataset[i]; totalReads += nbReads; if(nbReads < minReads){ minReads = nbReads; //_smallerBankId = _bankNames[i]; } if(nbReads > maxReads){ maxReads = nbReads; _largerBankId = _bankNames[i]; }*/ } //maxPart += 2; //for(size_t i=0; i_options->setInt(STR_MAX_MEMORY, _memoryPerJob); IBank* inputbank = Bank::open(this->_banksInputFilename); LOCAL(inputbank); IBank* bank = Bank::open(this->_outputDirTemp + "/input/" + 
this->_bankNames[chosenBankId]); LOCAL(bank); //IBank* bank = Bank::open(_outputDirTemp + "/input/" + _bankNames[0]); //LOCAL(inputbank); //bank->finalize(); //u_int64_t nbSeqs = 1; //IBank* sampleBank = new SimkaBankSample(bank, nbSeqs); //SortingCountAlgorithm sortingCount (sampleBank, this->_options); //SimkaNullProcessor* proc = new SimkaNullProcessor(); //sortingCount.addProcessor (proc); // We launch the algorithm //sortingCount.execute(); //Configuration config = sortingCount.getConfig(); ConfigurationAlgorithm testConfig1(inputbank, this->_options); testConfig1.execute(); Configuration config1 = testConfig1.getConfiguration(); ConfigurationAlgorithm testConfig2(bank, this->_options); testConfig2.execute(); Configuration config2 = testConfig2.getConfiguration(); //IBank* inputbank = Bank::open(this->_banksInputFilename); //LOCAL(inputbank); //IBank* sampleBank2 = new SimkaBankSample(inputbank, nbSeqs); //SortingCountAlgorithm sortingCount2 (sampleBank2, this->_options); //SimkaNullProcessor* proc2 = new SimkaNullProcessor(); //sortingCount2.addProcessor (proc2); //sortingCount2.execute(); //Configuration config2 = sortingCount2.getConfig(); //cout << config2._nb_partitions << endl; _nbPartitions = max((size_t)maxPart, (size_t)_maxJobMerge); //_nbPartitions = max(_nbPartitions, (size_t)32); cout << "Nb partitions: " << _nbPartitions << " partitions" << endl << endl << endl; //_nbPartitions = max((int)_nbPartitions, (int)30); config1._nb_partitions = _nbPartitions; config2._nb_partitions = _nbPartitions; RepartitorAlgorithm repart (inputbank, storage->getGroup(""), config1); repart.execute (); uint64_t memoryUsageCachedItems; config2._nb_cached_items_per_core_per_part = 1 << 8; // cache at least 256 items (128 here, then * 2 in the next while loop) do { config2._nb_cached_items_per_core_per_part *= 2; memoryUsageCachedItems = 1LL * config2._nb_cached_items_per_core_per_part *config2._nb_partitions * config2._nbCores * sizeof(Type); } while 
(memoryUsageCachedItems < config2._max_memory * MBYTE / 10); /* if(_isClusterMode){ //config._nb_cached_items_per_core_per_part = 100000; _nbPartitions = _maxJobMerge; config._nb_partitions = _nbPartitions; uint64_t memoryUsageCachedItems; config._nb_cached_items_per_core_per_part = 1 << 8; // cache at least 256 items (128 here, then * 2 in the next while loop) do { config._nb_cached_items_per_core_per_part *= 2; memoryUsageCachedItems = 1LL * config._nb_cached_items_per_core_per_part *config._nb_partitions * config._nbCores * sizeof(Type); } while (memoryUsageCachedItems < config._max_memory * MBYTE / 10); //cout << config._nb_cached_items_per_core_per_part << endl; } else{ _nbPartitions = config._nb_partitions; }*/ //IBank* inputbank = Bank::open(this->_banksInputFilename); //LOCAL(inputbank); //ConfigurationAlgorithm inputconfig(inputbank, this->_options); //inputconfig.execute(); //RepartitorAlgorithm repart (inputbank, storage->getGroup(""), config); //repart.execute (); //setRepartitor (new Repartitor(storage->getGroup("minimizers"))); //SortingCountAlgorithm sortingCount (sampleBank, _options); config2.save(storage->getGroup("")); //sortingCount.getRepartitor()->save(storage->getGroup("")); //delete sampleBank; //setStorage (storage); //delete storage; //sampleBank->forget(); } void removeMergeSynchro(){ for (size_t i=0; i_bankNames.size(); i++){ string finishFilename = this->_outputDirTemp + "/merge_synchro/" + this->_bankNames[i] + ".ok"; if(System::file().doesExist(finishFilename)) System::file().remove(finishFilename); } } void printCountInfo(){ char * pEnd; vector kmerPerParts(_nbPartitions, 0); for(size_t i=0; i_bankNames.size(); i++){ //cout << filename << endl; string line; size_t currentPart = 0; ifstream file((this->_outputDirTemp + "/kmercount_per_partition/" + this->_bankNames[i] + ".txt").c_str()); size_t j = 0; while(getline(file, line)){ if(line == "") continue; kmerPerParts[j] += strtoull(line.c_str(), NULL, 10); j += 1; } file.close(); } 
cout << endl << endl << "Kmer repartition" << endl; for(size_t i=0; i_outputDirTemp + "/log/count_*)" << endl; for (size_t i=0; i<_nbPartitions; i++){ //System::file().mkdir(this->_outputDirTemp + "/solid/merged/part_" + Stringify::format("%i", i), -1); System::file().mkdir(this->_outputDirTemp + "/solid/part_" + Stringify::format("%i", i), -1); } vector commands; _progress = new ProgressSynchro ( this->createIteratorListener (this->_bankNames.size(), "Counting datasets"), System::thread().newSynchronizer()); _progress->init (); vector filenameQueue; vector filenameQueueToRemove; size_t nbJobs = 0; for (size_t i=0; i_bankNames.size(); i++){ string logFilename = this->_outputDirTemp + "/log/count_" + this->_bankNames[i] + ".txt"; string finishFilename = this->_outputDirTemp + "/count_synchro/" + this->_bankNames[i] + ".ok"; if(System::file().doesExist(finishFilename)){ _progress->inc(1); cout << "\t" << this->_bankNames[i] << " already counted (remove file " << finishFilename << " to count again)" << endl; continue; } //else{ string tempDir = this->_outputDirTemp + "/temp/" + this->_bankNames[i]; string command = "nohup " + _execDir + "/simkaCountProcess " + _execDir + "/simkaCount "; command += " " + string(STR_KMER_SIZE) + " " + SimkaAlgorithm<>::toString(this->_kmerSize); command += " " + string("-out-tmp-simka") + " " + this->_outputDirTemp; command += " " + string("-out-tmp") + " " + tempDir; command += " -bank-name " + this->_bankNames[i]; command += " -bank-index " + SimkaAlgorithm<>::toString(i); command += " -nb-datasets " + SimkaAlgorithm<>::toString(this->_nbBankPerDataset[i]); command += " " + string(STR_MAX_MEMORY) + " " + SimkaAlgorithm<>::toString(_memoryPerJob); command += " " + string(STR_NB_CORES) + " " + SimkaAlgorithm<>::toString(_coresPerJob); command += " " + string(STR_URI_INPUT) + " dummy "; command += " " + string(STR_KMER_ABUNDANCE_MIN) + " " + SimkaAlgorithm<>::toString(this->_abundanceThreshold.first); command += " " + 
string(STR_KMER_ABUNDANCE_MAX) + " " + SimkaAlgorithm<>::toString(this->_abundanceThreshold.second); command += " " + string(STR_SIMKA_MIN_READ_SIZE) + " " + SimkaAlgorithm<>::toString(this->_minReadSize); command += " " + string(STR_SIMKA_MIN_READ_SHANNON_INDEX) + " " + Stringify::format("%f", this->_minReadShannonIndex); command += " " + string(STR_SIMKA_MAX_READS) + " " + SimkaAlgorithm<>::toString(this->_maxNbReads); command += " -nb-partitions " + SimkaAlgorithm<>::toString(_nbPartitions); //command += " -verbose " + Stringify::format("%d", this->_options->getInt(STR_VERBOSE)); command += " >> " + logFilename + " 2>&1"; filenameQueue.push_back(this->_bankNames[i]); System::file().mkdir(tempDir, -1); string str = "Counting dataset " + SimkaAlgorithm<>::toString(i) + "\n"; str += "\t" + command + "\n\n\n"; system(("echo \"" + str + "\" > " + logFilename).c_str()); //cout << "Counting dataset " << i << endl; //cout << "\t" << command << endl; removeMergeSynchro(); //_progress->inc(1); //nanosleep((const struct timespec[]){{0, 10000000L}}, NULL); if(_isClusterMode){ string jobFilename = this->_outputDirTemp + "/job_count/job_count_" + SimkaAlgorithm<>::toString(i) + ".bash"; IFile* jobFile = System::file().newFile(jobFilename.c_str(), "w"); system(("chmod 755 " + jobFilename).c_str()); string jobCommand = _jobCountContents + '\n' + '\n'; jobCommand += command; jobFile->fwrite(jobCommand.c_str(), jobCommand.size(), 1); jobFile->flush(); string submitCommand = _jobCountCommand + " " + jobFile->getPath(); delete jobFile; system(submitCommand.c_str()); } else{ command += " &"; system(command.c_str()); } nbJobs += 1; //cout << "job started" << endl; if(nbJobs >= _maxJobCount){ while(true){ bool isJobAvailbale = false; for(size_t j=0; j_outputDirTemp + "/count_synchro/" + filenameQueue[j] + ".ok"; if(System::file().doesExist(finishFilename2)){ filenameQueueToRemove.push_back(filenameQueue[j]); isJobAvailbale = true; nbJobs -= 1; //cout << "job finished" << endl; 
_progress->inc(1); } } if(isJobAvailbale){ for(size_t j=0; j= this->_bankNames.size()) break; } } } while(nbJobs > 0){ bool isJobAvailbale = false; for(size_t j=0; j_outputDirTemp + "/count_synchro/" + filenameQueue[j] + ".ok"; if(System::file().doesExist(finishFilename2)){ filenameQueueToRemove.push_back(filenameQueue[j]); isJobAvailbale = true; nbJobs -= 1; _progress->inc(1); } } if(isJobAvailbale){ for(size_t j=0; jfinish(); delete _progress; } void merge(){ cout << endl << "Merging k-mer counts and computing distances... (log files are " + this->_outputDirTemp + "/log/merge_*)" << endl; _progress = new ProgressSynchro ( this->createIteratorListener (_nbPartitions, "Merging datasets"), System::thread().newSynchronizer()); _progress->init (); vector filenameQueue; vector filenameQueueToRemove; size_t nbJobs = 0; for (size_t i=0; i<_nbPartitions; i++){ string datasetId = SimkaAlgorithm<>::toString(i); string finishFilename = this->_outputDirTemp + "/merge_synchro/" + datasetId + ".ok"; string logFilename = this->_outputDirTemp + "/log/merge_" + datasetId + ".txt"; if(System::file().doesExist(finishFilename)){ _progress->inc(1); cout << "\t" << datasetId << " already merged (remove file " << finishFilename << " to merge again)" << endl; } else{ //if(System::file().doesExist(finishFilename)){ // System::file().remove(finishFilename); // cout << "\t" << _bankNames[i] << " already (remove file " << finishFilename << " to count again)" << endl; //} //else{ filenameQueue.push_back(datasetId); string command = "nohup " + _execDir + "/simkaMerge "; command += " " + string(STR_KMER_SIZE) + " " + SimkaAlgorithm<>::toString(this->_kmerSize); command += " " + string(STR_URI_INPUT) + " " + this->_inputFilename; command += " " + string("-out-tmp-simka") + " " + this->_outputDirTemp; command += " -partition-id " + SimkaAlgorithm<>::toString(i); command += " " + string(STR_MAX_MEMORY) + " " + SimkaAlgorithm<>::toString(this->_maxMemory / this->_nbCores); command += " " + 
string(STR_NB_CORES) + " " + SimkaAlgorithm<>::toString(_coresPerMergeJob); command += " " + string(STR_SIMKA_MIN_KMER_SHANNON_INDEX) + " " + Stringify::format("%f", this->_minKmerShannonIndex); command += " -verbose " + Stringify::format("%d", this->_options->getInt(STR_VERBOSE)); if(this->_computeSimpleDistances) command += " " + string(STR_SIMKA_COMPUTE_ALL_SIMPLE_DISTANCES); if(this->_computeComplexDistances) command += " " + string(STR_SIMKA_COMPUTE_ALL_COMPLEX_DISTANCES); command += " >> " + logFilename + " 2>&1"; //SimkaDistanceParam distanceParams(this->_options); //if(distanceParams._computeBrayCurtis) command += " " + STR_SIMKA_DISTANCE_BRAYCURTIS + " "; //if(distanceParams._computeCanberra) command += " " + STR_SIMKA_DISTANCE_CANBERRA + " "; //if(distanceParams._computeChord) command += " " + STR_SIMKA_DISTANCE_CHORD + " "; //if(distanceParams._computeHellinger) command += " " + STR_SIMKA_DISTANCE_HELLINGER + " "; //if(distanceParams._computeKulczynski) command += " " + STR_SIMKA_DISTANCE_KULCZYNSKI + " "; string str = "Merging partition " + SimkaAlgorithm<>::toString(i) + "\n"; str += "\t" + command + "\n\n\n"; system(("echo \"" + str + "\" > " + logFilename).c_str()); if(_isClusterMode){ string jobFilename = this->_outputDirTemp + "/job_merge/job_merge_" + SimkaAlgorithm<>::toString(i) + ".bash"; IFile* jobFile = System::file().newFile(jobFilename.c_str(), "w"); system(("chmod 755 " + jobFilename).c_str()); string jobCommand = _jobMergeContents + '\n' + '\n'; jobCommand += command; jobFile->fwrite(jobCommand.c_str(), jobCommand.size(), 1); jobFile->flush(); string submitCommand = _jobMergeCommand + " " + jobFile->getPath(); delete jobFile; system(submitCommand.c_str()); } else{ command += " &"; system(command.c_str()); } nbJobs += 1; } if(nbJobs >= _maxJobMerge){ while(true){ bool isJobAvailbale = false; for(size_t j=0; j_outputDirTemp + "/merge_synchro/" + filenameQueue[j] + ".ok"; if(System::file().doesExist(finishFilename2)){ 
filenameQueueToRemove.push_back(filenameQueue[j]); isJobAvailbale = true; nbJobs -= 1; _progress->inc(1); } } if(isJobAvailbale){ for(size_t j=0; j= this->_bankNames.size()) break; } } } //cout << nbJobs << endl; while(nbJobs > 0){ bool isJobAvailbale = false; for(size_t j=0; j_outputDirTemp + "/merge_synchro/" + filenameQueue[j] + ".ok"; if(System::file().doesExist(finishFilename2)){ filenameQueueToRemove.push_back(filenameQueue[j]); isJobAvailbale = true; nbJobs -= 1; _progress->inc(1); } } if(isJobAvailbale){ for(size_t j=0; jfinish(); delete _progress; } /* void getCountInfo(SimkaStatistics& mainStats){ for(size_t i=0; i_nbBanks; i++){ string name = this->_bankNames[i]; string countFilename = this->_outputDirTemp + "/count_synchro/" + name + ".ok"; string line; ifstream file(countFilename.c_str()); vector lines; while(getline(file, line)){ if(line == "") continue; lines.push_back(line); } file.close(); u_int64_t nbReads = strtoull(lines[0].c_str(), NULL, 10) ; mainStats._datasetNbReads[i] = nbReads; mainStats._nbSolidDistinctKmersPerBank[i] = strtoull(lines[1].c_str(), NULL, 10); //cout << mainStats._nbSolidDistinctKmersPerBank[i] << endl; mainStats._nbSolidKmersPerBank[i] = strtoull(lines[2].c_str(), NULL, 10); mainStats._chord_sqrt_N2[i] = sqrt(strtoull(lines[3].c_str(), NULL, 10)); } }*/ void stats(){ cout << endl << "Computing stats..." 
<< endl; //cout << this->_nbBanks << endl; //u_int64_t nbKmers = 0; //SimkaDistanceParam distanceParams(this->_options); SimkaStatistics mainStats(this->_nbBanks, this->_computeSimpleDistances, this->_computeComplexDistances, this->_outputDirTemp, this->_bankNames); for(size_t i=0; i<_nbPartitions; i++){ string filename = this->_outputDirTemp + "/stats/part_" + SimkaAlgorithm<>::toString(i) + ".gz"; //Storage* storage = StorageFactory(STORAGE_HDF5).load (this->_outputDirTemp + "/stats/part_" + SimkaAlgorithm<>::toString(i) + ".stats"); //LOCAL (storage); SimkaStatistics stats(this->_nbBanks, this->_computeSimpleDistances, this->_computeComplexDistances, this->_outputDirTemp, this->_bankNames); stats.load(filename); //cout << stats._nbDistinctKmers << " " << stats._nbKmers << endl; mainStats += stats; //nbKmers += stats._nbKmers; } //cout << "Nb kmers: " << nbKmers << endl; //getCountInfo(mainStats); //for(size_t i=0; i_nbBanks; i++){ // cout << mainStats._nbSolidDistinctKmersPerBank[i] << endl; //} mainStats.outputMatrix(this->_outputDir, this->_bankNames); #//ifdef PRINT_STATS if(this->_options->getInt(STR_VERBOSE) != 0) mainStats.print(); #//endif } //u_int64_t _maxMemory; //size_t _nbCores; size_t _memoryPerJob; size_t _coresPerJob; size_t _coresPerMergeJob; //IBank* _banks; //IProperties* _options; //string _inputFilename; //vector _bankNames; //vector _nbBankPerDataset; size_t _nbPartitions; //size_t _nbBanks; //vector _nbReadsPerDataset; //string _banksInputFilename; //vector _tempFilenamesToDelete; //u_int64_t _maxNbReads; //IBank* _sampleBank; string _execDir; bool _isClusterMode; size_t _maxJobCount; size_t _maxJobMerge; string _jobCountFilename; string _jobMergeFilename; string _jobCountCommand; string _jobMergeCommand; //u_int64_t _nbAskedPartitions; string _jobCountContents; string _jobMergeContents; IteratorListener* _progress; }; class SimkaPotara : public Tool{ public: SimkaPotara(const string& execFilename); void execute(); string _execFilename; }; 
#endif simka-1.5.1/src/core/000077500000000000000000000000001353413740300143445ustar00rootroot00000000000000simka-1.5.1/src/core/KmerCountCompressor.hpp000077500000000000000000000462251353413740300210550ustar00rootroot00000000000000 #ifndef _GATB_CORE_KMER_IMPL_KMER_COUNT_COMPRESSOR_HPP_ #define _GATB_CORE_KMER_IMPL_KMER_COUNT_COMPRESSOR_HPP_ /********************************************************************************/ #include #include #include #include #include #include #include #include #include using namespace std; using namespace gatb::core::system; using namespace gatb::core::system::impl; const u_int64_t MAX_MEMORY_PER_BLOCK = 100000; //#define INDEXING #define MONO_BANK /********************************************************************************/ namespace gatb { namespace core { namespace kmer { namespace impl { /********************************************************************************/ /********************************************************************* * ** KmerCountCoder *********************************************************************/ class KmerCountCoder { public: KmerCountCoder(int nbBanks, int partitionIndex) //_bankCountDeltaModel(3) { _nbBanks = nbBanks; _partitionIndex = partitionIndex; _nbKmers = 0; /* for(int i=0; i models; vector bankModels; for(int j=0; j models; vector bankModels; for(int j=0; j _kmerModel; vector > _bankModels; vector > _abundanceModels; //vector _bankDeltaModels; //vector _deltaModels; //vector _lastAbundances; //vector _lastBanks; u_int64_t _lastKmerValue; u_int64_t _lastNbBankCount; //vector _blockSizes; vector _bankCountModel; //Order0Model _bankCountDeltaModel; }; /********************************************************************* * ** KmerCountCompressorPartition *********************************************************************/ template class KmerCountCompressorPartition : public KmerCountCoder { public: /** Shortcuts. 
*/ typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; typedef typename Kmer::ModelCanonical::Kmer Kmer; KmerCountCompressorPartition(const string& outputDir, int partitionIndex, int nbBanks) : KmerCountCoder(nbBanks, partitionIndex) { string filename = outputDir + "/part_" + KmerCountCoder::toString(_partitionIndex); _outputFile = System::file().newFile(filename.c_str(), "wb"); } ~KmerCountCompressorPartition(){ //cout << _rangeEncoder.getBufferSize() << endl; /* string path = _outputFile->getPath(); cout << "Partition " << _partitionIndex << endl; cout << "\tNb kmers: " << _nbKmers << endl; cout << "\tCompressed size: " << System::file().getSize(path) << endl; cout << "\tByte per kmer count: " << System::file().getSize(path) / (float)_nbKmers<< endl;*/ delete _outputFile; } void flush(){ _rangeEncoder.flush(); writeBlock(); clear(); _rangeEncoder.clear(); CompressionUtils::encodeNumeric(_rangeEncoder, _kmerModel, _nbKmers); _rangeEncoder.flush(); //for(u_int64_t blockSize : _blockSizes){ // CompressionUtils::encodeNumeric(_rangeEncoder, _kmerModel, blockSize); //} _outputFile->fwrite((const char*) _rangeEncoder.getBuffer(true), _rangeEncoder.getBufferSize(), 1); _outputFile->flush(); } void insert(const Type& kmer, const CountVector& abundancePerBank){ _nbKmers += 1; u_int64_t kmerValue = kmer.getVal(); CompressionUtils::encodeNumeric(_rangeEncoder, _kmerModel, kmerValue - _lastKmerValue); _lastKmerValue = kmerValue; if(abundancePerBank.size() == 1){ CompressionUtils::encodeNumeric(_rangeEncoder, _bankCountModel, abundancePerBank[0]); } else{ //u_int64_t deltaValue; //u_int8_t deltaType; int modelIndex = 0; _banks.clear(); //int nbBankCount; for(size_t bankId=0; bankId 0){ _banks.push_back(bankId); //nbBankCount += 1; } } //deltaType = CompressionUtils::getDeltaValue(nbBankCount, _lastNbBankCount, &deltaValue); //_rangeEncoder.encode(_bankCountDeltaModel, deltaType); //CompressionUtils::encodeNumeric(_rangeEncoder, _bankCountModel, deltaValue); 
//_lastNbBankCount = nbBankCount; CompressionUtils::encodeNumeric(_rangeEncoder, _bankCountModel, _banks.size()); int lastBankId = 0; for(size_t i=0; i<_banks.size(); i++){ int bankId = _banks[i]; u_int16_t abundance = abundancePerBank[bankId]; //if(abundance == 0){ //} //else{ if(modelIndex >= _bankModels.size()){ addField(); } //deltaType = CompressionUtils::getDeltaValue(bankId, _lastBanks[modelIndex], &deltaValue); //_rangeEncoder.encode(_bankDeltaModels[modelIndex], deltaType); //CompressionUtils::encodeNumeric(_rangeEncoder, _bankModels[modelIndex], deltaValue); //_lastBanks[modelIndex] = bankId; CompressionUtils::encodeNumeric(_rangeEncoder, _bankModels[modelIndex], bankId - lastBankId); lastBankId = bankId; //deltaType = CompressionUtils::getDeltaValue(abundance, _lastAbundances[modelIndex], &deltaValue); //_rangeEncoder.encode(_deltaModels[modelIndex], deltaType); //CompressionUtils::encodeNumeric(_rangeEncoder, _abundanceModels[modelIndex], deltaValue); //_lastAbundances[modelIndex] = abundance; CompressionUtils::encodeNumeric(_rangeEncoder, _abundanceModels[modelIndex], abundance); modelIndex += 1; //} } } if(_rangeEncoder.getBufferSize() >= MAX_MEMORY_PER_BLOCK){ writeBlock(); } } void writeBlock(){ if(_rangeEncoder.getBufferSize() > 0){ //_rangeEncoder.flush(); //_blockSizes.push_back(_rangeEncoder.getBufferSize()); _outputFile->fwrite((const char*) _rangeEncoder.getBuffer(), _rangeEncoder.getBufferSize(), 1); } _rangeEncoder.clearBuffer(); //_rangeEncoder.clear(); //clear(); } u_int64_t getSizeByte(){ //cout << _outputFile->getPath() << endl; //cout << System::file().getSize(_outputFile->getPath()) << endl; return System::file().getSize(_outputFile->getPath()); } private: RangeEncoder _rangeEncoder; IFile* _outputFile; vector _banks; }; /********************************************************************* * ** KmerCountCompressor *********************************************************************/ template class KmerCountCompressor { public: /** 
Shortcuts. */ typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; typedef typename Kmer::ModelCanonical::Kmer Kmer; /** */ KmerCountCompressor(const string& outputDir, int nbPartitions, int nbBanks){ _rootDir = outputDir; // + "/dsk_output/"; _nbPartitions = nbPartitions; _nbBanks = nbBanks; System::file().rmdir(_rootDir); System::file().mkdir(_rootDir, -1); cout << _rootDir << endl; //cout << nbPartitions << endl; for(int i=0; i* comp = new KmerCountCompressorPartition(_rootDir, i, nbBanks); _partitionCompressors.push_back(comp); } } ~KmerCountCompressor(){ u_int64_t nbKmers = 0; u_int64_t size = 0; for(size_t i=0; i<_partitionCompressors.size(); i++){ KmerCountCompressorPartition* comp = _partitionCompressors[i]; comp->flush(); nbKmers += comp->getNbKmers(); size += comp->getSizeByte(); delete comp; } cout << "Compression statistics " << endl; cout << "\tNb kmers: " << nbKmers << endl; cout << "\tCompressed size: " << size << "B - " << size/MBYTE << " MB" << endl; cout << "\tByte per kmer count: " << size / (float) nbKmers<< endl; IFile* outputFile = System::file().newFile(_rootDir + "/dsk_count_data", "wb"); outputFile->print("%i %i", _nbPartitions, _nbBanks); outputFile->flush(); delete outputFile; } void insert(int partitionIndex, const Type& kmer, const CountVector& abundancePerBank){ _partitionCompressors[partitionIndex]->insert(kmer, abundancePerBank); } private: string _rootDir; int _nbPartitions; int _nbBanks; vector* > _partitionCompressors; }; /********************************************************************* * ** KmerCountDecompressorPartition *********************************************************************/ template class KmerCountDecompressorPartition : KmerCountCoder { public: typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; typedef typename Kmer::ModelCanonical::Kmer Kmer; KmerCountDecompressorPartition(const string& inputDir, int partitionIndex, int nbBanks, Functor* functor, 
gatb::core::tools::dp::IteratorListener* progress) : KmerCountCoder(nbBanks, partitionIndex) { _progress = progress; _functor = functor; _nbDecodedKmers = 0; _nbDecodedKmersProgress = 0; string filename = inputDir + "/part_" + KmerCountCoder::toString(_partitionIndex); _inputFile = new ifstream(filename.c_str(), ios::in|ios::binary); _inputFile->seekg(0, _inputFile->end); _rangeDecoder.setInputFile(_inputFile, true); _nbKmers = CompressionUtils::decodeNumeric(_rangeDecoder, _kmerModel); //cout << _nbKmers << endl; //_rangeEncoder.flush(); //for(u_int64_t blockSize : _blockSizes){ // CompressionUtils::encodeNumeric(_rangeEncoder, _kmerModel, blockSize); //} //_outputFile->fwrite((const char*) _rangeEncoder.getBuffer(true), _rangeEncoder.getBufferSize(), 1); clear(); _rangeDecoder.clear(); _inputFile->seekg(0, _inputFile->beg); _rangeDecoder.setInputFile(_inputFile); /* for(int i=0; i models; vector bankModels; for(int j=0; jexecute(kmer, abundancePerBanks); _nbDecodedKmers += 1; _nbDecodedKmersProgress += 1; if (_nbDecodedKmersProgress > 500000) { _progress->inc (_nbDecodedKmersProgress); _nbDecodedKmersProgress = 0; } } _progress->inc (_nbDecodedKmersProgress); } private: Functor* _functor; gatb::core::tools::dp::IteratorListener* _progress; RangeDecoder _rangeDecoder; ifstream* _inputFile; u_int64_t _nbDecodedKmers; u_int64_t _nbDecodedKmersProgress; }; /********************************************************************* * ** KmerCountDecompressor *********************************************************************/ template class KmerCountDecompressor : public gatb::core::tools::misc::impl::Algorithm { public: /** Shortcuts. 
*/ typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; typedef typename Kmer::ModelCanonical::Kmer Kmer; /** */ KmerCountDecompressor(const string& inputDir, int nbCores) : Algorithm("kcc", 0, 0) { //getInput()->setStr(STR_VERBOSE, "1"); _inputDir = inputDir; _nbCores = nbCores; IFile* dskCountDataFile = System::file().newFile(inputDir + "/dsk_count_data", "rb"); vector numbers; string n = ""; while(true){ u_int8_t c = dskCountDataFile->get(); if(c == ' ' || dskCountDataFile->isEOF()){ numbers.push_back(n); n.clear(); } else{ n += c; } if(dskCountDataFile->isEOF()) break; } //for(string& number: numbers){ // cout << number << endl; //} _nbPartitions = atoi(numbers[0].c_str()); _nbBanks = atoi(numbers[1].c_str()); //for(int i=0; i<_nbCores; i++) //for(int i=0; i<_nbPartitions; i++){ // KmerCountCompressorPartition decomp(inputDir, i, _nbBanks); // decomp.iterKmers(_nbCores); //} } ~KmerCountDecompressor(){ delete _progress; } void setupProgress(){ RangeDecoder rangeDecoder; vector kmerModel; for(int i=0; iseekg(0, file->end); rangeDecoder.setInputFile(file, true); u_int64_t nbKmers = CompressionUtils::decodeNumeric(rangeDecoder, kmerModel); totalKmers += nbKmers; rangeDecoder.clear(); for(int j=0; jinit (); } template static void *callMyFunction(void *object){ ((KmerCountDecompressorPartition*)object)->execute(); //object->execute(); return NULL; } void execute(){ } template void iterate (const Functor& functor, size_t groupSize=1000){ setupProgress(); pthread_t* tab_threads = new pthread_t[_nbCores]; //thread_arg_decoder * targ = new thread_arg_decoder [_nbCores]; vector* > _partitionDecompressors; for(int i=0; i<_nbPartitions;){ for(int j=0; j<_nbCores && i<_nbPartitions; j++){ Functor* func = new Functor(functor); KmerCountDecompressorPartition* decomp = new KmerCountDecompressorPartition(_inputDir, i, _nbBanks, func, _progress); _partitionDecompressors.push_back(decomp); i += 1; } //Lala * lala = new Lala(); for(int j=0; 
j<_partitionDecompressors.size(); j++){ //cout << "start" << endl; pthread_create(&tab_threads[j], NULL, &KmerCountDecompressor::callMyFunction, _partitionDecompressors[j]); //_partitionDecompressors[j]->execute(); //cout << "loulou" << endl; } for(int j=0; j<_partitionDecompressors.size(); j++){ pthread_join(tab_threads[j], NULL); } for(int j=0; j<_partitionDecompressors.size(); j++){ delete _partitionDecompressors[j]; } _partitionDecompressors.clear(); } delete tab_threads; _progress->finish (); } private: string _inputDir; int _nbCores; int _nbBanks; int _nbPartitions; gatb::core::tools::dp::IteratorListener* _progress; //vector* > _partitionDecompressors; }; /********************************************************************************/ } } } } /* end of namespaces. */ /********************************************************************************/ #endif /* _GATB_CORE_KMER_IMPL_KMER_COUNT_COMPRESSOR__HPP_ */ simka-1.5.1/src/core/Simka.cpp000077500000000000000000000257571353413740300161370ustar00rootroot00000000000000/***************************************************************************** * Simka: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2015 INRIA * Authors: G.Benoit, C.Lemaitre, P.Peterlongo * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . 
*****************************************************************************/ #include "Simka.hpp" #include "SimkaAlgorithm.hpp" IOptionsParser* Simka::createOptionsParser (IOptionsParser* parent) { IOptionsParser* parser = parent; //new OptionsParser ("Simka"); //Main parser parser->push_front (new OptionNoParam (STR_SIMKA_COMPUTE_DATA_INFO, "compute (and display) information before running Simka, such as the number of reads per dataset", false)); parser->push_front (new OptionNoParam (STR_SIMKA_KEEP_TMP_FILES, "keep temporary files", false)); parser->push_front (new OptionOneParam (STR_URI_OUTPUT_TMP, "output directory for temporary files", true)); parser->push_front (new OptionOneParam (STR_URI_OUTPUT, "output directory for result files (distance matrices)", false, "./simka_results")); parser->push_front (new OptionOneParam (STR_URI_INPUT, "input file of samples. One sample per line: id1: filename1...", true)); //parser->push_back (new OptionOneParam (STR_URI_OUTPUT_TMP, "output directory for temporary files", true)); //IOptionsParser* parser = getParser(); //IOptionsParser* dskParser = SortingCountAlgorithm<>::getOptionsParser(); //parser->push_back(dskParser); //dskParser->setVisible(false); //cout << parser->getParser(STR_NB_CORES) << endl; parser->getParser(STR_NB_CORES)->setVisible(false); //parser->push_back(new OptionOneParam(parser->getParser(STR_NB_CORES)->getName(), parser->getParser(STR_NB_CORES)->getHelp(), false, "0")); //parser->push_front(dskParser->getParser (STR_URI_OUTPUT_TMP)); //dskParser->getParser (STR_URI_OUTPUT_TMP)->setMandatory //parser->push_front(dskParser->getParser (STR_URI_OUTPUT)); //parser->getParser (STR_URI_OUTPUT)->setHelp("output directory for result files (similarity matrix, heatmaps)"); //parser->push_front(dskParser->getParser (STR_URI_INPUT)); //parser->getParser(STR_URI_INPUT)->setHelp("input file of datasets. 
One dataset per line: id filename1 filename2..."); //if (Option* p = dynamic_cast (parser->getParser(STR_URI_OUTPUT_TMP))) { p->s; } //Distance parser IOptionsParser* distanceParser = new OptionsParser ("distance"); distanceParser->push_back (new OptionNoParam (STR_SIMKA_COMPUTE_ALL_SIMPLE_DISTANCES, "compute all simple distances (Chord, Hellinger...)", false)); distanceParser->push_back (new OptionNoParam (STR_SIMKA_COMPUTE_ALL_COMPLEX_DISTANCES, "compute all complex distances (Jensen-Shannon...)", false)); //Kmer parser IOptionsParser* kmerParser = new OptionsParser ("kmer"); kmerParser->push_back (new OptionOneParam (STR_KMER_SIZE, "size of a kmer", false, "21")); //kmerParser->push_back(dskParser->getParser (STR_KMER_SIZE)); //kmerParser->push_back(new OptionOneParam (STR_KMER_PER_READ.c_str(), "number of selected kmers per read", false, "0")); //kmerParser->push_back (new OptionOneParam (STR_KMER_ABUNDANCE_MIN, "min abundance a kmer need to be considered", false, "1")); kmerParser->push_back (new OptionOneParam (STR_KMER_ABUNDANCE_MIN, "min abundance a kmer need to be considered", false, "2")); kmerParser->push_back (new OptionOneParam (STR_KMER_ABUNDANCE_MAX, "max abundance a kmer can have to be considered", false, "999999999")); //kmerParser->push_back(dskParser->getParser (STR_KMER_ABUNDANCE_MIN)); //if (Option* p = dynamic_cast (parser->getParser(STR_KMER_ABUNDANCE_MIN))) { p->setDefaultValue ("0"); } //if (Option* p = dynamic_cast (parser->getParser(STR_SOLIDITY_KIND))) { p->setDefaultValue ("all"); } //kmerParser->push_back(dskParser->getParser (STR_KMER_ABUNDANCE_MAX)); //kmerParser->push_back(dskParser->getParser (STR_SOLIDITY_KIND)); //kmerParser->getParser (STR_SOLIDITY_KIND)->setHelp("TODO"); //kmerParser->push_back (new OptionNoParam (STR_SIMKA_SOLIDITY_PER_DATASET.c_str(), "do not take into consideration multi-counting when determining solid kmers", false )); kmerParser->push_back (new OptionOneParam (STR_SIMKA_MIN_KMER_SHANNON_INDEX.c_str(), 
"minimal Shannon index a kmer should have to be kept. Float in [0,2]", false, "0" )); //Read filter parser IOptionsParser* readParser = new OptionsParser ("read"); readParser->push_back (new OptionOneParam (STR_SIMKA_MAX_READS.c_str(), "maximum number of reads per sample to process. Can be -1: use all reads. Can be 0: estimate it", false, "-1" )); readParser->push_back (new OptionOneParam (STR_SIMKA_MIN_READ_SIZE.c_str(), "minimal size a read should have to be kept", false, "0" )); readParser->push_back (new OptionOneParam (STR_SIMKA_MIN_READ_SHANNON_INDEX.c_str(), "minimal Shannon index a read should have to be kept. Float in [0,2]", false, "0" )); //Core parser IOptionsParser* coreParser = new OptionsParser ("core"); coreParser->push_back(new OptionOneParam(STR_NB_CORES, "number of cores", false, "0")); coreParser->push_back (new OptionOneParam (STR_MAX_MEMORY, "max memory (MB)", false, "5000")); //coreParser->push_back(dskParser->getParser ()); //coreParser->push_back(dskParser->getParser (STR_MAX_DISK)); //Distances //IOptionsParser* distanceParser = new OptionsParser ("distances"); //distanceParser->push_back (new OptionNoParam (STR_SIMKA_DISTANCE_BRAYCURTIS.c_str(), "compute Bray Curtis distance")); //distanceParser->push_back (new OptionNoParam (STR_SIMKA_DISTANCE_CHORD.c_str(), "compute Chord distance")); //distanceParser->push_back (new OptionNoParam (STR_SIMKA_DISTANCE_HELLINGER.c_str(), "compute Hellinger distance")); //distanceParser->push_back (new OptionNoParam (STR_SIMKA_DISTANCE_CANBERRA.c_str(), "compute Canberra distance")); //distanceParser->push_back (new OptionNoParam (STR_SIMKA_DISTANCE_KULCZYNSKI.c_str(), "compute Kulczynski distance")); parser->push_back(distanceParser); parser->push_back(kmerParser); parser->push_back(readParser); parser->push_back(coreParser); //parser->push_back(distanceParser); IOptionsParser* dskParser = SortingCountAlgorithm<>::getOptionsParser(); if (Option* p = dynamic_cast (dskParser->getParser(STR_MINIMIZER_SIZE))) 
{ p->setDefaultValue ("7"); } parser->push_back(dskParser); dskParser->setVisible(false); if (Option* p = dynamic_cast (parser->getParser(STR_SOLIDITY_KIND))) { p->setDefaultValue ("all"); } return parser; } Simka::Simka() : Tool ("Simka") { Simka::createOptionsParser(getParser()); //coreParser->push_back(new OptionOneParam(parser->getParser(STR_NB_CORES)->getName(), parser->getParser(STR_NB_CORES)->getHelp(), false, "0")); //if (IOptionsParser* input = dskParser->getParser (STR_KMER_ABUNDANCE_MIN_THRESHOLD)) { input->setVisible (false); } /* IOptionsParser* parser = getParser(); IOptionsParser* dskParser = SortingCountAlgorithm<>::getOptionsParser(); parser->push_back (dskParser, 1); parser->push_back(dskParser); parser->getParser (STR_URI_INPUT)->setHelp("input file of datasets and their id. One dataset per line: dataset_id dataset_filename"); parser->getParser (STR_KMER_ABUNDANCE_MIN_THRESHOLD)->setVisible (false); parser->getParser (STR_HISTOGRAM_MAX)->setVisible (false); parser->getParser (STR_URI_SOLID_KMERS)->setVisible (false); parser->getParser (STR_URI_OUTPUT_DIR)->setHelp("output directory for temporary files"); parser->getParser (STR_URI_OUTPUT)->setHelp("output directory for result files"); parser->getParser (STR_SOLIDITY_KIND)->setHelp("TODO"); parser->getParser (STR_MINIMIZER_TYPE)->setVisible (false); parser->getParser (STR_MINIMIZER_SIZE)->setVisible (false); parser->getParser (STR_REPARTITION_TYPE)->setVisible (false); if (Option* p = dynamic_cast (parser->getParser(STR_KMER_ABUNDANCE_MIN))) { p->setDefaultValue ("0"); } parser->push_back (new OptionNoParam (STR_SOLIDITY_PER_DATASET.c_str(), "Do not take into consideration multi-counting when determining solidity of kmers", false )); */ /* parser->push_back (new OptionOneParam (STR_URI_INPUT, "reads file", true )); parser->push_back (new OptionOneParam (STR_KMER_SIZE, "size of a kmer", false, "31" )); parser->push_back (new OptionOneParam (STR_KMER_ABUNDANCE_MIN,"min abundance threshold for solid 
kmers", false, "3" )); parser->push_back (new OptionOneParam (STR_KMER_ABUNDANCE_MAX,"min abundance threshold for solid kmers", false, "3" )); parser->push_back (new OptionOneParam (STR_MAX_MEMORY, "max memory (in MBytes)", false, "2000")); parser->push_back (new OptionOneParam (STR_URI_OUTPUT_DIR, "output folder for solid kmers", false)); parser->push_back (new OptionOneParam (STR_URI_OUTPUT, "output file", false)); */ //setParser (parser); } struct Parameter { //Parameter (Simka& simka, IProperties* props) : props(props) {} Parameter (IProperties* props) : _props(props) {} //Simka& _simka; IProperties* _props; /* string _inputFilename; string _outputDir; size_t _kmerSize; pair _abundanceThreshold; bool _soliditySingle;*/ }; template struct Functor { void operator () (Parameter p) { SimkaAlgorithm simkaAlgorithm (p._props); simkaAlgorithm.execute(); /* #ifdef SIMKA_MIN simkaAlgorithm.executeSimkamin(); #else #endif*/ }}; void Simka::execute () { IProperties* input = getInput(); //Parameter params(*this, getInput()); Parameter params(input); size_t kmerSize = getInput()->getInt (STR_KMER_SIZE); /* params._kmerSize = getInput()->getInt (STR_KMER_SIZE); params._inputFilename = input->getStr(STR_URI_INPUT); params._outputDir = input->get(STR_URI_OUTPUT) ? input->getStr(STR_URI_OUTPUT) : "./"; params._abundanceThreshold.first = input->getInt(STR_KMER_ABUNDANCE_MIN); params._abundanceThreshold.second = input->getInt(STR_KMER_ABUNDANCE_MAX); params._soliditySingle = input->get(Simka::STR_SOLIDITY_PER_DATASET); cout << params._soliditySingle << endl; */ /** We launch the tool with the correct Integer implementation according to the choosen kmer size. 
*/ Integer::apply (kmerSize, params); } simka-1.5.1/src/core/Simka.hpp000077500000000000000000000044031353413740300161250ustar00rootroot00000000000000/***************************************************************************** * Simka: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2015 INRIA * Authors: G.Benoit, C.Lemaitre, P.Peterlongo * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . *****************************************************************************/ #ifndef _TOOL_Simka_HPP_ #define _TOOL_Simka_HPP_ /********************************************************************************/ #include /********************************************************************************/ //using namespace gatb::core::system; //using namespace gatb::core::system::impl; //////////////////////////////////////////////////////////////////////////////// // // THIS FILE IS AUTOMATICALLY GENERATED... // // THIS IS A SIMPLE EXAMPLE HOW TO USE THE Tool CLASS. 
IF YOU WANT MORE FEATURES, // YOU CAN HAVE A LOOK AT THE ToyTool SNIPPET HERE: // // http://gatb-core.gforge.inria.fr/snippets_tools.html // //////////////////////////////////////////////////////////////////////////////// class Simka : public Tool { public: //typedef typename Kmer::Type Type; //typedef typename Kmer::Count Count; //typedef kmer::impl::Kmer<>::ModelDirect KmerModel; // Constructor Simka(); // Actual job done by the tool is here void execute (); static IOptionsParser* createOptionsParser (IOptionsParser* parent); //static void executeAlgorithm (Simka& simka, IProperties* props); private: }; #endif /* _TOOL_Simka_HPP_ */ simka-1.5.1/src/core/SimkaAlgorithm.cpp000077500000000000000000000367071353413740300200030ustar00rootroot00000000000000/***************************************************************************** * Simka: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2015 INRIA * Authors: G.Benoit, C.Lemaitre, P.Peterlongo * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . 
*****************************************************************************/ #include "SimkaAlgorithm.hpp" static const char* strProgressPartitionning = "Simka: Step 1: partitioning "; static const char* strProgressCounting = "Simka: Step 2: counting kmers "; template SimkaAlgorithm::SimkaAlgorithm(IProperties* options) : Algorithm("simka", -1, options) //_progress (0), _tmpPartitionsStorage(0), _tmpPartitions(0) { _options = options; _stats = 0; //_simkaDistance = 0; _banks = 0; //_processor = 0; //string maxDisk = ""; //if(_options->get(STR_MAX_DISK)){ // maxDisk = _options->getStr(STR_MAX_DISK); // cout << maxDisk << endl; //} //_multiStorage = new MultiDiskStorage(_options->getStr(STR_URI_OUTPUT_DIR), _options->getStr(STR_MAX_DISK)); // vector _tempDirMaxDisk _totalKmers = 0; /* if(_options->getInt(STR_VERBOSE) != 0){ cout << "Filter options" << endl; cout << "\tMax reads per dataset: " << _maxNbReads << endl; cout << "\tMin read size: " << _minReadSize << endl; cout << "\tMin Shannon index: " << _minShannonIndex << endl; }*/ //if(_maxNbReads == 0) // _maxNbReads = -1; //cout << _maxNbReads << endl; //cout << _soliditySingle << endl; /* string solidKindStr = _options->getStr(STR_SOLIDITY_KIND); if(solidKindStr == "range"){ _solidKind = SIMKA_SOLID_KIND::RANGE; } else if(solidKindStr == "sum"){ _solidKind = SIMKA_SOLID_KIND::SUM; } cout << solidKindStr << " " << solidKindStr << endl;*/ //_kmerSize = _options->get(STR_KMER_SIZE) ? _options->getInt(STR_KMER_SIZE) : 31; //_abundanceMin = _options->get(STR_KMER_ABUNDANCE_MIN) ? _options->getInt(STR_KMER_ABUNDANCE_MIN) : 0; //_maxMemory = props->get(STR_MAX_MEMORY) ? props->getInt(STR_MAX_MEMORY) : 2000; //_outputTempDir = props->get(STR_URI_OUTPUT_DIR) ? props->getStr(STR_URI_OUTPUT_DIR) : System::file().getDirectory(_inputFilename); //_outputFilename = props->get(STR_URI_OUTPUT) ? 
props->getStr(STR_URI_OUTPUT) : System::file().getDirectory(_inputFilename) + "/" + System::file().getBaseName(_inputFilename) + "_output.fasta"; //_nbCores = getInput()->getInt(STR_NB_CORES); //cout << "Input filename: " << _inputFilename << endl; //cout << "Kmer size: " << _kmerSize << endl; //cout << "Abundance min: " << _abundanceMin << endl; //cout << "Max memory: " << _maxMemory << endl; //cout << "Output temp dir: " << _outputTempDir << endl; //cout << "Output filename: " << _outputFilename << endl; //_banksInputFilename = _inputFilename + "_dsk_dataset_temp__"; } template SimkaAlgorithm::~SimkaAlgorithm() { } template void SimkaAlgorithm::execute() { /* if(!setup()) return; if(!isInputValid()) return; createBank(); count(); outputMatrix(); //outputHeatmap(); if(_options->getInt(STR_VERBOSE) != 0){ _stats->print(); print(); } clear();*/ } template bool SimkaAlgorithm::setup() { if(! createDirs() ) return false; try{ layoutInputFilename(); } catch (Exception& e){ cout << "Syntax error in input file" << endl; return false; } _nbBanks = _bankNames.size(); return true; } template void SimkaAlgorithm::parseArgs() { _computeSimpleDistances = _options->get(STR_SIMKA_COMPUTE_ALL_SIMPLE_DISTANCES); _computeComplexDistances = _options->get(STR_SIMKA_COMPUTE_ALL_COMPLEX_DISTANCES); _keepTmpFiles = _options->get(STR_SIMKA_KEEP_TMP_FILES); _maxMemory = _options->getInt(STR_MAX_MEMORY); _nbCores = _options->getInt(STR_NB_CORES); _inputFilename = _options->getStr(STR_URI_INPUT); _outputDir = _options->get(STR_URI_OUTPUT) ? _options->getStr(STR_URI_OUTPUT) : "./"; _outputDirTemp = _options->get(STR_URI_OUTPUT_TMP) ? 
_options->getStr(STR_URI_OUTPUT_TMP) : "./"; _kmerSize = _options->getInt(STR_KMER_SIZE); _abundanceThreshold.first = _options->getInt(STR_KMER_ABUNDANCE_MIN); _abundanceThreshold.second = min((u_int64_t)_options->getInt(STR_KMER_ABUNDANCE_MAX), (u_int64_t)(999999999)); //cout << _options->getInt(STR_KMER_ABUNDANCE_MAX) << endl; //cout << _abundanceThreshold.second << endl; _soliditySingle = _options->get(STR_SIMKA_SOLIDITY_PER_DATASET); //_nbMinimizers = _options->getInt(STR_KMER_PER_READ); //_maxDisk = getInput()->getInt(STR_MAX_DISK); //read filter _maxNbReads = _options->getInt(STR_SIMKA_MAX_READS); _minReadSize = _options->getInt(STR_SIMKA_MIN_READ_SIZE); _minReadShannonIndex = _options->getDouble(STR_SIMKA_MIN_READ_SHANNON_INDEX); _minReadShannonIndex = std::max(_minReadShannonIndex, 0.0); _minReadShannonIndex = std::min(_minReadShannonIndex, 2.0); _minKmerShannonIndex = _options->getDouble(STR_SIMKA_MIN_KMER_SHANNON_INDEX); _minKmerShannonIndex = std::max(_minKmerShannonIndex, 0.0); _minKmerShannonIndex = std::min(_minKmerShannonIndex, 2.0); if(!System::file().doesExist(_inputFilename)){ cerr << "ERROR: Input filename does not exist" << endl; exit(1); } } template bool SimkaAlgorithm::createDirs(){ if(!System::file().doesExist(_outputDir)){ int ok = System::file().mkdir(_outputDir, -1); if(ok != 0){ std::cout << "Error: can't create output directory (" << _outputDir << ")" << std::endl; return false; } } _outputDirTemp = _outputDirTemp; if(!System::file().doesExist(_outputDirTemp)){ int ok = System::file().mkdir(_outputDirTemp, -1); if(ok != 0){ std::cout << "Error: can't create output temp directory (" << _outputDirTemp << ")" << std::endl; return false; } } _outputDirTemp = System::file().getRealPath(_outputDirTemp); _outputDirTemp += "/simka_output_temp/"; System::file().mkdir(_outputDirTemp, -1); _options->setStr(STR_URI_OUTPUT_TMP, _outputDirTemp); System::file().mkdir(_outputDirTemp + "/input/", -1); return true; } template void 
SimkaAlgorithm::layoutInputFilename(){ if(_options->getInt(STR_VERBOSE) != 0){ cout << endl << "Creating input" << endl; } string inputDir = _outputDirTemp + "/input/"; ifstream inputFile(_inputFilename.c_str()); _banksInputFilename = inputDir + "__input_simka__"; //_inputFilename + "_dsk_dataset_temp__"; IFile* bankFile = System::file().newFile(_banksInputFilename, "wb"); string line; string linePart; vector lineIdDatasets; vector linepartPairedDatasets; vector linepartDatasets; string bankFileContents = ""; u_int64_t lineIndex = 0; while(getline(inputFile, line)){ line.erase(std::remove(line.begin(),line.end(),' '),line.end()); if(line == "") continue; //cout << line << endl; lineIdDatasets.clear(); linepartPairedDatasets.clear(); //vector filenames; stringstream lineStream(line); while(getline(lineStream, linePart, ':')){ lineIdDatasets.push_back(linePart); } string bankId = lineIdDatasets[0]; string linePairedDatasets = lineIdDatasets[1]; stringstream linePairedDatasetsStream(linePairedDatasets); while(getline(linePairedDatasetsStream, linePart, ';')){ linepartPairedDatasets.push_back(linePart); } string subBankFilename = inputDir + bankId; IFile* subBankFile = System::file().newFile(subBankFilename, "wb"); //cout << subBankFile->getPath() << endl; string subBankContents = ""; _nbBankPerDataset.push_back(linepartPairedDatasets.size()); for(size_t i=0; ifwrite(subBankContents.c_str(), subBankContents.size(), 1); subBankFile->flush(); delete subBankFile; bankFileContents += inputDir + "/" + bankId + "\n"; lineIndex += 1; _bankNames.push_back(bankId); } inputFile.close(); bankFileContents.erase(bankFileContents.size()-1); bankFile->fwrite(bankFileContents.c_str(), bankFileContents.size(), 1); bankFile->flush(); delete bankFile; if(_options->getInt(STR_VERBOSE) != 0){ cout << "\tNb input datasets: " << _bankNames.size() << endl; cout << endl; } } template bool SimkaAlgorithm::isInputValid(){ string inputDir = _outputDirTemp + "/input/"; for (size_t i=0; i<_nbBanks; 
i++){ try{ IBank* bank = Bank::open(inputDir + _bankNames[i]); LOCAL(bank); } catch (Exception& e){ cerr << "ERROR: Can't open dataset: " << _bankNames[i] << endl; return false; } } return true; } template void SimkaAlgorithm::computeMaxReads(){ string inputDir = _outputDirTemp + "/input/"; //if(_maxNbReads != 0){ // return; //} if(_maxNbReads == 0){ if(_options->getInt(STR_VERBOSE) != 0) cout << "-maxNbReads is not defined. Simka will estimating it..." << endl; } u_int64_t totalReads = 0; u_int64_t minReads = -1; u_int64_t maxReads = 0; u_int64_t meanReads = 0; if(_maxNbReads == 0 || _options->get(STR_SIMKA_COMPUTE_DATA_INFO)){ for (size_t i=0; i<_nbBanks; i++){ IBank* bank = Bank::open(inputDir + _bankNames[i]); LOCAL(bank); u_int64_t nbReads = bank->estimateNbItems(); nbReads /= _nbBankPerDataset[i]; totalReads += nbReads; if(nbReads < minReads){ minReads = nbReads; //_smallerBankId = _bankNames[i]; } if(nbReads > maxReads){ maxReads = nbReads; _largerBankId = _bankNames[i]; } } meanReads = totalReads / _nbBanks; if(_options->getInt(STR_VERBOSE) != 0){ cout << "Smaller sample contains: " << minReads << " reads" << endl; cout << "Larger sample contains: " << maxReads << " reads" << endl; cout << "Whole dataset contains a mean of: " << meanReads << " reads" << endl << endl; } } if(_maxNbReads == 0){ _maxNbReads = (minReads + meanReads) / 2; if(_options->getInt(STR_VERBOSE) != 0){ cout << "Reads per sample used up to: " << _maxNbReads << endl << endl; } } else if(_maxNbReads == -1){ if(_options->getInt(STR_VERBOSE) != 0) cout << "Reads per sample used: all"<< endl << endl; _maxNbReads = 0; } else{ if(_options->getInt(STR_VERBOSE) != 0){ cout << "Reads per sample used up to: " << _maxNbReads << endl << endl; } } } /* template void SimkaAlgorithm::layoutInputFilename(){ if(_options->getInt(STR_VERBOSE) != 0){ cout << endl << "Creating input" << endl; } _banksInputFilename = _inputFilename + "_dsk_dataset_temp__"; ifstream inputFile(_inputFilename.c_str()); IFile* 
bankFile = System::file().newFile(_banksInputFilename, "wb"); string line; string linePart; vector linePartList; string bankFileContents = ""; u_int64_t lineIndex = 0; while(getline(inputFile, line)){ if(line == "") continue; stringstream lineStream(line); linePartList.clear(); //vector filenames; while(getline(lineStream, linePart, ' ')){ if(linePart != ""){ linePartList.push_back(linePart); } } string bankId = linePartList[0]; _bankNames.push_back(bankId); //ID and one filename if(linePartList.size() == 2){ bankFileContents += linePartList[1] + "\n"; _nbBankPerDataset.push_back(1); } //ID and list of filename (paired files for example) else{ char buffer[200]; snprintf(buffer,200,"%llu", lineIndex); string subBankFilename = _banksInputFilename + "_" + string(buffer); _tempFilenamesToDelete.push_back(subBankFilename); IFile* subBankFile = System::file().newFile(subBankFilename, "wb"); string subBankContents = ""; for(size_t i=1; ifwrite(subBankContents.c_str(), subBankContents.size(), 1); subBankFile->flush(); delete subBankFile; bankFileContents += subBankFilename + "\n"; _nbBankPerDataset.push_back(linePartList.size() - 1); //linePartList.size() - 1 = nb sub banks //_nbReadsPerDataset.push_back(ceil(_maxNbReads / (float)())); } lineIndex += 1; } bankFileContents.erase(bankFileContents.size()-1); //bankFileContents.pop_back(); // "remove last /n bankFile->fwrite(bankFileContents.c_str(), bankFileContents.size(), 1); inputFile.close(); //delete inputFile; bankFile->flush(); delete bankFile; //for(int i=0; i<_nbBanksOfDataset.size(); i++){ // cout << i << " " << _nbBanksOfDataset[i] << endl; //} if(_options->getInt(STR_VERBOSE) != 0){ cout << "\tNb input datasets: " << _bankNames.size() << endl; } cout << endl; }*/ template void SimkaAlgorithm::createBank(){ IBank* bank = Bank::open(_banksInputFilename); SimkaSequenceFilter sequenceFilter(_minReadSize, _minReadShannonIndex); _banks = new SimkaBankFiltered(bank, sequenceFilter, _nbBankPerDataset, _maxNbReads); } 
template void SimkaAlgorithm::count(){ /* //SimkaDistanceParam distanceParams(_options); _stats = new SimkaStatistics(_nbBanks, _computeSimpleDistances, _computeComplexDistances, _outputDirTemp, _bankNames); SortingCountAlgorithm sortingCount (_banks, _options); // We create a custom count processor and give it to the sorting count algorithm vector dummyVec; _processor = new SimkaCountProcessor (*_stats, _nbBanks, _kmerSize, _abundanceThreshold, _solidKind, _soliditySingle, _minKmerShannonIndex); _processor->use(); sortingCount.addProcessor (_processor); // We launch the algorithm sortingCount.execute(); */ } template void SimkaAlgorithm::outputMatrix(){ _stats->outputMatrix(_outputDir, _bankNames); } template void SimkaAlgorithm::print(){ cout << "Output folder: " << _outputDir << endl; } template void SimkaAlgorithm::clear(){ if(_banks){ //_banks->finalize(); //delete _banks; } System::file().remove(_banksInputFilename); //if(_processor) _processor->forget(); for(size_t i=0; i<_tempFilenamesToDelete.size(); i++){ System::file().remove(_tempFilenamesToDelete[i]); } if(_stats) delete _stats; //if(_simkaDistance) delete _simkaDistance; //_banks->remove(); //delete _processor; } simka-1.5.1/src/core/SimkaAlgorithm.hpp000077500000000000000000000654621353413740300200100ustar00rootroot00000000000000/***************************************************************************** * Simka: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2015 INRIA * Authors: G.Benoit, C.Lemaitre, P.Peterlongo * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . *****************************************************************************/ #ifndef TOOLS_SIMKA_SRC_SIMKAALGORITHM_HPP_ #define TOOLS_SIMKA_SRC_SIMKAALGORITHM_HPP_ #include #include "SimkaCommons.hpp" #include #include //#define PRINT_STATS //#define CHI2_TEST //#define SIMKA_POTARA //#define BOOTSTRAP #define MAX_BOOTSTRAP 50 #define NB_BOOTSTRAP 45 //#define SIMKA_FUSION //#define MULTI_PROCESSUS //#define MULTI_DISK //#define SIMKA_MIN #include "SimkaDistance.hpp" enum SIMKA_SOLID_KIND{ RANGE, SUM, }; typedef u_int16_t bankIdType; class SimkaCounterBuilder { public: /** Constructor. * \param[in] nbBanks : number of banks parsed during kmer counting. */ SimkaCounterBuilder (size_t nbBanks=1) : _abundancePerBank(nbBanks) {} /** Get the number of banks. * \return the number of banks. */ size_t size() const { return _abundancePerBank.size(); } /** Initialization of the counting for the current kmer. This method should be called * when a kmer is seen for the first time. * \param[in] idxBank : bank index where the new current kmer has been found. */ void init (size_t idxBank=0) { for (size_t k=0; k<_abundancePerBank.size(); k++) { _abundancePerBank[k]=0; } _abundancePerBank [idxBank]= 1; } /** Increase the abundance of the current kmer for the provided bank index. * \param[in] idxBank : index of the bank */ void increase (size_t idxBank=0) { _abundancePerBank [idxBank] ++; } /** Set the abundance of the current kmer for the provided bank index. 
* \param[in] idxBank : index of the bank */ void set (CountNumber val, size_t idxBank=0) { _abundancePerBank [idxBank] = val; } /** Get the abundance of the current kmer for the provided bank index. * \param[in] idxBank : index of the bank * \return the abundance of the current kmer for the given bank. */ CountNumber operator[] (size_t idxBank) const { return _abundancePerBank[idxBank]; } /** */ const CountVector& get () const { return _abundancePerBank; } private: CountVector _abundancePerBank; }; /********************************************************************* * ** SimkaCountProcessor *********************************************************************/ template class SimkaCountProcessorSimple{ private: size_t _nbBanks; size_t _kmerSize; //pair _abundanceThreshold; //bool isAbundanceThreshold; SimkaStatistics* _stats; double _totalAbundance; u_int64_t _nbKmerCounted; double _minKmerShannonIndex; //vector _banksOks; vector _sharedBanks; typedef std::pair chi2val_Abundances; struct _chi2ValueSorterFunction { bool operator() (chi2val_Abundances l,chi2val_Abundances r) { return r.first < l.first; } } ; std::priority_queue< chi2val_Abundances, vector, _chi2ValueSorterFunction> _chi2ValueSorter; size_t _maxChi2Values; public: typedef typename Kmer::Type Type; //typedef typename Kmer::Count Count; SimkaCountProcessorSimple(SimkaStatistics* stats, size_t nbBanks, size_t kmerSize, const pair& abundanceThreshold, SIMKA_SOLID_KIND solidKind, bool soliditySingle, double minKmerShannonIndex) : _stats(stats) { _maxChi2Values = 1000; // We configure the vector for the N.(N+1)/2 possible pairs //_countTotal.resize (_nbBanks*(_nbBanks+1)/2); _nbBanks = nbBanks; _kmerSize = kmerSize; //_abundanceThreshold = abundanceThreshold; _minKmerShannonIndex = minKmerShannonIndex; //_localStats = new SimkaStatistics(_nbBanks, _stats._distanceParams); _nbKmerCounted = 0; //isAbundanceThreshold = _abundanceThreshold.first > 1 || _abundanceThreshold.second < 1000000; } void end(){ #ifdef 
CHI2_TEST size_t nbValues = _chi2ValueSorter.size(); for(size_t i=0; i::Type& kmer, const CountVector& counts){ //cout << kmer.toString(_kmerSize) << endl; //for(size_t i=0; i_nbDistinctKmers += 1; for(size_t i=0; i_nbKmers += abundance; _stats->_nbKmersPerBank[i] += abundance; _totalAbundance += abundance; } #endif /* A DEPLACER PENDANT LE COMPTAGE DES KMERS if(_minKmerShannonIndex != 0){ double shannonIndex = getShannonIndex(kmer); if(shannonIndex < _minKmerShannonIndex){ return; } }*/ #ifdef CHI2_TEST float X2j = 0; _totalAbundance = 0; for(size_t i=0; i_datasetNbReads[i]/_stats->_totalReads), 2) / (_stats->_datasetNbReads[i] / (_stats->_totalReads*_totalAbundance)); } //std::chi_squared_distribution distribution(_nbBanks-1); //double pvalue = chisqr(_nbBanks-1, X2j); /* if(lala> 100){ for(size_t i=0; i<_chi2ValueSorter.size(); i++){ double val = _chi2ValueSorter.top(); _chi2ValueSorter.pop(); cout << val << endl; } return; }*/ //cout << X2j << endl; if(_chi2ValueSorter.size() > _maxChi2Values){ if(X2j > _chi2ValueSorter.top().first){ _chi2ValueSorter.push(pair(X2j, counts)); _chi2ValueSorter.pop(); } } else{ _chi2ValueSorter.push(pair(X2j, counts)); } //cout << _chi2ValueSorter.size() << " " << X2j << " " << _chi2ValueSorter.top() << endl; //cout << X2j << " " << pvalue << endl; return; /* cout << kmer.toString(_kmerSize) << " ["; for(size_t i=0; i 0.01) return; #endif /* //for(size_t i=0; i<_datasetNbReads.size(); i++) // cout << i << " " << _datasetNbReads[i] << endl; //cout << _totalReads << " " << _totalAbundance << endl; //float Ri = 500000; //float Rtotal = Ri * _nbBanks; //float Ntotal = _totalAbundance; float X2j = 0; for(size_t i=0; i_nbSolidKmers += 1; // computeStats(counts); //} //else{ updateDistance(counts); //else // computeStats(counts); //_stats->_nbSolidKmers += 1; } void updateDistance(const CountVector& counts){ _sharedBanks.clear(); for(size_t i=0; i_computeSimpleDistances) updateDistanceSimple(counts); if(_stats->_computeComplexDistances) 
updateDistanceComplex(counts); } void updateDistanceDefault(const CountVector& counts){ for(size_t ii=0; ii<_sharedBanks.size(); ii++){ for(size_t jj=ii+1; jj<_sharedBanks.size(); jj++){ u_int16_t i = _sharedBanks[ii]; u_int16_t j = _sharedBanks[jj]; size_t symetricIndex = j + ((_nbBanks-1)*i) - (i*(i-1)/2); u_int64_t abundanceI = counts[i]; u_int64_t abundanceJ = counts[j]; _stats->_matrixNbSharedKmers[i][j] += counts[i]; _stats->_matrixNbSharedKmers[j][i] += counts[j]; _stats->_matrixNbDistinctSharedKmers[symetricIndex] += 1; //cout << i << " " << j << " " << (j + ((_nbBanks-1)*i) - (i*(i-1)/2)) << endl; _stats->_brayCurtisNumerator[symetricIndex] += min(abundanceI, abundanceJ); } } } void updateDistanceSimple(const CountVector& counts){ for(size_t ii=0; ii<_sharedBanks.size(); ii++){ for(size_t jj=ii+1; jj<_sharedBanks.size(); jj++){ u_int16_t i = _sharedBanks[ii]; u_int16_t j = _sharedBanks[jj]; u_int64_t abundanceI = counts[i]; u_int64_t abundanceJ = counts[j]; //cout << _stats->_chord_sqrt_N2[i] << endl; //_stats->_chord_NiNj[i][j] += abundanceI * abundanceJ; _stats->_chord_NiNj[i][j] += abundanceI * abundanceJ; _stats->_hellinger_SqrtNiNj[i][j] += sqrt(abundanceI * abundanceJ); _stats->_kulczynski_minNiNj[i][j] += min(abundanceI, abundanceJ); } } } void updateDistanceComplex(const CountVector& counts){ //_sharedBanks.clear(); //for(size_t i=0; i 0) double abundanceI = counts[i]; double abundanceJ = counts[j]; if(abundanceJ){ //_stats->_matrixNbSharedKmers[i][j] += abundanceI; //_stats->_matrixNbSharedKmers[j][i] += abundanceJ; //_stats->_matrixNbDistinctSharedKmers[i][j] += 1; //_stats->_chord_NiNj[i][j] += abundanceI * abundanceJ; //_stats->_chord_NiNj[i][j] += (abundanceI * abundanceJ) / (_stats->_chord_sqrt_N2[i]*_stats->_chord_sqrt_N2[j]); //_stats->_hellinger_SqrtNiNj[i][j] += sqrt(abundanceI * abundanceJ); //_stats->_kulczynski_minNiNj[i][j] += min(abundanceI, abundanceJ); double yX = abundanceJ * _stats->_nbSolidKmersPerBank[i]; double xY = abundanceI 
* _stats->_nbSolidKmersPerBank[j]; xi = (double)abundanceI / _stats->_nbSolidKmersPerBank[i]; d1 = xi * log((2*xY) / (xY + yX)); //xY = abundanceI * _stats->_nbSolidKmersPerBank[j]; //yX = abundanceJ * _stats->_nbSolidKmersPerBank[i]; xj = (double)abundanceJ / _stats->_nbSolidKmersPerBank[j]; d2 = xj * log((2*yX) / (xY + yX)); } else{ d2 = 0; double yX = abundanceJ * _stats->_nbSolidKmersPerBank[i]; double xY = abundanceI * _stats->_nbSolidKmersPerBank[j]; xi = (double)abundanceI / _stats->_nbSolidKmersPerBank[i]; d1 = xi * log((2*xY) / (xY + yX)); } /* if(abundanceI){ double yX = abundanceJ * _stats->_nbSolidKmersPerBank[i]; double xY = abundanceI * _stats->_nbSolidKmersPerBank[j]; xi = (double)abundanceI / _stats->_nbSolidKmersPerBank[i]; d1 = xi * log((2*xY) / (xY + yX)); } else{ d1 = 0; } if(abundanceJ){ double xY = abundanceI * _stats->_nbSolidKmersPerBank[j]; double yX = abundanceJ * _stats->_nbSolidKmersPerBank[i]; xj = (double)abundanceJ / _stats->_nbSolidKmersPerBank[j]; d2 = xj * log((2*yX) / (xY + yX)); } else{ d2 = 0; }*/ _stats->_kullbackLeibler[i][j] += d1 + d2; _stats->_canberra[i][j] += abs(abundanceI - abundanceJ) / (abundanceI + abundanceJ); //_stats->_brayCurtisNumerator[i][j] += abs(abundanceI - abundanceJ); _stats->_whittaker_minNiNj[i][j] += abs((int)((u_int64_t)(abundanceI*_stats->_nbSolidKmersPerBank[j]) - (u_int64_t)(abundanceJ*_stats->_nbSolidKmersPerBank[i]))); //cout << _stats->_nbSolidKmersPerBank[i] << endl; } } else{ //Here, we know that (abundanceI == 0) for(size_t jj=0; jj<_sharedBanks.size(); jj++){ u_int16_t j = _sharedBanks[jj]; if(i > j) continue; double abundanceI = counts[i]; double abundanceJ = counts[j]; d1 = 0; double xY = abundanceI * _stats->_nbSolidKmersPerBank[j]; double yX = abundanceJ * _stats->_nbSolidKmersPerBank[i]; xj = (double)abundanceJ / _stats->_nbSolidKmersPerBank[j]; d2 = xj * log((2*yX) / (xY + yX)); _stats->_kullbackLeibler[i][j] += d1 + d2; _stats->_canberra[i][j] += abs(abundanceI - abundanceJ) / 
(abundanceI + abundanceJ); //_stats->_brayCurtisNumerator[i][j] += abs(abundanceI - abundanceJ); //cout << _stats->_nbSolidKmersPerBank[i] << endl; _stats->_whittaker_minNiNj[i][j] += abs((int)((u_int64_t)(abundanceI*_stats->_nbSolidKmersPerBank[j]) - (u_int64_t)(abundanceJ*_stats->_nbSolidKmersPerBank[i]))); } } } /* return; double xi = 0; double xj = 0; double d1 = 0; double d2 = 0; #ifdef PRINT_STATS int nbBanksThatHaveKmer = 0; #endif //u_int64_t totalAbundance = 0; for(size_t i=0; i_nbSolidDistinctKmersPerBank[i] += 1; //_stats->_nbSolidKmersPerBank[i] += abundanceI; //_stats->_chord_N2[i] += pow(abundanceI, 2); } #endif for(size_t j=i+1; j_matrixNbSharedKmers[i][j] += abundanceI; _stats->_matrixNbSharedKmers[j][i] += abundanceJ; _stats->_matrixNbDistinctSharedKmers[i][j] += 1; //_stats->_chord_NiNj[i][j] += abundanceI * abundanceJ; _stats->_chord_NiNj[i][j] += (abundanceI * abundanceJ) / (_stats->_chord_sqrt_N2[i]*_stats->_chord_sqrt_N2[j]); _stats->_hellinger_SqrtNiNj[i][j] += sqrt(abundanceI * abundanceJ); _stats->_kulczynski_minNiNj[i][j] += min(abundanceI, abundanceJ); _stats->_whittaker_minNiNj[i][j] += abs((int)((u_int64_t)(abundanceI*_stats->_nbSolidKmersPerBank[j]) - (u_int64_t)(abundanceJ*_stats->_nbSolidKmersPerBank[i]))); } if(abundanceI){ double yX = abundanceJ * _stats->_nbSolidKmersPerBank[i]; double xY = abundanceI * _stats->_nbSolidKmersPerBank[j]; xi = (double)abundanceI / _stats->_nbSolidKmersPerBank[i]; d1 = xi * log((2*xY) / (xY + yX)); } else{ d1 = 0; } if(abundanceJ){ double xY = abundanceI * _stats->_nbSolidKmersPerBank[j]; double yX = abundanceJ * _stats->_nbSolidKmersPerBank[i]; xj = (double)abundanceJ / _stats->_nbSolidKmersPerBank[j]; d2 = xj * log((2*yX) / (xY + yX)); } else{ d2 = 0; } _stats->_kullbackLeibler[i][j] += d1 + d2; _stats->_canberra[i][j] += abs(abundanceI - abundanceJ) / (abundanceI + abundanceJ); _stats->_brayCurtisNumerator[i][j] += abs(abundanceI - abundanceJ); //cout << _stats->_nbSolidKmersPerBank[i] << endl; 
_stats->_whittaker_minNiNj[i][j] += abs((int)((u_int64_t)(abundanceI*_stats->_nbSolidKmersPerBank[j]) - (u_int64_t)(abundanceJ*_stats->_nbSolidKmersPerBank[i]))); } } }*/ #ifdef PRINT_STATS _stats->_nbDistinctKmersSharedByBanksThreshold[nbBanksThatHaveKmer-1] += 1; _stats->_nbKmersSharedByBanksThreshold[nbBanksThatHaveKmer-1] += _totalAbundance; if(_totalAbundance == 1){ //if( == 1){ _stats->_nbErroneousKmers += 1; //} } //else if(nbBanksThatHaveKmer == counter.size()){ //} #endif } //inline bool isSolidVector(const CountVector& counts); double getShannonIndex(const Type& kmer){ float index = 0; //float freq [5]; vector _freqs(4, 0); //char* seqStr = seq.getDataBuffer(); for (size_t i=0; i<_kmerSize; i++){ _freqs[kmer[i]] += 1.0; //seq[sizeKmer-i-1] = bin2NT [(*this)[i]]; } // Frequency of each letter (A, C, G, T or N) //for(size_t i=0; i < seq.size(); i++) // _freqs[nt2binTab[(unsigned char)seq[i]]] += 1.0; // Shannon index calculation for (size_t i=0; i<_freqs.size(); i++){ _freqs[i] /= (float) _kmerSize; if (_freqs[i] != 0) index += _freqs[i] * log (_freqs[i]) / log(2); } return abs(index); } double approx_gamma(double Z) { const double RECIP_E = 0.36787944117144232159552377016147; // RECIP_E = (E^-1) = (1.0 / E) const double TWOPI = 6.283185307179586476925286766559; // TWOPI = 2.0 * PI double D = 1.0 / (10.0 * Z); D = 1.0 / ((12 * Z) - D); D = (D + Z) * RECIP_E; D = pow(D, Z); D *= sqrt(TWOPI / Z); return D; } static double igf(double S, double Z) { if(Z < 0.0) { return 0.0; } double Sc = (1.0 / S); Sc *= pow(Z, S); Sc *= exp(-Z); double Sum = 1.0; double Nom = 1.0; double Denom = 1.0; for(int I = 0; I < 200; I++) { Nom *= Z; S++; Denom *= S; Sum += (Nom / Denom); } return Sum * Sc; } double chisqr(int Dof, double Cv) { if(Cv < 0 || Dof < 1) { return 0.0; } double K = ((double)Dof) * 0.5; double X = Cv * 0.5; if(Dof == 2) { return exp(-1.0 * X); } double PValue = igf(K, X); //if(isnan(PValue) || isinf(PValue) || PValue <= 1e-8) //{ // return 1e-14; //} PValue 
/= approx_gamma(K); //PValue /= tgamma(K); return PValue; //return (1.0 - PValue); } }; /********************************************************************************/ /** * */ /* template class SimkaTruncateIterator : public TruncateIterator { public: SimkaTruncateIterator (Iterator* ref, u_int64_t limit, bool initRef=true) : TruncateIterator(*ref, limit, initRef), _ref2(0){ setRef(ref); } private: Iterator* _ref2; void setRef (Iterator* ref2) { SP_SETATTR(ref2); } };*/ template class SimkaBankFiltered : public BankDelegate { public: u_int64_t _refNbReads; u_int64_t _refTotalSeqSize; u_int64_t _refMaxReadSize; /** Constructor. * \param[in] ref : referred bank. * \param[in] filter : functor that filters sequence. */ SimkaBankFiltered (IBank* ref, const Filter& filter, const vector& nbPaireds, u_int64_t maxReads) : BankDelegate (ref), _filter(filter) { _nbPaireds = nbPaireds; _maxReads = maxReads; _nbBanks = ref->getCompositionNb(); ref->estimate(_refNbReads, _refTotalSeqSize, _refMaxReadSize); //cout << _refNbReads << endl; //cout << _refTotalSeqSize << endl; //cout << _refMaxReadSize << endl; } void estimate (u_int64_t& number, u_int64_t& totalSize, u_int64_t& maxSize){ if(_maxReads == 0){ number = _refNbReads; totalSize = _refTotalSeqSize; maxSize = _refMaxReadSize; } else{ u_int64_t maxReads = 0; for(size_t i=0; i<_nbBanks; i++){ maxReads += _maxReads * _nbPaireds[i]; } //cout << _refNbReads << endl; //cout << _maxReads*_nbBanks << endl; maxReads = min (maxReads, _refNbReads); //cout << "ha " << maxReads << endl; if(maxReads == _refNbReads){ number = _refNbReads; totalSize = _refTotalSeqSize; maxSize = _refMaxReadSize; } else{ number = maxReads; double factor = (double)maxReads / (double)_refNbReads; totalSize = _refTotalSeqSize * factor; maxSize = _refMaxReadSize; } } //number = _maxReads; //totalSize = (_totalSizeRef*_nbReadToProcess)/_numberRef; //maxSize = _maxSizeRef; //cout << number2 << endl; //u_int64_t readSize = totalSize2 / number2; //cout << 
"lal:" << number2 << endl; //number = _maxReads; //number = _nbReadToProcess; //totalSize = _nbReadToProcess*readSize; //maxSize = readSize; cout << number << endl; //cout << totalSize << endl; //cout << maxSize << endl; } /** \copydoc tools::collections::Iterable::iterator */ Iterator* iterator () { //cout << endl << "---" << endl; //cout << "lala" << endl; // We create one iterator from the reference Iterator* it = _ref->iterator (); // We get the composition for this iterator std::vector*> iterators = it->getComposition(); //if (iterators.size() == 1) { return new FilterIterator (it, _filter); } //else //{ // We are going to create a new CompositeIterator, we won't need the one we just got from the reference LOCAL(it); // We may have to encapsulate each sub iterator with the filter. for (size_t i=0; i (iterators[i], _filter); } else{ //We create a truncated iterator that stop processing reads when _nbReadsPerDataset[i] is reached //cout << _nbReadsPerDataset[i] << endl; //CancellableIterator* truncIt = new CancellableIterator(*iterators[i]); Filter filter(_filter); //filter.setMaxReads(_nbReadsPerDataset[i]); //filter.setIt(truncIt); #ifdef BOOTSTRAP srand (time(NULL)); size_t nbBootstrap = 0; vector iSBoostrap(MAX_BOOTSTRAP); while(nbBootstrap != NB_BOOTSTRAP){ int index = rand() % iSBoostrap.size(); if(!iSBoostrap[index]){ iSBoostrap[index] = true; nbBootstrap += 1; } } filter.setBootstrap(iSBoostrap); #endif FilterIterator* filterIt = new FilterIterator (iterators[i], filter); iterators[i] = filterIt; }*/ //Iterator* it = iterators[i]; //std::vector*> iterators_ = it->getComposition(); iterators[i] = new SimkaInputIterator (iterators[i], _nbPaireds[i], _maxReads, _filter); } return new CompositeIterator (iterators); } private: vector _nbPaireds; Filter _filter; u_int64_t _maxReads; size_t _nbBanks; }; /********************************************************************* * ** SimkaAlgorithm 
*********************************************************************/ template class SimkaAlgorithm : public Algorithm { public: typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; typedef typename Kmer::ModelCanonical ModelCanonical; typedef typename ModelCanonical::Kmer KmerType; SimkaAlgorithm(IProperties* options); ~SimkaAlgorithm(); void execute(); void print(); //void executeSimkamin(); static string toString(u_int64_t value){ char buffer[40]; snprintf(buffer, 30, "%llu", value); return string(buffer); } protected: bool setup(); bool isInputValid(); void parseArgs(); bool createDirs(); void computeMaxReads(); void layoutInputFilename(); void createBank(); void count(); void outputMatrix(); //void dumpMatrix(const string& outputFilename, const vector >& matrix); //void outputHeatmap(); //void __outputHeatmap(const string& outputFilenamePrefix, const string& matrixPercFilename, const string& matrixNormFilename); void clear(); u_int64_t _maxMemory; size_t _nbCores; string _outputDir; string _outputDirTemp; size_t _nbBanks; string _inputFilename; size_t _kmerSize; pair _abundanceThreshold; SIMKA_SOLID_KIND _solidKind; bool _soliditySingle; int64_t _maxNbReads; size_t _minReadSize; double _minReadShannonIndex; double _minKmerShannonIndex; size_t _nbMinimizers; //size_t _nbCores; SimkaStatistics* _stats; //SimkaDistance* _simkaDistance; string _banksInputFilename; vector _tempFilenamesToDelete; IBank* _banks; IProperties* _options; vector _bankNames; //vector _nbReadsPerDataset; string _outputFilenameSuffix; u_int64_t _totalKmers; vector _nbBankPerDataset; string _largerBankId; bool _computeSimpleDistances; bool _computeComplexDistances; bool _keepTmpFiles; //string _matDksNormFilename; //string _matDksPercFilename; //string _matAksNormFilename; //string _matAksPercFilename; //string _heatmapDksFilename; //string _heatmapAksFilename; /* gatb::core::tools::dp::IteratorListener* _progress; void setProgress (gatb::core::tools::dp::IteratorListener* 
progress) { SP_SETATTR(progress); } size_t _nbPartitions; std::vector > _nbKmersPerPartitionPerBank; vector > _nbk_per_radix_per_part;//number of kxmer per parti per rad Storage* _tmpPartitionsStorage; void setPartitionsStorage (Storage* tmpPartitionsStorage) { SP_SETATTR(tmpPartitionsStorage); } Partition* _tmpPartitions; void setPartitions (Partition* tmpPartitions) { SP_SETATTR(tmpPartitions); } vector _nbKmerPerPartitions; int getSizeofPerItem () const { return Type::getSize()/8 + sizeof(bankIdType); } std::vector getNbCoresList(); //this->_local_pInfo.incKmer_and_rad (p, radix_kxmer.getVal(), kx_size); //nb of superkmer per x per parti per radix //vector _speciesAbundancePerDataset; //MultiDiskStorage* _multiStorage; //u_int64_t _maxDisk; */ }; #endif /* TOOLS_SIMKA_SRC_SIMKAALGORITHM_HPP_ */ simka-1.5.1/src/core/SimkaAlgorithmTemplate.cpp.in000077500000000000000000000026341353413740300220740ustar00rootroot00000000000000/***************************************************************************** * Simka: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2015 INRIA * Authors: G.Benoit, C.Lemaitre, P.Peterlongo * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . 
*****************************************************************************/ #include // since we didn't define the functions in a .h file, that trick removes linker errors, // see http://www.parashift.com/c++-faq-lite/separate-template-class-defn-from-decl.html // (last example) // also, to reduce compilation time, I'm splitting it into several (8) files that will be compiled in parallel template class SimkaAlgorithm <${KSIZE}>;simka-1.5.1/src/core/SimkaCommons.hpp000066400000000000000000000250671353413740300174670ustar00rootroot00000000000000/* * SimkaCommons.h * * Created on: 24 juin 2017 * Author: gbenoit */ #ifndef SIMKA1_4_SRC_CORE_SIMKACOMMONS_HPP_ #define SIMKA1_4_SRC_CORE_SIMKACOMMONS_HPP_ #include const string STR_SIMKA_SOLIDITY_PER_DATASET = "-solidity-single"; const string STR_SIMKA_MAX_READS = "-max-reads"; const string STR_SIMKA_MIN_READ_SIZE = "-min-read-size"; const string STR_SIMKA_MIN_READ_SHANNON_INDEX = "-min-shannon-index"; const string STR_SIMKA_MIN_KMER_SHANNON_INDEX = "-kmer-shannon-index"; const string STR_KMER_PER_READ = "-kmer-per-read"; const string STR_SIMKA_COMPUTE_ALL_SIMPLE_DISTANCES= "-simple-dist"; const string STR_SIMKA_COMPUTE_ALL_COMPLEX_DISTANCES = "-complex-dist"; const string STR_SIMKA_KEEP_TMP_FILES = "-keep-tmp"; const string STR_SIMKA_COMPUTE_DATA_INFO = "-data-info"; class SimkaCommons { public: SimkaCommons(); virtual ~SimkaCommons(); static void checkInputValidity(const string& outputDirTemp, const string& inputFilename, u_int64_t& nbDatasets){ if(!System::file().doesExist(inputFilename)){ cout << "ERROR: Input does not exists (" + inputFilename + ")" << endl; exit(1); } nbDatasets = 0; bool error = false; //string inputDir = _outputDirTemp; // + "/input/"; ifstream inputFile(inputFilename.c_str()); //ofstream outputFileIds(_outputFilenameIds.c_str(), ios::binary); //_banksInputFilename = inputDir + "__input_simka__"; //_inputFilename + "_dsk_dataset_temp__"; //IFile* bankFile = 
System::file().newFile(_banksInputFilename, "wb"); string line; string linePart; vector lineIdDatasets; vector linepartPairedDatasets; vector linepartDatasets; //string bankFileContents = ""; u_int64_t lineIndex = 0; u_int64_t bankIdBytePos = 0; while(getline(inputFile, line)){ line.erase(std::remove(line.begin(),line.end(),' '),line.end()); if(line == "") continue; //cout << line << endl; lineIdDatasets.clear(); linepartPairedDatasets.clear(); //vector filenames; stringstream lineStream(line); while(getline(lineStream, linePart, ':')){ lineIdDatasets.push_back(linePart); } string bankId = lineIdDatasets[0]; string linePairedDatasets = lineIdDatasets[1]; stringstream linePairedDatasetsStream(linePairedDatasets); while(getline(linePairedDatasetsStream, linePart, ';')){ linepartPairedDatasets.push_back(linePart); } string subBankFilename = outputDirTemp + bankId; IFile* subBankFile = System::file().newFile(subBankFilename, "wb"); //cout << subBankFile->getPath() << endl; string subBankContents = ""; //_nbBankPerDataset.push_back(linepartPairedDatasets.size()); for(size_t i=0; ifwrite(subBankContents.c_str(), subBankContents.size(), 1); subBankFile->flush(); delete subBankFile; //bankFileContents += inputDir + "/" + bankId + "\n"; lineIndex += 1; try{ IBank* bank = Bank::open(subBankFilename); LOCAL(bank); nbDatasets += 1; } catch (Exception& e){ cerr << "ERROR: Can't open dataset: " << bankId << endl; error = true; } System::file().remove(subBankFilename); } inputFile.close(); if(error) exit(1); } }; template class SimkaInputIterator : public Iterator { public: /** Constructor. 
* \param[in] ref : the referred iterator * \param[in] initRef : will call 'first' on the reference if true */ SimkaInputIterator(Iterator* refs, size_t nbBanks, u_int64_t maxReads, Filter filter) : _filter(filter), _mainref(0) { setMainref(refs); _ref = _mainref->getComposition()[0]; _isDone = false; _nbDatasets = nbBanks; _nbBanks = _mainref->getComposition().size() / _nbDatasets; _maxReads = maxReads; _nbReadProcessed = 0; _currentBank = 0; _currentInternalBank = 0; _currentDataset = 0; } bool isFinished(){ if(_currentDataset == _nbDatasets){ _isDone = true; return true; } return false; } void nextDataset(){ _currentDataset += 1; if(isFinished()) return; _currentBank = _currentDataset * _nbBanks; _currentInternalBank = 0; _nbReadProcessed = 0; if(isFinished()) return; _ref = _mainref->getComposition()[_currentBank]; _isDone = false; first(); //nextBank(); } void nextBank(){ //cout << "next bank" << endl; //cout << "next bank "<< endl; _currentInternalBank += 1; if(_currentInternalBank == _nbBanks){ nextDataset(); } else{ _isDone = false; _currentBank += 1; _ref = _mainref->getComposition()[_currentBank]; first(); } } void first() { _ref->first(); while (!_ref->isDone() && _filter(_ref->item())==false) _ref->next(); _isDone = _ref->isDone(); if(!_isDone) *(this->_item) = _ref->item(); } void next(){ if(isFinished()){ _isDone = true; return; } //cout << "haha" << endl; _ref->next(); while (!_ref->isDone() && _filter(_ref->item())==false) _ref->next(); _isDone = _ref->isDone(); //cout << "haha" << endl; //if(!_isDone){ //cout << _currentBank << " " << _isDone << endl; //} //cout << _nbReadProcessed << " " << _currentBank << " " << _nbBanks << " " << _maxReads << endl; if(_isDone){ if(isFinished()){ //cout << _nbReadProcessed << endl; return; } else{ //cout << _nbReadProcessed << endl; nextBank(); if(isFinished()){ //cout << _nbReadProcessed << endl; return; } } } else{ *(this->_item) = _ref->item(); _nbReadProcessed += 1; } if(_maxReads && _nbReadProcessed >= 
_maxReads){ if(isFinished()) return; else nextDataset(); } } /** \copydoc Iterator::isDone */ bool isDone() { return _isDone; } /** \copydoc Iterator::item */ Item& item () { return *(this->_item); } private: bool _isDone; size_t _currentBank; //vector* > _refs; Iterator* _ref; size_t _nbBanks; u_int64_t _maxReads; Filter _filter; u_int64_t _nbReadProcessed; size_t _currentInternalBank; size_t _currentDataset; size_t _nbDatasets; Iterator* _mainref; void setMainref (Iterator* mainref) { SP_SETATTR(mainref); } }; struct SimkaSequenceFilter { //u_int64_t _maxNbReads; //u_int64_t _maxNbReadsPerBank; //u_int64_t _nbReadProcessed; //CancellableIterator* _it; //int* _bankIndex; //int* _datasetIndex; SimkaSequenceFilter(size_t minReadSize, double minShannonIndex){ //_maxNbReads = 0; //_nbReadProcessed = 0; _minReadSize = minReadSize; _minShannonIndex = minShannonIndex; } #ifdef BOOTSTRAP vector _bootstraps; void setBootstrap(vector& bootstraps){ _bootstraps = bootstraps; //for(size_t i=0; i<_bootstraps.size(); i++) // cout << _bootstraps[i]; //cout << endl << endl; } #endif //void setMaxReads(u_int64_t maxReads){ // _maxNbReads = maxReads; //} //void setIt(CancellableIterator* it){ // _it = it; //} bool operator() (Sequence& seq){ //cout << seq.toString() << endl; //cout << _nbReadProcessed << endl; //if(_maxNbReads != 0){ // if(_nbReadProcessed >= _maxNbReads){ // _it->_cancel = true; // return false; // } //} //cout << seq.getIndex() << " " << _nbReadProcessed << endl; #ifdef BOOTSTRAP int readPerBootstrap = _maxNbReads / MAX_BOOTSTRAP; int bootstrapIndex = seq.getIndex() / readPerBootstrap; if(!_bootstraps[bootstrapIndex]) return false; //cout << bootstrapIndex << endl; #endif if(!isReadSizeValid(seq)) return false; if(!isShannonIndexValid(seq)) return false; //cout << _nbReadProcessed << endl; //_nbReadProcessed += 1; return true; } bool isReadSizeValid(Sequence& seq){ if(_minReadSize == 0) return true; return seq.getDataSize() >= _minReadSize; } bool 
isShannonIndexValid(Sequence& seq){ if(_minShannonIndex == 0) return true; return getShannonIndex(seq) >= _minShannonIndex; } float getShannonIndex(Sequence& seq){ static char nt2binTab[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, //69 0, 3, 0, 0, 0, 0, 0, 0, 4, 0, //79 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; float index = 0; //float freq [5]; vector _freqs(5, 0); char* seqStr = seq.getDataBuffer(); // Frequency of each letter (A, C, G, T or N) for(size_t i=0; i < seq.getDataSize(); i++) _freqs[nt2binTab[(unsigned char)seqStr[i]]] += 1.0; // Shannon index calculation for (size_t i=0; i<_freqs.size(); i++){ _freqs[i] /= (float) seq.getDataSize(); if (_freqs[i] != 0) index += _freqs[i] * log (_freqs[i]) / log(2); } return abs(index); } size_t _minReadSize; double _minShannonIndex; }; template class SimkaPotaraBankFiltered : public BankDelegate { public: SimkaPotaraBankFiltered (IBank* ref, const Filter& filter, u_int64_t maxReads, size_t nbDatasets) : BankDelegate (ref), _ref2(0), _filter(filter) { _maxReads = maxReads; _nbDatasets = nbDatasets; setRef2(_ref->iterator ()); } ~SimkaPotaraBankFiltered(){ std::vector*> itBanks = _ref2->getComposition(); for(size_t i=0; i setRef2(0); } Iterator* iterator () { return new SimkaInputIterator (_ref2, _nbDatasets, _maxReads, _filter); } private: Iterator* _ref2; void setRef2 (Iterator* ref2) { SP_SETATTR(ref2); } u_int64_t _maxReads; Filter _filter; u_int64_t _nbReadToProcess; size_t _datasetId; size_t _nbDatasets; }; #endif /* SIMKA1_4_SRC_CORE_SIMKACOMMONS_H_ */ simka-1.5.1/src/core/SimkaDistance.cpp000077500000000000000000001224651353413740300176040ustar00rootroot00000000000000/***************************************************************************** * Simka: Fast kmer-based method for estimating the similarity between 
numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2015 INRIA * Authors: G.Benoit, C.Lemaitre, P.Peterlongo * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . *****************************************************************************/ #include "SimkaDistance.hpp" SimkaStatistics::SimkaStatistics(size_t nbBanks, bool computeSimpleDistances, bool computeComplexDistances, const string& tmpDir, const vector& datasetIds) { _nbBanks = nbBanks; _symetricDistanceMatrixSize = (_nbBanks*(_nbBanks+1))/2; _computeSimpleDistances = computeSimpleDistances; _computeComplexDistances = computeComplexDistances; //_nbBanks = 10000; _nbKmers = 0; _nbDistinctKmers = 0; _nbSolidKmers = 0; _nbErroneousKmers = 0; _nbSharedKmers = 0; //_abundanceMin = abundanceMin; //_mutex = mutex; //_outputDir = outputDir; _datasetNbReads.resize(_nbBanks, 0); _nbSolidDistinctKmersPerBank.resize(_nbBanks, 0); _nbSolidKmersPerBank.resize(_nbBanks, 0); _nbKmersPerBank.resize(_nbBanks, 0); //_nbDistinctKmersSharedByBanksThreshold.resize(_nbBanks, 0); //_nbKmersSharedByBanksThreshold.resize(_nbBanks, 0); _matrixNbDistinctSharedKmers.resize(_symetricDistanceMatrixSize); _matrixNbSharedKmers.resize(_nbBanks); _brayCurtisNumerator.resize(_symetricDistanceMatrixSize); for(size_t i=0; i<_nbBanks; i++){ //_matrixNbDistinctSharedKmers[i].resize(nbBanks, 0); _matrixNbSharedKmers[i].resize(nbBanks, 0); 
//_brayCurtisNumerator[i].resize(nbBanks, 0); //_kullbackLeibler[i].resize(nbBanks, 0); } if(_computeSimpleDistances){ //_abundance_jaccard_intersection.resize(_nbBanks); //for(size_t i=0; i<_nbBanks; i++){ // _abundance_jaccard_intersection[i].resize(nbBanks, 0); //} _chord_NiNj.resize(_nbBanks); _chord_sqrt_N2.resize(_nbBanks); //_chord_N2j.resize(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ _chord_NiNj[i].resize(nbBanks, 0); //_chord_N2i[i].resize(nbBanks, 0); //_chord_N2j[i].resize(nbBanks, 0); } _hellinger_SqrtNiNj.resize(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ _hellinger_SqrtNiNj[i].resize(nbBanks, 0); } _kulczynski_minNiNj.resize(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ _kulczynski_minNiNj[i].resize(nbBanks, 0); } } if(_computeComplexDistances){ _whittaker_minNiNj.resize(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ _whittaker_minNiNj[i].resize(nbBanks, 0); } _kullbackLeibler.resize(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ _kullbackLeibler[i].resize(nbBanks, 0); } _canberra.resize(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ _canberra[i].resize(nbBanks, 0); } } _totalReads = 0; for(size_t i=0; i<_nbBanks; i++){ string name = datasetIds[i]; string countFilename = tmpDir + "/count_synchro/" + name + ".ok"; string line; ifstream file(countFilename.c_str()); vector lines; while(getline(file, line)){ if(line == "") continue; lines.push_back(line); } file.close(); u_int64_t nbReads = strtoull(lines[0].c_str(), NULL, 10); _datasetNbReads[i] = nbReads; _nbSolidDistinctKmersPerBank[i] = strtoull(lines[1].c_str(), NULL, 10); _nbSolidKmersPerBank[i] = strtoull(lines[2].c_str(), NULL, 10); if(_computeSimpleDistances){ _chord_sqrt_N2[i] = sqrt(strtoull(lines[3].c_str(), NULL, 10)); } _totalReads += nbReads; /* for (size_t j=0; j<_nbCores; j++){ DistanceCommand* cmd = dynamic_cast*>(_cmds[j]); cmd->_stats->_datasetNbReads[i] = nbReads; cmd->_stats->_nbSolidDistinctKmersPerBank[i] = strtoull(lines[1].c_str(), NULL, 10); cmd->_stats->_nbSolidKmersPerBank[i] = 
strtoull(lines[2].c_str(), NULL, 10); cmd->_stats->_chord_sqrt_N2[i] = sqrt(strtoull(lines[3].c_str(), NULL, 10)); }*/ } } SimkaStatistics& SimkaStatistics::operator+= (const SimkaStatistics& other){ _nbKmers += other._nbKmers; _nbDistinctKmers += other._nbDistinctKmers; _nbSolidKmers += other._nbSolidKmers; _nbErroneousKmers += other._nbErroneousKmers; _nbSharedKmers += other._nbSharedKmers; for(size_t i=0; i<_nbBanks; i++){ _nbKmersPerBank[i] += other._nbKmersPerBank[i]; //_nbSolidDistinctKmersPerBank[i] += other._nbSolidDistinctKmersPerBank[i]; //_nbSolidKmersPerBank[i] += other._nbSolidKmersPerBank[i]; //_nbDistinctKmersSharedByBanksThreshold[i] += other._nbDistinctKmersSharedByBanksThreshold[i]; //_nbKmersSharedByBanksThreshold[i] += other._nbKmersSharedByBanksThreshold[i]; //if(_distanceParams._computeChord) //_chord_sqrt_N2[i] += other._chord_sqrt_N2[i]; } for(size_t i=0; i<_symetricDistanceMatrixSize; i++){ _brayCurtisNumerator[i] += other._brayCurtisNumerator[i]; _matrixNbDistinctSharedKmers[i] += other._matrixNbDistinctSharedKmers[i]; } for(size_t i=0; i<_nbBanks; i++){ for(size_t j=0; j<_nbBanks; j++){ _matrixNbSharedKmers[i][j] += other._matrixNbSharedKmers[i][j]; } } if(_computeSimpleDistances){ for(size_t i=0; i<_nbBanks; i++){ for(size_t j=0; j<_nbBanks; j++){ _chord_NiNj[i][j] += other._chord_NiNj[i][j]; _hellinger_SqrtNiNj[i][j] += other._hellinger_SqrtNiNj[i][j]; _kulczynski_minNiNj[i][j] += other._kulczynski_minNiNj[i][j]; } } } if(_computeComplexDistances){ for(size_t i=0; i<_nbBanks; i++){ for(size_t j=0; j<_nbBanks; j++){ _canberra[i][j] += other._canberra[i][j]; _whittaker_minNiNj[i][j] += other._whittaker_minNiNj[i][j]; _kullbackLeibler[i][j] += other._kullbackLeibler[i][j]; } } } return *this; } void SimkaStatistics::print(){ u_int64_t nbKmers = 0; u_int64_t nbDistinctKmersAfterMerging = _nbDistinctKmers; u_int64_t nbDistinctKmers = 0; u_int64_t nbSharedDistinctKmers = _nbSharedKmers; u_int64_t nbSharedKmers = 0; double meanCoverage = 0; 
for(size_t i=0; i<_nbBanks; i++){ nbKmers += _nbSolidKmersPerBank[i]; nbDistinctKmers += _nbSolidDistinctKmersPerBank[i]; float coverage = (double)_nbSolidKmersPerBank[i] / (double)_nbSolidDistinctKmersPerBank[i]; //cout << coverage << endl; meanCoverage += coverage; //nbDistinctKmers += _nbDistinctKmers; //for(size_t j=i+1; j<_nbBanks; j++){ // nbSharedDistinctKmers += _matrixNbDistinctSharedKmers[i][j]; // nbSharedKmers += _matrixNbSharedKmers[i][j]; //} } meanCoverage /= _nbBanks; u_int64_t totalReads = 0; u_int64_t minReads = -1; u_int64_t maxReads = 0; for (size_t i=0; i<_nbBanks; i++){ u_int64_t nbReads = _datasetNbReads[i]; //nbReads /= _nbBankPerDataset[i]; totalReads += nbReads; if(nbReads < minReads){ minReads = nbReads; //_smallerBankId = _bankNames[i]; } if(nbReads > maxReads){ maxReads = nbReads; } } u_int64_t meanReads = totalReads / _nbBanks; cout << endl << "Stats" << endl; cout << "\tReads" << endl; cout << "\t\tTotal: " << totalReads << " " << totalReads/1000000 << "M" << " " << totalReads/1000000000 << "G" << endl; cout << "\t\tMin: " << minReads << " " << minReads/1000000 << "M" << " " << minReads/1000000000 << "G" << endl; cout << "\t\tMax: " << maxReads << " " << maxReads/1000000 << "M" << " " << maxReads/1000000000 << "G" << endl; cout << "\t\tAverage: " << meanReads << " " << meanReads/1000000 << "M" << " " << meanReads/1000000000 << "G" << endl; cout << "\tKmers" << endl; cout << "\t\tDistinct Kmers (before merging): " << nbDistinctKmers << " " << nbDistinctKmers/1000000 << "M" << " " << nbDistinctKmers/1000000000 << "G" << endl; cout << "\t\tDistinct Kmers (after merging): " << nbDistinctKmersAfterMerging << " " << nbDistinctKmersAfterMerging/1000000 << "M" << " " << nbDistinctKmersAfterMerging/1000000000 << "G" << endl; cout << "\t\tShared distinct Kmers: " << nbSharedDistinctKmers << " " << nbSharedDistinctKmers/1000000 << "M" << " " << nbSharedDistinctKmers/1000000000 << "G" << endl; cout << "\t\tKmers: " << nbKmers << " " << 
nbKmers/1000000 << "M" << " " << nbKmers/1000000000 << "G" << endl; cout << "\t\tMean k-mer coverage: " << meanCoverage << endl; //cout << "\t\tShared distinct kmers: " << (int)((long double) nbSharedDistinctKmers / (long double)nbDistinctKmers * 100) << "% " << nbSharedDistinctKmers << " " << nbSharedDistinctKmers/1000000 << "M" << " " << nbSharedDistinctKmers/1000000000 << "G" << endl; //cout << "\t\tShared kmers: " << (int)((long double) nbSharedKmers / (long double)nbKmers * 100) << "% " << nbSharedKmers << " " << nbSharedKmers/1000000 << "M" << " " << nbSharedKmers/1000000000 << "G" << endl; cout << endl; return; //cout.precision(4); cout << endl << endl; //return; u_int64_t solidAbundance = 0; //for(int i=0; i<_nbSolidKmersPerBankAbundance.size(); i++) // solidAbundance += _nbSolidKmersPerBankAbundance[i]; for(size_t i=0; i<_nbKmersSharedByBanksThreshold.size(); i++) solidAbundance += _nbKmersSharedByBanksThreshold[i]; cout << "Statistics on kmer intersections:" << endl; cout << "\tNb kmers: " << _nbKmers << " " << _nbKmers / 1000000 << " M" << " " << _nbKmers / 1000000000 << " G" << endl; cout << endl; cout << "\tNb distinct kmers: " << _nbDistinctKmers << " " << _nbDistinctKmers / 1000000 << " M" << " " << _nbDistinctKmers / 1000000000 << " G" << " " << (100*_nbDistinctKmers)/(float)_nbKmers << "%" << endl; cout << "\tNb solid kmers: " << _nbSolidKmers << " " << _nbSolidKmers / 1000000 << " M" << " " << _nbSolidKmers / 1000000000 << " G" << " " << (100*_nbSolidKmers)/(float)_nbDistinctKmers << "% distinct" << " " << (100*solidAbundance) / (double)_nbKmers << "% abundance" << endl; //for(int i=0; i<_nbBanks; i++){ //cout << "Nb kmers (M) " << i << ": " << _nbSolidKmersPerBank[i] << endl << endl; //} cout << endl; cout << "\tPotentially erroneous (Kmers appearing only one time in a single bank): " << endl; cout << "\t\t" << _nbErroneousKmers << " " << _nbErroneousKmers / 1000000 << " M" << " " << _nbErroneousKmers / 1000000000 << " G" << " " << 
(100*_nbErroneousKmers)/(float)_nbDistinctKmers << "% distinct" << " " << (100*_nbErroneousKmers)/(float)_nbKmers << "% abundance" << endl; cout << endl; cout << "\tKmer shared by T banks :" << endl; for(size_t i=0; i<_nbBanks; i++){ cout << "\t\tShared by " << i+1 << " banks:"; cout << endl; cout << "\t\t\tDistinct: " << _nbDistinctKmersSharedByBanksThreshold[i] << " "; if(_nbSolidKmers > 0){ cout << (_nbDistinctKmersSharedByBanksThreshold[i]*100) / (float)_nbSolidKmers << "%"; } else{ cout << "0%"; } cout << endl; cout << "\t\t\tAbundance: " << _nbKmersSharedByBanksThreshold[i] << " "; if(solidAbundance > 0){ cout << (_nbKmersSharedByBanksThreshold[i]*100) / (float)solidAbundance << "%"; } else{ cout << "0%"; } if(_nbDistinctKmersSharedByBanksThreshold[i] > 0){ cout << endl; cout << "\t\t\tMean abundance per bank: " << _nbKmersSharedByBanksThreshold[i] / _nbDistinctKmersSharedByBanksThreshold[i] / (float) _nbBanks; } cout << endl; } //cout << endl; //cout << "Nb kmers in all banks (max/min > 10): " << _nbKmersInCoupleBankSupRatio << " " << (_nbKmersInCoupleBankSupRatio*100) / (float)_nbSolidKmers << "%" << endl; cout << endl << endl; } void SimkaStatistics::load(const string& filename){ IterableGzFile* file = new IterableGzFile(filename); Iterator* it = file->iterator(); LOCAL(it); it->first(); //_nbBanks = it->item(); it->next(); _computeSimpleDistances = it->item(); it->next(); _computeComplexDistances = it->item(); it->next(); //cout << _computeSimpleDistances << " " << _computeComplexDistances << endl; _nbKmers = it->item(); it->next(); _nbErroneousKmers = it->item(); it->next(); _nbDistinctKmers = it->item(); it->next(); _nbSolidKmers = it->item(); it->next(); _nbSharedKmers = it->item(); it->next(); for(size_t i=0; i<_nbBanks; i++){ _nbSolidDistinctKmersPerBank[i] = it->item(); it->next();} for(size_t i=0; i<_nbBanks; i++){ _nbKmersPerBank[i] = it->item(); it->next();} for(size_t i=0; i<_nbBanks; i++){ _nbSolidKmersPerBank[i] = it->item(); it->next();} 
//for(size_t i=0; i<_nbBanks; i++){ _nbDistinctKmersSharedByBanksThreshold[i] = it->item(); it->next();} //for(size_t i=0; i<_nbBanks; i++){ _nbKmersSharedByBanksThreshold[i] = it->item(); it->next();} for(size_t i=0; i<_nbBanks; i++){ //cout << i << endl; //cout << _nbBanks << endl; //cout << _matrixNbDistinctSharedKmers[i].size() << endl; for(size_t j=0; j<_nbBanks; j++){ _matrixNbSharedKmers[i][j] = it->item(); it->next();} //for(size_t j=0; j<_nbBanks; j++){ _abundance_jaccard_intersection[i][j] = it->item(); it->next();} } for(size_t i=0; i<_symetricDistanceMatrixSize; i++){ _matrixNbDistinctSharedKmers[i] = it->item(); it->next(); _brayCurtisNumerator[i] = it->item(); it->next(); } if(_computeSimpleDistances){ for(size_t i=0; i<_nbBanks; i++){ _chord_sqrt_N2[i] = it->item(); it->next();} for(size_t i=0; i<_nbBanks; i++){ for(size_t j=0; j<_nbBanks; j++){ _chord_NiNj[i][j] = it->item(); it->next();} for(size_t j=0; j<_nbBanks; j++){ _hellinger_SqrtNiNj[i][j] = it->item(); it->next();} for(size_t j=0; j<_nbBanks; j++){ _kulczynski_minNiNj[i][j] = it->item(); it->next();} } } if(_computeComplexDistances){ for(size_t i=0; i<_nbBanks; i++){ for(size_t j=0; j<_nbBanks; j++){ _canberra[i][j] = it->item(); it->next();} for(size_t j=0; j<_nbBanks; j++){ _whittaker_minNiNj[i][j] = it->item(); it->next();} for(size_t j=0; j<_nbBanks; j++){ _kullbackLeibler[i][j] = it->item(); it->next();} } } delete file; /* Storage::istream is (group, "simkaStats"); //is.read ((char*)&_nbBanks, sizeof(_nbBanks)); is.read ((char*)&_nbKmers, sizeof(_nbKmers)); is.read ((char*)&_nbErroneousKmers, sizeof(_nbErroneousKmers)); is.read ((char*)&_nbDistinctKmers, sizeof(_nbDistinctKmers)); is.read ((char*)&_nbSolidKmers, sizeof(_nbSolidKmers)); is.read ((char*)_nbSolidDistinctKmersPerBank.data(), sizeof(u_int64_t)*_nbBanks); is.read ((char*)_nbKmersPerBank.data(), sizeof(u_int64_t)*_nbBanks); is.read ((char*)_nbSolidKmersPerBank.data(), sizeof(u_int64_t)*_nbBanks); is.read 
((char*)_nbDistinctKmersSharedByBanksThreshold.data(), sizeof(u_int64_t)*_nbBanks); is.read ((char*)_nbKmersSharedByBanksThreshold.data(), sizeof(u_int64_t)*_nbBanks); for(size_t i=0; i<_nbBanks; i++){ is.read ((char*)_matrixNbDistinctSharedKmers[i].data(), sizeof(u_int64_t)*_nbBanks); is.read ((char*)_matrixNbSharedKmers[i].data(), sizeof(u_int64_t)*_nbBanks); } //is.read ((char*)&_distanceParams._computeBrayCurtis, sizeof(_distanceParams._computeBrayCurtis)); //is.read ((char*)&_distanceParams._computeCanberra, sizeof(_distanceParams._computeCanberra)); //is.read ((char*)&_distanceParams._computeChord, sizeof(_distanceParams._computeChord)); //is.read ((char*)&_distanceParams._computeHellinger, sizeof(_distanceParams._computeHellinger)); //is.read ((char*)&_distanceParams._computeKulczynski, sizeof(_distanceParams._computeKulczynski)); //if(_distanceParams._computeBrayCurtis) for(size_t i=0; i<_nbBanks; i++) is.read ((char*)_brayCurtisNumerator[i].data(), sizeof(u_int64_t)*_nbBanks); //if(_distanceParams._computeCanberra) for(size_t i=0; i<_nbBanks; i++) is.read ((char*)_canberra[i].data(), sizeof(u_int64_t)*_nbBanks); //if(_distanceParams._computeChord){ is.read ((char*)_chord_N2.data(), sizeof(u_int64_t)*_nbBanks); for(size_t i=0; i<_nbBanks; i++) is.read ((char*)_chord_NiNj[i].data(), sizeof(u_int64_t)*_nbBanks); //} //if(_distanceParams._computeHellinger) for(size_t i=0; i<_nbBanks; i++) is.read ((char*)_hellinger_SqrtNiNj[i].data(), sizeof(u_int64_t)*_nbBanks); //if(_distanceParams._computeKulczynski) for(size_t i=0; i<_nbBanks; i++) is.read ((char*)_kulczynski_minNiNj[i].data(), sizeof(u_int64_t)*_nbBanks); */ } void SimkaStatistics::save (const string& filename){ BagGzFile* file = new BagGzFile(filename); //file->insert(_nbBanks); file->insert((long double)_computeSimpleDistances); file->insert((long double)_computeComplexDistances); file->insert((long double)_nbKmers); file->insert((long double)_nbErroneousKmers); file->insert((long 
double)_nbDistinctKmers); file->insert((long double)_nbSolidKmers); file->insert((long double)_nbSharedKmers); for(size_t i=0; i<_nbBanks; i++){ file->insert((long double)_nbSolidDistinctKmersPerBank[i]);} for(size_t i=0; i<_nbBanks; i++){ file->insert((long double)_nbKmersPerBank[i]);} for(size_t i=0; i<_nbBanks; i++){ file->insert((long double)_nbSolidKmersPerBank[i]);} //for(size_t i=0; i<_nbBanks; i++){ file->insert((long double)_nbDistinctKmersSharedByBanksThreshold[i]);} //for(size_t i=0; i<_nbBanks; i++){ file->insert((long double)_nbKmersSharedByBanksThreshold[i]);} for(size_t i=0; i<_nbBanks; i++){ //cout << i << endl; //cout << _nbBanks << endl; //cout << _matrixNbDistinctSharedKmers[i].size() << endl; for(size_t j=0; j<_nbBanks; j++){ file->insert((long double)_matrixNbSharedKmers[i][j]);} //for(size_t j=0; j<_nbBanks; j++){ file->insert((long double)_abundance_jaccard_intersection[i][j]);} } for(size_t i=0; i<_symetricDistanceMatrixSize; i++){ file->insert((long double)_matrixNbDistinctSharedKmers[i]); file->insert((long double)_brayCurtisNumerator[i]); } if(_computeSimpleDistances){ for(size_t i=0; i<_nbBanks; i++){ file->insert((long double)_chord_sqrt_N2[i]);} for(size_t i=0; i<_nbBanks; i++){ for(size_t j=0; j<_nbBanks; j++){ file->insert((long double)_chord_NiNj[i][j]);} for(size_t j=0; j<_nbBanks; j++){ file->insert((long double)_hellinger_SqrtNiNj[i][j]);} for(size_t j=0; j<_nbBanks; j++){ file->insert((long double)_kulczynski_minNiNj[i][j]);} } } if(_computeComplexDistances){ for(size_t i=0; i<_nbBanks; i++){ for(size_t j=0; j<_nbBanks; j++){ file->insert((long double)_canberra[i][j]);} for(size_t j=0; j<_nbBanks; j++){ file->insert((long double)_whittaker_minNiNj[i][j]);} for(size_t j=0; j<_nbBanks; j++){ file->insert((long double)_kullbackLeibler[i][j]);} } } /* file->insert(_nbKmersPerBank, 0); file->insert(_nbSolidKmersPerBank, 0); file->insert(_nbDistinctKmersSharedByBanksThreshold, 0); file->insert(_nbKmersSharedByBanksThreshold, 0); 
file->insert(_chord_N2, 0); for(size_t i=0; i<_nbBanks; i++){ file->insert(_matrixNbDistinctSharedKmers[i], 0); file->insert(_matrixNbSharedKmers[i], 0); file->insert(_brayCurtisNumerator[i], 0); file->insert(_canberra[i], 0); file->insert(_chord_NiNj[i], 0); file->insert(_hellinger_SqrtNiNj[i], 0); file->insert(_whittaker_minNiNj[i], 0); //cout << _kullbackLeibler[i][j] << endl; //file->insert(_kullbackLeibler[i], 0); file->insert(_kulczynski_minNiNj[i], 0); //for(size_t j=0; j<_nbBanks; j++){ // cout << _kullbackLeibler[i][j] << endl; //} }*/ file->flush(); delete file; /* cout << "loulou2" << endl; Storage::ostream os (group, "simkaStats"); cout << "loulou3" << endl; //os.write ((const char*)&_nbBanks, sizeof(_nbBanks)); os.write ((const char*)&_nbKmers, sizeof(_nbKmers)); os.write ((const char*)&_nbErroneousKmers, sizeof(_nbErroneousKmers)); os.write ((const char*)&_nbDistinctKmers, sizeof(_nbDistinctKmers)); os.write ((const char*)&_nbSolidKmers, sizeof(_nbSolidKmers)); cout << "loulou4" << endl; os.write ((const char*)_nbSolidDistinctKmersPerBank.data(), sizeof(u_int64_t)*_nbBanks); os.write ((const char*)_nbKmersPerBank.data(), sizeof(u_int64_t)*_nbBanks); os.write ((const char*)_nbSolidKmersPerBank.data(), sizeof(u_int64_t)*_nbBanks); os.write ((const char*)_nbDistinctKmersSharedByBanksThreshold.data(), sizeof(u_int64_t)*_nbBanks); os.write ((const char*)_nbKmersSharedByBanksThreshold.data(), sizeof(u_int64_t)*_nbBanks); cout << "loulou5" << endl; for(size_t i=0; i<_nbBanks; i++){ os.write ((const char*)_matrixNbDistinctSharedKmers[i].data(), sizeof(u_int64_t)*_nbBanks); os.write ((const char*)_matrixNbSharedKmers[i].data(), sizeof(u_int64_t)*_nbBanks); } //os.write ((const char*)&_distanceParams._computeBrayCurtis, sizeof(_distanceParams._computeBrayCurtis)); //os.write ((const char*)&_distanceParams._computeCanberra, sizeof(_distanceParams._computeCanberra)); //os.write ((const char*)&_distanceParams._computeChord, sizeof(_distanceParams._computeChord)); 
//os.write ((const char*)&_distanceParams._computeHellinger, sizeof(_distanceParams._computeHellinger)); //os.write ((const char*)&_distanceParams._computeKulczynski, sizeof(_distanceParams._computeKulczynski)); cout << "loulou6" << endl; //if(_distanceParams._computeBrayCurtis) for(size_t i=0; i<_nbBanks; i++) os.write ((const char*)_brayCurtisNumerator[i].data(), sizeof(u_int64_t)*_nbBanks); cout << "loulou7" << endl; //if(_distanceParams._computeCanberra) for(size_t i=0; i<_nbBanks; i++) os.write ((const char*)_canberra[i].data(), sizeof(u_int64_t)*_nbBanks); cout << "loulou8" << endl; //if(_distanceParams._computeChord){ os.write ((const char*)_chord_N2.data(), sizeof(u_int64_t)*_nbBanks); for(size_t i=0; i<_nbBanks; i++) os.write ((const char*)_chord_NiNj[i].data(), sizeof(u_int64_t)*_nbBanks); //} cout << "loulou9" << endl; //if(_distanceParams._computeHellinger) for(size_t i=0; i<_nbBanks; i++) os.write ((const char*)_hellinger_SqrtNiNj[i].data(), sizeof(u_int64_t)*_nbBanks); cout << "loulou10" << endl; //if(_distanceParams._computeKulczynski) for(size_t i=0; i<_nbBanks; i++) os.write ((const char*)_kulczynski_minNiNj[i].data(), sizeof(u_int64_t)*_nbBanks); cout << "loulou11" << endl; os.flush();*/ } void SimkaStatistics::outputMatrix(const string& outputDir, const vector& bankNames){ SimkaDistance _simkaDistance(*this); _outputFilenameSuffix = ""; char buffer[200]; //string strKmerSize = "_k"; //snprintf(buffer,200,"%llu",_kmerSize); //strKmerSize += string(buffer); //_outputFilenameSuffix += strKmerSize; dumpMatrix(outputDir, bankNames, "mat_presenceAbsence_chord", _simkaDistance._matrix_presenceAbsence_chordHellinger()); dumpMatrix(outputDir, bankNames, "mat_presenceAbsence_whittaker", _simkaDistance._matrix_presenceAbsence_Whittaker()); dumpMatrix(outputDir, bankNames, "mat_presenceAbsence_kulczynski", _simkaDistance._matrix_presenceAbsence_kulczynski()); dumpMatrix(outputDir, bankNames, "mat_presenceAbsence_braycurtis", 
_simkaDistance._matrix_presenceAbsence_sorensenBrayCurtis()); dumpMatrix(outputDir, bankNames, "mat_presenceAbsence_jaccard", _simkaDistance._matrix_presenceAbsence_jaccardCanberra()); dumpMatrix(outputDir, bankNames, "mat_presenceAbsence_simka-jaccard", _simkaDistance._matrix_presenceAbsence_jaccard_simka()); dumpMatrix(outputDir, bankNames, "mat_presenceAbsence_simka-jaccard_asym", _simkaDistance._matrix_presenceAbsence_jaccard_simka_asym()); dumpMatrix(outputDir, bankNames, "mat_presenceAbsence_ochiai", _simkaDistance._matrix_presenceAbsence_ochiai()); dumpMatrix(outputDir, bankNames, "mat_abundance_simka-jaccard", _simkaDistance._matrixSymJaccardAbundance()); dumpMatrix(outputDir, bankNames, "mat_abundance_simka-jaccard_asym", _simkaDistance._matrixAsymJaccardAbundance()); dumpMatrix(outputDir, bankNames, "mat_abundance_ab-ochiai", _simkaDistance._matrixOchiai()); dumpMatrix(outputDir, bankNames, "mat_abundance_ab-sorensen", _simkaDistance._matrixSorensen()); dumpMatrix(outputDir, bankNames, "mat_abundance_ab-jaccard", _simkaDistance._matrixJaccardAbundance()); const vector >& matrix = _simkaDistance._matrixBrayCurtis(); dumpMatrix(outputDir, bankNames, "mat_abundance_braycurtis", matrix); dumpMatrix(outputDir, bankNames, "mat_abundance_jaccard", _simkaDistance.computeJaccardDistanceFromBrayCurtis(matrix)); if(_computeSimpleDistances){ //dumpMatrix(outputDir, bankNames, "mat_abundance_braycurtis-simple", _simkaDistance._matrixJaccardIntersection); dumpMatrix(outputDir, bankNames, "mat_abundance_chord", _simkaDistance._matrixChord()); dumpMatrix(outputDir, bankNames, "mat_abundance_hellinger", _simkaDistance._matrixHellinger()); dumpMatrix(outputDir, bankNames, "mat_abundance_kulczynski", _simkaDistance._matrixKulczynski()); } if(_computeComplexDistances){ dumpMatrix(outputDir, bankNames, "mat_abundance_whittaker", _simkaDistance._matrixWhittaker()); dumpMatrix(outputDir, bankNames, "mat_abundance_jensenshannon", _simkaDistance._matrixKullbackLeibler()); 
dumpMatrix(outputDir, bankNames, "mat_abundance_canberra", _simkaDistance._matrixCanberra()); } } void SimkaStatistics::dumpMatrix(const string& outputDir, const vector& bankNames, const string& outputFilename, const vector >& matrix){ string filename = outputDir + "/" + outputFilename + ".csv"; gzFile out = gzopen((filename + ".gz").c_str(),"wb"); //char buffer[200]; string str; for(size_t i=0; ifwrite(str.c_str(), str.size(), 1); //file->flush(); //delete file; } SimkaDistance::SimkaDistance(SimkaStatistics& stats) : _stats(stats){ _nbBanks = _stats._nbBanks; //AnB is symetrical //for(size_t i=0; i<_nbBanks; i++) // for(size_t j=i+1; j<_nbBanks; j++) // _stats._matrixNbDistinctSharedKmers[j][i] = _stats._matrixNbDistinctSharedKmers[i][j]; /* u_int64_t a, b, c; u_int64_t b; u_int64_t c; _matrixJaccardAbundance = createSquaredMatrix(_nbBanks); _matrixBrayCurtis = createSquaredMatrix(_nbBanks); //_matrixJaccardIntersection = createSquaredMatrix(_nbBanks); _matrixSymJaccardAbundance = createSquaredMatrix(_nbBanks); _matrixAsymJaccardAbundance = createSquaredMatrix(_nbBanks); _matrixOchiai = createSquaredMatrix(_nbBanks); _matrixSorensen = createSquaredMatrix(_nbBanks); _matrix_presenceAbsence_sorensenBrayCurtis = createSquaredMatrix(_nbBanks); _matrix_presenceAbsence_Whittaker = createSquaredMatrix(_nbBanks); _matrix_presenceAbsence_kulczynski = createSquaredMatrix(_nbBanks); _matrix_presenceAbsence_ochiai = createSquaredMatrix(_nbBanks); _matrix_presenceAbsence_chordHellinger = createSquaredMatrix(_nbBanks); _matrix_presenceAbsence_jaccardCanberra = createSquaredMatrix(_nbBanks); _matrix_presenceAbsence_jaccard_simka = createSquaredMatrix(_nbBanks); _matrix_presenceAbsence_jaccard_simka_asym = createSquaredMatrix(_nbBanks); double dist = 0; for(size_t i=0; i<_nbBanks; i++){ //SpeciesAbundanceVectorType& X_i = _stats._speciesAbundancePerDataset[i]; //for(size_t j=0; j<_nbBanks; j++){ for(size_t j=i+1; j<_nbBanks; j++){ //SpeciesAbundanceVectorType& X_j = 
_stats._speciesAbundancePerDataset[j]; get_abc(i, j, a, b ,c); //PresenceAbsence chord hellinger dist = distance_presenceAbsence_chordHellinger(a, b, c); _matrix_presenceAbsence_chordHellinger[i][j] = dist; _matrix_presenceAbsence_chordHellinger[j][i] = dist; //Presence Absence Ochiai dist = distance_presenceAbsence_ochiai(a, b, c); _matrix_presenceAbsence_ochiai[i][j] = dist; _matrix_presenceAbsence_ochiai[j][i] = dist; //PresenceAbsence Jaccard Canberra dist = distance_presenceAbsence_jaccardCanberra(a, b, c); _matrix_presenceAbsence_jaccardCanberra[i][j] = dist; _matrix_presenceAbsence_jaccardCanberra[j][i] = dist; //PresenceAbsence Jaccard Simka dist = distance_presenceAbsence_jaccard_simka(i, j, SYMETRICAL); _matrix_presenceAbsence_jaccard_simka[i][j] = dist; _matrix_presenceAbsence_jaccard_simka[j][i] = dist; _matrix_presenceAbsence_jaccard_simka_asym[i][j] = distance_presenceAbsence_jaccard_simka(i, j, ASYMETRICAL); _matrix_presenceAbsence_jaccard_simka_asym[j][i] = distance_presenceAbsence_jaccard_simka(j, i, ASYMETRICAL); //PresenceAbsence Sorensen BrayCurtis dist = distance_presenceAbsence_sorensenBrayCurtis(a, b, c); _matrix_presenceAbsence_sorensenBrayCurtis[i][j] = dist; _matrix_presenceAbsence_sorensenBrayCurtis[j][i] = dist; //PresenceAbsence Whittaker dist = distance_presenceAbsence_whittaker(a, b, c); _matrix_presenceAbsence_Whittaker[i][j] = dist; _matrix_presenceAbsence_Whittaker[j][i] = dist; //PresenceAbsence kulczynski dist = distance_presenceAbsence_kulczynski(a, b, c); _matrix_presenceAbsence_kulczynski[i][j] = dist; _matrix_presenceAbsence_kulczynski[j][i] = dist; //Abundance Ochiai dist = distance_abundance_ochiai(i, j); _matrixOchiai[i][j] = dist; _matrixOchiai[j][i] = dist; //Abundance Sorensen dist = distance_abundance_sorensen(i, j); _matrixSorensen[i][j] = dist; _matrixSorensen[j][i] = dist; //Abundance Jaccard dist = distance_abundance_jaccard(i, j); _matrixJaccardAbundance[i][j] = dist; _matrixJaccardAbundance[j][i] = dist; 
//Abundance Jaccard Simka dist = distance_abundance_jaccard_simka(i, j, SYMETRICAL); _matrixSymJaccardAbundance[i][j] = dist; _matrixSymJaccardAbundance[j][i] = dist; _matrixAsymJaccardAbundance[i][j] = distance_abundance_jaccard_simka(i, j, ASYMETRICAL); _matrixAsymJaccardAbundance[j][i] = distance_abundance_jaccard_simka(j, i, ASYMETRICAL); //Abundance bray-curtis dist = distance_abundance_brayCurtis(i,j); _matrixBrayCurtis[i][j] = dist; _matrixBrayCurtis[j][i] = dist; //Abundance Jaccard Intersection //dist = distance_abundance_jaccard_intersection(i, j); //_matrixJaccardIntersection[i][j] = dist; //_matrixJaccardIntersection[j][i] = dist; } } if(_stats._computeSimpleDistances){ _matrixChord = createSquaredMatrix(_nbBanks); _matrixHellinger = createSquaredMatrix(_nbBanks); _matrixKulczynski = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ //Abundance Chord dist = distance_abundance_chord(i, j); _matrixChord[i][j] = dist; _matrixChord[j][i] = dist; //Abundance Hellinger dist = distance_abundance_hellinger(i, j); _matrixHellinger[i][j] = dist; _matrixHellinger[j][i] = dist; //Abundance Kulczynski dist = distance_abundance_kulczynski(i, j); _matrixKulczynski[i][j] = dist; _matrixKulczynski[j][i] = dist; } } } if(_stats._computeComplexDistances){ _matrixCanberra = createSquaredMatrix(_nbBanks); _matrixWhittaker = createSquaredMatrix(_nbBanks); _matrixKullbackLeibler = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ //Abundance Whittaker dist = distance_abundance_whittaker(i, j); _matrixWhittaker[i][j] = dist; _matrixWhittaker[j][i] = dist; //Abundance Kullback Leibler dist = distance_abundance_kullbackLeibler(i, j); _matrixKullbackLeibler[i][j] = dist; _matrixKullbackLeibler[j][i] = dist; //Abundance Canberra dist = distance_abundance_canberra(i, j, a, b, c); _matrixCanberra[i][j] = dist; _matrixCanberra[j][i] = dist; } } } */ } vector > 
SimkaDistance::createSquaredMatrix(size_t n){ vector > matrix; matrix.resize(n); for(size_t i=0; i. *****************************************************************************/ #ifndef TOOLS_SIMKA_SRC_SIMKADISTANCE_HPP_ #define TOOLS_SIMKA_SRC_SIMKADISTANCE_HPP_ #include const string STR_SIMKA_DISTANCE_BRAYCURTIS = "-bray-curtis"; const string STR_SIMKA_DISTANCE_CHORD = "-chord"; const string STR_SIMKA_DISTANCE_HELLINGER = "-hellinger"; const string STR_SIMKA_DISTANCE_CANBERRA = "-canberra"; const string STR_SIMKA_DISTANCE_KULCZYNSKI = "-kulczynski"; typedef vector SpeciesAbundanceVectorType; enum SIMKA_MATRIX_TYPE{ SYMETRICAL, ASYMETRICAL, }; /* class SimkaDistanceParam{ public: SimkaDistanceParam(){} SimkaDistanceParam(IProperties* params){ //_computeBrayCurtis = true; //_computeChord = true; //_computeHellinger = true; //_computeCanberra = true; //_computeKulczynski = true; //_computeBrayCurtis = params->get(STR_SIMKA_DISTANCE_BRAYCURTIS); //_computeChord = params->get(STR_SIMKA_DISTANCE_CHORD); //_computeHellinger = params->get(STR_SIMKA_DISTANCE_HELLINGER); //_computeCanberra = params->get(STR_SIMKA_DISTANCE_CANBERRA); //_computeKulczynski = params->get(STR_SIMKA_DISTANCE_KULCZYNSKI); } //bool _computeBrayCurtis; //bool _computeChord; //bool _computeHellinger; //bool _computeCanberra; //bool _computeKulczynski; };*/ class SimkaStatistics{ public: SimkaStatistics(size_t nbBanks, bool computeSimpleDistances, bool computeComplexDistances, const string& tmpDir, const vector& datasetIds); SimkaStatistics& operator+= (const SimkaStatistics& other); void print(); void load(const string& filename); void save(const string& filename); void outputMatrix(const string& outputDir, const vector& _bankNames); size_t _nbBanks; size_t _symetricDistanceMatrixSize; bool _computeSimpleDistances; bool _computeComplexDistances; double _totalReads; vector _nbSolidDistinctKmersPerBank; vector _nbSolidKmersPerBank; vector _nbDistinctKmersSharedByBanksThreshold; vector 
_nbKmersSharedByBanksThreshold; vector _matrixNbDistinctSharedKmers; vector > _matrixNbSharedKmers; vector _brayCurtisNumerator; //vector > _brayCurtisNumerator; //vector > _kullbackLeibler; //Abundance Chord vector > _chord_NiNj; vector _chord_sqrt_N2; //Abundance Hellinger vector > _hellinger_SqrtNiNj; vector > _whittaker_minNiNj; vector > _kullbackLeibler; vector > _abundance_jaccard_intersection; //Abundance Canberra vector > _canberra; //Abundance Kulczynski vector > _kulczynski_minNiNj; //string _outputDir; u_int64_t _nbKmers; vector _nbKmersPerBank; u_int64_t _nbErroneousKmers; u_int64_t _nbDistinctKmers; u_int64_t _nbSolidKmers; u_int64_t _nbSharedKmers; u_int64_t _nbDistinctSharedKmers; //SimkaDistanceParam _distanceParams; vector _datasetNbReads; //u_int64_t _nbKmersInCoupleBankSupRatio; //unordered_map _histos; private: void dumpMatrix(const string& outputDir, const vector& _bankNames, const string& outputFilename, const vector >& matrix); string _outputFilenameSuffix; }; class SimkaDistance { public: SimkaDistance(SimkaStatistics& stats); //virtual ~SimkaDistance(); //vector > getMatrixSorensen(SIMKA_MATRIX_TYPE type); //vector > getMatrixJaccard(); //vector > getMatrixAKS(SIMKA_MATRIX_TYPE type); //vector > getMatrixBrayCurtis(); //vector > getMatrixKullbackLeibler(); vector > _matrixJaccardAbundance(){ vector > matrix = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ double dist = distance_abundance_jaccard(i, j); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrixBrayCurtis(){ vector > matrix = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ double dist = distance_abundance_brayCurtis(i, j, j + ((_nbBanks-1)*i) - (i*(i-1)/2)); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrixChord(){ vector > matrix = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; 
j<_nbBanks; j++){ double dist = distance_abundance_chord(i, j); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrixHellinger(){ vector > matrix = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ double dist = distance_abundance_hellinger(i, j); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrixWhittaker(){ vector > matrix = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ double dist = distance_abundance_whittaker(i, j); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrixKullbackLeibler(){ vector > matrix = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ double dist = distance_abundance_kullbackLeibler(i, j); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrixCanberra(){ vector > matrix = createSquaredMatrix(_nbBanks); u_int64_t a, b, c; for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ get_abc(i, j, j + ((_nbBanks-1)*i) - (i*(i-1)/2), a, b ,c); double dist = distance_abundance_canberra(i, j, a, b, c); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrixKulczynski(){ vector > matrix = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ double dist = distance_abundance_kulczynski(i, j); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrixSymJaccardAbundance(){ vector > matrix = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ double dist = distance_abundance_jaccard_simka(i, j, SYMETRICAL); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrixAsymJaccardAbundance(){ vector > matrix = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ matrix[i][j] = 
distance_abundance_jaccard_simka(i, j, ASYMETRICAL); matrix[j][i] = distance_abundance_jaccard_simka(j, i, ASYMETRICAL); } } return matrix; } vector > _matrixOchiai(){ vector > matrix = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ double dist = distance_abundance_ochiai(i, j); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrixSorensen(){ vector > matrix = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ double dist = distance_abundance_sorensen(i, j); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrix_presenceAbsence_sorensenBrayCurtis(){ vector > matrix = createSquaredMatrix(_nbBanks); u_int64_t a, b, c; for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ get_abc(i, j, j + ((_nbBanks-1)*i) - (i*(i-1)/2), a, b ,c); double dist = distance_presenceAbsence_sorensenBrayCurtis(a, b, c); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrix_presenceAbsence_Whittaker(){ vector > matrix = createSquaredMatrix(_nbBanks); u_int64_t a, b, c; for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ get_abc(i, j, j + ((_nbBanks-1)*i) - (i*(i-1)/2), a, b ,c); double dist = distance_presenceAbsence_whittaker(a, b, c); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrix_presenceAbsence_kulczynski(){ vector > matrix = createSquaredMatrix(_nbBanks); u_int64_t a, b, c; for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ get_abc(i, j, j + ((_nbBanks-1)*i) - (i*(i-1)/2), a, b ,c); double dist = distance_presenceAbsence_kulczynski(a, b, c); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrix_presenceAbsence_ochiai(){ vector > matrix = createSquaredMatrix(_nbBanks); u_int64_t a, b, c; for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ get_abc(i, j, j + ((_nbBanks-1)*i) - 
(i*(i-1)/2), a, b ,c); double dist = distance_presenceAbsence_ochiai(a, b, c); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrix_presenceAbsence_chordHellinger(){ vector > matrix = createSquaredMatrix(_nbBanks); u_int64_t a, b, c; for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ get_abc(i, j, j + ((_nbBanks-1)*i) - (i*(i-1)/2), a, b ,c); double dist = distance_presenceAbsence_chordHellinger(a, b, c); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrix_presenceAbsence_jaccardCanberra(){ vector > matrix = createSquaredMatrix(_nbBanks); u_int64_t a, b, c; for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ get_abc(i, j, j + ((_nbBanks-1)*i) - (i*(i-1)/2), a, b ,c); double dist = distance_presenceAbsence_jaccardCanberra(a, b, c); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrix_presenceAbsence_jaccard_simka(){ vector > matrix = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ double dist = distance_presenceAbsence_jaccard_simka(i, j, j + ((_nbBanks-1)*i) - (i*(i-1)/2), SYMETRICAL); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrix_presenceAbsence_jaccard_simka_asym(){ vector > matrix = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ matrix[i][j] = distance_presenceAbsence_jaccard_simka(i, j, j + ((_nbBanks-1)*i) - (i*(i-1)/2), ASYMETRICAL); matrix[j][i] = distance_presenceAbsence_jaccard_simka(j, i, j + ((_nbBanks-1)*i) - (i*(i-1)/2), ASYMETRICAL); } } return matrix; } vector > computeJaccardDistanceFromBrayCurtis(const vector >& brayDistanceMatrix){ vector > jaccardDistanceMatrix = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=0; j<_nbBanks; j++){ double B = brayDistanceMatrix[i][j]; double J = (2*B) / (1+B); jaccardDistanceMatrix[i][j] = J; } } return jaccardDistanceMatrix; } 
private: vector > createSquaredMatrix(size_t n); void get_abc(size_t bank1, size_t bank2, size_t symetricIndex, u_int64_t& a, u_int64_t& b, u_int64_t& c); double distance_abundance_brayCurtis(size_t bank1, size_t bank2, size_t symetricIndex); double distance_abundance_chord(size_t i, size_t j); double distance_abundance_hellinger(size_t i, size_t j); //double distance_abundance_jaccard_intersection(size_t i, size_t j); double distance_abundance_whittaker(size_t i, size_t j); double distance_abundance_kullbackLeibler(size_t i, size_t j); double distance_abundance_canberra(size_t i, size_t j, u_int64_t& ua, u_int64_t& ub, u_int64_t& uc); double distance_abundance_kulczynski(size_t i, size_t j); double distance_abundance_ochiai(size_t i, size_t j); double distance_abundance_sorensen(size_t i, size_t j); double distance_abundance_jaccard(size_t i, size_t j); double distance_abundance_jaccard_simka(size_t i, size_t j, SIMKA_MATRIX_TYPE type); double distance_presenceAbsence_chordHellinger(u_int64_t& ua, u_int64_t& ub, u_int64_t& uc); double distance_presenceAbsence_hellinger(u_int64_t& ua, u_int64_t& ub, u_int64_t& uc); double distance_presenceAbsence_whittaker(u_int64_t& ua, u_int64_t& ub, u_int64_t& uc); double distance_presenceAbsence_canberra(u_int64_t& ua, u_int64_t& ub, u_int64_t& uc); double distance_presenceAbsence_kulczynski(u_int64_t& ua, u_int64_t& ub, u_int64_t& uc); double distance_presenceAbsence_ochiai(u_int64_t& ua, u_int64_t& ub, u_int64_t& uc); double distance_presenceAbsence_sorensenBrayCurtis(u_int64_t& ua, u_int64_t& ub, u_int64_t& uc); double distance_presenceAbsence_jaccardCanberra(u_int64_t& ua, u_int64_t& ub, u_int64_t& uc); double distance_presenceAbsence_jaccard_simka(size_t i, size_t j, size_t symetricIndex, SIMKA_MATRIX_TYPE type); SimkaStatistics& _stats; //SimkaDistanceParam _distanceParams; size_t _nbBanks; }; #endif /* TOOLS_SIMKA_SRC_SIMKADISTANCE_HPP_ */ 
simka-1.5.1/src/core/main.cpp000077500000000000000000000030131353413740300157740ustar00rootroot00000000000000/***************************************************************************** * Simka: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2015 INRIA * Authors: G.Benoit, C.Lemaitre, P.Peterlongo * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . *****************************************************************************/ // We include the header file for the tool #include "Simka.hpp" /********************************************************************************/ int main (int argc, char* argv[]) { try { // We run the tool with the provided command line arguments. 
Simka().run (argc, argv); } catch (Exception& e) { std::cout << "EXCEPTION: " << e.getMessage() << std::endl; return EXIT_FAILURE; } return EXIT_SUCCESS; } simka-1.5.1/src/minikc/000077500000000000000000000000001353413740300146665ustar00rootroot00000000000000simka-1.5.1/src/minikc/MiniKC.hpp000066400000000000000000000167461353413740300165270ustar00rootroot00000000000000/* * MiniKC.hpp * * Created on: 16 juin 2016 * Author: gbenoit */ #ifndef GATB_SIMKA_SRC_MINIKC_MINIKC_HPP_ #define GATB_SIMKA_SRC_MINIKC_MINIKC_HPP_ #include //#include "../SimkaCount.cpp" //typedef u_int16_t CountType; template class SimkaCompressedProcessor : public CountProcessorAbstract{ public: typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; struct Kmer_BankId_Count{ Type _type; u_int32_t _bankId; u_int64_t _count; Kmer_BankId_Count(){ } Kmer_BankId_Count(Type type, u_int64_t bankId, u_int64_t count){ _type = type; _bankId = bankId; _count = count; } }; //SimkaCompressedProcessor(vector* >& bags, vector >& caches, vector& cacheIndexes, CountNumber abundanceMin, CountNumber abundanceMax) : _bags(bags), _caches(caches), _cacheIndexes(cacheIndexes) SimkaCompressedProcessor(vector* >& bags, vector& nbKmerPerParts, vector& nbDistinctKmerPerParts, vector& chordPerParts, CountNumber abundanceMin, CountNumber abundanceMax, size_t bankIndex) : _bags(bags), _nbDistinctKmerPerParts(nbDistinctKmerPerParts), _nbKmerPerParts(nbKmerPerParts), _chordPerParts(chordPerParts) { _abundanceMin = abundanceMin; _abundanceMax = abundanceMax; _bankIndex = bankIndex; } ~SimkaCompressedProcessor(){} CountProcessorAbstract* clone () { return new SimkaCompressedProcessor (_bags, _nbKmerPerParts, _nbDistinctKmerPerParts, _chordPerParts, _abundanceMin, _abundanceMax, _bankIndex); } //CountProcessorAbstract* clone () { return new SimkaCompressedProcessor (_bags, _caches, _cacheIndexes, _abundanceMin, _abundanceMax); } void finishClones (vector*>& clones){} bool process (size_t partId, const typename 
Kmer::Type& kmer, const CountVector& count, CountNumber sum){ if(count[0] < _abundanceMin || count[0] > _abundanceMax) return false; Kmer_BankId_Count item(kmer, _bankIndex, count[0]); _bags[partId]->insert(item); _nbDistinctKmerPerParts[partId] += 1; _nbKmerPerParts[partId] += count[0]; _chordPerParts[partId] += pow(count[0], 2); /* size_t index = _cacheIndexes[partId]; _caches[partId][index] = item; index += 1; if(index == NB_COUNT_CACHE){ _bags[partId]->insert(_caches[partId], index); _cacheIndexes[partId] = 0; } else{ _cacheIndexes[partId] = index; }*/ return true; } vector* >& _bags; vector& _nbDistinctKmerPerParts; vector& _nbKmerPerParts; vector& _chordPerParts; CountNumber _abundanceMin; CountNumber _abundanceMax; size_t _bankIndex; //_stats->_chord_N2[i] += pow(abundanceI, 2); //vector >& _caches; //vector& _cacheIndexes; }; /* class SimkaCompressedProcessor_Mini{ public: typedef typename Kmer<>::Type Type; typedef typename Kmer<>::Count Count; //SimkaCompressedProcessor(vector* >& bags, vector >& caches, vector& cacheIndexes, CountNumber abundanceMin, CountNumber abundanceMax) : _bags(bags), _caches(caches), _cacheIndexes(cacheIndexes) SimkaCompressedProcessor_Mini(vector* >& bags, vector& nbKmerPerParts, vector& nbDistinctKmerPerParts, vector& chordPerParts, CountNumber abundanceMin, CountNumber abundanceMax) : _bags(bags), _nbDistinctKmerPerParts(nbDistinctKmerPerParts), _nbKmerPerParts(nbKmerPerParts), _chordPerParts(chordPerParts) { _abundanceMin = abundanceMin; _abundanceMax = abundanceMax; } bool process (size_t partId, const Type& kmer, CountType count){ if(count < _abundanceMin || count > _abundanceMax) return false; Count item(kmer, count); _bags[partId]->insert(item); _nbDistinctKmerPerParts[partId] += 1; _nbKmerPerParts[partId] += count; _chordPerParts[partId] += pow(count, 2); return true; } vector* >& _bags; vector& _nbDistinctKmerPerParts; vector& _nbKmerPerParts; vector& _chordPerParts; CountNumber _abundanceMin; CountNumber _abundanceMax; 
//_stats->_chord_N2[i] += pow(abundanceI, 2); //vector >& _caches; //vector& _cacheIndexes; };*/ template class MiniKC : public Algorithm{ public: typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; typedef typename Kmer::ModelCanonical Model; typedef typename Kmer::ModelCanonical::Iterator ModelIt; //typedef Kmer<>::ModelCanonical ModelCanon; //typedef Kmer<>::ModelMinimizer ModelMini; typedef typename Kmer::template ModelMinimizer ModelMinimizer; //typedef typename Kmer::ModelCanonical ModelCanonical; //typedef typename ModelCanonical::Kmer KmerType; /* typedef Kmer::Count Count; typedef Kmer::Type Type; typedef Kmer::ModelCanonical ModelCanon; typedef Kmer::ModelMinimizer Model;*/ IBank* _bank; size_t _kmerSize; CountVector* _counts; Repartitor& _repartition; SimkaCompressedProcessor* _proc; u_int64_t _nbReads; MiniKC(IProperties* options, size_t kmerSize, IBank* bank, Repartitor& repartition, SimkaCompressedProcessor* proc): Algorithm("minikc", -1, options), _repartition(repartition) { _bank = bank; _kmerSize = kmerSize; _proc = proc; u_int64_t nbCounts = pow(4, _kmerSize); cout << "Nb distinct kmers (canonical): " << nbCounts << endl; _counts = new CountVector(nbCounts, 0); } void execute(){ count(); dump(); } void count(){ _nbReads = 0; Iterator* itSeq = createIterator(_bank->iterator(), _bank->estimateNbItems(), "Counting"); //Model definition of a kmer iterator (this one put kmer in cannonical form) //ModelCanonical _model(_kmerSize); //Model:: //Model _kmerIt(_model); Model model (_kmerSize); // We declare an iterator on a given sequence. 
ModelIt _kmerIt (model); Sequence* sequence; for (itSeq->first(); !itSeq->isDone(); itSeq->next()){ _nbReads += 1; sequence = &itSeq->item(); _kmerIt.setData (sequence->getData()); for (_kmerIt.first(); !_kmerIt.isDone(); _kmerIt.next()){ //u_int64_t kmer = min(_kmerIt->value(), revcomp(_kmerIt->value(), _kmerSize)).getVal(); //Kmer<> canonicalkmer = min(_kmerIt.item(), revcomp(_kmerIt->value())); //cout << _kmerIt->value().toString(kmerSize) << endl; u_int64_t kmer = _kmerIt->value().getVal(); //cout << _model.toString(kmer) << endl; //cout << kmer << endl; (*_counts)[kmer] += 1; //cout << kmer << endl; } } } void dump(){ ModelMinimizer model (_kmerSize, 7); Type kmer; //Kmer<>::ModelCanonical _model(_kmerSize); CountVector vec(1, 0); for(size_t i=0; i<_counts->size(); i++){ CountNumber count = (*_counts)[i]; if(count == 0) continue; kmer.setVal(i); //cout << i << " " << model.toString(kmer) << endl; //Type kmer(i); u_int64_t mini = model.getMinimizerValue(kmer); size_t p = this->_repartition (mini); vec[0] = count; _proc->process(p, kmer, vec, count); } } }; #endif /* GATB_SIMKA_SRC_MINIKC_MINIKC_HPP_ */ simka-1.5.1/src/minikc/SimkaCountProcess.cpp000066400000000000000000000011221353413740300210020ustar00rootroot00000000000000 #include /* printf */ #include /* system, NULL, EXIT_FAILURE */ #include #include using namespace std; int main (int argc, char* argv[]) { string command = "nohup "; for (int i = 1; i < argc; ++i) { //std::cout << argv[i] << std::endl; command += string(argv[i]) + " "; } //cout << command << endl; //cout << argc << " " << argv << endl; int ret=1; int nbTries = 0; while(ret != 0){ ret = system(command.c_str()); nanosleep((const struct timespec[]){{0, 10000000L}}, NULL); if(nbTries > 3) exit(1); nbTries += 1; } } simka-1.5.1/src/simkaMin/000077500000000000000000000000001353413740300151645ustar00rootroot00000000000000simka-1.5.1/src/simkaMin/MurmurHash3.cpp000066400000000000000000000163761353413740300200630ustar00rootroot00000000000000#include 
"MurmurHash3.h" //----------------------------------------------------------------------------- // Platform-specific functions and macros // Microsoft Visual Studio #if defined(_MSC_VER) #define FORCE_INLINE __forceinline #include #define ROTL32(x,y) _rotl(x,y) #define ROTL64(x,y) _rotl64(x,y) #define BIG_CONSTANT(x) (x) // Other compilers #else // defined(_MSC_VER) #define FORCE_INLINE inline __attribute__((always_inline)) inline uint32_t rotl32 ( uint32_t x, int8_t r ) { return (x << r) | (x >> (32 - r)); } inline uint64_t rotl64 ( uint64_t x, int8_t r ) { return (x << r) | (x >> (64 - r)); } #define ROTL32(x,y) rotl32(x,y) #define ROTL64(x,y) rotl64(x,y) #define BIG_CONSTANT(x) (x##LLU) #endif // !defined(_MSC_VER) //----------------------------------------------------------------------------- // Block read - if your platform needs to do endian-swapping or can only // handle aligned reads, do the conversion here FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i ) { return p[i]; } FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i ) { return p[i]; } //----------------------------------------------------------------------------- // Finalization mix - force all bits of a hash block to avalanche FORCE_INLINE uint32_t fmix32 ( uint32_t h ) { h ^= h >> 16; h *= 0x85ebca6b; h ^= h >> 13; h *= 0xc2b2ae35; h ^= h >> 16; return h; } //---------- FORCE_INLINE uint64_t fmix64 ( uint64_t k ) { k ^= k >> 33; k *= BIG_CONSTANT(0xff51afd7ed558ccd); k ^= k >> 33; k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); k ^= k >> 33; return k; } //----------------------------------------------------------------------------- void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ) { const uint8_t * data = (const uint8_t*)key; const int nblocks = len / 4; uint32_t h1 = seed; const uint32_t c1 = 0xcc9e2d51; const uint32_t c2 = 0x1b873593; //---------- // body const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); for(int i = -nblocks; i; i++) { 
uint32_t k1 = getblock32(blocks,i); k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; h1 = ROTL32(h1,13); h1 = h1*5+0xe6546b64; } //---------- // tail const uint8_t * tail = (const uint8_t*)(data + nblocks*4); uint32_t k1 = 0; switch(len & 3) { case 3: k1 ^= tail[2] << 16; case 2: k1 ^= tail[1] << 8; case 1: k1 ^= tail[0]; k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; }; //---------- // finalization h1 ^= len; h1 = fmix32(h1); *(uint32_t*)out = h1; } //----------------------------------------------------------------------------- void MurmurHash3_x86_128 ( const void * key, const int len, uint32_t seed, void * out ) { const uint8_t * data = (const uint8_t*)key; const int nblocks = len / 16; uint32_t h1 = seed; uint32_t h2 = seed; uint32_t h3 = seed; uint32_t h4 = seed; const uint32_t c1 = 0x239b961b; const uint32_t c2 = 0xab0e9789; const uint32_t c3 = 0x38b34ae5; const uint32_t c4 = 0xa1e38b93; //---------- // body const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); for(int i = -nblocks; i; i++) { uint32_t k1 = getblock32(blocks,i*4+0); uint32_t k2 = getblock32(blocks,i*4+1); uint32_t k3 = getblock32(blocks,i*4+2); uint32_t k4 = getblock32(blocks,i*4+3); k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; } //---------- // tail const uint8_t * tail = (const uint8_t*)(data + nblocks*16); uint32_t k1 = 0; uint32_t k2 = 0; uint32_t k3 = 0; uint32_t k4 = 0; switch(len & 15) { case 15: k4 ^= tail[14] << 16; case 14: k4 ^= tail[13] << 8; case 13: k4 ^= tail[12] << 0; k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; case 12: k3 ^= tail[11] << 24; case 11: k3 ^= tail[10] << 16; case 10: k3 ^= tail[ 9] << 8; 
case 9: k3 ^= tail[ 8] << 0; k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; case 8: k2 ^= tail[ 7] << 24; case 7: k2 ^= tail[ 6] << 16; case 6: k2 ^= tail[ 5] << 8; case 5: k2 ^= tail[ 4] << 0; k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; case 4: k1 ^= tail[ 3] << 24; case 3: k1 ^= tail[ 2] << 16; case 2: k1 ^= tail[ 1] << 8; case 1: k1 ^= tail[ 0] << 0; k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; }; //---------- // finalization h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; h1 += h2; h1 += h3; h1 += h4; h2 += h1; h3 += h1; h4 += h1; h1 = fmix32(h1); h2 = fmix32(h2); h3 = fmix32(h3); h4 = fmix32(h4); h1 += h2; h1 += h3; h1 += h4; h2 += h1; h3 += h1; h4 += h1; ((uint32_t*)out)[0] = h1; ((uint32_t*)out)[1] = h2; ((uint32_t*)out)[2] = h3; ((uint32_t*)out)[3] = h4; } //----------------------------------------------------------------------------- void MurmurHash3_x64_128 ( const void * key, const int len, const uint32_t seed, void * out ) { const uint8_t * data = (const uint8_t*)key; const int nblocks = len / 16; uint64_t h1 = seed; uint64_t h2 = seed; const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); //---------- // body const uint64_t * blocks = (const uint64_t *)(data); for(int i = 0; i < nblocks; i++) { uint64_t k1 = getblock64(blocks,i*2+0); uint64_t k2 = getblock64(blocks,i*2+1); k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; } //---------- // tail const uint8_t * tail = (const uint8_t*)(data + nblocks*16); uint64_t k1 = 0; uint64_t k2 = 0; switch(len & 15) { case 15: k2 ^= ((uint64_t)tail[14]) << 48; case 14: k2 ^= ((uint64_t)tail[13]) << 40; case 13: k2 ^= ((uint64_t)tail[12]) << 32; case 12: k2 ^= ((uint64_t)tail[11]) << 24; case 11: k2 ^= ((uint64_t)tail[10]) << 16; case 10: k2 ^= ((uint64_t)tail[ 9]) << 8; case 9: k2 ^= ((uint64_t)tail[ 
8]) << 0; k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; case 8: k1 ^= ((uint64_t)tail[ 7]) << 56; case 7: k1 ^= ((uint64_t)tail[ 6]) << 48; case 6: k1 ^= ((uint64_t)tail[ 5]) << 40; case 5: k1 ^= ((uint64_t)tail[ 4]) << 32; case 4: k1 ^= ((uint64_t)tail[ 3]) << 24; case 3: k1 ^= ((uint64_t)tail[ 2]) << 16; case 2: k1 ^= ((uint64_t)tail[ 1]) << 8; case 1: k1 ^= ((uint64_t)tail[ 0]) << 0; k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; }; //---------- // finalization h1 ^= len; h2 ^= len; h1 += h2; h2 += h1; h1 = fmix64(h1); h2 = fmix64(h2); h1 += h2; h2 += h1; ((uint64_t*)out)[0] = h1; ((uint64_t*)out)[1] = h2; } //----------------------------------------------------------------------------- simka-1.5.1/src/simkaMin/MurmurHash3.h000066400000000000000000000021261353413740300175140ustar00rootroot00000000000000//----------------------------------------------------------------------------- // MurmurHash3 was written by Austin Appleby, and is placed in the public // domain. The author hereby disclaims copyright to this source code. 
#ifndef _MURMURHASH3_H_ #define _MURMURHASH3_H_ //----------------------------------------------------------------------------- // Platform-specific functions and macros // Microsoft Visual Studio #if defined(_MSC_VER) && (_MSC_VER < 1600) typedef unsigned char uint8_t; typedef unsigned int uint32_t; typedef unsigned __int64 uint64_t; // Other compilers #else // defined(_MSC_VER) #include #endif // !defined(_MSC_VER) //----------------------------------------------------------------------------- void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); //----------------------------------------------------------------------------- #endif // _MURMURHASH3_H_ simka-1.5.1/src/simkaMin/SimkaMin.cpp000066400000000000000000000074261353413740300174110ustar00rootroot00000000000000/***************************************************************************** * SimkaMin: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2019 INRIA * Authors: G.Benoit * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . 
*****************************************************************************/ #include "SimkaMinCount.hpp" #include "SimkaMinDistance.hpp" #include "SimkaMinDistanceMatrixExporter.hpp" #include "SimkaMinDistanceMatrixMerger.hpp" #include "SimkaMinInfos.hpp" #include "SimkaMinAppend.hpp" void displayHelp(){ cout << "Usage: ./simkaMin [option]" << endl; cout << endl << "[Distance computation options]" << endl; cout << "\tsketch : transform datasets in small sketches of k-mers and their abundance" << endl; cout << "\tdistance : compute Jaccard and Bray-Curtis distances between sketches" << endl; cout << endl << "[Distance matrix manipulation options]" << endl; cout << "\texport : export distance matrices stored in binary format" << endl; //cout << "\tmatrix-update : update existing distance matrices" << endl; cout << endl << "[Sketch options]" << endl; cout << "\tappend : merge multiple sketch files into a single one" << endl; cout << "\tinfo : list datasets contained in a sketch file" << endl; cout << endl; } int main (int argc, char* argv[]) { try { if(argc < 2){ displayHelp(); } else{ //std::vector args; vector argsTemp( argv, argv + argc ); argsTemp.erase(argsTemp.begin()+1); //std::transform(argsTemp.begin(), argsTemp.end(), std::back_inserter(vc), convert); char** args = &argsTemp[0]; //char* args[]; //for(string& arg: argsTemp){ //} //rArray = new char*[argc+1]; //for(int i=0; i <= argc; i++) { // rArray[i] = argv[i]; //} // use rArray //delete [] rArray; //char* args = new char*[argc-1]; //vector test; //for(size_t i=0; i args(argv); string programName = string(argv[1]); if(programName == "sketch"){ Simka2ComputeKmerSpectrum().run (argc, args); } else if(programName == "append"){ SimkaMinAppend().run(argc, args); } else if(programName == "distance"){ SimkaMinDistance().run(argc, args); } else if(programName == "export"){ SimkaMinDistanceMatrixExporter().run(argc, args); } else if(programName == "matrix-update"){ //Hidden feature 
SimkaMinDistanceMatrixMerger().run(argc, args); } else if(programName == "info"){ SimkaMinInfos().run(argc, args); } else{ displayHelp(); } } //cout << argc << endl; //cout << argv[0] << endl; //cout << argv[1] << endl; //cout << argv[2] << endl; // } catch (Exception& e) { std::cout << "EXCEPTION: " << e.getMessage() << std::endl; return EXIT_FAILURE; } return EXIT_SUCCESS; } simka-1.5.1/src/simkaMin/SimkaMinAppend.hpp000066400000000000000000000130311353413740300205330ustar00rootroot00000000000000/***************************************************************************** * SimkaMin: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2019 INRIA * Authors: G.Benoit * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . 
*****************************************************************************/ #ifndef SIMKA1_4_SRC_SIMKAMIN_SIMKAMINAPPEND_HPP_ #define SIMKA1_4_SRC_SIMKAMIN_SIMKAMINAPPEND_HPP_ #include "SimkaMinCommons.hpp" /* Header and sketches of first sketch (-in1) are kept First sketch overwrite starts after its sketches, where ids starts Sketches of second files (-in2) are written after sketches of first file (-in1) Then, Ids of first file are written, and then ids of second file The number of datasets in the header is updated */ class SimkaMinAppendAlgorithm : public Algorithm { public: IProperties* _options; string _inputFilename1; string _inputFilename2; u_int32_t _nbDatasets; u_int32_t _sketchSize; ofstream _outputFile; ifstream _inputFile2; SimkaMinAppendAlgorithm(IProperties* options): Algorithm("simkaMinAppendAlgorithm", -1, options) { } void execute(){ parseArgs(); append(); } void parseArgs(){ _options = getInput(); _inputFilename1 = _options->getStr(STR_SIMKA_URI_INPUT_1); _inputFilename2 = _options->getStr(STR_SIMKA_URI_INPUT_2); if(!System::file().doesExist(_inputFilename1)){ std::cerr << "Error: input does not exist (" << _inputFilename1 << ")" << std::endl; exit(1); } if(!System::file().doesExist(_inputFilename2)){ std::cerr << "Error: input does not exist (" << _inputFilename2 << ")" << std::endl; exit(1); } } void append(){ u_int8_t kmerSize1, kmerSize2; u_int32_t sketchSize1, sketchSize2, seed1, seed2, nbDatasets1, nbDatasets2; SimkaMinCommons::getKmerInfos(_inputFilename1, kmerSize1, sketchSize1, seed1, nbDatasets1); SimkaMinCommons::getKmerInfos(_inputFilename2, kmerSize2, sketchSize2, seed2, nbDatasets2); if(kmerSize1 != kmerSize2){ std::cerr << "Error: can't merge sketches with different kmer sizes (" << kmerSize1 << " vs " << kmerSize2 << ")" << std::endl; exit(1); } if(sketchSize1 != sketchSize2){ std::cerr << "Error: can't merge sketches with different sketch sizes (" << sketchSize1 << " vs " << sketchSize2 << ")" << std::endl; exit(1); } if(seed1 
!= seed2){ std::cerr << "Error: can't merge sketches with different seeds (" << seed1 << " vs " << seed2 << ")" << std::endl; exit(1); } u_int32_t nbDatasets = nbDatasets1 + nbDatasets2; vector id1, id2; SimkaMinCommons::readIds(_inputFilename1, id1); SimkaMinCommons::readIds(_inputFilename2, id2); //open first file to be overwritten (but without rewriting all its sketches) _outputFile.open(_inputFilename1, ios::binary|ios::in); _inputFile2.open(_inputFilename2, ios::binary); //Update number of datasets in the header _outputFile.seekp(SimkaMinCommons::getFilePosition_nbDatasets()); _outputFile.write((const char*)&nbDatasets, sizeof(nbDatasets)); appendSkecthes(nbDatasets1, sketchSize1, nbDatasets2); appendIds(id1); appendIds(id2); _inputFile2.close(); _outputFile.close(); } void appendSkecthes(u_int32_t nbDatasets1, u_int32_t sketchSize1, u_int32_t nbDatasets2){ _outputFile.seekp(SimkaMinCommons::getFilePosition_sketchIds(nbDatasets1, sketchSize1)); _inputFile2.seekg(KMER_SPECTRUM_HEADER_SIZE); u_int64_t dataToTransfer = nbDatasets2*sketchSize1*sizeof(KmerAndCountType); u_int64_t bufferSize = 1024; char buffer[bufferSize]; /* copy from input to output */ while (dataToTransfer > 0) { u_int64_t size = min(bufferSize, dataToTransfer); _inputFile2.read(buffer, size); _outputFile.write(buffer, size); dataToTransfer -= size; } //fclose(infile); //fclose(outfile); } void appendIds(vector& ids){ for(size_t i=0; ipush_front (new OptionOneParam (STR_SIMKA_URI_INPUT_2, "second sketch file to merge (this file will be appended to the first one)", true)); parser->push_front (new OptionOneParam (STR_SIMKA_URI_INPUT_1, "first sketch file to merge (this file will be overwritten)", true)); parser->getParser (STR_NB_CORES)->setVisible (false); parser->getParser (STR_VERBOSE)->setVisible (false); } void execute () { IProperties* args = getInput(); SimkaMinAppendAlgorithm* algo = new SimkaMinAppendAlgorithm(args); algo->execute(); delete algo; } }; #endif /* 
SIMKA1_4_SRC_SIMKAMIN_SIMKAMINAPPEND_HPP_ */ simka-1.5.1/src/simkaMin/SimkaMinCommons.hpp000066400000000000000000000113261353413740300207440ustar00rootroot00000000000000/***************************************************************************** * SimkaMin: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2019 INRIA * Authors: G.Benoit * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . 
*****************************************************************************/ #ifndef SIMKA1_4_SRC_SIMKAMIN_SIMKAMINCOMMONS_HPP_ #define SIMKA1_4_SRC_SIMKAMIN_SIMKAMINCOMMONS_HPP_ #include #include #define KMER_SPECTRUM_HEADER_SIZE (1+4+4+4) //At the begining of the .kmers file we store the size of the kmer (on 1 byte), the sketch size (on 4 bytes), the seed used by Murmurhash3 (4 bytes), the number of datasets in the sketch file (4 bytes) const string STR_SIMKA_SEED = "-seed"; const string STR_SIMKA_SKETCH_SIZE = "-nb-kmers"; const string STR_SIMKA_URI_INPUT_1 = "-in1"; const string STR_SIMKA_URI_INPUT_2 = "-in2"; const string STR_SIMKA_INPUT_IDS = "-in-ids"; const string STR_SIMKA_ABUNDANCE_FILTER = "-filter"; //const string STR_SIMKA2_DATASET_ID = "-id"; typedef u_int32_t KmerCountType; typedef unordered_map KmerCountDictionaryType; typedef float DistanceValueType; struct KmerAndCountType{ public: u_int64_t _kmer; KmerCountType _count; KmerAndCountType(){ } KmerAndCountType(u_int64_t kmer, KmerCountType count){ _kmer = kmer; _count = count; } }; struct PairwiseDistance{ u_int64_t _i; u_int64_t _j; DistanceValueType _distance; PairwiseDistance(){ _i = -1; _j = -1; _distance = -1; } void set(u_int64_t i, u_int64_t j, DistanceValueType distance){ _i = i; _j = j; _distance = distance; } }; class SimkaMinCommons { public: //SimkaMinCommons(); //virtual ~SimkaMinCommons(); static void writeString(const string& s, ofstream& file){ u_int8_t size = s.size(); file.write((char const*)(&size), sizeof(size)); file.write(s.c_str(), size); } static void readString(string& s, ifstream& file){ u_int8_t size; file.read((char*)(&size), sizeof(size)); std::vector buffer(size); file.read(&buffer[0], buffer.size()); s.assign(buffer.begin(), buffer.end()); //return string linkedDatasetID( buffer.begin(), buffer.end() ); } static void readIds(const string& filename, vector& datasetIds){ u_int8_t kmerSize; u_int32_t sketchSize, seed, nbDatasets; getKmerInfos(filename, kmerSize, 
sketchSize, seed, nbDatasets); ifstream file(filename.c_str(), ios::binary); file.seekg(SimkaMinCommons::getFilePosition_sketchIds(nbDatasets, sketchSize)); //u_int32_t nbDatasets; //file.read((char*)(&nbDatasets), sizeof(nbDatasets)); string datasetId; for(size_t i=0; i. *****************************************************************************/ #ifndef SIMKA1_4_SRC_SIMKAMIN_SIMKAMINCOUNT_HPP_ #define SIMKA1_4_SRC_SIMKAMIN_SIMKAMINCOUNT_HPP_ /* * Simka2ComputeKmerSpectrum.hpp * * Created on: 4 nov. 2016 * Author: gbenoit */ //#include "../core/SimkaUtils.hpp" //#include "Simka2Utils.hpp" //#include "../minikc/MiniKC.hpp" //#include //#include //#include "../utils/SimkaIoUtils.hpp" //#include "SimkaAlgorithm.hpp" //#include "SimkaAlgorithm.hpp" #include "SimkaMinCommons.hpp" #include "SimkaCommons.hpp" #include "MurmurHash3.h" #include //#include "../../thirdparty/KMC/kmc_api/kmc_file.h" //#include "../../thirdparty/KMC/kmc_api/kmer_defs.h" //#include "../utils/MurmurHash3.h" //#define MERGE_BUFFER_SIZE 10000 //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- 
//------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- 
//------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- template class SelectKmersCommand { public: typedef typename Kmer::ModelCanonical ModelCanonical; typedef typename Kmer::ModelCanonical::Iterator ModelCanonicalIterator; typedef typename Kmer::Type KmerType; typedef typename Kmer::ModelCanonical::Kmer KmerCanonicalType; //typedef typename ModelCanonical::Kmer Lol; size_t _kmerSize; size_t _sketchSize; u_int32_t _seed; //vector _minHashValues; //vector _minHashKmers; ModelCanonical _model; ModelCanonicalIterator _itKmer; u_int64_t _hash_otpt[2]; bool _isMaster; //size_t _bufferIndex; //size_t _partitionId; //vector _bufferKmers; //vector _bufferCounts; //vector _minHashValues; //vector& _minHashValuesSynchronized; //vector _minHashKmers; //vector _minHashKmersCounts; struct KmerCountSorter{ bool operator() (u_int64_t l, u_int64_t r) { return r > l; } }; //typedef typename KmerCountSorter KmerSorter; std::priority_queue< u_int64_t, vector, KmerCountSorter> _kmerCountSorter; vector _kmers; //std::priority_queue< u_int64_t, vector, KmerCountSorter>& _kmerCountSorterSynch; //KmerCountDictionaryType& _kmerCountsSynch; Bloom* _bloomFilter; u_int64_t _nbInsertedKmersInBloom; vector& _hashedKmers; KmerCountDictionaryType& _kmerCounts; //ofstream _outputFile; bool _useAbundanceFilter; SelectKmersCommand(size_t kmerSize, size_t sketchSize, u_int32_t seed, Bloom* bloomFilter, vector& kmers, KmerCountDictionaryType& kmerCounts, bool useAbundanceFilter) : _model(kmerSize), _itKmer(_model), _bloomFilter(bloomFilter), _hashedKmers(kmers), _kmerCounts(kmerCounts) { _kmerSize = kmerSize; _sketchSize = sketchSize; _seed = seed; _isMaster = true; _nbInsertedKmersInBloom = 0; _useAbundanceFilter = useAbundanceFilter; } SelectKmersCommand(const SelectKmersCommand& copy) : _model(copy._kmerSize), 
_itKmer(_model), _bloomFilter(copy._bloomFilter), _hashedKmers(copy._hashedKmers), _kmerCounts(copy._kmerCounts) { _kmerSize = copy._kmerSize; _sketchSize = copy._sketchSize; _seed = copy._seed; _isMaster = false; _nbInsertedKmersInBloom = 0; _useAbundanceFilter = copy._useAbundanceFilter; } ~SelectKmersCommand(){ if(_isMaster) return; if(_kmerCountSorter.size() == 0) return; //cout << "deleteeeeee" << endl; size_t sketchSize = _kmerCountSorter.size(); //cout << sketchSize << endl; for(size_t i=0; i=0; i++){ if(seq[i] == 'A'){ rev += 'T'; } else if(seq[i] == 'C'){ rev += 'G'; } else if(seq[i] == 'G'){ rev += 'C'; } else if(seq[i] == 'T'){ rev += 'A'; } } return rev; } */ //void minRevComp(string& kmer){ //string revKmer = //} void operator()(Sequence& sequence){ _model.build(sequence.getData(), _kmers); //_itKmer.setData(sequence.getData()); //cout << sequence.toString() << endl; //size_t len = sequence.getDataSize() - _kmerSize + 1; /// _kmerSize; //char* data = sequence.getDataBuffer(); for(size_t i=0; i<_kmers.size(); i++){ KmerCanonicalType& kmer = _kmers[i]; // We iterate the sequence data by block of size kmerSize //for (size_t i=0; ivalue(); //KmerType kmerRev = revcomp(kmer.value(), _kmerSize); //string kmerStr = kmer.value().toString(_kmerSize); //string kmerStrRev = kmerRev.toString(_kmerSize); //if(kmerStrRev < kmerStr){ // kmerStr = kmerStrRev; //} u_int64_t kmerValue = kmer.value().getVal(); u_int64_t kmerHashed; MurmurHash3_x64_128 ((const char*)&kmerValue, sizeof(kmerValue), _seed, &_hash_otpt); kmerHashed = _hash_otpt[0]; //cout << kmerStr << ": " << kmerHashed << endl; //todo: verifier dabord si le kmer peut etre insérer, plus rapide que els accès au table de hachage (bloom et selected) //cout << _useAbundanceFilter << endl; if(_useAbundanceFilter){ processFiltered(kmer.value(), kmerHashed); } else{ processUnfiltered(kmerHashed); } //cout << kmer.isValid() << endl; // We update the occurrences number for this kmer value //distrib 
[kmer.value().toInt()] += 1; } /* for(_itKmer.first(); !_itKmer.isDone(); _itKmer.next()){ //cout << _itKmer->value().toString(_kmerSize) << endl; Lol kkaka = _itKmer->value().value(); //cout << _itKmer->value().isValid() << endl; KmerType kmer = _itKmer->value(); KmerType kmerRev = revcomp(kmer, _kmerSize); string kmerStr = kmer.toString(_kmerSize); string kmerStrRev = kmerRev.toString(_kmerSize); if(kmerStrRev < kmerStr){ kmerStr = kmerStrRev; } //u_int64_t kmerValue = kmer.getVal(); u_int64_t kmerHashed; MurmurHash3_x64_128 ( kmerStr.c_str(), _kmerSize, 42, &_hash_otpt); kmerHashed = _hash_otpt[0]; if(kmerHashed == 66908235404){ //cout << kmer.value().isValid() << endl; cout << sequence.toString() << endl; cout << kmerStr << endl; } //cout << kmerStr << ": " << kmerHashed << endl; //todo: verifier dabord si le kmer peut etre insérer, plus rapide que els accès au table de hachage (bloom et selected) //cout << _useAbundanceFilter << endl; if(_useAbundanceFilter){ processFiltered(kmer, kmerHashed); } else{ processUnfiltered(kmerHashed); } }*/ } inline void processUnfiltered(u_int64_t kmerHashed){ if(_kmerCountSorter.size() < _sketchSize){ if(_kmerCounts.find(kmerHashed) == _kmerCounts.end()){ _kmerCountSorter.push(kmerHashed); _kmerCounts[kmerHashed] = 1; //cout << _kmerCountSorter.size() << endl; } else{ _kmerCounts[kmerHashed] += 1; } } else{ if(kmerHashed < _kmerCountSorter.top()){ if(_kmerCounts.find(kmerHashed) == _kmerCounts.end()){ //cout << kmer << " " << _kmerCounts.size() << endl; u_int64_t greaterValue = _kmerCountSorter.top(); _kmerCounts.erase(greaterValue); _kmerCountSorter.pop(); _kmerCountSorter.push(kmerHashed); _kmerCounts[kmerHashed] = 1; } else{ _kmerCounts[kmerHashed] += 1; } } } } inline void processFiltered(const KmerType& kmer, u_int64_t kmerHashed){ if(_kmerCountSorter.size() < _sketchSize){ if(_bloomFilter->contains(kmer)){ //Filling the queue with first elements if(_kmerCounts.find(kmerHashed) == _kmerCounts.end()){ 
_kmerCountSorter.push(kmerHashed); _kmerCounts[kmerHashed] = 2; //cout << _kmerCountSorter.size() << endl; } else{ _kmerCounts[kmerHashed] += 1; } } else{ _bloomFilter->insert(kmer); _nbInsertedKmersInBloom += 1; } } else{ if(kmerHashed < _kmerCountSorter.top()){ if(_bloomFilter->contains(kmer)){ if(_kmerCounts.find(kmerHashed) == _kmerCounts.end()){ //cout << kmer << " " << _kmerCounts.size() << endl; u_int64_t greaterValue = _kmerCountSorter.top(); _kmerCounts.erase(greaterValue); _kmerCountSorter.pop(); _kmerCountSorter.push(kmerHashed); _kmerCounts[kmerHashed] = 2; } else{ _kmerCounts[kmerHashed] += 1; } } else{ _bloomFilter->insert(kmer); _nbInsertedKmersInBloom += 1; } } } } }; /* size_t _kmerSize; size_t _sketchSize; vector _minHashValues; vector _minHashKmers; ModelCanonical _model; ModelCanonicalIterator _itKmer; //ModelCanonical model (kmerSize); //ModelCanonical::Kmer kmer = model.codeSeed (seq, Data::ASCII); pthread_mutex_t* _mutex; vector& _minHashValuesSynchronized; vector& _minHashKmersSynchronized; MinhashSketcher(size_t kmerSize, size_t sketchSize, pthread_mutex_t* mutex, vector& minHashValuesSynchronized, vector& minHashKmersSynchronized) : _model(kmerSize), _itKmer(_model), _mutex(mutex), _minHashValuesSynchronized(minHashValuesSynchronized), _minHashKmersSynchronized(minHashKmersSynchronized) { _kmerSize = kmerSize; _sketchSize = sketchSize; ModelCanonical _model(_kmerSize); _minHashValues = vector(_sketchSize, -1); _minHashKmers = vector(_sketchSize, 0); } MinhashSketcher(const MinhashSketcher& copy) : _model(copy._kmerSize), _itKmer(_model), _mutex(copy._mutex), _minHashValuesSynchronized(copy._minHashValuesSynchronized), _minHashKmersSynchronized(copy._minHashKmersSynchronized) { _kmerSize = copy._kmerSize; _sketchSize = copy._sketchSize; _minHashValues = vector(_sketchSize, -1); _minHashKmers = vector(_sketchSize, 0); } ~MinhashSketcher(){ //cout << "deleteeeeee" << endl; pthread_mutex_lock(_mutex); for(size_t i=0; i<_sketchSize; i++){ 
if(_minHashValues[i] < _minHashValuesSynchronized[i]){ _minHashValuesSynchronized[i] = _minHashValues[i]; _minHashKmersSynchronized[i] = _minHashKmers[i]; //cout << _minHashKmers[i] << endl; } } pthread_mutex_unlock(_mutex); } */ /* template class StorageItKmerCount { public: typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; StorageItKmerCount(Iterator* it){ _it = it; } ~StorageItKmerCount(){ delete _it; } bool next(){ _it->next(); return !_it->isDone(); } Count& item(){ return _it->item(); } Iterator* _it; }; template class SimkaPartitionWriter { public: typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; //typedef tuple*> KmerCount_It; struct Kmer_BankId_Count{ Type _type; u_int32_t _bankId; u_int16_t _count; Kmer_BankId_Count(){ } Kmer_BankId_Count(Type type, u_int64_t bankId, u_int64_t count){ _type = type; _bankId = bankId; _count = count; } }; string _outputDir; size_t _nbPartitions; vector _nbKmerPerParts; vector _nbDistinctKmerPerParts; vector _chordNiPerParts; vector* > _bags; vector* > _cachedBags; SimkaPartitionWriter(const string& oututDir, size_t nbPartitions){ _outputDir = oututDir; _nbPartitions = nbPartitions; _nbKmerPerParts = vector(_nbPartitions, 0); _nbDistinctKmerPerParts = vector(_nbPartitions, 0); _chordNiPerParts = vector(_nbPartitions, 0); //vector* > bags; //vector* > cachedBags; for(size_t i=0; i<_nbPartitions; i++){ //string outputFilename = _outputDir + "/" + _datasetID + "_" + Stringify::format("%i", i) + ".gz"; string outputFilename = _outputDir + "/" + Stringify::format("%i", i) + ".gz"; Bag* bag = new BagGzFile(outputFilename); Bag* cachedBag = new BagCache(bag, 10000); _cachedBags.push_back(cachedBag); //BagCache bagCache(*bag, 10000); _bags.push_back(bag); } } void insert(u_int64_t kmer, u_int64_t bankId, u_int64_t abundance){ //kmer.to_long(kmer_bin); size_t part = korenXor(kmer) % _nbPartitions; //hash_kmer(kmer_bin) % _nbPartitions; Type type; //(kmer_bin[0]); //type.setVal(kmer_bin[0]); 
type.setVal(kmer); //size_t part = oahash(kmer) % _nbPartitions; _cachedBags[part]->insert(Kmer_BankId_Count(type, bankId, abundance)); _nbDistinctKmerPerParts[part] += 1; _nbKmerPerParts[part] += abundance; _chordNiPerParts[part] += pow(abundance, 2); } void end(){ for(size_t i=0; i<_nbPartitions; i++){ //bags[i]->flush(); //cachedBags[i]->flush(); delete _cachedBags[i]; //delete bags[i]; } for(size_t i=0; i<_nbPartitions; i++){ string outputFilename = _outputDir + "/" + Stringify::format("%i", i) + ".gz"; checkGzFile(outputFilename); } } //There is a bug in simka, sometimes a gz file is erroneous at the end //It's really rare and I can't find it //My bad solution is to read the whole gz file as soon as it is close and a segfault will occur if it has a bad format //Of course it's a bad solution because it has a impact on simka performances... void checkGzFile(const string& filename){ IterableGzFile* gzFile = new IterableGzFile(filename, 10000); Iterator* it = gzFile->iterator(); it->first(); while(!it->isDone()){ it->next(); } delete it; delete gzFile; } inline u_int64_t korenXor(u_int64_t x)const{ x ^= (x << 21); x ^= (x >> 35); x ^= (x << 4); return x; } }; */ /********************************************************************* * ** SimkaAlgorithm *********************************************************************/ template class Simka2ComputeKmerSpectrumAlgorithm : public Algorithm { public: typedef typename Kmer::Type KmerType; typedef typename Kmer::ModelCanonical ModelCanonical; typedef typename Kmer::ModelCanonical::Iterator ModelCanonicalIterator; struct KmerCountSorter{ bool operator() (u_int64_t l, u_int64_t r) { return r > l; } }; //struct kxpcomp { bool operator() (KmerCount_It& l,KmerCount_It& r) { return (r._count.value < l._count.value); } } ; //u_int64_t _nbReads; //size_t _nbPartitions; u_int64_t _maxMemory; size_t _nbCores; string _outputDir; string _outputDirTemp; //size_t _nbBanks; string _inputFilename; //string _datasetID; u_int8_t 
_kmerSize; //pair _abundanceThreshold; //SIMKA_SOLID_KIND _solidKind; //bool _soliditySingle; int64_t _maxNbReads; size_t _minReadSize; double _minReadShannonIndex; //double _minKmerShannonIndex; //size_t _nbMinimizers; //size_t _nbCores; //SimkaStatistics* _stats; //SimkaDistance* _simkaDistance; //string _banksInputFilename; //string _h5Filename; //vector _tempFilenamesToDelete; //IBank* _banks; IProperties* _options; //size_t _localNbPartitions; //vector _bankNames; //vector _nbReadsPerDataset; //string _outputFilenameSuffix; //u_int64_t _totalKmers; //vector _nbBankPerDataset; //size_t _nbBankPerDataset; //string _largerBankId; //bool _computeSimpleDistances; //bool _computeComplexDistances; //bool _keepTmpFiles; //string _kmerDatataseFilename; //vector _cmds; //SimkaPartitionWriter* _partitionWriter; u_int32_t _seed; u_int32_t _sketchSize; bool _useAbundanceFilter; u_int32_t _nbDatasets; //pthread_mutex_t _mutex; //typedef typename SelectKmersCommand::KmerCountSorter KmerCountSorter; //std::priority_queue< u_int64_t, vector, KmerCountSorter> _kmerCountSorter; //KmerCountDictionaryType _kmerCounts; //size_t _nbBanks; //vector _bankNames; //vector _nbBankPerDataset; vector _threads; size_t _maxRunningThreads; vector _runningThreadIds; size_t _nbRunningThreads; vector _finishedThreads; mutex countKmersMutex; ofstream _outputFile; //string _outputFilenameKmers; //string _outputFilenameIds; IteratorListener* _progress; u_int64_t _progress_nbDatasetsToProcess; u_int64_t _progress_nbDatasetsProcessed; string _progress_text; Simka2ComputeKmerSpectrumAlgorithm(IProperties* options): Algorithm("simka", -1, options) { } void execute(){ //pthread_mutex_init(&_mutex, NULL); parseArgs(); createDirs(); cout << endl << "Checking input file validity..." 
<< endl; SimkaCommons::checkInputValidity(_outputDirTemp, _inputFilename, _progress_nbDatasetsToProcess); _progress = this->createIteratorListener (_progress_nbDatasetsToProcess, ""); //new ProgressSynchro ( //this->createIteratorListener (_progress_nbDatasetsToProcess, ""), //System::thread().newSynchronizer()); _progress->setMessage (Stringify::format (_progress_text.c_str(), _progress_nbDatasetsProcessed, _progress_nbDatasetsToProcess)); _progress->init (); countDatasets(); string command = "rm -rf " + _outputDirTemp; system(command.c_str()); cout << "Output results: " << _outputDir << endl; } void parseArgs(){ _options = getInput(); _seed = _options->getInt(STR_SIMKA_SEED); _sketchSize = _options->getInt(STR_SIMKA_SKETCH_SIZE); _useAbundanceFilter = _options->get(STR_SIMKA_ABUNDANCE_FILTER); _maxMemory = _options->getInt(STR_MAX_MEMORY); _nbCores = _options->getInt(STR_NB_CORES); _inputFilename = _options->getStr(STR_URI_INPUT); //_datasetID = _options->getStr(STR_SIMKA2_DATASET_ID); _outputDir = _options->getStr(STR_URI_OUTPUT); // ? _options->getStr(STR_URI_OUTPUT) : "./"; if(_outputDir.empty()) _outputDir = "./simkaMin_kmers.bin"; _outputDirTemp = System::file().getDirectory(_outputDir) + "/__simkaMin_temp__/"; //cout << "outputdir temp to check: " << _outputDirTemp << endl; //_outputDirTemp = _options->get(STR_URI_OUTPUT_TMP) ? 
_options->getStr(STR_URI_OUTPUT_TMP) : "./"; _kmerSize = _options->getInt(STR_KMER_SIZE); //_abundanceThreshold.first = _options->getInt(STR_KMER_ABUNDANCE_MIN); //_abundanceThreshold.second = min((u_int64_t)_options->getInt(STR_KMER_ABUNDANCE_MAX), (u_int64_t)(999999999)); //_nbPartitions = _options->getInt(STR_SIMKA2_NB_PARTITION); //cout << _options->getInt(STR_KMER_ABUNDANCE_MAX) << endl; //cout << _abundanceThreshold.second << endl; //_soliditySingle = _options->get(STR_SIMKA_SOLIDITY_PER_DATASET); //_nbMinimizers = _options->getInt(STR_KMER_PER_READ); //_maxDisk = getInput()->getInt(STR_MAX_DISK); //read filter _maxNbReads = _options->getInt(STR_SIMKA_MAX_READS); _minReadSize = _options->getInt(STR_SIMKA_MIN_READ_SIZE); _minReadShannonIndex = _options->getDouble(STR_SIMKA_MIN_READ_SHANNON_INDEX); _minReadShannonIndex = std::max(_minReadShannonIndex, 0.0); _minReadShannonIndex = std::min(_minReadShannonIndex, 2.0); if(!System::file().doesExist(_inputFilename)){ std::cerr << "Error: input does not exist (" << _inputFilename << ")" << std::endl; exit(1); } if(System::file().doesExist(_outputDir)){ std::cerr << "Error: output file already exist (" << _outputDir << ")" << std::endl; exit(1); } _progress_text = "Sketching datasets (%d/%d)"; //_nbBankPerDataset = _options->getInt("-nb-dataset"); //_minKmerShannonIndex = _options->getDouble(STR_SIMKA_MIN_KMER_SHANNON_INDEX); //_minKmerShannonIndex = std::max(_minKmerShannonIndex, 0.0); //_minKmerShannonIndex = std::min(_minKmerShannonIndex, 2.0); //if(!System::file().doesExist(_inputFilename)){ // cerr << "ERROR: Input filename does not exist" << endl; // exit(1); //} //if(!System::file().doesExist(_outputDir)){ // std::cerr << "Error: can't create output directory (" << _outputDir << ")" << std::endl; // exit(EXIT_FAILURE); /* int ok = System::file().mkdir(_outputDir, -1); if(ok != 0){ std::cerr << "Error: can't create output directory (" << _outputDir << ")" << std::endl; exit(1); }*/ //} //_outputDirTemp = 
_outputDirTemp; //if(!System::file().doesExist(_outputDirTemp)){ //std::cerr << "Error: can't create output temp directory (" << _outputDirTemp << ")" << std::endl; //exit(EXIT_FAILURE); /* int ok = System::file().mkdir(_outputDirTemp, -1); if(ok != 0){ std::cerr << "Error: can't create output temp directory (" << _outputDirTemp << ")" << std::endl; exit(1); }*/ //} //_outputDirTemp = System::file().getRealPath(_outputDirTemp) + "/"; //cout << _outputDirTemp << endl; //_outputDirTemp += "/" + _datasetID + "_temp" + "/"; //System::file().mkdir(_outputDirTemp, -1); //_options->setStr(STR_URI_OUTPUT_TMP, _outputDirTemp); //System::file().mkdir(_outputDirTemp + "/input/", -1); //_maxMemory = _maxMemory / 1000; //_maxMemory = max(_maxMemory, (u_int64_t) 1); /* if(_outputDir.empty()){ _outputDir = "./simkaMin_kmers.bin"; } else if (_outputDir.find(".") == std::string::npos){ _outputDir += ".bin"; } _outputDir = System::file().getBaseName(_outputDir); cout << endl << endl; cout << _outputDir << endl; vector fields; stringstream outputFilenameStream(_outputDir); string field; while(std::getline(outputFilenameStream, field, '.')) { cout << field << endl; fields.push_back(field); } string prefix = fields[0]; string extension = ""; for(size_t i=1; isetStr(STR_URI_OUTPUT_TMP, _outputDirTemp); //System::file().mkdir(_outputDirTemp + "/input/", -1); } void countDatasets(){ //cout << endl << endl; //cout << "Sketching..." 
<< endl; _outputFile.open(_outputDir, ios::binary); //Save sketch info //u_int8_t kmerSize = _kmerSize; //u_int32_t sketchSize = _sketchSize; //u_int32_t seed = _seed; _nbDatasets = 0; _outputFile.write((const char*)&_kmerSize, sizeof(_kmerSize)); _outputFile.write((const char*)&_sketchSize, sizeof(_sketchSize)); _outputFile.write((const char*)&_seed, sizeof(_seed)); _outputFile.write((const char*)&_nbDatasets, sizeof(_nbDatasets)); //cout << _maxRunningThreads << endl; size_t threadId = 0; //vector threads; //(_nbCores); //_isThreadRunning = vector(_nbCores); _nbRunningThreads = 0; _maxRunningThreads = _nbCores; string inputDir = _outputDirTemp; // + "/input/"; ifstream inputFile(_inputFilename.c_str()); //ofstream outputFileIds(_outputFilenameIds.c_str(), ios::binary); //_banksInputFilename = inputDir + "__input_simka__"; //_inputFilename + "_dsk_dataset_temp__"; //IFile* bankFile = System::file().newFile(_banksInputFilename, "wb"); string line; string linePart; vector lineIdDatasets; vector linepartPairedDatasets; vector linepartDatasets; //string bankFileContents = ""; size_t datasetId = 0; u_int64_t lineIndex = 0; u_int64_t bankIdBytePos = 0; while(getline(inputFile, line)){ line.erase(std::remove(line.begin(),line.end(),' '),line.end()); if(line == "") continue; //cout << line << endl; lineIdDatasets.clear(); linepartPairedDatasets.clear(); //vector filenames; stringstream lineStream(line); while(getline(lineStream, linePart, ':')){ lineIdDatasets.push_back(linePart); } string bankId = lineIdDatasets[0]; string linePairedDatasets = lineIdDatasets[1]; stringstream linePairedDatasetsStream(linePairedDatasets); while(getline(linePairedDatasetsStream, linePart, ';')){ linepartPairedDatasets.push_back(linePart); } string subBankFilename = inputDir + bankId; IFile* subBankFile = System::file().newFile(subBankFilename, "wb"); //cout << subBankFile->getPath() << endl; string subBankContents = ""; size_t nbBankPerDataset = linepartPairedDatasets.size(); for(size_t 
i=0; ifwrite(subBankContents.c_str(), subBankContents.size(), 1); subBankFile->flush(); delete subBankFile; //bankFileContents += inputDir + "/" + bankId + "\n"; lineIndex += 1; startNewThread(datasetId, subBankFilename, nbBankPerDataset); //count(); //_bankNames.push_back(bankId); datasetId += 1; _nbDatasets += 1; } //bankFileContents.erase(bankFileContents.size()-1); //bankFile->fwrite(bankFileContents.c_str(), bankFileContents.size(), 1); //bankFile->flush(); //delete bankFile; joinThreads(); _progress->finish(); inputFile.close(); writeIds(); //outputFileIds.seekp(0); //outputFileIds.write((const char*)&nbDatasets, sizeof(nbDatasets)); _outputFile.close(); //outputFileIds.close(); } void writeIds(){ _outputFile.seekp(SimkaMinCommons::getFilePosition_nbDatasets()); _outputFile.write((const char*)&_nbDatasets, sizeof(_nbDatasets)); _outputFile.seekp(SimkaMinCommons::getFilePosition_sketchIds(_nbDatasets, _sketchSize)); ifstream inputFile(_inputFilename.c_str()); string line; string linePart; vector lineIdDatasets; while(getline(inputFile, line)){ line.erase(std::remove(line.begin(),line.end(),' '),line.end()); if(line == "") continue; lineIdDatasets.clear(); stringstream lineStream(line); while(getline(lineStream, linePart, ':')){ lineIdDatasets.push_back(linePart); } string bankId = lineIdDatasets[0]; u_int8_t idSize = bankId.size(); _outputFile.write((const char*)& idSize, sizeof(idSize)); _outputFile.write(bankId.c_str(), bankId.size()); } inputFile.close(); } void startNewThread(size_t datasetId, const string& inputFilename, size_t nbBankPerDataset){ //for (size_t i=0; i<_nbBanks; i++){ // cout << i << endl; thread* t = new thread(&Simka2ComputeKmerSpectrumAlgorithm::countKmersOfDataset, this, datasetId, inputFilename, nbBankPerDataset); _threads.push_back(t); _runningThreadIds.push_back(datasetId); //threadId += 1; _nbRunningThreads += 1; //_isThreadRunning[threadId] = true; //_nbRunningThreads[i] += 1; if(_nbRunningThreads >= _maxRunningThreads){ 
waitThreads(); } //} //string filename = _outputDirTemp + "/selectedKmers.bin"; //ofstream selectKmersFile(filename.c_str(), ios::binary); //cout << _selectedKmerSorter.size() << " " << _nbUsedKmers << endl; //_selectedKmerSorter.pop(); //there is always one extra element because of a >= optimization... //cout << _selectedKmerSorter.size() << " " << _nbUsedKmers << endl; //u_int64_t size = _selectedKmerSorter.size(); //for(size_t i=0; i > _skecthCounts; //unordered_map > _; void countKmersOfDataset(size_t datasetId, const string& inputFilename, size_t nbBankPerDataset){ //TODO lock probably not required //countKmersMutex.lock(); //cout << "start: " << inputFilename << endl; //countKmersMutex.unlock(); IBank* bank = Bank::open(inputFilename); LOCAL(bank); SimkaSequenceFilter sequenceFilter(_minReadSize, _minReadShannonIndex); IBank* filteredBank = new SimkaPotaraBankFiltered(bank, sequenceFilter, _maxNbReads, nbBankPerDataset); LOCAL(filteredBank); Iterator* itSeq = filteredBank->iterator(); LOCAL(itSeq); //Iterator* itSeq = createIterator ( // filteredBank->iterator(), // filteredBank->estimateNbItems(), // "Computing minhash sketch and counting" //); //LOCAL(itSeq); IDispatcher* dispatcher = new SerialDispatcher(); Bloom* bloomFilter = 0; if(_useAbundanceFilter){ u_int64_t bloomMemoryBits = (_maxMemory * MBYTE * 8) / _maxRunningThreads; bloomMemoryBits = max(bloomMemoryBits, (u_int64_t) 10000); bloomFilter = new BloomCacheCoherent(bloomMemoryBits, 7); } //mutex commandMutex; //std::priority_queue< u_int64_t, vector, KmerCountSorter> kmerCountSorter; //unordered_map kmerCounts; vector kmers(_sketchSize, 0); //TODO only used for reversing kmers not really optimized... 
KmerCountDictionaryType _kmerCounts; { SelectKmersCommand command(_kmerSize, _sketchSize, _seed, bloomFilter, kmers, _kmerCounts, _useAbundanceFilter); dispatcher->iterate (itSeq, command, 1000); } /* ModelCanonical model; ModelCanonicalIterator itKmer(model); u_int64_t _hash_otpt[2]; u_int64_t _nbInsertedKmersInBloom = 0; for(itSeq->first(); !itSeq->isDone(); itSeq->next()){ Sequence& sequence = itSeq->item(); } */ delete dispatcher; delete bloomFilter; countKmersMutex.lock(); u_int64_t filePos = (datasetId * _sketchSize * sizeof(KmerAndCountType)) + KMER_SPECTRUM_HEADER_SIZE; //cout << "DATASTE ID: " << datasetId << " " << filePos << endl; _outputFile.seekp(filePos); //_kmerCountSorter.pop(); //Discard greater element because queue size is always equal to (_sketchSize + 1) because of an optimization //cout << "----------" << endl; for(size_t i=0; iinsert(kmerCount._kmer, _datasetIDbin, kmerCount._count); //_kmerCountSorter.pop(); //_partitionWriter->insert(_minHashKmers[i], _datasetIDbin, _minHashKmersCounts[i] ); //cout << _minHashKmers[i] << " " << _minHashKmersCounts[i] << endl; } System::file().remove(inputFilename); _progress_nbDatasetsProcessed += 1; _progress->setMessage (Stringify::format (_progress_text.c_str(), _progress_nbDatasetsProcessed, _progress_nbDatasetsToProcess)); _progress->inc(1); //cout << "end: " << inputFilename << endl; _finishedThreads.push_back(datasetId); countKmersMutex.unlock(); } void waitThreads(){ while(1){ bool isThreadAvailbale = false; countKmersMutex.lock(); for(size_t i=0; i<_finishedThreads.size(); i++){ size_t threadId = _finishedThreads[i]; //_runningThreadIds.erase(std::remove(_runningThreadIds.begin(), _runningThreadIds.end(), threadId), _runningThreadIds.end()); auto it = find(_runningThreadIds.begin(), _runningThreadIds.end(), threadId); int pos = distance(_runningThreadIds.begin(), it); //cout << "\t removing thread " << threadId << " (pos: " << pos << ")" << endl; 
_runningThreadIds.erase(_runningThreadIds.begin()+pos); _threads[pos]->join(); delete _threads[pos]; _threads.erase(_threads.begin()+pos); _nbRunningThreads -= 1; isThreadAvailbale = true; } _finishedThreads.clear(); countKmersMutex.unlock(); if(isThreadAvailbale){ //cout << _runningThreadIds.size() << " " << _threads.size() << endl; //countKmersMutex.unlock(); break; } sleep(1); } } void joinThreads(){ while(_nbRunningThreads > 0) waitThreads(); } }; class Simka2ComputeKmerSpectrum : public Tool{ public: Simka2ComputeKmerSpectrum(): Tool ("SimkaMin-ComputeKmerSpectrum"){ IOptionsParser* parser = getParser();//new OptionsParser ("Simka2 - Compute Kmer Spectrum"); //Main parser //parser->push_front (new OptionNoParam (STR_SIMKA_COMPUTE_DATA_INFO, "compute (and display) information before running Simka, such as the number of reads per dataset", false)); //parser->push_front (new OptionNoParam (STR_SIMKA_KEEP_TMP_FILES, "keep temporary files", false)); //parser->push_front (new OptionOneParam (STR_URI_OUTPUT_TMP, "output directory for temporary files", true)); parser->push_front (new OptionOneParam (STR_SIMKA_SEED, "seed used for random k-mer selection", false, "100")); parser->push_front (new OptionOneParam (STR_URI_OUTPUT, "output filename for kmer spectrum", false, "./simkaMin_kmers.bin")); parser->push_front (new OptionOneParam (STR_URI_INPUT, "input filename | TODO SPECIF", true)); //parser->push_front (new OptionOneParam (STR_SIMKA2_DATASET_ID, "identifier of the input dataset", true)); //parser->push_back (new OptionOneParam (STR_URI_OUTPUT_TMP, "output directory for temporary files", true)); //IOptionsParser* parser = getParser(); //IOptionsParser* dskParser = SortingCountAlgorithm<>::getOptionsParser(); //parser->push_back(dskParser); //dskParser->setVisible(false); //cout << parser->getParser(STR_NB_CORES) << endl; // //parser->push_back(new OptionOneParam(parser->getParser(STR_NB_CORES)->getName(), parser->getParser(STR_NB_CORES)->getHelp(), false, "0")); 
//parser->push_front(dskParser->getParser (STR_URI_OUTPUT_TMP)); //dskParser->getParser (STR_URI_OUTPUT_TMP)->setMandatory //parser->push_front(dskParser->getParser (STR_URI_OUTPUT)); //parser->getParser (STR_URI_OUTPUT)->setHelp("output directory for result files (similarity matrix, heatmaps)"); //parser->push_front(dskParser->getParser (STR_URI_INPUT)); //parser->getParser(STR_URI_INPUT)->setHelp("input file of datasets. One dataset per line: id filename1 filename2..."); //if (Option* p = dynamic_cast (parser->getParser(STR_URI_OUTPUT_TMP))) { p->s; } //Distance parser //IOptionsParser* distanceParser = new OptionsParser ("distance"); //distanceParser->push_back (new OptionNoParam (STR_SIMKA_COMPUTE_ALL_SIMPLE_DISTANCES, "compute all simple distances (Chord, Hellinger...)", false)); //distanceParser->push_back (new OptionNoParam (STR_SIMKA_COMPUTE_ALL_COMPLEX_DISTANCES, "compute all complex distances (Jensen-Shannon...)", false)); //Kmer parser IOptionsParser* kmerParser = new OptionsParser ("kmer"); kmerParser->push_back (new OptionOneParam (STR_KMER_SIZE, "size of a kmer", false, "21")); kmerParser->push_back (new OptionOneParam (STR_SIMKA_SKETCH_SIZE, "number of kmers used to compute distances", false, "100000")); kmerParser->push_back (new OptionNoParam (STR_SIMKA_ABUNDANCE_FILTER, "filter out k-mer seen one time (potentially erroneous)", false)); //kmerParser->push_back(dskParser->getParser (STR_KMER_SIZE)); //kmerParser->push_back(new OptionOneParam (STR_KMER_PER_READ.c_str(), "number of selected kmers per read", false, "0")); //kmerParser->push_back (new OptionOneParam (STR_KMER_ABUNDANCE_MIN, "min abundance a kmer need to be considered", false, "1")); //kmerParser->push_back (new OptionOneParam (STR_KMER_ABUNDANCE_MIN, "min abundance a kmer need to be considered", false, "2")); //KmerCountType maxAbundance = -1; //kmerParser->push_back (new OptionOneParam (STR_KMER_ABUNDANCE_MAX, "max abundance a kmer can have to be considered", false, 
Stringify::format("%i", maxAbundance))); //kmerParser->push_back(dskParser->getParser (STR_KMER_ABUNDANCE_MIN)); //if (Option* p = dynamic_cast (parser->getParser(STR_KMER_ABUNDANCE_MIN))) { p->setDefaultValue ("0"); } //if (Option* p = dynamic_cast (parser->getParser(STR_SOLIDITY_KIND))) { p->setDefaultValue ("all"); } //kmerParser->push_back(dskParser->getParser (STR_KMER_ABUNDANCE_MAX)); //kmerParser->push_back(dskParser->getParser (STR_SOLIDITY_KIND)); //kmerParser->getParser (STR_SOLIDITY_KIND)->setHelp("TODO"); //kmerParser->push_back (new OptionNoParam (STR_SIMKA_SOLIDITY_PER_DATASET.c_str(), "do not take into consideration multi-counting when determining solid kmers", false )); //kmerParser->push_back (new OptionOneParam (STR_SIMKA_MIN_KMER_SHANNON_INDEX.c_str(), "minimal Shannon index a kmer should have to be kept. Float in [0,2]", false, "0" )); //Read filter parser IOptionsParser* readParser = new OptionsParser ("read"); readParser->push_back (new OptionOneParam (STR_SIMKA_MAX_READS.c_str(), "maximum number of reads to process. Set to 0 to use all reads", false, "0" )); readParser->push_back (new OptionOneParam (STR_SIMKA_MIN_READ_SIZE.c_str(), "minimal size a read should have to be kept", false, "0" )); readParser->push_back (new OptionOneParam (STR_SIMKA_MIN_READ_SHANNON_INDEX.c_str(), "minimal Shannon index a read should have to be kept. Float in [0,2]", false, "0" )); //readParser->push_back (new OptionOneParam ("-nb-dataset", "nb paired datasets", true)); //Core parser IOptionsParser* coreParser = new OptionsParser ("core"); coreParser->push_back(new OptionOneParam(STR_NB_CORES, "number of cores", false, "0")); coreParser->push_back (new OptionOneParam (STR_MAX_MEMORY, "max memory (MB). 
Only used if -filter is enabled", false, "8000")); //coreParser->push_back (new OptionOneParam (STR_SIMKA2_NB_PARTITION, "nb partitions", true)); //coreParser->push_back(dskParser->getParser ()); //coreParser->push_back(dskParser->getParser (STR_MAX_DISK)); //Distances //IOptionsParser* distanceParser = new OptionsParser ("distances"); //distanceParser->push_back (new OptionNoParam (STR_SIMKA_DISTANCE_BRAYCURTIS.c_str(), "compute Bray Curtis distance")); //distanceParser->push_back (new OptionNoParam (STR_SIMKA_DISTANCE_CHORD.c_str(), "compute Chord distance")); //distanceParser->push_back (new OptionNoParam (STR_SIMKA_DISTANCE_HELLINGER.c_str(), "compute Hellinger distance")); //distanceParser->push_back (new OptionNoParam (STR_SIMKA_DISTANCE_CANBERRA.c_str(), "compute Canberra distance")); //distanceParser->push_back (new OptionNoParam (STR_SIMKA_DISTANCE_KULCZYNSKI.c_str(), "compute Kulczynski distance")); //parser->push_back(distanceParser); parser->push_back(kmerParser); parser->push_back(readParser); parser->push_back(coreParser); //parser->push_back(distanceParser); //IOptionsParser* dskParser = SortingCountAlgorithm<>::getOptionsParser(); //if (Option* p = dynamic_cast (dskParser->getParser(STR_MINIMIZER_SIZE))) { p->setDefaultValue ("7"); } //parser->push_back(dskParser); //if (Option* p = dynamic_cast (dskParser->getParser(STR_MINIMIZER_SIZE))) { p->setDefaultValue ("7"); } //dskParser->setVisible(false); parser->getParser(STR_NB_CORES)->setVisible(false); //getParser()->push_back(parser); //if (Option* p = dynamic_cast (parser->getParser(STR_SOLIDITY_KIND))) { p->setDefaultValue ("all"); } //return parser; } ~Simka2ComputeKmerSpectrum(){ } struct Parameter { //Parameter (Simka& simka, IProperties* props) : props(props) {} Parameter (IProperties* props) : _props(props) {} //Simka& _simka; IProperties* _props; }; template struct Functor { void operator () (Parameter p){ Simka2ComputeKmerSpectrumAlgorithm* algo = new 
Simka2ComputeKmerSpectrumAlgorithm(p._props); algo->execute(); delete algo; } }; void execute () { IProperties* input = getInput(); //Parameter params(*this, getInput()); Parameter params(input); size_t kmerSize = getInput()->getInt (STR_KMER_SIZE); Integer::apply (kmerSize, params); } }; #endif /* SIMKA1_4_SRC_SIMKAMIN_SIMKAMINCOUNT_HPP_ */ simka-1.5.1/src/simkaMin/SimkaMinDistance.hpp000066400000000000000000000744071353413740300210740ustar00rootroot00000000000000/***************************************************************************** * SimkaMin: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2019 INRIA * Authors: G.Benoit * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . 
*****************************************************************************/ #ifndef SIMKA1_4_SRC_SIMKAMIN_SIMKAMINDISTANCE_HPP_ #define SIMKA1_4_SRC_SIMKAMIN_SIMKAMINDISTANCE_HPP_ #include "SimkaMinCommons.hpp" #include class KmerSpectrumIterator{ public: //FILE * _is; //ifstream _kmerSpectrumFile; size_t _sketchSize; bool _isDone; u_int64_t _nbItems; size_t _datasetId; vector >& _kmercountSketches; size_t _nbDatasetOffset; //u_int8_t* _buffer; //u_int64_t _bufferSize; //KmerAndCountType* _buffer; //vector > _buffers; //vector _buffers_isInit; KmerSpectrumIterator(const string& filename, vector >& kmercountSketches, size_t nbDatasetOffset) : _kmercountSketches(kmercountSketches) { //_buffer = 0; //_is = fopen((filename).c_str(), "rb"); //_kmerSpectrumFile.open(filename + ".kmers", ios::binary); //_sketchSize = sketchSize; _nbDatasetOffset = nbDatasetOffset; //cout << nbDatasetOffset << endl; //_buffer = (KmerAndCountType*) MALLOC (sizeof(KmerAndCountType) * _sketchSize); } ~KmerSpectrumIterator(){ //fclose(_is); //_kmerSpectrumFile.close(); //if(_buffer){FREE (_buffer);} } void first(size_t datasetId){ //cout << datasetId << " " << _nbDatasetOffset << endl; _datasetId = datasetId-_nbDatasetOffset; //cout << datasetId << " " << _nbDatasetOffset << " " << _datasetId<< endl; //if(_buffer){FREE (_buffer);} //u_int64_t pos = KMER_SPECTRUM_HEADER_SIZE + (datasetId*_sketchSize*sizeof(KmerAndCountType)); //fseek(_is, pos, SEEK_SET); _nbItems = 0; _sketchSize = _kmercountSketches[_datasetId].size(); //cout << sizeof(KmerAndCountType) << endl; //_kmerSpectrumFile.read((char*)_buffer, 10*_sketchSize); //int res = fread(_buffer, sizeof(KmerAndCountType), _sketchSize, _is); } inline bool isDone(){ return _nbItems >= _sketchSize; } inline void next(u_int64_t& kmer, KmerCountType& count){ //KmerAndCountType kmerCount; // = _buffer[_nbItems]; //memcpy(&kmerCount, &_buffer[_nbItems*10], 10); KmerAndCountType& kmerCount = _kmercountSketches[_datasetId][_nbItems]; //cout << 
_datasetId << " " << _nbDatasetOffset << " " << _kmercountSketches[_datasetId].size() << endl; kmer = kmerCount._kmer; count = kmerCount._count; //cout << kmer << " " << count << endl; //_kmerSpectrumFile.read((char*)(&kmer), sizeof(kmer)); //_kmerSpectrumFile.read((char*)(&count), sizeof(count)); _nbItems += 1; } }; class ComputeDistanceManager{ public: ofstream& _distanceMatrixJaccard; ofstream& _distanceMatrixBrayCurtis; //size_t _sketchSize; KmerSpectrumIterator* _kmerSpectrumiterator1; KmerSpectrumIterator* _kmerSpectrumiterator2; bool _isSymmetrical; u_int64_t _nbDistinctKmers; u_int64_t _nbDistinctSharedKmers; u_int64_t _nbKmers; u_int64_t _nbSharedKmers; size_t _nbDatasets1; size_t _nbDatasets2; vector _jaccardDistances; vector _braycurtisDistances; u_int64_t _jaccardDistances_nb; mutex& _mutex; //u_int64_t nbLala; ComputeDistanceManager(const string& filename1, const string& filename2, ofstream& distanceMatrixJaccard, ofstream& distanceMatrixBrayCurtis, bool isSymmetrical, size_t nbDatasets1, size_t nbDatasets2, mutex& mutex, size_t main_start_i, size_t main_start_j, size_t n_i, size_t n_j, vector >& _kmercountSketches_i, vector >& _kmercountSketches_j) : _distanceMatrixJaccard(distanceMatrixJaccard), _distanceMatrixBrayCurtis(distanceMatrixBrayCurtis), _mutex(mutex) { //_sketchSize = sketchSize; _kmerSpectrumiterator1 = new KmerSpectrumIterator(filename1, _kmercountSketches_i, main_start_i); _kmerSpectrumiterator2 = new KmerSpectrumIterator(filename2, _kmercountSketches_j, main_start_j); _isSymmetrical = isSymmetrical; _nbDatasets1 = nbDatasets1; _nbDatasets2 = nbDatasets2; _jaccardDistances.resize(1000); _braycurtisDistances.resize(1000); _jaccardDistances_nb = 0; //nbLala = 0; } ~ComputeDistanceManager(){ delete _kmerSpectrumiterator1; delete _kmerSpectrumiterator2; if(_jaccardDistances_nb > 0){ writeDistances(); /* _mutex.lock(); for(size_t i=0; i<_jaccardDistances.size() ; i++){ PairwiseDistance& jaccard = _jaccardDistances[i]; PairwiseDistance& 
braycurtis = _braycurtisDistances[i]; u_int64_t pos = jaccard._i*_nbDatasets2*sizeof(DistanceValueType) + (jaccard._j*sizeof(DistanceValueType)); _distanceMatrixJaccard.seekp(pos); _distanceMatrixBrayCurtis.seekp(pos); _distanceMatrixJaccard.write((const char*)&jaccard._distance, sizeof(jaccard._distance)); _distanceMatrixBrayCurtis.write((const char*)&braycurtis._distance, sizeof(braycurtis._distance)); if(_isSymmetrical){ u_int64_t pos = jaccard._j*_nbDatasets1*sizeof(DistanceValueType) + (jaccard._i*sizeof(DistanceValueType)); _distanceMatrixJaccard.seekp(pos); _distanceMatrixBrayCurtis.seekp(pos); _distanceMatrixJaccard.write((const char*)&jaccard._distance, sizeof(jaccard._distance)); _distanceMatrixBrayCurtis.write((const char*)&braycurtis._distance, sizeof(braycurtis._distance)); } } _mutex.unlock(); _jaccardDistances_nb = 0; //_braycurtisDistances.clear(); //_jaccardDistances.clear(); */ } //cout << nbLala << endl; } void computeDistance_unsynch(size_t i, size_t j){ //nbLala += 1; //_mutex.lock(); //lala += 1; //_mutex.unlock(); _nbDistinctSharedKmers = 0; _nbDistinctKmers = 0; _nbKmers = 0; _nbSharedKmers = 0; _kmerSpectrumiterator1->first(i); _kmerSpectrumiterator2->first(j); u_int64_t sketchSize = min(_kmerSpectrumiterator1->_sketchSize, _kmerSpectrumiterator2->_sketchSize); u_int64_t kmer1; u_int64_t kmer2; KmerCountType count1; KmerCountType count2; _kmerSpectrumiterator1->next(kmer1, count1); _kmerSpectrumiterator2->next(kmer2, count2); while(_nbDistinctKmers < sketchSize){ //_nbDistinctKmers < _sketchSize && (!_kmerSpectrumiterator1->isDone()) && (!_kmerSpectrumiterator2->isDone()) ){ //cout << kmer1 << " " << kmer2 << endl; if(kmer1 > kmer2){ _nbDistinctKmers += 1; _nbKmers += count2; if(_kmerSpectrumiterator2->isDone()) break; _kmerSpectrumiterator2->next(kmer2, count2); } else if(kmer1 < kmer2){ _nbDistinctKmers += 1; _nbKmers += count1; if(_kmerSpectrumiterator1->isDone()) break; _kmerSpectrumiterator1->next(kmer1, count1); } else{ 
_nbDistinctKmers += 1; _nbKmers += count1 + count2; _nbDistinctSharedKmers += 1; _nbSharedKmers += min(count1, count2); if(_kmerSpectrumiterator2->isDone() || _kmerSpectrumiterator1->isDone()) break; _kmerSpectrumiterator1->next(kmer1, count1); _kmerSpectrumiterator2->next(kmer2, count2); } } DistanceValueType jaccard; DistanceValueType braycurtis; if(_nbDistinctKmers == 0){ jaccard = 1; } else{ jaccard = 1 - (long double) _nbDistinctSharedKmers / (long double) _nbDistinctKmers; } if(_nbKmers == 0){ braycurtis = 1; }else{ braycurtis = 1 - (long double) (2*_nbSharedKmers) / (long double) _nbKmers; } //_mutex.lock(); //cout << i << " " << j << " " << braycurtis << endl; //_mutex.unlock(); _jaccardDistances[_jaccardDistances_nb].set(i, j, jaccard); _braycurtisDistances[_jaccardDistances_nb].set(i, j, braycurtis); _jaccardDistances_nb += 1; if(_jaccardDistances_nb == _jaccardDistances.size()){ writeDistances(); //_braycurtisDistances.clear(); //_jaccardDistances.clear(); } //cout << "NB DISTINCT KMERS: " << _nbDistinctKmers << endl; //cout << "NB SHARED DISTINCT KMERS: " << _nbDistinctSharedKmers << endl; //cout << "JACCARD: " << << endl; //cout << "BRAY CURTIS: " << 1 - (long double) (2*_nbSharedKmers) / (long double) _nbKmers << endl; } void writeDistances(){ _mutex.lock(); size_t last_i = -1; for(size_t i=0; i<_jaccardDistances_nb ; i++){ PairwiseDistance& jaccard = _jaccardDistances[i]; PairwiseDistance& braycurtis = _braycurtisDistances[i]; //cout << jaccard._i << " " << jaccard._j << endl; if(jaccard._i != last_i){ u_int64_t pos = jaccard._i*_nbDatasets1*sizeof(DistanceValueType) + (jaccard._j*sizeof(DistanceValueType)); _distanceMatrixJaccard.seekp(pos); _distanceMatrixBrayCurtis.seekp(pos); last_i = jaccard._i; } _distanceMatrixJaccard.write((const char*)&jaccard._distance, sizeof(jaccard._distance)); _distanceMatrixBrayCurtis.write((const char*)&braycurtis._distance, sizeof(braycurtis._distance)); } //reprise: essayer d'écrire la partie symétrique sans acces 
random au disque /* //if(_isSymmetrical){ for(size_t i=0; i<_jaccardDistances_nb ; i++){ PairwiseDistance& jaccard = _jaccardDistances[i]; PairwiseDistance& braycurtis = _braycurtisDistances[i]; //if(i==0){ u_int64_t pos = jaccard._j*_nbDatasets1*sizeof(DistanceValueType) + (jaccard._i*sizeof(DistanceValueType)); //cout << pos << endl; _distanceMatrixJaccard.seekp(pos); _distanceMatrixBrayCurtis.seekp(pos); //} _distanceMatrixJaccard.write((const char*)&jaccard._distance, sizeof(jaccard._distance)); _distanceMatrixBrayCurtis.write((const char*)&braycurtis._distance, sizeof(braycurtis._distance)); } //} */ _mutex.unlock(); _jaccardDistances_nb = 0; } }; class SimkaMinDistanceAlgorithm : public Algorithm { public: size_t _nbCores; string _outputDir; string _inputFilename1; string _inputFilename2; //pair _abundanceThreshold; //SIMKA_SOLID_KIND _solidKind; //bool _soliditySingle; //int64_t _maxNbReads; //size_t _minReadSize; //double _minReadShannonIndex; //double _minKmerShannonIndex; //size_t _nbMinimizers; //size_t _nbCores; //SimkaStatistics* _stats; //SimkaDistance* _simkaDistance; //string _banksInputFilename; //string _h5Filename; //vector _tempFilenamesToDelete; //IBank* _banks; IProperties* _options; //size_t _localNbPartitions; //vector _bankNames; //vector _nbReadsPerDataset; //string _outputFilenameSuffix; //u_int64_t _totalKmers; //vector _nbBankPerDataset; //size_t _nbBankPerDataset; //string _largerBankId; //bool _computeSimpleDistances; //bool _computeComplexDistances; //bool _keepTmpFiles; //string _kmerDatataseFilename; //vector _cmds; //SimkaPartitionWriter* _partitionWriter; //vector> _bufferKmers; //vector> _bufferCounts; //vector _bufferIndex; //vector _minHashValues; //vector _minHashKmers; //vector _minHashKmersCounts; u_int32_t _sketchSize_1, _sketchSize_2; u_int32_t _seed; //pthread_mutex_t _mutex; //typedef typename SelectKmersCommand::KmerCountSorter KmerCountSorter; //std::priority_queue< u_int64_t, vector, KmerCountSorter> 
_kmerCountSorter; //KmerCountDictionaryType _kmerCounts; size_t _nbBanks; //vector _bankNames; //vector _nbBankPerDataset; vector _threads; size_t _maxRunningThreads; vector _runningThreadIds; size_t _nbRunningThreads; vector _finishedThreads; mutex countKmersMutex; //vector _datasetIds1; //vector _datasetIds2; u_int32_t _nbDataset1; u_int32_t _nbDataset2; ofstream _distanceMatrixJaccard; ofstream _distanceMatrixBrayCurtis; mutex _mutex; IteratorListener* _progress; u_int64_t _progress_distanceStep; //u_int64_t _progress_nbDistancesToCompute; u_int64_t _progress_nbDistancesComputed; //string _progress_text; size_t _start_i, _start_j; size_t _n_i, _n_j; vector > _kmercountSketches_i; vector > _kmercountSketches_j; SimkaMinDistanceAlgorithm(IProperties* options): Algorithm("simkaMinDistanceAlgorithm", -1, options) { } void execute(){ //pthread_mutex_init(&_mutex, NULL); parseArgs(); readInfos(); loadSketches(); distance(); //createDirs(); //SimkaCommons::checkInputValidity(_outputDirTemp, _inputFilename); //countDatasets(); //string command = "rm -rf " + _outputDirTemp; //system(command.c_str()); cout << "Output results: " << _outputDir << endl; } void parseArgs(){ _options = getInput(); //_sketchSize = _options->getInt(STR_SIMKA_SKETCH_SIZE); _nbCores = _options->getInt(STR_NB_CORES); _inputFilename1 = _options->getStr(STR_SIMKA_URI_INPUT_1); _inputFilename2 = _options->getStr(STR_SIMKA_URI_INPUT_2); _outputDir = _options->getStr(STR_URI_OUTPUT); _start_i = _options->getInt("-start-i"); _start_j = _options->getInt("-start-j"); _n_i = _options->getInt("-n-i"); _n_j = _options->getInt("-n-j"); //_kmerSize = _options->getInt(STR_KMER_SIZE); if(!System::file().doesExist(_outputDir)){ int ok = System::file().mkdir(_outputDir, -1); if(ok != 0){ std::cerr << "Error: can't create output directory (" << _outputDir << ")" << std::endl; exit(1); } } } void readInfos(){ //_nbDataset1 = SimkaMinCommons::readNbDatasets(_inputFilename1); //_nbDataset2 = 
SimkaMinCommons::readNbDatasets(_inputFilename2); //u_int32_t sketchSize1; //u_int32_t sketchSize2; u_int8_t kmerSizeDummy; SimkaMinCommons::getKmerInfos(_inputFilename1, kmerSizeDummy, _sketchSize_1, _seed, _nbDataset1); SimkaMinCommons::getKmerInfos(_inputFilename2, kmerSizeDummy, _sketchSize_2, _seed, _nbDataset2); //_sketchSize = min(sketchSize1, sketchSize2); if(_sketchSize_1 != _sketchSize_2){ cout << "WARNING: both spectrums have different sizes (" << _sketchSize_1 << " and " << _sketchSize_2 << "), will use " << min(_sketchSize_1, _sketchSize_2) << " k-mers" << endl; } if(_n_i == 0){ _n_i = _nbDataset1; } if(_n_j == 0){ _n_j = _nbDataset2; } //_nbdatasetsToProcess = min(_nbdatasetsToProcess, ) //cout << _nbDataset1 << " " << _nbDataset2 << endl; //cout << _sketchSize << endl; //cout << _seed << endl; } /* void createDirs(){ //if(!System::file().doesExist(_outputDir)){ //int ok = System::file().mkdir(_outputDir, -1); //if(ok != 0){ // std::cerr << "Error: can't create output directory (" << _outputDir << ")" << std::endl; // exit(1); //} //} if(!System::file().doesExist(_outputDirTemp)){ int ok = System::file().mkdir(_outputDirTemp, -1); if(ok != 0){ std::cerr << "Error: can't create output temp directory (" << _outputDirTemp << ")" << std::endl; exit(1); } } _outputDirTemp = System::file().getRealPath(_outputDirTemp) + "/"; //_outputDirTemp += "/simka_output_temp/"; //System::file().mkdir(_outputDirTemp, -1); //_args->setStr(STR_URI_OUTPUT_TMP, _outputDirTemp); //System::file().mkdir(_outputDirTemp + "/input/", -1); }*/ void loadSketches(){ ifstream sketchFile_1; sketchFile_1.open(_inputFilename1.c_str(), ios::binary); ifstream sketchFile_2; sketchFile_2.open(_inputFilename2.c_str(), ios::binary); _kmercountSketches_i.resize(_n_i); _kmercountSketches_j.resize(_n_j); u_int32_t sketchSize = min(_sketchSize_1, _sketchSize_2); size_t index = 0; for(size_t i=_start_i; i<_start_i+_n_i; i++){ u_int64_t pos = KMER_SPECTRUM_HEADER_SIZE + 
(i*_sketchSize_1*sizeof(KmerAndCountType)); sketchFile_1.seekg(pos); _kmercountSketches_i[index].resize(sketchSize); //for(size_t k=0; k<_sketchSize; k++){ sketchFile_1.read((char*)&(_kmercountSketches_i[index][0]), sizeof(KmerAndCountType)*sketchSize); //} index += 1; } index = 0; for(size_t j=_start_j; j<_start_j+_n_j; j++){ u_int64_t pos = KMER_SPECTRUM_HEADER_SIZE + (j*_sketchSize_2*sizeof(KmerAndCountType)); sketchFile_2.seekg(pos); _kmercountSketches_j[index].resize(sketchSize); sketchFile_2.read((char*)&(_kmercountSketches_j[index][0]), sizeof(KmerAndCountType)*sketchSize); //for(size_t k=0; k<_sketchSize; k++){ //sketchFile_2.read(&_kmercountSketches_j[index][k], sizeof(KmerAndCountType)); //} index += 1; } sketchFile_1.close(); sketchFile_2.close(); for(size_t i=0; i<_kmercountSketches_i.size(); i++){ u_int64_t start = 0; for(size_t j=0; j<_sketchSize_1; j++){ if(_kmercountSketches_i[i][j]._kmer == 0){ start += 1; } } _kmercountSketches_i[i].erase(_kmercountSketches_i[i].begin(), _kmercountSketches_i[i].begin()+start); } for(size_t i=0; i<_kmercountSketches_j.size(); i++){ u_int64_t start = 0; for(size_t j=0; j<_kmercountSketches_j[i].size(); j++){ if(_kmercountSketches_j[i][j]._kmer == 0){ start += 1; } } _kmercountSketches_j[i].erase(_kmercountSketches_j[i].begin(), _kmercountSketches_j[i].begin()+start); } } void distance(){ if(System::file().doesExist(_outputDir + "/mat_presenceAbsence_jaccard.bin")){ _distanceMatrixJaccard.open((_outputDir + "/mat_presenceAbsence_jaccard.bin").c_str(), ios::binary | ios::in); _distanceMatrixBrayCurtis.open((_outputDir + "/mat_abundance_braycurtis.bin").c_str(), ios::binary | ios::in); } else{ _distanceMatrixJaccard.open((_outputDir + "/mat_presenceAbsence_jaccard.bin").c_str(), ios::binary); _distanceMatrixBrayCurtis.open((_outputDir + "/mat_abundance_braycurtis.bin").c_str(), ios::binary); } bool isSymmetrical = false; if(_inputFilename1 == _inputFilename2 && _start_i == _start_j){ computeDistanceSymetrical(); 
isSymmetrical = true; } else{ computeDistanceRectangle(); } for(size_t i=0; i<_threads.size(); i++){ _threads[i]->join(); delete _threads[i]; //cout << i << endl; } _progress->finish(); //Fill diagonal with 0 if(isSymmetrical){ for(size_t i=_start_i; i<_start_i+_n_i; i++){ size_t j=i; u_int64_t pos = i*_nbDataset1*sizeof(DistanceValueType) + (j*sizeof(DistanceValueType)); _distanceMatrixJaccard.seekp(pos); _distanceMatrixBrayCurtis.seekp(pos); DistanceValueType nullDist = 0; _distanceMatrixJaccard.write((const char*)&nullDist, sizeof(nullDist)); _distanceMatrixBrayCurtis.write((const char*)&nullDist, sizeof(nullDist)); } } _distanceMatrixJaccard.close(); _distanceMatrixBrayCurtis.close(); //string command = "cp " + string(_inputFilename1+".ids") + " " + _outputDir + "/matrix_infos.ids "; //cout << command << endl; //system(command.c_str()); } void computeDistanceSymetrical(){ //cout << "compute symetrical distances" << endl; u_int64_t nbDistancesToCompute = (_n_i*(_n_i-1)) / 2; //u_int64_t nbDistancesToCompute = _nbDataset1*_nbDataset1; //(_nbDataset1*(_nbDataset1-1)) / 2; u_int64_t nbDistancePerThreads = nbDistancesToCompute / _nbCores; u_int64_t nbDistancesRemaining = nbDistancesToCompute-(nbDistancePerThreads*_nbCores); //vector startDistanceI; //vector startDistanceJ; //size_t si=0; //size_t sj=0; //cout << "NB CORES: " << _nbCores << endl; //cout << "NB DISTANCES: " << nbDistancesToCompute << endl; //cout << "NB DISTANCES PER CORE: " << nbDistancePerThreads << endl; _progress = this->createIteratorListener (nbDistancesToCompute, "Computing distances"); _progress->init (); _progress_distanceStep = max((u_int64_t)1, (u_int64_t) (nbDistancePerThreads / 100)); u_int64_t nbDistances = 0; size_t nbRunnedThreads = 0; size_t i=_start_i; size_t j=i+1; size_t maxDatasets = _start_i+_n_i;//min((u_int64_t)_start_i+_nbdatasetsToProcess, (u_int64_t)_nbDataset1); //_computeDistanceManagers.push_back(); thread* t = new 
thread(&SimkaMinDistanceAlgorithm::computeDistances_unsynch, this, i, j, nbDistancePerThreads, nbRunnedThreads); _threads.push_back(t); //computeDistances_unsynch(i, j, nbDistancePerThreads, true); bool done = false; nbRunnedThreads += 1; for(; i= nbDistancePerThreads){ //cout << i << " " << j << endl; //cout << "lol: " << nbRunnedThreads << " " << nbDistancesRemaining << endl; if(nbRunnedThreads == _nbCores-1){ //Last threads compute remaining distances //cout << " LOL " << endl;); thread* t = new thread(&SimkaMinDistanceAlgorithm::computeDistances_unsynch, this, i, j, nbDistancePerThreads+nbDistancesRemaining, nbRunnedThreads); _threads.push_back(t); //computeDistances_unsynch(i, j, nbDistancePerThreads+nbDistancesRemaining, true); done = true; //nbDistances -= nbDistancesRemaining; } else{ thread* t = new thread(&SimkaMinDistanceAlgorithm::computeDistances_unsynch, this, i, j, nbDistancePerThreads, nbRunnedThreads); _threads.push_back(t); //computeDistances_unsynch(i, j, nbDistancePerThreads, true); } nbRunnedThreads += 1; nbDistances = 0; } nbDistances += 1; } } /* uint64_t iFloor = nbDistancePerThreads / _nbDataset1; uint64_t iMod = nbDistancePerThreads % _nbDataset1; for ( uint64_t i = 0, j = 0; i < _nbDataset1; i += iFloor, j += iMod ) { if ( j >= _nbDataset1 ) { if ( i == _nbDataset1 - 1 ) { break; } i++; j -= _nbDataset1; } cout << i << " " << j << endl; //thread* t = new thread(&SimkaMinDistanceAlgorithm::computeDistances_unsynch, this, i, j); //_threads.push_back(t); } */ /* while(true){ startDistanceI.push_back(si); startDistanceI.push_back(sj); u_int64_t nbDistances = 0; while(nbDistances < nbDistancesToCompute){ //for(size_t i=0; i<) } } for(size_t i=1; i<_nbDataset1; i++){ for(size_t j=(i+1); j<_nbDataset1; j++){ } }*/ } void computeDistanceRectangle(){ //cout << "compute rectangle distances" << endl; u_int64_t nbDistancesToCompute = _n_i*_n_j; //u_int64_t nbDistancesToCompute = _nbDataset1*_nbDataset1; //(_nbDataset1*(_nbDataset1-1)) / 2; u_int64_t 
nbDistancePerThreads = nbDistancesToCompute / _nbCores; u_int64_t nbDistancesRemaining = nbDistancesToCompute-(nbDistancePerThreads*_nbCores); //vector startDistanceI; //vector startDistanceJ; //size_t si=0; //size_t sj=0; //cout << "NB CORES: " << _nbCores << endl; //cout << "NB DISTANCES: " << nbDistancesToCompute << endl; //cout << "NB DISTANCES PER CORE: " << nbDistancePerThreads << endl; //cout << "NB DISTANCES REMAINING: " << nbDistancesRemaining << endl; _progress = this->createIteratorListener (nbDistancesToCompute, "Computing distances"); _progress->init (); u_int64_t nbDistances = 0; size_t nbRunnedThreads = 0; size_t i=_start_i; size_t j=_start_j; thread* t = new thread(&SimkaMinDistanceAlgorithm::computeDistances_rectanglular_unsynch, this, i, j, nbDistancePerThreads, nbRunnedThreads); _threads.push_back(t); bool done = false; nbRunnedThreads += 1; //size_t maxDatasetsI = min((u_int64_t)_start_i+_nbdatasetsToProcess, (u_int64_t)_nbDataset1); for(i=_start_i; i<_start_i+_n_i; i++){ if(done) break; //size_t maxDatasetsJ = min((u_int64_t)_start_j+_nbdatasetsToProcess, (u_int64_t)_nbDataset2); for(j=_start_j; j<_start_j+_n_j; j++){ if(done) break; if(nbDistances >= nbDistancePerThreads){ //cout << i << " " << j << endl; //cout << "lol: " << nbRunnedThreads << " " << nbDistancesRemaining << endl; if(nbRunnedThreads == _nbCores-1){ //Last threads compute remaining distances //cout << " LOL " << endl;); //cout << i << " " << j << endl; thread* t = new thread(&SimkaMinDistanceAlgorithm::computeDistances_rectanglular_unsynch, this, i, j, nbDistancePerThreads+nbDistancesRemaining, nbRunnedThreads); _threads.push_back(t); //computeDistances_unsynch(i, j, nbDistancePerThreads+nbDistancesRemaining, true); done = true; //nbDistances -= nbDistancesRemaining; } else{ //cout << i << " " << j << endl; thread* t = new thread(&SimkaMinDistanceAlgorithm::computeDistances_rectanglular_unsynch, this, i, j, nbDistancePerThreads, nbRunnedThreads); _threads.push_back(t); 
//computeDistances_unsynch(i, j, nbDistancePerThreads, true); } nbRunnedThreads += 1; nbDistances = 0; } //cout << nbDistances << " " << nbDistancePerThreads << endl; nbDistances += 1; } } } void computeDistances_unsynch(size_t si, size_t sj, size_t nbDistancesToCompute, size_t id){ ComputeDistanceManager computeDistanceManager(_inputFilename1, _inputFilename2, _distanceMatrixJaccard, _distanceMatrixBrayCurtis, true, _nbDataset1, _nbDataset2, _mutex, _start_i, _start_j, _n_i, _n_j, _kmercountSketches_i, _kmercountSketches_j); //cout << "-------------------" << endl; u_int64_t progress_nbComputedistances = 0; u_int64_t nbComputedDistances = 0; //size_t maxDatasetsI = min((u_int64_t)si+_nbdatasetsToProcess, (u_int64_t)_nbDataset1); for(size_t j=sj; j<_start_i+_n_i; j++){ //cout << j << endl; //cout << si << " " << j << endl; computeDistanceManager.computeDistance_unsynch(si, j); nbComputedDistances += 1; progress_nbComputedistances += 1; if(nbComputedDistances >= nbDistancesToCompute) break; } si += 1; if(nbComputedDistances < nbDistancesToCompute){ //cout << "lala2" << endl; for(size_t i=si; i<_start_i+_n_i; i++){ for(size_t j=i+1; j<_start_i+_n_i; j++){ //cout << i << " " << j << endl; computeDistanceManager.computeDistance_unsynch(i, j); nbComputedDistances += 1; progress_nbComputedistances += 1; //_mutex.lock(); //cout << progress_nbComputedistances << " " << _progress_distanceStep << endl; //_mutex.unlock(); if(progress_nbComputedistances > _progress_distanceStep){ _mutex.lock(); _progress->inc(progress_nbComputedistances); _mutex.unlock(); progress_nbComputedistances = 0; } if(nbComputedDistances >= nbDistancesToCompute) break; } if(nbComputedDistances >= nbDistancesToCompute) break; } } _mutex.lock(); _progress->inc(progress_nbComputedistances); _mutex.unlock(); } void computeDistances_rectanglular_unsynch(size_t si, size_t sj, size_t nbDistancesToCompute, size_t id){ //isSymetrical set to false ComputeDistanceManager computeDistanceManager(_inputFilename1, 
_inputFilename2, _distanceMatrixJaccard, _distanceMatrixBrayCurtis, false, _nbDataset1, _nbDataset2, _mutex, _start_i, _start_j, _n_i, _n_j, _kmercountSketches_i, _kmercountSketches_j); //_mutex.lock(); //cout << "------------------- " << si << " " << sj << endl; //_mutex.unlock(); u_int64_t nbComputedDistances = 0; u_int64_t progress_nbComputedistances = 0; //size_t maxDatasetsI = min((u_int64_t)si+_nbdatasetsToProcess, (u_int64_t)_nbDataset1); //size_t maxDatasetsJ = min((u_int64_t)sj+_nbdatasetsToProcess, (u_int64_t)_nbDataset2); for(size_t j=sj; j<_start_j+_n_j; j++){ //cout << si << " " << j << endl; computeDistanceManager.computeDistance_unsynch(si, j); nbComputedDistances += 1; progress_nbComputedistances += 1; if(nbComputedDistances >= nbDistancesToCompute) break; } si += 1; if(nbComputedDistances < nbDistancesToCompute){ for(size_t i=si; i<_start_i+_n_i; i++){ for(size_t j=_start_j; j<_start_j+_n_j; j++){ // (0 instead of i+1) //cout << i << " " << j << endl; computeDistanceManager.computeDistance_unsynch(i, j); nbComputedDistances += 1; progress_nbComputedistances += 1; if(progress_nbComputedistances > _progress_distanceStep){ _mutex.lock(); _progress->inc(progress_nbComputedistances); _mutex.unlock(); progress_nbComputedistances = 0; } if(nbComputedDistances >= nbDistancesToCompute) break; } if(nbComputedDistances >= nbDistancesToCompute) break; } } _mutex.lock(); //cout << nbComputedDistances << " " << nbDistancesToCompute << endl; _progress->inc(progress_nbComputedistances); _mutex.unlock(); } }; class SimkaMinDistance : public Tool{ public: SimkaMinDistance(): Tool ("SimkaMin-Distance"){ IOptionsParser* parser = getParser();//new OptionsParser ("Simka2 - Compute Kmer Spectrum"); parser->push_front (new OptionOneParam (STR_URI_OUTPUT, "output dir for distance matrices", false, "./simkaMin_results")); parser->push_front (new OptionOneParam (STR_SIMKA_URI_INPUT_2, "filename to a sketch file to compare with -in1", true)); parser->push_front (new 
OptionOneParam (STR_SIMKA_URI_INPUT_1, "filename to a sketch file to compare with -in2", true)); parser->push_back (new OptionOneParam ("-start-i", "start i (row)", false, "0")); parser->push_back (new OptionOneParam ("-start-j", "start j (column)", false, "0")); parser->push_back (new OptionOneParam ("-n-i", "Nb datasets to process (row)", false, "0")); parser->push_back (new OptionOneParam ("-n-j", "Nb datasets to process (column)", false, "0")); } void execute () { IProperties* args = getInput(); u_int32_t seed1; u_int32_t seed2; u_int32_t dummy; u_int8_t kmerSize1; u_int8_t kmerSize2; string inputFilename1 = args->getStr(STR_SIMKA_URI_INPUT_1); string inputFilename2 = args->getStr(STR_SIMKA_URI_INPUT_2); SimkaMinCommons::getKmerInfos(inputFilename1, kmerSize1, dummy, seed1, dummy); SimkaMinCommons::getKmerInfos(inputFilename2, kmerSize2, dummy, seed2, dummy); //size_t kmerSize = getInput()->getInt (STR_KMER_SIZE); if(kmerSize1 != kmerSize2){ cerr << "ERROR: can't compare both sketches because of different kmer sizes (" << kmerSize1 << " and " << kmerSize2 << ")" << endl; exit(1); } if(seed1 != seed2){ cerr << "ERROR: can't compare both sketches because of different seeds (" << seed1 << " and " << seed2 << ")" << endl; exit(1); } //cout << seed1 << " " << seed2 << endl; SimkaMinDistanceAlgorithm* algo = new SimkaMinDistanceAlgorithm(args); algo->execute(); delete algo; } }; #endif /* SIMKA1_4_SRC_SIMKAMIN_SIMKAMINDISTANCE_HPP_ */ simka-1.5.1/src/simkaMin/SimkaMinDistanceMatrixExporter.hpp000066400000000000000000000330031353413740300237750ustar00rootroot00000000000000/***************************************************************************** * SimkaMin: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2019 INRIA * Authors: G.Benoit * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General 
Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . *****************************************************************************/ #ifndef SIMKA1_4_SRC_SIMKAMIN_SIMKAMINDISTANCEMATRIXEXPORTER_HPP_ #define SIMKA1_4_SRC_SIMKAMIN_SIMKAMINDISTANCEMATRIXEXPORTER_HPP_ #include "SimkaMinCommons.hpp" class SimkaDistanceMatrixBinary { public: static void loadRow(size_t rowIndex, ifstream& matrixBinaryFile, vector& resultRow){ matrixBinaryFile.seekg(rowIndex*resultRow.size()*sizeof(float), ios_base::beg); matrixBinaryFile.read((char*)resultRow.data(), sizeof(float)*resultRow.size()); } static void mergeMatrices(const string& existingMatrixFilename, const string& newMatrixFilename_existingVsNew, const string& newMatrixFilename_newVsNew, u_int32_t nbDatasets_existing, u_int32_t nbDatasets_new){ ifstream existingMatrixFile; existingMatrixFile.open(existingMatrixFilename.c_str(), ios::binary); vector existingRowData(nbDatasets_existing, 0); ifstream matrixFile_existingVsNew; matrixFile_existingVsNew.open(newMatrixFilename_existingVsNew.c_str(), ios::binary); ifstream matrixFile_newVsNew; matrixFile_newVsNew.open(newMatrixFilename_newVsNew.c_str(), ios::binary); vector newRowData(nbDatasets_new, 0); string tempOutputFilename = existingMatrixFilename + ".temp"; ofstream tempOutputFile; tempOutputFile.open(tempOutputFilename.c_str(), ios::binary); //Write existing distance + matrixFile_existingVsNew (right part) for(size_t i=0; i >& distanceMatrix_rectangular, const vector >& distanceMatrix_squaredHalf){ //string distanceMatrixDir = 
distanceMatricesDir + "/" + distanceName; //if(System::file().doesExist(distanceMatrixDir)){ //} //else{ // System::file().mkdir(distanceMatrixDir, -1); //} //string distanceMatrixDir = outputDirTemp + "/distance_matrix"; string filename = distanceMatricesDir + "/" + distanceName + ".bin"; ofstream outputFile(filename.c_str(), ios::binary); u_int64_t nbOldBanks = 0; if(distanceMatrix_rectangular.size() > 0){ nbOldBanks = distanceMatrix_rectangular[0].size(); } u_int64_t nbNewBanks = distanceMatrix_squaredHalf.size() + 1; u_int64_t nbBanks = nbOldBanks + nbNewBanks; if(nbOldBanks > 0){ if(nbNewBanks > 1){ for(size_t i=0; i >& distanceMatrix_rectangular){ //cout << endl; //cout << distanceMatrix_rectangular.size() << " " << distanceMatrix_rectangular[i].size() << endl; //cout << endl; //for(size_t j=0; j >& distanceMatrix_squaredHalf){ u_int64_t nbNewBanks = distanceMatrix_squaredHalf.size() + 1; //for(size_t i=0; i _ids1; vector _ids2; //vector _wantedIds; //vector _wantedIdsIndex_1; //vector _wantedIdsIndex_2; //unordered_map _idToIndex_1; //unordered_map _idToIndex_2; size_t _inputMatrixSize_1; size_t _inputMatrixSize_2; //size_t _outputMatrixSize; SimkaMinDistanceMatrixExporterAlgorithm(IProperties* options): Algorithm("simkaMinDistanceExporterAlgorithm", -1, options) { } void execute(){ _inputFilenameIds = ""; parseArgs(); createWantedIds(); //createIdsIndex(); writeMatrices(); } void parseArgs(){ _options = getInput(); _inputDir = _options->getStr(STR_URI_INPUT); _inputSketchFilename1 = _options->getStr(STR_SIMKA_URI_INPUT_1); _inputSketchFilename2 = _options->getStr(STR_SIMKA_URI_INPUT_2); _outputDir = _options->getStr(STR_URI_OUTPUT); if(getInput()->get(STR_SIMKA_INPUT_IDS)){ _inputFilenameIds = getInput()->getStr(STR_SIMKA_INPUT_IDS); } if(!System::file().doesExist(_outputDir)){ int ok = System::file().mkdir(_outputDir, -1); if(ok != 0){ std::cerr << "Error: can't create output directory (" << _outputDir << ")" << std::endl; exit(1); } } } void 
createWantedIds(){ SimkaMinCommons::readIds(_inputSketchFilename1, _ids1); SimkaMinCommons::readIds(_inputSketchFilename2, _ids2); /* if(_inputFilenameIds.empty()){ _wantedIds = vector(_ids1); _wantedIds.insert(_wantedIds.end(), _ids2.begin(), _ids2.end()); } else{ string line; ifstream inputFile(_inputFilenameIds.c_str()); while(getline(inputFile, line)){ line.erase(std::remove(line.begin(),line.end(),' '),line.end()); if(line == "") continue; _wantedIds.push_back(line); } } */ _inputMatrixSize_1 = _ids1.size(); _inputMatrixSize_2 = _ids2.size(); //_outputMatrixSize = _wantedIds.size(); cout << "Matrix size: " << _inputMatrixSize_1 << " x " << _inputMatrixSize_2 << endl; } /* void createIdsIndex(){ for(size_t i=0; i<_ids1.size(); i++){ _idToIndex_1[_ids1[i]] = i; } for(size_t i=0; i<_ids2.size(); i++){ _idToIndex_2[_ids2[i]] = i; } for(size_t i=0; i<_wantedIds.size(); i++){ if(_idToIndex_1.find(_wantedIds[i]) == _idToIndex_1.end()){ cout << "ID not found in distance matrix: " << _wantedIds[i] << endl; } else{ _wantedIdsIndex.push_back(_idToIndex[_wantedIds[i]]); } } //_wantedIdsIndex.resize(_outputMatrixSize); //for(size_t i=0; i<_outputMatrixSize; i++){ //} _outputMatrixSize = _wantedIdsIndex.size(); cout << "output matrix size: " << _outputMatrixSize << endl; } */ void writeMatrices(){ vector matrixFilenames = System::file().listdir(_inputDir); for(size_t i=0; i rowData(_ids2.size(), 0); ifstream binaryMatrixFile(binaryMatrixFilename.c_str(), ios::binary); string filename = _outputDir + "/" + distanceName + ".csv"; gzFile out = gzopen((filename + ".gz").c_str(),"wb"); string str = ""; for(size_t i=0; i<_ids2.size(); i++){ str += ";" + _ids2[i]; //_ids[_wantedIdsIndex[i]]; } str += '\n'; gzwrite(out, str.c_str(), str.size()); for(size_t i=0; i<_ids1.size(); i++){ str = ""; str += _ids1[i] + ";"; //[_wantedIdsIndex[i]] + ";"; //size_t rowIndex = _wantedIdsIndex[i]; SimkaDistanceMatrixBinary::loadRow(i, binaryMatrixFile, rowData); for(size_t j=0; j<_ids2.size(); 
j++){ //str += Stringify::format("%f", rowData[_wantedIdsIndex[j]]) + ";"; str += Stringify::format("%f", rowData[j]) + ";"; } str.erase(str.size()-1); str += '\n'; gzwrite(out, str.c_str(), str.size()); } gzclose(out); binaryMatrixFile.close(); } }; class SimkaMinDistanceMatrixExporter : public Tool{ public: SimkaMinDistanceMatrixExporter(): Tool ("SimkaMin-DistanceMatrixExporter"){ IOptionsParser* parser = getParser();//new OptionsParser ("Simka2 - Compute Kmer Spectrum"); parser->push_front (new OptionOneParam (STR_URI_OUTPUT, "output dir for distance matrices", false, "./simkaMin_results")); //parser->push_front (new OptionOneParam (STR_SIMKA_INPUT_IDS, "filename of ids in the result matrix (one id per line). Do not used this option to used all ids.", false)); parser->push_front (new OptionOneParam (STR_SIMKA_URI_INPUT_2, "second used sketch file (-in2 argument of ./simkaMin distance)", true)); parser->push_front (new OptionOneParam (STR_SIMKA_URI_INPUT_1, "first used sketch file (-in1 argument of ./simkaMin distance)", true)); parser->push_front (new OptionOneParam (STR_URI_INPUT, "input dir containing distance matrices in binary format (-out argument of ./simkaMin distance)", true)); } void execute (){ IProperties* args = getInput(); SimkaMinDistanceMatrixExporterAlgorithm* algo = new SimkaMinDistanceMatrixExporterAlgorithm(args); algo->execute(); delete algo; } }; #endif /* SIMKA1_4_SRC_SIMKAMIN_SIMKAMINDISTANCEMATRIXEXPORTER_HPP_ */ simka-1.5.1/src/simkaMin/SimkaMinDistanceMatrixMerger.hpp000066400000000000000000000130031353413740300234040ustar00rootroot00000000000000/***************************************************************************** * SimkaMin: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2019 INRIA * Authors: G.Benoit * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero 
General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . *****************************************************************************/ #ifndef SIMKA1_4_SRC_SIMKAMIN_SIMKAMINDISTANCEMATRIXMERGER_HPP_ #define SIMKA1_4_SRC_SIMKAMIN_SIMKAMINDISTANCEMATRIXMERGER_HPP_ #include "SimkaMinCommons.hpp" #include "SimkaMinDistanceMatrixExporter.hpp" class SimkaMinDistanceMatrixMergerAlgorithm : public Algorithm{ public: IProperties* _options; string _inputDir; //string _outputDir; string _inputSketchFilename_existingDatasets; string _inputSketchFilename_newDatasets; //vector _ids1; //vector _ids2; //vector _wantedIds; //vector _wantedIdsIndex_1; //vector _wantedIdsIndex_2; //unordered_map _idToIndex_1; //unordered_map _idToIndex_2; //size_t _inputMatrixSize_1; //size_t _inputMatrixSize_2; //size_t _outputMatrixSize; SimkaMinDistanceMatrixMergerAlgorithm(IProperties* options): Algorithm("simkaMinDistanceMatrixMergerAlgorithm", -1, options) { } void execute(){ parseArgs(); mergeMatrices(); } void parseArgs(){ _options = getInput(); _inputDir = _options->getStr(STR_URI_INPUT) + "/"; _inputSketchFilename_existingDatasets = _options->getStr(STR_SIMKA_URI_INPUT_1); _inputSketchFilename_newDatasets = _options->getStr(STR_SIMKA_URI_INPUT_2); //_outputDir = _options->getStr(STR_URI_OUTPUT); //if(getInput()->get(STR_SIMKA_INPUT_IDS)){ // _inputFilenameIds = getInput()->getStr(STR_SIMKA_INPUT_IDS); //} //if(!System::file().doesExist(_outputDir)){ // int ok = System::file().mkdir(_outputDir, -1); // if(ok != 0){ // std::cerr << "Error: 
can't create output directory (" << _outputDir << ")" << std::endl; // exit(1); // } //} } void mergeMatrices(){ u_int32_t dummy; u_int8_t dummy_k; u_int32_t nbDatasets_existing, nbDatasets_new; SimkaMinCommons::getKmerInfos(_inputSketchFilename_existingDatasets, dummy_k, dummy, dummy, nbDatasets_existing); SimkaMinCommons::getKmerInfos(_inputSketchFilename_newDatasets, dummy_k, dummy, dummy, nbDatasets_new); vector matrixFilenames = System::file().listdir(_inputDir); for(size_t i=0; ipush_front (new OptionOneParam (STR_URI_OUTPUT, "output dir for distance matrices", false, "./simkaMin_results")); //parser->push_front (new OptionOneParam (STR_SIMKA_INPUT_IDS, "filename of ids in the result matrix (one id per line). Do not used this option to used all ids.", false)); parser->push_front (new OptionOneParam (STR_SIMKA_URI_INPUT_2, "sketch file of new datasets", true)); parser->push_front (new OptionOneParam (STR_SIMKA_URI_INPUT_1, "sketch file of existing datasets", true)); parser->push_front (new OptionOneParam (STR_URI_INPUT, "input dir containing existing simka results", true)); } void execute (){ IProperties* args = getInput(); SimkaMinDistanceMatrixMergerAlgorithm* algo = new SimkaMinDistanceMatrixMergerAlgorithm(args); algo->execute(); delete algo; } }; #endif /* SIMKA1_4_SRC_SIMKAMIN_SIMKAMINDISTANCEMATRIXMERGER_HPP_ */ simka-1.5.1/src/simkaMin/SimkaMinInfos.hpp000066400000000000000000000065751353413740300204210ustar00rootroot00000000000000/***************************************************************************** * SimkaMin: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2019 INRIA * Authors: G.Benoit * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later 
version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . *****************************************************************************/ #ifndef SIMKA1_4_SRC_SIMKAMIN_SIMKAMININFOS_HPP_ #define SIMKA1_4_SRC_SIMKAMIN_SIMKAMININFOS_HPP_ #include "SimkaMinCommons.hpp" class SimkaMinInfosAlgorithm : public Algorithm { public: IProperties* _options; string _inputFilename; u_int32_t _nbDatasets; u_int32_t _sketchSize; SimkaMinInfosAlgorithm(IProperties* options): Algorithm("simkaMinInfosAlgorithm", -1, options) { } void execute(){ parseArgs(); printInfos(); } void parseArgs(){ _options = getInput(); _inputFilename = _options->getStr(STR_URI_INPUT); if(!System::file().doesExist(_inputFilename)){ std::cerr << "Error: input does not exist (" << _inputFilename << ")" << std::endl; exit(1); } } void printInfos(){ //vector datasetIds; //SimkaMinCommons::readIds(_inputFilename, datasetIds); u_int32_t seed; u_int8_t kmerSize; SimkaMinCommons::getKmerInfos(_inputFilename, kmerSize, _sketchSize, seed, _nbDatasets); cout << "Sketch info: " << _inputFilename << endl; cout << endl; cout << "k-mer size : " << (u_int32_t) kmerSize << endl; cout << "Sketch size : " << _sketchSize << endl; cout << "Seed : " << seed << endl; cout << endl; cout << "Nb Datasets: " << _nbDatasets << endl; printIds(); cout << endl; } void printIds(){ ifstream file(_inputFilename.c_str(), ios::binary); file.seekg(SimkaMinCommons::getFilePosition_sketchIds(_nbDatasets, _sketchSize)); //u_int32_t nbDatasets; //file.read((char*)(&nbDatasets), sizeof(nbDatasets)); string datasetId; for(size_t i=0; i<_nbDatasets; i++){ SimkaMinCommons::readString(datasetId, file); cout << datasetId << endl; 
//datasetIds.push_back(datasetId); } file.close(); } }; class SimkaMinInfos : public Tool{ public: SimkaMinInfos(): Tool ("SimkaMin-Infos"){ IOptionsParser* parser = getParser();//new OptionsParser ("Simka2 - Compute Kmer Spectrum"); parser->push_front (new OptionOneParam (STR_URI_INPUT, "filename to a sketch file", true)); parser->getParser (STR_NB_CORES)->setVisible (false); parser->getParser (STR_VERBOSE)->setVisible (false); } void execute () { IProperties* args = getInput(); SimkaMinInfosAlgorithm* algo = new SimkaMinInfosAlgorithm(args); algo->execute(); delete algo; } }; #endif /* SIMKA1_4_SRC_SIMKAMIN_SIMKAMININFOS_HPP_ */ simka-1.5.1/tests/000077500000000000000000000000001353413740300137675ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/000077500000000000000000000000001353413740300155375ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/__results__/000077500000000000000000000000001353413740300200345ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/__results__/k21__0-1000_n1/000077500000000000000000000000001353413740300217635ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/__results__/k21__0-1000_n1/sketch/000077500000000000000000000000001353413740300232445ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/__results__/k21__0-1000_n1/sketch/sketch.bin000066400000000000000000002250151353413740300252240ustar00rootroot00000000000000dg~D =`1% *U}L1pH;ۯ/RML [%=h%b)@>fu{sS*2=; {c<ǩ9)5ߑ,i}l\R 㚥E,^`)^1<AmASjzcJ(wx&{X zI ~Vczv7yyDn\g'-iu-g1`.%'*d3FuDSg O{/+π9{_J5RId~`W Z. ÿ.D60¾CpYnhr !O*uh|IӺ>J~ ~X,x5ꮴݚ?@W0cuT9aB(٫ܣdאNHDzb2o*Sw@CjRJ;I`jx_Kw}d;,-鎘GӴѧ/xkOQ@lB!wʒ&.0E/fP-0=uD}dmIOvLKWM N˜bY#aa|a%߷-e' _j7닣LGmlᴕ7Vb|'6޳f="Ϋ-y+DH}%` Hz&DGKY"Sq?_HPcӞosl+TuTTߊEӧ1ʊر}jy9e`H5ÓW˵q],b aQ]}k9q QPyP~UM'd"gNd(hW,}lNm+MPb P.RdJr#]a$|Ppq8ϛwR&i?5UrɄDiB v[?3ĺȒy|K嶫 C@+-**cMNkR L]&bSc;Bv rh:i Y즎!o'9%tsMUF^qI5,9Hڷ 5-?@70öz ̾ ɛ t1/ &Dsc8 |J\ BU/a ~OWe 먣j n Qe5 t w}Rv B >/gu 4` Wl` dG KP.oɵ 9Π }ft5 ktS -~$ m2E1 }iץ5 |6 h˕8 [ߗ8 &R 4|2g >J} i~ !Us B. 
8+H ۲& ;rYw T"8 y| ?? 'ta n1@^ _`Z< 3Z <8] ]#:j XRr 2Wlp #ȉp > k f il ̇[ xέa C: /k Ĥ3 zfZ-# tW č y3 НK_ s6p` bd A;*g ,n #,ݕu .8+i׍ ZHb &z S"c ^,  lݢC  18t 5%D # qJܩ 65PN ͱNeJ6 P@Q; eNG &iJ NdMT BDAB\ Rfa Lk t=Ъ ~!# r@yҶ Pw  \6R  C R a J>  o5 >' 0h6uE7\BW0s"M(M$Ād2 Ѵ9CI_mNhR!64Z` - ]jQO_{LPazuԀ͐G'?ou 1,@L ^Vk>`3**I$><'~<ˆ -iƨ.d:n;c"́DaMr!< ".Տ P\f~|(`.+1j4?u|\C~:Xi@[lrbf!Q9vH6bez fHPK>' ?' ܚaF d,AZᖹ  [ g]|g$j\~Y>yfãpxvn2܆;-Jkр!I/I@'fBqi_̆ns 5<5,tuCeyznEhw(υ(2t@=:\r;SsMW:dJO/?G }(]`E/¿|0'ْ%G@5JpOYG -U*2蜇U/rX90_loqQy>`I]nK_6bSz}g(MI¼ʤUb(3,D>v,}h)'v@̟XKޯ`L78XScrwhza,OIㅂ8[6S>Dh_7ZYxQz;0+2i7F,(ݬiXP:ɗl+qwГ,gF3JCuOUP?C\qmM@g~Pj&Cum .qK9rIb{ﮦ䪛BRA@H)\wDX  UzQ/V;NWk$}RW7IJػ?X B*/^2Y}w w9.؆\&lPro7QRɤn/zFҧb받5,rSg*1y.(Ty,\':zL5R&ZTfzpUWNB>~`ݽu؀;)ײ6X4,P4{}3~ߗʭB6;-:{"qR1މ*/Fa HP@ L?EU0$ kߨ&y)1HSbϤO$\EBqޫŪ?G#SZqmQbfg5v(gc/4h@G#kٖ]B'|e,^K*Vy"[~?c[^TZ"}Smy!3%90?B+ cd+ԦYdiFstt٦!܏PWp"R(tH:ofxA!wcg YO }y! bxz' S!p-  A x6 t--?@ cD G2a V_h 1jp x :^Kϊ G]9 J_ I F]  =/D [_!^mD:!!H "!M,! y 5!eݕC!J)>aR!?{̘j!Wr!D=!퐠!8!]!QL!B&OU!b! x! !I\!"tbKD9Z"RZ4RF6"~?_="aB"o.3ԹI"i(!d[" ;vc"d">{k"Bޥ~"o5"t"#-g"r"5.## #*#>$60#;#5>Y# p\#)-b#^o#흵#M>̦#PQE#)em#8 55#/fj#d#B7 $$G<͊8$1*_C$sk6J$~o$6]AU^u$TxAKz$a-ɓRz$B%W$Lp;$92$͓$.:$4cR$"V+%%u&Z2%g"=%8dZ%>~m%:Xw%922ۀ%eo%If%O%/1%%Ps%!W %[e춓%2uX%w%d 6%jkl% = Ħ&ы$&!%&pF *&PB.&YK=&\[&\:Cۯ_&d}~xSw&-4T&"_zkG&m!)'&#b&">oƥ&. F'x|'ar~' 4'R/'ŷfcDZ'#' .|'P'~OE' &wm'ҭVI'5O5'ק'a'~Cb,'b'@"^'T *`*ޡC#*tƛw]*|)q!`*"g*@l*Vqll*T33x*駢up*#Kyj܇*8Tو* 4*/D,* ,*CV*A͜ *14* \**d%K*(Ut*M5 *"ׁI +c 9+5U[R#+N\2>+qф%N+tЊk+pTl+J&{Tq+²8ӏ+wƈ~+C+bkĤ+ED$+"+{+@{+$~Ro+@64E+|[h,2Y ,,]Q ,*zB,Y,Q6d,ǥvq,4o ,F]e9,箚,rK,2,Iܹ_, ,B7,a',ɚyM,2F,/Q -[Na{,-s?+5-LCb-, n-H4k--V:ؐOp-Er-/-6!-q-<TA9--(-{ME-_/@-uU.T,.SԐ.:a xʱ .} M.4,p.9ը8.ot;;.lo pM.U.@B.3jr . 
`ɟ.3ؑi..!Ґn.M 7.ѯy.?w,RM.י?~.HEA<.Z.Bh.<:/Ki/ F/w1mr/p4/K8e6A/,M/2 GT/j/b/0/$t_+/pE=`f//Z$0o,嬞0iU:}"0,f+O0)1]0G1B11 "31vYI81 ~NUS1L[ݯb1fVWec1?:l1] s15%rT@u1G0z180i1xr1_281ƈ1:1۪Sax1Dr~ڟ1al1~W+1Bc+1:1 1|1 @2~^2x"v:d2P-f2q(4s2c]22Xoj2u|]22QVLE2eϋ2xT'&2ե 2Ć]i 3h:813o(gG&3x{f*3۰6$,3ùBW:3vL3%g Q3bi3U,(1u3NxFf3|03H&3BM3ߴU+3٣u3K;h4\{% 4ڪka 4X-j*4*G[+4Ұ?,4cww04?ѴJ4K48rt42O4+M#46b4O45'9ξ4"4zن4 Td4D4)N4$:4 5Aၤ5«򱷋5զ1(5>T/+5gu/5xXKD35>u/tu=5ɫfG5K5Jl;O5YDrkQ5oP5U5s{ h5 VMk5hPy5#Y5/5' bo5O+v67~6]VN6vbYWa6:?m6ͩxo}6 =6%$96m[GA6LRk76H6*A665_77yL+7027-z7'Ճ78HY7MC\7kp8W7P5"?7{m7M}<7=K&1r7&>735j7-s`7-,7z,)q8 8Ԁ8 q\8 dGb?g~D =`1% *U"dn&}L1f16pH;)FBۯ/RML [%=h%bD.0bvgf)@>fu{sS*2=; {c<ǩ9)5ߑ,ioW}l\R 㚥E,9^`DE+il7)^1<AmASΟqWjzc vJ(wx&{X zI ~Vczv7ywGD^yDn\/(g'J@g-iu-4V @xMg1`.%'**&ĥ06&;O2_z=42~o68]_758k<8d3FuD^3Kn[Sgc(v&%r O{40 /+π.bϋx=i05ܙ9{_J5RId+~`hr KW Z. p\0Gÿ.D60¾CpYn-̓$ZCS~shr *P !O*uh|IEn|/D?6/2){e<Ӻ>$*B^-CJ~ ~X,xA M5ꮴݚ?@-~IW0cuT9aB(9m٫ܣdא# HNHDz5ɣX: b2o*dZ\}>Sw@CjR.bI[J;I`ܮ|Uu  {jx_Kw} 7<ÉmW6d;,-鎘ƙGӴѧƓZr/xNr~췰"ZY֭&ѩRi$r9ltk:*OQ@liP#m!B!wʒ&.0E/fP-n mj.0=uD}dmIOvLKWM NDQ˜bY#aa|a%߷-e' _jS_{zm"q|.΁7-Ӄ?,k 닣LGZtiɑmlᴕ5JaŢ7Vb|e'6(O޳wbǒt,Nf="Ϋ-y+țDH{uA&͇zMB%}%` Hz&DGKY"SH#Vq?_HPcӞo95rsl+TuT"d{TTߊ.tEӧ1ʊر}jy9e`H5ÓW˵q],4$p bjR aQ]}k9q QPyPe4!~UM'd"gNd(`;ѿ*hW,{<'.a%l3}lNm+MPb P̩ꩤQ.RdJr#] :_a$|Ppq~8lIu^ٛx8ϛwR&i?5UrɄDiB v[?3ĺȒyF|K嶫 C3hcY#` @+-*-OJ*cMNkR|dtV L]Ls/`&bScd;Bv rh:ig#zL3_0߆v Y즎!o>'9%tsMUF^T꽘qI5,9HڷN=~ 5-?@߉QC(~"I 170öz e% ̾ Z tO, ɛ t1/ @u\5 &Dsc8 m9`M |J\ BU/a ~OWe 먣j n q?n @%p Qe5 t w}Rv }Tsw B 5%+ >/gu &Q^ 4` ౓w Wl` dG KP.oɵ u 9Π }ft5 Y(& t @u ktS -~$ om& l(  ^M0 m2E1 }iץ5 |6 h˕8 [ߗ8 fz|: )mE &R z_xX 4|2g >J} }~ nʤ   Q$Sԃ i~  Z UNߡ !Us B. 8+H ۲& ;rYw ;S3t T"8 y| ?? P x?  
pٍ 'ta n1@^ nцT _`Z< J3Jod> gS 3Z \-dZ <8] :zgáf ]#:j 9I ]m XRr ƬoN_À 2Wlp #ȉp > xVY: k f >%KYm il 鱷2 ̇[ xέa C: /k j@ Ĥ3 '4k  #  ļ@n GӾy zfZ-# tW č k˫#/ y3 "-`> yPX НK_ s6p` bd A;*g ,n #,ݕu .8+i׍ ZHb ,/Ö M &z S"c ^, S؄ 1\=P ˢ  lݢC  18t JFt 5%D # qJܩ  lT_ 65PN ( yK /y HY1 QgQ  ͱNeJ6  1 P@Q; C eNG &iJ NdMT BDAB\ Pd e.f Rfa Lk / >n t=Ъ ą ~!# r@yҶ I( Pw  zXPԙ  \6R  C R a J>  o5 >' 0h6urЃE7\BW0s"M(MOh׆#$gX'$Ād2nZB Ѵ9CI_mNhR!64Z6.\΀P\` - ]jQO_{LPaF,Axi4y:Qeo}x$0zuԀ͐G'LmYhvRk%?ou 1Ff 3q:9bW?,@~zJL ^Vk>`3*k*F;I$><O-'~<ˆ -iƨ.d:n;c"́DaMr!<yt:A9 ".Տ P\'Lľ4 澣f;?%~|(`.ނ2+1j4?u|\C~:XVgb_i@[lrbf۵Q(!Q9vKH6bez $_X(JI<Bpg#VI?ذY"fHPK>' ?'~@e(S)uOY:Ci]E ܚaFo0XJ d,AZᖹ  [ g]|g$j\~B)Y>yf8x`4Y(812fãpʌi{ |c*Bxvn2܆;*-Jkр!GQMI/I@'fBqi%Β: _̆ns 5<5,uF{6ՙ#&A?tuC&кݬRIUfa"/j+k]yeyzs~nEhw(υ(2t(r<jc2ʕ@=:\r;SsMW:dJO/?G FR}(]bJEZ`E/¿|0C-8 I 7F'ْ%G@5JpOYG -U*2蜇U/rX90_Dc ^bW|OoloqQy>`Iht51]nK_6bSI~3B)( 5wVz}g(ALMI¼ʤUb(3,D>q46n2v,}&!h)'&WKv@̟XKޯ`L78XS4ٗ_'rcrwaČyhza,OIㅂ8^^V[6S>D+;HWC4Nh_7ZY@"䜭x_[fQz;0f7{fd sbk+2i7\'sF,(ݬz#XBiXP:B (\ Ú>i{4\ɗl+qwГ,gF3JCuOUP?C\qmM@g~Pj&Cum .qK9rލ<Ib{S uﮦ䪛BRA@H)\wDX  'UzQ/V{CX;NWk$n@.}RW7IJػ?X B*/^6~Q w2Y}w w9.؆\bԬc].ۅ&lPro7QRvP0$"eCj^s]O!.̄ͅɤn/|XHrOzFҧb$?7 받5,rSg*1y.H;L4> 3Hp]@MgNGW(Ty,\':zL5hR&ZTfzpUW<< o/N5 HPor=W d|w~Tz)"+dUQ^wlcyT,2¥A,0L f 28$of:cHU{߿r?Y-2 feۖ${?G  j#ԃ@2g~6̧4FU36zn !dL~-aB|o4UWall;ˢuEi!hl[H5\ZM4I^ ?Ȓs yv4UuwdC@;R&"d 뽀%-29wOhohpHz$Ud5! B*'F q>eB>l>UDRRM;]b q&ו=q~`ݽutkw؀;axޜx )ײw1ʻ6X4,P?+M44{}(4Xf8/TGY.<3~ߗz"lʭB6;-:{"ɮq 8qR1މ*o[6{/98d9E/Fa HP@ L?EU0$ kߨ&y)1HSbϤʘ]Oi[I$\EBqޫŪ?G#pϊSZq>mQbfgߍSRz' R,|J.focz5v(툾pϝ*K_,gc/4h@v+0@jAG#kٖ]B'|e,^$l Ot ""I{s|=|M>jىK*Vy"[~Lq!9u=9 h<@-3?c[^$T *Z([>z$: "+ t2#}Smy!3%90?Bk;P+ cd+ԦYdig~D $ǂ=`1% *U=8gF:ߊi$U{%}L1f G6"䡻8-oCnAWIۯ/RML [%=h%bVv-Uߟwy<̄bw)@>fu{/cBMsS*2K6Sբ=; {c<ǩVP]9)5Zeߑ,iq+Kr}ekN7l\R CCi 㚥U6<E,^`>~U8)^1<hVGAmASjzcI7OvJ(wx&{X zb4}I ~Vc|Pkzv7yyDn\C뤢'qg'-iuS}p-g1`.%'*'Cd3FuD0ReY _Y\C]Sg;1XpT3u O{-$}/+π vrfsPzW{1Hq`9{_J5RId~`W Z.  
b5Cwt˚ÿ.D6fAtH0¾CpYn4G Yhr *uh|I#~-Ӻ>FUhBJLT~ ~Xr_ӝm,x{ UοN\$5ꮴݚ?@W0B(l٫ܣdאm ֕NHDz4#iJ|'b2o*l`#6p6Sw@CjRJ;I`֐ib fEӀJJrjx_Kw}:[ćH[d;,zLDGӴѧlMk,^/x}NtG)lk"OQ@l'pw\ 'Q67E}wʒ&iX.E/F \cNoƅTz,*~>.Q,)h.P810=uDOvLKWM Ni>-P+/ RT˜bY?( Z;`#a' _j7i e}mlᴕ7Vb|'6뮿*Rzf="Ϋ-y+DH}%` M%YHx$oH3x5×X6 X_Cz&DGKY"S.Uq?_HPcӞo*E0p7 u㩜|TTߊ PÕ)8h5#f1p!m3HEӧ1}jy9e`H5? ѹpjFÓW˵q],VIz{ aQ]}k9q QPyP~UM'd"gNd(hW,@z %+2 }7o CplM}lNm+MPb Pf^o\dJr#]@KWOa]:ma$|PpUWs{Mer5t )8Mϛ8Đy뎻?wR&i?5U$<WbrɄX?Di4gDozB v[?7J3ĺkzQȒy-|K 1;嶫 CccNzNR( @+.(5.*cMU(NNkR L]gJE`[ȿ`&bSc;Bv rh:i Y즎!o'9%tsvq VJ} _JyU nӕ 7Zl i~ B. 8+H ۲& [ AC  pⲏ ;rYw T"8 y| ??  B j ܆1 m 85  'ta n1@^ ?X- _`Z< C>= ~> !hHG t/J 3Z <8] m <=i ]#:j XRr 1k#w T MmJz Fa| 檐x( + 2Wlp #ȉp > k f il ̇[ تOJ :ч*s xέa C: /k ~$ 0~A Ĥ3 ^i 7_s zfZ-# GW^v/ ;  # tW č {vt^, p*5 8] НK_ s6p` ߹NO` bd A;*g vg,bl ,n #,ݕu G~  *4 _r" ܮv_P .8+i׍ ZHb &U { &z S"c ^, ,$i̼  H4  lݢC  18t qJܩ [ 65PN D L! ;f. <| F|Ë ͱNeJ6 lc( eNG &iJ NdMT BDAB\ Rfa Lk  l \1ELx ]]켂 t=Ъ ~!# r| r@yҶ   Pw 3R"aӯ  \6R  Tm. C R a J>  o5 mw ,|3 >' bT0h6uE7\BW0s"M(M%]w!b"NI Q)$Ād2 Ѵ9CI_mNhR!64Z` - ]jQO_{LPap00@}TIU~zuZ5 ~Ԁ͐lD%4!J ӳt G'?ou 1=X4,@ HL%bS ^V'YSrWk>`s\Sj{uYs$-<'~<ˆ -iƨ.d:n;c"́DaMQ喳r!<@.Տ P\IFf^= 51!~|(`.+1j4&A?u|\C$C~:Xi@[lrbf\bnCnu%ox!Q9v&߆WcӢH6bez 64h&y1S"f' ?'乏.{ 01={2gb~D>Rp>ԥD ܚaFZᅌH%6J^^mQ d,AZᖹ  [fE`uk䄡\lUlpl0zNc{ g]|g$j\~lω4Y>y,nEンfãpjkpxvn&yZRt'f2܆;-Jkр!I/8I@'fBqi_̆ns 5<Dmxb\ + E-W2tuCrgHSeyznEhw(υ(2t:j @=:\r;SsMW:dJO/?pC@8dG )N}(]Q@W {FڏC .> p#||@ %K &`E/G&0¿|0h̍޼3'ْ%G@5J!0qdMpOYG -U*2蜇UY4 V/rX90_ZQ/b&Ai\✇v`I8Wx=EW?~2Nn p:*_6bS b)#&Pr|lz}g(,9nMI¼ʤUb(3,D>w,*jEy٘ v,ø)'}bc׎+T!'^%h)'4Ϫ3v@̟XKޯ`L78XScrw߭w]s zhzG }a,OIㅂ8uOǼ[6S>D4[͝ךh_7)x<ㄡZYUxQz;0q]~^+2i7F,(ݬiXP:J݈ɗl+qwГ,gF3Jar{K4iM!lNCuOUPLZ;V?C\0NgYcqmM@g~Pj+Wl&CumVDBqK9rfxrP{OČIb{ﮦ䪛d nİBR#A@HreDNH;)\wDX  m UzQ/VX);NWk$9.}RW7IJػ?C`pBX B.TC7M#2MQ*/^6\j_N!riu2Y}w w9e.؆r9\$GxV E<#C2@߸т&lPru/(ըo7QRqɤn/&j#W~$U\%zFҧbM받ƨD5,rSg*1y.QYߏq5ao<(Ty,\9we`]OnpBs\':zL5 %~ՊR&ZTfzpUWO#zLƎ2N_)yl:$%+-=XRjCKo vVrsFCgvxm:s|c߃72@'x;Ɲߎ_ڃhk}6=V  iimT"M[$$-s_1,M?%~X 
d{(3l[}ub,"+dUþ{wSlc f 2ǔ٩f:xBBcHU{߿rj;%<&bۖ${[;Q ?G ~h +CZԃ@2g~6̧4FU3$#OV+ - 6zn l zr!dLexV%Z-3s:^;.^]6BӥP4I^OّI$dZ3ɛm4Uuw(b@;ROh[QnWxL&~AA$Ud5! q>$Ƕ6ܮ!jgS&Ha.`-A5'*:B>Fe}uY^?<*W\/wq\f >o lt~`ݽu/2{ўt~Mn؀;)ײI% R6X4,P4{}3~ߗʭB6;-:{"G樆7 qR0B{!1މ*?84v7SC/Fa HP@ Lv[dM;!R?EUǍ+Z] amb0$ kߨ&yz J)1HSbϤ𼔜 {ǖY\Oh}|0Z($\EBqޫŪ?G#mQPBbfgx^G[  R 5v(gc/~U8)^1<hVGΟqWjzc vI7OvJ(wx&{X zb4}I ~Vc|Pkzv7ywGD^yDn\C뤢'q/(g'J@g-iuS}p-4V @xMg1`.%'**&ĥ06&;O2_z=42~o68]_758k<8'Cd3FuD0ReY _Y^3Kn[\C]Sg;1Xpc(v&%rT3u O{-$}40  vrf.bϋsPzWx=i{1Hq05ܙ`9{_J5RId+~`hr KW Z.  b5Cwt˚p\0Gÿ.D6fAtH0¾CpYn-̓$4G YZCS~shr *P *uh|I#~-En|/D?6/2){e<Ӻ>FUhB$*B^-CJLT~ ~Xr_ӝm,x{ UοN\$A Mݚ?@-~IW0B(9lm٫ܣdא# Hm ֕NHDz5ɣX: 4#iJ|'b2o*l`#6p6dZ\}>CjR.bI[J;I`֐ib fEӀJJrܮ|Uu  {jx_Kw}:[ćH 7<É[mW6d;,zLDƙGӴѧƓZrlMk,^/xNr~췰"ZY}N֭&ѩRi$r9tG)lltk":*OQ@liP'pw\#m! 'Q67E}wʒ&.E/F \cNoƅTz,*~>.Q,n mj.)h.P810=uDOvLKWM Ni>-PDQ+/ RT˜bY?( Z;`#a' _jS_{zm"q|.΁7-Ӄi e}?,k Ztiɑmlᴕ5JaŢ7Vb|e'6(O뮿*wbǒtRz,Nf="Ϋ-y+țDH{uA&͇zMB%}%` M%YHx$oH3x5 X_Cz&DGK.UH#Vq?_HPcӞo*E0p95r7 uT"d{㩜|TTߊ PÕ)8h5#f1p!m3H.tEӧ1}jy9e`H5? ѹpjFÓW˵q],VI4$p jRz{ aQ]}k9q QPyPe4!~UM'`;ѿ*hW,{<'.@z %+2a%l3 }7o CplM}lNm+MP̩ꩤQf^o\dJr#] :_@KWOaa$|Pp~8lIu^ٛxUWs{Mer5t )8Mϛ8Đy뎻?wR&i?5U$<WbrɄX?Di4gDozB v[?3ĺkzQȒyF-|K 1;嶫 C3hcYccNzNR( #` @+.(5.-OJ*cMU(NNkR|dtV L]Ls/`gJE`[ȿ`&bScd;Bv rh:ig#zL3_0߆v Y즎!o>'9%tsvT꽘q V= J3Jod> ~> !hHG t/J gS 3Z \-dZ <8] :zgáf m <=i ]#:j 9I ]m 1k#w T MmJz Fa| ƬoN_À 檐x( + 2Wlp #ȉp > xVY: k f >%KYm il 鱷2 ̇[ تOJ :ч*s xέa C: /k ~$ 0~A j@ Ĥ3 ^i '4k 7_s  #  ļ@n GӾy zfZ-# GW^v/ ;  # tW {vt^, k˫#/ p*5 "-`> yPX 8] НK_ ߹NO` bd A;*g vg,bl ,n #,ݕu G~  *4 ܮv_P .8+i׍ ZHb ,/Ö &U { M &z S"c ^, S؄ 1\=P ,$i̼  H4 ˢ  lݢC  18t JFt qJܩ [  lT_ 65PN D ( yK /y L! ;f. <| HY1 F|Ë QgQ  ͱNeJ6 lc(  1 C eNG &iJ NdMT BDAB\ Pd e.f Rfa Lk  l / >n \1ELx ]]켂 t=Ъ ą ~!# r| r@yҶ I(   3R"aӯ  zXPԙ  \6R  Tm. 
C R a J>  o5 mw ,|3 >' bTrЃE7\BW0s"M(M%]w!b"Oh׆#$gX'NI Q)$Ād2nZBI_mNhR!64Z6.\΀P\` - ]jQO_{LPaF,Axi4y:Qeo}TIU~x$0zuZ5 ~Ԁ͐lD%4!J ӳt G'LmYhvRk%?ou 1Ff 3=X4q:9bW?,@ H~zJL%bS ^V'YSrWk>`s\Sj{uYs$-<O-<ˆ -iƨ.d:n;c"́DaMQ喳yt@:A9.Տ P\'LľIF4 澣f^= 51!;?%~|(`.ނ2+1j4&A?u|\C$C~:XVgb_i@[lrbf\bnCnu%ox۵Q(!Q9v&߆WcӢKH6bez $_X(J64h&yI<1Bpg#VI?S"f' ?'~@e(S)乏.{ 01={2uOY:gb~D>Rp>ԥDCi]E ܚaFZᅌHo0XJ%6J^^mQ d,AZᖹ  [fE`uk䄡\lUlpl0zNc{g$j\~lω4B)Y>y,nEンf8x`4Y(812fãpʌi{ jkp|c*Bxvn&yZRt'f2܆;*-Jkр!GQMI/8I@'fBqi%Β: _̆ns 5<Dmxb\ +W2uF{6ՙ#&A?tuCrgH&кݬRSIUfa"/j+k]yeyzs~nEhw(2t(r<jc2ʕ:j @=:\r;SsMW:dJO/?pC@8dG FR)N}(]bJEQ@W {FڏZC .> p#||@ %K &`E/G&0¿|0h̍޼3C-8 I 7F'ْ%G@5J!0qdMpOYG -U*2蜇UY4 V/rX90_ZQ/bDc ^b&Ai\✇v`I8Wx=EW?~2Nnht51 p:*_6bSI~3B b)#&Pr|l)( 5wVz}g(,9nALMI¼ʤUb(3,D>q46n2w,*jEy٘ v,ø)'}bc&!׎+T!'^%h)'4Ϫ3&WKv@̟XKޯ`L78XS4ٗ_'rcrw߭waČy]s zhzG }a,OIㅂ8^^VuOǼ[6S>D4+;HWC4N[͝ךh_7)x<ㄡZY@"䜭Ux_[fQz;0q]f7{f~^d sbk+2i7\'sF,(ݬz#XBiXP:B (\ Ú>J݈i{4\ɗl+qwГ,gF3Jar{K4iM!lNCuOUPLZ;V?C\0NgYcqmM@g dGb?g~D =`1% *U"dn&}L1f16pH;)FBۯ/RML [%=h%bD.0bvgf)@>fu{sS*2=; {c<ǩ9)5ߑ,ioW}l\R 㚥E,9^`DE+il7)^1<AmASΟqWjzc vJ(wx&{X zI ~ Vczv7ywGD^yDn\/(g'J@g-iu-4V @xMg1`.%'**&ĥ06&;O2_z=42~o68]_758k<8d3FuD^3Kn[Sgc(v&%r O{40 /+π.bϋx=i05ܙ9{_J5RId+~` hr KW Z.  
p\0Gÿ.D60¾CpYn-̓$ZCS~shr *P !O*uh|IEn|/D?6/2){e<Ӻ>$*B^-CJ~ ~X,xA M5ꮴݚ?@-~IW0cuT9aB(9m٫ܣdא# HNHDz5ɣX: b2o*dZ\}>Sw@CjR.bI[J;I`ܮ|Uu  {jx_Kw} 7<ÉmW6d;,-鎘ƙGӴѧƓZr/xNr~췰"ZY֭&ѩRi$r9ltk:*OQ@liP#m!B!wʒ&.0E/fP-n mj.0=uD}dmIOvLKWM NDQ˜bY#aa|a%߷-e' _jS_{zm"q|.΁7-Ӄ?,k 닣LGZtiɑmlᴕ5JaŢ7Vb|e'6(O޳wbǒt,Nf="Ϋ-y+țDH{uA&͇zMB%}%` Hz&DGKY"SH#Vq?_HPcӞo95rsl+TuT"d{TTߊ.tEӧ1 ʊر}jy9e`H5ÓW˵q],4$p bjR aQ]}k9q QPyPe4!~UM' d"gNd(`;ѿ*hW,{<'.a%l3}lNm+MPb P̩ꩤQ.RdJr#] :_a$|Pp q~8lIu^ٛx8ϛwR&i?5UrɄDiB v[?3ĺȒy F|K嶫 C3hcY#` @+-*-OJ*cMNkR|dtV L]Ls/`&bScd;Bv rh:ig#zL3_0߆v Y즎!o>'9%tsMUF^T꽘qI5,9HڷN=~ 5-?@߉QC(~"I 170öz e% ̾ Z tO, ɛ t1/ @u\5 &Dsc8 m9`M |J\ BU/a ~OWe 먣j n q?n @%p Qe5 t w}Rv }Tsw B 5%+ >/gu &Q^ 4` ౓w Wl` dG KP.oɵ u 9Π }ft5 Y(& t @u ktS -~$ om& l(  ^M0 m2E1 }iץ5 |6 h˕8 [ߗ8 fz|: )mE &R z_xX 4|2g >J} }~ nʤ   Q$Sԃ i~  Z UNߡ !Us B. 8+H ۲& ;rYw ;S3t T"8 y| ?? P x?  pٍ 'ta n1@^ nцT _`Z< J3Jod> gS 3Z \-dZ <8] :zgáf ]#:j 9I ]m XRr ƬoN_À 2Wlp #ȉp > xVY: k f >%KYm il 鱷2 ̇[ xέa C: /k j@ Ĥ3 '4k  #  ļ@n GӾy zfZ-# tW č k˫#/ y3 "-`> yPX НK_ s6p` bd A;*g ,n #,ݕu .8+i׍ ZHb ,/Ö M &z S"c ^, S؄ 1\=P ˢ  lݢC  18t JFt 5%D # qJܩ  lT_ 65PN ( yK /y HY1 QgQ  ͱNeJ6  1 P@Q; C eNG &iJ NdMT BDAB\ Pd e.f Rfa Lk / >n t=Ъ ą ~!# r@yҶ I( Pw  zXPԙ  \6R  C R a J>  o5 >' 0h6urЃE7\BW0s"M(MOh׆#$gX'$Ād2nZB Ѵ9CI_mNhR!64Z6.\΀P\` - ]jQO_{LPaF,Axi4y:Qeo}x$0zuԀ͐G'LmYhvRk%?ou 1Ff 3q:9bW?,@~zJL ^Vk>`3*k*F;I$>< O-'~<ˆ -iƨ.d:n;c"́DaMr!<yt:A9 ".Տ P\'Lľ4 澣f;?%~|(`.ނ2+1j4?u|\C~:XVgb_i@[lrbf۵Q(!Q9vKH6bez  $_X(JI<Bpg#VI?ذY"fHPK>' ?'~@e(S)uOY:Ci]E ܚaFo0XJ d,AZᖹ  [ g]|g$j\~B)Y>yf8x`4Y(812fãp ʌi{ |c*Bxvn2܆;*-Jkр!GQMI/I@'fBqi%Β: _̆ns 5<5,uF{6ՙ#&A?tuC&кݬRIUfa"/j+k]yeyzs~nEhw(υ(2t(r<jc2ʕ@=:\r;SsMW:dJO/?G FR}(] bJEZ`E/¿|0 C-8 I 7F'ْ%G@5JpOYG -U*2蜇U/rX90_Dc ^bW|OoloqQy>`Iht51]nK_6bSI~3B)( 5wVz}g(ALMI¼ʤUb(3,D>q46n2v,}&!h)'&WKv@̟XKޯ`L78XS4ٗ_'rcrwaČyhza,OIㅂ8^^V[6S>D+;HWC4Nh_7ZY@"䜭x_[fQz;0f7{fd sbk+2i7\'sF,(ݬ z#XBiXP:B (\ Ú>i{4\ɗl+qwГ,gF3JCuOUP?C\qmM@g~Pj&Cum .qK9rލ<Ib{S uﮦ䪛BRA@H)\wDX  'UzQ/V{CX;NWk$n@.}RW7 IJػ?X B*/^6~Q w2Y}w 
w9.؆\bԬc].ۅ&lPro7QRvP0$"eCj^s]O!.̄ͅɤn/|XHrOzFҧb$?7 받5,rSg*1y.H;L4> 3Hp]@MgNGW(Ty,\ ':zL5hR&ZTfzpUW<< o/Nj56Z9Y(9" +avf/LRԁ,Nd*=D3D ;_'F? p4Psimka-1.5.1/tests/simkaMin/truth_simkaMin/k21_filter_100-100_n0/mat_presenceAbsence_jaccard.csv.gz000066400000000000000000000001321353413740300324770ustar00rootroot00000000000000vvvvvr63 ʰ4X@5\N Kc($.g_9BU\D*+"simka-1.5.1/tests/simkaMin/truth_simkaMin/k21_filter_100-100_n1/000077500000000000000000000000001353413740300237535ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin/k21_filter_100-100_n1/mat_abundance_braycurtis.csv.gz000066400000000000000000000001631353413740300321370ustar00rootroot00000000000000 3R+OzWmvc>j56Z9Y(9" +avf/LRԁ,Nd*=D3D ;_'F? p4Psimka-1.5.1/tests/simkaMin/truth_simkaMin/k21_filter_100-100_n1/mat_presenceAbsence_jaccard.csv.gz000066400000000000000000000001321353413740300325000ustar00rootroot00000000000000vvvvvr63 ʰ4X@5\N Kc($.g_9BU\D*+"simka-1.5.1/tests/simkaMin/truth_simkaMin/k31__0-1000_n0/000077500000000000000000000000001353413740300224645ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin/k31__0-1000_n0/mat_abundance_braycurtis.csv.gz000066400000000000000000000001571353413740300306530ustar00rootroot0000000000000010Ø pGiynZ]K;>р!kod3 m'simka-1.5.1/tests/simkaMin/truth_simkaMin/k31_filter_0-100_n0/mat_presenceAbsence_jaccard.csv.gz000066400000000000000000000001241353413740300323400ustar00rootroot00000000000000vvvvvr63 ư4E111&"`Xc4+wtW"C\M5simka-1.5.1/tests/simkaMin/truth_simkaMin/k31_filter_0-100_n1/000077500000000000000000000000001353413740300236135ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin/k31_filter_0-100_n1/mat_abundance_braycurtis.csv.gz000066400000000000000000000001641353413740300320000ustar00rootroot00000000000000A0=^_J& 3V_O%.KO:g'X5,>pGiynZ]K;>р!kod3 
m'simka-1.5.1/tests/simkaMin/truth_simkaMin/k31_filter_0-100_n1/mat_presenceAbsence_jaccard.csv.gz000066400000000000000000000001241353413740300323410ustar00rootroot00000000000000vvvvvr63 ư4E111&"`Xc4+wtW"C\M5simka-1.5.1/tests/simkaMin/truth_simkaMin/k31_filter_100-1000_n0/000077500000000000000000000000001353413740300240335ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin/k31_filter_100-1000_n0/mat_abundance_braycurtis.csv.gz000066400000000000000000000001631353413740300322170ustar00rootroot00000000000000A0<@ SKRMcd |<'a‪%8I ֧>A% @BLi$f>AT-Ql[ed/Cpsimka-1.5.1/tests/simkaMin/truth_simkaMin/k31_filter_100-1000_n0/mat_presenceAbsence_jaccard.csv.gz000066400000000000000000000001371353413740300325650ustar00rootroot00000000000000vvvvvr63 °0XB5\N 3Ks$.g_9BU\D*cRsimka-1.5.1/tests/simkaMin/truth_simkaMin/k31_filter_100-1000_n1/000077500000000000000000000000001353413740300240345ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin/k31_filter_100-1000_n1/mat_abundance_braycurtis.csv.gz000066400000000000000000000001631353413740300322200ustar00rootroot00000000000000A0<@ SKRMcd |<'a‪%8I ֧>A% @BLi$f>AT-Ql[ed/Cpsimka-1.5.1/tests/simkaMin/truth_simkaMin/k31_filter_100-1000_n1/mat_presenceAbsence_jaccard.csv.gz000066400000000000000000000001371353413740300325660ustar00rootroot00000000000000vvvvvr63 °0XB5\N 3Ks$.g_9BU\D*cRsimka-1.5.1/tests/simkaMin/truth_simkaMin/k31_filter_100-100_n0/000077500000000000000000000000001353413740300237535ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin/k31_filter_100-100_n0/mat_abundance_braycurtis.csv.gz000066400000000000000000000001641353413740300321400ustar00rootroot00000000000000M 0὇)1Wm_I a0=yE{<3`1,67Vtft| `u;]8|AD0(/gn_simka-1.5.1/tests/simkaMin/truth_simkaMin/k31_filter_100-100_n0/mat_presenceAbsence_jaccard.csv.gz000066400000000000000000000001311353413740300324770ustar00rootroot00000000000000vvvvvr63 ʰ4 
jP%I\X&(qWp'1D*isimka-1.5.1/tests/simkaMin/truth_simkaMin/k31_filter_100-100_n1/000077500000000000000000000000001353413740300237545ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin/k31_filter_100-100_n1/mat_abundance_braycurtis.csv.gz000066400000000000000000000001641353413740300321410ustar00rootroot00000000000000M 0὇)1Wm_I a0=yE{<3`1,67Vtft| `u;]8|AD0(/gn_simka-1.5.1/tests/simkaMin/truth_simkaMin/k31_filter_100-100_n1/mat_presenceAbsence_jaccard.csv.gz000066400000000000000000000001311353413740300325000ustar00rootroot00000000000000vvvvvr63 ʰ4 jP%I\X&(qWp'1D*isimka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/000077500000000000000000000000001353413740300227715ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__0-100/000077500000000000000000000000001353413740300242425ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__0-100/mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300320140ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.364807;0.485437;0.519126;0.602763 B;0.364807;0.000000;0.486486;0.215470;0.594502 C;0.485437;0.486486;0.000000;0.215470;0.798409 D;0.519126;0.215470;0.215470;0.000000;0.653659 E;0.602763;0.594502;0.798409;0.653659;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__0-100/mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300323610ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.440000;0.520000;0.660000;0.440000 B;0.440000;0.000000;0.630000;0.390000;0.000000 C;0.520000;0.630000;0.000000;0.300000;0.630000 D;0.660000;0.390000;0.300000;0.000000;0.390000 E;0.440000;0.000000;0.630000;0.390000;0.000000 
simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__0-1000/000077500000000000000000000000001353413740300243225ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__0-1000/mat_abundance_braycurtis.csv.gz000066400000000000000000000001711353413740300325050ustar00rootroot00000000000000MA <pR&av!G䛓F,(-cfLkBVۺc+g*]S 񲠳hzϋ肅'h/\D,|Esimka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__0-1000/mat_presenceAbsence_jaccard.csv.gz000066400000000000000000000001421353413740300330500ustar00rootroot00000000000000vvvvvr63 0560,PUZ@U#Kq9Sglh,b#,*;R\9wsimka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__0-1000_n0/000077500000000000000000000000001353413740300247175ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__0-1000_n0/mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300324710ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.392794;0.505175;0.552632;0.599578 B;0.392794;0.000000;0.492455;0.235064;0.600141 C;0.505175;0.492455;0.000000;0.219163;0.801289 D;0.552632;0.235064;0.219163;0.000000;0.655392 E;0.599578;0.600141;0.801289;0.655392;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__0-1000_n0/mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300330360ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.446000;0.534000;0.680000;0.446000 B;0.446000;0.000000;0.658000;0.403000;0.000000 C;0.534000;0.658000;0.000000;0.319000;0.658000 D;0.680000;0.403000;0.319000;0.000000;0.403000 E;0.446000;0.000000;0.658000;0.403000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__0-1000_n1/000077500000000000000000000000001353413740300247205ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__0-1000_n1/mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300324720ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.392794;0.505175;0.552632;0.599578 
B;0.392794;0.000000;0.492455;0.235064;0.600141 C;0.505175;0.492455;0.000000;0.219163;0.801289 D;0.552632;0.235064;0.219163;0.000000;0.655392 E;0.599578;0.600141;0.801289;0.655392;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__0-1000_n1/mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300330370ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.446000;0.534000;0.680000;0.446000 B;0.446000;0.000000;0.658000;0.403000;0.000000 C;0.534000;0.658000;0.000000;0.319000;0.658000 D;0.680000;0.403000;0.319000;0.000000;0.403000 E;0.446000;0.000000;0.658000;0.403000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__0-100_n0/000077500000000000000000000000001353413740300246375ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__0-100_n0/mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300324110ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.364807;0.485437;0.519126;0.602763 B;0.364807;0.000000;0.486486;0.215470;0.594502 C;0.485437;0.486486;0.000000;0.215470;0.798409 D;0.519126;0.215470;0.215470;0.000000;0.653659 E;0.602763;0.594502;0.798409;0.653659;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__0-100_n0/mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300327560ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.440000;0.520000;0.660000;0.440000 B;0.440000;0.000000;0.630000;0.390000;0.000000 C;0.520000;0.630000;0.000000;0.300000;0.630000 D;0.660000;0.390000;0.300000;0.000000;0.390000 E;0.440000;0.000000;0.630000;0.390000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__0-100_n1/000077500000000000000000000000001353413740300246405ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__0-100_n1/mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300324120ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.364807;0.485437;0.519126;0.602763 
B;0.364807;0.000000;0.486486;0.215470;0.594502 C;0.485437;0.486486;0.000000;0.215470;0.798409 D;0.519126;0.215470;0.215470;0.000000;0.653659 E;0.602763;0.594502;0.798409;0.653659;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__0-100_n1/mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300327570ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.440000;0.520000;0.660000;0.440000 B;0.440000;0.000000;0.630000;0.390000;0.000000 C;0.520000;0.630000;0.000000;0.300000;0.630000 D;0.660000;0.390000;0.300000;0.000000;0.390000 E;0.440000;0.000000;0.630000;0.390000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__100-100/000077500000000000000000000000001353413740300244035ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__100-100/mat_abundance_braycurtis.csv.gz000066400000000000000000000001671353413740300325730ustar00rootroot00000000000000MA <D,KU[9My=|J>x݀cJP%D}>-,F uKӠwBNZ>s*Qz/, r-simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__100-100/mat_presenceAbsence_jaccard.csv.gz000066400000000000000000000001341353413740300331320ustar00rootroot00000000000000vvvvvr63 052PUCƖ(R\(f3F5FYpU}`).WR"simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__100-1000/000077500000000000000000000000001353413740300244635ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__100-1000/mat_abundance_braycurtis.csv.gz000066400000000000000000000001711353413740300326460ustar00rootroot00000000000000UI@<&b!N%3q8PuCmQg]>]a`ʹ-䜓emt\Ok!k&2_.X 'mat_presenceAbsence_jaccard.csv.gz000066400000000000000000000001421353413740300331320ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__100-1000vvvvvr63 
0560,PUZ@U#Kq9Sglh,b#,*;R\9wsimka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__100-1000_n0/000077500000000000000000000000001353413740300250605ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__100-1000_n0/mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300326320ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.405765;0.505175;0.552632;0.332552 B;0.405765;0.000000;0.509015;0.246197;0.341593 C;0.505175;0.509015;0.000000;0.219163;0.666667 D;0.552632;0.246197;0.219163;0.000000;0.450996 E;0.332552;0.341593;0.666667;0.450996;0.000000 mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300331200ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__100-1000_n0;A;B;C;D;E A;0.000000;0.446000;0.534000;0.680000;0.446000 B;0.446000;0.000000;0.658000;0.403000;0.000000 C;0.534000;0.658000;0.000000;0.319000;0.658000 D;0.680000;0.403000;0.319000;0.000000;0.403000 E;0.446000;0.000000;0.658000;0.403000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__100-1000_n1/000077500000000000000000000000001353413740300250615ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__100-1000_n1/mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300326330ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.405765;0.505175;0.552632;0.332552 B;0.405765;0.000000;0.509015;0.246197;0.341593 C;0.505175;0.509015;0.000000;0.219163;0.666667 D;0.552632;0.246197;0.219163;0.000000;0.450996 E;0.332552;0.341593;0.666667;0.450996;0.000000 mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300331210ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__100-1000_n1;A;B;C;D;E A;0.000000;0.446000;0.534000;0.680000;0.446000 B;0.446000;0.000000;0.658000;0.403000;0.000000 C;0.534000;0.658000;0.000000;0.319000;0.658000 D;0.680000;0.403000;0.319000;0.000000;0.403000 
E;0.446000;0.000000;0.658000;0.403000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__100-100_n0/000077500000000000000000000000001353413740300250005ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__100-100_n0/mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300325520ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.370690;0.485437;0.519126;0.339080 B;0.370690;0.000000;0.496599;0.222222;0.331429 C;0.485437;0.496599;0.000000;0.215470;0.662222 D;0.519126;0.222222;0.215470;0.000000;0.449612 E;0.339080;0.331429;0.662222;0.449612;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__100-100_n0/mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300331170ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.440000;0.520000;0.660000;0.440000 B;0.440000;0.000000;0.630000;0.390000;0.000000 C;0.520000;0.630000;0.000000;0.300000;0.630000 D;0.660000;0.390000;0.300000;0.000000;0.390000 E;0.440000;0.000000;0.630000;0.390000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__100-100_n1/000077500000000000000000000000001353413740300250015ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__100-100_n1/mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300325530ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.370690;0.485437;0.519126;0.339080 B;0.370690;0.000000;0.496599;0.222222;0.331429 C;0.485437;0.496599;0.000000;0.215470;0.662222 D;0.519126;0.222222;0.215470;0.000000;0.449612 E;0.339080;0.331429;0.662222;0.449612;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21__100-100_n1/mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300331200ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.440000;0.520000;0.660000;0.440000 B;0.440000;0.000000;0.630000;0.390000;0.000000 C;0.520000;0.630000;0.000000;0.300000;0.630000 D;0.660000;0.390000;0.300000;0.000000;0.390000 
E;0.440000;0.000000;0.630000;0.390000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_0-100/000077500000000000000000000000001353413740300254505ustar00rootroot00000000000000mat_abundance_braycurtis.csv.gz000066400000000000000000000001701353413740300335530ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_0-100M9@C{6@DWJ&PYֳM}Wߴ7WP}Ŧ.)lEJ!7et4'^aI'b4EtB\F9 =mat_presenceAbsence_jaccard.csv.gz000066400000000000000000000001301353413740300341140ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_0-100vvvvvr63 ʰ4G111E, ʝQPr!tWbE ]ĝsimka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_0-1000/000077500000000000000000000000001353413740300255305ustar00rootroot00000000000000mat_abundance_braycurtis.csv.gz000066400000000000000000000001711353413740300336340ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_0-1000M;0CwS)_I7LgSGu]ն.*CޛtT!,5\SLS"MZ.Sgн. 
)pM_XDR2}׬mat_presenceAbsence_jaccard.csv.gz000066400000000000000000000001411353413740300341760ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_0-1000vvvvvr63 °0A1112, -D,MAʝT4)w!tWbpE `ba?simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_0-1000_n0/000077500000000000000000000000001353413740300261255ustar00rootroot00000000000000mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300336200ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_0-1000_n0;A;B;C;D;E A;0.000000;0.634391;0.872054;0.643417;0.599649 B;0.634391;0.000000;0.533693;0.061224;0.891154 C;0.872054;0.533693;0.000000;0.582630;0.969913 D;0.643417;0.061224;0.582630;0.000000;0.880485 E;0.599649;0.891154;0.969913;0.880485;0.000000 mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300341650ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_0-1000_n0;A;B;C;D;E A;0.000000;0.783000;0.984000;0.783000;0.446000 B;0.783000;0.000000;0.918000;0.000000;0.875000 C;0.984000;0.918000;0.000000;0.918000;0.992000 D;0.783000;0.000000;0.918000;0.000000;0.875000 E;0.446000;0.875000;0.992000;0.875000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_0-1000_n1/000077500000000000000000000000001353413740300261265ustar00rootroot00000000000000mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300336210ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_0-1000_n1;A;B;C;D;E A;0.000000;0.634391;0.872054;0.643417;0.599649 B;0.634391;0.000000;0.533693;0.061224;0.891154 C;0.872054;0.533693;0.000000;0.582630;0.969913 D;0.643417;0.061224;0.582630;0.000000;0.880485 E;0.599649;0.891154;0.969913;0.880485;0.000000 
mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300341660ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_0-1000_n1;A;B;C;D;E A;0.000000;0.783000;0.984000;0.783000;0.446000 B;0.783000;0.000000;0.918000;0.000000;0.875000 C;0.984000;0.918000;0.000000;0.918000;0.992000 D;0.783000;0.000000;0.918000;0.000000;0.875000 E;0.446000;0.875000;0.992000;0.875000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_0-100_n0/000077500000000000000000000000001353413740300260455ustar00rootroot00000000000000mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300335400ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_0-100_n0;A;B;C;D;E A;0.000000;0.630522;0.924171;0.639216;0.603448 B;0.630522;0.000000;0.580153;0.048276;0.863727 C;0.924171;0.580153;0.000000;0.611307;0.966173 D;0.639216;0.048276;0.611307;0.000000;0.841584 E;0.603448;0.863727;0.966173;0.841584;0.000000 mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300341050ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_0-100_n0;A;B;C;D;E A;0.000000;0.780000;0.970000;0.780000;0.440000 B;0.780000;0.000000;0.760000;0.000000;0.840000 C;0.970000;0.760000;0.000000;0.760000;0.970000 D;0.780000;0.000000;0.760000;0.000000;0.840000 E;0.440000;0.840000;0.970000;0.840000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_0-100_n1/000077500000000000000000000000001353413740300260465ustar00rootroot00000000000000mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300335410ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_0-100_n1;A;B;C;D;E A;0.000000;0.630522;0.924171;0.639216;0.603448 B;0.630522;0.000000;0.580153;0.048276;0.863727 C;0.924171;0.580153;0.000000;0.611307;0.966173 D;0.639216;0.048276;0.611307;0.000000;0.841584 
E;0.603448;0.863727;0.966173;0.841584;0.000000 mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300341060ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_0-100_n1;A;B;C;D;E A;0.000000;0.780000;0.970000;0.780000;0.440000 B;0.780000;0.000000;0.760000;0.000000;0.840000 C;0.970000;0.760000;0.000000;0.760000;0.970000 D;0.780000;0.000000;0.760000;0.000000;0.840000 E;0.440000;0.840000;0.970000;0.840000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_100-1000_n0/000077500000000000000000000000001353413740300262665ustar00rootroot00000000000000mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300337610ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_100-1000_n0;A;B;C;D;E A;0.000000;0.672544;0.872054;0.643417;0.232293 B;0.672544;0.000000;0.734756;0.156335;0.782972 C;0.872054;0.734756;0.000000;0.582630;0.918455 D;0.643417;0.156335;0.582630;0.000000;0.726216 E;0.232293;0.782972;0.918455;0.726216;0.000000 mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300343260ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_100-1000_n0;A;B;C;D;E A;0.000000;0.845000;0.984000;0.783000;0.000000 B;0.845000;0.000000;0.781000;0.329000;0.845000 C;0.984000;0.781000;0.000000;0.918000;0.984000 D;0.783000;0.329000;0.918000;0.000000;0.783000 E;0.000000;0.845000;0.984000;0.783000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_100-1000_n1/000077500000000000000000000000001353413740300262675ustar00rootroot00000000000000mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300337620ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_100-1000_n1;A;B;C;D;E A;0.000000;0.672544;0.872054;0.643417;0.232293 B;0.672544;0.000000;0.734756;0.156335;0.782972 C;0.872054;0.734756;0.000000;0.582630;0.918455 
D;0.643417;0.156335;0.582630;0.000000;0.726216 E;0.232293;0.782972;0.918455;0.726216;0.000000 mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300343270ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_100-1000_n1;A;B;C;D;E A;0.000000;0.845000;0.984000;0.783000;0.000000 B;0.845000;0.000000;0.781000;0.329000;0.845000 C;0.984000;0.781000;0.000000;0.918000;0.984000 D;0.783000;0.329000;0.918000;0.000000;0.783000 E;0.000000;0.845000;0.984000;0.783000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_100-100_n0/000077500000000000000000000000001353413740300262065ustar00rootroot00000000000000mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300337010ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_100-100_n0;A;B;C;D;E A;0.000000;0.643725;0.924171;0.639216;0.233962 B;0.643725;0.000000;0.815789;0.137157;0.762803 C;0.924171;0.815789;0.000000;0.611307;0.952239 D;0.639216;0.137157;0.611307;0.000000;0.725594 E;0.233962;0.762803;0.952239;0.725594;0.000000 mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300342460ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_100-100_n0;A;B;C;D;E A;0.000000;0.790000;0.970000;0.780000;0.000000 B;0.790000;0.000000;0.930000;0.170000;0.790000 C;0.970000;0.930000;0.000000;0.760000;0.970000 D;0.780000;0.170000;0.760000;0.000000;0.780000 E;0.000000;0.790000;0.970000;0.780000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_100-100_n1/000077500000000000000000000000001353413740300262075ustar00rootroot00000000000000mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300337020ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_100-100_n1;A;B;C;D;E A;0.000000;0.643725;0.924171;0.639216;0.233962 B;0.643725;0.000000;0.815789;0.137157;0.762803 
C;0.924171;0.815789;0.000000;0.611307;0.952239 D;0.639216;0.137157;0.611307;0.000000;0.725594 E;0.233962;0.762803;0.952239;0.725594;0.000000 mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300342470ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k21_filter_100-100_n1;A;B;C;D;E A;0.000000;0.790000;0.970000;0.780000;0.000000 B;0.790000;0.000000;0.930000;0.170000;0.790000 C;0.970000;0.930000;0.000000;0.760000;0.970000 D;0.780000;0.170000;0.760000;0.000000;0.780000 E;0.000000;0.790000;0.970000;0.780000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31__0-1000_n0/000077500000000000000000000000001353413740300247205ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31__0-1000_n0/mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300324720ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.422120;0.527750;0.595335;0.607560 B;0.422120;0.000000;0.546436;0.237230;0.592219 C;0.527750;0.546436;0.000000;0.255734;0.818444 D;0.595335;0.237230;0.255734;0.000000;0.645236 E;0.607560;0.592219;0.818444;0.645236;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31__0-1000_n0/mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300330370ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.484000;0.564000;0.728000;0.484000 B;0.484000;0.000000;0.708000;0.403000;0.000000 C;0.564000;0.708000;0.000000;0.376000;0.708000 D;0.728000;0.403000;0.376000;0.000000;0.403000 E;0.484000;0.000000;0.708000;0.403000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31__0-1000_n1/000077500000000000000000000000001353413740300247215ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31__0-1000_n1/mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300324730ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.422120;0.527750;0.595335;0.607560 B;0.422120;0.000000;0.546436;0.237230;0.592219 
C;0.527750;0.546436;0.000000;0.255734;0.818444 D;0.595335;0.237230;0.255734;0.000000;0.645236 E;0.607560;0.592219;0.818444;0.645236;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31__0-1000_n1/mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300330400ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.484000;0.564000;0.728000;0.484000 B;0.484000;0.000000;0.708000;0.403000;0.000000 C;0.564000;0.708000;0.000000;0.376000;0.708000 D;0.728000;0.403000;0.376000;0.000000;0.403000 E;0.484000;0.000000;0.708000;0.403000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31__0-100_n0/000077500000000000000000000000001353413740300246405ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31__0-100_n0/mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300324120ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.384615;0.477064;0.581921;0.587734 B;0.384615;0.000000;0.531915;0.219780;0.609672 C;0.477064;0.531915;0.000000;0.280000;0.814085 D;0.581921;0.219780;0.280000;0.000000;0.635468 E;0.587734;0.609672;0.814085;0.635468;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31__0-100_n0/mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300327570ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.410000;0.480000;0.710000;0.410000 B;0.410000;0.000000;0.690000;0.370000;0.000000 C;0.480000;0.690000;0.000000;0.390000;0.690000 D;0.710000;0.370000;0.390000;0.000000;0.370000 E;0.410000;0.000000;0.690000;0.370000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31__0-100_n1/000077500000000000000000000000001353413740300246415ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31__0-100_n1/mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300324130ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.384615;0.477064;0.581921;0.587734 B;0.384615;0.000000;0.531915;0.219780;0.609672 
C;0.477064;0.531915;0.000000;0.280000;0.814085 D;0.581921;0.219780;0.280000;0.000000;0.635468 E;0.587734;0.609672;0.814085;0.635468;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31__0-100_n1/mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300327600ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.410000;0.480000;0.710000;0.410000 B;0.410000;0.000000;0.690000;0.370000;0.000000 C;0.480000;0.690000;0.000000;0.390000;0.690000 D;0.710000;0.370000;0.390000;0.000000;0.370000 E;0.410000;0.000000;0.690000;0.370000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31__100-1000_n0/000077500000000000000000000000001353413740300250615ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31__100-1000_n0/mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300326330ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.435349;0.527750;0.595335;0.342440 B;0.435349;0.000000;0.561047;0.246427;0.329235 C;0.527750;0.561047;0.000000;0.255734;0.694026 D;0.595335;0.246427;0.255734;0.000000;0.437346 E;0.342440;0.329235;0.694026;0.437346;0.000000 mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300331210ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31__100-1000_n0;A;B;C;D;E A;0.000000;0.484000;0.564000;0.728000;0.484000 B;0.484000;0.000000;0.708000;0.403000;0.000000 C;0.564000;0.708000;0.000000;0.376000;0.708000 D;0.728000;0.403000;0.376000;0.000000;0.403000 E;0.484000;0.000000;0.708000;0.403000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31__100-1000_n1/000077500000000000000000000000001353413740300250625ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31__100-1000_n1/mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300326340ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.435349;0.527750;0.595335;0.342440 B;0.435349;0.000000;0.561047;0.246427;0.329235 
C;0.527750;0.561047;0.000000;0.255734;0.694026 D;0.595335;0.246427;0.255734;0.000000;0.437346 E;0.342440;0.329235;0.694026;0.437346;0.000000 mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300331220ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31__100-1000_n1;A;B;C;D;E A;0.000000;0.484000;0.564000;0.728000;0.484000 B;0.484000;0.000000;0.708000;0.403000;0.000000 C;0.564000;0.708000;0.000000;0.376000;0.708000 D;0.728000;0.403000;0.376000;0.000000;0.403000 E;0.484000;0.000000;0.708000;0.403000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31__100-100_n0/000077500000000000000000000000001353413740300250015ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31__100-100_n0/mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300325530ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.390558;0.477064;0.581921;0.318310 B;0.390558;0.000000;0.531915;0.219780;0.352601 C;0.477064;0.531915;0.000000;0.280000;0.688679 D;0.581921;0.219780;0.280000;0.000000;0.424125 E;0.318310;0.352601;0.688679;0.424125;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31__100-100_n0/mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300331200ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.410000;0.480000;0.710000;0.410000 B;0.410000;0.000000;0.690000;0.370000;0.000000 C;0.480000;0.690000;0.000000;0.390000;0.690000 D;0.710000;0.370000;0.390000;0.000000;0.370000 E;0.410000;0.000000;0.690000;0.370000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31__100-100_n1/000077500000000000000000000000001353413740300250025ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31__100-100_n1/mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300325540ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.390558;0.477064;0.581921;0.318310 B;0.390558;0.000000;0.531915;0.219780;0.352601 
C;0.477064;0.531915;0.000000;0.280000;0.688679 D;0.581921;0.219780;0.280000;0.000000;0.424125 E;0.318310;0.352601;0.688679;0.424125;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31__100-100_n1/mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300331210ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.410000;0.480000;0.710000;0.410000 B;0.410000;0.000000;0.690000;0.370000;0.000000 C;0.480000;0.690000;0.000000;0.390000;0.690000 D;0.710000;0.370000;0.390000;0.000000;0.370000 E;0.410000;0.000000;0.690000;0.370000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31_filter_0-1000_n0/000077500000000000000000000000001353413740300261265ustar00rootroot00000000000000mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300336210ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31_filter_0-1000_n0;A;B;C;D;E A;0.000000;0.638423;0.874763;0.646590;0.607632 B;0.638423;0.000000;0.533537;0.061224;0.905759 C;0.874763;0.533537;0.000000;0.577931;0.973881 D;0.646590;0.061224;0.577931;0.000000;0.895833 E;0.607632;0.905759;0.973881;0.895833;0.000000 mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300341660ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31_filter_0-1000_n0;A;B;C;D;E A;0.000000;0.809000;0.986000;0.809000;0.484000 B;0.809000;0.000000;0.815000;0.000000;0.909000 C;0.986000;0.815000;0.000000;0.815000;0.995000 D;0.809000;0.000000;0.815000;0.000000;0.909000 E;0.484000;0.909000;0.995000;0.909000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31_filter_0-1000_n1/000077500000000000000000000000001353413740300261275ustar00rootroot00000000000000mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300336220ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31_filter_0-1000_n1;A;B;C;D;E A;0.000000;0.638423;0.874763;0.646590;0.607632 
B;0.638423;0.000000;0.533537;0.061224;0.905759 C;0.874763;0.533537;0.000000;0.577931;0.973881 D;0.646590;0.061224;0.577931;0.000000;0.895833 E;0.607632;0.905759;0.973881;0.895833;0.000000 mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300341670ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31_filter_0-1000_n1;A;B;C;D;E A;0.000000;0.809000;0.986000;0.809000;0.484000 B;0.809000;0.000000;0.815000;0.000000;0.909000 C;0.986000;0.815000;0.000000;0.815000;0.995000 D;0.809000;0.000000;0.815000;0.000000;0.909000 E;0.484000;0.909000;0.995000;0.909000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31_filter_0-100_n0/000077500000000000000000000000001353413740300260465ustar00rootroot00000000000000mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300335410ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31_filter_0-100_n0;A;B;C;D;E A;0.000000;0.646586;0.872146;0.662835;0.588435 B;0.646586;0.000000;0.556391;0.048276;0.902240 C;0.872146;0.556391;0.000000;0.588850;0.966316 D;0.662835;0.048276;0.588850;0.000000;0.879276 E;0.588435;0.902240;0.966316;0.879276;0.000000 mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300341060ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31_filter_0-100_n0;A;B;C;D;E A;0.000000;0.800000;0.950000;0.800000;0.410000 B;0.800000;0.000000;0.740000;0.000000;0.890000 C;0.950000;0.740000;0.000000;0.740000;0.970000 D;0.800000;0.000000;0.740000;0.000000;0.890000 E;0.410000;0.890000;0.970000;0.890000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31_filter_0-100_n1/000077500000000000000000000000001353413740300260475ustar00rootroot00000000000000mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300335420ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31_filter_0-100_n1;A;B;C;D;E 
A;0.000000;0.646586;0.872146;0.662835;0.588435 B;0.646586;0.000000;0.556391;0.048276;0.902240 C;0.872146;0.556391;0.000000;0.588850;0.966316 D;0.662835;0.048276;0.588850;0.000000;0.879276 E;0.588435;0.902240;0.966316;0.879276;0.000000 mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300341070ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31_filter_0-100_n1;A;B;C;D;E A;0.000000;0.800000;0.950000;0.800000;0.410000 B;0.800000;0.000000;0.740000;0.000000;0.890000 C;0.950000;0.740000;0.000000;0.740000;0.970000 D;0.800000;0.000000;0.740000;0.000000;0.890000 E;0.410000;0.890000;0.970000;0.890000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31_filter_100-1000_n0/000077500000000000000000000000001353413740300262675ustar00rootroot00000000000000mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300337620ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31_filter_100-1000_n0;A;B;C;D;E A;0.000000;0.699024;0.874763;0.646590;0.230279 B;0.699024;0.000000;0.758741;0.151412;0.799847 C;0.874763;0.758741;0.000000;0.577931;0.919315 D;0.646590;0.151412;0.577931;0.000000;0.730458 E;0.230279;0.799847;0.919315;0.730458;0.000000 mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300343270ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31_filter_100-1000_n0;A;B;C;D;E A;0.000000;0.875000;0.986000;0.809000;0.000000 B;0.875000;0.000000;0.697000;0.288000;0.875000 C;0.986000;0.697000;0.000000;0.815000;0.986000 D;0.809000;0.288000;0.815000;0.000000;0.809000 E;0.000000;0.875000;0.986000;0.809000;0.000000 
simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31_filter_100-1000_n1/000077500000000000000000000000001353413740300262705ustar00rootroot00000000000000mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300337630ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31_filter_100-1000_n1;A;B;C;D;E A;0.000000;0.699024;0.874763;0.646590;0.230279 B;0.699024;0.000000;0.758741;0.151412;0.799847 C;0.874763;0.758741;0.000000;0.577931;0.919315 D;0.646590;0.151412;0.577931;0.000000;0.730458 E;0.230279;0.799847;0.919315;0.730458;0.000000 mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300343300ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31_filter_100-1000_n1;A;B;C;D;E A;0.000000;0.875000;0.986000;0.809000;0.000000 B;0.875000;0.000000;0.697000;0.288000;0.875000 C;0.986000;0.697000;0.000000;0.815000;0.986000 D;0.809000;0.288000;0.815000;0.000000;0.809000 E;0.000000;0.875000;0.986000;0.809000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31_filter_100-100_n0/000077500000000000000000000000001353413740300262075ustar00rootroot00000000000000mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300337020ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31_filter_100-100_n0;A;B;C;D;E A;0.000000;0.659919;0.872146;0.662835;0.233645 B;0.659919;0.000000;0.815789;0.148615;0.774194 C;0.872146;0.815789;0.000000;0.588850;0.918605 D;0.662835;0.148615;0.588850;0.000000;0.709845 E;0.233645;0.774194;0.918605;0.709845;0.000000 mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300342470ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31_filter_100-100_n0;A;B;C;D;E A;0.000000;0.810000;0.950000;0.800000;0.000000 B;0.810000;0.000000;0.930000;0.190000;0.810000 C;0.950000;0.930000;0.000000;0.740000;0.950000 D;0.800000;0.190000;0.740000;0.000000;0.800000 
E;0.000000;0.810000;0.950000;0.800000;0.000000 simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31_filter_100-100_n1/000077500000000000000000000000001353413740300262105ustar00rootroot00000000000000mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300337030ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31_filter_100-100_n1;A;B;C;D;E A;0.000000;0.659919;0.872146;0.662835;0.233645 B;0.659919;0.000000;0.815789;0.148615;0.774194 C;0.872146;0.815789;0.000000;0.588850;0.918605 D;0.662835;0.148615;0.588850;0.000000;0.709845 E;0.233645;0.774194;0.918605;0.709845;0.000000 mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300342500ustar00rootroot00000000000000simka-1.5.1/tests/simkaMin/truth_simkaMin_symetrical/k31_filter_100-100_n1;A;B;C;D;E A;0.000000;0.810000;0.950000;0.800000;0.000000 B;0.810000;0.000000;0.930000;0.190000;0.810000 C;0.950000;0.930000;0.000000;0.740000;0.950000 D;0.800000;0.190000;0.740000;0.000000;0.800000 E;0.000000;0.810000;0.950000;0.800000;0.000000 simka-1.5.1/tests/simple_test.py000077500000000000000000000106171353413740300167010ustar00rootroot00000000000000 import sys, os, shutil, glob, gzip os.chdir(os.path.split(os.path.realpath(__file__))[0]) suffix = " > /dev/null 2>&1" dir = "__results__" def clear(): if os.path.exists("temp_output"): shutil.rmtree("temp_output") if os.path.exists("__results__"): shutil.rmtree("__results__") os.mkdir(dir) def decompress_simka_results(dir): result_filenames = glob.glob(os.path.join(dir, '*.csv.gz')) for filename_gz in result_filenames: #filename_gz = result_dir + "/" + filename with gzip.open(filename_gz, 'rb') as f: outFile = open(filename_gz[:-3], "w") outFile.write(f.read()) outFile.close() os.remove(filename_gz) def __test_matrices(simka_vs_truth, result_dir, truth_dir): ok = True decompress_simka_results(result_dir) result_filenames = glob.glob(os.path.join(result_dir, '*.csv')) if len(result_filenames) == 0: print("Error: 
no results") exit(1) if simka_vs_truth: truth_filenames = glob.glob(os.path.join(truth_dir, '*.csv')) else: #simka vs simka #if result_dir+"/mat_abundance_jaccard.csv" in truth_filenames: #comparing simka results vs simka results #truth_filenames.remove(result_dir+"/mat_abundance_jaccard.csv") #This distance is computed from Bray Curtis distance decompress_simka_results(truth_dir) truth_filenames = glob.glob(os.path.join(truth_dir, '*.csv')) truth_filenames.sort() result_filenames.sort() for result_filename in result_filenames: distanceName = os.path.split(result_filename)[1] for truth_filename in truth_filenames: distanceName2 = os.path.split(truth_filename)[1] if distanceName != distanceName2: continue res_file = open(result_filename, "r") truth_file = open(truth_filename, "r") #print res_file, truth_file res_str = res_file.read() truth_str = truth_file.read() res_file.close() truth_file.close() if(res_str != truth_str): print("\t- TEST ERROR: " + distanceName) ok = False return ok def test_dists(dir): if(__test_matrices(True, "__results__/" + dir, "truth/" + dir)): print("\tOK") else: print("\tFAILED") sys.exit(1) def test_parallelization(): if(__test_matrices(False, "__results__/results_resources1", "__results__/results_resources2")): print("\tOK") else: print("\tFAILED") sys.exit(1) #---------------------------------------------------------------- #---------------------------------------------------------------- #---------------------------------------------------------------- #test k=31 t=0 clear() print("TESTING k=31 t=0") command = "../build/bin/simka -in ../example/simka_input.txt -out ./__results__/results_k31_t0 -out-tmp ./temp_output -simple-dist -complex-dist -kmer-size 31 -abundance-min 0 -verbose 0" print(command) os.system(command + suffix) test_dists("results_k31_t0") #test k=21 t=0 clear() print("TESTING k=21 t=0") command = "../build/bin/simka -in ../example/simka_input.txt -out ./__results__/results_k21_t0 -out-tmp ./temp_output -simple-dist 
-complex-dist -kmer-size 21 -abundance-min 0 -verbose 0" print(command) os.system(command + suffix) test_dists("results_k21_t0") #test k=31 t=2 clear() print("TESTING k=31 t=2") command = "../build/bin/simka -in ../example/simka_input.txt -out ./__results__/results_k31_t2 -out-tmp ./temp_output -simple-dist -complex-dist -kmer-size 31 -abundance-min 2 -verbose 0" print(command) os.system(command + suffix) test_dists("results_k31_t2") #test k=21 t=2 clear() print("TESTING k=21 t=2") command = "../build/bin/simka -in ../example/simka_input.txt -out ./__results__/results_k21_t2 -out-tmp ./temp_output -simple-dist -complex-dist -kmer-size 21 -abundance-min 2 -verbose 0" print(command) os.system(command + suffix) test_dists("results_k21_t2") #test resources 1 clear() print("TESTING parallelization") command = "../build/bin/simka -in ../example/simka_input.txt -out ./__results__/results_resources1 -out-tmp ./temp_output -simple-dist -complex-dist -kmer-size 21 -abundance-min 0 -nb-cores 20 -max-memory 4000 -verbose 0" os.system(command + suffix) command = "../build/bin/simka -in ../example/simka_input.txt -out ./__results__/results_resources2 -out-tmp ./temp_output -simple-dist -complex-dist -kmer-size 21 -abundance-min 0 -nb-cores 2 -max-memory 2000 -verbose 0" os.system(command + suffix) test_parallelization() #---------------------------------------------------------------- #---------------------------------------------------------------- #---------------------------------------------------------------- clear() simka-1.5.1/tests/truth/000077500000000000000000000000001353413740300151355ustar00rootroot00000000000000simka-1.5.1/tests/truth/results_k21_t0/000077500000000000000000000000001353413740300177165ustar00rootroot00000000000000simka-1.5.1/tests/truth/results_k21_t0/mat_abundance_ab-jaccard.csv000066400000000000000000000003661353413740300252500ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.411765;0.530424;0.639230;0.208955 
B;0.411765;0.000000;0.650728;0.360401;0.000000 C;0.530424;0.650728;0.000000;0.313945;0.584892 D;0.639230;0.360401;0.313945;0.000000;0.377514 E;0.208955;0.000000;0.584892;0.377514;0.000000 simka-1.5.1/tests/truth/results_k21_t0/mat_abundance_ab-ochiai.csv000066400000000000000000000003661353413740300251150ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.233035;0.329018;0.426135;0.110593 B;0.233035;0.000000;0.481437;0.209324;0.000000 C;0.329018;0.481437;0.000000;0.177817;0.401494 D;0.426135;0.209324;0.177817;0.000000;0.224691 E;0.110593;0.000000;0.401494;0.224691;0.000000 simka-1.5.1/tests/truth/results_k21_t0/mat_abundance_ab-sorensen.csv000066400000000000000000000003661353413740300255150ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.259259;0.360936;0.469756;0.116667 B;0.259259;0.000000;0.482280;0.219811;0.000000 C;0.360936;0.482280;0.000000;0.186201;0.413320 D;0.469756;0.219811;0.186201;0.000000;0.232676 E;0.116667;0.000000;0.413320;0.232676;0.000000 simka-1.5.1/tests/truth/results_k21_t0/mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300254700ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.402985;0.512690;0.567347;0.604790 B;0.402985;0.000000;0.520000;0.233871;0.595238 C;0.512690;0.520000;0.000000;0.237705;0.808000 D;0.567347;0.233871;0.237705;0.000000;0.642336 E;0.604790;0.595238;0.808000;0.642336;0.000000 simka-1.5.1/tests/truth/results_k21_t0/mat_abundance_canberra.csv000066400000000000000000000003661353413740300250560ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.466667;0.551020;0.702899;0.466667 B;0.466667;0.000000;0.685714;0.398551;0.000000 C;0.551020;0.685714;0.000000;0.345588;0.685714 D;0.702899;0.398551;0.345588;0.000000;0.398551 E;0.466667;0.000000;0.685714;0.398551;0.000000 simka-1.5.1/tests/truth/results_k21_t0/mat_abundance_chord.csv000066400000000000000000000003661353413740300244000ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.663003;0.788829;0.888049;0.294818 
B;0.663003;0.000000;0.936652;0.598750;0.377612 C;0.788829;0.936652;0.000000;0.591969;0.812323 D;0.888049;0.598750;0.591969;0.000000;0.718781 E;0.294818;0.377612;0.812323;0.718781;0.000000 simka-1.5.1/tests/truth/results_k21_t0/mat_abundance_hellinger.csv000066400000000000000000000003661353413740300252520ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.897684;1.012605;1.049611;0.637158 B;0.897684;0.000000;1.019710;0.653186;0.632421 C;1.012605;1.019710;0.000000;0.654450;1.016203 D;1.049611;0.653186;0.654450;0.000000;0.872745 E;0.637158;0.632421;1.016203;0.872745;0.000000 simka-1.5.1/tests/truth/results_k21_t0/mat_abundance_jensenshannon.csv000066400000000000000000000003661353413740300261500ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.421799;0.490033;0.566082;0.282932 B;0.421799;0.000000;0.582440;0.387729;0.164543 C;0.490033;0.582440;0.000000;0.363881;0.532800 D;0.566082;0.387729;0.363881;0.000000;0.425073 E;0.282932;0.164543;0.532800;0.425073;0.000000 simka-1.5.1/tests/truth/results_k21_t0/mat_abundance_kulczynski.csv000066400000000000000000000003661353413740300255070ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.696970;0.757576;0.732323;0.500000 B;0.696970;0.000000;0.764706;0.534314;0.500000 C;0.757576;0.764706;0.000000;0.525510;0.755102 D;0.732323;0.534314;0.525510;0.000000;0.664384 E;0.500000;0.500000;0.755102;0.664384;0.000000 simka-1.5.1/tests/truth/results_k21_t0/mat_abundance_simka-jaccard.csv000066400000000000000000000003661353413740300257720ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.208955;0.294416;0.424490;0.167665 B;0.208955;0.000000;0.480000;0.221774;0.000000 C;0.294416;0.480000;0.000000;0.192623;0.316000 D;0.424490;0.221774;0.192623;0.000000;0.164234 E;0.167665;0.000000;0.316000;0.164234;0.000000 simka-1.5.1/tests/truth/results_k21_t0/mat_abundance_simka-jaccard_asym.csv000066400000000000000000000003661353413740300270230ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.000000;0.080808;0.141414;0.000000 
B;0.411765;0.000000;0.450980;0.068627;0.000000 C;0.510204;0.510204;0.000000;0.051020;0.510204 D;0.616438;0.328767;0.287671;0.000000;0.328767 E;0.208955;0.000000;0.268657;0.104478;0.000000 simka-1.5.1/tests/truth/results_k21_t0/mat_abundance_whittaker.csv000066400000000000000000000003661353413740300253030ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.411765;0.510513;0.627231;0.208955 B;0.411765;0.000000;0.526210;0.340451;0.202809 C;0.510513;0.526210;0.000000;0.326111;0.510966 D;0.627231;0.340451;0.326111;0.000000;0.418728 E;0.208955;0.202809;0.510966;0.418728;0.000000 simka-1.5.1/tests/truth/results_k21_t0/mat_presenceAbsence_braycurtis.csv000066400000000000000000000003661353413740300266350ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.304348;0.380282;0.541899;0.304348 B;0.304348;0.000000;0.521739;0.248869;0.000000 C;0.380282;0.521739;0.000000;0.208889;0.521739 D;0.541899;0.248869;0.208889;0.000000;0.248869 E;0.304348;0.000000;0.521739;0.248869;0.000000 simka-1.5.1/tests/truth/results_k21_t0/mat_presenceAbsence_chord.csv000066400000000000000000000003661353413740300255450ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.734443;0.830614;0.982808;0.734443 B;0.734443;0.000000;1.021397;0.686439;0.000000 C;0.830614;1.021397;0.000000;0.629238;1.021397 D;0.982808;0.686439;0.629238;0.000000;0.686439 E;0.734443;0.000000;1.021397;0.686439;0.000000 simka-1.5.1/tests/truth/results_k21_t0/mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300260350ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.466667;0.551020;0.702899;0.466667 B;0.466667;0.000000;0.685714;0.398551;0.000000 C;0.551020;0.685714;0.000000;0.345588;0.685714 D;0.702899;0.398551;0.345588;0.000000;0.398551 E;0.466667;0.000000;0.685714;0.398551;0.000000 simka-1.5.1/tests/truth/results_k21_t0/mat_presenceAbsence_kulczynski.csv000066400000000000000000000003661353413740300266540ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.233333;0.307624;0.416428;0.233333 
B;0.233333;0.000000;0.521513;0.222095;0.000000 C;0.307624;0.521513;0.000000;0.186901;0.521513 D;0.416428;0.222095;0.186901;0.000000;0.222095 E;0.233333;0.000000;0.521513;0.222095;0.000000 simka-1.5.1/tests/truth/results_k21_t0/mat_presenceAbsence_ochiai.csv000066400000000000000000000003661353413740300257020ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.269703;0.344960;0.482956;0.269703 B;0.269703;0.000000;0.521626;0.235599;0.000000 C;0.344960;0.521626;0.000000;0.197970;0.521626 D;0.482956;0.235599;0.197970;0.000000;0.235599 E;0.269703;0.000000;0.521626;0.235599;0.000000 simka-1.5.1/tests/truth/results_k21_t0/mat_presenceAbsence_simka-jaccard.csv000066400000000000000000000003661353413740300271370ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.304348;0.380282;0.541899;0.304348 B;0.304348;0.000000;0.521739;0.248869;0.000000 C;0.380282;0.521739;0.000000;0.208889;0.521739 D;0.541899;0.248869;0.208889;0.000000;0.248869 E;0.304348;0.000000;0.521739;0.248869;0.000000 simka-1.5.1/tests/truth/results_k21_t0/mat_presenceAbsence_simka-jaccard_asym.csv000066400000000000000000000003661353413740300301700ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.000000;0.083333;0.145833;0.000000 B;0.466667;0.000000;0.511111;0.077778;0.000000 C;0.531915;0.531915;0.000000;0.053191;0.531915 D;0.687023;0.366412;0.320611;0.000000;0.366412 E;0.466667;0.000000;0.511111;0.077778;0.000000 simka-1.5.1/tests/truth/results_k21_t0/mat_presenceAbsence_whittaker.csv000066400000000000000000000003661353413740300264500ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.466667;0.531915;0.687023;0.466667 B;0.466667;0.000000;0.531915;0.366412;0.000000 C;0.531915;0.531915;0.000000;0.320611;0.531915 D;0.687023;0.366412;0.320611;0.000000;0.366412 E;0.466667;0.000000;0.531915;0.366412;0.000000 
simka-1.5.1/tests/truth/results_k21_t2/000077500000000000000000000000001353413740300177205ustar00rootroot00000000000000simka-1.5.1/tests/truth/results_k21_t2/mat_abundance_ab-jaccard.csv000066400000000000000000000003661353413740300252520ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.747475;0.909091;0.747475;0.208955 B;0.747475;0.000000;0.695652;0.000000;0.761194 C;0.909091;0.695652;0.000000;0.615385;0.920398 D;0.747475;0.000000;0.615385;0.000000;0.761194 E;0.208955;0.761194;0.920398;0.761194;0.000000 simka-1.5.1/tests/truth/results_k21_t2/mat_abundance_ab-ochiai.csv000066400000000000000000000003661353413740300251170ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.497481;0.698489;0.497481;0.110593 B;0.497481;0.000000;0.448323;0.000000;0.511322 C;0.698489;0.448323;0.000000;0.379826;0.717862 D;0.497481;0.000000;0.379826;0.000000;0.511322 E;0.110593;0.511322;0.717862;0.511322;0.000000 simka-1.5.1/tests/truth/results_k21_t2/mat_abundance_ab-sorensen.csv000066400000000000000000000003661353413740300255170ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.596774;0.833333;0.596774;0.116667 B;0.596774;0.000000;0.533333;0.000000;0.614458 C;0.833333;0.533333;0.000000;0.444444;0.852535 D;0.596774;0.000000;0.444444;0.000000;0.614458 E;0.116667;0.614458;0.852535;0.614458;0.000000 simka-1.5.1/tests/truth/results_k21_t2/mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300254720ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.622951;0.867925;0.632000;0.604790 B;0.622951;0.000000;0.533333;0.061224;0.891765 C;0.867925;0.533333;0.000000;0.575758;0.965770 D;0.632000;0.061224;0.575758;0.000000;0.878505 E;0.604790;0.891765;0.965770;0.878505;0.000000 simka-1.5.1/tests/truth/results_k21_t2/mat_abundance_canberra.csv000066400000000000000000000003661353413740300250600ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.770833;0.937500;0.770833;0.466667 B;0.770833;0.000000;0.727273;0.000000;0.877778 C;0.937500;0.727273;0.000000;0.727273;0.966667 
D;0.770833;0.000000;0.727273;0.000000;0.877778 E;0.466667;0.877778;0.966667;0.877778;0.000000 simka-1.5.1/tests/truth/results_k21_t2/mat_abundance_chord.csv000066400000000000000000000003661353413740300244020ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.973395;1.136657;1.005674;0.294818 B;0.973395;0.000000;0.906626;0.314844;0.893974 C;1.136657;0.906626;0.000000;0.743856;1.113774 D;1.005674;0.314844;0.743856;0.000000;0.930404 E;0.294818;0.893974;1.113774;0.930404;0.000000 simka-1.5.1/tests/truth/results_k21_t2/mat_abundance_hellinger.csv000066400000000000000000000003661353413740300252540ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;1.017842;1.211686;1.026596;0.637158 B;1.017842;0.000000;0.946914;0.192702;1.021383 C;1.211686;0.946914;0.000000;0.902220;1.213329 D;1.026596;0.192702;0.902220;0.000000;1.030042 E;0.637158;1.021383;1.213329;1.030042;0.000000 simka-1.5.1/tests/truth/results_k21_t2/mat_abundance_jensenshannon.csv000066400000000000000000000003661353413740300261520ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.616237;0.733133;0.619517;0.282932 B;0.616237;0.000000;0.582099;0.092719;0.624404 C;0.733133;0.582099;0.000000;0.540656;0.742493 D;0.619517;0.092719;0.540656;0.000000;0.627238 E;0.282932;0.624404;0.742493;0.627238;0.000000 simka-1.5.1/tests/truth/results_k21_t2/mat_abundance_kulczynski.csv000066400000000000000000000003661353413740300255110ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.883838;0.964646;0.883838;0.500000 B;0.883838;0.000000;0.847826;0.500000;0.500000 C;0.964646;0.847826;0.000000;0.500000;0.500000 D;0.883838;0.500000;0.500000;0.000000;0.500000 E;0.500000;0.500000;0.500000;0.500000;0.000000 simka-1.5.1/tests/truth/results_k21_t2/mat_abundance_simka-jaccard.csv000066400000000000000000000003661353413740300257740ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.606557;0.849057;0.592000;0.167665 B;0.606557;0.000000;0.533333;0.000000;0.720000 C;0.849057;0.533333;0.000000;0.484848;0.904646 
D;0.592000;0.000000;0.484848;0.000000;0.714953 E;0.167665;0.720000;0.904646;0.714953;0.000000 simka-1.5.1/tests/truth/results_k21_t2/mat_abundance_simka-jaccard_asym.csv000066400000000000000000000003661353413740300270250ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.747475;0.909091;0.747475;0.000000 B;0.000000;0.000000;0.695652;0.000000;0.000000 C;0.000000;0.000000;0.000000;0.000000;0.000000 D;0.000000;0.000000;0.615385;0.000000;0.000000 E;0.208955;0.761194;0.920398;0.761194;0.000000 simka-1.5.1/tests/truth/results_k21_t2/mat_abundance_whittaker.csv000066400000000000000000000003661353413740300253050ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.747475;0.909091;0.747475;0.208955 B;0.747475;0.000000;0.695652;0.100334;0.761194 C;0.909091;0.695652;0.000000;0.615385;0.920398 D;0.747475;0.100334;0.615385;0.000000;0.761194 E;0.208955;0.761194;0.920398;0.761194;0.000000 simka-1.5.1/tests/truth/results_k21_t2/mat_presenceAbsence_braycurtis.csv000066400000000000000000000003661353413740300266370ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.627119;0.882353;0.627119;0.304348 B;0.627119;0.000000;0.571429;0.000000;0.782178 C;0.882353;0.571429;0.000000;0.571429;0.935484 D;0.627119;0.000000;0.571429;0.000000;0.782178 E;0.304348;0.782178;0.935484;0.782178;0.000000 simka-1.5.1/tests/truth/results_k21_t2/mat_presenceAbsence_chord.csv000066400000000000000000000003661353413740300255470ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;1.021065;1.224745;1.021065;0.734443 B;1.021065;0.000000;0.977514;0.000000;1.140524 C;1.224745;0.977514;0.000000;0.977514;1.278613 D;1.021065;0.000000;0.977514;0.000000;1.140524 E;0.734443;1.140524;1.278613;1.140524;0.000000 simka-1.5.1/tests/truth/results_k21_t2/mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300260370ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.770833;0.937500;0.770833;0.466667 B;0.770833;0.000000;0.727273;0.000000;0.877778 C;0.937500;0.727273;0.000000;0.727273;0.966667 
D;0.770833;0.000000;0.727273;0.000000;0.877778 E;0.466667;0.877778;0.966667;0.877778;0.000000 simka-1.5.1/tests/truth/results_k21_t2/mat_presenceAbsence_kulczynski.csv000066400000000000000000000003661353413740300266560ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.385417;0.468750;0.385417;0.233333 B;0.385417;0.000000;0.363636;0.000000;0.438889 C;0.468750;0.363636;0.000000;0.363636;0.483333 D;0.385417;0.000000;0.363636;0.000000;0.438889 E;0.233333;0.438889;0.483333;0.438889;0.000000 simka-1.5.1/tests/truth/results_k21_t2/mat_presenceAbsence_ochiai.csv000066400000000000000000000003661353413740300257040ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.521286;0.750000;0.521286;0.269703 B;0.521286;0.000000;0.477767;0.000000;0.650397 C;0.750000;0.477767;0.000000;0.477767;0.817426 D;0.521286;0.000000;0.477767;0.000000;0.650397 E;0.269703;0.650397;0.817426;0.650397;0.000000 simka-1.5.1/tests/truth/results_k21_t2/mat_presenceAbsence_simka-jaccard.csv000066400000000000000000000003661353413740300271410ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.627119;0.882353;0.627119;0.304348 B;0.627119;0.000000;0.571429;0.000000;0.782178 C;0.882353;0.571429;0.000000;0.571429;0.935484 D;0.627119;0.000000;0.571429;0.000000;0.782178 E;0.304348;0.782178;0.935484;0.782178;0.000000 simka-1.5.1/tests/truth/results_k21_t2/mat_presenceAbsence_simka-jaccard_asym.csv000066400000000000000000000003661353413740300301720ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.770833;0.937500;0.770833;0.000000 B;0.000000;0.000000;0.727273;0.000000;0.000000 C;0.000000;0.000000;0.000000;0.000000;0.000000 D;0.000000;0.000000;0.727273;0.000000;0.000000 E;0.466667;0.877778;0.966667;0.877778;0.000000 simka-1.5.1/tests/truth/results_k21_t2/mat_presenceAbsence_whittaker.csv000066400000000000000000000003661353413740300264520ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.770833;0.937500;0.770833;0.466667 B;0.770833;0.000000;0.727273;0.000000;0.877778 
C;0.937500;0.727273;0.000000;0.727273;0.966667 D;0.770833;0.000000;0.727273;0.000000;0.877778 E;0.466667;0.877778;0.966667;0.877778;0.000000 simka-1.5.1/tests/truth/results_k31_t0/000077500000000000000000000000001353413740300177175ustar00rootroot00000000000000simka-1.5.1/tests/truth/results_k31_t0/mat_abundance_ab-jaccard.csv000066400000000000000000000003661353413740300252510ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.411765;0.530424;0.639230;0.208955 B;0.411765;0.000000;0.650728;0.360401;0.000000 C;0.530424;0.650728;0.000000;0.313945;0.584892 D;0.639230;0.360401;0.313945;0.000000;0.377514 E;0.208955;0.000000;0.584892;0.377514;0.000000 simka-1.5.1/tests/truth/results_k31_t0/mat_abundance_ab-ochiai.csv000066400000000000000000000003661353413740300251160ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.233035;0.329018;0.426135;0.110593 B;0.233035;0.000000;0.481437;0.209324;0.000000 C;0.329018;0.481437;0.000000;0.177817;0.401494 D;0.426135;0.209324;0.177817;0.000000;0.224691 E;0.110593;0.000000;0.401494;0.224691;0.000000 simka-1.5.1/tests/truth/results_k31_t0/mat_abundance_ab-sorensen.csv000066400000000000000000000003661353413740300255160ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.259259;0.360936;0.469756;0.116667 B;0.259259;0.000000;0.482280;0.219811;0.000000 C;0.360936;0.482280;0.000000;0.186201;0.413320 D;0.469756;0.219811;0.186201;0.000000;0.232676 E;0.116667;0.000000;0.413320;0.232676;0.000000 simka-1.5.1/tests/truth/results_k31_t0/mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300254710ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.402985;0.512690;0.567347;0.604790 B;0.402985;0.000000;0.520000;0.233871;0.595238 C;0.512690;0.520000;0.000000;0.237705;0.808000 D;0.567347;0.233871;0.237705;0.000000;0.642336 E;0.604790;0.595238;0.808000;0.642336;0.000000 simka-1.5.1/tests/truth/results_k31_t0/mat_abundance_canberra.csv000066400000000000000000000003661353413740300250570ustar00rootroot00000000000000;A;B;C;D;E 
A;0.000000;0.466667;0.551020;0.702899;0.466667 B;0.466667;0.000000;0.685714;0.398551;0.000000 C;0.551020;0.685714;0.000000;0.345588;0.685714 D;0.702899;0.398551;0.345588;0.000000;0.398551 E;0.466667;0.000000;0.685714;0.398551;0.000000 simka-1.5.1/tests/truth/results_k31_t0/mat_abundance_chord.csv000066400000000000000000000003661353413740300244010ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.663003;0.788829;0.888049;0.294818 B;0.663003;0.000000;0.936652;0.598750;0.377612 C;0.788829;0.936652;0.000000;0.591969;0.812323 D;0.888049;0.598750;0.591969;0.000000;0.718781 E;0.294818;0.377612;0.812323;0.718781;0.000000 simka-1.5.1/tests/truth/results_k31_t0/mat_abundance_hellinger.csv000066400000000000000000000003661353413740300252530ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.897684;1.012605;1.049611;0.637158 B;0.897684;0.000000;1.019710;0.653186;0.632421 C;1.012605;1.019710;0.000000;0.654450;1.016203 D;1.049611;0.653186;0.654450;0.000000;0.872745 E;0.637158;0.632421;1.016203;0.872745;0.000000 simka-1.5.1/tests/truth/results_k31_t0/mat_abundance_jensenshannon.csv000066400000000000000000000003661353413740300261510ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.421799;0.490033;0.566082;0.282932 B;0.421799;0.000000;0.582440;0.387729;0.164543 C;0.490033;0.582440;0.000000;0.363881;0.532800 D;0.566082;0.387729;0.363881;0.000000;0.425073 E;0.282932;0.164543;0.532800;0.425073;0.000000 simka-1.5.1/tests/truth/results_k31_t0/mat_abundance_kulczynski.csv000066400000000000000000000003661353413740300255100ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.696970;0.757576;0.732323;0.500000 B;0.696970;0.000000;0.764706;0.534314;0.500000 C;0.757576;0.764706;0.000000;0.525510;0.755102 D;0.732323;0.534314;0.525510;0.000000;0.664384 E;0.500000;0.500000;0.755102;0.664384;0.000000 simka-1.5.1/tests/truth/results_k31_t0/mat_abundance_simka-jaccard.csv000066400000000000000000000003661353413740300257730ustar00rootroot00000000000000;A;B;C;D;E 
A;0.000000;0.208955;0.294416;0.424490;0.167665 B;0.208955;0.000000;0.480000;0.221774;0.000000 C;0.294416;0.480000;0.000000;0.192623;0.316000 D;0.424490;0.221774;0.192623;0.000000;0.164234 E;0.167665;0.000000;0.316000;0.164234;0.000000 simka-1.5.1/tests/truth/results_k31_t0/mat_abundance_simka-jaccard_asym.csv000066400000000000000000000003661353413740300270240ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.000000;0.080808;0.141414;0.000000 B;0.411765;0.000000;0.450980;0.068627;0.000000 C;0.510204;0.510204;0.000000;0.051020;0.510204 D;0.616438;0.328767;0.287671;0.000000;0.328767 E;0.208955;0.000000;0.268657;0.104478;0.000000 simka-1.5.1/tests/truth/results_k31_t0/mat_abundance_whittaker.csv000066400000000000000000000003661353413740300253040ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.411765;0.510513;0.627231;0.208955 B;0.411765;0.000000;0.526210;0.340451;0.202809 C;0.510513;0.526210;0.000000;0.326111;0.510966 D;0.627231;0.340451;0.326111;0.000000;0.418728 E;0.208955;0.202809;0.510966;0.418728;0.000000 simka-1.5.1/tests/truth/results_k31_t0/mat_presenceAbsence_braycurtis.csv000066400000000000000000000003661353413740300266360ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.304348;0.380282;0.541899;0.304348 B;0.304348;0.000000;0.521739;0.248869;0.000000 C;0.380282;0.521739;0.000000;0.208889;0.521739 D;0.541899;0.248869;0.208889;0.000000;0.248869 E;0.304348;0.000000;0.521739;0.248869;0.000000 simka-1.5.1/tests/truth/results_k31_t0/mat_presenceAbsence_chord.csv000066400000000000000000000003661353413740300255460ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.734443;0.830614;0.982808;0.734443 B;0.734443;0.000000;1.021397;0.686439;0.000000 C;0.830614;1.021397;0.000000;0.629238;1.021397 D;0.982808;0.686439;0.629238;0.000000;0.686439 E;0.734443;0.000000;1.021397;0.686439;0.000000 simka-1.5.1/tests/truth/results_k31_t0/mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300260360ustar00rootroot00000000000000;A;B;C;D;E 
A;0.000000;0.466667;0.551020;0.702899;0.466667 B;0.466667;0.000000;0.685714;0.398551;0.000000 C;0.551020;0.685714;0.000000;0.345588;0.685714 D;0.702899;0.398551;0.345588;0.000000;0.398551 E;0.466667;0.000000;0.685714;0.398551;0.000000 simka-1.5.1/tests/truth/results_k31_t0/mat_presenceAbsence_kulczynski.csv000066400000000000000000000003661353413740300266550ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.233333;0.307624;0.416428;0.233333 B;0.233333;0.000000;0.521513;0.222095;0.000000 C;0.307624;0.521513;0.000000;0.186901;0.521513 D;0.416428;0.222095;0.186901;0.000000;0.222095 E;0.233333;0.000000;0.521513;0.222095;0.000000 simka-1.5.1/tests/truth/results_k31_t0/mat_presenceAbsence_ochiai.csv000066400000000000000000000003661353413740300257030ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.269703;0.344960;0.482956;0.269703 B;0.269703;0.000000;0.521626;0.235599;0.000000 C;0.344960;0.521626;0.000000;0.197970;0.521626 D;0.482956;0.235599;0.197970;0.000000;0.235599 E;0.269703;0.000000;0.521626;0.235599;0.000000 simka-1.5.1/tests/truth/results_k31_t0/mat_presenceAbsence_simka-jaccard.csv000066400000000000000000000003661353413740300271400ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.304348;0.380282;0.541899;0.304348 B;0.304348;0.000000;0.521739;0.248869;0.000000 C;0.380282;0.521739;0.000000;0.208889;0.521739 D;0.541899;0.248869;0.208889;0.000000;0.248869 E;0.304348;0.000000;0.521739;0.248869;0.000000 simka-1.5.1/tests/truth/results_k31_t0/mat_presenceAbsence_simka-jaccard_asym.csv000066400000000000000000000003661353413740300301710ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.000000;0.083333;0.145833;0.000000 B;0.466667;0.000000;0.511111;0.077778;0.000000 C;0.531915;0.531915;0.000000;0.053191;0.531915 D;0.687023;0.366412;0.320611;0.000000;0.366412 E;0.466667;0.000000;0.511111;0.077778;0.000000 
simka-1.5.1/tests/truth/results_k31_t0/mat_presenceAbsence_whittaker.csv000066400000000000000000000003661353413740300264510ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.466667;0.531915;0.687023;0.466667 B;0.466667;0.000000;0.531915;0.366412;0.000000 C;0.531915;0.531915;0.000000;0.320611;0.531915 D;0.687023;0.366412;0.320611;0.000000;0.366412 E;0.466667;0.000000;0.531915;0.366412;0.000000 simka-1.5.1/tests/truth/results_k31_t2/000077500000000000000000000000001353413740300177215ustar00rootroot00000000000000simka-1.5.1/tests/truth/results_k31_t2/mat_abundance_ab-jaccard.csv000066400000000000000000000003661353413740300252530ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.747475;0.909091;0.747475;0.208955 B;0.747475;0.000000;0.695652;0.000000;0.761194 C;0.909091;0.695652;0.000000;0.615385;0.920398 D;0.747475;0.000000;0.615385;0.000000;0.761194 E;0.208955;0.761194;0.920398;0.761194;0.000000 simka-1.5.1/tests/truth/results_k31_t2/mat_abundance_ab-ochiai.csv000066400000000000000000000003661353413740300251200ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.497481;0.698489;0.497481;0.110593 B;0.497481;0.000000;0.448323;0.000000;0.511322 C;0.698489;0.448323;0.000000;0.379826;0.717862 D;0.497481;0.000000;0.379826;0.000000;0.511322 E;0.110593;0.511322;0.717862;0.511322;0.000000 simka-1.5.1/tests/truth/results_k31_t2/mat_abundance_ab-sorensen.csv000066400000000000000000000003661353413740300255200ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.596774;0.833333;0.596774;0.116667 B;0.596774;0.000000;0.533333;0.000000;0.614458 C;0.833333;0.533333;0.000000;0.444444;0.852535 D;0.596774;0.000000;0.444444;0.000000;0.614458 E;0.116667;0.614458;0.852535;0.614458;0.000000 simka-1.5.1/tests/truth/results_k31_t2/mat_abundance_braycurtis.csv000066400000000000000000000003661353413740300254730ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.622951;0.867925;0.632000;0.604790 B;0.622951;0.000000;0.533333;0.061224;0.891765 C;0.867925;0.533333;0.000000;0.575758;0.965770 
D;0.632000;0.061224;0.575758;0.000000;0.878505 E;0.604790;0.891765;0.965770;0.878505;0.000000 simka-1.5.1/tests/truth/results_k31_t2/mat_abundance_canberra.csv000066400000000000000000000003661353413740300250610ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.770833;0.937500;0.770833;0.466667 B;0.770833;0.000000;0.727273;0.000000;0.877778 C;0.937500;0.727273;0.000000;0.727273;0.966667 D;0.770833;0.000000;0.727273;0.000000;0.877778 E;0.466667;0.877778;0.966667;0.877778;0.000000 simka-1.5.1/tests/truth/results_k31_t2/mat_abundance_chord.csv000066400000000000000000000003661353413740300244030ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.973395;1.136657;1.005674;0.294818 B;0.973395;0.000000;0.906626;0.314844;0.893974 C;1.136657;0.906626;0.000000;0.743856;1.113774 D;1.005674;0.314844;0.743856;0.000000;0.930404 E;0.294818;0.893974;1.113774;0.930404;0.000000 simka-1.5.1/tests/truth/results_k31_t2/mat_abundance_hellinger.csv000066400000000000000000000003661353413740300252550ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;1.017842;1.211686;1.026596;0.637158 B;1.017842;0.000000;0.946914;0.192702;1.021383 C;1.211686;0.946914;0.000000;0.902220;1.213329 D;1.026596;0.192702;0.902220;0.000000;1.030042 E;0.637158;1.021383;1.213329;1.030042;0.000000 simka-1.5.1/tests/truth/results_k31_t2/mat_abundance_jensenshannon.csv000066400000000000000000000003661353413740300261530ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.616237;0.733133;0.619517;0.282932 B;0.616237;0.000000;0.582099;0.092719;0.624404 C;0.733133;0.582099;0.000000;0.540656;0.742493 D;0.619517;0.092719;0.540656;0.000000;0.627238 E;0.282932;0.624404;0.742493;0.627238;0.000000 simka-1.5.1/tests/truth/results_k31_t2/mat_abundance_kulczynski.csv000066400000000000000000000003661353413740300255120ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.883838;0.964646;0.883838;0.500000 B;0.883838;0.000000;0.847826;0.500000;0.500000 C;0.964646;0.847826;0.000000;0.500000;0.500000 
D;0.883838;0.500000;0.500000;0.000000;0.500000 E;0.500000;0.500000;0.500000;0.500000;0.000000 simka-1.5.1/tests/truth/results_k31_t2/mat_abundance_simka-jaccard.csv000066400000000000000000000003661353413740300257750ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.606557;0.849057;0.592000;0.167665 B;0.606557;0.000000;0.533333;0.000000;0.720000 C;0.849057;0.533333;0.000000;0.484848;0.904646 D;0.592000;0.000000;0.484848;0.000000;0.714953 E;0.167665;0.720000;0.904646;0.714953;0.000000 simka-1.5.1/tests/truth/results_k31_t2/mat_abundance_simka-jaccard_asym.csv000066400000000000000000000003661353413740300270260ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.747475;0.909091;0.747475;0.000000 B;0.000000;0.000000;0.695652;0.000000;0.000000 C;0.000000;0.000000;0.000000;0.000000;0.000000 D;0.000000;0.000000;0.615385;0.000000;0.000000 E;0.208955;0.761194;0.920398;0.761194;0.000000 simka-1.5.1/tests/truth/results_k31_t2/mat_abundance_whittaker.csv000066400000000000000000000003661353413740300253060ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.747475;0.909091;0.747475;0.208955 B;0.747475;0.000000;0.695652;0.100334;0.761194 C;0.909091;0.695652;0.000000;0.615385;0.920398 D;0.747475;0.100334;0.615385;0.000000;0.761194 E;0.208955;0.761194;0.920398;0.761194;0.000000 simka-1.5.1/tests/truth/results_k31_t2/mat_presenceAbsence_braycurtis.csv000066400000000000000000000003661353413740300266400ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.627119;0.882353;0.627119;0.304348 B;0.627119;0.000000;0.571429;0.000000;0.782178 C;0.882353;0.571429;0.000000;0.571429;0.935484 D;0.627119;0.000000;0.571429;0.000000;0.782178 E;0.304348;0.782178;0.935484;0.782178;0.000000 simka-1.5.1/tests/truth/results_k31_t2/mat_presenceAbsence_chord.csv000066400000000000000000000003661353413740300255500ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;1.021065;1.224745;1.021065;0.734443 B;1.021065;0.000000;0.977514;0.000000;1.140524 C;1.224745;0.977514;0.000000;0.977514;1.278613 
D;1.021065;0.000000;0.977514;0.000000;1.140524 E;0.734443;1.140524;1.278613;1.140524;0.000000 simka-1.5.1/tests/truth/results_k31_t2/mat_presenceAbsence_jaccard.csv000066400000000000000000000003661353413740300260400ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.770833;0.937500;0.770833;0.466667 B;0.770833;0.000000;0.727273;0.000000;0.877778 C;0.937500;0.727273;0.000000;0.727273;0.966667 D;0.770833;0.000000;0.727273;0.000000;0.877778 E;0.466667;0.877778;0.966667;0.877778;0.000000 simka-1.5.1/tests/truth/results_k31_t2/mat_presenceAbsence_kulczynski.csv000066400000000000000000000003661353413740300266570ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.385417;0.468750;0.385417;0.233333 B;0.385417;0.000000;0.363636;0.000000;0.438889 C;0.468750;0.363636;0.000000;0.363636;0.483333 D;0.385417;0.000000;0.363636;0.000000;0.438889 E;0.233333;0.438889;0.483333;0.438889;0.000000 simka-1.5.1/tests/truth/results_k31_t2/mat_presenceAbsence_ochiai.csv000066400000000000000000000003661353413740300257050ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.521286;0.750000;0.521286;0.269703 B;0.521286;0.000000;0.477767;0.000000;0.650397 C;0.750000;0.477767;0.000000;0.477767;0.817426 D;0.521286;0.000000;0.477767;0.000000;0.650397 E;0.269703;0.650397;0.817426;0.650397;0.000000 simka-1.5.1/tests/truth/results_k31_t2/mat_presenceAbsence_simka-jaccard.csv000066400000000000000000000003661353413740300271420ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.627119;0.882353;0.627119;0.304348 B;0.627119;0.000000;0.571429;0.000000;0.782178 C;0.882353;0.571429;0.000000;0.571429;0.935484 D;0.627119;0.000000;0.571429;0.000000;0.782178 E;0.304348;0.782178;0.935484;0.782178;0.000000 simka-1.5.1/tests/truth/results_k31_t2/mat_presenceAbsence_simka-jaccard_asym.csv000066400000000000000000000003661353413740300301730ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.770833;0.937500;0.770833;0.000000 B;0.000000;0.000000;0.727273;0.000000;0.000000 C;0.000000;0.000000;0.000000;0.000000;0.000000 
D;0.000000;0.000000;0.727273;0.000000;0.000000 E;0.466667;0.877778;0.966667;0.877778;0.000000 simka-1.5.1/tests/truth/results_k31_t2/mat_presenceAbsence_whittaker.csv000066400000000000000000000003661353413740300264530ustar00rootroot00000000000000;A;B;C;D;E A;0.000000;0.770833;0.937500;0.770833;0.466667 B;0.770833;0.000000;0.727273;0.000000;0.877778 C;0.937500;0.727273;0.000000;0.727273;0.966667 D;0.770833;0.000000;0.727273;0.000000;0.877778 E;0.466667;0.877778;0.966667;0.877778;0.000000 simka-1.5.1/thirdparty/000077500000000000000000000000001353413740300150175ustar00rootroot00000000000000simka-1.5.1/thirdparty/gatb-core/000077500000000000000000000000001353413740300166625ustar00rootroot00000000000000