simka-1.5.3/.devcontainer/Dockerfile

#-------------------------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See https://go.microsoft.com/fwlink/?linkid=2090316 for license information.
#-------------------------------------------------------------------------------------------------------------

FROM mcr.microsoft.com/vscode/devcontainers/base:0-buster

# This Dockerfile's base image has a non-root user with sudo access. Use the "remoteUser"
# property in devcontainer.json to use it. On Linux, the container user's GID/UIDs
# will be updated to match your local UID/GID (when using the dockerFile property).
# See https://aka.ms/vscode-remote/containers/non-root-user for details.
ARG USERNAME=vscode
ARG USER_UID=1000
ARG USER_GID=$USER_UID

# Configure apt and install packages
RUN apt-get update \
    && export DEBIAN_FRONTEND=noninteractive \
    #
    # Install C++ tools
    && apt-get -y install build-essential cmake cppcheck valgrind \
    #
    # Additions for GATB and Simka
    && apt-get -y install zlib1g-dev doxygen graphviz ack vim \
    #
    # [Optional] Update UID/GID if needed
    && if [ "$USER_GID" != "1000" ] || [ "$USER_UID" != "1000" ]; then \
        groupmod --gid $USER_GID $USERNAME \
        && usermod --uid $USER_UID --gid $USER_GID $USERNAME \
        && chown -R $USER_UID:$USER_GID /home/$USERNAME; \
    fi \
    #
    # Clean up
    && apt-get autoremove -y \
    && apt-get clean -y \
    && rm -rf /var/lib/apt/lists/*

simka-1.5.3/.devcontainer/devcontainer.json

// For format details, see https://aka.ms/vscode-remote/devcontainer.json or this file's README at:
// https://github.com/microsoft/vscode-dev-containers/tree/v0.128.0/containers/cpp
{
    "name": "GATB/Simka remote development container",
    "dockerFile": "Dockerfile",
    "runArgs": [ "--cap-add=SYS_PTRACE", "--security-opt", "seccomp=unconfined"],

    // Set *default* container specific settings.json values on container create.
    "settings": {
        "terminal.integrated.shell.linux": "/bin/bash"
    },

    // Add the IDs of extensions you want installed when the container is created.
    "extensions": [
        "ms-vscode.cpptools"
    ]

    // Use 'forwardPorts' to make a list of ports inside the container available locally.
    // "forwardPorts": [],

    // Use 'postCreateCommand' to run commands after the container is created.
    // "postCreateCommand": "gcc -v",

    // Uncomment to connect as a non-root user. See https://aka.ms/vscode-remote/containers/non-root.
// "remoteUser": "vscode" }simka-1.5.3/.gitignore000066400000000000000000000001131377312000000146000ustar00rootroot00000000000000/build/ .DS_Store /docker/simka_results/ /docker/simka_temp_output/ *.pyc simka-1.5.3/.gitlab-ci.yml000066400000000000000000000053741377312000000152620ustar00rootroot00000000000000stages: - update_simka_sq_image - analysis - aggregate - deploy variables: GIT_SUBMODULE_STRATEGY: recursive ######################################################################################################### sonar:analysis: stage: analysis image: $CI_REGISTRY_IMAGE/simka_sq:latest script: - echo "Launching sonar:analysis diagnostics..." - scripts/sonarqube_diags/analysis.sh > analysis.log - mkdir -p public/doc - mv coverage-html public/doc/lcov # for gitlabpages publication tags: #- simka-igrida - large cache: {} artifacts: paths: - build/ - analyzer_reports/ - simka-scan-build-cmake.log - simka-scan-build-make.log - gcov.xml - simka-clang-tidy-report.log - compile_commands.json - simka-cppcheck.xml - simka-rats.xml - analysis.log - public/ when: always # useful to debug expire_in: 5 days ######################################################################################################### sonar:aggregate: stage: aggregate image: $CI_REGISTRY_IMAGE/simka_sq:latest script: - echo "Launching sonar:aggregate task..." - echo "====================="; pwd; ls -atlhrsF; echo "=====================" - ls analyzer_reports/*/*.plist - sonar-scanner -X -Dsonar.login=$SONARQUBE_LOGIN -Dsonar.verbose=true &> sonar-scanner.log tags: #- simka-igrida artifacts: paths: - sonar-scanner.log when: always # useful to debug expire_in: 5 days ################################################################################################################################################################################################################## # Ref. https://gitlab.inria.fr/help/ci/docker/using_docker_build.md#making-docker-in-docker-builds-faster-with-docker-layer-caching update_simka_sq_image: stage: update_simka_sq_image image: docker tags: #- simka-igrida - large services: - docker:19.03.12-dind script: - echo "Launching update_ci_image job..." - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY - docker pull $CI_REGISTRY_IMAGE/simka_sq:latest || true - docker --version - cd scripts/sonarqube_diags - docker build -f Dockerfile.sq --cache-from $CI_REGISTRY_IMAGE/simka_sq:latest --tag $CI_REGISTRY_IMAGE/simka_sq:latest . - docker push $CI_REGISTRY_IMAGE/simka_sq:latest - docker image ls when: manual pages: image: $CI_REGISTRY_IMAGE/simka_sq:latest stage: deploy script: - mkdir -p public/doc - test -d build || mkdir build - cd build - cmake --target doc-gatb-simka .. - make doc-gatb-simka - mv doc/html ../public/doc/doxygen artifacts: paths: - public simka-1.5.3/.gitmodules000066400000000000000000000001551377312000000147730ustar00rootroot00000000000000[submodule "thirdparty/gatb-core"] path = thirdparty/gatb-core url = https://github.com/GATB/gatb-core.git simka-1.5.3/CMakeLists.txt000077500000000000000000000131561377312000000153660ustar00rootroot00000000000000project(simka) cmake_minimum_required(VERSION 2.6) ################################################################################ # The version number. 
################################################################################
SET (gatb-tool_VERSION_MAJOR 1)
SET (gatb-tool_VERSION_MINOR 5)
SET (gatb-tool_VERSION_PATCH 3)

IF (DEFINED MAJOR)
    SET (gatb-tool_VERSION_MAJOR ${MAJOR})
ENDIF()
IF (DEFINED MINOR)
    SET (gatb-tool_VERSION_MINOR ${MINOR})
ENDIF()
IF (DEFINED PATCH)
    SET (gatb-tool_VERSION_PATCH ${PATCH})
ENDIF()

set (gatb-tool-version ${gatb-tool_VERSION_MAJOR}.${gatb-tool_VERSION_MINOR}.${gatb-tool_VERSION_PATCH})

# However, continuous integration has priority over local compilation
IF (DEFINED JENKINS_TAG)
    SET (gatb-tool-version ${JENKINS_TAG})
ENDIF()

################################################################################
# Define cmake modules directory
################################################################################
SET (GATB_CORE_HOME ${PROJECT_SOURCE_DIR}/thirdparty/gatb-core/gatb-core)
SET (CMAKE_MODULE_PATH ${GATB_CORE_HOME}/cmake)

################################################################################
# THIRD PARTIES
################################################################################
# We don't want to install some GATB-CORE artifacts
SET (GATB_CORE_EXCLUDE_TOOLS 1)
SET (GATB_CORE_EXCLUDE_TESTS 1)
SET (GATB_CORE_EXCLUDE_EXAMPLES 1)

# GATB CORE
include (GatbCore)

################################################################################
# TOOL
################################################################################
# we get compilation definitions from the gatb-core part
add_definitions (${gatb-core-flags})

# we add a new compilation variable
if (PRINTALL)
    SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DPRINTALL" )
endif()

# we give the headers directories from :
# - from project source
# - from GATB-CORE source
# - from simka source
include_directories (include ${gatb-core-includes} ${PROJECT_SOURCE_DIR}/src/core ${PROJECT_SOURCE_DIR}/src/minikc ${PROJECT_SOURCE_DIR}/src)

# we generate one file per template specialization
FOREACH (KSIZE ${gatb-core-klist})
    configure_file (
        ${PROJECT_SOURCE_DIR}/src/core/SimkaAlgorithmTemplate.cpp.in
        ${PROJECT_BINARY_DIR}/src/core/template/SimkaAlgorithmTemplate_${KSIZE}.cpp
    )
ENDFOREACH ()

# we define the files to be compiled
file (GLOB_RECURSE ProjectFiles src/core/Simka* ${PROJECT_BINARY_DIR}/src/core/template/*.cpp)
file (GLOB_RECURSE SimkaMinFiles src/simkaMin/MurmurHash3.h src/simkaMin/MurmurHash3.cpp src/simkaMin/*.hpp) # ${PROJECT_BINARY_DIR}/src/core/template/*.cpp)

SET(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
set(PROJECT_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/bin)

add_executable (simka src/SimkaPotara.cpp ${ProjectFiles})
target_link_libraries (simka ${gatb-core-libraries})

add_executable (simkaCountProcess src/minikc/SimkaCountProcess.cpp ${ProjectFiles})
target_link_libraries (simkaCountProcess ${gatb-core-libraries})

add_executable (simkaCount src/SimkaCount.cpp ${ProjectFiles})
target_link_libraries (simkaCount ${gatb-core-libraries})

add_executable (simkaMerge src/SimkaMerge.cpp ${ProjectFiles})
target_link_libraries (simkaMerge ${gatb-core-libraries})

add_executable (simkaMinCore src/simkaMin/SimkaMin.cpp ${SimkaMinFiles})
target_link_libraries (simkaMinCore ${gatb-core-libraries})

################################################################################
# DOCUMENTATION GENERATION
################################################################################
IF (EXISTS "${PROJECT_SOURCE_DIR}/doc")
    ADD_SUBDIRECTORY(doc EXCLUDE_FROM_ALL)
ENDIF()
################################################################################
# PACKAGING
################################################################################
SET (CPACK_PACKAGE_DESCRIPTION_SUMMARY "gatb-tool ${PROJECT_NAME}")
SET (CPACK_PACKAGE_VENDOR "Genscale team (INRIA)")
SET (CPACK_PACKAGE_VERSION_MAJOR "${gatb-tool_VERSION_MAJOR}")
SET (CPACK_PACKAGE_VERSION_MINOR "${gatb-tool_VERSION_MINOR}")
SET (CPACK_PACKAGE_VERSION_PATCH "${gatb-tool_VERSION_PATCH}")
SET (CPACK_PACKAGE_VERSION "${gatb-tool-version}")

# We chose the kind of archive we want to generate
SET (CPACK_GENERATOR "TGZ")
SET (CPACK_SOURCE_GENERATOR "TGZ")

# We ignore unwanted files for the source archive
SET (CPACK_SOURCE_IGNORE_FILES
    "^${PROJECT_SOURCE_DIR}/\\.git/" ;
    "^${PROJECT_SOURCE_DIR}/\\.gitmodules" ;
    "^${PROJECT_SOURCE_DIR}/\\.gitignore" ;
    "^${PROJECT_SOURCE_DIR}/build/" ;
    "^${PROJECT_SOURCE_DIR}/dependency/" ;
    "^${GATB_CORE_HOME}/\\.cproject" ;
    "^${GATB_CORE_HOME}/\\.git/" ;
    "^${GATB_CORE_HOME}/\\.project" ;
    "^${GATB_CORE_HOME}/\\.gitignore"
)

# For creating the BINARY package we include the files we want
INSTALL (DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin DESTINATION .)
INSTALL (DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/example DESTINATION .)
INSTALL (DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/simkaMin DESTINATION .)
INSTALL (DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/scripts DESTINATION . FILES_MATCHING REGEX ".*\\.(py|r)$" PATTERN "jenkins" EXCLUDE)
INSTALL (FILES ${CMAKE_CURRENT_SOURCE_DIR}/README.md DESTINATION .)
INSTALL (FILES ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE DESTINATION .)

# We include the "bin" tag into binary archive file name
set (CPACK_PACKAGE_FILE_NAME ${PROJECT_NAME}-${CPACK_PACKAGE_VERSION}-bin-${CMAKE_SYSTEM_NAME})

# To be done at the end.
INCLUDE (CPack)

simka-1.5.3/INSTALL

# CMake is required to compile simka (http://www.cmake.org/cmake/resources/software.html)
#
# you can install simka by executing this file: sh INSTALL
#

# Prepare GATB sub-module
git submodule init
git submodule update

# Prepare directories:
rm -rf build
mkdir build

# Go in the 'build' directory
cd build

# Prepare the makefile
cmake ..

# Run the newly created makefile:
make -j8

simka-1.5.3/LICENSE

GNU AFFERO GENERAL PUBLIC LICENSE
Version 3, 19 November 2007

Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>

Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.

Preamble

The GNU Affero General Public License is a free, copyleft license for software and other kinds of works, specifically designed to ensure cooperation with the community in the case of network server software.

The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, our General Public Licenses are intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users.

When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things.
Developers that use our General Public Licenses protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License which gives you legal permission to copy, distribute and/or modify the software. A secondary benefit of defending all users' freedom is that improvements made in alternate versions of the program, if they receive widespread use, become available for other developers to incorporate. Many developers of free software are heartened and encouraged by the resulting cooperation. However, in the case of software used on network servers, this result may fail to come about. The GNU General Public License permits making a modified version and letting the public access it on a server without ever releasing its source code to the public. The GNU Affero General Public License is designed specifically to ensure that, in such cases, the modified source code becomes available to the community. It requires the operator of a network server to provide the source code of the modified version running there to the users of that server. Therefore, public use of a modified version, on a publicly accessible server, gives the public access to the source code of the modified version. An older license, called the Affero General Public License and published by Affero, was designed to accomplish similar goals. This is a different license, not a version of the Affero GPL, but Affero has released a new version of the Affero GPL which permits relicensing under this license. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU Affero General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. 
The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 
No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. 
You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. 
"Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. 
Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. 
If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. "Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Remote Network Interaction; Use with the GNU General Public License. 
Notwithstanding any other provision of this License, if you modify the Program, your modified version must prominently offer all users interacting with it remotely through a computer network (if your version supports such interaction) an opportunity to receive the Corresponding Source of your version by providing access to the Corresponding Source from a network server at no charge, through some standard or customary means of facilitating copying of software. This Corresponding Source shall include the Corresponding Source for any work covered by version 3 of the GNU General Public License that is incorporated pursuant to the following paragraph. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the work with which it is combined will remain governed by version 3 of the GNU General Public License. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU Affero General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU Affero General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU Affero General Public License, you may choose any version ever published by the Free Software Foundation. If the Program specifies that a proxy can decide which future versions of the GNU Affero General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.

17. Interpretation of Sections 15 and 16.

If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee.

END OF TERMS AND CONDITIONS

How to Apply These Terms to Your New Programs

If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms.

To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found.

    <one line to give the program's name and a brief idea of what it does.>
    Copyright (C) <year>  <name of author>

    This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.

Also add information on how to contact you by electronic and paper mail.

If your software can interact with users remotely through a computer network, you should also make sure that it provides a way for users to get its source. For example, if your program is a web application, its interface could display a "Source" link that leads users to an archive of the code. There are many ways you could offer source, and different solutions will be better for different programs; see section 13 for the specific requirements.

You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU AGPL, see <https://www.gnu.org/licenses/>.

simka-1.5.3/README.md
Table of contents

[[_TOC_]]
# Simka & SimkaMin

[![License](http://img.shields.io/:license-affero-blue.svg)](http://www.gnu.org/licenses/agpl-3.0.en.html)

> This directory stores Simka and SimkaMin software. This readme focuses on Simka features. All information about SimkaMin is located in the [simkaMin](simkaMin/) directory.

# Continuous integration status (master branch)

### Build status

| **Linux** | **Mac OSX** |
|-----------|-------------|
| [![Build Status](https://ci.inria.fr/gatb-core/view/Simka/job/tool-simka-build-debian7-64bits-gcc-4.7-gitlab/badge/icon)](https://ci.inria.fr/gatb-core/view/Simka/job/tool-simka-build-debian7-64bits-gcc-4.7-gitlab/) | [![Build Status](https://ci.inria.fr/gatb-core/view/Simka/job/tool-simka-build-macos-10.9.5-gcc-4.2.1-gitlab/badge/icon)](https://ci.inria.fr/gatb-core/view/Simka/job/tool-simka-build-macos-10.9.5-gcc-4.2.1-gitlab/) |

### SonarQube metrics
[![Lines of code](https://sonarqube.inria.fr/sonarqube/api/badges/measure?key=genscale:gatb:tools:simka:gitlab:master&metric=ncloc)](https://sonarqube.inria.fr/sonarqube/component_measures?id=genscale%3Agatb%3Atools%3Asimka%3Agitlab%3Amaster&metric=ncloc)
[![Comment line density](https://sonarqube.inria.fr/sonarqube/api/badges/measure?key=genscale:gatb:tools:simka:gitlab:master&metric=comment_lines_density)](https://sonarqube.inria.fr/sonarqube/component_measures?id=genscale%3Agatb%3Atools%3Asimka%3Agitlab%3Amaster&metric=comment_lines_density)
[![Coverage](https://sonarqube.inria.fr/sonarqube/api/badges/measure?key=genscale:gatb:tools:simka:gitlab:master&metric=coverage)](https://sonarqube.inria.fr/sonarqube/component_measures?id=genscale%3Agatb%3Atools%3Asimka%3Agitlab%3Amaster&metric=coverage)
[![Bugs](https://sonarqube.inria.fr/sonarqube/api/badges/measure?key=genscale:gatb:tools:simka:gitlab:master&metric=bugs)](https://sonarqube.inria.fr/sonarqube/component_measures?id=genscale%3Agatb%3Atools%3Asimka%3Agitlab%3Amaster&metric=bugs)
[![Vulnerabilities](https://sonarqube.inria.fr/sonarqube/api/badges/measure?key=genscale:gatb:tools:simka:gitlab:master&metric=vulnerabilities)](https://sonarqube.inria.fr/sonarqube/component_measures?id=genscale%3Agatb%3Atools%3Asimka%3Agitlab%3Amaster&metric=vulnerabilities)
[![Code Smells](https://sonarqube.inria.fr/sonarqube/api/badges/measure?key=genscale:gatb:tools:simka:gitlab:master&metric=code_smells)](https://sonarqube.inria.fr/sonarqube/component_measures?id=genscale%3Agatb%3Atools%3Asimka%3Agitlab%3Amaster&metric=code_smells)
[![New Bugs](https://sonarqube.inria.fr/sonarqube/api/badges/measure?key=genscale:gatb:tools:simka:gitlab:master&metric=new_bugs)](https://sonarqube.inria.fr/sonarqube/component_measures?id=genscale%3Agatb%3Atools%3Asimka%3Agitlab%3Amaster&metric=new_bugs)
[![New Vulnerabilities](https://sonarqube.inria.fr/sonarqube/api/badges/measure?key=genscale:gatb:tools:simka:gitlab:master&metric=new_vulnerabilities)](https://sonarqube.inria.fr/sonarqube/component_measures?id=genscale%3Agatb%3Atools%3Asimka%3Agitlab%3Amaster&metric=new_vulnerabilities)
[![New Code Smells](https://sonarqube.inria.fr/sonarqube/api/badges/measure?key=genscale:gatb:tools:simka:gitlab:master&metric=new_code_smells)](https://sonarqube.inria.fr/sonarqube/component_measures?id=genscale%3Agatb%3Atools%3Asimka%3Agitlab%3Amaster&metric=new_code_smells)
# What is Simka?

Simka is a de novo comparative metagenomics tool. Simka represents each dataset as a k-mer spectrum and computes several classical ecological distances between them.

Developer: [Gaëtan Benoit](http://people.rennes.inria.fr/Gaetan.Benoit/), PhD, former member of the [Genscale](http://team.inria.fr/genscale/) team at Inria.

Contact: claire dot lemaitre at inria dot fr

# References

* Simka: Benoit G, Peterlongo P, Mariadassou M, Drezen E, Schbath S, Lavenier D, Lemaitre C. (2016) [Multiple comparative metagenomics using multiset k-mer counting](https://doi.org/10.7717/peerj-cs.94). PeerJ Computer Science 2:e94
* SimkaMin: Gaetan Benoit, Mahendra Mariadassou, Stéphane Robin, Sophie Schbath, Pierre Peterlongo, Claire Lemaitre. [SimkaMin: fast and resource frugal de novo comparative metagenomics](https://academic.oup.com/bioinformatics/advance-article/doi/10.1093/bioinformatics/btz685/5559271). Bioinformatics, https://doi.org/10.1093/bioinformatics/btz685
* Benoit G (2017) [Large scale de novo comparative metagenomics (PhD thesis in french)](https://tel.archives-ouvertes.fr/tel-01659395v2/).

# Install a binary release of simka

Retrieve the binary archive file from one of the official simka releases (see the "Releases" tab on the Github web page of the simka project); the file name is "simka-xyz-bin-Darwin.tar.gz" or "simka-xyz-bin-Linux.tar.gz" (where xyz is a release version). Then, from the command-line:

    gunzip simka-xyz-bin-Darwin.tar.gz
    tar -xf simka-xyz-bin-Darwin.tar
    cd simka-xyz-Darwin
    chmod +x bin/* example/*.sh

The simka binary is in the "bin" folder. You can try the software on your computer as follows:

    cd example
    ./simple_test.sh

In case the software does not run appropriately on your system, consider installing it from its source code, as explained below.

For further instructions on using simka, see User Manual, below.

# Install simka from source code: git clone

Requirements: cmake 2.6+ and gcc 4.4.7+ (Linux) or clang 4.1+ (Mac OSX).

From the command-line:

    git clone https://github.com/GATB/simka.git
    cd simka
    sh INSTALL

See the INSTALL file for more information.

Then, you can try the software on your computer as follows:

    cd example
    ./simple_test.sh

The installation creates 4 executables (in the ./build/bin directory):

    simka: main software to be used for your analysis
    simkaCount, simkaMerge and simkaCountProcess: not to be used directly, called by 'simka'

All four executables must stay in the same folder; if you want to move them elsewhere on your system, keep them together.

For further instructions on using simka, see User Manual, below.

# Install simka from source code: using a source release archive

Requirements: cmake 2.6+ and gcc 4.5+ (Linux) or clang 4.1+ (Mac OSX).

Retrieve the source code archive file from one of the official simka releases (see the "Releases" tab on the Github web page of the simka project); the file name is "simka-xyz-Source.tar.gz" (where xyz is a release version). Then, from the command-line:

    gunzip simka-xyz-Source.tar.gz
    tar -xf simka-xyz-Source.tar
    cd simka-xyz-Source
    sh INSTALL

Then, you can try the software on your computer as follows:

    cd example
    ./simple_test.sh

For further instructions on using simka, see User Manual, below.
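If `sh INSTALL` does not fit your setup (for instance you want to pass your own CMake flags), it is just a thin wrapper around the steps below, taken from the INSTALL file shipped at the repository root. A minimal sketch; the `-j8` parallelism and the `-DCMAKE_BUILD_TYPE` flag are examples, and the submodule steps only apply to a git clone (a source release archive already ships gatb-core under thirdparty/):

```bash
# Manual equivalent of 'sh INSTALL'
git submodule init
git submodule update        # fetch the bundled gatb-core (git clone only)

rm -rf build && mkdir build && cd build
cmake ..                    # add your own flags here, e.g. -DCMAKE_BUILD_TYPE=Release (example)
make -j8                    # executables land in build/bin/
```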
# Changelog

* version 1.5.1  Sept 05, 2019:
    - simkaMin: easier usage of simkaMin, useful for conda packaging
* version 1.5  Jun 07, 2019:
    - simkaMin software: faster results by subsampling the k-mer space
* version 1.4  Jun 21, 2017:
    - update gatb-core to version 1.2.2
    - simka now provides gz-compressed results
    - new scripts for result visualization
* version 1.3.2  Oct 25, 2016:
    - improve memory usage of symmetrical distances
    - option -data-info to compute information on the input data (number of reads per dataset...)
    - intermediate merge sort passes to handle large numbers of datasets
    - prevent distances from producing NaN values
    - fix a bug that occurred during k-mer counting
* version 1.3.0  July 29, 2016:
    - Bray-Curtis computed by default
    - Better k-mer statistics
    - Fix bug in script for creating heatmaps
    - Add "all in memory" k-mer counter when k <= 15
    - Fine-grained parallelization for computing distances
    - Clean all memory leaks with valgrind
    - Update help messages
    - Redirect stdout and stderr of parallel processes to specific log files
* version 1.0.1  March 16, 2016: minor updates and bug fixes, first release on Github
* version 1  Feb 16, 2016: stable version
* version 0.1  May 28, 2015: initial public release

# User manual

## Description

Simka computes several classical ecological distances between N (metagenomic) read sets based on k-mer counts. Simka is implemented with the GATB library (http://gatb.inria.fr/).

## Input

The input file (-in) lists the datasets. These datasets can be in fasta or fastq format, optionally gzip-compressed (.gz).

One dataset per line, with the following syntax (any number of spaces and/or tabs may surround the separators):

    ID1: filename.fasta
    ID2: filename.fasta
    ID3: filename.fasta

The dataset ID is the name that will appear in the headers of the distance matrices.

You can find a simka input file in the example directory: ./example/data/simka_input.txt

If a given dataset has been split into several parts, Simka can automatically concatenate them:

    ID1: filename_part1.fasta , filename_part2.fasta , ...

If you have paired files, you can list them separated by a ';':

    ID1: filename_pair1.fasta ; filename_pair2.fasta

You can combine concatenation and pairing:

    ID1: filename_part1_pair1.fasta , filename_part2_pair1.fasta ; filename_part1_pair2.fasta , filename_part2_pair2.fasta

The paired syntax is only useful if the -max-reads option of Simka is set. Example: if -max-reads is set to 100, then Simka will consider the first 100 reads of the first paired file and the first 100 reads of the second paired file…

## Output

### Temporary output

The option -out-tmp controls where the temporary files of Simka will be stored. This option is mandatory since the disk usage of Simka can be high depending on the input size. This option must target a directory on your fastest disk with some free space.

One may want to add new datasets to existing Simka results without recomputing everything (for instance, if your metagenomic project is incomplete). This can only be achieved by keeping those temporary files on disk, using the -keep-tmp option of Simka.

### Result output

Simka results are distance matrices. A distance matrix is a square matrix of size N (where N is the number of input datasets). Each value in the matrix gives the distance between a pair of datasets. These values are usually in the range [0, 1]. A distance value of 0 means that the pair of datasets is perfectly similar. The higher the distance value is, the more dissimilar the pair of datasets.

Simka results are stored in the directory indicated by the -out option.

By default, Simka computes an abundance-based Bray-Curtis distance matrix and a presence-absence-based Jaccard distance matrix. The option -simple-dist adds more ecological distances which are fast to compute (Chord, Hellinger, Kulczynski...). The option -complex-dist adds other ecological distances which can take very long to compute (Jensen-Shannon, Canberra, Whittaker...).

The matrix names follow this template: mat_[abundance|presenceAbsence]_[distanceName].csv.gz

The distance matrices whose name contains 'simka' are distances introduced by the comparead method. These distances have the advantage of having both a symmetrical and an asymmetrical version.
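To take a quick look at one of these gz-compressed matrices from the command line, a small sketch; the concrete file name is an assumption following the template above, and the ';' field separator is also an assumption (it matches the metadata CSV example below):

```bash
# Peek at the top-left corner of a result matrix.
# File name assumed from the mat_[abundance|presenceAbsence]_[distanceName].csv.gz
# template above; the ';' separator is an assumption.
zcat results/mat_abundance_braycurtis.csv.gz | head -n 4 | cut -d ';' -f 1-4
```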
## Visualize simka results

Simka results can be visualized through heatmaps, hierarchical clustering and PCA (MDS or PCoA, to be exact).

Requirements: R; the gplots package (only for heatmaps).

Use the script run-visualization.py (located in the "scripts/visualization" folder). Example:

```bash
python run-visualization.py -in simka_results_dir -out output_figures_dir -pca -heatmap -tree
```

where simka_results_dir is the folder containing the distance matrices of Simka (-out).

Figures can be annotated by providing a metadata table in standard CSV format:

```
DATASET_ID;VARIABLE_NAME_1;VARIABLE_NAME_2
A;1;aquatic
B;1;human
C;2;human
D;2;soil
E;3;soil
```

An example of this table is given at ./example/dataset_metadata.csv

Dataset IDs in the metadata table must match the dataset IDs in the simka distance matrices.

Add the following options to activate annotations:

```
-metadata-in: filename of a metadata table
-metadata-variable: the name of the variable (column) that you want to display in figures, for instance VARIABLE_NAME_1 in the example above
```

Visualization example commands are given when running the simka example (./example/simple_test.sh).

## Usage for simka

To see simka's in-line help:

```bash
./bin/simka
```

## Simka command examples

Run the toy example:

```bash
./bin/simka -in example/simka_input.txt -out results -out-tmp temp_output
```

Compute all the distances that Simka can provide (Bray-Curtis, Jensen-Shannon…):

```bash
./bin/simka … -simple-dist -complex-dist
```

Change the k-mer size:

```bash
./bin/simka … -kmer-size 31
```

Filter k-mers seen only once (potentially erroneous) and very high-abundance k-mers (potentially contaminants):

```bash
./bin/simka … -abundance-min 2 -abundance-max 200
```

Filter on the read sequences and k-mers: minimum read size of 90, discard low-complexity reads and k-mers (Shannon index < 1.5):

```bash
./bin/simka … -min-read-size 90 -read-shannon-index 1.5 -kmer-shannon-index 1.5
```

Consider a subset of the reads of the input datasets (for datasets with non-uniform numbers of reads per sample). Consider all the reads of each sample (default):

```bash
./bin/simka … -max-reads -1
```

Let Simka automatically compute the maximum number of reads per sample (normalization):

```bash
./bin/simka … -max-reads 0
```

Use only the first 1000 reads of each sample:

```bash
./bin/simka … -max-reads 1000
```

Allowing more memory and cores improves the execution time:

```bash
./bin/simka … -max-memory 20000 -nb-cores 8
```
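Putting the pieces together, here is an end-to-end sketch: it builds an input file in the 'ID: filename' syntax from the Input section, runs Simka, then draws the figures. The reads/ layout and the my_* names are hypothetical, and invoking run-visualization.py by its full path is an assumption (the manual runs it from its own folder):

```bash
# Hypothetical layout: one gzipped fastq per sample in ./reads/
rm -f my_input.txt
for f in reads/*.fastq.gz; do
    id=$(basename "$f" .fastq.gz)
    echo "${id}: ${f}" >> my_input.txt   # one dataset per line, 'ID: filename'
done

./bin/simka -in my_input.txt -out my_results -out-tmp my_tmp \
    -abundance-min 2 -max-memory 8000 -nb-cores 4

python scripts/visualization/run-visualization.py \
    -in my_results -out my_figures -pca -heatmap -tree
```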
You must provide the filenames of two job templates, one for counting and one for merging (-count-file and -merge-file). There are examples of such templates in the folder 'example/potara_job'. You must also provide a submission command for each kind of job (-count-cmd and -merge-cmd). Example for SGE:

```bash
-count-cmd 'qsub -pe make 8' -merge-cmd qsub
```

The options -max-count and -max-merge control the maximum number of simultaneous jobs. They have to be set if your system restricts the maximum number of jobs. Command example:

```bash
./bin/simka … -count-file example/potara_job/sge/job_count -merge-file example/potara_job/sge/job_merge \
-count-cmd 'qsub -pe make 34' -merge-cmd qsub \
-max-count 6 -max-merge 18 -nb-cores 200 -max-memory 500000
```

Simka will run a maximum of 6 simultaneous counting jobs, each using 200/6 cores and 500000/6 MB of memory. Simka will run a maximum of 18 simultaneous merging jobs. A merging job cannot run on more than 1 core and uses very little memory. By default, Simka uses -nb-cores/2 simultaneous counting jobs and -nb-cores simultaneous merging jobs.

## Possible issues with Simka

### TOO MANY OPEN FILES

Simka is a disk-based method. Depending on the chosen options (-nb-cores, -max-memory), Simka may need to keep a large number of files open at the same time. You can fix this issue in two ways:

* increase the maximum open-files limit imposed by your system: ulimit -n maxFiles
* reduce the number of files opened by Simka by using the options -max-count and -max-merge

simka-1.5.3/doc/CMakeLists.txt

find_package (Doxygen)
if (DOXYGEN_FOUND)
  CONFIGURE_FILE (${CMAKE_CURRENT_SOURCE_DIR}/doxygen/gatb-simka.doxyfile ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile @ONLY)
  ADD_CUSTOM_TARGET (doc-gatb-simka ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile )
endif (DOXYGEN_FOUND)

simka-1.5.3/doc/doxygen/doxygen.css

/* The standard CSS for doxygen $doxygenversion */ body, table, div, p, dl { font: 400 14px/19px Roboto,sans-serif; } /* @group Heading Levels */ h1.groupheader { font-size: 150%; } .title { font-size: 250%; font-weight: bold; margin: 10px 2px; } h2.groupheader { border-bottom: 1px solid ##99; color: ##44; font-size: 150%; font-weight: normal; margin-top: 1.75em; padding-top: 8px; padding-bottom: 4px; width: 100%; } h3.groupheader { font-size: 100%; } h1, h2, h3, h4, h5, h6 { -webkit-transition: text-shadow 0.5s linear; -moz-transition: text-shadow 0.5s linear; -ms-transition: text-shadow 0.5s linear; -o-transition: text-shadow 0.5s linear; transition: text-shadow 0.5s linear; margin-right: 15px; } h1.glow, h2.glow, h3.glow, h4.glow, h5.glow, h6.glow { text-shadow: 0 0 15px cyan; } dt { font-weight: bold; } div.multicol { -moz-column-gap: 1em; -webkit-column-gap: 1em; -moz-column-count: 3; -webkit-column-count: 3; } p.startli, p.startdd, p.starttd { margin-top: 2px; } p.endli { margin-bottom: 0px; } p.enddd { margin-bottom: 4px; } p.endtd { margin-bottom: 2px; } /* @end */ caption { font-weight: bold; } span.legend { font-size: 70%; text-align: center; } h3.version { font-size: 90%; text-align: center; } div.qindex, div.navtab{ background-color: ##ee; border: 1px solid ##b0; text-align: center; } div.qindex, div.navpath { width: 100%; line-height:
140%; } div.navtab { margin-right: 15px; } /* @group Link Styling */ a { color: ##50; font-weight: normal; text-decoration: none; } .contents a:visited { color: ##60; } a:hover { text-decoration: underline; } a.qindex { font-weight: bold; } a.qindexHL { font-weight: bold; background-color: ##AA; color: #ffffff; border: 1px double ##98; } .contents a.qindexHL:visited { color: #ffffff; } a.el { font-weight: bold; } a.elRef { } a.code, a.code:visited { color: #4665A2; } a.codeRef, a.codeRef:visited { color: #4665A2; } /* @end */ dl.el { margin-left: -1cm; } pre.fragment { border: 1px solid #C4CFE5; background-color: #FBFCFD; padding: 4px 6px; margin: 4px 8px 4px 2px; overflow: auto; word-wrap: break-word; font-size: 9pt; line-height: 125%; font-family: monospace, fixed; font-size: 105%; } div.fragment { padding: 10px; margin: 10px; background-color: #FFFEFA; border: 1px solid #C4CFE5; } div.line { font-family: monospace, fixed; font-size: 13px; min-height: 13px; line-height: 1.0; text-wrap: unrestricted; white-space: -moz-pre-wrap; /* Moz */ white-space: -pre-wrap; /* Opera 4-6 */ white-space: -o-pre-wrap; /* Opera 7 */ white-space: pre-wrap; /* CSS3 */ word-wrap: break-word; /* IE 5.5+ */ text-indent: -53px; padding-left: 53px; padding-bottom: 0px; margin: 0px; -webkit-transition-property: background-color, box-shadow; -webkit-transition-duration: 0.5s; -moz-transition-property: background-color, box-shadow; -moz-transition-duration: 0.5s; -ms-transition-property: background-color, box-shadow; -ms-transition-duration: 0.5s; -o-transition-property: background-color, box-shadow; -o-transition-duration: 0.5s; transition-property: background-color, box-shadow; transition-duration: 0.5s; } div.line.glow { background-color: cyan; box-shadow: 0 0 10px cyan; } span.lineno { padding-right: 4px; text-align: right; border-right: 2px solid #0F0; background-color: #E8E8E8; white-space: pre; } span.lineno a { background-color: #D8D8D8; } span.lineno a:hover { background-color: #C8C8C8; } div.ah { background-color: black; font-weight: bold; color: #ffffff; margin-bottom: 3px; margin-top: 3px; padding: 0.2em; border: solid thin #333; border-radius: 0.5em; -webkit-border-radius: .5em; -moz-border-radius: .5em; box-shadow: 2px 2px 3px #999; -webkit-box-shadow: 2px 2px 3px #999; -moz-box-shadow: rgba(0, 0, 0, 0.15) 2px 2px 2px; background-image: -webkit-gradient(linear, left top, left bottom, from(#eee), to(#000),color-stop(0.3, #444)); background-image: -moz-linear-gradient(center top, #eee 0%, #444 40%, #000); } div.groupHeader { margin-left: 16px; margin-top: 12px; font-weight: bold; } div.groupText { margin-left: 16px; font-style: italic; } body { background-color: white; color: black; margin: 0; } div.contents { margin-top: 10px; margin-left: 12px; margin-right: 8px; } td.indexkey { background-color: ##ee; font-weight: bold; border: 1px solid ##cc; margin: 2px 0px 2px 0; padding: 2px 10px; white-space: nowrap; vertical-align: top; } td.indexvalue { background-color: ##ee; border: 1px solid ##cc; padding: 2px 10px; margin: 2px 0px; } tr.memlist { background-color: ##f0; } p.formulaDsp { text-align: center; } img.formulaDsp { } img.formulaInl { vertical-align: middle; } div.center { text-align: center; margin-top: 0px; margin-bottom: 0px; padding: 0px; } div.center img { border: 0px; } address.footer { text-align: right; padding-right: 12px; } img.footer { border: 0px; vertical-align: middle; } /* @group Code Colorization */ span.keyword { color: #545454 } span.keywordtype { color: #545454 } 
span.keywordflow { color: #545454 } span.comment { color: #2B7A2B } span.preprocessor { color: #545454 } span.stringliteral { color: #545454 } span.charliteral { color: #545454 } span.vhdldigit { color: #ff00ff } span.vhdlchar { color: #000000 } span.vhdlkeyword { color: #700070 } span.vhdllogic { color: #ff0000 } blockquote { background-color: ##F8; border-left: 2px solid ##FF; margin: 0 24px 0 4px; padding: 0 12px 0 16px; } /* @end */ /* .search { color: #003399; font-weight: bold; } form.search { margin-bottom: 0px; margin-top: 0px; } input.search { font-size: 75%; color: #000080; font-weight: normal; background-color: #e8eef2; } */ td.tiny { font-size: 75%; } .dirtab { padding: 4px; border-collapse: collapse; border: 1px solid ##b0; } th.dirtab { background: ##ee; font-weight: bold; } hr { height: 0px; border: none; border-top: 1px solid ##66; } hr.footer { height: 1px; } /* @group Member Descriptions */ table.memberdecls { border-spacing: 0px; padding: 0px; } .memberdecls td, .fieldtable tr { -webkit-transition-property: background-color, box-shadow; -webkit-transition-duration: 0.5s; -moz-transition-property: background-color, box-shadow; -moz-transition-duration: 0.5s; -ms-transition-property: background-color, box-shadow; -ms-transition-duration: 0.5s; -o-transition-property: background-color, box-shadow; -o-transition-duration: 0.5s; transition-property: background-color, box-shadow; transition-duration: 0.5s; } .memberdecls td.glow, .fieldtable tr.glow { background-color: cyan; box-shadow: 0 0 15px cyan; } .mdescLeft, .mdescRight, .memItemLeft, .memItemRight, .memTemplItemLeft, .memTemplItemRight, .memTemplParams { background-color: ##FA; border: none; margin: 4px; padding: 1px 0 0 8px; } .mdescLeft, .mdescRight { padding: 0px 8px 4px 8px; color: #555; } .memSeparator { border-bottom: 1px solid #DEE4F0; line-height: 1px; margin: 0px; padding: 0px; } .memItemLeft, .memTemplItemLeft { white-space: nowrap; } .memItemRight { width: 100%; } .memTemplParams { color: ##60; white-space: nowrap; font-size: 80%; } /* @end */ /* @group Member Details */ /* Styles for detailed member documentation */ .memtemplate { font-size: 80%; color: ##60; font-weight: normal; margin-left: 9px; } .memnav { background-color: ##ee; border: 1px solid ##b0; text-align: center; margin: 2px; margin-right: 15px; padding: 2px; } .mempage { width: 100%; } .memitem { padding: 0; margin-bottom: 10px; margin-right: 5px; -webkit-transition: box-shadow 0.5s linear; -moz-transition: box-shadow 0.5s linear; -ms-transition: box-shadow 0.5s linear; -o-transition: box-shadow 0.5s linear; transition: box-shadow 0.5s linear; display: table !important; width: 100%; } .memitem.glow { box-shadow: 0 0 15px cyan; } .memname { font-weight: bold; margin-left: 6px; } .memname td { vertical-align: bottom; } .memproto, dl.reflist dt { border-top: 1px solid ##B4; border-left: 1px solid ##B4; border-right: 1px solid ##B4; padding: 6px 0px 6px 0px; color: ##2b; font-weight: bold; text-shadow: 0px 1px 1px rgba(255, 255, 255, 0.9); background-image:url('nav_f.png'); background-repeat:repeat-x; background-color: ##E6; /* opera specific markup */ box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15); border-top-right-radius: 4px; border-top-left-radius: 4px; /* firefox specific markup */ -moz-box-shadow: rgba(0, 0, 0, 0.15) 5px 5px 5px; -moz-border-radius-topright: 4px; -moz-border-radius-topleft: 4px; /* webkit specific markup */ -webkit-box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15); -webkit-border-top-right-radius: 4px; -webkit-border-top-left-radius: 
4px; } .memdoc, dl.reflist dd { border-bottom: 1px solid ##B4; border-left: 1px solid ##B4; border-right: 1px solid ##B4; padding: 6px 10px 2px 10px; background-color: ##FC; border-top-width: 0; background-image:url('nav_g.png'); background-repeat:repeat-x; background-color: #FFFFFF; /* opera specific markup */ border-bottom-left-radius: 4px; border-bottom-right-radius: 4px; box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15); /* firefox specific markup */ -moz-border-radius-bottomleft: 4px; -moz-border-radius-bottomright: 4px; -moz-box-shadow: rgba(0, 0, 0, 0.15) 5px 5px 5px; /* webkit specific markup */ -webkit-border-bottom-left-radius: 4px; -webkit-border-bottom-right-radius: 4px; -webkit-box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15); } dl.reflist dt { padding: 5px; } dl.reflist dd { margin: 0px 0px 10px 0px; padding: 5px; } .paramkey { text-align: right; } .paramtype { white-space: nowrap; } .paramname { color: #602020; white-space: nowrap; } .paramname em { font-style: normal; } .paramname code { line-height: 14px; } .params, .retval, .exception, .tparams { margin-left: 0px; padding-left: 0px; } .params .paramname, .retval .paramname { font-weight: bold; vertical-align: top; } .params .paramtype { font-style: italic; vertical-align: top; } .params .paramdir { font-family: "courier new",courier,monospace; vertical-align: top; } table.mlabels { border-spacing: 0px; } td.mlabels-left { width: 100%; padding: 0px; } td.mlabels-right { vertical-align: bottom; padding: 0px; white-space: nowrap; } span.mlabels { margin-left: 8px; } span.mlabel { background-color: ##88; border-top:1px solid ##70; border-left:1px solid ##70; border-right:1px solid ##CC; border-bottom:1px solid ##CC; text-shadow: none; color: white; margin-right: 4px; padding: 2px 3px; border-radius: 3px; font-size: 7pt; white-space: nowrap; vertical-align: middle; } /* @end */ /* these are for tree view when not used as main index */ div.directory { margin: 10px 0px; border-top: 1px solid #A8B8D9; border-bottom: 1px solid #A8B8D9; width: 100%; } .directory table { border-collapse:collapse; } .directory td { margin: 0px; padding: 0px; vertical-align: top; } .directory td.entry { white-space: nowrap; padding-right: 6px; } .directory td.entry a { outline:none; } .directory td.entry a img { border: none; } .directory td.desc { width: 100%; padding-left: 6px; padding-right: 6px; padding-top: 3px; border-left: 1px solid rgba(0,0,0,0.05); } .directory tr.even { padding-left: 6px; background-color: ##F8; } .directory img { vertical-align: -30%; } .directory .levels { white-space: nowrap; width: 100%; text-align: right; font-size: 9pt; } .directory .levels span { cursor: pointer; padding-left: 2px; padding-right: 2px; color: ##50; } div.dynheader { margin-top: 8px; -webkit-touch-callout: none; -webkit-user-select: none; -khtml-user-select: none; -moz-user-select: none; -ms-user-select: none; user-select: none; } address { font-style: normal; color: ##33; } table.doxtable { border-collapse:collapse; margin-top: 4px; margin-bottom: 4px; } table.doxtable td, table.doxtable th { border: 1px solid ##37; padding: 3px 7px 2px; } table.doxtable th { background-color: ##47; color: #FFFFFF; font-size: 110%; padding-bottom: 4px; padding-top: 5px; } table.fieldtable { /*width: 100%;*/ margin-bottom: 10px; border: 1px solid ##B4; border-spacing: 0px; -moz-border-radius: 4px; -webkit-border-radius: 4px; border-radius: 4px; -moz-box-shadow: rgba(0, 0, 0, 0.15) 2px 2px 2px; -webkit-box-shadow: 2px 2px 2px rgba(0, 0, 0, 0.15); box-shadow: 2px 2px 2px rgba(0, 0, 
0, 0.15); } .fieldtable td, .fieldtable th { padding: 3px 7px 2px; } .fieldtable td.fieldtype, .fieldtable td.fieldname { white-space: nowrap; border-right: 1px solid ##B4; border-bottom: 1px solid ##B4; vertical-align: top; } .fieldtable td.fieldname { padding-top: 5px; } .fieldtable td.fielddoc { border-bottom: 1px solid ##B4; /*width: 100%;*/ } .fieldtable td.fielddoc p:first-child { margin-top: 2px; } .fieldtable td.fielddoc p:last-child { margin-bottom: 2px; } .fieldtable tr:last-child td { border-bottom: none; } .fieldtable th { background-image:url('nav_f.png'); background-repeat:repeat-x; background-color: ##E6; font-size: 90%; color: ##2B; padding-bottom: 4px; padding-top: 5px; text-align:left; -moz-border-radius-topleft: 4px; -moz-border-radius-topright: 4px; -webkit-border-top-left-radius: 4px; -webkit-border-top-right-radius: 4px; border-top-left-radius: 4px; border-top-right-radius: 4px; border-bottom: 1px solid ##B4; } .tabsearch { top: 0px; left: 10px; height: 36px; background-image: url('tab_b.png'); z-index: 101; overflow: hidden; font-size: 13px; } .navpath ul { font-size: 11px; background-image:url('tab_b.png'); background-repeat:repeat-x; background-position: 0 -5px; height:30px; line-height:30px; color:##9b; border:solid 1px ##ca; overflow:hidden; margin:0px; padding:0px; } .navpath li { list-style-type:none; float:left; padding-left:10px; padding-right:15px; background-image:url('bc_s.png'); background-repeat:no-repeat; background-position:right; color:##45; } .navpath li.navelem a { height:32px; display:block; text-decoration: none; outline: none; color: ##30; font-family: 'Lucida Grande',Geneva,Helvetica,Arial,sans-serif; text-shadow: 0px 1px 1px rgba(255, 255, 255, 0.9); text-decoration: none; } .navpath li.navelem a:hover { color:##80; } .navpath li.footer { list-style-type:none; float:right; padding-left:10px; padding-right:15px; background-image:none; background-repeat:no-repeat; background-position:right; color:##45; font-size: 8pt; } div.summary { float: right; font-size: 8pt; padding-right: 5px; width: 50%; text-align: right; } div.summary a { white-space: nowrap; } div.ingroups { font-size: 8pt; width: 50%; text-align: left; } div.ingroups a { white-space: nowrap; } div.header { background-image:url('nav_h.png'); background-repeat:repeat-x; background-color: ##FA; margin: 0px; border-bottom: 1px solid ##CC; } div.headertitle { padding: 5px 5px 5px 10px; } dl { padding: 0 0 0 10px; } /* dl.note, dl.warning, dl.attention, dl.pre, dl.post, dl.invariant, dl.deprecated, dl.todo, dl.test, dl.bug */ dl.section { margin-left: 0px; padding-left: 0px; } dl.note { margin-left:-7px; padding-left: 3px; border-left:4px solid; border-color: #D0C000; } dl.warning, dl.attention { margin-left:-7px; padding-left: 3px; border-left:4px solid; border-color: #FF0000; } dl.pre, dl.post, dl.invariant { margin-left:-7px; padding-left: 3px; border-left:4px solid; border-color: #00D000; } dl.deprecated { margin-left:-7px; padding-left: 3px; border-left:4px solid; border-color: #505050; } dl.todo { margin-left:-7px; padding-left: 3px; border-left:4px solid; border-color: #00C0E0; } dl.test { margin-left:-7px; padding-left: 3px; border-left:4px solid; border-color: #3030E0; } dl.bug { margin-left:-7px; padding-left: 3px; border-left:4px solid; border-color: #C08050; } dl.section dd { margin-bottom: 6px; } #projectlogo { text-align: center; vertical-align: bottom; border-collapse: separate; } #projectlogo img { border: 0px none; } #projectname { font: 300% Tahoma, Arial,sans-serif; 
margin: 0px; padding: 2px 0px; } #projectbrief { font: 120% Tahoma, Arial,sans-serif; margin: 0px; padding: 0px; } #projectnumber { font: 50% Tahoma, Arial,sans-serif; margin: 0px; padding: 0px; } #titlearea { padding: 0px; margin: 0px; width: 100%; border-bottom: 1px solid ##70; } .image { text-align: center; } .dotgraph { text-align: center; } .mscgraph { text-align: center; } .caption { font-weight: bold; } div.zoom { border: 1px solid ##A0; } dl.citelist { margin-bottom:50px; } dl.citelist dt { color:##40; float:left; font-weight:bold; margin-right:10px; padding:5px; } dl.citelist dd { margin:2px 0; padding:5px 0; } div.toc { padding: 14px 25px; background-color: ##F6; border: 1px solid ##DD; border-radius: 7px 7px 7px 7px; float: right; height: auto; margin: 0 20px 10px 10px; width: 200px; } div.toc li { background: url("bdwn.png") no-repeat scroll 0 5px transparent; font: 10px/1.2 Verdana,DejaVu Sans,Geneva,sans-serif; margin-top: 5px; padding-left: 10px; padding-top: 2px; } div.toc h3 { font: bold 12px/1.2 Arial,FreeSans,sans-serif; color: ##60; border-bottom: 0 none; margin: 0; } div.toc ul { list-style: none outside none; border: medium none; padding: 0px; } div.toc li.level1 { margin-left: 0px; } div.toc li.level2 { margin-left: 15px; } div.toc li.level3 { margin-left: 30px; } div.toc li.level4 { margin-left: 45px; } .inherit_header { font-weight: bold; color: gray; cursor: pointer; -webkit-touch-callout: none; -webkit-user-select: none; -khtml-user-select: none; -moz-user-select: none; -ms-user-select: none; user-select: none; } .inherit_header td { padding: 6px 0px 2px 5px; } .inherit { display: none; } tr.heading h2 { margin-top: 12px; margin-bottom: 4px; } @media print { #top { display: none; } #side-nav { display: none; } #nav-path { display: none; } body { overflow:visible; } h1, h2, h3, h4, h5, h6 { page-break-after: avoid; } .summary { display: none; } .memitem { page-break-inside: avoid; } #doc-content { margin-left:0 !important; height:auto !important; width:auto !important; overflow:inherit; display:inline; } }

simka-1.5.3/doc/doxygen/gatb-simka.doxyfile

# Doxyfile 1.7.4 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. # # All text after a hash (#) is considered a comment and will be ignored. # The format is: # TAG = value [value, ...] # For lists items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (" "). #--------------------------------------------------------------------------- # Project related configuration options #--------------------------------------------------------------------------- # This tag specifies the encoding used for all characters in the config file # that follow. The default is UTF-8 which is also the encoding used for all # text before the first occurrence of this tag. Doxygen uses libiconv (or the # iconv built into libc) for the transcoding. See # http://www.gnu.org/software/libiconv for the list of possible encodings. DOXYFILE_ENCODING = UTF-8 # The PROJECT_NAME tag is a single word (or a sequence of words surrounded # by quotes) that should identify the project. #PROJECT_NAME = SIMKA-gatb.core-API-@gatb-core-version@ PROJECT_NAME = simka-@gatb-tool-version@ # The PROJECT_NUMBER tag can be used to enter a project or revision number.
# This could be handy for archiving the generated documentation or # if some version control system is used. PROJECT_NUMBER = # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer # a quick idea about the purpose of the project. Keep the description short. PROJECT_BRIEF = # With the PROJECT_LOGO tag one can specify an logo or icon that is # included in the documentation. The maximum height of the logo should not # exceed 55 pixels and the maximum width should not exceed 200 pixels. # Doxygen will copy the logo to the output directory. PROJECT_LOGO = # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. # If a relative path is entered, it will be relative to the location # where doxygen was started. If left blank the current directory will be used. OUTPUT_DIRECTORY = . # If the CREATE_SUBDIRS tag is set to YES, then doxygen will create # 4096 sub-directories (in 2 levels) under the output directory of each output # format and will distribute the generated files over these directories. # Enabling this option can be useful when feeding doxygen a huge amount of # source files, where putting all generated files in the same directory would # otherwise cause performance problems for the file system. CREATE_SUBDIRS = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. # The default language is English, other supported languages are: # Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, # Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, # Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English # messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, # Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, # Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. OUTPUT_LANGUAGE = English # If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will # include brief member descriptions after the members that are listed in # the file and class documentation (similar to JavaDoc). # Set to NO to disable this. BRIEF_MEMBER_DESC = YES # If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend # the brief description of a member or function before the detailed description. # Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the # brief descriptions will be completely suppressed. REPEAT_BRIEF = YES # This tag implements a quasi-intelligent brief description abbreviator # that is used to form the text in various listings. Each string # in this list, if found as the leading text of the brief description, will be # stripped from the text and the result after processing the whole list, is # used as the annotated text. Otherwise, the brief description is used as-is. # If left blank, the following values are used ("$name" is automatically # replaced with the name of the entity): "The $name class" "The $name widget" # "The $name file" "is" "provides" "specifies" "contains" # "represents" "a" "an" "the" ABBREVIATE_BRIEF = # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then # Doxygen will generate a detailed section even if there is only a brief # description. 
ALWAYS_DETAILED_SEC = NO # If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all # inherited members of a class in the documentation of that class as if those # members were ordinary class members. Constructors, destructors and assignment # operators of the base classes will not be shown. INLINE_INHERITED_MEMB = NO # If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full # path before files name in the file list and in the header files. If set # to NO the shortest path that makes the file name unique will be used. FULL_PATH_NAMES = YES # If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag # can be used to strip a user-defined part of the path. Stripping is # only done if one of the specified strings matches the left-hand part of # the path. The tag can be used to show relative paths in the file list. # If left blank the directory from which doxygen is run is used as the # path to strip. STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of # the path mentioned in the documentation of a class, which tells # the reader which header file to include in order to use a class. # If left blank only the name of the header file containing the class # definition is used. Otherwise one should specify the include paths that # are normally passed to the compiler using the -I flag. STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter # (but less readable) file names. This can be useful if your file system # doesn't support long names like on DOS, Mac, or CD-ROM. SHORT_NAMES = NO # If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen # will interpret the first line (until the first dot) of a JavaDoc-style # comment as the brief description. If set to NO, the JavaDoc # comments will behave just like regular Qt-style comments # (thus requiring an explicit @brief command for a brief description.) JAVADOC_AUTOBRIEF = NO # If the QT_AUTOBRIEF tag is set to YES then Doxygen will # interpret the first line (until the first dot) of a Qt-style # comment as the brief description. If set to NO, the comments # will behave just like regular Qt-style comments (thus requiring # an explicit \brief command for a brief description.) QT_AUTOBRIEF = NO # The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen # treat a multi-line C++ special comment block (i.e. a block of //! or /// # comments) as a brief description. This used to be the default behaviour. # The new default is to treat a multi-line C++ comment block as a detailed # description. Set this tag to YES if you prefer the old behaviour instead. MULTILINE_CPP_IS_BRIEF = NO # If the INHERIT_DOCS tag is set to YES (the default) then an undocumented # member inherits the documentation from any documented member that it # re-implements. INHERIT_DOCS = YES # If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce # a new page for each member. If set to NO, the documentation of a member will # be part of the file/class/namespace that contains it. SEPARATE_MEMBER_PAGES = NO # The TAB_SIZE tag can be used to set the number of spaces in a tab. # Doxygen uses this value to replace tabs by spaces in code fragments. TAB_SIZE = 8 # This tag can be used to specify a number of aliases that acts # as commands in the documentation. An alias has the form "name=value". 
# For example adding "sideeffect=\par Side Effects:\n" will allow you to # put the command \sideeffect (or @sideeffect) in the documentation, which # will result in a user-defined paragraph with heading "Side Effects:". # You can put \n's in the value part of an alias to insert newlines. ALIASES = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C # sources only. Doxygen will then generate output that is more tailored for C. # For instance, some of the names that are used will be different. The list # of all members will be omitted, etc. OPTIMIZE_OUTPUT_FOR_C = NO # Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java # sources only. Doxygen will then generate output that is more tailored for # Java. For instance, namespaces will be presented as packages, qualified # scopes will look different, etc. OPTIMIZE_OUTPUT_JAVA = NO # Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran # sources only. Doxygen will then generate output that is more tailored for # Fortran. OPTIMIZE_FOR_FORTRAN = NO # Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL # sources. Doxygen will then generate output that is tailored for # VHDL. OPTIMIZE_OUTPUT_VHDL = NO # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given extension. # Doxygen has a built-in mapping, but you can override or extend it using this # tag. The format is ext=language, where ext is a file extension, and language # is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C, # C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, C++. For instance to make # doxygen treat .inc files as Fortran files (default is PHP), and .f files as C # (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions # you also need to set FILE_PATTERNS otherwise the files are not read by doxygen. EXTENSION_MAPPING = # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want # to include (a tag file for) the STL sources as input, then you should # set this tag to YES in order to let doxygen match functions declarations and # definitions whose arguments contain STL classes (e.g. func(std::string); v.s. # func(std::string) {}). This also makes the inheritance and collaboration # diagrams that involve STL classes more complete and accurate. BUILTIN_STL_SUPPORT = NO # If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. # Doxygen will parse them like normal C++ but will assume all classes use public # instead of private inheritance when no explicit protection keyword is present. SIP_SUPPORT = NO # For Microsoft's IDL there are propget and propput attributes to indicate getter # and setter methods for a property. Setting this option to YES (the default) # will make doxygen replace the get and set methods by a property in the # documentation. This will only work if the methods are indeed getting or # setting a simple type. If this is not the case, or you want to show the # methods anyway, you should set this option to NO. IDL_PROPERTY_SUPPORT = YES # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC # tag is set to YES, then doxygen will reuse the documentation of the first # member in the group (if any) for the other members of the group. 
By default # all members of a group must be documented explicitly. DISTRIBUTE_GROUP_DOC = NO # Set the SUBGROUPING tag to YES (the default) to allow class member groups of # the same type (for instance a group of public functions) to be put as a # subgroup of that type (e.g. under the Public Functions section). Set it to # NO to prevent subgrouping. Alternatively, this can be done per class using # the \nosubgrouping command. SUBGROUPING = YES # When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and # unions are shown inside the group in which they are included (e.g. using # @ingroup) instead of on a separate page (for HTML and Man pages) or # section (for LaTeX and RTF). INLINE_GROUPED_CLASSES = NO # When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum # is documented as struct, union, or enum with the name of the typedef. So # typedef struct TypeS {} TypeT, will appear in the documentation as a struct # with name TypeT. When disabled the typedef will appear as a member of a file, # namespace, or class. And the struct will be named TypeS. This can typically # be useful for C code in case the coding convention dictates that all compound # types are typedef'ed and only the typedef is referenced, never the tag name. TYPEDEF_HIDES_STRUCT = NO # The SYMBOL_CACHE_SIZE determines the size of the internal cache use to # determine which symbols to keep in memory and which to flush to disk. # When the cache is full, less often used symbols will be written to disk. # For small to medium size projects (<1000 input files) the default value is # probably good enough. For larger projects a too small cache size can cause # doxygen to be busy swapping symbols to and from disk most of the time # causing a significant performance penalty. # If the system has enough physical memory increasing the cache will improve the # performance by keeping more symbols in memory. Note that the value works on # a logarithmic scale so increasing the size by one will roughly double the # memory usage. The cache size is given by this formula: # 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0, # corresponding to a cache size of 2^16 = 65536 symbols SYMBOL_CACHE_SIZE = 0 #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- # If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in # documentation are documented, even if no documentation was available. # Private class members and static file members will be hidden unless # the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES EXTRACT_ALL = YES # If the EXTRACT_PRIVATE tag is set to YES all private members of a class # will be included in the documentation. EXTRACT_PRIVATE = NO # If the EXTRACT_STATIC tag is set to YES all static members of a file # will be included in the documentation. EXTRACT_STATIC = NO # If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) # defined locally in source files will be included in the documentation. # If set to NO only classes defined in header files are included. EXTRACT_LOCAL_CLASSES = YES # This flag is only useful for Objective-C code. When set to YES local # methods, which are defined in the implementation section but not in # the interface are included in the documentation. # If set to NO (the default) only methods in the interface are included. 
EXTRACT_LOCAL_METHODS = NO # If this flag is set to YES, the members of anonymous namespaces will be # extracted and appear in the documentation as a namespace called # 'anonymous_namespace{file}', where file will be replaced with the base # name of the file that contains the anonymous namespace. By default # anonymous namespaces are hidden. EXTRACT_ANON_NSPACES = NO # If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all # undocumented members of documented classes, files or namespaces. # If set to NO (the default) these members will be included in the # various overviews, but no documentation section is generated. # This option has no effect if EXTRACT_ALL is enabled. HIDE_UNDOC_MEMBERS = YES # If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. # If set to NO (the default) these classes will be included in the various # overviews. This option has no effect if EXTRACT_ALL is enabled. HIDE_UNDOC_CLASSES = YES # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all # friend (class|struct|union) declarations. # If set to NO (the default) these declarations will be included in the # documentation. HIDE_FRIEND_COMPOUNDS = NO # If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any # documentation blocks found inside the body of a function. # If set to NO (the default) these blocks will be appended to the # function's detailed documentation block. HIDE_IN_BODY_DOCS = YES # The INTERNAL_DOCS tag determines if documentation # that is typed after a \internal command is included. If the tag is set # to NO (the default) then the documentation will be excluded. # Set it to YES to include the internal documentation. INTERNAL_DOCS = NO # If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate # file names in lower-case letters. If set to YES upper-case letters are also # allowed. This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows # and Mac users are advised to set this option to NO. CASE_SENSE_NAMES = YES # If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen # will show members with their full class and namespace scopes in the # documentation. If set to YES the scope will be hidden. HIDE_SCOPE_NAMES = NO # If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen # will put a list of the files that are included by a file in the documentation # of that file. SHOW_INCLUDE_FILES = YES # If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen # will list include files with double quotes in the documentation # rather than with sharp brackets. FORCE_LOCAL_INCLUDES = NO # If the INLINE_INFO tag is set to YES (the default) then a tag [inline] # is inserted in the documentation for inline members. INLINE_INFO = YES # If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen # will sort the (detailed) documentation of file and class members # alphabetically by member name. If set to NO the members will appear in # declaration order. SORT_MEMBER_DOCS = YES # If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the # brief documentation of file, namespace and class members alphabetically # by member name. If set to NO (the default) the members will appear in # declaration order. 
SORT_BRIEF_DOCS = NO # If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen # will sort the (brief and detailed) documentation of class members so that # constructors and destructors are listed first. If set to NO (the default) # the constructors will appear in the respective orders defined by # SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. # This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO # and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. SORT_MEMBERS_CTORS_1ST = NO # If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the # hierarchy of group names into alphabetical order. If set to NO (the default) # the group names will appear in their defined order. SORT_GROUP_NAMES = NO # If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be # sorted by fully-qualified names, including namespaces. If set to # NO (the default), the class list will be sorted only by class name, # not including the namespace part. # Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. # Note: This option applies only to the class list, not to the # alphabetical list. SORT_BY_SCOPE_NAME = YES # If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to # do proper type resolution of all parameters of a function it will reject a # match between the prototype and the implementation of a member function even # if there is only one candidate or it is obvious which candidate to choose # by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen # will still accept a match between prototype and implementation in such cases. STRICT_PROTO_MATCHING = NO # The GENERATE_TODOLIST tag can be used to enable (YES) or # disable (NO) the todo list. This list is created by putting \todo # commands in the documentation. GENERATE_TODOLIST = YES # The GENERATE_TESTLIST tag can be used to enable (YES) or # disable (NO) the test list. This list is created by putting \test # commands in the documentation. GENERATE_TESTLIST = YES # The GENERATE_BUGLIST tag can be used to enable (YES) or # disable (NO) the bug list. This list is created by putting \bug # commands in the documentation. GENERATE_BUGLIST = YES # The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or # disable (NO) the deprecated list. This list is created by putting # \deprecated commands in the documentation. GENERATE_DEPRECATEDLIST = YES # The ENABLED_SECTIONS tag can be used to enable conditional # documentation sections, marked by \if sectionname ... \endif. ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines # the initial value of a variable or macro consists of for it to appear in # the documentation. If the initializer consists of more lines than specified # here it will be hidden. Use a value of 0 to hide initializers completely. # The appearance of the initializer of individual variables and macros in the # documentation can be controlled using \showinitializer or \hideinitializer # command in the documentation regardless of this setting. MAX_INITIALIZER_LINES = 30 # Set the SHOW_USED_FILES tag to NO to disable the list of files generated # at the bottom of the documentation of classes and structs. If set to YES the # list will mention the files that were used to generate the documentation. SHOW_USED_FILES = YES # If the sources in your project are distributed over multiple directories # then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy # in the documentation. The default is NO. 
#SHOW_DIRECTORIES = NO # Set the SHOW_FILES tag to NO to disable the generation of the Files page. # This will remove the Files entry from the Quick Index and from the # Folder Tree View (if specified). The default is YES. SHOW_FILES = YES # Set the SHOW_NAMESPACES tag to NO to disable the generation of the # Namespaces page. # This will remove the Namespaces entry from the Quick Index # and from the Folder Tree View (if specified). The default is YES. SHOW_NAMESPACES = YES # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from # the version control system). Doxygen will invoke the program by executing (via # popen()) the command , where is the value of # the FILE_VERSION_FILTER tag, and is the name of an input file # provided by doxygen. Whatever the program writes to standard output # is used as the file version. See the manual for examples. FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated # output files in an output format independent way. The create the layout file # that represents doxygen's defaults, run doxygen with the -l option. # You can optionally specify a file name after the option, if omitted # DoxygenLayout.xml will be used as the name of the layout file. LAYOUT_FILE = #--------------------------------------------------------------------------- # configuration options related to warning and progress messages #--------------------------------------------------------------------------- # The QUIET tag can be used to turn on/off the messages that are generated # by doxygen. Possible values are YES and NO. If left blank NO is used. QUIET = NO # The WARNINGS tag can be used to turn on/off the warning messages that are # generated by doxygen. Possible values are YES and NO. If left blank # NO is used. WARNINGS = YES # If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings # for undocumented members. If EXTRACT_ALL is set to YES then this flag will # automatically be disabled. WARN_IF_UNDOCUMENTED = YES # If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for # potential errors in the documentation, such as not documenting some # parameters in a documented function, or documenting parameters that # don't exist or using markup commands wrongly. WARN_IF_DOC_ERROR = YES # The WARN_NO_PARAMDOC option can be enabled to get warnings for # functions that are documented, but have no documentation for their parameters # or return value. If set to NO (the default) doxygen will only warn about # wrong or incomplete parameter documentation, but not about the absence of # documentation. WARN_NO_PARAMDOC = NO # The WARN_FORMAT tag determines the format of the warning messages that # doxygen can produce. The string should contain the $file, $line, and $text # tags, which will be replaced by the file and line number from which the # warning originated and the warning text. Optionally the format may contain # $version, which will be replaced by the version of the file (if it could # be obtained via FILE_VERSION_FILTER) WARN_FORMAT = "$file:$line: $text" # The WARN_LOGFILE tag can be used to specify a file to which warning # and error messages should be written. If left blank the output is written # to stderr. 
WARN_LOGFILE = #--------------------------------------------------------------------------- # configuration options related to the input files #--------------------------------------------------------------------------- # The INPUT tag can be used to specify the files and/or directories that contain # documented source files. You may enter file names like "myfile.cpp" or # directories like "/usr/src/myproject". Separate the files or directories # with spaces. INPUT = @CMAKE_CURRENT_SOURCE_DIR@/../src/ @CMAKE_CURRENT_SOURCE_DIR@/../test/src/ @CMAKE_CURRENT_SOURCE_DIR@/../doc/doxygen # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is # also the default input encoding. Doxygen uses libiconv (or the iconv built # into libc) for the transcoding. See http://www.gnu.org/software/libiconv for # the list of possible encodings. INPUT_ENCODING = UTF-8 # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp # and *.h) to filter out the source-files in the directories. If left # blank the following patterns are tested: # *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh # *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py # *.f90 *.f *.for *.vhd *.vhdl FILE_PATTERNS = # The RECURSIVE tag can be used to turn specify whether or not subdirectories # should be searched for input files as well. Possible values are YES and NO. # If left blank NO is used. RECURSIVE = YES # The EXCLUDE tag can be used to specify files and/or directories that should # excluded from the INPUT source files. This way you can easily exclude a # subdirectory from a directory tree whose root is specified with the INPUT tag. EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded # from the input. EXCLUDE_SYMLINKS = NO # If the value of the INPUT tag contains directories, you can use the # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude # certain files from those directories. Note that the wildcards are matched # against the file with absolute path, so to exclude all test directories # for example use the pattern */test/* EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. Examples: ANamespace, AClass, # AClass::ANamespace, ANamespace::*Test EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or # directories that contain example code fragments that are included (see # the \include command). EXAMPLE_PATH = @CMAKE_CURRENT_SOURCE_DIR@/../examples # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp # and *.h) to filter out the source-files in the directories. If left # blank all files are included. EXAMPLE_PATTERNS = # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be # searched for input files to be used with the \include or \dontinclude # commands irrespective of the value of the RECURSIVE tag. # Possible values are YES and NO. If left blank NO is used. 
EXAMPLE_RECURSIVE = NO # The IMAGE_PATH tag can be used to specify one or more files or # directories that contain image that are included in the documentation (see # the \image command). IMAGE_PATH = @CMAKE_CURRENT_SOURCE_DIR@/../doc/doxygen/images # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program # by executing (via popen()) the command , where # is the value of the INPUT_FILTER tag, and is the name of an # input file. Doxygen will then use the output that the filter program writes # to standard output. # If FILTER_PATTERNS is specified, this tag will be # ignored. INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. # Doxygen will compare the file name with each pattern and apply the # filter if there is a match. # The filters are a list of the form: # pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further # info on how filters are used. If FILTER_PATTERNS is empty or if # non of the patterns match the file name, INPUT_FILTER is applied. FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will be used to filter the input files when producing source # files to browse (i.e. when SOURCE_BROWSER is set to YES). FILTER_SOURCE_FILES = NO # The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file # pattern. A pattern will override the setting for FILTER_PATTERN (if any) # and it is also possible to disable source filtering for a specific pattern # using *.ext= (so without naming a filter). This option only has effect when # FILTER_SOURCE_FILES is enabled. FILTER_SOURCE_PATTERNS = #--------------------------------------------------------------------------- # configuration options related to source browsing #--------------------------------------------------------------------------- # If the SOURCE_BROWSER tag is set to YES then a list of source files will # be generated. Documented entities will be cross-referenced with these sources. # Note: To get rid of all source code in the generated output, make sure also # VERBATIM_HEADERS is set to NO. SOURCE_BROWSER = NO # Setting the INLINE_SOURCES tag to YES will include the body # of functions and classes directly in the documentation. INLINE_SOURCES = NO # Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct # doxygen to hide any special comment blocks from generated source code # fragments. Normal C and C++ comments will always remain visible. STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES # then for each documented function all documented # functions referencing it will be listed. REFERENCED_BY_RELATION = NO # If the REFERENCES_RELATION tag is set to YES # then for each documented function all documented entities # called/used by that function will be listed. REFERENCES_RELATION = NO # If the REFERENCES_LINK_SOURCE tag is set to YES (the default) # and SOURCE_BROWSER tag is set to YES, then the hyperlinks from # functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will # link to the source code. # Otherwise they will link to the documentation. REFERENCES_LINK_SOURCE = YES # If the USE_HTAGS tag is set to YES then the references to source code # will point to the HTML generated by the htags(1) tool instead of doxygen # built-in source browser. 
The htags tool is part of GNU's global source # tagging system (see http://www.gnu.org/software/global/global.html). You # will need version 4.8.6 or higher. USE_HTAGS = NO # If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen # will generate a verbatim copy of the header file for each class for # which an include is specified. Set to NO to disable this. VERBATIM_HEADERS = NO #--------------------------------------------------------------------------- # configuration options related to the alphabetical class index #--------------------------------------------------------------------------- # If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index # of all compounds will be generated. Enable this if the project # contains a lot of classes, structs, unions or interfaces. ALPHABETICAL_INDEX = YES # If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then # the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns # in which this list will be split (can be a number in the range [1..20]) COLS_IN_ALPHA_INDEX = 5 # In case all classes in a project start with a common prefix, all # classes will be put under the same header in the alphabetical index. # The IGNORE_PREFIX tag can be used to specify one or more prefixes that # should be ignored while generating the index headers. IGNORE_PREFIX = #--------------------------------------------------------------------------- # configuration options related to the HTML output #--------------------------------------------------------------------------- # If the GENERATE_HTML tag is set to YES (the default) Doxygen will # generate HTML output. GENERATE_HTML = YES # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `html' will be used as the default path. HTML_OUTPUT = html # The HTML_FILE_EXTENSION tag can be used to specify the file extension for # each generated HTML page (for example: .htm,.php,.asp). If it is left blank # doxygen will generate files with .html extension. HTML_FILE_EXTENSION = .html # The HTML_HEADER tag can be used to specify a personal HTML header for # each generated HTML page. If it is left blank doxygen will generate a # standard header. Note that when using a custom header you are responsible # for the proper inclusion of any scripts and style sheets that doxygen # needs, which is dependent on the configuration options used. # It is adviced to generate a default header using "doxygen -w html # header.html footer.html stylesheet.css YourConfigFile" and then modify # that header. Note that the header is subject to change so you typically # have to redo this when upgrading to a newer version of doxygen or when changing the value of configuration settings such as GENERATE_TREEVIEW! HTML_HEADER = # The HTML_FOOTER tag can be used to specify a personal HTML footer for # each generated HTML page. If it is left blank doxygen will generate a # standard footer. HTML_FOOTER = # If the HTML_TIMESTAMP tag is set to YES then the generated HTML documentation will contain the timesstamp. HTML_TIMESTAMP = YES # The HTML_STYLESHEET tag can be used to specify a user-defined cascading # style sheet that is used by each HTML page. It can be used to # fine-tune the look of the HTML output. If the tag is left blank doxygen # will generate a default style sheet. 
Note that doxygen will try to copy # the style sheet file to the HTML output directory, so don't put your own # stylesheet in the HTML output directory as well, or it will be erased! HTML_STYLESHEET = @CMAKE_CURRENT_SOURCE_DIR@/../doc/doxygen/doxygen.css # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note # that these files will be copied to the base HTML output directory. Use the # $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these # files. In the HTML_STYLESHEET file, use the file name only. Also note that # the files will be copied as-is; there are no commands or markers available. HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. # Doxygen will adjust the colors in the stylesheet and background images # according to this color. Hue is specified as an angle on a colorwheel, # see http://en.wikipedia.org/wiki/Hue for more information. # For instance the value 0 represents red, 60 is yellow, 120 is green, # 180 is cyan, 240 is blue, 300 purple, and 360 is red again. # The allowed range is 0 to 359. HTML_COLORSTYLE_HUE = 220 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of # the colors in the HTML output. For a value of 0 the output will use # grayscales only. A value of 255 will produce the most vivid colors. HTML_COLORSTYLE_SAT = 100 # The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to # the luminance component of the colors in the HTML output. Values below # 100 gradually make the output lighter, whereas values above 100 make # the output darker. The value divided by 100 is the actual gamma applied, # so 80 represents a gamma of 0.8, The value 220 represents a gamma of 2.2, # and 100 does not change the gamma. HTML_COLORSTYLE_GAMMA = 80 # If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML # page will contain the date and time when the page was generated. Setting # this to NO can help when comparing the output of multiple runs. # If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, # files or namespaces will be aligned in HTML using tables. If set to # NO a bullet list will be used. #HTML_ALIGN_MEMBERS = YES # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. For this to work a browser that supports # JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox # Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari). HTML_DYNAMIC_SECTIONS = NO # If the GENERATE_DOCSET tag is set to YES, additional index files # will be generated that can be used as input for Apple's Xcode 3 # integrated development environment, introduced with OSX 10.5 (Leopard). # To create a documentation set, doxygen will generate a Makefile in the # HTML output directory. Running make will produce the docset in that # directory and running "make install" will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find # it at startup. # See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html # for more information. GENERATE_DOCSET = NO # When GENERATE_DOCSET tag is set to YES, this tag determines the name of the # feed. A documentation feed provides an umbrella under which multiple # documentation sets from a single provider (such as a company or product suite) # can be grouped. 
DOCSET_FEEDNAME = "Doxygen generated docs" # When GENERATE_DOCSET tag is set to YES, this tag specifies a string that # should uniquely identify the documentation set bundle. This should be a # reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen # will append .docset to the name. DOCSET_BUNDLE_ID = org.doxygen.Project # When GENERATE_PUBLISHER_ID tag specifies a string that should uniquely identify # the documentation publisher. This should be a reverse domain-name style # string, e.g. com.mycompany.MyDocSet.documentation. DOCSET_PUBLISHER_ID = org.doxygen.Publisher # The GENERATE_PUBLISHER_NAME tag identifies the documentation publisher. DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES, additional index files # will be generated that can be used as input for tools like the # Microsoft HTML help workshop to generate a compiled HTML help file (.chm) # of the generated HTML documentation. GENERATE_HTMLHELP = NO # If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can # be used to specify the file name of the resulting .chm file. You # can add a path in front of the file if the result should not be # written to the html output directory. CHM_FILE = # If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can # be used to specify the location (absolute path including file name) of # the HTML help compiler (hhc.exe). If non-empty doxygen will try to run # the HTML help compiler on the generated index.hhp. HHC_LOCATION = # If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag # controls if a separate .chi index file is generated (YES) or that # it should be included in the master .chm file (NO). GENERATE_CHI = NO # If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING # is used to encode HtmlHelp index (hhk), content (hhc) and project file # content. CHM_INDEX_ENCODING = # If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag # controls whether a binary table of contents is generated (YES) or a # normal table of contents (NO) in the .chm file. BINARY_TOC = NO # The TOC_EXPAND flag can be set to YES to add extra items for group members # to the contents of the HTML help documentation and to the tree view. TOC_EXPAND = NO # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated # that can be used as input for Qt's qhelpgenerator to generate a # Qt Compressed Help (.qch) of the generated HTML documentation. GENERATE_QHP = NO # If the QHG_LOCATION tag is specified, the QCH_FILE tag can # be used to specify the file name of the resulting .qch file. # The path specified is relative to the HTML output folder. QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating # Qt Help Project output. For more information please see # http://doc.trolltech.com/qthelpproject.html#namespace QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating # Qt Help Project output. For more information please see # http://doc.trolltech.com/qthelpproject.html#virtual-folders QHP_VIRTUAL_FOLDER = doc # If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to # add. For more information please see # http://doc.trolltech.com/qthelpproject.html#custom-filters QHP_CUST_FILTER_NAME = # The QHP_CUST_FILT_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see # # Qt Help Project / Custom Filters. 
QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's # filter section matches. # # Qt Help Project / Filter Attributes. QHP_SECT_FILTER_ATTRS = # If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can # be used to specify the location of Qt's qhelpgenerator. # If non-empty doxygen will try to run qhelpgenerator on the generated # .qhp file. QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files # will be generated, which together with the HTML files, form an Eclipse help # plugin. To install this plugin and make it available under the help contents # menu in Eclipse, the contents of the directory containing the HTML and XML # files needs to be copied into the plugins directory of eclipse. The name of # the directory within the plugins directory should be the same as # the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before # the help appears. GENERATE_ECLIPSEHELP = NO # A unique identifier for the eclipse help plugin. When installing the plugin # the directory name containing the HTML and XML files should also have # this name. ECLIPSE_DOC_ID = org.doxygen.Project # The DISABLE_INDEX tag can be used to turn on/off the condensed index at # top of each HTML page. The value NO (the default) enables the index and # the value YES disables it. DISABLE_INDEX = NO # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values # (range [0,1..20]) that doxygen will group on one line in the generated HTML # documentation. Note that a value of 0 will completely suppress the enum # values from appearing in the overview section. ENUM_VALUES_PER_LINE = 4 # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index # structure should be generated to display hierarchical information. # If the tag value is set to YES, a side panel will be generated # containing a tree-like index structure (just like the one that # is generated for HTML Help). For this to work a browser that supports # JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). # Windows users are probably better off using the HTML help feature. GENERATE_TREEVIEW = YES # By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories, # and Class Hierarchy pages using a tree view instead of an ordered list. #USE_INLINE_TREES = NO # If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be # used to set the initial width (in pixels) of the frame in which the tree # is shown. TREEVIEW_WIDTH = 250 # When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open # links to external symbols imported via tag files in a separate window. EXT_LINKS_IN_WINDOW = NO # Use this tag to change the font size of Latex formulas included # as images in the HTML documentation. The default is 10. Note that # when you change the font size after a successful doxygen run you need # to manually remove any form_*.png images from the HTML output directory # to force them to be regenerated. FORMULA_FONTSIZE = 10 # Use the FORMULA_TRANPARENT tag to determine whether or not the images # generated for formulas are transparent PNGs. Transparent PNGs are # not supported properly for IE 6.0, but are supported on all modern browsers. # Note that when changing this option you need to delete any form_*.png files # in the HTML output before the changes have effect. 
FORMULA_TRANSPARENT    = YES

# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax
# (see http://www.mathjax.org) which uses client side Javascript for the
# rendering instead of using prerendered bitmaps. Use this if you do not
# have LaTeX installed or if you want the formulas to look prettier in the
# HTML output. When enabled you also need to install MathJax separately and
# configure the path to it using the MATHJAX_RELPATH option.

USE_MATHJAX            = NO

# When MathJax is enabled you need to specify the location relative to the
# HTML output directory using the MATHJAX_RELPATH option. The destination
# directory should contain the MathJax.js script. For instance, if the mathjax
# directory is located at the same level as the HTML output directory, then
# MATHJAX_RELPATH should be ../mathjax. The default value points to the
# mathjax.org site, so you can quickly see the result without installing
# MathJax, but it is strongly recommended to install a local copy of MathJax
# before deployment.

MATHJAX_RELPATH        = http://www.mathjax.org/mathjax

# When the SEARCHENGINE tag is enabled doxygen will generate a search box
# for the HTML output. The underlying search engine uses javascript
# and DHTML and should work on any modern browser. Note that when using
# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
# (GENERATE_DOCSET) there is already a search function so this one should
# typically be disabled. For large projects the javascript based search engine
# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution.

SEARCHENGINE           = YES

# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
# implemented using a PHP enabled web server instead of at the web client
# using Javascript. Doxygen will generate the search PHP script and index
# file to put on the web server. The advantage of the server
# based approach is that it scales better to large projects and allows
# full text search. The disadvantages are that it is more difficult to setup
# and does not have live searching capabilities.

SERVER_BASED_SEARCH    = NO

#---------------------------------------------------------------------------
# configuration options related to the LaTeX output
#---------------------------------------------------------------------------

# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
# generate Latex output.

GENERATE_LATEX         = NO

# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
# If a relative path is entered the value of OUTPUT_DIRECTORY will be
# put in front of it. If left blank `latex' will be used as the default path.

LATEX_OUTPUT           = latex

# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
# invoked. If left blank `latex' will be used as the default command name.
# Note that when enabling USE_PDFLATEX this option is only used for
# generating bitmaps for formulas in the HTML output, but not in the
# Makefile that is written to the output directory.

LATEX_CMD_NAME         = latex

# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
# generate index for LaTeX. If left blank `makeindex' will be used as the
# default command name.

MAKEINDEX_CMD_NAME     = makeindex

# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
# LaTeX documents. This may be useful for small projects and may help to
# save some trees in general.

COMPACT_LATEX          = NO

# The PAPER_TYPE tag can be used to set the paper type that is used
# by the printer.
Possible values are: a4, letter, legal and # executive. If left blank a4wide will be used. PAPER_TYPE = a4 # The EXTRA_PACKAGES tag can be to specify one or more names of LaTeX # packages that should be included in the LaTeX output. EXTRA_PACKAGES = # The LATEX_HEADER tag can be used to specify a personal LaTeX header for # the generated latex document. The header should contain everything until # the first chapter. If it is left blank doxygen will generate a # standard header. Notice: only use this tag if you know what you are doing! LATEX_HEADER = # The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for # the generated latex document. The footer should contain everything after # the last chapter. If it is left blank doxygen will generate a # standard footer. Notice: only use this tag if you know what you are doing! LATEX_FOOTER = # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated # is prepared for conversion to pdf (using ps2pdf). The pdf file will # contain links (just like the HTML output) instead of page references # This makes the output suitable for online browsing using a pdf viewer. PDF_HYPERLINKS = YES # If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of # plain latex in the generated Makefile. Set this option to YES to get a # higher quality PDF documentation. USE_PDFLATEX = YES # If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. # command to the generated LaTeX files. This will instruct LaTeX to keep # running if errors occur, instead of asking the user for help. # This option is also used when generating formulas in HTML. LATEX_BATCHMODE = NO # If LATEX_HIDE_INDICES is set to YES then doxygen will not # include the index chapters (such as File Index, Compound Index, etc.) # in the output. LATEX_HIDE_INDICES = NO # If LATEX_SOURCE_CODE is set to YES then doxygen will include # source code with syntax highlighting in the LaTeX output. # Note that which sources are shown also depends on other settings # such as SOURCE_BROWSER. LATEX_SOURCE_CODE = NO #--------------------------------------------------------------------------- # configuration options related to the RTF output #--------------------------------------------------------------------------- # If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output # The RTF output is optimized for Word 97 and may not look very pretty with # other RTF readers or editors. GENERATE_RTF = NO # The RTF_OUTPUT tag is used to specify where the RTF docs will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `rtf' will be used as the default path. RTF_OUTPUT = rtf # If the COMPACT_RTF tag is set to YES Doxygen generates more compact # RTF documents. This may be useful for small projects and may help to # save some trees in general. COMPACT_RTF = NO # If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated # will contain hyperlink fields. The RTF file will # contain links (just like the HTML output) instead of page references. # This makes the output suitable for online browsing using WORD or other # programs which support those fields. # Note: wordpad (write) and others do not support links. RTF_HYPERLINKS = NO # Load stylesheet definitions from file. Syntax is similar to doxygen's # config file, i.e. a series of assignments. You only have to provide # replacements, missing definitions are set to their default value. 
RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an rtf document. # Syntax is similar to doxygen's config file. RTF_EXTENSIONS_FILE = #--------------------------------------------------------------------------- # configuration options related to the man page output #--------------------------------------------------------------------------- # If the GENERATE_MAN tag is set to YES (the default) Doxygen will # generate man pages GENERATE_MAN = NO # The MAN_OUTPUT tag is used to specify where the man pages will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `man' will be used as the default path. MAN_OUTPUT = man # The MAN_EXTENSION tag determines the extension that is added to # the generated man pages (default is the subroutine's section .3) MAN_EXTENSION = .3 # If the MAN_LINKS tag is set to YES and Doxygen generates man output, # then it will generate one additional man file for each entity # documented in the real man page(s). These additional files # only source the real man page, but without them the man command # would be unable to find the correct page. The default is NO. MAN_LINKS = NO #--------------------------------------------------------------------------- # configuration options related to the XML output #--------------------------------------------------------------------------- # If the GENERATE_XML tag is set to YES Doxygen will # generate an XML file that captures the structure of # the code including all documentation. GENERATE_XML = NO # The XML_OUTPUT tag is used to specify where the XML pages will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `xml' will be used as the default path. XML_OUTPUT = xml # The XML_SCHEMA tag can be used to specify an XML schema, # which can be used by a validating XML parser to check the # syntax of the XML files. XML_SCHEMA = # The XML_DTD tag can be used to specify an XML DTD, # which can be used by a validating XML parser to check the # syntax of the XML files. XML_DTD = # If the XML_PROGRAMLISTING tag is set to YES Doxygen will # dump the program listings (including syntax highlighting # and cross-referencing information) to the XML output. Note that # enabling this will significantly increase the size of the XML output. XML_PROGRAMLISTING = YES #--------------------------------------------------------------------------- # configuration options for the AutoGen Definitions output #--------------------------------------------------------------------------- # If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will # generate an AutoGen Definitions (see autogen.sf.net) file # that captures the structure of the code including all # documentation. Note that this feature is still experimental # and incomplete at the moment. GENERATE_AUTOGEN_DEF = NO #--------------------------------------------------------------------------- # configuration options related to the Perl module output #--------------------------------------------------------------------------- # If the GENERATE_PERLMOD tag is set to YES Doxygen will # generate a Perl module file that captures the structure of # the code including all documentation. Note that this # feature is still experimental and incomplete at the # moment. 
GENERATE_PERLMOD = NO # If the PERLMOD_LATEX tag is set to YES Doxygen will generate # the necessary Makefile rules, Perl scripts and LaTeX code to be able # to generate PDF and DVI output from the Perl module output. PERLMOD_LATEX = NO # If the PERLMOD_PRETTY tag is set to YES the Perl module output will be # nicely formatted so it can be parsed by a human reader. # This is useful # if you want to understand what is going on. # On the other hand, if this # tag is set to NO the size of the Perl module output will be much smaller # and Perl will parse it just the same. PERLMOD_PRETTY = YES # The names of the make variables in the generated doxyrules.make file # are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. # This is useful so different doxyrules.make files included by the same # Makefile don't overwrite each other's variables. PERLMOD_MAKEVAR_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the preprocessor #--------------------------------------------------------------------------- # If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will # evaluate all C-preprocessor directives found in the sources and include # files. ENABLE_PREPROCESSING = YES # If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro # names in the source code. If set to NO (the default) only conditional # compilation will be performed. Macro expansion can be done in a controlled # way by setting EXPAND_ONLY_PREDEF to YES. MACRO_EXPANSION = NO # If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES # then the macro expansion is limited to the macros specified with the # PREDEFINED and EXPAND_AS_DEFINED tags. EXPAND_ONLY_PREDEF = NO # If the SEARCH_INCLUDES tag is set to YES (the default) the includes files # pointed to by INCLUDE_PATH will be searched when a #include is found. SEARCH_INCLUDES = YES # The INCLUDE_PATH tag can be used to specify one or more directories that # contain include files that are not input files but should be processed by # the preprocessor. INCLUDE_PATH = # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the # directories. If left blank, the patterns specified with FILE_PATTERNS will # be used. INCLUDE_FILE_PATTERNS = # The PREDEFINED tag can be used to specify one or more macro names that # are defined before the preprocessor is started (similar to the -D option of # gcc). The argument of the tag is a list of macros of the form: name # or name=definition (no spaces). If the definition and the = are # omitted =1 is assumed. To prevent a macro definition from being # undefined via #undef or recursively expanded use the := operator # instead of the = operator. PREDEFINED = # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then # this tag can be used to specify a list of macro names that should be expanded. # The macro definition that is found in the sources will be used. # Use the PREDEFINED tag if you want to use a different macro definition that # overrules the definition found in the source code. EXPAND_AS_DEFINED = # If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then # doxygen's preprocessor will remove all references to function-like macros # that are alone on a line, have an all uppercase name, and do not end with a # semicolon, because these will confuse the parser if not removed. 
SKIP_FUNCTION_MACROS = YES #--------------------------------------------------------------------------- # Configuration::additions related to external references #--------------------------------------------------------------------------- # The TAGFILES option can be used to specify one or more tagfiles. # Optionally an initial location of the external documentation # can be added for each tagfile. The format of a tag file without # this location is as follows: # # TAGFILES = file1 file2 ... # Adding location for the tag files is done as follows: # # TAGFILES = file1=loc1 "file2 = loc2" ... # where "loc1" and "loc2" can be relative or absolute paths or # URLs. If a location is present for each tag, the installdox tool # does not have to be run to correct the links. # Note that each tag file must have a unique name # (where the name does NOT include the path) # If a tag file is not located in the directory in which doxygen # is run, you must also specify the path to the tagfile here. TAGFILES = # When a file name is specified after GENERATE_TAGFILE, doxygen will create # a tag file that is based on the input files it reads. GENERATE_TAGFILE = # If the ALLEXTERNALS tag is set to YES all external classes will be listed # in the class index. If set to NO only the inherited external classes # will be listed. ALLEXTERNALS = NO # If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed # in the modules index. If set to NO, only the current project's groups will # be listed. EXTERNAL_GROUPS = YES # The PERL_PATH should be the absolute path and name of the perl script # interpreter (i.e. the result of `which perl'). PERL_PATH = /usr/bin/perl #--------------------------------------------------------------------------- # Configuration options related to the dot tool #--------------------------------------------------------------------------- # If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will # generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base # or super classes. Setting the tag to NO turns the diagrams off. Note that # this option also works with HAVE_DOT disabled, but it is recommended to # install and use dot, since it yields more powerful graphs. CLASS_DIAGRAMS = NO # You can define message sequence charts within doxygen comments using the \msc # command. Doxygen will then run the mscgen tool (see # http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the # documentation. The MSCGEN_PATH tag allows you to specify the directory where # the mscgen tool resides. If left empty the tool is assumed to be found in the # default search path. MSCGEN_PATH = # If set to YES, the inheritance and collaboration graphs will hide # inheritance and usage relations if the target is undocumented # or is not a class. HIDE_UNDOC_RELATIONS = YES # If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is # available from the path. This tool is part of Graphviz, a graph visualization # toolkit from AT&T and Lucent Bell Labs. The other options in this section # have no effect if this option is set to NO (the default) HAVE_DOT = YES # The DOT_NUM_THREADS specifies the number of dot invocations doxygen is # allowed to run in parallel. When set to 0 (the default) doxygen will # base this on the number of processors available in the system. You can set it # explicitly to a value larger than 0 to get control over the balance # between CPU load and processing speed. 
DOT_NUM_THREADS        = 0

# By default doxygen will write a font called Helvetica to the output
# directory and reference it in all dot files that doxygen generates.
# When you want a differently looking font you can specify the font name
# using DOT_FONTNAME. You need to make sure dot is able to find the font,
# which can be done by putting it in a standard location or by setting the
# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory
# containing the font.

DOT_FONTNAME           = Helvetica

# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
# The default size is 10pt.

DOT_FONTSIZE           = 11

# By default doxygen will tell dot to use the output directory to look for the
# FreeSans.ttf font (which doxygen will put there itself). If you specify a
# different font using DOT_FONTNAME you can set the path where dot
# can find it using this tag.

DOT_FONTPATH           =

# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
# will generate a graph for each documented class showing the direct and
# indirect inheritance relations. Setting this tag to YES will force the
# CLASS_DIAGRAMS tag to NO.

CLASS_GRAPH            = YES

# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
# will generate a graph for each documented class showing the direct and
# indirect implementation dependencies (inheritance, containment, and
# class references variables) of the class with other documented classes.

COLLABORATION_GRAPH    = NO

# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
# will generate a graph for groups, showing the direct groups dependencies

GROUP_GRAPHS           = YES

# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
# collaboration diagrams in a style similar to the OMG's Unified Modeling
# Language.

UML_LOOK               = YES

# If set to YES, the inheritance and collaboration graphs will show the
# relations between templates and their instances.

TEMPLATE_RELATIONS     = NO

# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
# tags are set to YES then doxygen will generate a graph for each documented
# file showing the direct and indirect include dependencies of the file with
# other documented files.

INCLUDE_GRAPH          = YES

# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
# documented header file showing the documented files that directly or
# indirectly include this file.

INCLUDED_BY_GRAPH      = NO

# If the CALL_GRAPH and HAVE_DOT options are set to YES then
# doxygen will generate a call dependency graph for every global function
# or class method. Note that enabling this option will significantly increase
# the time of a run. So in most cases it will be better to enable call graphs
# for selected functions only using the \callgraph command.

CALL_GRAPH             = NO

# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
# doxygen will generate a caller dependency graph for every global function
# or class method. Note that enabling this option will significantly increase
# the time of a run. So in most cases it will be better to enable caller
# graphs for selected functions only using the \callergraph command.

CALLER_GRAPH           = NO

# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
# will generate a graphical hierarchy of all classes instead of a textual one.
GRAPHICAL_HIERARCHY = YES # If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES # then doxygen will show the dependencies a directory has on other directories # in a graphical way. The dependency relations are determined by the #include # relations between the files in the directories. DIRECTORY_GRAPH = YES # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images # generated by dot. Possible values are svg, png, jpg, or gif. # If left blank png will be used. DOT_IMAGE_FORMAT = png # The tag DOT_PATH can be used to specify the path where the dot tool can be # found. If left blank, it is assumed the dot tool can be found in the path. DOT_PATH = # The DOTFILE_DIRS tag can be used to specify one or more directories that # contain dot files that are included in the documentation (see the # \dotfile command). DOTFILE_DIRS = # The MSCFILE_DIRS tag can be used to specify one or more directories that # contain msc files that are included in the documentation (see the # \mscfile command). MSCFILE_DIRS = # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of # nodes that will be shown in the graph. If the number of nodes in a graph # becomes larger than this value, doxygen will truncate the graph, which is # visualized by representing a node as a red box. Note that doxygen if the # number of direct children of the root node in a graph is already larger than # DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note # that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. DOT_GRAPH_MAX_NODES = 50 # The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the # graphs generated by dot. A depth value of 3 means that only nodes reachable # from the root by following a path via at most 3 edges will be shown. Nodes # that lay further from the root node will be omitted. Note that setting this # option to 1 or 2 may greatly reduce the computation time needed for large # code bases. Also note that the size of a graph can be further restricted by # DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. MAX_DOT_GRAPH_DEPTH = 0 # Set the DOT_TRANSPARENT tag to YES to generate images with a transparent # background. This is disabled by default, because dot on Windows does not # seem to support this out of the box. Warning: Depending on the platform used, # enabling this option may lead to badly anti-aliased labels on the edges of # a graph (i.e. they become hard to read). DOT_TRANSPARENT = NO # Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output # files in one run (i.e. multiple -o and -T options on the command line). This # makes dot run faster, but since only newer versions of dot (>1.8.10) # support this, this feature is disabled by default. DOT_MULTI_TARGETS = NO # If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will # generate a legend page explaining the meaning of the various boxes and # arrows in the dot generated graphs. GENERATE_LEGEND = YES # If the DOT_CLEANUP tag is set to YES (the default) Doxygen will # remove the intermediate dot files that are used to generate # the various graphs. DOT_CLEANUP = YES simka-1.5.3/docker/000077500000000000000000000000001377312000000140645ustar00rootroot00000000000000simka-1.5.3/docker/Dockerfile000066400000000000000000000124341377312000000160620ustar00rootroot00000000000000######################################################################################### # # Docker file for Simka project. 
#
# It prepares a Docker container to run Simka jobs:
#
#   - bin/simka: computing simka results from sequencing data
#   - scripts/visualization/run-visualization.py: making images from results of
#     bin/simka.
#
#########################################################################################
#
# == Docker build command:
#
#    docker build -f Dockerfile -t simka_machine .
#
# == Docker test command:
#
#    docker run --rm -i -t simka_machine -c test
#
#    -> you should see a simka test with some provided data.
#
# == Running a Simka job:
#
#    docker run --rm -i -t simka_machine -c <command> -- <arguments>
#
#    where:
#      <command>: MUST BE one of: simka, visu, test
#      <arguments>: remaining arguments passed in after <command> are passed
#      to the appropriate simka program:
#        - simka: will run 'bin/simka' within the container
#        - visu: will run 'scripts/visualization/run-visualization.py'
#          within the container
#      Please refer to these programs to review their expected arguments.
#      See https://github.com/GATB/simka
#
# == Sample Simka job with provided data:
#
#    docker run --rm -i -t -v $PWD:/tmp simka_machine -c simka -- -in /opt/simka/example/simka_input.txt -out /tmp/simka_results/ -out-tmp /tmp/simka_temp_output
#
#    -> you should have results in the $PWD/simka_results directory when the Simka job is done.
#
#    This command-line explained:
#
#      docker run [1]
#        --rm [2]
#        -i -t [3]
#        -v $PWD:/tmp [4]
#        simka_machine [5]
#        -c simka [6]
#        -- [7]
#        -in /opt/simka/example/simka_input.txt [8]
#        -out /tmp/simka_results/ [9]
#        -out-tmp /tmp/simka_temp_output [10]
#
#    [1]-[5]: Docker arguments
#    [6]-[7]: simka container's invoker program
#    [8]-[10]: 'bin/simka' arguments
#
#    [1]: start Docker container
#    [2]: destroy container when Docker finishes
#         (it does NOT delete the 'simka_machine' image)
#    [3]: start an interactive job
#         (for instance, you'll see messages on stdout, if any)
#    [4]: mount a volume. This is required to get the results from Simka.
#         Here, we say that the current local directory will be viewed as '/tmp'
#         from the inside of the container.
#    [5]: tell Docker which image to start: the 'simka_machine' of course.
#    [6]: ask to start the simka program. The other option is to start the
#         'visu' task (see below). See companion file 'run_simka.sh' for
#         more information.
#    [7]: '--' is required to separate arguments [6] from the rest of the
#         command line
#    [8]: the data file to process with simka. Here we use a data file
#         provided with the simka software to test it.
#    [9]: tells simka where to put results. Of course, simka will write
#         within the /tmp directory inside the container. However, since we
#         have directive [4], data writing is actually done in $PWD, i.e.
#         a local directory.
#    [10]: tells simka where to put temporary files.
#
# == Sample Simka Visualization job with provided data
#
#    After running the previous command, you can do this:
#
#    docker run --rm -i -t -v $PWD:/tmp simka_machine -c visu -- -in /tmp/simka_results/ -out /tmp/simka_results/ -pca -heatmap -tree
#
#    -> you should have PNG files in the $PWD/simka_results directory.
#
# == Additional notes
#
#    Root access inside the container:
#
#    - if running: docker exec -it simka_machine bash
#
#    - if not yet running: docker run --rm -i -t simka_machine bash
#
#########################################################################################

# Simka binary available on GitHub (see below) is built using a
# Debian 8 (jessie) based system on the Inria Jenkins CI platform
FROM debian:jessie

# who to blame?
MAINTAINER Patrick Durand patrick.durand@inria.fr

# ###
# We always use the latest official SIMKA release.
#
ENV SIMKA_VERSION=1.4.0

# ###
# Package installation and configuration
#
RUN apt-get update && apt-get -y dist-upgrade \
    && apt-get install -y --no-install-recommends curl python2.7 r-base \
    && apt-get clean

# ###
# SIMKA installation: get the binary release from GitHub mirror.
#
RUN cd /opt \
    && export SIMKA_TGZ=simka-v${SIMKA_VERSION}-bin-Linux.tar.gz \
    && export GIT_URL=https://github.com/GATB/simka/releases/download \
    && export SIMKA_URL=${GIT_URL}/v${SIMKA_VERSION}/${SIMKA_TGZ} \
    && curl -ksL ${SIMKA_URL} | tar xz \
    && rm -f ${SIMKA_TGZ} \
    && mv simka-v${SIMKA_VERSION}-bin-Linux simka \
    && cd simka/bin \
    && chmod +x simka* \
    && cd ../example \
    && chmod +x *.sh \
    && ./simple_test.sh

COPY run_simka.sh /opt/simka
# Fix: ensure script has exec permission
RUN chmod +x /opt/simka/run_simka.sh

# ###
# Start simka.
#
ENTRYPOINT ["/opt/simka/run_simka.sh"]
simka-1.5.3/docker/README.md000066400000000000000000000123301377312000000153420ustar00rootroot00000000000000
# *SIMKA* and *Docker*

This document explains how you can set up and use *SIMKA* within a Docker container.

## Requirements

Of course, you need to have [Docker](https://docs.docker.com/engine/installation/) installed on your system.

We also suppose that you are familiar with the [docker build](https://docs.docker.com/engine/reference/commandline/build/) and [docker run](https://docs.docker.com/engine/reference/commandline/run/) commands.

Note: this SIMKA *Dockerfile* was made and tested using *Docker version 17* on *Mac OSX Yosemite*. However, it should work with other releases of Docker and other operating systems (Linux, Windows or OSX).

# How to build and run using the command-line?

## Build the container

    docker build -f Dockerfile -t simka_machine .

## Run a Simka job with sample data

    docker run --rm -i -t -v $PWD:/tmp simka_machine -c simka -- -in /opt/simka/example/simka_input.txt -out /tmp/simka_results/ -out-tmp /tmp/simka_temp_output

You should have results in the ```$PWD/simka_results``` directory when the Simka job is done.

## Run a Simka Visualization job with provided data

    docker run --rm -i -t -v $PWD:/tmp simka_machine -c visu -- -in /tmp/simka_results/ -out /tmp/simka_results/ -pca -heatmap -tree

You should have PNG files in the ```$PWD/simka_results``` directory.

## More documentation

Please refer to the documented header of the ```Dockerfile``` located in this directory.

# How to run Simka using the GoDocker platform?

## What is GoDocker?

[GoDocker](http://www.genouest.org/godocker/) is a front-end to execute Docker containers on the [Genouest](http://www.genouest.org) bioinformatics platform. An account is required to access this service.
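Before preparing a GoDocker job, it is worth checking that the Simka image itself is sound. The short session below is a minimal sketch, assuming you built the image locally under the name ```simka_machine``` as described above; the ```-c test``` entry point simply runs the ```simple_test.sh``` script shipped in ```/opt/simka/example``` inside the container:

    # build the image from the docker/ directory of the simka project
    docker build -f Dockerfile -t simka_machine .
    # run the self-test bundled in the image; it should complete without errors
    docker run --rm -i -t simka_machine -c test

Once this test passes, the same image can be tagged and pushed to DockerHub so that GoDocker can pull it (see "Making your own Simka image for GoDocker" at the end of this document).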
## How to prepare a Simka job

* Log in to the GoDocker platform [here](https://godocker.genouest.org/) using your GenOuest credentials
* Click on ```Create Job``` (top-left toolbar)
* Then fill in the new job as follows:
  * Name: ```simka``` *(adapt to your needs)*
  * Description: ```simka job``` *(adapt to your needs)*
  * Tags *(leave empty)*
  * Projects *(leave set to 'default')*
  * Container image: ```pgdurand56/simka140``` *(see ```Comment 1``` below)*
  * Command: *(see ```Comment 2``` below)*
  * CPU: ```4```
  * GPU: *(leave set to '0')*
  * RAM: ```8```
  * Mount volumes: select ```home``` and/or ```omaha``` *(see ```Comment 2``` below)*
  * Advanced options: *(do not modify)*
* Click on [Submit]

### Comment 1: the Simka Docker Image pgdurand56/simka140

In this tutorial you'll use the [pgdurand56/simka140](https://hub.docker.com/r/pgdurand56/simka140/) Docker Image: this is an official Simka 1.4.0 runtime made by a Genscale team member. If you want to use your own, see below.

### Comment 2: the Simka command to use

In order to use the Simka Docker Image, you need to know that:

* GoDocker won't use the default entrypoint defined in the [Simka Dockerfile](https://github.com/GATB/simka/blob/master/docker/Dockerfile). As a consequence, you do not start Simka on GoDocker as you do on the command line.
* GoDocker enables you to access YOUR data located either in your *home directory* or in the *Omaha* storage on the Genocluster machine.

#### Start a Simka data processing Job

So, here is an example of a command to use while setting up a Simka job for GoDocker:

    #!/bin/bash
    /opt/simka/bin/simka -in $GODOCKER_HOME/simka/example/simka_input.txt -out $GODOCKER_HOME/simka/example/simka_results/ -out-tmp $GODOCKER_HOME/simka/example/simka_temp_output

In the above short script, we suppose that the data are located in the home directory of the user (denoted by the variable $GODOCKER\_HOME). Simply adapt paths to your needs. If you want to use data located in Omaha, use '/omaha-beach' instead.

In this script, please DO NOT modify the path ```/opt/simka/bin/simka```: it targets the simka binary within the Simka Docker image.

#### Start a Simka visualization Job

After running a Simka data processing job, you can prepare PNG images using:

    #!/bin/bash
    python2.7 /opt/simka/scripts/visualization/run-visualization.py -in $GODOCKER_HOME/simka/example/simka_results/ -out $GODOCKER_HOME/simka/example/simka_results/ -pca -heatmap -tree

In this script:

* DO NOT modify the path "python2.7 /opt/simka/scripts/visualization/run-visualization.py": it targets a simka python script within the Simka Docker container.
* adapt the use of $GODOCKER\_HOME to your needs; you can also target data located in Omaha using '/omaha-beach'

### Making your own Simka image for GoDocker

On your local computer:

    [1] cd /tmp
        git clone https://github.com/GATB/simka.git
    [2] cd simka/docker
        docker build -f Dockerfile -t simka_machine .
    [3] docker login -u <username> -p <password>
        (e.g. docker login -u pgdurand56 -p xxxx)
    [4] docker tag <image_id> <username>/<image_name>
        (e.g. docker tag 2520e066828a pgdurand56/simka140)
    [5] docker push <username>/<image_name>
        (e.g. docker push pgdurand56/simka140)

Steps are as follows:

    [1] get a copy of the simka project
    [2] build the Simka Docker image
    [3] login to your DockerHub account
    [4] give a name to your Simka Docker Image suitable for DockerHub publication
    [5] push the image to DockerHub

Now, on GoDocker use "<username>/<image_name>" (e.g. pgdurand56/simka140) to access your own Simka Image.
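For convenience, steps [1] to [5] above can be chained in a single shell script. This is a minimal sketch, not an official simka script: ```DOCKER_USER``` and ```IMAGE_NAME``` are illustrative placeholders to adapt to your own DockerHub account, and the image is built directly under its final DockerHub tag, which makes the separate ```docker tag``` step [4] unnecessary:

    #!/usr/bin/env bash
    # Sketch: build a Simka Docker image and publish it to DockerHub for GoDocker.
    set -e
    DOCKER_USER=pgdurand56   # illustrative placeholder: your DockerHub account
    IMAGE_NAME=simka140      # illustrative placeholder: the image name to publish
    cd /tmp
    git clone https://github.com/GATB/simka.git
    cd simka/docker
    # build the image directly under its DockerHub name (merges steps [2] and [4])
    docker build -f Dockerfile -t ${DOCKER_USER}/${IMAGE_NAME} .
    # docker login prompts for your DockerHub password
    docker login -u ${DOCKER_USER}
    docker push ${DOCKER_USER}/${IMAGE_NAME}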
simka-1.5.3/docker/run_simka.sh000066400000000000000000000030431377312000000164100ustar00rootroot00000000000000
#!/usr/bin/env bash
#
# A script to be used within a Docker container: it aims at starting a simka
# task given some parameters.
#
# Use: ./run_simka.sh -c <command> -- <arguments>
#
#   <command>: MUST BE one of: simka, visu, test
#   <arguments>: remaining arguments passed in after <command> are passed
#   to the appropriate simka program:
#     bin/simka
#     scripts/visualization/run-visualization.py
#   Please refer to these programs to review their expected arguments.
#
# Author: Patrick G. Durand, Inria, June 2017
# ========================================================================================
# Section: utility function declarations
# --------
# FUNCTION: display help message
function help(){
  printf "\n$0: a tool to invoke simka within a Docker container.\n\n"
  printf "usage: $0 -c <command> [arguments]\n\n"
  exit 1
}

# ========================================================================================
# Section: Main

# Prepare arguments for processing
while getopts hc: opt
do
  case "$opt" in
    c) COMMAND="$OPTARG";;
    h) help;;
    \?) help;;
  esac
done
shift `expr $OPTIND - 1`

# remaining arguments, if any, are supposed to be the [file ...] part of the command-line
ALL_ARGS=$@

# execute command
case "$COMMAND" in
  test)
    cd /opt/simka/example
    ./simple_test.sh
    ;;
  simka)
    /opt/simka/bin/simka $ALL_ARGS
    ;;
  visu)
    python2.7 /opt/simka/scripts/visualization/run-visualization.py $ALL_ARGS
    ;;
esac
exit 0
simka-1.5.3/example/000077500000000000000000000000001377312000000142505ustar00rootroot00000000000000simka-1.5.3/example/A.fasta000077500000000000000000000261631377312000000154610ustar00rootroot00000000000000
>1
AAAAAACATTAGTTACGGAAGGTGGGTGGAGCGGGGGCCGCCAGTCTATATTCATACTAGAAAGGGGCTAAGGGCATCGCGCTCATGAAGTGGCACTTGCAGAGGTGAGT
>4
AAAAAATTGTCGTTAAGATGAGGAGCTCTTTCGCATTTGACCCATCAAATCTCGGAATGCACTTGAGATCGACCCGTTTGATACAAGCCTTCATCGTCGATAATATATCG
>6
AAAAACCGATGGGGCCGAGCTGTTCTTTGGCCGGGTTACTCTACGCCCACACGGGTACACAGCCGCGAAACGGGAGGCTCGTGCGCGGTCACCTAAGTCCCTGTGGCGGG
>7
AAAAACGGATGAGAAATAAAAGGGGAAATAGCGACATGTCAAATGGCCTCTTGGCTGGCGGTGTCTGGCTGGACTAACCCTCTTAAGGACTTAAAGCGTAGGCAAGGTTA
>10
AAAAAGCTTACAGTGTTCGTAGTTCTGCTCGTGTCGGTATTCTCATACTCACTCCAGGACTTCGGAAACATTAGTGAAAGTGTCACCGGCCGTGCATTTTCCGGAGTAAG
>12
AAAAAGTCTCACTTATTTACGCGCTTGATAGCCCAAACCGTTCACAGCATCTCCAGTAGTCGATTCCGGCTCCTCTCCATGTTGGAGATGTCAGCACGGCAGGTATATGA
>14
AAAAATATTTGAATCCGCTGCTGACTTTTTCAAAATTCGATTATCACTACGGCGGTCAGCATATTTCCTTGAACTTATAACGTATCTCAGTATTGCTGGTTTGGAAACGT
>15
AAAAATGCCCAGTTCGGCGATCCAACTCGTTAGTCTCAGGGTTCCCGCGGCGAACTCCCTTCGGTTGACTTTACGGCCTACACGCTCAGCCCTGTACCCGCATTGATGTC
>100
AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA
>19
AAAACACGCATGTCCGACGGGCGCATTAAACGGCATTGTTGTTCTTGAACCGGAAGCAATAGTCTAGGACCGTTCATATTGACTTACTTTACGCGTGGGCCTTGGATGAA
>22
AAAACCATCTGTTTCCCAGAATGCTCCCGATACTTAATCACCTGCGCTTTGATCGTAGGCACTCACCCTCTCAAGTACCCTTGGACAGATTTAAGATCGATAGTTCGTAT
>26
AAAACGAACCGGACTACATTCCTCATAGGCTTGAGGGGCAAAAGTTACAGTACAGATTTCGGGACCCGTCGCTTTACAGTGTAGACTGTTTTCCGAGAGTGCCTAGTCCA
>28
AAAACGCTCGTGCCCAGTTCGAGCGCCCTGGAGTTTACCGCCCAAGTAAACGCTATTTTTTAAATGACATAGCTCCTTACAGGCGTGGGGGGACGCCTATAGGGCCGTCG
>29
AAAACGGTGCCAACCATTGAAGATAGAAGATTCAATACATTGCACAAACGAATATAACCGGTAGGTAAAGTCTCGTTTCAAAGCGGCTCTATTTGTGCACCTTTTGCTGA
>31
AAAACTCACCACCAGGACAGAAGCTAAAAGACAGGCCGATTCACGGGCGAGCGGTCGAAGCATACTTTTTAGGGCATCATGTGACCACACGTGTTGACCGTCCATCGTTT
>34
AAAACTTTACCGATTTGCAAGATGTAAACAGCACGGGGACGCTATATCGACTGAGTGTTGTAGTGGAATCTACCACCCCGGACATGGGGCTGGACTGATTATTGAACGCT
>38
AAAAGAGTTTGCAGGGTCACTAGCCCGGCTCTGCATACATTGGCAAAGAGGCGATAAGCTGTGAATTCCAGGCTGGGGACGAGCTTGAACCTACTGTGCGAAGTTACCAC >40 AAAAGCCAGAAGGTATGCAGATCGATATCGCTCCAATGTGTCAGTAGCCCGGACGAAGGCCTGCTCGATGATGCGTCATATTAATGTCAGAGAACCTAGTAGCCATCGGT >44 AAAAGGACAACGATGTGTGTGCTTCAAATATTCCTCCAATTCGTTACTGTCGCGGATGTTATGTCGGGCACTCTTAGCTTCCAAGGTGGGTGGATGTTAACGCGCATAGA >46 AAAAGGGGTTTGCTGACCCACAGACAATAGATAGCGGATTACTTACTCCACTTACGAGACTAACGCAATGCTAATAACTTGACGTCTAAACGGGACCACATAAGCCTTTA >47 AAAAGGTATTATATTGATGATTAGCCATTTAGGGAAGATCAACAAGATGAAGCATGACGCGAGAGGTAAGGGATCGACGAACGCTCCTGCCATGGGCAATCCAGGAGGGC >49 AAAAGTCCCTGTCGTACCTCAATCCAGGCCACGTATACTCAGGGGGTACCGAAACCCTAATAACTCTCCCGTTGGTGACCAGATCTGAATCTGCACCGCACACAACTACA >51 AAAAGTGGAAAATACCCTCACTAACAGGGTATCTTGCGTTGAGCTGGTAAGTGCACCAGGGAAAACTTGTTATCCTCGAGTCTGATCGATGCTATCTCATATTTCTGAGT >54 AAAATAGAAATCGCAGCTTGAATGCTGTTACTCAATGTTTATGAACATAGCTCCCGCGTTACTCTGTTTCGACATAGGATGCGCGCACCGGAAGCCTGCTACCACAACAT >57 AAAATCAAACTAGCTACTCCGAAGGCGGGATATTTGCCTGGTAGAGGGAAATGTAGCTCACGGGCCGTTTACTCTTCTTCAGAGCAACTAAGTATTCCGGAAAACCTCAG >60 AAAATCGTGTTCAAGAAAGAGAGGGCCGGGCGCTGAATTGGGCCCGAATCACAAAAAAGTGAGTCGCGCTCCTACAAAGTCCTAATCTAATAGTGGATGATGTGTCTGGT >62 AAAATGCCATACTAAATTAACGGTGTCTCATAGCGACATTGTTATTGTCACCTGACATAGCCAGAAGGTTAAAAGTAGTGCGCGACGCGAAATACCCATGCTGCGGAGTC >67 AAAATTCAGCCTTTCTTACCCAAGGCCTCTCTCGGACAAGAACTATGAAAGCATGCCCTACAGCATACTTTCGCTGACATATGGAGCAGGGAGCCTAAAGGCCGTTTATC >71 AAACAACCACAGATGGCCACTATGTGAAGTTTTGGACGAGTCCATACATTTTTCACTAAGTAAGAAGCTACCTTAATACGTGCACGCATCACATCCTAGACGCTCTGGCT >72 AAACAAGCCGAGAATCTGGCGTGACAAATCCTCCGGAACGGGCTGACCCCACTGTACAAACAATGTAAATCACCGCAGTTTCACTGTACGTTTGGTCTTTTTGATAGACA >74 AAACAATGGACGGAACCATGTTCTGTTACAAGCGCTGACCACACAAGCCGAGAGTACCCCAAGATGATGTCTTAGGATCGTATATACCCTCCATACCCGAGCTTTCCCCG >77 AAACACGTCACCGAGCGCTTAGTGGATCGTACTCAACATGTTGAACAGACATTATCTACATTCGATTCTTCCCATTATGTATCATCGCAGTACACGCCGCTTTCCATTTT >78 AAACACTCGCACAGACCGGTAACCGAGGGAATACAGAATTATAGCCCATATTCGCTGTCCCAAACTGCACCCATCGTTGGCAATTCCGAGACCTCTATTTCCGGTATGCG >79 AAACAGAGACATGTACGTTTTGCGCGGTGGTAGCTCTGGAGTCGGAGGCAGGGTTTTTTGGCCGGCAAAATCAGTATCCGACCTCGTTGGATGACTCCGGAAAACCTTTT >80 AAACAGATCGCTATTAGCACGCGTATGCTTTCACTAGCGAACAAAAGTGCCCCTTTGAGTCCTAGCAGCTACAGTGCCCGTAACTGATATTCTTAAGGCTATTTACAGTT >81 AAACAGGGCACTGGAGGGCAGCCCTTGAACCGCATAGATGGTGGAATTTCATACGGACTGGCGGGCATTATCGGGGTCGTATTGCCTTTGGGGGCATAGCCCACGAGTGC >82 AAACAGTAGAGTTTCATGTCCCTTGTATCGGAGGCAGCGACTCGCTTGAGCAGACCAAGTCCCGTCACTGAGGGTTATCAGTGAGGATACCTTGGTTCAGACAAAAAGAT >83 AAACATACGATCAAGTGTCGAAATTATATCACCGGCATTTGGTCTTTAGATATCTAAAGAAATGGCGCTAGGCCATCTCCCGGGTTTTTTCTGCTTCATGGCTAGATTCG >85 AAACATGATTTCGTACCCCGTGTAGGGCATGTTACCCACGTGAGGCGAGGTATGCGTGGGTCGATGTAGTACCTGTTGACCCGCATTTAGCCTCGACTCAATCTGCTGGA >86 AAACATGCGTTAAATGCACTGCGAGTTGTCCGGTAGCCGTCCAACCTCCTCTAGTACCAAGTAATGGCATTAGCACGCGACAACATGGCTGTAAGGGCCCGTGCGACTAA >87 AAACATTTGCACCTAGGCTTCCTAGGTGTTTGGCTGGAAACGTAGGCAAGGTCAGGTATTCGACACATCGCCCATTATCCGTTACACGAATATACAAGACGAGAGACCGG >90 AAACCAGACGCCTAAGTTGCACTTCGTGTGGACAGTTCACCTGAAAAGGCAGAAAGTTCTGAGATGAGTGCGCGGAGTTACTAACCTAGGCCCTGTACGAGAGCAACATA >91 AAACCATAGATCACAAGTCCACCTCGAGGCGATTATGCATGCCCTTCACTTCTCACGGCTGGATGGGCTTGCCTTAGTCACTTGCGATTGAGTCGTACGATTATAAAGCG >93 AAACCCCCCCGTTACCCTCACTCCGCTCGGCCTAGCGGTGGTCAGGTCAGGAGTTGCAATCGGAGTCACACGATCATACTTTCTCACTGCGCACAACATATCTGCTTGCA >95 AAACCCTAAAGAGGTTACAGATACCATTTTAAAGTCTAGATTCTATGTGGGTATTTGCGGTCGGGACCCGTTCGCGTCCGCGTCAGACTTGCAATCGTGAGCCCGTCACA >97 AAACCGCAGTGCAATCCTCTGGCAACACGGATAGATTCCTTGCTGTAGCAAGACCGACCGCTGTCCCGGGTGCCGCGATGCGCGAGCATGCCCTGCAGGATCCCACACAT >98 
AAACCGCATCGGGCTGGGTACCGGACGGTGCTAAGAGTGCCAGAATGAAGGTAAATAAGGTGGATTGAACATTTTATTAGCTCGTCTCGTGGTGCCATTGCCCAGCATCG >99 AAACCGGGTCAATGTGATTCGTATTACTTGTCAAACAGTACTATCAAACCACCGTTCAGTCGCCCGCTTGATCCCTTGATTCTAGAGGCCATACGGCGCGCCTACTTTTT >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >1 AAAAAACATTAGTTACGGAAGGTGGGTGGAGCGGGGGCCGCCAGTCTATATTCATACTAGAAAGGGGCTAAGGGCATCGCGCTCATGAAGTGGCACTTGCAGAGGTGAGT >4 AAAAAATTGTCGTTAAGATGAGGAGCTCTTTCGCATTTGACCCATCAAATCTCGGAATGCACTTGAGATCGACCCGTTTGATACAAGCCTTCATCGTCGATAATATATCG >6 AAAAACCGATGGGGCCGAGCTGTTCTTTGGCCGGGTTACTCTACGCCCACACGGGTACACAGCCGCGAAACGGGAGGCTCGTGCGCGGTCACCTAAGTCCCTGTGGCGGG >7 AAAAACGGATGAGAAATAAAAGGGGAAATAGCGACATGTCAAATGGCCTCTTGGCTGGCGGTGTCTGGCTGGACTAACCCTCTTAAGGACTTAAAGCGTAGGCAAGGTTA >10 AAAAAGCTTACAGTGTTCGTAGTTCTGCTCGTGTCGGTATTCTCATACTCACTCCAGGACTTCGGAAACATTAGTGAAAGTGTCACCGGCCGTGCATTTTCCGGAGTAAG >12 AAAAAGTCTCACTTATTTACGCGCTTGATAGCCCAAACCGTTCACAGCATCTCCAGTAGTCGATTCCGGCTCCTCTCCATGTTGGAGATGTCAGCACGGCAGGTATATGA >14 AAAAATATTTGAATCCGCTGCTGACTTTTTCAAAATTCGATTATCACTACGGCGGTCAGCATATTTCCTTGAACTTATAACGTATCTCAGTATTGCTGGTTTGGAAACGT >15 AAAAATGCCCAGTTCGGCGATCCAACTCGTTAGTCTCAGGGTTCCCGCGGCGAACTCCCTTCGGTTGACTTTACGGCCTACACGCTCAGCCCTGTACCCGCATTGATGTC >19 AAAACACGCATGTCCGACGGGCGCATTAAACGGCATTGTTGTTCTTGAACCGGAAGCAATAGTCTAGGACCGTTCATATTGACTTACTTTACGCGTGGGCCTTGGATGAA >22 AAAACCATCTGTTTCCCAGAATGCTCCCGATACTTAATCACCTGCGCTTTGATCGTAGGCACTCACCCTCTCAAGTACCCTTGGACAGATTTAAGATCGATAGTTCGTAT >26 AAAACGAACCGGACTACATTCCTCATAGGCTTGAGGGGCAAAAGTTACAGTACAGATTTCGGGACCCGTCGCTTTACAGTGTAGACTGTTTTCCGAGAGTGCCTAGTCCA >28 AAAACGCTCGTGCCCAGTTCGAGCGCCCTGGAGTTTACCGCCCAAGTAAACGCTATTTTTTAAATGACATAGCTCCTTACAGGCGTGGGGGGACGCCTATAGGGCCGTCG >29 AAAACGGTGCCAACCATTGAAGATAGAAGATTCAATACATTGCACAAACGAATATAACCGGTAGGTAAAGTCTCGTTTCAAAGCGGCTCTATTTGTGCACCTTTTGCTGA >31 AAAACTCACCACCAGGACAGAAGCTAAAAGACAGGCCGATTCACGGGCGAGCGGTCGAAGCATACTTTTTAGGGCATCATGTGACCACACGTGTTGACCGTCCATCGTTT >34 AAAACTTTACCGATTTGCAAGATGTAAACAGCACGGGGACGCTATATCGACTGAGTGTTGTAGTGGAATCTACCACCCCGGACATGGGGCTGGACTGATTATTGAACGCT >38 AAAAGAGTTTGCAGGGTCACTAGCCCGGCTCTGCATACATTGGCAAAGAGGCGATAAGCTGTGAATTCCAGGCTGGGGACGAGCTTGAACCTACTGTGCGAAGTTACCAC >40 AAAAGCCAGAAGGTATGCAGATCGATATCGCTCCAATGTGTCAGTAGCCCGGACGAAGGCCTGCTCGATGATGCGTCATATTAATGTCAGAGAACCTAGTAGCCATCGGT >44 AAAAGGACAACGATGTGTGTGCTTCAAATATTCCTCCAATTCGTTACTGTCGCGGATGTTATGTCGGGCACTCTTAGCTTCCAAGGTGGGTGGATGTTAACGCGCATAGA >46 AAAAGGGGTTTGCTGACCCACAGACAATAGATAGCGGATTACTTACTCCACTTACGAGACTAACGCAATGCTAATAACTTGACGTCTAAACGGGACCACATAAGCCTTTA >47 AAAAGGTATTATATTGATGATTAGCCATTTAGGGAAGATCAACAAGATGAAGCATGACGCGAGAGGTAAGGGATCGACGAACGCTCCTGCCATGGGCAATCCAGGAGGGC >49 AAAAGTCCCTGTCGTACCTCAATCCAGGCCACGTATACTCAGGGGGTACCGAAACCCTAATAACTCTCCCGTTGGTGACCAGATCTGAATCTGCACCGCACACAACTACA >51 AAAAGTGGAAAATACCCTCACTAACAGGGTATCTTGCGTTGAGCTGGTAAGTGCACCAGGGAAAACTTGTTATCCTCGAGTCTGATCGATGCTATCTCATATTTCTGAGT >54 AAAATAGAAATCGCAGCTTGAATGCTGTTACTCAATGTTTATGAACATAGCTCCCGCGTTACTCTGTTTCGACATAGGATGCGCGCACCGGAAGCCTGCTACCACAACAT >57 AAAATCAAACTAGCTACTCCGAAGGCGGGATATTTGCCTGGTAGAGGGAAATGTAGCTCACGGGCCGTTTACTCTTCTTCAGAGCAACTAAGTATTCCGGAAAACCTCAG >60 AAAATCGTGTTCAAGAAAGAGAGGGCCGGGCGCTGAATTGGGCCCGAATCACAAAAAAGTGAGTCGCGCTCCTACAAAGTCCTAATCTAATAGTGGATGATGTGTCTGGT >62 AAAATGCCATACTAAATTAACGGTGTCTCATAGCGACATTGTTATTGTCACCTGACATAGCCAGAAGGTTAAAAGTAGTGCGCGACGCGAAATACCCATGCTGCGGAGTC >67 AAAATTCAGCCTTTCTTACCCAAGGCCTCTCTCGGACAAGAACTATGAAAGCATGCCCTACAGCATACTTTCGCTGACATATGGAGCAGGGAGCCTAAAGGCCGTTTATC >71 
AAACAACCACAGATGGCCACTATGTGAAGTTTTGGACGAGTCCATACATTTTTCACTAAGTAAGAAGCTACCTTAATACGTGCACGCATCACATCCTAGACGCTCTGGCT >72 AAACAAGCCGAGAATCTGGCGTGACAAATCCTCCGGAACGGGCTGACCCCACTGTACAAACAATGTAAATCACCGCAGTTTCACTGTACGTTTGGTCTTTTTGATAGACA >74 AAACAATGGACGGAACCATGTTCTGTTACAAGCGCTGACCACACAAGCCGAGAGTACCCCAAGATGATGTCTTAGGATCGTATATACCCTCCATACCCGAGCTTTCCCCG >77 AAACACGTCACCGAGCGCTTAGTGGATCGTACTCAACATGTTGAACAGACATTATCTACATTCGATTCTTCCCATTATGTATCATCGCAGTACACGCCGCTTTCCATTTT >78 AAACACTCGCACAGACCGGTAACCGAGGGAATACAGAATTATAGCCCATATTCGCTGTCCCAAACTGCACCCATCGTTGGCAATTCCGAGACCTCTATTTCCGGTATGCG >79 AAACAGAGACATGTACGTTTTGCGCGGTGGTAGCTCTGGAGTCGGAGGCAGGGTTTTTTGGCCGGCAAAATCAGTATCCGACCTCGTTGGATGACTCCGGAAAACCTTTT >80 AAACAGATCGCTATTAGCACGCGTATGCTTTCACTAGCGAACAAAAGTGCCCCTTTGAGTCCTAGCAGCTACAGTGCCCGTAACTGATATTCTTAAGGCTATTTACAGTT >81 AAACAGGGCACTGGAGGGCAGCCCTTGAACCGCATAGATGGTGGAATTTCATACGGACTGGCGGGCATTATCGGGGTCGTATTGCCTTTGGGGGCATAGCCCACGAGTGC >82 AAACAGTAGAGTTTCATGTCCCTTGTATCGGAGGCAGCGACTCGCTTGAGCAGACCAAGTCCCGTCACTGAGGGTTATCAGTGAGGATACCTTGGTTCAGACAAAAAGAT >83 AAACATACGATCAAGTGTCGAAATTATATCACCGGCATTTGGTCTTTAGATATCTAAAGAAATGGCGCTAGGCCATCTCCCGGGTTTTTTCTGCTTCATGGCTAGATTCG >85 AAACATGATTTCGTACCCCGTGTAGGGCATGTTACCCACGTGAGGCGAGGTATGCGTGGGTCGATGTAGTACCTGTTGACCCGCATTTAGCCTCGACTCAATCTGCTGGA >86 AAACATGCGTTAAATGCACTGCGAGTTGTCCGGTAGCCGTCCAACCTCCTCTAGTACCAAGTAATGGCATTAGCACGCGACAACATGGCTGTAAGGGCCCGTGCGACTAA >87 AAACATTTGCACCTAGGCTTCCTAGGTGTTTGGCTGGAAACGTAGGCAAGGTCAGGTATTCGACACATCGCCCATTATCCGTTACACGAATATACAAGACGAGAGACCGG >90 AAACCAGACGCCTAAGTTGCACTTCGTGTGGACAGTTCACCTGAAAAGGCAGAAAGTTCTGAGATGAGTGCGCGGAGTTACTAACCTAGGCCCTGTACGAGAGCAACATA >91 AAACCATAGATCACAAGTCCACCTCGAGGCGATTATGCATGCCCTTCACTTCTCACGGCTGGATGGGCTTGCCTTAGTCACTTGCGATTGAGTCGTACGATTATAAAGCG >93 AAACCCCCCCGTTACCCTCACTCCGCTCGGCCTAGCGGTGGTCAGGTCAGGAGTTGCAATCGGAGTCACACGATCATACTTTCTCACTGCGCACAACATATCTGCTTGCA >95 AAACCCTAAAGAGGTTACAGATACCATTTTAAAGTCTAGATTCTATGTGGGTATTTGCGGTCGGGACCCGTTCGCGTCCGCGTCAGACTTGCAATCGTGAGCCCGTCACA >97 AAACCGCAGTGCAATCCTCTGGCAACACGGATAGATTCCTTGCTGTAGCAAGACCGACCGCTGTCCCGGGTGCCGCGATGCGCGAGCATGCCCTGCAGGATCCCACACAT >98 AAACCGCATCGGGCTGGGTACCGGACGGTGCTAAGAGTGCCAGAATGAAGGTAAATAAGGTGGATTGAACATTTTATTAGCTCGTCTCGTGGTGCCATTGCCCAGCATCG >99 AAACCGGGTCAATGTGATTCGTATTACTTGTCAAACAGTACTATCAAACCACCGTTCAGTCGCCCGCTTGATCCCTTGATTCTAGAGGCCATACGGCGCGCCTACTTTTT >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >91 AAACCATAGATCACAAGTCCACCTCGAGGCGATTATGCATGCCCTTCACTTCTCACGGCTGGATGGGCTTGCCTTAGTCACTTGCGATTGAGTCGTACGATTATAAAGCG >93 AAACCCCCCCGTTACCCTCACTCCGCTCGGCCTAGCGGTGGTCAGGTCAGGAGTTGCAATCGGAGTCACACGATCATACTTTCTCACTGCGCACAACATATCTGCTTGCAsimka-1.5.3/example/B.fasta000077500000000000000000000267071377312000000154700ustar00rootroot00000000000000>0 AAAAAAACTCTACAGCGAGCAGTGTTAAAATCGTGGCGCACCCAGACAGCCACTTCGCCGTTCTAAGTGGCCGATCGTAGCACATCGGATGACCTTGGTTGGTACGACAT >2 AAAAAAGAGACGAGCCACGCGGTGCGCCTGAACGTTGGGTCCAGACCACACTTATGGATTCGACGGGGCACCTATCAGGTTCTCCATCGTATAGTCGTCTGTAGGTCTGA >3 AAAAAATCGCTAGGGGGGATGGCCATCAACCCCCCCTCCCGTACCTATGATAGTGGGATCAGATTTAAGCACGGGCCCTACGACTCCCCTTCATGGAATAGGCTAAGGTG >4 AAAAAATTGTCGTTAAGATGAGGAGCTCTTTCGCATTTGACCCATCAAATCTCGGAATGCACTTGAGATCGACCCGTTTGATACAAGCCTTCATCGTCGATAATATATCG >5 AAAAACATTGCGGACTACCGTCGTTGCAGTGGGTCGCCCATTCTAGGCTGCGAGTTCATATGTGTGCCTGTCGCTTAGGGCAATCCTCGGATTGGCTGTTTAACAGGGGT >6 AAAAACCGATGGGGCCGAGCTGTTCTTTGGCCGGGTTACTCTACGCCCACACGGGTACACAGCCGCGAAACGGGAGGCTCGTGCGCGGTCACCTAAGTCCCTGTGGCGGG >7 AAAAACGGATGAGAAATAAAAGGGGAAATAGCGACATGTCAAATGGCCTCTTGGCTGGCGGTGTCTGGCTGGACTAACCCTCTTAAGGACTTAAAGCGTAGGCAAGGTTA >8 
AAAAACTTTGACTTTTTCAAGACATGAAAGGATGCGGGCTCATACTGGACGGGTTCATTCCTACCGCGGAACGAAGGGCTATTTTTTGTTTGGGCGAGAGTACATCCGTC >9 AAAAAGACTCAGCTTGACATGGCGGTCTGAGCTTTGCTTGGGCTCTTACTATGTCAGGGTTGGAAACTATGGCAGAAGGGCTTCTCGCATCCTCACGGCTCGAATTAGCT >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >11 AAAAAGGCAAGGATCTTTGACCACGGCAGTTTGCAATAGTCAATTACGCTTCCCTGAGTACAAACAATGGTCTAATGCTGTTCAGTTGGGGTTAACTGGCCTGACGCTCT >13 AAAAATAAAAATGGCCTAGGTCCAACTCGTCCCCGGGGTAAGTTAGTGTAAGAGCTTGGAGCAAGTCTGTTCCTCGTCTGTCCCAGTAACACTCGGGTCTACGGTCGCAG >16 AAAAATTAGTCAGGTTACCCCCAATTAGGTGAAATACGTCGAAGGGTCGCGTCCAAGAAAGAATGATAGCTGACAGTTCTCTAGGTATTTATATTTGTTTGCATCGACTG >17 AAAACAAATATCCTGAATTCATAGAGCCGTGACTTACAGTTCTATGAAAAGTGTCTGGCAAGGGAGATTTCACGTTTCCCTGTATAGGGTCGTCGTATTGCCCACCATTT >18 AAAACAATGCTGAAGACCCTAATGCGTAACCAACAATGTAAGACTGGCACGTATTCTATGATATCTTATTGGCACTCCATCGCGAGGATACTAATAGACACCTAAAAGGA >19 AAAACACGCATGTCCGACGGGCGCATTAAACGGCATTGTTGTTCTTGAACCGGAAGCAATAGTCTAGGACCGTTCATATTGACTTACTTTACGCGTGGGCCTTGGATGAA >20 AAAACAGTCATTGTCTGATTTAGCATCGGGTGCACCGAAACGCTATCGCCTTCTCGTGAATTCGCAATTCAGCTCCAAGCATCAATCAGAATCACATGCCGCACACAGAA >21 AAAACATAGTTATAATGTTTTGAGTGTAAATGCTTACATCGAAATATTCGACGTACTACACCGTGAGCCAAGACCCTGACTTCGAACCATGCGCTCCTGTGAGTGTGACG >23 AAAACCCACCGTGCCGGTTTAAGAGCAGTAGCTGTTGGTGTTGTGTCCGCGTGGCATCGGAAAGTCGCTACAACGCCGGAAGCCGGGGATTTACAGAATACGTTATACGA >24 AAAACCGGTTCGTGAAGTCCGAAGGAATTCCACAACGCACTGCATGACACCTGGAAGAAGAAGGCCTTTTCCACGCCCTGAACGTAATTTCTGGTAAAGCAAGTGCTCCA >25 AAAACCTGATCGTGTTCATGAGTCTGGTATAGACGGATCCTTGGGCCAAAGTCTTCCGGTCTTCTGGCCGCCTTCAGGAGTCTAATTACCTGAACCTCATCGTAATTGCT >27 AAAACGATTATTTATCATTTACCGCCTTAGAGTGTGGCTTATATAGCATGGGTTTGATTTGAGTGGGACAACAGATCCATTTGATGCAGTATGTATTAGCGGATCATGAC >28 AAAACGCTCGTGCCCAGTTCGAGCGCCCTGGAGTTTACCGCCCAAGTAAACGCTATTTTTTAAATGACATAGCTCCTTACAGGCGTGGGGGGACGCCTATAGGGCCGTCG >30 AAAACGTGATGTAATCGGACGCATTTGACCAGACGTAGCCACCTTATTCGGTGCGTGCCATGACCCGAGAGCGCCAGGGATGCTTCTCGTTCCGGGTCACTGATAAATAG >31 AAAACTCACCACCAGGACAGAAGCTAAAAGACAGGCCGATTCACGGGCGAGCGGTCGAAGCATACTTTTTAGGGCATCATGTGACCACACGTGTTGACCGTCCATCGTTT >32 AAAACTCCTCCGCCGAGGAGGCACTTAGCCTCGTATGGATGCTTAAGGCATGATCGAGCCGGCCGGAAATCTCCTACCGTCTAATTAGGGGCATTGAAGTCCGGTTCCGC >33 AAAACTGTCAGCTCTAATCGAATGCTTGGAAGTTCTGTCCAAAGTGTTGCGAAGCCGAGCTTGAACGTATATAATAACTGCGGTCCTCATACCGGAACAAGTTTACTGCC >35 AAAACTTTTGGATCGCCATGTGACTAATTCCTATAATTACAATCTGTCATTAGTCGGAGCGGTGCGAGATGTGAGTAGTATAGTCGACGCGGCTAATCGAGGCAGATTCC >36 AAAAGACATCGCTAAGTAGTCGATAACTTTAGGTCTGGCTCAGCGAAGTCCGCGCACCGAGGTACGCGATGAACGTGTAGTAGCTGTGCTGCCGACTCTGAGGCGGTAGC >37 AAAAGACGTGACAGAGGCGATGATACCGCAGACGATACGCCACTACAGCTAAAGAGTCTGTCTAGAAATGCCTAGCGGCACCTGGCGCCGCCGTCTAATGGAGTGCAAGC >38 AAAAGAGTTTGCAGGGTCACTAGCCCGGCTCTGCATACATTGGCAAAGAGGCGATAAGCTGTGAATTCCAGGCTGGGGACGAGCTTGAACCTACTGTGCGAAGTTACCAC >39 AAAAGATGCACCGGCTATCTTAGTTCGTTCCAGGCCAACGAGTGTGACTATAGACGAGTTCTGCTCAGAACGGACTAGGCTTCGGGTGTCACGCGGGATCATATTATCTT >41 AAAAGCCTGGCCTTAACAAAATCTAGGTGCGTCTCGAATCGAACGGAAAACAACGTCTGGTTTACTGAGAAATCCTAGGATGCTGCTGGCTATTTGACCTCACGGGGGTT >42 AAAAGCGTGCCGAAGATGCCGTAATTTCCGAGGATGCACTCTCGTGACATCTCTTTTTAACGAACAAATTGCAGAGGTCAAGGTGATCGAGGCACGCTATGCTAAGCACT >43 AAAAGGAAAGGGAGAACGAATGATTGTTTCCAGGTATCAGGAAGCAACAAAATATAATCGATTCGTCACTGTGAGCCAACAGGCGTGTATGTCTGCGTCAAGCGTGCATC >45 AAAAGGCACGCATCGTCATCTGAACAGCAAAGTTGGGCGTTTCCGCCAATAAAGCGTTTCCCTTCCATTTTATTGTACTAGGAAGAAACCACTCCTATAAGCAAACAAGT >48 
AAAAGTACTAACTTATCACGAACCGCTTTTGACGTCTTAATTACAGGTTGGGTGACGCGGCATGTCAGGGGCAAACTAACTATGATATCCACGGAACTGCCGACGACTAA >50 AAAAGTCGGAGTCGATAATGATATAGCGGCACGACTCGAACCCGCTTCGCAGCTCATCTCAGGAGATAGGCCTCGAACCTTCCCTGGTATGTACTCGGAGGCTCTCACCC >51 AAAAGTGGAAAATACCCTCACTAACAGGGTATCTTGCGTTGAGCTGGTAAGTGCACCAGGGAAAACTTGTTATCCTCGAGTCTGATCGATGCTATCTCATATTTCTGAGT >52 AAAAGTTTAGTAAGTTGGGTACGAGGCGTTATGGAGGGTTGCGTCGCTTTCACGAGCCTCATCGATAGCATACCTGTCGCAGATGTATTCAATGGTAACATGACGGTTTT >53 AAAATAATTTTCTGCAGGACCTCTGCTTCGGGAGAAAATATATTTAGATCTCCACCCGGAACCGCTCGCGACTTCACGAGCATCGGGTAGGACTTCGGCCGCTTGGATGC >55 AAAATAGACCTTTCCCCGAAACTGTATGCTAAGACTGTGAGGCGGCGAACGGTCTTTGTTCCTCAGTTAACTGACAACTCACAACAGCGCATAACGAATCACATGCCAGG >56 AAAATATTATTGATCTGACACTACAGAACTTTCTCGTTGACATCGTGCATTGAACTATCAGATGCCCAACCGAGTGGCGGCGATCGCGTTTCGAGTATCACGCGGGTTCC >58 AAAATCCCACCGACGCACTCAGCTAAGTTGATGCATAACAACGTTTGAGCGCTACCTGAGTTAGTTGCAAGCTCGGTCAACGTGTAAAATGTCCATCAGAGTCACCTCAT >59 AAAATCCTAATCCAGCACACGAGCCTCAGTCAGGTTCAAGGTGCGGCTTACTCTGCCGACACCAGCAAAGATATACTCGGGCAGGGAGATTAAATTGGGTTTGTCGACCT >61 AAAATGACACTAACAACGTTGACCGAGAATGAACGTCTAAACCCTTAGTGTGAATTCGTTTCAATATGTACAGGGCCCTCTGGCATATCCCGCTGCCCGGGCTAATGTCA >63 AAAATGCGAAGGTATTCTGTAGAGGGGAATAACTGGGCTTCCATCCCCAGAGCTAACACAGCCGACTACACACTACATAGATGGTCGGGGGTGGTCCGCCGGAAGACGCT >64 AAAATGGCTTTAGGCTAGTAGTAATCTAATGTGTAACAAAGTCTTGTGGCCCGATCGTTATATCTCTGGCACGATCGGTTGGCGGTTTTTCTAGATTACCTTACGCGATA >65 AAAATGTGGTCGGAGCCGCGTACATTATGTGTGGCTTCACCTATATCTAGGGGAGTTCCCGGCCTAGCACACCAGCGGTCCGTAGGAACCGCGCCCGCCAAACCGAGCAC >66 AAAATTACGGCGCAACTGTTGGCTTCTTCATTCCCTGTTAGGTCCAAGAGCTGACAGGTCATATCTAATTCGACAGTTGCTAGATCGTAGTGAGTTAAGTATTCGTGGAG >68 AAAATTGCGTAGTTAGAACGACGAGCATTCTAATGTACAACCTATAATAAATAACGGGCCCTTGTTGCCTAACCAACAACAGTACCGCCAGGCCACTCCGCTAAGGTCAG >1 AAAAAACATTAGTTACGGAAGGTGGGTGGAGCGGGGGCCGCCAGTCTATATTCATACTAGAAAGGGGCTAAGGGCATCGCGCTCATGAAGTGGCACTTGCAGAGGTGAGT >4 AAAAAATTGTCGTTAAGATGAGGAGCTCTTTCGCATTTGACCCATCAAATCTCGGAATGCACTTGAGATCGACCCGTTTGATACAAGCCTTCATCGTCGATAATATATCG >6 AAAAACCGATGGGGCCGAGCTGTTCTTTGGCCGGGTTACTCTACGCCCACACGGGTACACAGCCGCGAAACGGGAGGCTCGTGCGCGGTCACCTAAGTCCCTGTGGCGGG >7 AAAAACGGATGAGAAATAAAAGGGGAAATAGCGACATGTCAAATGGCCTCTTGGCTGGCGGTGTCTGGCTGGACTAACCCTCTTAAGGACTTAAAGCGTAGGCAAGGTTA >10 AAAAAGCTTACAGTGTTCGTAGTTCTGCTCGTGTCGGTATTCTCATACTCACTCCAGGACTTCGGAAACATTAGTGAAAGTGTCACCGGCCGTGCATTTTCCGGAGTAAG >12 AAAAAGTCTCACTTATTTACGCGCTTGATAGCCCAAACCGTTCACAGCATCTCCAGTAGTCGATTCCGGCTCCTCTCCATGTTGGAGATGTCAGCACGGCAGGTATATGA >14 AAAAATATTTGAATCCGCTGCTGACTTTTTCAAAATTCGATTATCACTACGGCGGTCAGCATATTTCCTTGAACTTATAACGTATCTCAGTATTGCTGGTTTGGAAACGT >15 AAAAATGCCCAGTTCGGCGATCCAACTCGTTAGTCTCAGGGTTCCCGCGGCGAACTCCCTTCGGTTGACTTTACGGCCTACACGCTCAGCCCTGTACCCGCATTGATGTC >19 AAAACACGCATGTCCGACGGGCGCATTAAACGGCATTGTTGTTCTTGAACCGGAAGCAATAGTCTAGGACCGTTCATATTGACTTACTTTACGCGTGGGCCTTGGATGAA >22 AAAACCATCTGTTTCCCAGAATGCTCCCGATACTTAATCACCTGCGCTTTGATCGTAGGCACTCACCCTCTCAAGTACCCTTGGACAGATTTAAGATCGATAGTTCGTAT >26 AAAACGAACCGGACTACATTCCTCATAGGCTTGAGGGGCAAAAGTTACAGTACAGATTTCGGGACCCGTCGCTTTACAGTGTAGACTGTTTTCCGAGAGTGCCTAGTCCA >28 AAAACGCTCGTGCCCAGTTCGAGCGCCCTGGAGTTTACCGCCCAAGTAAACGCTATTTTTTAAATGACATAGCTCCTTACAGGCGTGGGGGGACGCCTATAGGGCCGTCG >29 AAAACGGTGCCAACCATTGAAGATAGAAGATTCAATACATTGCACAAACGAATATAACCGGTAGGTAAAGTCTCGTTTCAAAGCGGCTCTATTTGTGCACCTTTTGCTGA >31 AAAACTCACCACCAGGACAGAAGCTAAAAGACAGGCCGATTCACGGGCGAGCGGTCGAAGCATACTTTTTAGGGCATCATGTGACCACACGTGTTGACCGTCCATCGTTT >34 AAAACTTTACCGATTTGCAAGATGTAAACAGCACGGGGACGCTATATCGACTGAGTGTTGTAGTGGAATCTACCACCCCGGACATGGGGCTGGACTGATTATTGAACGCT >38 
AAAAGAGTTTGCAGGGTCACTAGCCCGGCTCTGCATACATTGGCAAAGAGGCGATAAGCTGTGAATTCCAGGCTGGGGACGAGCTTGAACCTACTGTGCGAAGTTACCAC >40 AAAAGCCAGAAGGTATGCAGATCGATATCGCTCCAATGTGTCAGTAGCCCGGACGAAGGCCTGCTCGATGATGCGTCATATTAATGTCAGAGAACCTAGTAGCCATCGGT >44 AAAAGGACAACGATGTGTGTGCTTCAAATATTCCTCCAATTCGTTACTGTCGCGGATGTTATGTCGGGCACTCTTAGCTTCCAAGGTGGGTGGATGTTAACGCGCATAGA >46 AAAAGGGGTTTGCTGACCCACAGACAATAGATAGCGGATTACTTACTCCACTTACGAGACTAACGCAATGCTAATAACTTGACGTCTAAACGGGACCACATAAGCCTTTA >47 AAAAGGTATTATATTGATGATTAGCCATTTAGGGAAGATCAACAAGATGAAGCATGACGCGAGAGGTAAGGGATCGACGAACGCTCCTGCCATGGGCAATCCAGGAGGGC >49 AAAAGTCCCTGTCGTACCTCAATCCAGGCCACGTATACTCAGGGGGTACCGAAACCCTAATAACTCTCCCGTTGGTGACCAGATCTGAATCTGCACCGCACACAACTACA >51 AAAAGTGGAAAATACCCTCACTAACAGGGTATCTTGCGTTGAGCTGGTAAGTGCACCAGGGAAAACTTGTTATCCTCGAGTCTGATCGATGCTATCTCATATTTCTGAGT >54 AAAATAGAAATCGCAGCTTGAATGCTGTTACTCAATGTTTATGAACATAGCTCCCGCGTTACTCTGTTTCGACATAGGATGCGCGCACCGGAAGCCTGCTACCACAACAT >57 AAAATCAAACTAGCTACTCCGAAGGCGGGATATTTGCCTGGTAGAGGGAAATGTAGCTCACGGGCCGTTTACTCTTCTTCAGAGCAACTAAGTATTCCGGAAAACCTCAG >60 AAAATCGTGTTCAAGAAAGAGAGGGCCGGGCGCTGAATTGGGCCCGAATCACAAAAAAGTGAGTCGCGCTCCTACAAAGTCCTAATCTAATAGTGGATGATGTGTCTGGT >62 AAAATGCCATACTAAATTAACGGTGTCTCATAGCGACATTGTTATTGTCACCTGACATAGCCAGAAGGTTAAAAGTAGTGCGCGACGCGAAATACCCATGCTGCGGAGTC >67 AAAATTCAGCCTTTCTTACCCAAGGCCTCTCTCGGACAAGAACTATGAAAGCATGCCCTACAGCATACTTTCGCTGACATATGGAGCAGGGAGCCTAAAGGCCGTTTATC >71 AAACAACCACAGATGGCCACTATGTGAAGTTTTGGACGAGTCCATACATTTTTCACTAAGTAAGAAGCTACCTTAATACGTGCACGCATCACATCCTAGACGCTCTGGCT >72 AAACAAGCCGAGAATCTGGCGTGACAAATCCTCCGGAACGGGCTGACCCCACTGTACAAACAATGTAAATCACCGCAGTTTCACTGTACGTTTGGTCTTTTTGATAGACA >74 AAACAATGGACGGAACCATGTTCTGTTACAAGCGCTGACCACACAAGCCGAGAGTACCCCAAGATGATGTCTTAGGATCGTATATACCCTCCATACCCGAGCTTTCCCCG >77 AAACACGTCACCGAGCGCTTAGTGGATCGTACTCAACATGTTGAACAGACATTATCTACATTCGATTCTTCCCATTATGTATCATCGCAGTACACGCCGCTTTCCATTTT >78 AAACACTCGCACAGACCGGTAACCGAGGGAATACAGAATTATAGCCCATATTCGCTGTCCCAAACTGCACCCATCGTTGGCAATTCCGAGACCTCTATTTCCGGTATGCG >79 AAACAGAGACATGTACGTTTTGCGCGGTGGTAGCTCTGGAGTCGGAGGCAGGGTTTTTTGGCCGGCAAAATCAGTATCCGACCTCGTTGGATGACTCCGGAAAACCTTTT >80 AAACAGATCGCTATTAGCACGCGTATGCTTTCACTAGCGAACAAAAGTGCCCCTTTGAGTCCTAGCAGCTACAGTGCCCGTAACTGATATTCTTAAGGCTATTTACAGTT >81 AAACAGGGCACTGGAGGGCAGCCCTTGAACCGCATAGATGGTGGAATTTCATACGGACTGGCGGGCATTATCGGGGTCGTATTGCCTTTGGGGGCATAGCCCACGAGTGC >82 AAACAGTAGAGTTTCATGTCCCTTGTATCGGAGGCAGCGACTCGCTTGAGCAGACCAAGTCCCGTCACTGAGGGTTATCAGTGAGGATACCTTGGTTCAGACAAAAAGAT >83 AAACATACGATCAAGTGTCGAAATTATATCACCGGCATTTGGTCTTTAGATATCTAAAGAAATGGCGCTAGGCCATCTCCCGGGTTTTTTCTGCTTCATGGCTAGATTCG >85 AAACATGATTTCGTACCCCGTGTAGGGCATGTTACCCACGTGAGGCGAGGTATGCGTGGGTCGATGTAGTACCTGTTGACCCGCATTTAGCCTCGACTCAATCTGCTGGA >86 AAACATGCGTTAAATGCACTGCGAGTTGTCCGGTAGCCGTCCAACCTCCTCTAGTACCAAGTAATGGCATTAGCACGCGACAACATGGCTGTAAGGGCCCGTGCGACTAA >87 AAACATTTGCACCTAGGCTTCCTAGGTGTTTGGCTGGAAACGTAGGCAAGGTCAGGTATTCGACACATCGCCCATTATCCGTTACACGAATATACAAGACGAGAGACCGG >90 AAACCAGACGCCTAAGTTGCACTTCGTGTGGACAGTTCACCTGAAAAGGCAGAAAGTTCTGAGATGAGTGCGCGGAGTTACTAACCTAGGCCCTGTACGAGAGCAACATA >91 AAACCATAGATCACAAGTCCACCTCGAGGCGATTATGCATGCCCTTCACTTCTCACGGCTGGATGGGCTTGCCTTAGTCACTTGCGATTGAGTCGTACGATTATAAAGCG >93 AAACCCCCCCGTTACCCTCACTCCGCTCGGCCTAGCGGTGGTCAGGTCAGGAGTTGCAATCGGAGTCACACGATCATACTTTCTCACTGCGCACAACATATCTGCTTGCA >95 AAACCCTAAAGAGGTTACAGATACCATTTTAAAGTCTAGATTCTATGTGGGTATTTGCGGTCGGGACCCGTTCGCGTCCGCGTCAGACTTGCAATCGTGAGCCCGTCACA >97 AAACCGCAGTGCAATCCTCTGGCAACACGGATAGATTCCTTGCTGTAGCAAGACCGACCGCTGTCCCGGGTGCCGCGATGCGCGAGCATGCCCTGCAGGATCCCACACAT >98 
AAACCGCATCGGGCTGGGTACCGGACGGTGCTAAGAGTGCCAGAATGAAGGTAAATAAGGTGGATTGAACATTTTATTAGCTCGTCTCGTGGTGCCATTGCCCAGCATCG >99 AAACCGGGTCAATGTGATTCGTATTACTTGTCAAACAGTACTATCAAACCACCGTTCAGTCGCCCGCTTGATCCCTTGATTCTAGAGGCCATACGGCGCGCCTACTTTTT >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >91 AAACCATAGATCACAAGTCCACCTCGAGGCGATTATGCATGCCCTTCACTTCTCACGGCTGGATGGGCTTGCCTTAGTCACTTGCGATTGAGTCGTACGATTATAAAGCG >93 AAACCCCCCCGTTACCCTCACTCCGCTCGGCCTAGCGGTGGTCAGGTCAGGAGTTGCAATCGGAGTCACACGATCATACTTTCTCACTGCGCACAACATATCTGCTTGCAsimka-1.5.3/example/C.fasta000077500000000000000000000257761377312000000154760ustar00rootroot00000000000000>0 AAAAAAACCGAAGTCTCTTGCAGTATGCACAGGAGATCGCGGAGAACCGGAATCCCCTGGTTCGACTGTGACTCTTGGAGCGTCGCTAATCGCGTCGGTAACTTTAATTT >2 AAAAAAGCGGAATCACTGAAGACGTCATTTCCTCTTCAGAAAAAGTTTACCATGTTCTATTCTTGACCCTACGAATCAAGGTTCCGTTTAGCGTGTCGTTATAGAACAAT >3 AAAAAATACAGTATATGCCCCGAACTAGCCATCCGGATCCCAATAATCAAATCACCGTGACTGATGGTCAGATATCTTCTCCGGGATTGTGGGGAACGCCTACTTCGTGG >4 AAAAACAAGGGCTGAGCATCTGCGACATAAGTATCCCGTACATTATGTTCGATTGTAGTTAACTAAATTGTCCTACACTAGCGACCCTAAACCACCGTTACTCGAGGGCA >7 AAAAACGGTGGTTTGCATGTTAGGCGGAAGATCCCAATCCCTTGAAGAATCACGGAAGTAAGTTACTAGTGAAGGTTACTTTGCGATCTAAGGCGCTCGTGGGTGTCACA >8 AAAAACTGGCTACATTTAGACTTTTCCCAGCCTTGCCTACATGCCTAGCACTGAATCAAATACCCGGCTACCGGGTAGCGACCCCACAATATCAAGTTGCTTCTCTGAGT >10 AAAAAGCAGTCGGAAGACCTTTAGCTGTCCTTTAGCAGTGACTCTCCGCCCGTCTCAATGCAAAGAGATTCTGCGCATTGCTCCTTGTCCTAAGACACCATTAAGCGAAT >12 AAAAAGTTAGTCCTAAATTGTGGGCATCGGGTTGGCTGAATTAGACTGTGTGATCATATTCTACTTTTCGTCGACAGGATCTGGGCCGATTAGGGACATGTAAAGTGTCC >13 AAAAATAAGAAGCCTTACGAGCGTTGCGGGTTCGTTCTATATAAGGGCAGTACGTTTAGTTTACCCATCTGGGGTTCTTGTTACAACATGGAGTCGACTGTGTCTTTTAC >14 AAAAATCCGTTTAAGTCCACACCCCATATCTTTTGCAAGTTACGTCTTCACTTGTGTACTGTAGACTGCTGGGGACTAGAGTCCATCGTTGTGACTTAAAGGACGCTTTC >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >15 AAAAATGAATGATCCACGCCCGGTCAGTGTAGATCACTGAGCCTCGGCTATTCAGCCCATAAAACACAAAGCCTGCGGAGCGGCGGCCTACTAGAAATGCGATCCATCCT >17 AAAACAAAAGCAATTAGCAACGGACGCGAACTCTCGATCTTACAAGGATACCAGAATAGTTCACCCTGGCTGCGCAATGCCGATATTAAGTGTAGGCTTCGGCCCATTGT >18 AAAACAAGCGATTGAGATTCTAATCCAGCTTGCGACATACTTTGTCAGCGATCTGCGCTGTCTAATTTGCATCAAGTGTCTATACACGGTCCCTCCTCAGGGCGGGCACG >19 AAAACACTCCCGTATTCATGTCTACGTACCGTTATCCACTATTAAATGTACACTAGCTAACAGTGGTCATTGAGTAATCAAGATCAAGCGCGAGATGGGGGAGGATTGCT >23 AAAACCCAAGATGACGAATATCTGGTTACTGGGTTGACGGGATATAATCATTGCAAAACCTTTTGGGAAGGGACCTTAAAGCCTAGTCTGTCTTTTAGTTGCCCTCGTGG >25 AAAACCTGACGTGTAGATTAGTTACGTCCCCCGGCGCATGCAATCGTCCCACACGGCAGATGTGTTCGGACTGGAGACAATAGTGCTAGTTGGACTACGGCCTATTGGCG >26 AAAACGAAGTAGCGGGTTACGTGAATGGTGCCCACTTACGATAGGTTCGAGGCTGAGGGCTGTACCACGCCGGACGATTCGTGTTCGCCTCCTTGAATCTGATTCCCCGT >29 AAAACGGGACCTGCTTTTTTGCTTTGCGGAGGTCAATGGTTAAGTACGAACCAGCATGCGGGATGTGGCCCCGCCAATGTGGTACCGTTTGGTACTGGATCAGTCCTTTT >38 AAAAGATACCTACGCTCCATCTAGAGGTTAATAGCGAACCTCTCGCTTCTAGAGCGAGGTGGCATTCAGCGAGATGAACCTCTAATCTAAGCCCCGAATATCTGACGCAG >39 AAAAGATTGCGTCTCAGGCGGGCTCTAAACACGCTTCGCCGGACCAACTTTCTGACATTCTGGGACTGTCACTTGCAAGGCCTTATAGCTTTACGGCATTCTCCTTCGCC >40 AAAAGCACTACTCCGTTTGGACCATTTGCGCAAACATGATCCCCGCGCGGTACGGAACTTTTTTACAACGCAGGATCTTCTGACCCGGTAACATGCTACCTGGCGCCTGT >41 AAAAGCCGGGATGGGTAACTCTTTTACTATTTGGGGACAAAGCTGCATAGTGACCGGCACTCAAACATACATTGTAATAGTGAAATCGGAGAGCACCATGCGCCAGTCGG >42 AAAAGCGATTTGAGCGGAGGAGGCGTTGAGGAGATCAGAAGGGGGCACACCGCAACGGTCTTCAACACACACTGGCAGTCTTTAAGACGTTTGAAGTTTAGCTTAGTTAT >43 
AAAAGCTGGTCCTGCCATTGATGTCCGATGAGCGACTCTTACCCTTGCAAAGAGCACAAAAGTTATATTCTGATCAATGTCATGCATTTAACATTACTGACAGGGTGGCG >44 AAAAGGAAGATCACAGGAGAAGTAGACACCTCATCGATGGCACGGACCCAGTTCACATATAGGGTGGGGAGCTATCCAGAAGTGGCCATGGTTTTGTAACTTCGCGTTAG >46 AAAAGGGGGAAACAACTGAAGGTCGGTGCCGCGATAAGCCCGTTAACAGGGACGGGCCAAAGCATTAGCTTGTTAAGAACCGATGCTATGTTTATCCGATTGGATGTTGT >50 AAAAGTGACGAATTCCTGCATCGAGAGGATGGCGTCTCTCGCATCGCCGGTCCAAGAAGACCACAGGAAACAGATCGGAGAAGGCCGCAGGTATTCAGGAAGTTCATATA >51 AAAAGTTCCATTGGGGGCACGCCTAAGTTACGGCACCCGAGTTTCGCCAGGAAGTGGAAATTTATTCTTTTGAATCCGCAGAAGTGTAAAATGCCGTCCAATAAAATTAC >53 AAAATAAGGCTTTGGTGCCAAGACCAAACTCGCTTTGATGTCGTCTTGGCCAAAAAGATACCTTCGGATGGGCCCACCCCTATGCTTCCTCATGCTTTCACTAGGGAGAC >55 AAAATAGTAGGCCGAATGGATTGGGTGACGTGTGGCACAGTAAGGGAGGACTATCAGGTGATTCTCTACCTGGAGCCACCATGAACCTCATGAGTAGAGGCGGAACAAGG >58 AAAATCCGTACCTCGAGCGGTTGGAGAACGCTCGCGCTGAATGCCCGTAAGATGTTGACAGTGAAGTGATTTTGCAATCGATGTATTCGTGTCGAATCATCATGCCCGCT >60 AAAATCTATTAGGAGATCAACTGTCCGAGTATTGTGGGGTTGGCTCTACTTACGCAACCCGCGATACAGCAATACGATCCTCGAGCTCTCCTCAACCCCGATTGCGTATG >62 AAAATGCACGAAAATAGGAGCATTCGTCCCAGTAGTGATTGAAAGTCCTTAGGCATAATTCAATACATTCGTTTGGACCCCAAGTGTTGGGCGTTCAACGCGCGAGATTG >64 AAAATGGCGGAATCTGTTGGATCCTGGCCGGTAGAGTGTGCCTACAGATTTGTCGAGGCGGGTAGTCTGCCTGCGGCCTGCACGTTAGAGTACTACCTCATAGTGTTAAG >65 AAAATGTCAACTCACGTTCTTTCGCACTTATGTTTCAGCCTCAGATTCAATTTGACATCCTACAAATATGAGAAAGGCAACCAGGAATGGGGCTGAACCCGTTCAGCCGT >66 AAAATTATGAGAGGGGCGTTCTCGCAATGGAGATTCTTCTCGTCGACTCACCAGGGGACCAGTGCACGCAGCTCCATAGGTGCACGCTCTCGGACGTGGCAACGGAGGAC >68 AAAATTGGCCGCTGAGACATGGGACATGGATTATCGTGTCATATAGACGGCGGCATTTTGCTACTAGCGAACACTCAATGGGGCTTTCCGTGGACTACTATCAATACTGA >70 AAACAAAATTTTGTTCTATCGAGTCTACCAGAGCCGAACACGGCCAAGCCCAATATGCCAGTGGTGTGCTGCTTGAGCAATTCGCAGGTATCTCTCCAACTACATCGCCG >73 AAACAAGGTCCGTACGAAAACCTAGTGACCTCAAATCAGTTGTAGGTGTACTGGCTTGCAACGTTGCCGGTGAACGAAAGAACCGCTAGGGGGCCGTGATGCATTCTACC >79 AAACAGATAGCCACCAACTATACCCTTCTTCGATGTCCATGCGGGTCGTTAACGTCGCTAGGCGTGAACGGACGCTCGTGGATGTCCGTCTACGCAATGTTACGAGTCAA >80 AAACAGCCTCGATGGGTGTACAGTGCACCCTTGCTCGACTACGGCTTCAATTCTGATGTAAAACCTGTACGTGAGACTGCCAAGGCAGATGCAACCAGATCTCCTGGATT >81 AAACAGCTAGGGGTCGACCACTTGCCGGCAGACACTGAGGTAGATATTAAGCAGAACACCGGGTGGTAATTGCATGTCGTATTAGTTCCCGTTGCTTAGCATGCCTAAGG >84 AAACATATGTCATAAGGCGCTGAAGACACCGCACGGGGACTAACACAACAGCACCAGATTGTCGACGTAAGTGCTTTTCCTATTTCTTAGCCCATCTCTAATCAGCCCGG >85 AAACATCGAGCTTGGACGCGTGCAGGTATTAAATTTGAGCCCCAAGGCTTATAATGCATCCTCCCACAAGAAGGCATAGATGTACTCGTCTTAGTACAAGGCTGCTAGAG >87 AAACATTTGTACGCGAATAAATTATTTCGGTGTCAGAGGCGACACCCGTAAACGGGAGCAAGGCTAAGTCAAGGTGTTGAAGAGAATTTTCTGTGGTCATTTACTGTCCT >89 AAACCACGAATACCAAAATAAAGTCACCCTGTGCCTTAGTGTTTAAGATGTACTGACAATTTCCTGTGGATCGTTGTGCGGTTGCTGTGGGGGCCCTATCAGCGAACGGG >90 AAACCAGTATGCTTTTAAGGGAACCGAGGAATCGCATGATCTTCCGGTGATTATGCCATCTCTAACAGGGAGGCGCCTTGCTTTAACGCTGTACCCGTTTTGTACTCGAA >91 AAACCATCTTGATAATTCTAAGGTCAGTACGAAAGGCCTCTAGTCAACCGTCTCGTGGATCGGGACTCAGCCGTGGAATGATCATCATTAGCAGACAGACAGTCGATATC >94 AAACCCGCCCCATGCATAGTAAACGAAGAAGTCCACTCTTAATGTCAAACTAACTTTTTAGGGCATCCGTTGAAGGGCATCGATACCGTCCAACCGGTCGGTGGAGGACG >99 AAACCGTAAATGCCGCCCCCCCCACCAGGCTGGAAGGGAAGGGATCTAGTAGCAAACCTACATCCATGAATGGAGAAGAACTGGTTCGAACACCATGCGCATGTTGACCA >1 AAAAAACATTAGTTACGGAAGGTGGGTGGAGCGGGGGCCGCCAGTCTATATTCATACTAGAAAGGGGCTAAGGGCATCGCGCTCATGAAGTGGCACTTGCAGAGGTGAGT >4 AAAAAATTGTCGTTAAGATGAGGAGCTCTTTCGCATTTGACCCATCAAATCTCGGAATGCACTTGAGATCGACCCGTTTGATACAAGCCTTCATCGTCGATAATATATCG >6 AAAAACCGATGGGGCCGAGCTGTTCTTTGGCCGGGTTACTCTACGCCCACACGGGTACACAGCCGCGAAACGGGAGGCTCGTGCGCGGTCACCTAAGTCCCTGTGGCGGG >7 
AAAAACGGATGAGAAATAAAAGGGGAAATAGCGACATGTCAAATGGCCTCTTGGCTGGCGGTGTCTGGCTGGACTAACCCTCTTAAGGACTTAAAGCGTAGGCAAGGTTA >10 AAAAAGCTTACAGTGTTCGTAGTTCTGCTCGTGTCGGTATTCTCATACTCACTCCAGGACTTCGGAAACATTAGTGAAAGTGTCACCGGCCGTGCATTTTCCGGAGTAAG >12 AAAAAGTCTCACTTATTTACGCGCTTGATAGCCCAAACCGTTCACAGCATCTCCAGTAGTCGATTCCGGCTCCTCTCCATGTTGGAGATGTCAGCACGGCAGGTATATGA >14 AAAAATATTTGAATCCGCTGCTGACTTTTTCAAAATTCGATTATCACTACGGCGGTCAGCATATTTCCTTGAACTTATAACGTATCTCAGTATTGCTGGTTTGGAAACGT >15 AAAAATGCCCAGTTCGGCGATCCAACTCGTTAGTCTCAGGGTTCCCGCGGCGAACTCCCTTCGGTTGACTTTACGGCCTACACGCTCAGCCCTGTACCCGCATTGATGTC >19 AAAACACGCATGTCCGACGGGCGCATTAAACGGCATTGTTGTTCTTGAACCGGAAGCAATAGTCTAGGACCGTTCATATTGACTTACTTTACGCGTGGGCCTTGGATGAA >22 AAAACCATCTGTTTCCCAGAATGCTCCCGATACTTAATCACCTGCGCTTTGATCGTAGGCACTCACCCTCTCAAGTACCCTTGGACAGATTTAAGATCGATAGTTCGTAT >26 AAAACGAACCGGACTACATTCCTCATAGGCTTGAGGGGCAAAAGTTACAGTACAGATTTCGGGACCCGTCGCTTTACAGTGTAGACTGTTTTCCGAGAGTGCCTAGTCCA >28 AAAACGCTCGTGCCCAGTTCGAGCGCCCTGGAGTTTACCGCCCAAGTAAACGCTATTTTTTAAATGACATAGCTCCTTACAGGCGTGGGGGGACGCCTATAGGGCCGTCG >29 AAAACGGTGCCAACCATTGAAGATAGAAGATTCAATACATTGCACAAACGAATATAACCGGTAGGTAAAGTCTCGTTTCAAAGCGGCTCTATTTGTGCACCTTTTGCTGA >31 AAAACTCACCACCAGGACAGAAGCTAAAAGACAGGCCGATTCACGGGCGAGCGGTCGAAGCATACTTTTTAGGGCATCATGTGACCACACGTGTTGACCGTCCATCGTTT >34 AAAACTTTACCGATTTGCAAGATGTAAACAGCACGGGGACGCTATATCGACTGAGTGTTGTAGTGGAATCTACCACCCCGGACATGGGGCTGGACTGATTATTGAACGCT >38 AAAAGAGTTTGCAGGGTCACTAGCCCGGCTCTGCATACATTGGCAAAGAGGCGATAAGCTGTGAATTCCAGGCTGGGGACGAGCTTGAACCTACTGTGCGAAGTTACCAC >40 AAAAGCCAGAAGGTATGCAGATCGATATCGCTCCAATGTGTCAGTAGCCCGGACGAAGGCCTGCTCGATGATGCGTCATATTAATGTCAGAGAACCTAGTAGCCATCGGT >44 AAAAGGACAACGATGTGTGTGCTTCAAATATTCCTCCAATTCGTTACTGTCGCGGATGTTATGTCGGGCACTCTTAGCTTCCAAGGTGGGTGGATGTTAACGCGCATAGA >46 AAAAGGGGTTTGCTGACCCACAGACAATAGATAGCGGATTACTTACTCCACTTACGAGACTAACGCAATGCTAATAACTTGACGTCTAAACGGGACCACATAAGCCTTTA >47 AAAAGGTATTATATTGATGATTAGCCATTTAGGGAAGATCAACAAGATGAAGCATGACGCGAGAGGTAAGGGATCGACGAACGCTCCTGCCATGGGCAATCCAGGAGGGC >49 AAAAGTCCCTGTCGTACCTCAATCCAGGCCACGTATACTCAGGGGGTACCGAAACCCTAATAACTCTCCCGTTGGTGACCAGATCTGAATCTGCACCGCACACAACTACA >51 AAAAGTGGAAAATACCCTCACTAACAGGGTATCTTGCGTTGAGCTGGTAAGTGCACCAGGGAAAACTTGTTATCCTCGAGTCTGATCGATGCTATCTCATATTTCTGAGT >54 AAAATAGAAATCGCAGCTTGAATGCTGTTACTCAATGTTTATGAACATAGCTCCCGCGTTACTCTGTTTCGACATAGGATGCGCGCACCGGAAGCCTGCTACCACAACAT >57 AAAATCAAACTAGCTACTCCGAAGGCGGGATATTTGCCTGGTAGAGGGAAATGTAGCTCACGGGCCGTTTACTCTTCTTCAGAGCAACTAAGTATTCCGGAAAACCTCAG >60 AAAATCGTGTTCAAGAAAGAGAGGGCCGGGCGCTGAATTGGGCCCGAATCACAAAAAAGTGAGTCGCGCTCCTACAAAGTCCTAATCTAATAGTGGATGATGTGTCTGGT >62 AAAATGCCATACTAAATTAACGGTGTCTCATAGCGACATTGTTATTGTCACCTGACATAGCCAGAAGGTTAAAAGTAGTGCGCGACGCGAAATACCCATGCTGCGGAGTC >67 AAAATTCAGCCTTTCTTACCCAAGGCCTCTCTCGGACAAGAACTATGAAAGCATGCCCTACAGCATACTTTCGCTGACATATGGAGCAGGGAGCCTAAAGGCCGTTTATC >71 AAACAACCACAGATGGCCACTATGTGAAGTTTTGGACGAGTCCATACATTTTTCACTAAGTAAGAAGCTACCTTAATACGTGCACGCATCACATCCTAGACGCTCTGGCT >72 AAACAAGCCGAGAATCTGGCGTGACAAATCCTCCGGAACGGGCTGACCCCACTGTACAAACAATGTAAATCACCGCAGTTTCACTGTACGTTTGGTCTTTTTGATAGACA >74 AAACAATGGACGGAACCATGTTCTGTTACAAGCGCTGACCACACAAGCCGAGAGTACCCCAAGATGATGTCTTAGGATCGTATATACCCTCCATACCCGAGCTTTCCCCG >77 AAACACGTCACCGAGCGCTTAGTGGATCGTACTCAACATGTTGAACAGACATTATCTACATTCGATTCTTCCCATTATGTATCATCGCAGTACACGCCGCTTTCCATTTT >78 AAACACTCGCACAGACCGGTAACCGAGGGAATACAGAATTATAGCCCATATTCGCTGTCCCAAACTGCACCCATCGTTGGCAATTCCGAGACCTCTATTTCCGGTATGCG >79 AAACAGAGACATGTACGTTTTGCGCGGTGGTAGCTCTGGAGTCGGAGGCAGGGTTTTTTGGCCGGCAAAATCAGTATCCGACCTCGTTGGATGACTCCGGAAAACCTTTT >80 
AAACAGATCGCTATTAGCACGCGTATGCTTTCACTAGCGAACAAAAGTGCCCCTTTGAGTCCTAGCAGCTACAGTGCCCGTAACTGATATTCTTAAGGCTATTTACAGTT >81 AAACAGGGCACTGGAGGGCAGCCCTTGAACCGCATAGATGGTGGAATTTCATACGGACTGGCGGGCATTATCGGGGTCGTATTGCCTTTGGGGGCATAGCCCACGAGTGC >82 AAACAGTAGAGTTTCATGTCCCTTGTATCGGAGGCAGCGACTCGCTTGAGCAGACCAAGTCCCGTCACTGAGGGTTATCAGTGAGGATACCTTGGTTCAGACAAAAAGAT >83 AAACATACGATCAAGTGTCGAAATTATATCACCGGCATTTGGTCTTTAGATATCTAAAGAAATGGCGCTAGGCCATCTCCCGGGTTTTTTCTGCTTCATGGCTAGATTCG >85 AAACATGATTTCGTACCCCGTGTAGGGCATGTTACCCACGTGAGGCGAGGTATGCGTGGGTCGATGTAGTACCTGTTGACCCGCATTTAGCCTCGACTCAATCTGCTGGA >86 AAACATGCGTTAAATGCACTGCGAGTTGTCCGGTAGCCGTCCAACCTCCTCTAGTACCAAGTAATGGCATTAGCACGCGACAACATGGCTGTAAGGGCCCGTGCGACTAA >87 AAACATTTGCACCTAGGCTTCCTAGGTGTTTGGCTGGAAACGTAGGCAAGGTCAGGTATTCGACACATCGCCCATTATCCGTTACACGAATATACAAGACGAGAGACCGG >90 AAACCAGACGCCTAAGTTGCACTTCGTGTGGACAGTTCACCTGAAAAGGCAGAAAGTTCTGAGATGAGTGCGCGGAGTTACTAACCTAGGCCCTGTACGAGAGCAACATA >91 AAACCATAGATCACAAGTCCACCTCGAGGCGATTATGCATGCCCTTCACTTCTCACGGCTGGATGGGCTTGCCTTAGTCACTTGCGATTGAGTCGTACGATTATAAAGCG >93 AAACCCCCCCGTTACCCTCACTCCGCTCGGCCTAGCGGTGGTCAGGTCAGGAGTTGCAATCGGAGTCACACGATCATACTTTCTCACTGCGCACAACATATCTGCTTGCA >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >91 AAACCATAGATCACAAGTCCACCTCGAGGCGATTATGCATGCCCTTCACTTCTCACGGCTGGATGGGCTTGCCTTAGTCACTTGCGATTGAGTCGTACGATTATAAAGCG >93 AAACCCCCCCGTTACCCTCACTCCGCTCGGCCTAGCGGTGGTCAGGTCAGGAGTTGCAATCGGAGTCACACGATCATACTTTCTCACTGCGCACAACATATCTGCTTGCAsimka-1.5.3/example/D_paired_1.fasta000077500000000000000000000246751377312000000172400ustar00rootroot00000000000000>0 AAAAAAACTCTACAGCGAGCAGTGTTAAAATCGTGGCGCACCCAGACAGCCACTTCGCCGTTCTAAGTGGCCGATCGTAGCACATCGGATGACCTTGGTTGGTACGACAT >2 AAAAAAGAGACGAGCCACGCGGTGCGCCTGAACGTTGGGTCCAGACCACACTTATGGATTCGACGGGGCACCTATCAGGTTCTCCATCGTATAGTCGTCTGTAGGTCTGA >3 AAAAAATCGCTAGGGGGGATGGCCATCAACCCCCCCTCCCGTACCTATGATAGTGGGATCAGATTTAAGCACGGGCCCTACGACTCCCCTTCATGGAATAGGCTAAGGTG >4 AAAAAATTGTCGTTAAGATGAGGAGCTCTTTCGCATTTGACCCATCAAATCTCGGAATGCACTTGAGATCGACCCGTTTGATACAAGCCTTCATCGTCGATAATATATCG >5 AAAAACATTGCGGACTACCGTCGTTGCAGTGGGTCGCCCATTCTAGGCTGCGAGTTCATATGTGTGCCTGTCGCTTAGGGCAATCCTCGGATTGGCTGTTTAACAGGGGT >6 AAAAACCGATGGGGCCGAGCTGTTCTTTGGCCGGGTTACTCTACGCCCACACGGGTACACAGCCGCGAAACGGGAGGCTCGTGCGCGGTCACCTAAGTCCCTGTGGCGGG >7 AAAAACGGATGAGAAATAAAAGGGGAAATAGCGACATGTCAAATGGCCTCTTGGCTGGCGGTGTCTGGCTGGACTAACCCTCTTAAGGACTTAAAGCGTAGGCAAGGTTA >8 AAAAACTTTGACTTTTTCAAGACATGAAAGGATGCGGGCTCATACTGGACGGGTTCATTCCTACCGCGGAACGAAGGGCTATTTTTTGTTTGGGCGAGAGTACATCCGTC >9 AAAAAGACTCAGCTTGACATGGCGGTCTGAGCTTTGCTTGGGCTCTTACTATGTCAGGGTTGGAAACTATGGCAGAAGGGCTTCTCGCATCCTCACGGCTCGAATTAGCT >11 AAAAAGGCAAGGATCTTTGACCACGGCAGTTTGCAATAGTCAATTACGCTTCCCTGAGTACAAACAATGGTCTAATGCTGTTCAGTTGGGGTTAACTGGCCTGACGCTCT >13 AAAAATAAAAATGGCCTAGGTCCAACTCGTCCCCGGGGTAAGTTAGTGTAAGAGCTTGGAGCAAGTCTGTTCCTCGTCTGTCCCAGTAACACTCGGGTCTACGGTCGCAG >16 AAAAATTAGTCAGGTTACCCCCAATTAGGTGAAATACGTCGAAGGGTCGCGTCCAAGAAAGAATGATAGCTGACAGTTCTCTAGGTATTTATATTTGTTTGCATCGACTG >17 AAAACAAATATCCTGAATTCATAGAGCCGTGACTTACAGTTCTATGAAAAGTGTCTGGCAAGGGAGATTTCACGTTTCCCTGTATAGGGTCGTCGTATTGCCCACCATTT >18 AAAACAATGCTGAAGACCCTAATGCGTAACCAACAATGTAAGACTGGCACGTATTCTATGATATCTTATTGGCACTCCATCGCGAGGATACTAATAGACACCTAAAAGGA >19 AAAACACGCATGTCCGACGGGCGCATTAAACGGCATTGTTGTTCTTGAACCGGAAGCAATAGTCTAGGACCGTTCATATTGACTTACTTTACGCGTGGGCCTTGGATGAA >20 AAAACAGTCATTGTCTGATTTAGCATCGGGTGCACCGAAACGCTATCGCCTTCTCGTGAATTCGCAATTCAGCTCCAAGCATCAATCAGAATCACATGCCGCACACAGAA >21 AAAACATAGTTATAATGTTTTGAGTGTAAATGCTTACATCGAAATATTCGACGTACTACACCGTGAGCCAAGACCCTGACTTCGAACCATGCGCTCCTGTGAGTGTGACG 
>23 AAAACCCACCGTGCCGGTTTAAGAGCAGTAGCTGTTGGTGTTGTGTCCGCGTGGCATCGGAAAGTCGCTACAACGCCGGAAGCCGGGGATTTACAGAATACGTTATACGA >24 AAAACCGGTTCGTGAAGTCCGAAGGAATTCCACAACGCACTGCATGACACCTGGAAGAAGAAGGCCTTTTCCACGCCCTGAACGTAATTTCTGGTAAAGCAAGTGCTCCA >25 AAAACCTGATCGTGTTCATGAGTCTGGTATAGACGGATCCTTGGGCCAAAGTCTTCCGGTCTTCTGGCCGCCTTCAGGAGTCTAATTACCTGAACCTCATCGTAATTGCT >27 AAAACGATTATTTATCATTTACCGCCTTAGAGTGTGGCTTATATAGCATGGGTTTGATTTGAGTGGGACAACAGATCCATTTGATGCAGTATGTATTAGCGGATCATGAC >28 AAAACGCTCGTGCCCAGTTCGAGCGCCCTGGAGTTTACCGCCCAAGTAAACGCTATTTTTTAAATGACATAGCTCCTTACAGGCGTGGGGGGACGCCTATAGGGCCGTCG >30 AAAACGTGATGTAATCGGACGCATTTGACCAGACGTAGCCACCTTATTCGGTGCGTGCCATGACCCGAGAGCGCCAGGGATGCTTCTCGTTCCGGGTCACTGATAAATAG >31 AAAACTCACCACCAGGACAGAAGCTAAAAGACAGGCCGATTCACGGGCGAGCGGTCGAAGCATACTTTTTAGGGCATCATGTGACCACACGTGTTGACCGTCCATCGTTT >32 AAAACTCCTCCGCCGAGGAGGCACTTAGCCTCGTATGGATGCTTAAGGCATGATCGAGCCGGCCGGAAATCTCCTACCGTCTAATTAGGGGCATTGAAGTCCGGTTCCGC >33 AAAACTGTCAGCTCTAATCGAATGCTTGGAAGTTCTGTCCAAAGTGTTGCGAAGCCGAGCTTGAACGTATATAATAACTGCGGTCCTCATACCGGAACAAGTTTACTGCC >35 AAAACTTTTGGATCGCCATGTGACTAATTCCTATAATTACAATCTGTCATTAGTCGGAGCGGTGCGAGATGTGAGTAGTATAGTCGACGCGGCTAATCGAGGCAGATTCC >36 AAAAGACATCGCTAAGTAGTCGATAACTTTAGGTCTGGCTCAGCGAAGTCCGCGCACCGAGGTACGCGATGAACGTGTAGTAGCTGTGCTGCCGACTCTGAGGCGGTAGC >37 AAAAGACGTGACAGAGGCGATGATACCGCAGACGATACGCCACTACAGCTAAAGAGTCTGTCTAGAAATGCCTAGCGGCACCTGGCGCCGCCGTCTAATGGAGTGCAAGC >38 AAAAGAGTTTGCAGGGTCACTAGCCCGGCTCTGCATACATTGGCAAAGAGGCGATAAGCTGTGAATTCCAGGCTGGGGACGAGCTTGAACCTACTGTGCGAAGTTACCAC >39 AAAAGATGCACCGGCTATCTTAGTTCGTTCCAGGCCAACGAGTGTGACTATAGACGAGTTCTGCTCAGAACGGACTAGGCTTCGGGTGTCACGCGGGATCATATTATCTT >41 AAAAGCCTGGCCTTAACAAAATCTAGGTGCGTCTCGAATCGAACGGAAAACAACGTCTGGTTTACTGAGAAATCCTAGGATGCTGCTGGCTATTTGACCTCACGGGGGTT >42 AAAAGCGTGCCGAAGATGCCGTAATTTCCGAGGATGCACTCTCGTGACATCTCTTTTTAACGAACAAATTGCAGAGGTCAAGGTGATCGAGGCACGCTATGCTAAGCACT >43 AAAAGGAAAGGGAGAACGAATGATTGTTTCCAGGTATCAGGAAGCAACAAAATATAATCGATTCGTCACTGTGAGCCAACAGGCGTGTATGTCTGCGTCAAGCGTGCATC >45 AAAAGGCACGCATCGTCATCTGAACAGCAAAGTTGGGCGTTTCCGCCAATAAAGCGTTTCCCTTCCATTTTATTGTACTAGGAAGAAACCACTCCTATAAGCAAACAAGT >48 AAAAGTACTAACTTATCACGAACCGCTTTTGACGTCTTAATTACAGGTTGGGTGACGCGGCATGTCAGGGGCAAACTAACTATGATATCCACGGAACTGCCGACGACTAA >50 AAAAGTCGGAGTCGATAATGATATAGCGGCACGACTCGAACCCGCTTCGCAGCTCATCTCAGGAGATAGGCCTCGAACCTTCCCTGGTATGTACTCGGAGGCTCTCACCC >51 AAAAGTGGAAAATACCCTCACTAACAGGGTATCTTGCGTTGAGCTGGTAAGTGCACCAGGGAAAACTTGTTATCCTCGAGTCTGATCGATGCTATCTCATATTTCTGAGT >52 AAAAGTTTAGTAAGTTGGGTACGAGGCGTTATGGAGGGTTGCGTCGCTTTCACGAGCCTCATCGATAGCATACCTGTCGCAGATGTATTCAATGGTAACATGACGGTTTT >53 AAAATAATTTTCTGCAGGACCTCTGCTTCGGGAGAAAATATATTTAGATCTCCACCCGGAACCGCTCGCGACTTCACGAGCATCGGGTAGGACTTCGGCCGCTTGGATGC >55 AAAATAGACCTTTCCCCGAAACTGTATGCTAAGACTGTGAGGCGGCGAACGGTCTTTGTTCCTCAGTTAACTGACAACTCACAACAGCGCATAACGAATCACATGCCAGG >56 AAAATATTATTGATCTGACACTACAGAACTTTCTCGTTGACATCGTGCATTGAACTATCAGATGCCCAACCGAGTGGCGGCGATCGCGTTTCGAGTATCACGCGGGTTCC >58 AAAATCCCACCGACGCACTCAGCTAAGTTGATGCATAACAACGTTTGAGCGCTACCTGAGTTAGTTGCAAGCTCGGTCAACGTGTAAAATGTCCATCAGAGTCACCTCAT >59 AAAATCCTAATCCAGCACACGAGCCTCAGTCAGGTTCAAGGTGCGGCTTACTCTGCCGACACCAGCAAAGATATACTCGGGCAGGGAGATTAAATTGGGTTTGTCGACCT >61 AAAATGACACTAACAACGTTGACCGAGAATGAACGTCTAAACCCTTAGTGTGAATTCGTTTCAATATGTACAGGGCCCTCTGGCATATCCCGCTGCCCGGGCTAATGTCA >63 AAAATGCGAAGGTATTCTGTAGAGGGGAATAACTGGGCTTCCATCCCCAGAGCTAACACAGCCGACTACACACTACATAGATGGTCGGGGGTGGTCCGCCGGAAGACGCT >64 AAAATGGCTTTAGGCTAGTAGTAATCTAATGTGTAACAAAGTCTTGTGGCCCGATCGTTATATCTCTGGCACGATCGGTTGGCGGTTTTTCTAGATTACCTTACGCGATA >65 
AAAATGTGGTCGGAGCCGCGTACATTATGTGTGGCTTCACCTATATCTAGGGGAGTTCCCGGCCTAGCACACCAGCGGTCCGTAGGAACCGCGCCCGCCAAACCGAGCAC >66 AAAATTACGGCGCAACTGTTGGCTTCTTCATTCCCTGTTAGGTCCAAGAGCTGACAGGTCATATCTAATTCGACAGTTGCTAGATCGTAGTGAGTTAAGTATTCGTGGAG >68 AAAATTGCGTAGTTAGAACGACGAGCATTCTAATGTACAACCTATAATAAATAACGGGCCCTTGTTGCCTAACCAACAACAGTACCGCCAGGCCACTCCGCTAAGGTCAG >1 AAAAAACATTAGTTACGGAAGGTGGGTGGAGCGGGGGCCGCCAGTCTATATTCATACTAGAAAGGGGCTAAGGGCATCGCGCTCATGAAGTGGCACTTGCAGAGGTGAGT >4 AAAAAATTGTCGTTAAGATGAGGAGCTCTTTCGCATTTGACCCATCAAATCTCGGAATGCACTTGAGATCGACCCGTTTGATACAAGCCTTCATCGTCGATAATATATCG >6 AAAAACCGATGGGGCCGAGCTGTTCTTTGGCCGGGTTACTCTACGCCCACACGGGTACACAGCCGCGAAACGGGAGGCTCGTGCGCGGTCACCTAAGTCCCTGTGGCGGG >7 AAAAACGGATGAGAAATAAAAGGGGAAATAGCGACATGTCAAATGGCCTCTTGGCTGGCGGTGTCTGGCTGGACTAACCCTCTTAAGGACTTAAAGCGTAGGCAAGGTTA >10 AAAAAGCTTACAGTGTTCGTAGTTCTGCTCGTGTCGGTATTCTCATACTCACTCCAGGACTTCGGAAACATTAGTGAAAGTGTCACCGGCCGTGCATTTTCCGGAGTAAG >12 AAAAAGTCTCACTTATTTACGCGCTTGATAGCCCAAACCGTTCACAGCATCTCCAGTAGTCGATTCCGGCTCCTCTCCATGTTGGAGATGTCAGCACGGCAGGTATATGA >14 AAAAATATTTGAATCCGCTGCTGACTTTTTCAAAATTCGATTATCACTACGGCGGTCAGCATATTTCCTTGAACTTATAACGTATCTCAGTATTGCTGGTTTGGAAACGT >15 AAAAATGCCCAGTTCGGCGATCCAACTCGTTAGTCTCAGGGTTCCCGCGGCGAACTCCCTTCGGTTGACTTTACGGCCTACACGCTCAGCCCTGTACCCGCATTGATGTC >19 AAAACACGCATGTCCGACGGGCGCATTAAACGGCATTGTTGTTCTTGAACCGGAAGCAATAGTCTAGGACCGTTCATATTGACTTACTTTACGCGTGGGCCTTGGATGAA >22 AAAACCATCTGTTTCCCAGAATGCTCCCGATACTTAATCACCTGCGCTTTGATCGTAGGCACTCACCCTCTCAAGTACCCTTGGACAGATTTAAGATCGATAGTTCGTAT >26 AAAACGAACCGGACTACATTCCTCATAGGCTTGAGGGGCAAAAGTTACAGTACAGATTTCGGGACCCGTCGCTTTACAGTGTAGACTGTTTTCCGAGAGTGCCTAGTCCA >28 AAAACGCTCGTGCCCAGTTCGAGCGCCCTGGAGTTTACCGCCCAAGTAAACGCTATTTTTTAAATGACATAGCTCCTTACAGGCGTGGGGGGACGCCTATAGGGCCGTCG >29 AAAACGGTGCCAACCATTGAAGATAGAAGATTCAATACATTGCACAAACGAATATAACCGGTAGGTAAAGTCTCGTTTCAAAGCGGCTCTATTTGTGCACCTTTTGCTGA >31 AAAACTCACCACCAGGACAGAAGCTAAAAGACAGGCCGATTCACGGGCGAGCGGTCGAAGCATACTTTTTAGGGCATCATGTGACCACACGTGTTGACCGTCCATCGTTT >34 AAAACTTTACCGATTTGCAAGATGTAAACAGCACGGGGACGCTATATCGACTGAGTGTTGTAGTGGAATCTACCACCCCGGACATGGGGCTGGACTGATTATTGAACGCT >38 AAAAGAGTTTGCAGGGTCACTAGCCCGGCTCTGCATACATTGGCAAAGAGGCGATAAGCTGTGAATTCCAGGCTGGGGACGAGCTTGAACCTACTGTGCGAAGTTACCAC >40 AAAAGCCAGAAGGTATGCAGATCGATATCGCTCCAATGTGTCAGTAGCCCGGACGAAGGCCTGCTCGATGATGCGTCATATTAATGTCAGAGAACCTAGTAGCCATCGGT >44 AAAAGGACAACGATGTGTGTGCTTCAAATATTCCTCCAATTCGTTACTGTCGCGGATGTTATGTCGGGCACTCTTAGCTTCCAAGGTGGGTGGATGTTAACGCGCATAGA >46 AAAAGGGGTTTGCTGACCCACAGACAATAGATAGCGGATTACTTACTCCACTTACGAGACTAACGCAATGCTAATAACTTGACGTCTAAACGGGACCACATAAGCCTTTA >47 AAAAGGTATTATATTGATGATTAGCCATTTAGGGAAGATCAACAAGATGAAGCATGACGCGAGAGGTAAGGGATCGACGAACGCTCCTGCCATGGGCAATCCAGGAGGGC >49 AAAAGTCCCTGTCGTACCTCAATCCAGGCCACGTATACTCAGGGGGTACCGAAACCCTAATAACTCTCCCGTTGGTGACCAGATCTGAATCTGCACCGCACACAACTACA >51 AAAAGTGGAAAATACCCTCACTAACAGGGTATCTTGCGTTGAGCTGGTAAGTGCACCAGGGAAAACTTGTTATCCTCGAGTCTGATCGATGCTATCTCATATTTCTGAGT >54 AAAATAGAAATCGCAGCTTGAATGCTGTTACTCAATGTTTATGAACATAGCTCCCGCGTTACTCTGTTTCGACATAGGATGCGCGCACCGGAAGCCTGCTACCACAACAT >57 AAAATCAAACTAGCTACTCCGAAGGCGGGATATTTGCCTGGTAGAGGGAAATGTAGCTCACGGGCCGTTTACTCTTCTTCAGAGCAACTAAGTATTCCGGAAAACCTCAG >60 AAAATCGTGTTCAAGAAAGAGAGGGCCGGGCGCTGAATTGGGCCCGAATCACAAAAAAGTGAGTCGCGCTCCTACAAAGTCCTAATCTAATAGTGGATGATGTGTCTGGT >62 AAAATGCCATACTAAATTAACGGTGTCTCATAGCGACATTGTTATTGTCACCTGACATAGCCAGAAGGTTAAAAGTAGTGCGCGACGCGAAATACCCATGCTGCGGAGTC >67 AAAATTCAGCCTTTCTTACCCAAGGCCTCTCTCGGACAAGAACTATGAAAGCATGCCCTACAGCATACTTTCGCTGACATATGGAGCAGGGAGCCTAAAGGCCGTTTATC >71 
AAACAACCACAGATGGCCACTATGTGAAGTTTTGGACGAGTCCATACATTTTTCACTAAGTAAGAAGCTACCTTAATACGTGCACGCATCACATCCTAGACGCTCTGGCT >72 AAACAAGCCGAGAATCTGGCGTGACAAATCCTCCGGAACGGGCTGACCCCACTGTACAAACAATGTAAATCACCGCAGTTTCACTGTACGTTTGGTCTTTTTGATAGACA >74 AAACAATGGACGGAACCATGTTCTGTTACAAGCGCTGACCACACAAGCCGAGAGTACCCCAAGATGATGTCTTAGGATCGTATATACCCTCCATACCCGAGCTTTCCCCG >77 AAACACGTCACCGAGCGCTTAGTGGATCGTACTCAACATGTTGAACAGACATTATCTACATTCGATTCTTCCCATTATGTATCATCGCAGTACACGCCGCTTTCCATTTT >78 AAACACTCGCACAGACCGGTAACCGAGGGAATACAGAATTATAGCCCATATTCGCTGTCCCAAACTGCACCCATCGTTGGCAATTCCGAGACCTCTATTTCCGGTATGCG >79 AAACAGAGACATGTACGTTTTGCGCGGTGGTAGCTCTGGAGTCGGAGGCAGGGTTTTTTGGCCGGCAAAATCAGTATCCGACCTCGTTGGATGACTCCGGAAAACCTTTT >80 AAACAGATCGCTATTAGCACGCGTATGCTTTCACTAGCGAACAAAAGTGCCCCTTTGAGTCCTAGCAGCTACAGTGCCCGTAACTGATATTCTTAAGGCTATTTACAGTT >81 AAACAGGGCACTGGAGGGCAGCCCTTGAACCGCATAGATGGTGGAATTTCATACGGACTGGCGGGCATTATCGGGGTCGTATTGCCTTTGGGGGCATAGCCCACGAGTGC >82 AAACAGTAGAGTTTCATGTCCCTTGTATCGGAGGCAGCGACTCGCTTGAGCAGACCAAGTCCCGTCACTGAGGGTTATCAGTGAGGATACCTTGGTTCAGACAAAAAGAT >83 AAACATACGATCAAGTGTCGAAATTATATCACCGGCATTTGGTCTTTAGATATCTAAAGAAATGGCGCTAGGCCATCTCCCGGGTTTTTTCTGCTTCATGGCTAGATTCG >85 AAACATGATTTCGTACCCCGTGTAGGGCATGTTACCCACGTGAGGCGAGGTATGCGTGGGTCGATGTAGTACCTGTTGACCCGCATTTAGCCTCGACTCAATCTGCTGGA >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >91 AAACCATAGATCACAAGTCCACCTCGAGGCGATTATGCATGCCCTTCACTTCTCACGGCTGGATGGGCTTGCCTTAGTCACTTGCGATTGAGTCGTACGATTATAAAGCG >93 AAACCCCCCCGTTACCCTCACTCCGCTCGGCCTAGCGGTGGTCAGGTCAGGAGTTGCAATCGGAGTCACACGATCATACTTTCTCACTGCGCACAACATATCTGCTTGCA simka-1.5.3/example/D_paired_2.fasta000077500000000000000000000137141377312000000172310ustar00rootroot00000000000000>0 AAAAAAACCGAAGTCTCTTGCAGTATGCACAGGAGATCGCGGAGAACCGGAATCCCCTGGTTCGACTGTGACTCTTGGAGCGTCGCTAATCGCGTCGGTAACTTTAATTT >2 AAAAAAGCGGAATCACTGAAGACGTCATTTCCTCTTCAGAAAAAGTTTACCATGTTCTATTCTTGACCCTACGAATCAAGGTTCCGTTTAGCGTGTCGTTATAGAACAAT >3 AAAAAATACAGTATATGCCCCGAACTAGCCATCCGGATCCCAATAATCAAATCACCGTGACTGATGGTCAGATATCTTCTCCGGGATTGTGGGGAACGCCTACTTCGTGG >4 AAAAACAAGGGCTGAGCATCTGCGACATAAGTATCCCGTACATTATGTTCGATTGTAGTTAACTAAATTGTCCTACACTAGCGACCCTAAACCACCGTTACTCGAGGGCA >7 AAAAACGGTGGTTTGCATGTTAGGCGGAAGATCCCAATCCCTTGAAGAATCACGGAAGTAAGTTACTAGTGAAGGTTACTTTGCGATCTAAGGCGCTCGTGGGTGTCACA >8 AAAAACTGGCTACATTTAGACTTTTCCCAGCCTTGCCTACATGCCTAGCACTGAATCAAATACCCGGCTACCGGGTAGCGACCCCACAATATCAAGTTGCTTCTCTGAGT >10 AAAAAGCAGTCGGAAGACCTTTAGCTGTCCTTTAGCAGTGACTCTCCGCCCGTCTCAATGCAAAGAGATTCTGCGCATTGCTCCTTGTCCTAAGACACCATTAAGCGAAT >12 AAAAAGTTAGTCCTAAATTGTGGGCATCGGGTTGGCTGAATTAGACTGTGTGATCATATTCTACTTTTCGTCGACAGGATCTGGGCCGATTAGGGACATGTAAAGTGTCC >13 AAAAATAAGAAGCCTTACGAGCGTTGCGGGTTCGTTCTATATAAGGGCAGTACGTTTAGTTTACCCATCTGGGGTTCTTGTTACAACATGGAGTCGACTGTGTCTTTTAC >14 AAAAATCCGTTTAAGTCCACACCCCATATCTTTTGCAAGTTACGTCTTCACTTGTGTACTGTAGACTGCTGGGGACTAGAGTCCATCGTTGTGACTTAAAGGACGCTTTC >15 AAAAATGAATGATCCACGCCCGGTCAGTGTAGATCACTGAGCCTCGGCTATTCAGCCCATAAAACACAAAGCCTGCGGAGCGGCGGCCTACTAGAAATGCGATCCATCCT >17 AAAACAAAAGCAATTAGCAACGGACGCGAACTCTCGATCTTACAAGGATACCAGAATAGTTCACCCTGGCTGCGCAATGCCGATATTAAGTGTAGGCTTCGGCCCATTGT >18 AAAACAAGCGATTGAGATTCTAATCCAGCTTGCGACATACTTTGTCAGCGATCTGCGCTGTCTAATTTGCATCAAGTGTCTATACACGGTCCCTCCTCAGGGCGGGCACG >19 
AAAACACTCCCGTATTCATGTCTACGTACCGTTATCCACTATTAAATGTACACTAGCTAACAGTGGTCATTGAGTAATCAAGATCAAGCGCGAGATGGGGGAGGATTGCT >23 AAAACCCAAGATGACGAATATCTGGTTACTGGGTTGACGGGATATAATCATTGCAAAACCTTTTGGGAAGGGACCTTAAAGCCTAGTCTGTCTTTTAGTTGCCCTCGTGG >25 AAAACCTGACGTGTAGATTAGTTACGTCCCCCGGCGCATGCAATCGTCCCACACGGCAGATGTGTTCGGACTGGAGACAATAGTGCTAGTTGGACTACGGCCTATTGGCG >26 AAAACGAAGTAGCGGGTTACGTGAATGGTGCCCACTTACGATAGGTTCGAGGCTGAGGGCTGTACCACGCCGGACGATTCGTGTTCGCCTCCTTGAATCTGATTCCCCGT >29 AAAACGGGACCTGCTTTTTTGCTTTGCGGAGGTCAATGGTTAAGTACGAACCAGCATGCGGGATGTGGCCCCGCCAATGTGGTACCGTTTGGTACTGGATCAGTCCTTTT >38 AAAAGATACCTACGCTCCATCTAGAGGTTAATAGCGAACCTCTCGCTTCTAGAGCGAGGTGGCATTCAGCGAGATGAACCTCTAATCTAAGCCCCGAATATCTGACGCAG >39 AAAAGATTGCGTCTCAGGCGGGCTCTAAACACGCTTCGCCGGACCAACTTTCTGACATTCTGGGACTGTCACTTGCAAGGCCTTATAGCTTTACGGCATTCTCCTTCGCC >40 AAAAGCACTACTCCGTTTGGACCATTTGCGCAAACATGATCCCCGCGCGGTACGGAACTTTTTTACAACGCAGGATCTTCTGACCCGGTAACATGCTACCTGGCGCCTGT >41 AAAAGCCGGGATGGGTAACTCTTTTACTATTTGGGGACAAAGCTGCATAGTGACCGGCACTCAAACATACATTGTAATAGTGAAATCGGAGAGCACCATGCGCCAGTCGG >42 AAAAGCGATTTGAGCGGAGGAGGCGTTGAGGAGATCAGAAGGGGGCACACCGCAACGGTCTTCAACACACACTGGCAGTCTTTAAGACGTTTGAAGTTTAGCTTAGTTAT >43 AAAAGCTGGTCCTGCCATTGATGTCCGATGAGCGACTCTTACCCTTGCAAAGAGCACAAAAGTTATATTCTGATCAATGTCATGCATTTAACATTACTGACAGGGTGGCG >44 AAAAGGAAGATCACAGGAGAAGTAGACACCTCATCGATGGCACGGACCCAGTTCACATATAGGGTGGGGAGCTATCCAGAAGTGGCCATGGTTTTGTAACTTCGCGTTAG >46 AAAAGGGGGAAACAACTGAAGGTCGGTGCCGCGATAAGCCCGTTAACAGGGACGGGCCAAAGCATTAGCTTGTTAAGAACCGATGCTATGTTTATCCGATTGGATGTTGT >50 AAAAGTGACGAATTCCTGCATCGAGAGGATGGCGTCTCTCGCATCGCCGGTCCAAGAAGACCACAGGAAACAGATCGGAGAAGGCCGCAGGTATTCAGGAAGTTCATATA >51 AAAAGTTCCATTGGGGGCACGCCTAAGTTACGGCACCCGAGTTTCGCCAGGAAGTGGAAATTTATTCTTTTGAATCCGCAGAAGTGTAAAATGCCGTCCAATAAAATTAC >53 AAAATAAGGCTTTGGTGCCAAGACCAAACTCGCTTTGATGTCGTCTTGGCCAAAAAGATACCTTCGGATGGGCCCACCCCTATGCTTCCTCATGCTTTCACTAGGGAGAC >55 AAAATAGTAGGCCGAATGGATTGGGTGACGTGTGGCACAGTAAGGGAGGACTATCAGGTGATTCTCTACCTGGAGCCACCATGAACCTCATGAGTAGAGGCGGAACAAGG >58 AAAATCCGTACCTCGAGCGGTTGGAGAACGCTCGCGCTGAATGCCCGTAAGATGTTGACAGTGAAGTGATTTTGCAATCGATGTATTCGTGTCGAATCATCATGCCCGCT >60 AAAATCTATTAGGAGATCAACTGTCCGAGTATTGTGGGGTTGGCTCTACTTACGCAACCCGCGATACAGCAATACGATCCTCGAGCTCTCCTCAACCCCGATTGCGTATG >62 AAAATGCACGAAAATAGGAGCATTCGTCCCAGTAGTGATTGAAAGTCCTTAGGCATAATTCAATACATTCGTTTGGACCCCAAGTGTTGGGCGTTCAACGCGCGAGATTG >64 AAAATGGCGGAATCTGTTGGATCCTGGCCGGTAGAGTGTGCCTACAGATTTGTCGAGGCGGGTAGTCTGCCTGCGGCCTGCACGTTAGAGTACTACCTCATAGTGTTAAG >65 AAAATGTCAACTCACGTTCTTTCGCACTTATGTTTCAGCCTCAGATTCAATTTGACATCCTACAAATATGAGAAAGGCAACCAGGAATGGGGCTGAACCCGTTCAGCCGT >66 AAAATTATGAGAGGGGCGTTCTCGCAATGGAGATTCTTCTCGTCGACTCACCAGGGGACCAGTGCACGCAGCTCCATAGGTGCACGCTCTCGGACGTGGCAACGGAGGAC >68 AAAATTGGCCGCTGAGACATGGGACATGGATTATCGTGTCATATAGACGGCGGCATTTTGCTACTAGCGAACACTCAATGGGGCTTTCCGTGGACTACTATCAATACTGA >70 AAACAAAATTTTGTTCTATCGAGTCTACCAGAGCCGAACACGGCCAAGCCCAATATGCCAGTGGTGTGCTGCTTGAGCAATTCGCAGGTATCTCTCCAACTACATCGCCG >73 AAACAAGGTCCGTACGAAAACCTAGTGACCTCAAATCAGTTGTAGGTGTACTGGCTTGCAACGTTGCCGGTGAACGAAAGAACCGCTAGGGGGCCGTGATGCATTCTACC >79 AAACAGATAGCCACCAACTATACCCTTCTTCGATGTCCATGCGGGTCGTTAACGTCGCTAGGCGTGAACGGACGCTCGTGGATGTCCGTCTACGCAATGTTACGAGTCAA >80 AAACAGCCTCGATGGGTGTACAGTGCACCCTTGCTCGACTACGGCTTCAATTCTGATGTAAAACCTGTACGTGAGACTGCCAAGGCAGATGCAACCAGATCTCCTGGATT >81 AAACAGCTAGGGGTCGACCACTTGCCGGCAGACACTGAGGTAGATATTAAGCAGAACACCGGGTGGTAATTGCATGTCGTATTAGTTCCCGTTGCTTAGCATGCCTAAGG >84 AAACATATGTCATAAGGCGCTGAAGACACCGCACGGGGACTAACACAACAGCACCAGATTGTCGACGTAAGTGCTTTTCCTATTTCTTAGCCCATCTCTAATCAGCCCGG >85 
AAACATCGAGCTTGGACGCGTGCAGGTATTAAATTTGAGCCCCAAGGCTTATAATGCATCCTCCCACAAGAAGGCATAGATGTACTCGTCTTAGTACAAGGCTGCTAGAG >87 AAACATTTGTACGCGAATAAATTATTTCGGTGTCAGAGGCGACACCCGTAAACGGGAGCAAGGCTAAGTCAAGGTGTTGAAGAGAATTTTCTGTGGTCATTTACTGTCCT >89 AAACCACGAATACCAAAATAAAGTCACCCTGTGCCTTAGTGTTTAAGATGTACTGACAATTTCCTGTGGATCGTTGTGCGGTTGCTGTGGGGGCCCTATCAGCGAACGGG >90 AAACCAGTATGCTTTTAAGGGAACCGAGGAATCGCATGATCTTCCGGTGATTATGCCATCTCTAACAGGGAGGCGCCTTGCTTTAACGCTGTACCCGTTTTGTACTCGAA >91 AAACCATCTTGATAATTCTAAGGTCAGTACGAAAGGCCTCTAGTCAACCGTCTCGTGGATCGGGACTCAGCCGTGGAATGATCATCATTAGCAGACAGACAGTCGATATC >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >100 AAACCGTGGGTAAGGCCGGGAGCTTTACACCAATTGGAACCACAAGTGGCTTGGCGGCCCAGGCTTCCTCATGCAGGATCATAACAATTACGCAAGTCCGGGAATCAGGA >91 AAACCATAGATCACAAGTCCACCTCGAGGCGATTATGCATGCCCTTCACTTCTCACGGCTGGATGGGCTTGCCTTAGTCACTTGCGATTGAGTCGTACGATTATAAAGCG >93 AAACCCCCCCGTTACCCTCACTCCGCTCGGCCTAGCGGTGGTCAGGTCAGGAGTTGCAATCGGAGTCACACGATCATACTTTCTCACTGCGCACAACATATCTGCTTGCA simka-1.5.3/example/dataset_metadata.csv000077500000000000000000000001221377312000000202500ustar00rootroot00000000000000DATASET_ID;VARIABLE_1;VARIABLE_2 A;1;aquatic B;1;human C;2;human D;2;soil E;3;soilsimka-1.5.3/example/potara_job/000077500000000000000000000000001377312000000163705ustar00rootroot00000000000000simka-1.5.3/example/potara_job/sge/000077500000000000000000000000001377312000000171465ustar00rootroot00000000000000simka-1.5.3/example/potara_job/sge/job_count.bash000077500000000000000000000001231377312000000217660ustar00rootroot00000000000000#!/bin/bash #$ -S /bin/bash #$ -m bea #$ -cwd source /local/env/envgcc-4.9.1.shsimka-1.5.3/example/potara_job/sge/job_merge.bash000077500000000000000000000001231377312000000217350ustar00rootroot00000000000000#!/bin/bash #$ -S /bin/bash #$ -m bea #$ -cwd source /local/env/envgcc-4.9.1.shsimka-1.5.3/example/potara_job/tgcc/000077500000000000000000000000001377312000000173105ustar00rootroot00000000000000simka-1.5.3/example/potara_job/tgcc/job_count.bash000077500000000000000000000001461377312000000221350ustar00rootroot00000000000000#!/bin/bash #MSUB -r Counting #MSUB -T 86400 #MSUB -q large #MSUB -A fg0001 #MSUB -n 1 #MSUB -Q normalsimka-1.5.3/example/potara_job/tgcc/job_merge.bash000077500000000000000000000001461377312000000221040ustar00rootroot00000000000000#!/bin/bash #MSUB -r Counting #MSUB -T 86400 #MSUB -q large #MSUB -A fg0001 #MSUB -n 1 #MSUB -Q normalsimka-1.5.3/example/simka_input.txt000077500000000000000000000001601377312000000173340ustar00rootroot00000000000000A: A.fasta B: B.fasta C: C.fasta D: D_paired_1.fasta ; D_paired_2.fasta E: A.fasta , A.fasta ; B.fasta , B.fastasimka-1.5.3/example/simple_test.sh000077500000000000000000000024301377312000000171360ustar00rootroot00000000000000#!/bin/bash ADDITIONAL_SIMKA_OPTIONS=$* #simple test with real data # look for simka binary. In devel mode, it's in ../build/bin directory. # In production mode, it's in ../bin directory. if [ -f "../bin/simka" ] then bindir="../bin" elif [ -f "../build/bin/simka" ] then bindir="../build/bin" else echo "could not find a compiled simka binary" exit 1 fi # run simka command="$bindir/simka -in ../example/simka_input.txt -out ./simka_results/ -out-tmp ./simka_temp_output $ADDITIONAL_SIMKA_OPTIONS" #printf "$command\n\n" # DO NOT add lines between '$command' exec and 'var...' ! $command var=$? 
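# $var now holds the exit status of the simka run; it drives the PASSED/FAILED report below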
printf "\n\n\n" if [ $var -eq 0 ] then echo "*** Test: PASSED" else echo "*** Test: FAILED" exit 1 fi #printf "\nremoving all created dirs\n" # clean temp files rm -rf temp_output printf "\nCommand used:\n" printf "\t$command\n" printf "\nCommand for visualizing results:\n" printf "\tpython ../scripts/visualization/run-visualization.py -in ./simka_results/ -out ./simka_results/ -pca -heatmap -tree\n" printf "\nCommand for visualizing results with metadata annotations:\n" printf "\tpython ../scripts/visualization/run-visualization.py -in ./simka_results/ -out ./simka_results/ -pca -heatmap -tree -metadata-in ../example/dataset_metadata.csv -metadata-variable VARIABLE_1\n" simka-1.5.3/scripts/000077500000000000000000000000001377312000000143045ustar00rootroot00000000000000simka-1.5.3/scripts/jenkins/000077500000000000000000000000001377312000000157455ustar00rootroot00000000000000simka-1.5.3/scripts/jenkins/README000066400000000000000000000003361377312000000166270ustar00rootroot00000000000000These scripts are intended to be used with the Jenkins CI Platform available at Inria. They can be called from a Jenkins Task / Build / Execute script, as follows: /bin/bash -xv gatb-${TOOL_NAME}/scripts/jenkins/xxx.sh simka-1.5.3/scripts/jenkins/tool-simka-build-debian7-64bits-gcc-4.7.sh000077500000000000000000000103361377312000000251430ustar00rootroot00000000000000#!/bin/bash #--------------------------------------------------------------# # Continuous integration script for Jenkins # #--------------------------------------------------------------# # # Default mode : # This script will exit with error (exit code 1) if any of its steps fails. # To change this behaviour, choose DO_NOT_STOP_AT_ERROR in Jenkins (see below). #--------------------------------------------------------------# set +xv echo " ----------------------------------------- Miscellaneous information ----------------------------------------- date : `date` hostname : `hostname` pwd : `pwd` ----------------------------------------- Jenkins build parameters (user defined) ----------------------------------------- BRANCH_TO_BUILD : ${BRANCH_TO_BUILD} INRIA_FORGE_LOGIN : ${INRIA_FORGE_LOGIN} DO_NOT_STOP_AT_ERROR : ${DO_NOT_STOP_AT_ERROR} ----------------------------------------- Jenkins build parameters (built in) ----------------------------------------- BUILD_NUMBER : ${BUILD_NUMBER} " error_code () { [ "$DO_NOT_STOP_AT_ERROR" = "true" ] && { return 0 ; } } [ "$DO_NOT_STOP_AT_ERROR" != "true" ] && { set -e ; } || { echo "(!) DEBUG mode, the script will NOT stop..." 
; echo; } set -xv # quick look at resources #----------------------------------------------- free -h #----------------------------------------------- lstopo #----------------------------------------------- df -kh #----------------------------------------------- ################################################################ # COMPILATION # ################################################################ gcc --version g++ --version [ `gcc -dumpversion` = 4.7 ] && { echo "GCC 4.7"; } || { echo "GCC version is not 4.7, we exit"; exit 1; } JENKINS_TASK=tool-${TOOL_NAME}-build-debian7-64bits-gcc-4.7-gitlab JENKINS_WORKSPACE=/scratchdir/builds/workspace GIT_DIR=$JENKINS_WORKSPACE/gatb-${TOOL_NAME} BUILD_DIR=/scratchdir/$JENKINS_TASK/gatb-${TOOL_NAME}/build rm -rf $BUILD_DIR mkdir -p $BUILD_DIR #----------------------------------------------- # we need gatb-core submodule to be initialized cd $GIT_DIR git submodule init git submodule update #----------------------------------------------- cd $BUILD_DIR #----------------------------------------------- cmake -Wno-dev -DJENKINS_TAG=${BRANCH_TO_BUILD} $GIT_DIR #----------------------------------------------- make -j 2 || error_code ################################################################ # TEST # ################################################################ # prepare data and scripts cp -R $GIT_DIR/example/ .. # 'tests' directory does not exist on older releases of simka if [ -d "$GIT_DIR/tests" ]; then cp -R $GIT_DIR/tests/ .. fi if [ -d "$GIT_DIR/simkaMin" ]; then cp -R $GIT_DIR/simkaMin/ .. fi # run tests cd ../example ./simple_test.sh || error_code if [ -d "../tests" ]; then cd ../tests python simple_test.py || error_code cd ./simkaMin python test_simkaMin.py || error_code fi # cleanup disk space cd ../.. rm -rf example if [ -d "tests" ]; then rm -rf tests fi # go back to build for packaging step cd build ################################################################ # PACKAGING # ################################################################ #-- Upload bin bundle as a build artifact # -> bin bundle *-bin-Linux.tar.gz will be archived as a build artifact # -> source package is handled by the osx task if [ $? -eq 0 ] && [ "$INRIA_FORGE_LOGIN" != none ] && [ "$DO_NOT_STOP_AT_ERROR" != true ]; then make package pwd ls -atlhrsF # scp ${ARCHIVE_NAME}-${BRANCH_TO_BUILD}-bin-Linux.tar.gz ${INRIA_FORGE_LOGIN}@scm.gforge.inria.fr:/home/groups/gatb-tools/htdocs/ci-inria fi #-- Move the generated bin bundle to the workspace (so that it can be uploaded as a Jenkins job artifact) # NB: raw command sample # mv /scratchdir/tool-simka-build-debian7-64bits-gcc-4.7/gatb-simka/build/simka-master-bin-Linux.tar.gz \ # /scratchdir/builds/workspace/gatb-simka/build/ mv ${BUILD_DIR}/${ARCHIVE_NAME}-${BRANCH_TO_BUILD}-bin-Linux.tar.gz $JENKINS_WORKSPACE/gatb-${TOOL_NAME}/build simka-1.5.3/scripts/jenkins/tool-simka-build-macos-10.9.5-gcc-4.2.1.sh000077500000000000000000000104351377312000000245050ustar00rootroot00000000000000#!/bin/bash #--------------------------------------------------------------# # Continuous integration script for Jenkins # #--------------------------------------------------------------# # # Default mode : # This script will exit with error (exit code 1) if any of its steps fails. # To change this behaviour, choose DO_NOT_STOP_AT_ERROR in Jenkins (see below).
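# (when DO_NOT_STOP_AT_ERROR is set to true, the error_code helper defined below swallows failures instead of letting 'set -e' abort the build)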
#--------------------------------------------------------------# set +xv echo " ----------------------------------------- Miscellaneous information ----------------------------------------- date : `date` hostname : `hostname` pwd : `pwd` ----------------------------------------- Jenkins build parameters (user defined) ----------------------------------------- BRANCH_TO_BUILD : ${BRANCH_TO_BUILD} INRIA_FORGE_LOGIN : ${INRIA_FORGE_LOGIN} DO_NOT_STOP_AT_ERROR : ${DO_NOT_STOP_AT_ERROR} ----------------------------------------- Jenkins build parameters (built in) ----------------------------------------- BUILD_NUMBER : ${BUILD_NUMBER} " error_code () { [ "$DO_NOT_STOP_AT_ERROR" = "true" ] && { return 0 ; } } [ "$DO_NOT_STOP_AT_ERROR" != "true" ] && { set -e ; } || { echo "(!) DEBUG mode, the script will NOT stop..." ; echo; } set -xv # quick look at resources #----------------------------------------------- sw_vers -productVersion #----------------------------------------------- system_profiler SPSoftwareDataType #----------------------------------------------- lstopo #----------------------------------------------- top -l 1|head -15 #----------------------------------------------- ################################################################ # COMPILATION # ################################################################ gcc --version g++ --version [ `gcc -dumpversion` = 4.2.1 ] && { echo "GCC 4.2.1"; } || { echo "GCC version is not 4.2.1, we exit"; exit 1; } JENKINS_TASK=tool-${TOOL_NAME}-build-macos-10.9.5-gcc-4.2.1-gitlab JENKINS_WORKSPACE=/builds/workspace/$JENKINS_TASK GIT_DIR=$JENKINS_WORKSPACE/gatb-${TOOL_NAME} #N.B. /scratchdir not yet mounted on the osx slave (ciosx). # as soon as /scratchdir is created, one has to update TEST procedure, below. # refer to linux build target to see how to do that BUILD_DIR=$GIT_DIR/build rm -rf $BUILD_DIR mkdir -p $BUILD_DIR #----------------------------------------------- # we need gatb-core submodule to be initialized cd $GIT_DIR git submodule init git submodule update #----------------------------------------------- cd $BUILD_DIR #----------------------------------------------- cmake -Wno-dev -DJENKINS_TAG=${BRANCH_TO_BUILD} $GIT_DIR #----------------------------------------------- make -j 2 || error_code ################################################################ # TEST # ################################################################ cd ../example ./simple_test.sh || error_code # 'tests' directory does not exist on older releases of simka if [ -d "../tests" ]; then cd ../tests python simple_test.py || error_code cd ./simkaMin python test_simkaMin.py || error_code fi cd ../../build ################################################################ # PACKAGING # ################################################################ #--Prepare and upload bin and source bundle to the forge if [ $? -eq 0 ] && [ "$INRIA_FORGE_LOGIN" != none ] && [ "$DO_NOT_STOP_AT_ERROR" != true ]; then make package make package_source pwd ls -atlhrsF # scp ${ARCHIVE_NAME}-${BRANCH_TO_BUILD}-bin-Darwin.tar.gz ${INRIA_FORGE_LOGIN}@scm.gforge.inria.fr:/home/groups/gatb-tools/htdocs/ci-inria # scp ${ARCHIVE_NAME}-${BRANCH_TO_BUILD}-Source.tar.gz ${INRIA_FORGE_LOGIN}@scm.gforge.inria.fr:/home/groups/gatb-tools/htdocs/ci-inria fi #-- Move the generated bundles, bin and sources, to the workspace (so that it can be uploaded as a Jenkins job artifact) # Not necessary in this macos script, since BUILD_DIR is in the workspace (cf. 
above) #mv ${BUILD_DIR}/${ARCHIVE_NAME}-${BRANCH_TO_BUILD}-bin-Darwin.tar.gz $JENKINS_WORKSPACE/gatb-${TOOL_NAME}/build #mv ${BUILD_DIR}/${ARCHIVE_NAME}-${BRANCH_TO_BUILD}-Source.tar.gz $JENKINS_WORKSPACE/gatb-${TOOL_NAME}/build simka-1.5.3/scripts/jenkins/tool-simka-release-debian.sh000077500000000000000000000117121377312000000232230ustar00rootroot00000000000000#!/bin/bash #--------------------------------------------------------------# # Continuous integration script for Jenkins # #--------------------------------------------------------------# # # Default mode : # This script will exit with error (exit code 1) if any of its steps fails. # To change this behaviour, choose DO_NOT_STOP_AT_ERROR in Jenkins (see below). #--------------------------------------------------------------# set +xv echo " ----------------------------------------- Miscellaneous information ----------------------------------------- date : `date` hostname : `hostname` pwd : `pwd` ----------------------------------------- Jenkins build parameters (user defined) ----------------------------------------- BRANCH_TO_BUILD : ${BRANCH_TO_BUILD} RELEASE_TO_BUILD : ${RELEASE_TO_BUILD} INRIA_FORGE_LOGIN : ${INRIA_FORGE_LOGIN} TEST_VARIABLE : ${TEST_VARIABLE} DO_NOT_STOP_AT_ERROR : ${DO_NOT_STOP_AT_ERROR} ----------------------------------------- Jenkins build parameters (built in) ----------------------------------------- BUILD_NUMBER : ${BUILD_NUMBER} " set -xv # quick look at resources #----------------------------------------------- free -h #----------------------------------------------- lstopo #----------------------------------------------- df -kh #----------------------------------------------- ################################################################ # PREPARE RELEASE # ################################################################ # paths to access tool source code and build JENKINS_TASK=tool-${TOOL_NAME}-build-debian7-64bits-gcc-4.7 BUILD_DIR=/scratchdir/$JENKINS_TASK/gatb-${TOOL_NAME}-release TOOL_GIT_HOME="/scratchdir/builds/workspace/gatb-${TOOL_NAME}" # path to 'github_release_manager.sh' script GRM_PATH="${BUILD_DIR}/github-release-api" GRM_CMD="${GRM_PATH}/github_release_manager.sh" # github credentials and repository GITHUB_REPO=${TOOL_NAME} GITHUB_OWNER=GATB GRM_CREDENTIALS="-l $GITHUB_ADMIN -t $GITHUB_TOKEN -o ${GITHUB_OWNER} -r ${GITHUB_REPO}" # Prepare build dir rm -rf $BUILD_DIR mkdir -p $BUILD_DIR #----------------------------------------------- # check tag version; 'master' is not allowed if [ ! 
"${BRANCH_TO_BUILD}" == "master" ] ; then cd ${TOOL_GIT_HOME} DOES_TAG_EXIST=`git tag -l | grep "^${BRANCH_TO_BUILD}$"` if [ -z ${DOES_TAG_EXIST} ] ; then echo "/!\ Error: tag '${BRANCH_TO_BUILD}' does not exist on 'gatb-tool-${TOOL_NAME}' repository" exit 1 fi else echo "/!\ Error: cannot make an official release on 'master' branch" exit 1 fi #----------------------------------------------- if [ "$INRIA_FORGE_LOGIN" == none ]; then echo "/!\ Error: No login name to connect to Inria Forge" exit 1 fi cd $BUILD_DIR git clone https://github.com/pgdurand/github-release-api.git ##################################################################### # RETRIEVE ARTIFACTS FROM DEPENDENT TASKS # ##################################################################### CI_URL=https://ci.inria.fr/gatb-core/view/Simka/job JENKINS_TASK_DEB=tool-simka-build-debian7-64bits-gcc-4.7-gitlab JENKINS_TASK_MAC=tool-simka-build-macos-10.9.5-gcc-4.2.1-gitlab #retrieve last build from ci-inria (see tool-lean-build-XXX tasks) #scp ${INRIA_FORGE_LOGIN}@scm.gforge.inria.fr:/home/groups/gatb-tools/htdocs/ci-inria/${TOOL_NAME}-${BRANCH_TO_BUILD}-bin-Linux.tar.gz . wget $CI_URL/$JENKINS_TASK_DEB/lastSuccessfulBuild/artifact/gatb-simka/build/${TOOL_NAME}-${BRANCH_TO_BUILD}-bin-Linux.tar.gz [ $? != 0 ] && exit 1 #scp ${INRIA_FORGE_LOGIN}@scm.gforge.inria.fr:/home/groups/gatb-tools/htdocs/ci-inria/${TOOL_NAME}-${BRANCH_TO_BUILD}-bin-Darwin.tar.gz . wget $CI_URL/$JENKINS_TASK_MAC/lastSuccessfulBuild/artifact/gatb-simka/build/${TOOL_NAME}-${BRANCH_TO_BUILD}-bin-Darwin.tar.gz [ $? != 0 ] && exit 1 #scp ${INRIA_FORGE_LOGIN}@scm.gforge.inria.fr:/home/groups/gatb-tools/htdocs/ci-inria/${TOOL_NAME}-${BRANCH_TO_BUILD}-Source.tar.gz . wget $CI_URL/$JENKINS_TASK_MAC/lastSuccessfulBuild/artifact/gatb-simka/build/${TOOL_NAME}-${BRANCH_TO_BUILD}-Source.tar.gz [ $? != 0 ] && exit 1 ################################################################ # INTERACT WITH GITHUB # ################################################################ # create Github release ${GRM_CMD} ${GRM_CREDENTIALS} -d ${BRANCH_TO_BUILD} -c create if [ $? != 0 ] ; then echo "/!\ Error: unable to create release, check above error" exit 1 fi #upload files function uploadFile(){ local FILE_TO_LOAD=$1 echo "Uploading: ${FILE_TO_LOAD}" ${GRM_CMD} ${GRM_CREDENTIALS} -d ${BRANCH_TO_BUILD} -c upload ${FILE_TO_LOAD} if [ $? != 0 ] ; then echo "/!\ Error: unable to upload file, check above error" exit 1 fi } uploadFile ${TOOL_NAME}-${BRANCH_TO_BUILD}-bin-Linux.tar.gz uploadFile ${TOOL_NAME}-${BRANCH_TO_BUILD}-bin-Darwin.tar.gz uploadFile ${TOOL_NAME}-${BRANCH_TO_BUILD}-Source.tar.gz simka-1.5.3/scripts/sonarqube_diags/000077500000000000000000000000001377312000000174525ustar00rootroot00000000000000simka-1.5.3/scripts/sonarqube_diags/Dockerfile.sq000066400000000000000000000101041377312000000220620ustar00rootroot00000000000000# This Dockerfile prepares an image equipped with all necessary softwares to prepare # and upload SonarQube diagnostics # # Usage: # docker login registry.gitlab.inria.fr # docker build -f Dockerfile.sq -t registry.gitlab.inria.fr/gatb/simka/simka_sq . 
# docker push registry.gitlab.inria.fr/gatb/simka/simka_sq # NB: these tasks may be launched by gitlab-ci (manual job: update_simka_sq_image) # # References: # see eg https://sed-bso.gitlabpages.inria.fr/sonarqube/#sec-2-5 FROM debian:10 ENV FORCE_UNSAFE_CONFIGURE=1 ENV DEBIAN_FRONTEND noninteractive LABEL maintainer="Charles Deltel " RUN apt-get update RUN apt install -y \ make autoconf wget unzip \ zlib1g-dev libcppunit-dev \ git build-essential cmake clang clang-tidy gcovr lcov cppcheck valgrind python-pip pylint sudo vim tree \ doxygen graphviz # for doxygen doc generation RUN pip install --upgrade pip RUN python -m pip install pytest pytest-cov setuptools scan-build RUN chmod a+rx /root && \ mkdir -p /root/apps ENV version_rats 2.4 RUN cd /root/apps && \ wget https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/rough-auditing-tool-for-security/rats-${version_rats}.tgz && \ tar -xzvf rats-${version_rats}.tgz && \ cd rats-${version_rats} && \ ./configure && make && sudo make install && \ rm /root/apps/rats-${version_rats}.tgz ENV version_drmemory 2.2.0-1 RUN cd /root/apps && \ wget https://github.com/DynamoRIO/drmemory/releases/download/release_2.2.0/DrMemory-Linux-${version_drmemory}.tar.gz && \ tar xf DrMemory-Linux-${version_drmemory}.tar.gz && \ rm /root/apps/DrMemory-Linux-${version_drmemory}.tar.gz RUN cd /root/apps && \ wget --no-check-certificate https://scan.coverity.com/download/linux64 --post-data "token=XEJaJ1cAnqW-9M_zkmxd7w&project=Heat" -O coverity_tool.tgz && \ tar xf coverity_tool.tgz && \ ln -s -f $PWD/cov-analysis-linux64-*/bin/cov-build /usr/local/bin/cov-build && \ rm /root/apps/coverity_tool.tgz RUN cd /root/apps && \ wget https://github.com/eriwen/lcov-to-cobertura-xml/archive/1.6.tar.gz && \ tar xvf 1.6.tar.gz && \ ln -s /root/apps/lcov-to-cobertura-xml-1.6/lcov_cobertura/lcov_cobertura.py /usr/local/bin/lcov_cobertura.py && \ rm /root/apps/1.6.tar.gz RUN cd /root/apps && \ git clone https://github.com/SonarOpenCommunity/sonar-cxx.git && \ chmod +x /root/apps/sonar-cxx/cxx-sensors/src/tools/vera++Report2checkstyleReport.perl && \ ln -s /root/apps/sonar-cxx/cxx-sensors/src/tools/vera++Report2checkstyleReport.perl /usr/local/bin/vera++Report2checkstyleReport.perl ENV version_sonar 4.2.0.1873 RUN cd /root/apps && \ wget https://binaries.sonarsource.com/Distribution/sonar-scanner-cli/sonar-scanner-cli-${version_sonar}-linux.zip && \ unzip sonar-scanner-cli-${version_sonar}-linux.zip && \ ln -s /root/apps/sonar-scanner-${version_sonar}-linux/bin/sonar-scanner /usr/local/bin/sonar-scanner && \ rm /root/apps/sonar-scanner-cli-${version_sonar}-linux.zip # cf. 
https://docs.docker.com/install/linux/docker-ce/debian/ #RUN apt-get remove docker docker-engine docker.io containerd runc RUN sudo apt-get install -y \ apt-transport-https \ ca-certificates \ curl \ gnupg2 \ software-properties-common RUN curl -fsSL https://download.docker.com/linux/debian/gpg | sudo apt-key add - RUN apt-key fingerprint 0EBFCD88 RUN sudo add-apt-repository \ "deb [arch=amd64] https://download.docker.com/linux/debian $(lsb_release -cs) stable" && \ apt-get update && \ apt-get install -y docker-ce docker-ce-cli containerd.io RUN docker --version RUN groupadd -f -g 1000 gitlab && \ useradd -u 1000 -g gitlab -d /home/gitlab/ -ms /bin/bash gitlab && \ mkdir /builds && \ chown -R gitlab:gitlab /builds && \ echo "gitlab:gitlab" | chpasswd && adduser gitlab sudo USER gitlab # change the default shell to be bash SHELL ["/bin/bash", "-c"] # set DRMEMORY path (does not work without using an absolute path) ENV DRMEMORY /root/apps/DrMemory-Linux-${version_drmemory}/bin64 # default working directory is WORKDIR /buildssimka-1.5.3/scripts/sonarqube_diags/analysis.sh000077500000000000000000000107411377312000000216370ustar00rootroot00000000000000# This script launches several diagnostics, whose results will be uploaded to the # SonarQube server. #!/bin/bash set -xv echo_stderr() { echo "$@" 1>&2; } echo_stderr "===> Launching analysis script (to prepare SonarQube diagnostics)..." #################################################################################################### # CppCheck and RATS analysis #################################################################################################### echo_stderr "===> Launching CppCheck analysis..." # see e.g. https://sonarqube.inria.fr/pages/documentation.html#org4575413 export CPPCHECK_INCLUDES="-Icore -Iminikc -IsimkaMin" export SOURCES_TO_ANALYZE="src" export SOURCES_TO_EXCLUDE= # ex. "-isrc/beta" export DEFINITIONS= # -D cppcheck -v -f --language=c++ --platform=unix64 --enable=all --suppress=missingIncludeSystem --xml --xml-version=2 \ ${DEFINITIONS} ${CPPCHECK_INCLUDES} ${SOURCES_TO_EXCLUDE} ${SOURCES_TO_ANALYZE} \ 2> simka-cppcheck.xml echo_stderr "===> Launching RATS analysis..." export SOURCES_TO_ANALYZE="src" rats -w 3 --xml ${SOURCES_TO_ANALYZE} > simka-rats.xml #################################################################################################### # Compile the code #################################################################################################### mkdir build cd build # compilation options echo_stderr "===> Compilation options" CFLAGS="--coverage \ -fPIC -fdiagnostics-show-option \ -Wall -Wunused-parameter -Wundef -Wno-long-long \ -Wsign-compare -Wmissing-prototypes -Wstrict-prototypes -Wcomment \ -pedantic -g" LDFLAGS="--coverage" echo_stderr "CFLAGS: $CFLAGS" echo_stderr "LDFLAGS: $LDFLAGS" # launch cmake echo_stderr "===> Launching cmake with scan-build..." scan-build -v -plist --intercept-first --analyze-headers -o analyzer_reports \ cmake .. -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DCMAKE_CXX_FLAGS="$CFLAGS" -DCMAKE_EXE_LINKER_FLAGS="$LDFLAGS" \ -DCMAKE_BUILD_TYPE=DEBUG \ -DCMAKE_CXX_OUTPUT_EXTENSION_REPLACE=ON &> simka-scan-build-cmake.log # launch make echo_stderr "===> Launching make with scan-build..." time scan-build -v -plist --intercept-first --analyze-headers -o analyzer_reports \ make -j 4 &> simka-scan-build-make.log # make -j 4 [[ -x "bin/simka" ]] || { echo "Error, simka executable not generated"; exit 111; } mv simka-scan-build-cmake.log .. 
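# (these moves put the scan-build logs and plist reports at the repository root, where the CI analysis job collects them as artifacts)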
mv simka-scan-build-make.log .. mv analyzer_reports .. #################################################################################################### # Clang-tidy analysis #################################################################################################### echo_stderr "===> Launching clang-tidy..." test -f compile_commands.json || echo "Warning, compilation database missing" cd .. #clang-tidy -p build/ $(find src/ -name *.cpp) -checks='*' > simka-clang-tidy-report.log clang-tidy -p build/ $(find src/ -name *.cpp) -checks=-*,clang-analyzer-*,cppcoreguidelines-* > simka-clang-tidy-report.log cd - # back in build/ again mv compile_commands.json .. #################################################################################################### # Coverage analysis #################################################################################################### echo_stderr "===> Launching lcov (initial)..." # run initial/baseline lcov lcov --capture --initial --directory . --output-file simka_coverage_base.info # run tests echo_stderr "===> Launching a simple test run..." ../example/simple_test.sh -max-count 2 -max-merge 4 -nb-cores 4 -max-memory 3000 # run lcov again after tests complete echo_stderr "===> Launching lcov (after test completion)..." lcov --capture --directory . --output-file simka_coverage_test.info # combine lcov tracefiles lcov --add-tracefile simka_coverage_base.info \ --add-tracefile simka_coverage_test.info \ --output-file simka_coverage_total.info # extract useful data lcov --extract simka_coverage_total.info '*simka/src/*' \ --output-file simka_coverage_total_filtered.info # generate the html lcov page that can be published echo_stderr "===> Generating coverage html pages..." genhtml -o coverage-html simka_coverage_total_filtered.info mv coverage-html .. # convert the lcov report to an xml format convertible with SonarQube echo_stderr "===> Generating coverage report for SonarQube..." cd .. lcov_cobertura.py build/simka_coverage_total_filtered.info --output gcov.xml --base-dir src/ echo_stderr "===> Done..." 
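The coverage steps above follow the standard two-capture lcov pattern: an initial baseline capture right after the build (so that source files never exercised by the tests still appear, with zero counts), a second capture after running the tests, and a merge of the two tracefiles before filtering and report generation. Below is a minimal, self-contained sketch of that pattern; the build directory, test command and filter pattern are illustrative, not the exact ones used by analysis.sh:

```bash
# Build with gcov instrumentation (assumes gcc, lcov and genhtml are installed).
cmake -B build -DCMAKE_CXX_FLAGS="--coverage" -DCMAKE_EXE_LINKER_FLAGS="--coverage"
cmake --build build

# 1) Baseline capture: records every instrumented file with zero execution counts.
lcov --capture --initial --directory build --output-file base.info

# 2) Run the workload (illustrative test command), then capture real counts.
./build/bin/my_tests
lcov --capture --directory build --output-file test.info

# 3) Merge both tracefiles, keep only project sources, and summarize.
lcov --add-tracefile base.info --add-tracefile test.info --output-file total.info
lcov --extract total.info '*/src/*' --output-file total_filtered.info
lcov --summary total_filtered.info
```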
simka-1.5.3/scripts/visualization/000077500000000000000000000000001377312000000172055ustar00rootroot00000000000000simka-1.5.3/scripts/visualization/dendro.r000077500000000000000000000047641377312000000206610ustar00rootroot00000000000000#Author: Gaetan Benoit #Contact: gaetan.benoit@inria.fr args <- commandArgs(trailingOnly = TRUE) distanceMatrixFilename = args[1] distance_name = basename(distanceMatrixFilename) distance_name = unlist(strsplit(distance_name, "[.]"))[1] distance_name = gsub("mat_", "", distance_name) distanceMatrix = as.matrix(read.table(file=distanceMatrixFilename, sep=";", header=TRUE, row.names=1)) distanceMatrix[lower.tri(distanceMatrix)] <- t(distanceMatrix)[lower.tri(distanceMatrix)] #symmetrize matrix width = as.numeric(args[3]) height = as.numeric(args[4]) format = args[5] if(format == "png"){ png(file=paste0(args[2], ".png"), width=width, height=height, units="in",res=72) } else{ pdf(file=paste0(args[2], ".pdf"), width=width, height=height) } use_metadata = F if(length(args) == 7){ suppressPackageStartupMessages(library(dendextend)) use_metadata = T metadata_table = as.matrix(read.table(file=args[6], sep=";", header=TRUE, row.names=1)) metadata_variable = args[7] #print(metadata_table) variables = metadata_table[,metadata_variable] #print(variables) meatadata_index = list() dataset_ids = rownames(metadata_table) for(i in 1:length(dataset_ids)){ dataset_id = dataset_ids[i] #print(dataset_id) #print(variables[[i]]) meatadata_index[[dataset_id]] = variables[[i]] print(paste0(dataset_id, " ", variables[[i]])) #print(meatadata_index[[dataset_id]]) } colors = c() dataset_ids = rownames(distanceMatrix) for(i in 1:dim(distanceMatrix)[1]){ dataset_id = dataset_ids[i] colors = c(colors, meatadata_index[[dataset_id]]) } colors_numeric_temp = c() colors_numeric = as.numeric(as.factor(colors)) for(i in 1:length(colors_numeric)){ colors_numeric_temp = c(colors_numeric_temp, colors_numeric[i]+1) } colors_numeric = colors_numeric_temp #print(colors) } distanceMatrix = distanceMatrix*100 #inv_cr3 = matrix(100, ncol=dim(cr3)[1], nrow=dim(cr3)[1]) - cr3 Commet_distance = as.dist(distanceMatrix) hc = hclust(Commet_distance, method="average") dendo_cr3 = as.dendrogram(hc) if(use_metadata){ colors_numeric_hc = colors_numeric[hc$order] dendo_cr3 %>% set("labels_col", colors_numeric_hc) %>% set("branches_k_color", colors_numeric_hc) %>% # change color plot(main=paste0("Simka hierarchical clustering\n", distance_name), cex = 0.3, xlab="", sub="") legend("topright", title=metadata_variable, legend=unique(colors), col=unique(colors_numeric), pch=16) } else{ plot(dendo_cr3, main=paste0("Simka hierarchical clustering\n", distance_name), cex = 0.3, xlab="", sub="") } simka-1.5.3/scripts/visualization/heatmap.r000077500000000000000000000145141377312000000210170ustar00rootroot00000000000000# Contributors : # Pierre PETERLONGO, pierre.peterlongo@inria.fr [12/06/13] # Nicolas MAILLET, nicolas.maillet@inria.fr [12/06/13] # Guillaume Collet, guillaume@gcollet.fr [27/05/14] # Gaetan BENOIT, gaetan.benoit@inria.fr [08/10/15] # Claire LEMAITRE, claire.lemaitre@inria.fr [06/07/16] # # This software is a computer program whose purpose is to find all the # similar reads between sets of NGS reads. It also provide a similarity # score between the two samples. 
# # Copyright (C) 2014 INRIA # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . ## Usage : Rscript heatmap.r matrix_asym.csv matrix_sym.csv output_file.pdf title if (!require("gplots")) { install.packages("gplots", dependencies = TRUE) library(gplots) } #options(echo=TRUE) # if you want see commands in output file args <- commandArgs(trailingOnly = TRUE) #png(file=args[3],width=800,height=800,res=65) width = as.numeric(args[4]) height = as.numeric(args[5]) format = args[6] if(format == "png"){ png(file=paste0(args[3], ".png"), width=width, height=height, units="in",res=72) } else{ pdf(file=paste0(args[3], ".pdf"), width=width, height=height) } cr3 = as.matrix(read.table(file=args[1], sep=";", header=TRUE, row.names=1)) # can be symetric matrix cr3_norm = as.matrix(read.table(file=args[2], sep=";", header=TRUE, row.names=1)) # must be a symetric matrix cr3[lower.tri(cr3)] <- t(cr3)[lower.tri(cr3)] #symmetrize matrix cr3_norm[lower.tri(cr3_norm)] <- t(cr3_norm)[lower.tri(cr3_norm)] #symmetrize matrix distance_name = basename(args[1]) distance_name = unlist(strsplit(distance_name, "[.]"))[1] distance_name = gsub("mat_", "", distance_name) use_metadata = F if(length(args) == 8){ use_metadata = T metadata_table = as.matrix(read.table(file=args[7], sep=";", header=TRUE, row.names=1)) metadata_variable = args[8] #print(metadata_table) variables = metadata_table[,metadata_variable] #print(variables) meatadata_index = list() dataset_ids = rownames(metadata_table) for(i in 1:length(dataset_ids)){ dataset_id = dataset_ids[i] #print(dataset_id) #print(variables[[i]]) meatadata_index[[dataset_id]] = variables[[i]] #print(meatadata_index[[dataset_id]]) } colors = c() dataset_ids = rownames(cr3_norm) for(i in 1:dim(cr3_norm)[1]){ dataset_id = dataset_ids[i] colors = c(colors, meatadata_index[[dataset_id]]) } colors_numeric_temp = c() colors_numeric = as.numeric(as.factor(colors)) for(i in 1:length(colors_numeric)){ colors_numeric_temp = c(colors_numeric_temp, colors_numeric[i]+1) } colors_numeric = colors_numeric_temp #print(colors) } n=100 # number of steps between 2 colors ## Transforming 0-1 distances in 0-100 similarity measure if(grepl("chord",args[1]) || grepl("hellinger",args[1])){ cr3 = (sqrt(2) - cr3) * 100 } else { cr3 = (1 - cr3) * 100 } ## Computing mini-maxi for colour palette mini=min(cr3[]) maxi=max(cr3[row(cr3)!=col(cr3)]) # ignoring the diagonal trueMax=max(cr3[]) # typically the value in the diagonal = 100 q25=quantile(cr3[row(cr3)!=col(cr3)],0.25,1) q50=quantile(cr3[row(cr3)!=col(cr3)],0.5,1) q75=quantile(cr3[row(cr3)!=col(cr3)],0.75,1) ## We use the quantiles to ignore some outlier values in the matrix (valuesmaxi will have a colour between brown and grey23) mini=max(q25-1.5*(q75-q25),0) maxi=min(q75+1.5*(q75-q25),trueMax) palette=colorRampPalette(c("green", "yellow", "red", "brown", "grey23"))(n = 5*n-1) ## Checking if maxi = trueMax trueMax.needed=ifelse(maxi /dev/null 2>&1 ") def outputHclust(outputFilename, matrixNormFilename): if not 
args.want_tree: return
	command = "Rscript " + hclust_script_filename + " " + join(args.input_dir, matrixNormFilename) + " " + join(args.output_dir, outputFilename)
	command = add_metadata_args(command)
	print("\t"+command)
	os.system(command)

def outputPca(outputFilename, matrixNormFilename):
	if not args.want_pca: return
	command = "Rscript " + pca_script_filename + " " + join(args.input_dir, matrixNormFilename) + " " + join(args.output_dir, outputFilename) + " " + args.pca_axis_1 + " " + args.pca_axis_2
	command = add_metadata_args(command)
	print("\t"+command)
	os.system(command)

def execute():
	files = [f for f in listdir(args.input_dir) if isfile(join(args.input_dir, f))]

	for filename in files:
		asym = False

		if ".csv.gz" not in filename: continue

		if "asym" in filename:
			asym = True
			asym_filename = filename
			filename = filename.replace("_asym", "")

		method_name = filename.split(".")[0]
		method_name = method_name.replace("mat_", "")

		try:
			if asym:
				matrix[method_name].append(asym_filename)
			else:
				matrix[method_name].append(filename)
		except KeyError:
			matrix[method_name] = []
			if asym:
				matrix[method_name].append(asym_filename)
			else:
				matrix[method_name].append(filename)

	for method_name, matrix_filenames in matrix.items():
		print("")
		print(method_name)

		# one version of the similarity function (sym)
		if len(matrix_filenames) == 1:
			outputHeatmap("heatmap_" + method_name, matrix_filenames[0], matrix_filenames[0])
			outputHclust("hclust_" + method_name, matrix_filenames[0])
			outputPca("pca_" + method_name, matrix_filenames[0])
		# two versions of the similarity function (sym and asym)
		else:
			sym = ""
			asym = ""
			for filename in matrix_filenames:
				if "asym" in filename:
					asym = filename
				else:
					sym = filename
			outputHeatmap("heatmap_" + method_name, asym, sym)
			outputHclust("hclust_" + method_name, sym)
			outputPca("pca_" + method_name, sym)

rscript_dir = os.path.dirname(os.path.realpath(__file__))
heatmap_script_filename = join(rscript_dir, "heatmap.r")
hclust_script_filename = join(rscript_dir, "dendro.r")
pca_script_filename = join(rscript_dir, "pca.r")

if not args.want_heatmap and not args.want_pca and not args.want_tree:
	print("Please, choose at least one option among: -heatmap -tree -pca")
	exit(1)

if not os.path.exists(args.output_dir): os.makedirs(args.output_dir)

execute()
simka-1.5.3/simkaMin/000077500000000000000000000000001377312000000143655ustar00rootroot00000000000000simka-1.5.3/simkaMin/README.md000066400000000000000000000123331377312000000156460ustar00rootroot00000000000000# SimkaMin

[![License](http://img.shields.io/:license-affero-blue.svg)](http://www.gnu.org/licenses/agpl-3.0.en.html)

## What is SimkaMin?

As in the case of Simka, SimkaMin is a *de novo* comparative metagenomics tool. The difference with Simka is that SimkaMin outputs approximate (but very similar) results by subsampling the k-mer space. With this strategy, and with default parameters, SimkaMin is an order of magnitude faster, uses 10 times less memory and 70 times less disk than Simka.
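Under the hood, the `simkaMin.py` driver chains three `simkaMinCore` subcommands: `sketch` (select a fixed-size, seeded subsample of k-mers per dataset), `distance` (compare two sketch files all-vs-all) and `export` (turn the binary distance matrices into csv.gz files). The following is only a sketch of the equivalent manual pipeline, assuming `simkaMinCore` is on your PATH; the output paths are illustrative and the option values are the driver's defaults:

```bash
# 1) Sketch: subsample 1M k-mers per dataset; the fixed seed keeps sketches comparable.
simkaMinCore sketch -in example/simka_input.txt -out sketch.bin -seed 100 -kmer-size 21 -nb-kmers 1000000

# 2) Distance: all-vs-all comparison of the sketch against itself.
simkaMinCore distance -in1 sketch.bin -in2 sketch.bin -out distance/

# 3) Export: convert the binary matrices into gzipped CSV distance matrices.
simkaMinCore export -in distance/ -in1 sketch.bin -in2 sketch.bin -out results/
```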
Developer: [Gaëtan Benoit](http://people.rennes.inria.fr/Gaetan.Benoit/), PhD, former member of the [Genscale](http://team.inria.fr/genscale/) team at Inria.

Contact: claire dot lemaitre at inria dot fr

## References

Benoit G, Mariadassou M, Robin S, Schbath S, Peterlongo P and Lemaitre C. (2019) [SimkaMin: fast and resource frugal *de novo* comparative metagenomics](https://doi.org/10.1093/bioinformatics/btz685). Bioinformatics

Benoit G, Peterlongo P, Mariadassou M, Drezen E, Schbath S, Lavenier D, Lemaitre C. (2016) [Multiple comparative metagenomics using multiset k-mer counting](https://doi.org/10.7717/peerj-cs.94). PeerJ Computer Science 2:e94

Benoit G (2017) [Large scale de novo comparative metagenomics (PhD thesis in French)](https://tel.archives-ouvertes.fr/tel-01659395v2/).

## Install simkaMin

SimkaMin comes with the Simka installation. Refer to the [Simka install instructions](../README.md).

## User manual

### Description

SimkaMin computes Bray-Curtis (abundance-based) and Jaccard (presence/absence-based) distances between N (metagenomic) read sets, based on subsamples of their k-mer counts. It takes the N read sets as input and outputs two matrices providing, respectively, the pairwise Bray-Curtis and Jaccard distances between all dataset pairs.

### A simple command example

Run the toy example:

```bash
./simkaMin/simkaMin.py -in example/simka_input.txt -out results
```

### Input

The input file (`-in`) lists the datasets. Datasets can be in fasta or fastq format, optionally gzip-compressed (.gz). One dataset per line, with the following syntax (any number of spaces and/or tabs may surround the separators):

    ID1: filename.fasta
    ID2: filename.fasta
    ID3: filename.fasta

The dataset ID is the name that will appear in the headers of the distance matrices. An example Simka input file is provided in the example directory: ./example/data/simka_input.txt

If a given dataset has been split into several parts, Simka can automatically concatenate them:

    ID1: filename_part1.fasta , filename_part2.fasta , ...

If you have paired files, you can list them separated by a ';':

    ID1: filename_pair1.fasta ; filename_pair2.fasta

You can combine the concatenation and pairing notations:

    ID1: filename_part1_pair1.fasta , filename_part2_pair1.fasta ; filename_part1_pair2.fasta , filename_part2_pair2.fasta

The paired syntax is only useful when the `-max-reads` option of SimkaMin is set. For example, if `-max-reads` is set to 100, SimkaMin will consider the first 100 reads of the first paired file and the first 100 reads of the second paired file…

### Output

SimkaMin outputs an abundance-based Bray-Curtis distance matrix `mat_abundance_braycurtis.csv.gz` and a presence/absence-based Jaccard distance matrix `mat_presenceAbsence_jaccard.csv.gz`. A distance matrix is a square matrix of size N (where N is the number of input datasets). Each value in the matrix gives the distance between a pair of datasets, in the range [0, 1]: a distance of 0 means that the two datasets are perfectly similar, and the greater the value, the more dissimilar the pair of datasets. SimkaMin results are stored in the directory given by the `-out` option. A quick way to inspect these matrices from the command line is shown at the end of this section.

#### Visualize SimkaMin results

SimkaMin results can be visualised through heatmaps, hierarchical clustering and PCA. This module is shared with Simka: use the visualisation script `../scripts/visualization/run-visualization.py` and refer to the documentation provided in the [Simka Readme file](../README.md).
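As noted in the Output section above, here is a quick way to inspect the exported matrices from the command line. The matrices are semicolon-separated CSV files with dataset IDs as both row and column headers; the path below assumes the toy example's `-out results`:

```bash
# Show the top-left corner of the Bray-Curtis matrix (first 4 rows/columns).
gunzip -c results/mat_abundance_braycurtis.csv.gz | head -n 4 | cut -d ';' -f 1-4
```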
## Usage

To see the SimkaMin command-line help:

```bash
./simkaMin/simkaMin.py
```

## SimkaMin command examples

Run the toy example:

```bash
./simkaMin/simkaMin.py -in example/simka_input.txt -out results
```

Change the k-mer size:

```bash
./simkaMin/simkaMin.py … -kmer-size 31
```

Change the sub-sampling effort (by default, 1 million k-mers are used per read set):

```bash
./simkaMin/simkaMin.py … -nb-kmers 10000
```

Filter out k-mers seen only once (potentially erroneous):

```bash
./simkaMin/simkaMin.py … -filter
```

Consider all the reads of each sample (set 0 to use all reads):

```bash
./simkaMin/simkaMin.py … -max-reads 0
```

Use only the first 1000 reads of each sample:

```bash
./simkaMin/simkaMin.py … -max-reads 1000
```

Allow more memory and cores to improve the execution time:

```bash
./simkaMin/simkaMin.py … -max-memory 20000 -nb-cores 8
```

Filter out low-complexity reads:

```bash
./simkaMin/simkaMin.py … -min-shannon-index 1
```

Filter out short reads:

```bash
./simkaMin/simkaMin.py … -min-read-size 80
```

Update existing results with additional datasets:

```bash
./simkaMin/simkaMin_update.py -in another_simka_input.txt -in-to-update results/simkamin
# updated matrices will be in dir results/simkamin/
```
simka-1.5.3/simkaMin/simkaMin.py000077500000000000000000000213011377312000000165070ustar00rootroot00000000000000#!/usr/bin/env python

#*****************************************************************************
#   SimkaMin: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets
#   A tool from the GATB (Genome Assembly Tool Box)
#   Copyright (C) 2019  INRIA
#   Authors: G.Benoit, C.Lemaitre, P.Peterlongo
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU Affero General Public License as
#  published by the Free Software Foundation, either version 3 of the
#  License, or (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Affero General Public License for more details.
#
#  You should have received a copy of the GNU Affero General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
#*****************************************************************************

import os, math, subprocess
from os import listdir
from os.path import isfile, join, splitext
import sys, argparse
from simkaMin_utils import SimkaParser, ArgumentFormatterSimka, read_sketch_header, ProgressBar, is_executable

#-------------------------------------------------------------------------------------------------------------
# Arg parser
#-------------------------------------------------------------------------------------------------------------
parser = SimkaParser(formatter_class=ArgumentFormatterSimka)

parserMain = parser.add_argument_group("[main options]")
parserCore = parser.add_argument_group("[core options]")
parserDistance = parser.add_argument_group("[distance options]")
parserKmer = parser.add_argument_group("[k-mer options]")
parserRead = parser.add_argument_group("[read options]")
parserDev = parser.add_argument_group("[advanced (developer) options]")

parserMain.add_argument('-in', action="store", dest="input_filename", help="input file of datasets.
One sample per line: id1: filename1...", required=True) parserMain.add_argument('-out', action="store", dest="out", default="./simka_results", help="output directory for result files (distance matrices)") parserMain.add_argument('-seed', action="store", dest="seed", default="100", help="seed used for random k-mer selection") parserMain.add_argument('-bin', action="store", dest="bin", help="path to simkaMinCore program (to be specified if not in PATH, or not in standard installation directory /build/bin/simkaMinCore)") parserKmer.add_argument('-kmer-size', action="store", dest="kmer_size", help="size of a kmer", default="21") parserKmer.add_argument('-nb-kmers', action="store", dest="nb_kmers", help="number of kmers used to compute distances", default="1000000") parserKmer.add_argument('-filter', action="store_true", dest="filter", help="filter out k-mer seen one time (potentially erroneous)") parserRead.add_argument('-max-reads', action="store", dest="max_reads", default="0", help="maximum number of reads per sample to process") parserRead.add_argument('-min-read-size', action="store", dest="min_read_size", default="0", help="minimal size a read should have to be kept") parserRead.add_argument('-min-shannon-index', action="store", dest="min_shannon_index", default="0", help="minimal Shannon index a read should have to be kept. Float in [0,2]") parserCore.add_argument('-nb-cores', action="store", dest="nb_cores", help="number of cores", default="0") parserCore.add_argument('-max-memory', action="store", dest="max_memory", help="max memory (MB)", default="8000") args = parser.parse_args() # Check SimkaMinCore executable # ----------------------------- simkaMinCoreBin=args.bin if args.bin is not None: # given by the user if not is_executable(simkaMinCoreBin): print("Error: "+simkaMinCoreBin+" not found or not executable, should be /build/bin/simkaMinCore") exit(1) else: # Check if is in the PATH simkaMinCoreBin="simkaMinCore" if not is_executable(simkaMinCoreBin): # not in PATH, checking "../build/bin/simkaMinCore" simkaMinCoreBin=os.path.join(os.path.split(os.path.realpath(__file__))[0],"../build/bin/simkaMinCore") if not is_executable(simkaMinCoreBin): print("Error: simkaMinCore executable not found, please give the executable path with option -bin (should be /build/bin/simkaMinCore)") exit(1) #------------------------------------------------------------------------------------------------------------- # SimkaMin pipeline #------------------------------------------------------------------------------------------------------------- #Create some dirs and filenames if not os.path.exists(args.out): os.makedirs(args.out) outDir = os.path.join(args.out, "simkamin") if not os.path.exists(outDir): os.makedirs(outDir) sketchDir = os.path.join(outDir, "sketch") if not os.path.exists(sketchDir): os.makedirs(sketchDir) sketchFilename = os.path.join(sketchDir, "sketch.bin") distanceOutputDir = os.path.join(outDir, "distance") if not os.path.exists(distanceOutputDir): os.makedirs(distanceOutputDir) logsDir = os.path.join(outDir, "logs") if not os.path.exists(logsDir): os.makedirs(logsDir) #Create commands sketchCommand = simkaMinCoreBin + " sketch " sketchCommand += " -in " + args.input_filename sketchCommand += " -out " + sketchFilename sketchCommand += " -seed " + args.seed sketchCommand += " -kmer-size " + args.kmer_size sketchCommand += " -nb-kmers " + args.nb_kmers if args.filter: sketchCommand += " -filter " sketchCommand += " -max-reads " + args.max_reads sketchCommand += " -min-read-size " + 
args.min_read_size sketchCommand += " -min-shannon-index " + args.min_shannon_index sketchCommand += " -nb-cores " + args.nb_cores sketchCommand += " -max-memory " + args.max_memory exportCommand = simkaMinCoreBin + " export " exportCommand += " -in " + distanceOutputDir exportCommand += " -in1 " + sketchFilename exportCommand += " -in2 " + sketchFilename #exportCommand += " -in-ids " + distanceOutputDir #not applicable here exportCommand += " -out " + args.out exportCommand += " -nb-cores " + args.nb_cores print("\n\n#-----------------------------") print("# Sketching") print(sketchCommand) print("#-----------------------------\n") print("\n\n") ret = os.system(sketchCommand) if ret != 0: print("ERROR"); exit(1) print("\n\n#-----------------------------") print("# Computing distances") print("#-----------------------------\n") print("\n\n") #Create binary matrix file (required in case the following distance commands are run in parallel if os.path.exists(distanceOutputDir + "/mat_presenceAbsence_jaccard.bin"): os.remove(distanceOutputDir + "/mat_presenceAbsence_jaccard.bin") if os.path.exists(distanceOutputDir + "/mat_abundance_braycurtis.bin"): os.remove(distanceOutputDir + "/mat_abundance_braycurtis.bin") open(distanceOutputDir + "/mat_presenceAbsence_jaccard.bin", "wb").close() open(distanceOutputDir + "/mat_abundance_braycurtis.bin", "wb").close() sketch_header = read_sketch_header(sketchFilename) nbDatasetToProcess = sketch_header["nbDatasets"] MAX_DATASETS_PROCESS = 100 def create_distance_command(i, j, n1, n2): distanceCommand = simkaMinCoreBin + " distance " distanceCommand += " -in1 " + sketchFilename distanceCommand += " -in2 " + sketchFilename distanceCommand += " -out " + distanceOutputDir distanceCommand += " -nb-cores " + args.nb_cores distanceCommand += " -start-i " + str(i*MAX_DATASETS_PROCESS) distanceCommand += " -start-j " + str(j*MAX_DATASETS_PROCESS) distanceCommand += " -n-i " + str(n1) distanceCommand += " -n-j " + str(n2) distanceCommand += " > " + os.path.join(logsDir, "log_distance_" + str(i) + "-" + str(j)) + " 2>&1 " return distanceCommand step = int(math.ceil( float(nbDatasetToProcess) / float(MAX_DATASETS_PROCESS))) nbCommands = int(math.ceil( float(step * step) / float(2))) progressBar = ProgressBar("Computing distances", nbCommands) progressBar.start() done = False for i in range(0, step): n1 = min(MAX_DATASETS_PROCESS, nbDatasetToProcess-i*MAX_DATASETS_PROCESS) for j in range(i, step): n2 = min(MAX_DATASETS_PROCESS, nbDatasetToProcess-j*MAX_DATASETS_PROCESS) distanceCommand = create_distance_command(i, j, n1, n2) #print distanceCommand ret = os.system(distanceCommand) if ret != 0: print("ERROR"); exit(1) progressBar.step(1) #print("\n\n#-----------------------------") #print("# Exporting distances") #print("#-----------------------------\n") print("\n\nExporting distance matrices in csv.gz format...") ret = os.system(exportCommand) if ret != 0: print("ERROR"); exit(1) print("\n\n") print("Result dir: " + args.out) simka-1.5.3/simkaMin/simkaMin_update.py000077500000000000000000000233501377312000000200570ustar00rootroot00000000000000#!/usr/bin/env python #***************************************************************************** # SimkaMin: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets # A tool from the GATB (Genome Assembly Tool Box) # Copyright (C) 2019 INRIA # Authors: G.Benoit, C.Lemaitre, P.Peterlongo # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU 
Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . #***************************************************************************** import os, struct, shutil from os import listdir from os.path import isfile, join, splitext import sys, argparse from simkaMin_utils import SimkaParser, ArgumentFormatterSimka, read_sketch_header, is_executable #------------------------------------------------------------------------------------------------------------- # Arg parser #------------------------------------------------------------------------------------------------------------- parser = SimkaParser(formatter_class=ArgumentFormatterSimka) parserMain = parser.add_argument_group("[main options]") parserCore = parser.add_argument_group("[core options]") parserDistance = parser.add_argument_group("[distance options]") parserKmer = parser.add_argument_group("[k-mer options]") parserRead = parser.add_argument_group("[read options]") parserDev = parser.add_argument_group("[advanced (developer) options]") parserMain.add_argument('-in', action="store", dest="input_filename", help="input file of datasets (datasets to add to existing simka results", required=True) parserMain.add_argument('-in-to-update', action="store", dest="input_existingResults", help="path to existing simka results to update (existing results will be overwritten)", required=True) parserMain.add_argument('-bin', action="store", dest="bin", help="path to simkaMinCore program (to be specified if not in PATH, or not in standard installation directory /build/bin/simkaMinCore)") parserKmer.add_argument('-filter', action="store_true", dest="filter", help="filter out k-mer seen one time (potentially erroneous)") parserRead.add_argument('-max-reads', action="store", dest="max_reads", default="0", help="maximum number of reads per sample to process") parserRead.add_argument('-min-read-size', action="store", dest="min_read_size", default="0", help="minimal size a read should have to be kept") parserRead.add_argument('-min-shannon-index', action="store", dest="min_shannon_index", default="0", help="minimal Shannon index a read should have to be kept. 
Float in [0,2]") parserCore.add_argument('-nb-cores', action="store", dest="nb_cores", help="number of cores", default="0") parserCore.add_argument('-max-memory', action="store", dest="max_memory", help="max memory (MB)", default="8000") args = parser.parse_args() # Check SimkaMinCore executable # ----------------------------- simkaMinCoreBin=args.bin if args.bin is not None: # given by the user if not is_executable(simkaMinCoreBin): print("Error: "+simkaMinCoreBin+" not found or not executable, should be /build/bin/simkaMinCore") exit(1) else: # Check if is in the PATH simkaMinCoreBin="simkaMinCore" if not is_executable(simkaMinCoreBin): # not in PATH, checking "../build/bin/simkaMinCore" simkaMinCoreBin=os.path.join(os.path.split(os.path.realpath(__file__))[0],"../build/bin/simkaMinCore") if not is_executable(simkaMinCoreBin): print("Error: simkaMinCore executable not found, please give the executable path with option -bin (should be /build/bin/simkaMinCore)") exit(1) #------------------------------------------------------------------------------------------------------------- # SimkaMin pipeline #------------------------------------------------------------------------------------------------------------- #Create some dirs and filenames #if not os.path.exists(args.out): os.makedirs(args.out) existingDir = args.input_existingResults sketchDir = os.path.join(existingDir, "sketch") #if not os.path.exists(sketchDir): os.makedirs(sketchDir) sketchFilename_existing = os.path.join(sketchDir, "sketch.bin") sketchFilename_new = os.path.join(sketchDir, "sketch_new.bin") distanceOutputDir = os.path.join(existingDir, "distance") distanceDir_existingVsNew = os.path.join(distanceOutputDir, "existingVsNew") if not os.path.exists(distanceDir_existingVsNew): os.makedirs(distanceDir_existingVsNew) distanceDir_newVsNew = os.path.join(distanceOutputDir, "newVsNew") if not os.path.exists(distanceDir_newVsNew): os.makedirs(distanceDir_newVsNew) #simkaMin_pipeline_filename = "./simkaMin_pipeline.py" #Existing datasets: datasets that have already been processed by SimkaMin # - ids and k-mers are contained in (-in-existing)/sketch/sketch.bin # - distances are contained in (-in-existing)/distance/mat_*.bin #New datasets: datasets to add contains in -in file existing_sketch_header = read_sketch_header(sketchFilename_existing) print(existing_sketch_header) #Sketch new datasets command_sketchNewDatasets = simkaMinCoreBin + " sketch " command_sketchNewDatasets += " -in " + args.input_filename command_sketchNewDatasets += " -out " + sketchFilename_new command_sketchNewDatasets += " -seed " + str(existing_sketch_header["seed"]) command_sketchNewDatasets += " -kmer-size " + str(existing_sketch_header["kmerSize"]) command_sketchNewDatasets += " -nb-kmers " + str(existing_sketch_header["sketchSize"]) if args.filter: command_sketchNewDatasets += " -filter " command_sketchNewDatasets += " -max-reads " + args.max_reads command_sketchNewDatasets += " -min-read-size " + args.min_read_size command_sketchNewDatasets += " -min-shannon-index " + args.min_shannon_index command_sketchNewDatasets += " -nb-cores " + args.nb_cores command_sketchNewDatasets += " -max-memory " + args.max_memory #Compute distance between existing datasets and new datasets command_distance_existingVsNew = simkaMinCoreBin + " distance " command_distance_existingVsNew += " -in1 " + sketchFilename_existing command_distance_existingVsNew += " -in2 " + sketchFilename_new command_distance_existingVsNew += " -out " + distanceDir_existingVsNew 
command_distance_existingVsNew += " -nb-cores " + args.nb_cores #Compute distance between new datasets and new datasets command_distance_newVsNew = simkaMinCoreBin + " distance " command_distance_newVsNew += " -in1 " + sketchFilename_new command_distance_newVsNew += " -in2 " + sketchFilename_new command_distance_newVsNew += " -out " + distanceDir_newVsNew command_distance_newVsNew += " -nb-cores " + args.nb_cores #Update existing distance matrix command_distanceMatrix_update = simkaMinCoreBin + " matrix-update " command_distanceMatrix_update += " -in " + distanceOutputDir command_distanceMatrix_update += " -in1 " + sketchFilename_existing command_distanceMatrix_update += " -in2 " + sketchFilename_new #Append new sketch to existing sketch command_sketch_append = simkaMinCoreBin + " append " command_sketch_append += " -in1 " + sketchFilename_existing command_sketch_append += " -in2 " + sketchFilename_new exportCommand = simkaMinCoreBin + " export " exportCommand += " -in " + distanceOutputDir exportCommand += " -in1 " + sketchFilename_existing exportCommand += " -in2 " + sketchFilename_existing #exportCommand += " -in-ids " + distanceOutputDir #not applicable here exportCommand += " -out " + args.input_existingResults print("\n\n#-----------------------------") print("# Sketching new datasets") print("#-----------------------------\n") ret = os.system(command_sketchNewDatasets) if ret != 0: print("ERROR"); exit(1) print("\n\n#-----------------------------") print("# Computing distances between existing datasets and new datasets") print("#-----------------------------\n") ret = os.system(command_distance_existingVsNew) if ret != 0: print("ERROR"); exit(1) ######################## #exportCommand = args.bin + " export " #exportCommand += " -in " + distanceDir_existingVsNew #exportCommand += " -in1 " + sketchFilename_existing #exportCommand += " -in2 " + sketchFilename_new #exportCommand += " -in-ids " + distanceOutputDir #not applicable here #exportCommand += " -out " + distanceDir_existingVsNew #os.system(exportCommand) #os.system("gzip -cd "+ distanceDir_existingVsNew +"/mat_abundance_braycurtis.csv.gz") ######################## print("\n\n#-----------------------------") print("# Computing distances between new datasets") print("#-----------------------------\n") ret = os.system(command_distance_newVsNew) if ret != 0: print("ERROR"); exit(1) print("\n\n#-----------------------------") print("# Update existing distance matrices") print("#-----------------------------\n") ret = os.system(command_distanceMatrix_update) if ret != 0: print("ERROR"); exit(1) print("\n\n#-----------------------------") print("# Append new sketch to existing sketch") print("#-----------------------------\n") ret = os.system(command_sketch_append) if ret != 0: print("ERROR"); exit(1) print("\n\n#-----------------------------") print("# Exporting distances") print("#-----------------------------\n") ret = os.system(exportCommand) if ret != 0: print("ERROR"); exit(1) #Clear temp dir shutil.rmtree(distanceDir_existingVsNew) shutil.rmtree(distanceDir_newVsNew) os.remove(sketchFilename_new) print("\n\n") print("Result dir: " + existingDir) simka-1.5.3/simkaMin/simkaMin_utils.py000077500000000000000000000123521377312000000177350ustar00rootroot00000000000000#***************************************************************************** # SimkaMin: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets # A tool from the GATB (Genome Assembly Tool Box) # Copyright (C) 2019 INRIA # Authors: 
G.Benoit, C.Lemaitre, P.Peterlongo # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . #***************************************************************************** import argparse, struct, time, datetime, sys, os, subprocess def is_executable(bin): try: subprocess.call([bin, "-h"],stdout=open(os.devnull, 'wb'), stderr=open(os.devnull, 'wb')) except OSError as e: return(0) return(1) #------------------------------------------------------------------------------------------------------------- # ProgressBar #------------------------------------------------------------------------------------------------------------- class ProgressBar(): def __init__(self, text, max): self.text = text self.max = max self.progress = 0 self.start_time = 0 def start(self): self.progress = 0 self.start_time = time.time() self.display() def step(self, value): self.progress += value self.display() def display(self): progress_percent = float(self.progress) / float(self.max) * 100 duration = int(time.time() - self.start_time) duration_str = str(datetime.timedelta(seconds=duration)) #--- sys.stdout.write('\r') sys.stdout.write("[" + str(round(progress_percent, 1)) + "%] " + self.text + " [Time: " + duration_str + "]") if self.progress == self.max: sys.stdout.write("\n") sys.stdout.flush() #------------------------------------------------------------------------------------------------------------- # ArgumentFormatterSimka #------------------------------------------------------------------------------------------------------------- class SimkaParser(argparse.ArgumentParser): def error(self, message): print("") sys.stderr.write('error: %s\n' % message) print("") self.print_help() sys.exit(2) class ArgumentFormatterSimka(argparse.HelpFormatter): #def _fill_text(self, text, width, indent): # return ''.join([indent + line for line in text.splitlines(True)]) def _split_lines(self, text, width): return text.splitlines() #remove default args layout def _format_args(self, action, default_metavar): result = "" return result #Remove "usage: ..." 
header def _format_usage(self, usage, actions, groups, prefix): return "" #Changed layout of each item def _get_help_string(self, action): text = "" if type(action) == argparse._StoreAction: text = "(1 arg) : " + action.help elif type(action) == argparse._StoreTrueAction: text = "(0 arg) : " + action.help if type(action) == argparse._StoreAction and action.default != None: text += " [Default: " + str(action.default) + "]" #print type(action), action #print action #return "-5-" #return action.help if text != "": return text return "__none__" #Hack for removing useless "optional arguments:" section def _join_parts(self, part_strings): #print part_strings return ''.join([part for part in part_strings if part and part is not argparse.SUPPRESS and not "optional arguments:" in part and not "__none__" in part and not "--help" in part]) #------------------------------------------------------------------------------------------------------------- # Sketch reader #------------------------------------------------------------------------------------------------------------- def read_sketch_header(sketchFilename): f = open(sketchFilename, mode='rb') kmerSize = struct.unpack("B", f.read(1))[0] #B = unsigned char sketchSize = struct.unpack("I", f.read(4))[0] #I = unsigned int seed = struct.unpack("I", f.read(4))[0] #I = unsigned int nbDatasets = struct.unpack("I", f.read(4))[0] #I = unsigned int f.close() #u_int8_t kmerSize_; #file.read((char*)(&kmerSize_), sizeof(kmerSize_)); #u_int32_t sketchSize_; #file.read((char*)(&sketchSize_), sizeof(sketchSize_)); #u_int32_t seed_; #file.read((char*)(&seed_), sizeof(seed_)); #u_int32_t nbDatasets_; #file.read((char*)(&nbDatasets_), sizeof(nbDatasets_)); return {"kmerSize": kmerSize, "sketchSize": sketchSize, "seed": seed, "nbDatasets": nbDatasets} simka-1.5.3/sonar-project.properties000066400000000000000000000014531377312000000175240ustar00rootroot00000000000000sonar.links.homepage=https://gitlab.inria.fr/GATB/simka sonar.links.scm=https://gitlab.inria.fr/GATB/simka.git sonar.projectDescription=Simka project sonar.host.url=https://sonarqube.inria.fr/sonarqube sonar.projectKey=genscale:gatb:tools:simka:gitlab:master sonar.sources=src sonar.exclusions=thirdparty/** sonar.language=c++ sonar.cxx.includeDirectories=$(echo | gcc -E -Wp,-v - 2>&1 | grep "^ " | tr '\n' ',') sonar.cxx.gcc.reportPath=simka-scan-build.log sonar.cxx.gcc.regex=(?.*):(?[0-9]+):[0-9]+:\\x20warning:\\x20(?.*)\\x20\\[(?.*)\\] sonar.cxx.clangtidy.reportPath=simka-clang-tidy-report.log sonar.cxx.clangsa.reportPath=analyzer_reports/*/*.plist sonar.cxx.cppcheck.reportPath=simka-cppcheck.xml sonar.cxx.rats.reportPath=simka-rats.xml sonar.cxx.coverage.reportPath=gcov.xml simka-1.5.3/src/000077500000000000000000000000001377312000000134045ustar00rootroot00000000000000simka-1.5.3/src/SimkaCount.cpp000077500000000000000000000331121377312000000161700ustar00rootroot00000000000000/***************************************************************************** * Simka: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2015 INRIA * Authors: G.Benoit, C.Lemaitre, P.Peterlongo * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . *****************************************************************************/ #include "SimkaPotara.hpp" #include "minikc/MiniKC.hpp" //#include // We use the required packages using namespace std; //#define NB_COUNT_CACHE 1 //#define TRACK_DISK_USAGE /* template class SimkaPotaraBankFiltered : public BankDelegate { public: Iterator* _it; SimkaPotaraBankFiltered (IBank* ref, const Filter& filter, u_int64_t maxReads, size_t nbDatasets) : BankDelegate (ref), _filter(filter) { //_nbReadsPerDataset = nbReadsPerDataset; _maxReads = maxReads; _nbDatasets = nbDatasets; } ~SimkaPotaraBankFiltered(){ delete _it; } Iterator* iterator () { _it = _ref->iterator (); //std::vector*> iterators = it->getComposition(); return new SimkaInputIterator (_it, _nbDatasets, _maxReads, _filter); //return filterIt; } private: //vector _nbReadsPerDataset; u_int64_t _maxReads; Filter _filter; u_int64_t _nbReadToProcess; size_t _datasetId; size_t _nbDatasets; }; */ class SimkaCount : public Tool { public: SimkaCount () : Tool ("SimkaCount") { //getParser()->push_front (new OptionOneParam (STR_URI_OUTPUT, "output file", true)); //getParser()->push_back (new OptionOneParam (STR_ID, "dataset id", true)); //getParser()->push_back (new OptionOneParam (STR_KMER_SIZE, "kmer size", true)); getParser()->push_back (new OptionOneParam ("-out-tmp-simka", "tmp output", true)); getParser()->push_back (new OptionOneParam ("-bank-name", "bank name", true)); getParser()->push_back (new OptionOneParam ("-bank-index", "bank name", true)); getParser()->push_back (new OptionOneParam (STR_SIMKA_MIN_READ_SIZE, "bank name", true)); getParser()->push_back (new OptionOneParam (STR_SIMKA_MIN_READ_SHANNON_INDEX, "bank name", true)); getParser()->push_back (new OptionOneParam (STR_SIMKA_MAX_READS, "bank name", true)); getParser()->push_back (new OptionOneParam ("-nb-datasets", "bank name", true)); getParser()->push_back (new OptionOneParam ("-nb-partitions", "bank name", true)); //getParser()->push_back (new OptionOneParam ("-nb-cores", "bank name", true)); //getParser()->push_back (new OptionOneParam ("-max-memory", "bank name", true)); getParser()->push_back (SortingCountAlgorithm<>::getOptionsParser(), 1); if (Option* p = dynamic_cast (getParser()->getParser(STR_KMER_ABUNDANCE_MIN))) { p->setDefaultValue ("0"); } } void execute () { //size_t datasetId = getInput()->getInt(STR_ID); size_t kmerSize = getInput()->getInt(STR_KMER_SIZE); //cout << kmerSize << endl; string outputDir = getInput()->getStr("-out-tmp-simka"); string bankName = getInput()->getStr("-bank-name"); size_t bankIndex = getInput()->getInt("-bank-index"); size_t minReadSize = getInput()->getInt(STR_SIMKA_MIN_READ_SIZE); double minReadShannonIndex = getInput()->getDouble(STR_SIMKA_MIN_READ_SHANNON_INDEX); u_int64_t maxReads = getInput()->getInt(STR_SIMKA_MAX_READS); size_t nbDatasets = getInput()->getInt("-nb-datasets"); size_t nbPartitions = getInput()->getInt("-nb-partitions"); CountNumber abundanceMin = getInput()->getInt(STR_KMER_ABUNDANCE_MIN); CountNumber abundanceMax = getInput()->getInt(STR_KMER_ABUNDANCE_MAX); Parameter params(*this, kmerSize, outputDir, bankName, minReadSize, minReadShannonIndex, maxReads, 
nbDatasets, nbPartitions, abundanceMin, abundanceMax, bankIndex); Integer::apply (kmerSize, params); //SimkaBankId* bank = new SimkaBankId(_banks, i); //cout << config._nb_partitions << endl; //KmerCountCompressor* kmerCountCompressor = new KmerCountCompressor(outputDir, config._nb_partitions, 1); //SimkaCompProcessor* processor = new SimkaCompProcessor(kmerCountCompressor); //vector*> procs; //procs.push_back(processor); //algo.addProcessor(processor); //algo.execute(); //delete kmerCountCompressor; //itBanks[i]-> // We get a handle on the HDF5 storage object. // Note that we use an auto pointer since the StorageFactory dynamically allocates an instance //Storage* storage = StorageFactory(DSK::getStorageMode()).load (getInput()->getStr(STR_URI_FILE)); //LOCAL (storage); //string kmerSizeStr = storage->getGroup("params").getProperty ("kmer_size"); //if (kmerSizeStr.empty()) { throw Exception ("unable to get the kmer size"); } //size_t kmerSize = atoi (kmerSizeStr.c_str()); } struct Parameter { Parameter (SimkaCount& tool, size_t kmerSize, string outputDir, string bankName, size_t minReadSize, double minReadShannonIndex, u_int64_t maxReads, size_t nbDatasets, size_t nbPartitions, CountNumber abundanceMin, CountNumber abundanceMax, size_t bankIndex) : tool(tool), kmerSize(kmerSize), outputDir(outputDir), bankName(bankName), minReadSize(minReadSize), minReadShannonIndex(minReadShannonIndex), maxReads(maxReads), nbDatasets(nbDatasets), nbPartitions(nbPartitions), abundanceMin(abundanceMin), abundanceMax(abundanceMax), bankIndex(bankIndex) {} SimkaCount& tool; //size_t datasetId; size_t kmerSize; string outputDir; string bankName; size_t minReadSize; double minReadShannonIndex; u_int64_t maxReads; size_t nbDatasets; size_t nbPartitions; CountNumber abundanceMin; CountNumber abundanceMax; size_t bankIndex; }; template struct Functor { typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; typedef typename SimkaCompressedProcessor::Kmer_BankId_Count Kmer_BankId_Count; void operator () (Parameter p){ IProperties* props = p.tool.getInput(); vector outInfo; IBank* bank = Bank::open(p.outputDir + "/input/" + p.bankName); LOCAL(bank); /* u_int64_t nbSeqs = 1; IBank* sampleBank = new SimkaBankSample(bank, nbSeqs); SortingCountAlgorithm sortingCount (sampleBank, props); SimkaNullProcessor* proc = new SimkaNullProcessor(); sortingCount.addProcessor (proc); sortingCount.execute(); Configuration config = sortingCount.getConfig(); //_nbPartitions = _maxJobMerge; config._nb_partitions = p.nbPartitions; uint64_t memoryUsageCachedItems; config._nb_cached_items_per_core_per_part = 1 << 8; // cache at least 256 items (128 here, then * 2 in the next while loop) do { config._nb_cached_items_per_core_per_part *= 2; memoryUsageCachedItems = 1LL * config._nb_cached_items_per_core_per_part *config._nb_partitions * config._nbCores * sizeof(Type); } while (memoryUsageCachedItems < config._max_memory * MBYTE / 10); */ vector nbKmerPerParts(p.nbPartitions, 0); vector nbDistinctKmerPerParts(p.nbPartitions, 0); vector chordNiPerParts(p.nbPartitions, 0); Configuration config; { Repartitor* repartitor = new Repartitor(); LOCAL(repartitor); { Storage* storage = StorageFactory(STORAGE_HDF5).load (p.outputDir + "/" + "config.h5"); LOCAL (storage); config.load(storage->getGroup("")); repartitor->load(storage->getGroup("")); } //config._abundanceUserNb = 1; //config._abundance.clear(); //CountRange range(props->getInt(STR_KMER_ABUNDANCE_MIN), 100000); //config._abundance.push_back(range); /* vector cacheIndexes; 
cacheIndexes.resize(p.nbPartitions); vector > caches; caches.resize(p.nbPartitions); for(size_t i=0; i* > bags; vector* > cachedBags; for(size_t i=0; i* bag = new BagGzFile(outputFilename); Bag* cachedBag = new BagCache(bag, 10000); cachedBags.push_back(cachedBag); //BagCache bagCache(*bag, 10000); bags.push_back(bag); } string tempDir = p.outputDir + "/temp/" + p.bankName; System::file().mkdir(tempDir, -1); //cout << i << endl; //string outputDir = p.outputDir + "/comp_part" + to_string(p.datasetId) + "/"; //cout << "\tinput: " << p.outputDir + "/input/" + p.bankName << endl; SimkaSequenceFilter sequenceFilter(p.minReadSize, p.minReadShannonIndex); IBank* filteredBank = new SimkaPotaraBankFiltered(bank, sequenceFilter, p.maxReads, p.nbDatasets); // = new SimkaPotaraBankFiltered(bank) LOCAL(filteredBank); //LOCAL(bank); //Storage* solidStorage = 0: //string solidsName = p.outputDir + "/solid/" + p.bankName + ".h5"; //bool autoDelete = false; // (solidsName == "none") || (solidsName == "null"); //solidStorage = StorageFactory(STORAGE_HDF5).create (solidsName, true, autoDelete); //LOCAL(solidStorage); SimkaCompressedProcessor* proc = new SimkaCompressedProcessor(cachedBags, nbKmerPerParts, nbDistinctKmerPerParts, chordNiPerParts, p.abundanceMin, p.abundanceMax, p.bankIndex); u_int64_t nbReads = 0; if(p.kmerSize <= 15){ MiniKC miniKc(p.tool.getInput(), p.kmerSize, filteredBank, *repartitor, proc); miniKc.execute(); nbReads = miniKc._nbReads; } else{ //SimkaCompressedProcessor* proc = new SimkaCompressedProcessor(bags, caches, cacheIndexes, p.abundanceMin, p.abundanceMax); std::vector* > procs; procs.push_back(proc); SortingCountAlgorithm algo (filteredBank, config, repartitor, procs, props); algo.execute(); nbReads = algo.getInfo()->getInt("seq_number"); } u_int64_t nbDistinctKmers = 0; u_int64_t nbKmers = 0; u_int64_t chord_N2 = 0; for(size_t i=0; iflush(); //cachedBags[i]->flush(); delete cachedBags[i]; //delete bags[i]; } //delete proc; } string contents = ""; for(size_t i=0; ifwrite(contents.c_str(), contents.size(), 1); nbKmerPerPartFile->flush(); delete nbKmerPerPartFile; //cout << "heo" << endl; //delete config; //cout << "heo" << endl; writeFinishSignal(p, outInfo); //cout << "heo" << endl; } void writeFinishSignal(Parameter& p, const vector& outInfo){ string finishFilename = p.outputDir + "/count_synchro/" + p.bankName + ".ok"; IFile* file = System::file().newFile(finishFilename, "w"); string contents = ""; for(size_t i=0; ifwrite(contents.c_str(), contents.size(), 1); file->flush(); delete file; } }; }; /********************************************************************************/ /* Dump solid kmers in ASCII format */ /********************************************************************************/ int main (int argc, char* argv[]) { try { SimkaCount().run (argc, argv); } catch (Exception& e) { std::cout << "EXCEPTION: " << e.getMessage() << std::endl; return EXIT_FAILURE; } } //! 
[snippet1] simka-1.5.3/src/SimkaMerge.cpp000077500000000000000000001237311377312000000161460ustar00rootroot00000000000000/***************************************************************************** * Simka: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2015 INRIA * Authors: G.Benoit, C.Lemaitre, P.Peterlongo * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . *****************************************************************************/ #include #include #include // We use the required packages using namespace std; using namespace gatb::core::system; using namespace gatb::core::system::impl; #define MERGE_BUFFER_SIZE 1000 #define SIMKA_MERGE_MAX_FILE_USED 200 struct sortItem_Size_Filename_ID{ u_int64_t _size; size_t _datasetID; sortItem_Size_Filename_ID(){} sortItem_Size_Filename_ID(u_int64_t size, size_t datasetID){ _size = size; _datasetID = datasetID; } }; bool sortFileBySize (sortItem_Size_Filename_ID i, sortItem_Size_Filename_ID j){ return ( i._size < j._size ); } u_int64_t getFileSize(const string& filename){ std::ifstream in(filename.c_str(), std::ifstream::ate | std::ifstream::binary); u_int64_t size = in.tellg(); in.close(); return size; } template class DistanceCommand : public gatb::core::tools::dp::ICommand //, public gatb::core::system::SmartPointer { public: /** Shortcut. */ typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; size_t _bufferIndex; size_t _partitionId; SimkaStatistics* _stats; SimkaCountProcessorSimple* _processor; vector _bufferKmers; vector _bufferCounts; /** Constructor. 
*/ DistanceCommand ( const string& tmpDir, const vector& datasetIds, size_t partitionId, size_t nbBanks, bool computeSimpleDistances, bool computeComplexDistances, size_t kmerSize, pair& abundanceThreshold, float minShannonIndex ) { _partitionId = partitionId; _stats = new SimkaStatistics(nbBanks, computeSimpleDistances, computeComplexDistances, tmpDir, datasetIds); _processor = new SimkaCountProcessorSimple (_stats, nbBanks, kmerSize, abundanceThreshold, SUM, false, minShannonIndex); _bufferKmers.resize(MERGE_BUFFER_SIZE); _bufferCounts.resize(MERGE_BUFFER_SIZE); _bufferIndex = 0; } ~DistanceCommand(){ delete _processor; delete _stats; } //void add(Type& kmer, CountVector& counts){ // _bufferIndex += //} void setup(size_t bufferIndex, vector& bufferKmers, vector& bufferCounts){ //cout << "hey " << bufferIndex << endl; _bufferIndex = bufferIndex; for(size_t i=0; i<_bufferIndex; i++){ _bufferKmers[i] = bufferKmers[i]; _bufferCounts[i] = bufferCounts[i]; } } void execute (){ for(size_t i=0; i<_bufferIndex; i++){ _processor->process(_partitionId, _bufferKmers[i], _bufferCounts[i]); } } void use () {} void forget () {} }; struct Parameter { Parameter (IProperties* props, string inputFilename, string outputDir, size_t partitionId, size_t kmerSize, double minShannonIndex, bool computeSimpleDistances, bool computeComplexDistances, size_t nbCores) : props(props), inputFilename(inputFilename), outputDir(outputDir), partitionId(partitionId), kmerSize(kmerSize), minShannonIndex(minShannonIndex), computeSimpleDistances(computeSimpleDistances), computeComplexDistances(computeComplexDistances), nbCores(nbCores) {} IProperties* props; string inputFilename; string outputDir; size_t partitionId; size_t kmerSize; double minShannonIndex; bool computeSimpleDistances; bool computeComplexDistances; size_t nbCores; }; template class StorageIt { public: typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; struct Kmer_BankId_Count{ Type _type; u_int32_t _bankId; u_int64_t _count; Kmer_BankId_Count(){ } Kmer_BankId_Count(Type type, u_int64_t bankId, u_int64_t count){ _type = type; _bankId = bankId; _count = count; } }; //typedef tuple Kmer_BankId_Count; //typedef typename Kmer::ModelCanonical ModelCanonical; //typedef typename ModelCanonical::Kmer KmerType; StorageIt(Iterator* it, size_t bankId, size_t partitionId){ _it = it; //cout << h5filename << endl; _bankId = bankId; _partitionId = partitionId; //Iterator* it2 = partition1.iterator(); //Collection& kmers1 = (*partition1)[_partitionId]; //collections.push_back(&kmers1); //_it = kmers1.iterator(); //_nbKmers = it->estimateNbItems(); //it2->first(); //while(!it2->isDone()){ // cout << it2->item().value.toString(31) << endl; // it2->next(); //} } ~StorageIt(){ delete _it; } //void setPartitionId(size_t partitionId){ // _partitionId = partitionId; //} bool next(){ _it->next(); //cout << "is done?" << _it->isDone() << endl; return !_it->isDone(); } Type& value(){ return _it->item()._type; } u_int16_t getBankId(){ return _it->item()._bankId; } u_int64_t& abundance(){ return _it->item()._count; } //u_int64_t getNbKmers(){ // return _nbKmers; //} u_int16_t _bankId; u_int16_t _partitionId; Iterator* _it; //u_int64_t _nbKmers; }; class SimkaCounterBuilderMerge { public: /** Constructor. * \param[in] nbBanks : number of banks parsed during kmer counting. */ SimkaCounterBuilderMerge (CountVector& abundancePerBank) : _abundancePerBank(abundancePerBank) {} /** Get the number of banks. * \return the number of banks. 
*/ size_t size() const { return _abundancePerBank.size(); } /** Initialization of the counting for the current kmer. This method should be called * when a kmer is seen for the first time. * \param[in] idxBank : bank index where the new current kmer has been found. */ void init (size_t idxBank, CountNumber abundance) { for (size_t k=0; k<_abundancePerBank.size(); k++) { _abundancePerBank[k]=0; } _abundancePerBank [idxBank]= abundance; } /** Increase the abundance of the current kmer for the provided bank index. * \param[in] idxBank : index of the bank */ void increase (size_t idxBank, CountNumber abundance) { _abundancePerBank [idxBank] += abundance; } /** Set the abundance of the current kmer for the provided bank index. * \param[in] idxBank : index of the bank */ //void set (CountNumber val, size_t idxBank=0) { _abundancePerBank [idxBank] = val; } /** Get the abundance of the current kmer for the provided bank index. * \param[in] idxBank : index of the bank * \return the abundance of the current kmer for the given bank. */ //CountNumber operator[] (size_t idxBank) const { return _abundancePerBank[idxBank]; } /** */ //const CountVector& get () const { return _abundancePerBank; } void print(const string& kmer){ cout << kmer << ": "; for(size_t i=0; i class MergeCommand : public gatb::core::tools::dp::ICommand //, public gatb::core::system::SmartPointer { public: void use () {} void forget () {} typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; typedef std::pair kxp; //id pointer in vec_pointer , value struct kxpcomp { bool operator() (kxp l,kxp r) { return ((r.second) < (l.second)); } } ; size_t _currentBuffer; u_int64_t _progressStep; vector > _bufferKmers; vector > _bufferCounts; vector _bufferIndex; u_int64_t _nbDistinctKmers; u_int64_t _nbSharedDistinctKmers; MergeCommand ( size_t partitionId, size_t nbBanks, IteratorListener* progress, vector*>& its, u_int64_t progressStep, size_t nbCores, bool computeComplexDistances ) : its(its) { _nbBanks = nbBanks; _partitionId = partitionId; _progress = progress; _progressStep = progressStep; _nbCores = nbCores; _computeComplexDistances = computeComplexDistances; _nbDistinctKmers = 0; _nbSharedDistinctKmers = 0; init(); } ~MergeCommand(){ delete solidCounter; } //void add(Type& kmer, CountVector& counts){ // _bufferIndex += //} //void setup(vector& bufferKmers, vector& bufferCounts){ // _bufferKmers = bufferKmers; // _bufferCounts = bufferCounts; //} size_t _nbCores; size_t _partitionId; size_t _nbBanks; vector*>& its; std::priority_queue< kxp, vector,kxpcomp > pq; u_int64_t nbKmersProcessed; IteratorListener* _progress; bool _computeComplexDistances; u_int16_t best_p; Type previous_kmer; CountVector abundancePerBank; size_t nbBankThatHaveKmer; SimkaCounterBuilderMerge* solidCounter; bool _isDone; void init(){ _isDone = false; solidCounter = new SimkaCounterBuilderMerge(abundancePerBank); for(size_t i=0; i<_nbCores; i++){ vector vec = vector(MERGE_BUFFER_SIZE); _bufferKmers.push_back(vec); vector vec2 = vector(MERGE_BUFFER_SIZE); _bufferCounts.push_back(vec2); _bufferIndex.push_back(0); } nbBankThatHaveKmer = 0; abundancePerBank.resize(_nbBanks, 0); _currentBuffer = 0; //_bufferIndex = 0; //_bufferSize = 1000; nbKmersProcessed = 0; //vector*> partitions; //vector*> collections; //vector*> its; //vector storages; //size_t nbPartitions; for(size_t i=0; i<_nbBanks; i++){ StorageIt* it = its[i]; it->_it->first(); //partitionIts[i]->first(); //while(!it->_it->isDone()){ // it->_it->next(); // cout << 
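// ---------------------------------------------------------------------------
// Sketch of the k-way merge pattern used by the merge code in this file:
// each bank provides a sorted kmer stream, and a min-heap keyed on the
// current head of each stream always exposes the globally smallest kmer.
// Standalone illustration with plain ints standing in for kmers; the names
// below are hypothetical and do not belong to Simka's real API.
#include <queue>
#include <utility>
#include <vector>
#include <functional>

static void kwayMergeSketch (const std::vector<std::vector<int> >& streams)
{
    typedef std::pair<int, size_t> Head;   // (current kmer, stream index)
    std::priority_queue<Head, std::vector<Head>, std::greater<Head> > heap;
    std::vector<size_t> pos (streams.size(), 0);

    // seed the heap with the first kmer of every non-empty stream
    for (size_t s = 0; s < streams.size(); ++s)
        if (! streams[s].empty())  heap.push (Head (streams[s][0], s));

    while (! heap.empty())
    {
        Head h = heap.top(); heap.pop();   // globally smallest head kmer
        // ... accumulate the per-bank count of h.first here ...
        if (++pos[h.second] < streams[h.second].size())
            heap.push (Head (streams[h.second][pos[h.second]], h.second));
    }
}
// ---------------------------------------------------------------------------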
it->_it->item().value.toString(_kmerSize) << " " << it->_it->item().abundance << endl; //} } //fill the priority queue with the first elems for (size_t ii=0; ii<_nbBanks; ii++) { //if(its[ii]->next()) { pq.push(kxp(ii,its[ii]->value())); } pq.push(kxp(ii,its[ii]->value())); } if (pq.size() != 0) // everything empty, no kmer at all { //get first pointer best_p = pq.top().first ; pq.pop(); previous_kmer = its[best_p]->value(); solidCounter->init (its[best_p]->getBankId(), its[best_p]->abundance()); nbBankThatHaveKmer = 1; } } void reset(){ for(size_t i=0; i<_bufferIndex.size(); i++){ _bufferIndex[i] = 0; } } void execute (){ //cout << "lala " << pq.size() << endl; //merge-scan all 'virtual' arrays and output counts while (_currentBuffer < _nbCores) { //cout << _currentBuffer << endl; //go forward in this array or in new array of reaches end of this one if (! its[best_p]->next()) { //reaches end of one array if(pq.size() == 0){ _isDone = true; break; } //otherwise get new best best_p = pq.top().first ; pq.pop(); } if (its[best_p]->value() != previous_kmer ) { //if diff, changes to new array, get new min pointer pq.push(kxp(best_p,its[best_p]->value())); //push new val of this pointer in pq, will be counted later best_p = pq.top().first ; pq.pop(); //if new best is diff, this is the end of this kmer if(its[best_p]->value()!=previous_kmer ) { nbKmersProcessed += nbBankThatHaveKmer; if(nbKmersProcessed > _progressStep){ //cout << "queue size: " << pq.size() << endl; //cout << nbKmersProcessed << endl; _progress->inc(nbKmersProcessed); nbKmersProcessed = 0; } //cout << previous_kmer.toString(p.kmerSize) << endl; //for(size_t i=0; i 1) // _processor->process (_partitionId, previous_kmer, abundancePerBank); //this->insert (previous_kmer, solidCounter); solidCounter->init (its[best_p]->getBankId(), its[best_p]->abundance()); nbBankThatHaveKmer = 1; previous_kmer = its[best_p]->value(); } else { solidCounter->increase (its[best_p]->getBankId(), its[best_p]->abundance()); nbBankThatHaveKmer += 1; } } else { solidCounter->increase (its[best_p]->getBankId(), its[best_p]->abundance()); nbBankThatHaveKmer += 1; } } if(_isDone){ insert(previous_kmer, abundancePerBank, nbBankThatHaveKmer); } else{ } _currentBuffer = 0; //_bufferIndex = 0; //cout << nbBankThatHaveKmer << endl; //cout << previous_kmer.toString(p.kmerSize) << endl; //for(size_t i=0; iinsert (previous_kmer, solidCounter); // } //cout << "end " << endl; } void insert(const Type& kmer, const CountVector& counts, size_t nbBankThatHaveKmer){ _nbDistinctKmers += 1; if(_computeComplexDistances || nbBankThatHaveKmer > 1){ if(nbBankThatHaveKmer > 1){ _nbSharedDistinctKmers += 1; } //DistanceCommand* cmd = dynamic_cast*>(_cmds[_currentBuffer]); //cmd->_bufferKmers[cmd->_bufferIndex] = kmer; //cmd->_bufferCounts[cmd->_bufferIndex] = counts; _bufferKmers[_currentBuffer][_bufferIndex[_currentBuffer]] = kmer; _bufferCounts[_currentBuffer][_bufferIndex[_currentBuffer]] = counts; _bufferIndex[_currentBuffer] += 1; if(_bufferIndex[_currentBuffer] >= MERGE_BUFFER_SIZE){ //DistanceCommand* cmd = dynamic_cast*>(_cmds[_currentBuffer]); //cmd->setup(_bufferKmers[_currentBuffer], _bufferCounts[_currentBuffer]); _currentBuffer += 1; if(_currentBuffer >= _nbCores){ //dispatch(); } else{ //_bufferIndex = 0; } } //_processor->process (_partitionId, kmer, counts); } //_processor->process (_partitionId, kmer, counts); //cout <<_partitiontId << " "<< kmer.toString(31) << endl; //_processor->process (_partitionId, kmer, counter.get()); } }; */ template class 
DiskBasedMergeSort { public: typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; //typedef tuple Kmer_BankId_Count; //typedef tuple*> kxp; typedef typename StorageIt::Kmer_BankId_Count Kmer_BankId_Count; struct kxp{ Type _type; u_int32_t _bankId; u_int64_t _count; StorageIt* _it; kxp(){ } kxp(Type type, u_int64_t bankId, u_int64_t count, StorageIt* it){ _type = type; _bankId = bankId; _count = count; _it = it; } }; struct kxpcomp { bool operator() (kxp& l, kxp& r) { return (r._type < l._type); } } ; string _outputDir; string _outputFilename; vector& _datasetIds; size_t _partitionId; Bag* _outputGzFile; Bag* _cachedBag; DiskBasedMergeSort(size_t mergeId, const string& outputDir, vector& datasetIds, size_t partitionId): _datasetIds(datasetIds) { _outputDir = outputDir; _partitionId = partitionId; _outputFilename = _outputDir + "/solid/part_" + Stringify::format("%i", partitionId) + "/__p__" + Stringify::format("%i", mergeId) + ".gz.temp"; _outputGzFile = new BagGzFile(_outputFilename); _cachedBag = new BagCache(_outputGzFile, 10000); } ~DiskBasedMergeSort(){ } void execute(){ vector* > partitions; vector*> its; size_t _nbBanks = _datasetIds.size(); for(size_t i=0; i<_nbBanks; i++){ //cout << _datasetIds[i] << endl; string filename = _outputDir + "/solid/part_" + Stringify::format("%i", _partitionId) + "/__p__" + Stringify::format("%i", _datasetIds[i]) + ".gz"; //cout << "\t\t" << filename << endl; IterableGzFile* partition = new IterableGzFile(filename, 10000); partitions.push_back(partition); its.push_back(new StorageIt(partition->iterator(), i, _partitionId)); //nbKmers += partition->estimateNbItems(); //size_t currentPart = 0; //ifstream file((_outputDir + "/kmercount_per_partition/" + _datasetIds[i] + ".txt").c_str()); //while(getline(file, line)){ // if(line == "") continue; // if(currentPart == _partitionId){ // //cout << stoull(line) << endl; // nbKmers += strtoull(line.c_str(), NULL, 10); // break; // } // currentPart += 1; //} //file.close(); } //u_int64_t progressStep = nbKmers / 1000; //_progress = new ProgressSynchro ( // createIteratorListener (nbKmers, "Merging kmers"), // System::thread().newSynchronizer()); //_progress->init (); //_nbDistinctKmers = 0; //_nbSharedDistinctKmers = 0; //u_int64_t nbKmersProcessed = 0; //size_t nbBankThatHaveKmer = 0; //u_int16_t best_p = 0; Type previous_kmer; //CountVector abundancePerBank; //abundancePerBank.resize(_nbBanks, 0); //SimkaCounterBuilderMerge* solidCounter = new SimkaCounterBuilderMerge(abundancePerBank);; std::priority_queue< kxp, vector,kxpcomp > pq; StorageIt* bestIt; for(size_t i=0; i<_nbBanks; i++){ StorageIt* it = its[i]; it->_it->first(); } //fill the priority queue with the first elems for (size_t ii=0; ii<_nbBanks; ii++) { //pq.push(Kmer_BankId_Count(ii,its[ii]->value())); pq.push(kxp(its[ii]->value(), its[ii]->getBankId(), its[ii]->abundance(), its[ii])); } if (pq.size() != 0) // everything empty, no kmer at all { //get first pointer bestIt = pq.top()._it; pq.pop(); _cachedBag->insert(Kmer_BankId_Count(bestIt->value(), bestIt->getBankId(), bestIt->abundance())); //best_p = get<1>(pq.top()) ; pq.pop(); //previous_kmer = bestIt->value(); //solidCounter->init (bestIt->getBankId(), bestIt->abundance()); //nbBankThatHaveKmer = 1; while(1){ if (! 
bestIt->next()) { //reaches end of one array if(pq.size() == 0){ break; } //otherwise get new best //best_p = get<1>(pq.top()) ; pq.pop(); bestIt = pq.top()._it; pq.pop(); } pq.push(kxp(bestIt->value(), bestIt->getBankId(), bestIt->abundance(), bestIt)); //push new val of this pointer in pq, will be counted later bestIt = pq.top()._it; pq.pop(); _cachedBag->insert(Kmer_BankId_Count(bestIt->value(), bestIt->getBankId(), bestIt->abundance())); //cout << bestIt->value().toString(31) << " " << bestIt->getBankId() << " "<< bestIt->abundance() << endl; //bestIt = get<3>(pq.top()); pq.pop(); //pq.push(kxp(bestIt->value(), bestIt->getBankId(), bestIt->abundance(), bestIt)); } //_outputGzFile->insert(Kmer_BankId_Count(bestIt->value(), bestIt->getBankId(), bestIt->abundance())); //cout << bestIt->value().toString(31) << " " << bestIt->getBankId() << " "<< bestIt->abundance() << endl; } for(size_t i=0; iflush(); delete _cachedBag; for(size_t i=0; i<_nbBanks; i++){ //cout << _datasetIds[i] << endl; string filename = _outputDir + "/solid/part_" + Stringify::format("%i", _partitionId) + "/__p__" + Stringify::format("%i", _datasetIds[i]) + ".gz"; System::file().remove(filename); } string newOutputFilename = _outputFilename; newOutputFilename.erase(_outputFilename.size()-5, 5); System::file().rename(_outputFilename, newOutputFilename); //remove .temp at the end of new merged file //_outputFilename = newOutputFilename; } }; template class SimkaMergeAlgorithm : public Algorithm { public: typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; //typedef tuple Kmer_BankId_Count; //typedef tuple*> kxp; typedef typename DiskBasedMergeSort::Kmer_BankId_Count Kmer_BankId_Count; typedef typename DiskBasedMergeSort::kxp kxp; /* struct Kmer_BankId_Count{ Type _type; u_int64_t _bankId; u_int64_t _count; Kmer_BankId_Count(){ } Kmer_BankId_Count(Type type, u_int64_t bankId, u_int64_t count){ _type = type; _bankId = bankId; _count = count; } }; struct kxp{ Type _type; u_int32_t _bankId; u_int64_t _count; StorageIt* _it; kxp(){ } kxp(Type type, u_int64_t bankId, u_int64_t count, StorageIt* it){ _type = type; _bankId = bankId; _count = count; _it = it; } };*/ //typedef std::pair kxp; //id pointer in vec_pointer , value //typedef std::pair kxp; //id pointer in vec_pointer , value //struct kxpcomp { bool operator() (Kmer_BankId_Count l,Kmer_BankId_Count r) { return ((r.second) < (l.second)); } } ; struct kxpcomp { bool operator() (kxp& l,kxp& r) { return (r._type < l._type); } } ; Parameter& p; SimkaMergeAlgorithm(Parameter& p) : Algorithm("SimkaMergeAlgorithm", p.nbCores, p.props), p(p) { _abundanceThreshold.first = 0; _abundanceThreshold.second = 999999999; _computeSimpleDistances = p.computeSimpleDistances; _computeComplexDistances = p.computeComplexDistances; _kmerSize = p.kmerSize; _minShannonIndex = p.minShannonIndex; } ~SimkaMergeAlgorithm(){ //delete _progress; } //pthread_t statThread;_datasetNbReads /* void createInfo(Parameter& p){ } void loadCountInfo(){ for(size_t i=0; i<_nbBanks; i++){ string name = _datasetIds[i]; string countFilename = p.outputDir + "/count_synchro/" + name + ".ok"; string line; ifstream file(countFilename.c_str()); vector lines; while(getline(file, line)){ if(line == "") continue; lines.push_back(line); } file.close(); u_int64_t nbReads = strtoull(lines[0].c_str(), NULL, 10); _stats->_datasetNbReads[i] = nbReads; _stats->_nbSolidDistinctKmersPerBank[i] = strtoull(lines[1].c_str(), NULL, 10); _stats->_nbSolidKmersPerBank[i] = strtoull(lines[2].c_str(), NULL, 10); 
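// ---------------------------------------------------------------------------
// Sketch of the cascaded merging strategy applied in execute() below: when a
// partition holds more sorted part files than SIMKA_MERGE_MAX_FILE_USED, the
// smallest files are merged first into a new sorted file, which re-enters the
// pool until few enough files remain. This standalone illustration tracks
// file sizes only (the real code merge-sorts the kmer records on disk via
// DiskBasedMergeSort); maxOpenFiles > 1 is assumed.
#include <algorithm>
#include <vector>

static size_t countMergePasses (std::vector<u_int64_t> sizes, size_t maxOpenFiles)
{
    size_t nbPasses = 0;
    while (sizes.size() > maxOpenFiles)
    {
        std::sort (sizes.begin(), sizes.end());      // smallest files first
        u_int64_t mergedSize = 0;
        for (size_t i = 0; i < maxOpenFiles; ++i)  mergedSize += sizes[i];
        sizes.erase (sizes.begin(), sizes.begin() + maxOpenFiles);
        sizes.push_back (mergedSize);                // merged file re-enters the pool
        ++nbPasses;
    }
    return nbPasses;
}
// ---------------------------------------------------------------------------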
_stats->_chord_sqrt_N2[i] = sqrt(strtoull(lines[3].c_str(), NULL, 10)); //cout << _stats->_chord_sqrt_N2[i] << endl; } }*/ //struct sortFileBySize { bool operator() (sortItem_Size_Filename_ID& l,sortItem_Size_Filename_ID& r) { return (r._size < l._size); } } ; void execute(){ _nbCores = p.nbCores; removeStorage(p); _partitionId = p.partitionId; createDatasetIdList(p); _nbBanks = _datasetIds.size(); string partDir = p.outputDir + "/solid/part_" + Stringify::format("%i", _partitionId) + "/"; vector filenames = System::file().listdir(partDir); //cout << filenames.size() << endl; vector partFilenames; vector filenameSizes; for(size_t i=0; i SIMKA_MERGE_MAX_FILE_USED){ //cout << "Start merging pass" << endl; sort(filenameSizes.begin(),filenameSizes.end(),sortFileBySize); vector mergeDatasetIds; vector toRemoveItem; for(size_t i=0; i= _nbBanks) break; //cout << mergeDatasetIds[i] << endl; //cout << "First val must never be greater than second: " << i << " " << _nbBanks << endl; //cout << "\t" << get<1>(sfi) << endl; } for(size_t i=0; i diskBasedMergeSort(mergedId, p.outputDir, mergeDatasetIds, _partitionId); diskBasedMergeSort.execute(); filenameSizes.push_back(sortItem_Size_Filename_ID(getFileSize(diskBasedMergeSort._outputFilename), mergedId)); //cout << "\tmerged id: " << mergedId << endl; //cout << "\tremainging files: " << filenameSizes.size() << endl; } //cout << filenameSizes.size() << endl; //for(size_t i=0; i mergeDatasetIds; for(size_t j=0; j= _nbBanks) break; } cout << "doivent etre égaux a la dernière passe: " << _nbBanks << " " << mergeDatasetIds.size() << " " << datasetIndex << endl; DiskBasedMergeSort diskBasedMergeSort(i, p.outputDir, mergeDatasetIds, _partitionId); diskBasedMergeSort.execute(); }*/ //exit(1); /* PARALLEL for (size_t i=0; i<_nbCores; i++) { //cout << i << endl; ICommand* cmd = 0; cmd = new DistanceCommand(p.outputDir, _datasetIds, _partitionId, _nbBanks, _computeSimpleDistances, _computeComplexDistances, _kmerSize, _abundanceThreshold, _minShannonIndex); //cmd->use(); _cmds.push_back (cmd); //cout << _cmds[i] << endl; } resetCommands(); */ //SimkaDistanceParam distanceParams(p.props); //createInfo(p); //createProcessor(p); //PARALLEL line to remove _stats = new SimkaStatistics(_nbBanks, p.computeSimpleDistances, p.computeComplexDistances, p.outputDir, _datasetIds); _processor = new SimkaCountProcessorSimple (_stats, _nbBanks, p.kmerSize, _abundanceThreshold, SUM, false, p.minShannonIndex); //_processor->use(); string line; vector* > partitions; vector*> its; u_int64_t nbKmers = 0; for(size_t i=0; i* partition = new IterableGzFile(filename, 10000); partitions.push_back(partition); its.push_back(new StorageIt(partition->iterator(), i, _partitionId)); //nbKmers += partition->estimateNbItems(); size_t currentPart = 0; ifstream file((p.outputDir + "/kmercount_per_partition/" + _datasetIds[i] + ".txt").c_str()); while(getline(file, line)){ if(line == "") continue; if(currentPart == _partitionId){ //cout << stoull(line) << endl; nbKmers += strtoull(line.c_str(), NULL, 10); break; } currentPart += 1; } file.close(); } /* //vector* > partitionIts; for(size_t i=0; i<_nbBanks; i++){ string filename = p.outputDir + "/solid/" + _datasetIds[i] + "/" + "part" + Stringify::format("%i", _partitionId); //cout << filename << endl; IterableGzFile* partition = new IterableGzFile(filename, 1000); partitions.push_back(partition); its.push_back(new StorageIt(partition->iterator(), i, _partitionId)); //nbKmers += partition->estimateNbItems(); size_t currentPart = 0; ifstream 
file((p.outputDir + "/kmercount_per_partition/" + _datasetIds[i] + ".txt").c_str()); while(getline(file, line)){ if(line == "") continue; if(currentPart == _partitionId){ //cout << stoull(line) << endl; nbKmers += strtoull(line.c_str(), NULL, 10); break; } currentPart += 1; } file.close(); }*/ //u_int64_t progressStep = nbKmers / 1000; //_progress = new ProgressSynchro ( // createIteratorListener (nbKmers, "Merging kmers"), // System::thread().newSynchronizer()); //_progress->init (); /* PARALLEL _mergeCommand = new MergeCommand( _partitionId, _nbBanks, _progress, its, progressStep, _nbCores, p.computeComplexDistances); //_mergeCommand->use(); _cmds.push_back(_mergeCommand); //cout << "CMDS SIZE:" << _cmds.size() << endl; MergeCommand* mergeCmd = dynamic_cast*>(_mergeCommand); mergeCmd->execute(); while(!mergeCmd->_isDone){ //cout << mergeCmd->_isDone << endl; //mergeCmd->execute(); dispatch(); } dispatch();*/ _nbDistinctKmers = 0; _nbSharedDistinctKmers = 0; u_int64_t nbKmersProcessed = 0; size_t nbBankThatHaveKmer = 0; u_int16_t best_p = 0; Type previous_kmer; CountVector abundancePerBank; abundancePerBank.resize(_nbBanks, 0); SimkaCounterBuilderMerge* solidCounter = new SimkaCounterBuilderMerge(abundancePerBank);; std::priority_queue< kxp, vector,kxpcomp > pq; StorageIt* bestIt; for(size_t i=0; i* it = its[i]; it->_it->first(); } //fill the priority queue with the first elems for (size_t ii=0; iivalue())); pq.push(kxp(its[ii]->value(), its[ii]->getBankId(), its[ii]->abundance(), its[ii])); } if (pq.size() != 0) // everything empty, no kmer at all { //get first pointer bestIt = pq.top()._it; pq.pop(); //best_p = get<1>(pq.top()) ; pq.pop(); previous_kmer = bestIt->value(); solidCounter->init (bestIt->getBankId(), bestIt->abundance()); nbBankThatHaveKmer = 1; while(1){ if (! 
bestIt->next()) { //reaches end of one array if(pq.size() == 0){ break; } //otherwise get new best //best_p = get<1>(pq.top()) ; pq.pop(); bestIt = pq.top()._it; pq.pop(); } //cout << bestIt->value().toString(31) << " " << bestIt->getBankId() << " "<< bestIt->abundance() << endl; if (bestIt->value() != previous_kmer ) { //if diff, changes to new array, get new min pointer pq.push(kxp(bestIt->value(), bestIt->getBankId(), bestIt->abundance(), bestIt)); //push new val of this pointer in pq, will be counted later bestIt = pq.top()._it; pq.pop(); //best_p = get<1>(pq.top()) ; pq.pop(); //if new best is diff, this is the end of this kmer if(bestIt->value()!=previous_kmer ) { //nbKmersProcessed += nbBankThatHaveKmer; //if(nbKmersProcessed > progressStep){ //cout << "queue size: " << pq.size() << endl; //cout << nbKmersProcessed << endl; //_progress->inc(nbKmersProcessed); //nbKmersProcessed = 0; //} //cout << previous_kmer.toString(p.kmerSize) << endl; //for(size_t i=0; i 1) // _processor->process (_partitionId, previous_kmer, abundancePerBank); //this->insert (previous_kmer, solidCounter); solidCounter->init (bestIt->getBankId(), bestIt->abundance()); nbBankThatHaveKmer = 1; previous_kmer = bestIt->value(); } else { solidCounter->increase (bestIt->getBankId(), bestIt->abundance()); nbBankThatHaveKmer += 1; } } else { //cout << "increase" << endl; solidCounter->increase (bestIt->getBankId(), bestIt->abundance()); nbBankThatHaveKmer += 1; } } insert(previous_kmer, abundancePerBank, nbBankThatHaveKmer); } _processor->end(); //cout << "lala" << endl; for(size_t i=0; i_nbDistinctKmers, mergeCmd->_nbSharedDistinctKmers); //cout << _cmds.size() << endl; for(size_t i=0; i<_cmds.size(); i++){ //cout << _cmds[i] << endl; //_cmds[i]->forget(); delete _cmds[i]; } //_cmds.clear(); //delete _mergeCommand; */ delete solidCounter; for(size_t i=0; ifinish(); } void insert(const Type& kmer, const CountVector& counts, size_t nbBankThatHaveKmer){ //cout << kmer.toString(31) << endl; //for(size_t i=0; i_nbDistinctKmers += 1; if(_computeComplexDistances || nbBankThatHaveKmer > 1){ if(nbBankThatHaveKmer > 1){ _stats->_nbSharedKmers += 1; } _processor->process(_partitionId, kmer, counts); } } void createDatasetIdList(Parameter& p){ string datasetIdFilename = p.outputDir + "/" + "datasetIds"; IFile* inputFile = System::file().newFile(datasetIdFilename, "rb"); //IFile* bankFile = System::file().newFile(_banksInputFilename, "wb"); inputFile->seeko(0, SEEK_END); u_int64_t size = inputFile->tell(); inputFile->seeko(0, SEEK_SET); char buffer2[size]; inputFile->fread(buffer2, size, size); string fileContents(buffer2, size); string line; string linePart; vector linePartList; stringstream fileContentsStream(fileContents); //string bankFileContents = ""; //u_int64_t lineIndex = 0; while(getline(fileContentsStream, line)){ if(line == "") continue; _datasetIds.push_back(line); } //bankFileContents.erase(bankFileContents.size()-1); //bankFileContents.pop_back(); // "remove last /n //bankFile->fwrite(bankFileContents.c_str(), bankFileContents.size(), 1); delete inputFile; } void createProcessor(Parameter& p){ //ICountProcessor* proc = _processor->clone(); //proc->use(); //_processors.push_back(proc); } void resetCommands(){ for (size_t i=0; i<_nbCores; i++){ DistanceCommand* cmd = dynamic_cast*>(_cmds[i]); cmd->_bufferIndex = 0; } } /* void dispatch(){ MergeCommand* mergeCommand = dynamic_cast*>(_mergeCommand); for (size_t i=0; i<_nbCores; i++){ //cout << mergeCommand->_bufferKmers.size() << endl; //cout << i << endl; 
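// ---------------------------------------------------------------------------
// Note on this disabled "PARALLEL" path: MergeCommand fills one kmer/count
// buffer per core; dispatch() hands buffer i to DistanceCommand i via
// setup(), resets the merge buffers, and runs all commands through
// getDispatcher()->dispatchCommands(). Each DistanceCommand updates its own
// SimkaStatistics, which saveStats() sums once at the end. The live code
// path above performs the same work serially through insert()/process().
// ---------------------------------------------------------------------------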
DistanceCommand* cmd = dynamic_cast*>(_cmds[i]); cmd->setup(mergeCommand->_bufferIndex[i], mergeCommand->_bufferKmers[i], mergeCommand->_bufferCounts[i]); } //MergeCommand* mergeCommand = dynamic_cast*>(_mergeCommand); mergeCommand->reset(); //cout << "start dispatch" << endl; getDispatcher()->dispatchCommands(_cmds, 0); //cout << "end dispatch" << endl; resetCommands(); }*/ void removeStorage(Parameter& p){ //Storage* storage = 0; //storage = StorageFactory(STORAGE_HDF5).create (p.outputDir + "/stats/part_" + SimkaAlgorithm<>::toString(p.partitionId) + ".stats", true, true); //LOCAL (storage); } /* PARALLEL void saveStats(Parameter& p, const u_int64_t nbDistinctKmers, const u_int64_t nbSharedDistinctKmers){ _stats = new SimkaStatistics(_nbBanks, p.computeSimpleDistances, p.computeComplexDistances, p.outputDir, _datasetIds); for (size_t i=0; i<_nbCores; i++){ DistanceCommand* cmd = dynamic_cast*>(_cmds[i]); cmd->_processor->end(); (*_stats) += (*cmd->_stats); } //loadCountInfo(); string filename = p.outputDir + "/stats/part_" + SimkaAlgorithm<>::toString(p.partitionId) + ".gz"; _stats->_nbDistinctKmers = nbDistinctKmers; _stats->_nbSharedKmers = nbSharedDistinctKmers; _stats->save(filename); //storage->getGroup("")); delete _stats; //string filename = p.outputDir + "/stats/part_" + SimkaAlgorithm<>::toString(p.partitionId) + ".gz"; //_processor->finishClones(_processors); //Storage* storage = 0; //storage = StorageFactory(STORAGE_HDF5).create (p.outputDir + "/stats/part_" + SimkaAlgorithm<>::toString(p.partitionId) + ".stats", true, false); //LOCAL (storage); //_stats->save(filename); //storage->getGroup("")); //cout << _stats->_nbKmers << endl; //_processors[0]->forget(); //_processor->forget(); }*/ void saveStats(Parameter& p){ string filename = p.outputDir + "/stats/part_" + SimkaAlgorithm<>::toString(p.partitionId) + ".gz"; _stats->save(filename); //storage->getGroup("")); //string filename = p.outputDir + "/stats/part_" + SimkaAlgorithm<>::toString(p.partitionId) + ".gz"; //_processor->finishClones(_processors); //Storage* storage = 0; //storage = StorageFactory(STORAGE_HDF5).create (p.outputDir + "/stats/part_" + SimkaAlgorithm<>::toString(p.partitionId) + ".stats", true, false); //LOCAL (storage); //_stats->save(filename); //storage->getGroup("")); //cout << _stats->_nbKmers << endl; //_processors[0]->forget(); //_processor->forget(); } void writeFinishSignal(Parameter& p){ string finishFilename = p.outputDir + "/merge_synchro/" + SimkaAlgorithm<>::toString(p.partitionId) + ".ok"; IFile* file = System::file().newFile(finishFilename, "w"); delete file; } private: size_t _nbBanks; bool _computeSimpleDistances; bool _computeComplexDistances; size_t _kmerSize; float _minShannonIndex; pair _abundanceThreshold; vector _datasetIds; size_t _partitionId; //vector*> _processors; IteratorListener* _progress; vector _cmds; ICommand* _mergeCommand; size_t _nbCores; SimkaStatistics* _stats; SimkaCountProcessorSimple* _processor; u_int64_t _nbDistinctKmers; u_int64_t _nbSharedDistinctKmers; }; class SimkaMerge : public Tool { public: SimkaMerge () : Tool ("SimkaMerge") { //Original input filename given to simka. 
// Used to recreate dataset id list
getParser()->push_back (new OptionOneParam (STR_NB_CORES, "nb cores", true));
getParser()->push_back (new OptionOneParam (STR_KMER_SIZE, "kmer size", true));
getParser()->push_back (new OptionOneParam (STR_URI_INPUT, "input filename", true));
getParser()->push_back (new OptionOneParam ("-out-tmp-simka", "tmp output", true));
getParser()->push_back (new OptionOneParam ("-partition-id", "partition id", true));
getParser()->push_back (new OptionOneParam ("-nb-cores", "nb cores", true));
getParser()->push_back (new OptionOneParam ("-max-memory", "max memory (in MBytes)", true));
getParser()->push_back (new OptionOneParam (STR_SIMKA_MIN_KMER_SHANNON_INDEX, "minimal Shannon index for kmers", true));
getParser()->push_back (new OptionNoParam (STR_SIMKA_COMPUTE_ALL_SIMPLE_DISTANCES.c_str(), "compute simple distances"));
getParser()->push_back (new OptionNoParam (STR_SIMKA_COMPUTE_ALL_COMPLEX_DISTANCES.c_str(), "compute complex distances"));
}

void execute ()
{
	size_t nbCores = getInput()->getInt(STR_NB_CORES);
	size_t kmerSize = getInput()->getInt(STR_KMER_SIZE);
	size_t partitionId = getInput()->getInt("-partition-id");
	string inputFilename = getInput()->getStr(STR_URI_INPUT);
	string outputDir = getInput()->getStr("-out-tmp-simka");
	double minShannonIndex = getInput()->getDouble(STR_SIMKA_MIN_KMER_SHANNON_INDEX);
	bool computeSimpleDistances = getInput()->get(STR_SIMKA_COMPUTE_ALL_SIMPLE_DISTANCES);
	bool computeComplexDistances = getInput()->get(STR_SIMKA_COMPUTE_ALL_COMPLEX_DISTANCES);

	Parameter params(getInput(), inputFilename, outputDir, partitionId, kmerSize, minShannonIndex, computeSimpleDistances, computeComplexDistances, nbCores);

	Integer::apply<Functor,Parameter> (kmerSize, params);
}

template<size_t span> struct Functor { void operator () (Parameter& p) { SimkaMergeAlgorithm<span>(p).execute(); } };
};

int main (int argc, char* argv[])
{
	try
	{
		SimkaMerge().run (argc, argv);
	}
	catch (Exception& e)
	{
		std::cout << "EXCEPTION: " << e.getMessage() << std::endl;
		return EXIT_FAILURE;
	}
}
//! [snippet1]
simka-1.5.3/src/SimkaPotara.cpp000077500000000000000000000157721377312000000163360ustar00rootroot00000000000000/*****************************************************************************
 * Simka: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets
 * A tool from the GATB (Genome Assembly Tool Box)
 * Copyright (C) 2015 INRIA
 * Authors: G.Benoit, C.Lemaitre, P.Peterlongo
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *****************************************************************************/

#include "SimkaPotara.hpp"

/*
 TODO:
 - Do the configuration in a separate job (job_count.bash) so that the counting jobs and the configuration use the same config
 - Check the parameters passed to the generated jobs (nbcores, maxmemory...)
*/

SimkaPotara::SimkaPotara(const string& execFilename) : Tool ("Simka")
{
	_execFilename = execFilename;

	Simka::createOptionsParser(getParser());

	//Kmer parser
	IOptionsParser* coreParser = getParser()->getParser("core");
	//clusterParser->push_back (new OptionNoParam (STR_SIMKA_CLUSTER_MODE, "enable cluster mode. All cluster args below must be set", false));
	coreParser->push_back (new OptionOneParam (STR_SIMKA_NB_JOB_COUNT, "maximum number of simultaneous counting jobs (a higher value improves execution time but increases temporary disk usage)", false));
	coreParser->push_back (new OptionOneParam (STR_SIMKA_NB_JOB_MERGE, "maximum number of simultaneous merging jobs (1 job = 1 core)", false));

	IOptionsParser* clusterParser = new OptionsParser ("cluster");
	//clusterParser->push_back (new OptionOneParam (STR_SIMKA_NB_PARTITIONS, "nb partitions", false, "0" ));
	clusterParser->push_back (new OptionOneParam (STR_SIMKA_JOB_COUNT_COMMAND, "command to submit counting job", false ));
	clusterParser->push_back (new OptionOneParam (STR_SIMKA_JOB_MERGE_COMMAND, "command to submit merging job", false ));
	clusterParser->push_back (new OptionOneParam (STR_SIMKA_JOB_COUNT_FILENAME, "filename of the counting job template", false ));
	clusterParser->push_back (new OptionOneParam (STR_SIMKA_JOB_MERGE_FILENAME, "filename of the merging job template", false ));

	//getParser()->push_back(coreParser);
	getParser()->push_back(clusterParser);

	//getParser()->getParser("core")->getParser(STR_NB_CORES)->setHelp("number of cores per counting job");
	//if (Option* p = dynamic_cast (getParser()->getParser(STR_MAX_MEMORY))) { p->setHelp("max memory per counting job (in MBytes) "); }
	//if (Option* p = dynamic_cast (getParser()->getParser(STR_NB_CORES))) { p->setHelp("number of cores per job"); }
	//coreParser->push_back(new OptionOneParam(parser->getParser(STR_NB_CORES)->getName(), parser->getParser(STR_NB_CORES)->getHelp(), false, "0"));
	//if (IOptionsParser* input = dskParser->getParser (STR_KMER_ABUNDANCE_MIN_THRESHOLD)) { input->setVisible (false); }

	/*
	IOptionsParser* parser = getParser();
	IOptionsParser* dskParser = SortingCountAlgorithm<>::getOptionsParser();
	parser->push_back (dskParser, 1);
	parser->push_back(dskParser);
	parser->getParser (STR_URI_INPUT)->setHelp("input file of datasets and their id.
One dataset per line: dataset_id dataset_filename"); parser->getParser (STR_KMER_ABUNDANCE_MIN_THRESHOLD)->setVisible (false); parser->getParser (STR_HISTOGRAM_MAX)->setVisible (false); parser->getParser (STR_URI_SOLID_KMERS)->setVisible (false); parser->getParser (STR_URI_OUTPUT_DIR)->setHelp("output directory for temporary files"); parser->getParser (STR_URI_OUTPUT)->setHelp("output directory for result files"); parser->getParser (STR_SOLIDITY_KIND)->setHelp("TODO"); parser->getParser (STR_MINIMIZER_TYPE)->setVisible (false); parser->getParser (STR_MINIMIZER_SIZE)->setVisible (false); parser->getParser (STR_REPARTITION_TYPE)->setVisible (false); if (Option* p = dynamic_cast (parser->getParser(STR_KMER_ABUNDANCE_MIN))) { p->setDefaultValue ("0"); } parser->push_back (new OptionNoParam (STR_SOLIDITY_PER_DATASET.c_str(), "Do not take into consideration multi-counting when determining solidity of kmers", false )); */ /* parser->push_back (new OptionOneParam (STR_URI_INPUT, "reads file", true )); parser->push_back (new OptionOneParam (STR_KMER_SIZE, "size of a kmer", false, "31" )); parser->push_back (new OptionOneParam (STR_KMER_ABUNDANCE_MIN,"min abundance threshold for solid kmers", false, "3" )); parser->push_back (new OptionOneParam (STR_KMER_ABUNDANCE_MAX,"min abundance threshold for solid kmers", false, "3" )); parser->push_back (new OptionOneParam (STR_MAX_MEMORY, "max memory (in MBytes)", false, "2000")); parser->push_back (new OptionOneParam (STR_URI_OUTPUT_DIR, "output folder for solid kmers", false)); parser->push_back (new OptionOneParam (STR_URI_OUTPUT, "output file", false)); */ //setParser (parser); } struct Parameter { //Parameter (Simka& simka, IProperties* props) : props(props) {} Parameter (IProperties* props, const string& execFilename) : _props(props), _execFilename(execFilename) {} //Simka& _simka; IProperties* _props; string _execFilename; /* string _inputFilename; string _outputDir; size_t _kmerSize; pair _abundanceThreshold; bool _soliditySingle;*/ }; template struct Functor { void operator () (Parameter p) { /* cout << "SimkaAlgo.cpp 1" << endl; clear(); delete _banks; cout << "SimkaAlgo.cpp 2" << endl; SimkaFusion* simkaFusion = new SimkaFusion(_options, _inputFilename, _outputDir, _outputDirTemp, _nbReadsPerDataset, _maxNbReads); simkaFusion->execute(); return;*/ SimkaPotaraAlgorithm simkaAlgorithm (p._props, p._execFilename); simkaAlgorithm.execute(); /* #ifdef SIMKA_MIN simkaAlgorithm.executeSimkamin(); #else #endif*/ }}; void SimkaPotara::execute () { IProperties* input = getInput(); //Parameter params(*this, getInput()); Parameter params(input, _execFilename); size_t kmerSize = getInput()->getInt (STR_KMER_SIZE); Integer::apply (kmerSize, params); } int main (int argc, char* argv[]) { try { // We run the tool with the provided command line arguments. 
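// Note: execute() above selects the kmer integer width at run time through
// GATB's Integer::apply mechanism, which instantiates the Functor template
// for the smallest precompiled span able to hold k-mers of the requested
// size; this is how a single binary supports several ranges of k.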
//cout << argv[0] << endl; SimkaPotara(string(argv[0])).run (argc, argv); } catch (Exception& e) { std::cout << "EXCEPTION: " << e.getMessage() << std::endl; return EXIT_FAILURE; } return EXIT_SUCCESS; } simka-1.5.3/src/SimkaPotara.hpp000066400000000000000000001125201377312000000163310ustar00rootroot00000000000000/***************************************************************************** * Simka: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2015 INRIA * Authors: G.Benoit, C.Lemaitre, P.Peterlongo * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . *****************************************************************************/ #ifndef TOOLS_SIMKA_SRC_SIMKAFUSION_HPP_ #define TOOLS_SIMKA_SRC_SIMKAFUSION_HPP_ #include #include #include #include #include #include //#include //#include #include //#define CLUSTER //#define SERIAL #define SLEEP_TIME_SEC 1 const string STR_SIMKA_CLUSTER_MODE = "-cluster"; const string STR_SIMKA_NB_JOB_COUNT = "-max-count"; const string STR_SIMKA_NB_JOB_MERGE = "-max-merge"; const string STR_SIMKA_JOB_COUNT_COMMAND = "-count-cmd"; const string STR_SIMKA_JOB_MERGE_COMMAND = "-merge-cmd"; const string STR_SIMKA_JOB_COUNT_FILENAME = "-count-file"; const string STR_SIMKA_JOB_MERGE_FILENAME = "-merge-file"; class SimkaBankSample : public BankDelegate { public: SimkaBankSample (IBank* ref, u_int64_t nbRead) : BankDelegate (ref) { _nbRead = nbRead; } /** \copydoc tools::collections::Iterable::iterator */ Iterator* iterator () { Iterator* it = _ref->iterator (); std::vector*> iterators = it->getComposition(); TruncateIterator* truncIt = new TruncateIterator(*iterators[0], _nbRead); return truncIt; } private: u_int64_t _nbRead; }; template class SimkaNullProcessor : public CountProcessorAbstract{ public: typedef typename Kmer::Type Type; //typedef typename Kmer::Count Count; SimkaNullProcessor(){} ~SimkaNullProcessor(){} CountProcessorAbstract* clone () { return new SimkaNullProcessor (); } void finishClones (vector*>& clones){} bool process (size_t partId, const typename Kmer::Type& kmer, const CountVector& count, CountNumber sum){return false;} }; template class SimkaCompProcessor : public CountProcessorAbstract{ public: SimkaCompProcessor(KmerCountCompressor* comp){ _comp = comp; } ~SimkaCompProcessor(){} CountProcessorAbstract* clone () { return new SimkaCompProcessor (_comp); } void finishClones (vector*>& clones){} bool process (size_t partId, const typename Kmer::Type& kmer, const CountVector& count, CountNumber sum){ _comp->insert(partId, kmer, count); return true; } private: KmerCountCompressor* _comp; }; class SimkaBankTemp : public BankDelegate { public: u_int64_t _refNbReads; u_int64_t _refTotalSeqSize; u_int64_t _refMaxReadSize; /** Constructor. * \param[in] ref : referred bank. * \param[in] filter : functor that filters sequence. 
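 * \param[in] maxReads : maximum number of reads taken into account (0 means all reads).
 *
 * Worked example of the scaling done by estimate() below (numbers are
 * hypothetical): with a reference bank of 1,000,000 reads totalling
 * 100,000,000 bp and maxReads = 50,000, the factor is 50,000 / 1,000,000
 * = 0.05, so estimate() reports 50,000 reads and 5,000,000 bp.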
*/ SimkaBankTemp (IBank* ref, u_int64_t maxReads) : BankDelegate (ref) { _maxReads = maxReads; //_nbBanks = ref->getCompositionNb(); ref->estimate(_refNbReads, _refTotalSeqSize, _refMaxReadSize); //cout << _refNbReads << endl; //cout << _refTotalSeqSize << endl; //cout << _refMaxReadSize << endl; } void estimate (u_int64_t& number, u_int64_t& totalSize, u_int64_t& maxSize){ if(_maxReads == 0){ number = _refNbReads; totalSize = _refTotalSeqSize; maxSize = _refMaxReadSize; } else{ u_int64_t maxReads = _maxReads; //u_int64_t maxReads = 0; //for(size_t i=0; i<_nbBanks; i++){ // maxReads += _maxReads * _nbPaireds[i]; //} //cout << _refNbReads << endl; //cout << _maxReads*_nbBanks << endl; maxReads = min (maxReads, _refNbReads); //cout << "ha " << maxReads << endl; if(maxReads == _refNbReads){ number = _refNbReads; totalSize = _refTotalSeqSize; maxSize = _refMaxReadSize; } else{ number = maxReads; double factor = (double)maxReads / (double)_refNbReads; totalSize = _refTotalSeqSize * factor; maxSize = _refMaxReadSize; } } //number = _maxReads; //totalSize = (_totalSizeRef*_nbReadToProcess)/_numberRef; //maxSize = _maxSizeRef; //cout << number2 << endl; //u_int64_t readSize = totalSize2 / number2; //cout << "lal:" << number2 << endl; //number = _maxReads; //number = _nbReadToProcess; //totalSize = _nbReadToProcess*readSize; //maxSize = readSize; //cout << number << endl; //cout << totalSize << endl; //cout << maxSize << endl; } u_int64_t _maxReads; }; template class SimkaPotaraAlgorithm : public SimkaAlgorithm{ public: typedef typename Kmer::Type Type; SimkaPotaraAlgorithm(IProperties* options, const string& execFilename): SimkaAlgorithm(options) { _isClusterMode = false; //cout << "lala" << endl; //cout << _execDir << endl; if (execFilename == "simka") // i.e. simka binary is found in $PATH { //cout << endl << "Debug info : execFilename = simka (i.e. 
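// Resolution of _execDir (paths are hypothetical, for illustration):
//   argv[0] == "simka"                -> _execDir = ""  (binaries found via $PATH)
//   argv[0] == "/opt/simka/bin/simka" -> getRealPath gives "/opt/simka/bin/simka",
//                                        getDirectory + "/" gives "/opt/simka/bin/"
// The child tools are later launched as _execDir + "simkaCount",
// _execDir + "simkaMerge", etc.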
simka binary is found in $PATH)" << endl; _execDir = ""; } else { //cout << endl << "Debug info : execFilename = " << execFilename << endl; _execDir = System::file().getRealPath(execFilename); _execDir = System::file().getDirectory(_execDir) + "/"; } //_options = options; //_inputFilename = _options->getStr(STR_URI_INPUT); //_outputDir = _options->getStr(STR_URI_OUTPUT); //_outputDirTemp = _options->getStr(STR_URI_OUTPUT_TMP); //_maxNbReads = _options->getInt(STR_SIMKA_MAX_READS); //_maxJobCount = _options->getInt(STR_SIMKA_NB_JOB_COUNT); //_maxJobMerge = _options->getInt(STR_SIMKA_NB_JOB_MERGE); //_jobCountFilename = _options->getStr(STR_SIMKA_JOB_COUNT_FILENAME); //_jobMergeFilename = _options->getStr(STR_SIMKA_JOB_MERGE_FILENAME); //_jobCountCommand = _options->getStr(STR_SIMKA_JOB_COUNT_COMMAND); //_jobMergeCommand = _options->getStr(STR_SIMKA_JOB_MERGE_COMMAND); //string solidFilename = _outputDir + "/solid/" + p.bankName + suffix + ".h5"; //cout << "SimkaFusion constructor " << _outputDirTemp << endl; } ~SimkaPotaraAlgorithm(){ } void execute(){ parseArgs(); setup(); if(!SimkaAlgorithm::isInputValid()) exit(1); SimkaAlgorithm::computeMaxReads(); createConfig(); count(); printCountInfo(); merge(); stats(); if(this->_options->getInt(STR_VERBOSE) != 0){ cout << endl; cout << "Output dir: " << this->_outputDir << endl; cout << endl; } //bool keepTempFiles = false; if(!this->_keepTmpFiles){ string command = "rm -rf " + this->_outputDirTemp + "/solid/"; system(command.c_str()); command = "rm -rf " + this->_outputDirTemp + "/temp/"; system(command.c_str()); command = "rm -rf " + this->_outputDirTemp + "/count_synchro/"; system(command.c_str()); command = "rm -rf " + this->_outputDirTemp + "/merge_synchro/"; system(command.c_str()); command = "rm -rf " + this->_outputDirTemp + "/stats/"; system(command.c_str()); command = "rm -rf " + this->_outputDirTemp + "/job_count/"; system(command.c_str()); command = "rm -rf " + this->_outputDirTemp + "/job_merge/"; system(command.c_str()); command = "rm -rf " + this->_outputDirTemp + "/kmercount_per_partition/"; system(command.c_str()); command = "rm -rf " + this->_outputDirTemp + "/input/"; system(command.c_str()); command = "rm " + this->_outputDirTemp + "/config.h5"; system(command.c_str()); command = "rm " + this->_outputDirTemp + "/datasetIds"; system(command.c_str()); //cout << command << endl; //System::file().rmdir(this->_outputDirTemp); //System::file().mkdir(this->_outputDirTemp + "/solid/", -1); //System::file().mkdir(this->_outputDirTemp + "/temp/", -1); //System::file().mkdir(this->_outputDirTemp + "/log/", -1); //System::file().mkdir(this->_outputDirTemp + "/count_synchro/", -1); //System::file().mkdir(this->_outputDirTemp + "/merge_synchro/", -1); //System::file().mkdir(this->_outputDirTemp + "/stats/", -1); //System::file().mkdir(this->_outputDirTemp + "/job_count/", -1); //System::file().mkdir(this->_outputDirTemp + "/job_merge/", -1); //System::file().mkdir(this->_outputDirTemp + "/kmercount_per_partition/", -1); } } void parseArgs() { SimkaAlgorithm::parseArgs(); if(this->_options->get(STR_SIMKA_JOB_COUNT_FILENAME) || this->_options->get(STR_SIMKA_JOB_MERGE_FILENAME) || this->_options->get(STR_SIMKA_JOB_COUNT_COMMAND) || this->_options->get(STR_SIMKA_JOB_MERGE_COMMAND)){ _isClusterMode = true; _jobCountFilename = this->_options->getStr(STR_SIMKA_JOB_COUNT_FILENAME); _jobMergeFilename = this->_options->getStr(STR_SIMKA_JOB_MERGE_FILENAME); _jobCountCommand = this->_options->getStr(STR_SIMKA_JOB_COUNT_COMMAND); _jobMergeCommand = 
this->_options->getStr(STR_SIMKA_JOB_MERGE_COMMAND); if(! this->_options->get(STR_SIMKA_NB_JOB_COUNT) || this->_options->get(STR_SIMKA_NB_JOB_MERGE)){ //cout << endl; cout << "Cluster mode enable. Be sure to set correctly the following arguments if you have any job submission constraints:" << endl; cout << "\t" << STR_SIMKA_NB_JOB_COUNT << " : the maximum number of simultaneous couting" << endl; //job (each job will use up to " << STR_NB_CORES << " cores and " << STR_MAX_MEMORY << " MB memory)" << endl; cout << "\t" << STR_SIMKA_NB_JOB_MERGE << " : the maximum number of simultaneous merging job" << endl; // (each job will use up to 1 core and " << STR_MAX_MEMORY << " MB memory)" << endl; //cout << endl; } IFile* inputFile = System::file().newFile(_jobCountFilename, "rb"); inputFile->seeko(0, SEEK_END); u_int64_t size = inputFile->tell(); inputFile->seeko(0, SEEK_SET); char buffer2[size]; inputFile->fread(buffer2, size, size); string fileContents(buffer2, size); _jobCountContents = fileContents; delete inputFile; inputFile = System::file().newFile(_jobMergeFilename, "rb"); inputFile->seeko(0, SEEK_END); size = inputFile->tell(); inputFile->seeko(0, SEEK_SET); char buffer3[size]; inputFile->fread(buffer3, size, size); string fileContents2(buffer3, size); _jobMergeContents = fileContents2; delete inputFile; } else{ _isClusterMode = false; } if (this->_options->getInt(STR_MAX_MEMORY) < 2000) { std::cout << "WARNING: running Simka with low memory is risky. Simka may hang because of that. Consider running with -max-memory X where X > 2000" << std::endl; } if (this->_options->getInt(STR_MAX_MEMORY) < 500) { std::cout << "Please run Simka with higher memory usage than 500 MB" << std::endl; exit(1); } //_isClusterMode = true; //if(this->_options->get(STR_SIMKA_CLUSTER_MODE)){ //cout << "cluster mode activated" << endl; //cout << "\t-max-memory = memory per job" << endl; //cout << "\t-nb-cores = cores per job" << endl; //cout << endl; //if(_isClusterMode) // _maxJobMerge = max((int)_maxJobMerge, (int)30); //else{ // _maxJobMerge = max((int)_maxJobMerge, (int)30); //} //} /* _maxJobMerge = maxCores-1; size_t maxCoreCount = maxCores-1; size_t nbCoresCount = min(maxCoreCount, this->_nbCores); u_int64_t minMemory = 2000; size_t maxJobCountTemp = this->_maxMemory/minMemory; _maxJobCount = min(nbCoresCount, maxJobCountTemp); _memoryPerJob = this->_maxMemory / _maxJobCount; _coresPerJob = ceil(nbCoresCount / (float)_maxJobCount); cout << "Nb jobs count in parallel: " << _maxJobCount << endl; cout << "\tCores per jobs: " << _coresPerJob << endl; cout << "\tMemory per jobs: " << _memoryPerJob << endl; cout << "Nb jobs merge in parallel: " << _maxJobMerge << endl; cout << endl; */ } void setup(){ SimkaAlgorithm::setup(); createDirs(); layoutInputFilename(); } void layoutInputFilename(){ //SimkaAlgorithm::layoutInputFilename(); string datasetIdFilename = this->_outputDirTemp + "/" + "datasetIds"; IFile* datasetIdFile = System::file().newFile(datasetIdFilename, "wb"); for(size_t i=0; i_bankNames.size(); i++){ string bankName = this->_bankNames[i]; string bankIdLine = bankName + '\n'; datasetIdFile->fwrite(bankIdLine.c_str(), bankIdLine.size(), 1); } datasetIdFile->flush(); delete datasetIdFile; } void createDirs(){ /* string suffix = ""; suffix += "m" + _options->getStr(STR_SIMKA_MIN_READ_SIZE); suffix += "_s" + _options->getStr(STR_SIMKA_MIN_READ_SHANNON_INDEX); suffix += "_n" + SimkaAlgorithm<>::toString(_maxNbReads); suffix += "_p" + SimkaAlgorithm<>::toString(_nbAskedPartitions);*/ //_outputDirTemp = 
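// Layout of the temporary working tree created below under _outputDirTemp:
//   solid/                           per-partition sorted kmer-count files (.gz)
//   temp/, log/                      per-dataset scratch space and job logs
//   count_synchro/, merge_synchro/   ".ok" marker files polled to detect finished jobs
//   stats/                           per-partition distance statistics
//   job_count/, job_merge/           generated job scripts for cluster mode
//   kmercount_per_partition/         kmer counts per dataset and partition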
_outputDirTemp; // + "/" + suffix + "/"; //System::file().mkdir(_outputDirTemp, -1); System::file().mkdir(this->_outputDirTemp + "/solid/", -1); //System::file().mkdir(this->_outputDirTemp + "/solid/merged/", -1); System::file().mkdir(this->_outputDirTemp + "/temp/", -1); System::file().mkdir(this->_outputDirTemp + "/log/", -1); System::file().mkdir(this->_outputDirTemp + "/count_synchro/", -1); System::file().mkdir(this->_outputDirTemp + "/merge_synchro/", -1); System::file().mkdir(this->_outputDirTemp + "/stats/", -1); System::file().mkdir(this->_outputDirTemp + "/job_count/", -1); System::file().mkdir(this->_outputDirTemp + "/job_merge/", -1); System::file().mkdir(this->_outputDirTemp + "/kmercount_per_partition/", -1); } void createConfig(){ size_t maxCores = this->_nbCores; size_t maxMemory = this->_maxMemory; size_t minMemoryPerJobMB = 500; if(this->_options->get(STR_SIMKA_NB_JOB_COUNT)){ _maxJobCount = this->_options->getInt(STR_SIMKA_NB_JOB_COUNT); //maxCores = _maxJobCount; //TO REMOVE WHEN BUG IN DISPATCHER IS RESOLVED } else{ size_t maxjob_byCore = min(maxCores/2, this->_nbBanks); //size_t maxjob_byCore = min(maxCores, this->_nbBanks); //TO REMOVE WHEN BUG IN DISPATCHER IS RESOLVED maxjob_byCore = max(maxjob_byCore, (size_t)1); size_t maxjob_byMemory = maxMemory/minMemoryPerJobMB; maxjob_byMemory = max(maxjob_byMemory, (size_t) 1); size_t maxJobs = min(maxjob_byCore, maxjob_byMemory); _maxJobCount = maxJobs; //maxCores = _maxJobCount; //TO REMOVE WHEN BUG IN DISPATCHER IS RESOLVED } //_maxJobCount = 1; if(this->_options->get(STR_SIMKA_NB_JOB_MERGE)){ _maxJobMerge = this->_options->getInt(STR_SIMKA_NB_JOB_MERGE); } else{ _maxJobMerge = maxCores; /* if(this->_computeComplexDistances && this->_computeSimpleDistances){ _maxJobMerge = max((size_t)maxCores/4, (size_t)1); } else if(this->_computeSimpleDistances){ _maxJobMerge = max((size_t)maxCores/2, (size_t)1); } else if(this->_computeComplexDistances){ _maxJobMerge = max((size_t)maxCores/3, (size_t)1); } else{ _maxJobMerge = maxCores; }*/ } _maxJobCount = min(_maxJobCount, maxCores); _maxJobMerge = min(_maxJobMerge, maxCores); _coresPerJob = maxCores / _maxJobCount; _coresPerJob = max((size_t)1, _coresPerJob); _memoryPerJob = maxMemory / _maxJobCount; _memoryPerJob = max(_memoryPerJob, (size_t)minMemoryPerJobMB); _coresPerMergeJob = maxCores / _maxJobMerge; _coresPerMergeJob = max((size_t)1, _coresPerMergeJob); cout << endl; cout << "Maximum ressources used by Simka: " << endl; cout << "\t - " << _maxJobCount << " simultaneous processes for counting the kmers (per job: " << _coresPerJob << " cores, " << _memoryPerJob << " MB memory)" << endl; cout << "\t - " << _maxJobMerge << " simultaneous processes for merging the kmer counts (per job: " << _coresPerMergeJob << " cores, memory undefined)" << endl; cout << endl; //_coresPerJob = this->_nbCores; //_memoryPerJob = this->_maxMemory; string filename = this->_outputDirTemp + "/" + "config.h5"; if(System::file().doesExist(filename)){ try{ cout << "\tconfig already exists (remove file " << filename << " to config again)" << endl; Storage* storage = StorageFactory(STORAGE_HDF5).load (filename); LOCAL (storage); Configuration* config = new Configuration(); config->load(storage->getGroup("")); _nbPartitions = config->_nb_partitions; delete config; Repartitor* repartitor = new Repartitor(); //LOCAL(repartitor); repartitor->load(storage->getGroup("")); delete repartitor; return; } catch (Exception& e) { cout << "\tcan't open config, computing it again" << endl; 
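// The cached configuration could not be read: drop the stale file below and
// rebuild it from scratch by re-entering createConfig().
// Further below, the per-core write cache is sized by doubling
// _nb_cached_items_per_core_per_part from 256 until
// items * partitions * cores * sizeof(Type) reaches max_memory / 10.
// Hypothetical numbers: 64 partitions, 8 cores, a 16-byte Type and 4000 MB
// of memory stop the doubling at 65536 items (512 MB >= 400 MB).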
System::file().remove(filename); createConfig(); return; } } this->_options->setInt(STR_MAX_MEMORY, _memoryPerJob - _memoryPerJob/3); this->_options->setInt(STR_NB_CORES, _coresPerJob); Storage* storage = 0; storage = StorageFactory(STORAGE_HDF5).create (filename, true, false); LOCAL (storage); size_t chosenBankId = 0; SimkaSequenceFilter dummyFilter(0, 0); //vector*> banksToDelete; string inputDir = this->_outputDirTemp + "/input/"; u_int64_t maxPart = 0; for (size_t i=0; i_nbBanks; i++){ IBank* bank = Bank::open(inputDir + this->_bankNames[i]); LOCAL(bank); // Check that data files are not empty int64_t nbItems = bank->estimateNbItems(); if (nbItems==0) { cerr << "ERROR: Dataset is empty: " << this->_bankNames[i] << endl; exit(1); } //size_t nbBank_ = bank->getCompositionNb(); SimkaBankTemp* simkaBank = new SimkaBankTemp(bank, this->_maxNbReads*this->_nbBankPerDataset[i]); //banksToDelete.push_back(simkaBank); ConfigurationAlgorithm testConfig(simkaBank, this->_options); testConfig.execute(); size_t part = testConfig.getConfiguration()._nb_partitions; if(part > maxPart){ maxPart = part; chosenBankId = i; } //delete simkaBank; /* u_int64_t nbReads = bank->estimateNbItems(); nbReads /= _nbBankPerDataset[i]; totalReads += nbReads; if(nbReads < minReads){ minReads = nbReads; //_smallerBankId = _bankNames[i]; } if(nbReads > maxReads){ maxReads = nbReads; _largerBankId = _bankNames[i]; }*/ } //maxPart += 2; //for(size_t i=0; i_options->setInt(STR_MAX_MEMORY, _memoryPerJob); IBank* inputbank = Bank::open(this->_banksInputFilename); LOCAL(inputbank); IBank* bank = Bank::open(this->_outputDirTemp + "/input/" + this->_bankNames[chosenBankId]); LOCAL(bank); //IBank* bank = Bank::open(_outputDirTemp + "/input/" + _bankNames[0]); //LOCAL(inputbank); //bank->finalize(); //u_int64_t nbSeqs = 1; //IBank* sampleBank = new SimkaBankSample(bank, nbSeqs); //SortingCountAlgorithm sortingCount (sampleBank, this->_options); //SimkaNullProcessor* proc = new SimkaNullProcessor(); //sortingCount.addProcessor (proc); // We launch the algorithm //sortingCount.execute(); //Configuration config = sortingCount.getConfig(); ConfigurationAlgorithm testConfig1(inputbank, this->_options); testConfig1.execute(); Configuration config1 = testConfig1.getConfiguration(); ConfigurationAlgorithm testConfig2(bank, this->_options); testConfig2.execute(); Configuration config2 = testConfig2.getConfiguration(); //IBank* inputbank = Bank::open(this->_banksInputFilename); //LOCAL(inputbank); //IBank* sampleBank2 = new SimkaBankSample(inputbank, nbSeqs); //SortingCountAlgorithm sortingCount2 (sampleBank2, this->_options); //SimkaNullProcessor* proc2 = new SimkaNullProcessor(); //sortingCount2.addProcessor (proc2); //sortingCount2.execute(); //Configuration config2 = sortingCount2.getConfig(); //cout << config2._nb_partitions << endl; _nbPartitions = max((size_t)maxPart, (size_t)_maxJobMerge); //_nbPartitions = max(_nbPartitions, (size_t)32); cout << "Nb partitions: " << _nbPartitions << " partitions" << endl << endl << endl; //_nbPartitions = max((int)_nbPartitions, (int)30); config1._nb_partitions = _nbPartitions; config2._nb_partitions = _nbPartitions; RepartitorAlgorithm repart (inputbank, storage->getGroup(""), config1); repart.execute (); uint64_t memoryUsageCachedItems; config2._nb_cached_items_per_core_per_part = 1 << 8; // cache at least 256 items (128 here, then * 2 in the next while loop) do { config2._nb_cached_items_per_core_per_part *= 2; memoryUsageCachedItems = 1LL * config2._nb_cached_items_per_core_per_part 
*config2._nb_partitions * config2._nbCores * sizeof(Type); } while (memoryUsageCachedItems < config2._max_memory * MBYTE / 10); /* if(_isClusterMode){ //config._nb_cached_items_per_core_per_part = 100000; _nbPartitions = _maxJobMerge; config._nb_partitions = _nbPartitions; uint64_t memoryUsageCachedItems; config._nb_cached_items_per_core_per_part = 1 << 8; // cache at least 256 items (128 here, then * 2 in the next while loop) do { config._nb_cached_items_per_core_per_part *= 2; memoryUsageCachedItems = 1LL * config._nb_cached_items_per_core_per_part *config._nb_partitions * config._nbCores * sizeof(Type); } while (memoryUsageCachedItems < config._max_memory * MBYTE / 10); //cout << config._nb_cached_items_per_core_per_part << endl; } else{ _nbPartitions = config._nb_partitions; }*/ //IBank* inputbank = Bank::open(this->_banksInputFilename); //LOCAL(inputbank); //ConfigurationAlgorithm inputconfig(inputbank, this->_options); //inputconfig.execute(); //RepartitorAlgorithm repart (inputbank, storage->getGroup(""), config); //repart.execute (); //setRepartitor (new Repartitor(storage->getGroup("minimizers"))); //SortingCountAlgorithm sortingCount (sampleBank, _options); config2.save(storage->getGroup("")); //sortingCount.getRepartitor()->save(storage->getGroup("")); //delete sampleBank; //setStorage (storage); //delete storage; //sampleBank->forget(); } void removeMergeSynchro(){ for (size_t i=0; i_bankNames.size(); i++){ string finishFilename = this->_outputDirTemp + "/merge_synchro/" + this->_bankNames[i] + ".ok"; if(System::file().doesExist(finishFilename)) System::file().remove(finishFilename); } } void printCountInfo(){ char * pEnd; vector kmerPerParts(_nbPartitions, 0); for(size_t i=0; i_bankNames.size(); i++){ //cout << filename << endl; string line; size_t currentPart = 0; ifstream file((this->_outputDirTemp + "/kmercount_per_partition/" + this->_bankNames[i] + ".txt").c_str()); size_t j = 0; while(getline(file, line)){ if(line == "") continue; kmerPerParts[j] += strtoull(line.c_str(), NULL, 10); j += 1; } file.close(); } cout << endl << endl << "Kmer repartition" << endl; for(size_t i=0; i_outputDirTemp + "/log/count_*)" << endl; for (size_t i=0; i<_nbPartitions; i++){ //System::file().mkdir(this->_outputDirTemp + "/solid/merged/part_" + Stringify::format("%i", i), -1); System::file().mkdir(this->_outputDirTemp + "/solid/part_" + Stringify::format("%i", i), -1); } vector commands; _progress = new ProgressSynchro ( this->createIteratorListener (this->_bankNames.size(), "Counting datasets"), System::thread().newSynchronizer()); _progress->init (); vector filenameQueue; vector filenameQueueToRemove; size_t nbJobs = 0; for (size_t i=0; i_bankNames.size(); i++){ string logFilename = this->_outputDirTemp + "/log/count_" + this->_bankNames[i] + ".txt"; string finishFilename = this->_outputDirTemp + "/count_synchro/" + this->_bankNames[i] + ".ok"; if(System::file().doesExist(finishFilename)){ _progress->inc(1); cout << "\t" << this->_bankNames[i] << " already counted (remove file " << finishFilename << " to count again)" << endl; continue; } //else{ string tempDir = this->_outputDirTemp + "/temp/" + this->_bankNames[i]; string command = "nohup " + _execDir + "simkaCountProcess " + _execDir + "simkaCount "; command += " " + string(STR_KMER_SIZE) + " " + SimkaAlgorithm<>::toString(this->_kmerSize); command += " " + string("-out-tmp-simka") + " " + this->_outputDirTemp; command += " " + string("-out-tmp") + " " + tempDir; command += " -bank-name " + this->_bankNames[i]; command += " -bank-index 
" + SimkaAlgorithm<>::toString(i); command += " -nb-datasets " + SimkaAlgorithm<>::toString(this->_nbBankPerDataset[i]); command += " " + string(STR_MAX_MEMORY) + " " + SimkaAlgorithm<>::toString(_memoryPerJob); command += " " + string(STR_NB_CORES) + " " + SimkaAlgorithm<>::toString(_coresPerJob); command += " " + string(STR_URI_INPUT) + " dummy "; command += " " + string(STR_KMER_ABUNDANCE_MIN) + " " + SimkaAlgorithm<>::toString(this->_abundanceThreshold.first); command += " " + string(STR_KMER_ABUNDANCE_MAX) + " " + SimkaAlgorithm<>::toString(this->_abundanceThreshold.second); command += " " + string(STR_SIMKA_MIN_READ_SIZE) + " " + SimkaAlgorithm<>::toString(this->_minReadSize); command += " " + string(STR_SIMKA_MIN_READ_SHANNON_INDEX) + " " + Stringify::format("%f", this->_minReadShannonIndex); command += " " + string(STR_SIMKA_MAX_READS) + " " + SimkaAlgorithm<>::toString(this->_maxNbReads); command += " -nb-partitions " + SimkaAlgorithm<>::toString(_nbPartitions); //command += " -verbose " + Stringify::format("%d", this->_options->getInt(STR_VERBOSE)); command += " >> " + logFilename + " 2>&1"; filenameQueue.push_back(this->_bankNames[i]); System::file().mkdir(tempDir, -1); string str = "Counting dataset " + SimkaAlgorithm<>::toString(i) + "\n"; str += "\t" + command + "\n\n\n"; system(("echo \"" + str + "\" > " + logFilename).c_str()); //cout << "Counting dataset " << i << endl; //cout << "\t" << command << endl; removeMergeSynchro(); //_progress->inc(1); //nanosleep((const struct timespec[]){{0, 10000000L}}, NULL); if(_isClusterMode){ string jobFilename = this->_outputDirTemp + "/job_count/job_count_" + SimkaAlgorithm<>::toString(i) + ".bash"; IFile* jobFile = System::file().newFile(jobFilename.c_str(), "w"); system(("chmod 755 " + jobFilename).c_str()); string jobCommand = _jobCountContents + '\n' + '\n'; jobCommand += command; jobFile->fwrite(jobCommand.c_str(), jobCommand.size(), 1); jobFile->flush(); string submitCommand = _jobCountCommand + " " + jobFile->getPath(); delete jobFile; system(submitCommand.c_str()); } else{ command += " &"; system(command.c_str()); } nbJobs += 1; //cout << "job started" << endl; if(nbJobs >= _maxJobCount){ while(true){ bool isJobAvailbale = false; for(size_t j=0; j_outputDirTemp + "/count_synchro/" + filenameQueue[j] + ".ok"; if(System::file().doesExist(finishFilename2)){ filenameQueueToRemove.push_back(filenameQueue[j]); isJobAvailbale = true; nbJobs -= 1; //cout << "job finished" << endl; _progress->inc(1); } } if(isJobAvailbale){ for(size_t j=0; j= this->_bankNames.size()) break; } } } while(nbJobs > 0){ bool isJobAvailbale = false; for(size_t j=0; j_outputDirTemp + "/count_synchro/" + filenameQueue[j] + ".ok"; if(System::file().doesExist(finishFilename2)){ filenameQueueToRemove.push_back(filenameQueue[j]); isJobAvailbale = true; nbJobs -= 1; _progress->inc(1); } } if(isJobAvailbale){ for(size_t j=0; jfinish(); delete _progress; } void merge(){ cout << endl << "Merging k-mer counts and computing distances... 
(log files are " + this->_outputDirTemp + "/log/merge_*)" << endl; _progress = new ProgressSynchro ( this->createIteratorListener (_nbPartitions, "Merging datasets"), System::thread().newSynchronizer()); _progress->init (); vector filenameQueue; vector filenameQueueToRemove; size_t nbJobs = 0; for (size_t i=0; i<_nbPartitions; i++){ string datasetId = SimkaAlgorithm<>::toString(i); string finishFilename = this->_outputDirTemp + "/merge_synchro/" + datasetId + ".ok"; string logFilename = this->_outputDirTemp + "/log/merge_" + datasetId + ".txt"; if(System::file().doesExist(finishFilename)){ _progress->inc(1); cout << "\t" << datasetId << " already merged (remove file " << finishFilename << " to merge again)" << endl; } else{ //if(System::file().doesExist(finishFilename)){ // System::file().remove(finishFilename); // cout << "\t" << _bankNames[i] << " already (remove file " << finishFilename << " to count again)" << endl; //} //else{ filenameQueue.push_back(datasetId); string command = "nohup " + _execDir + "simkaMerge "; command += " " + string(STR_KMER_SIZE) + " " + SimkaAlgorithm<>::toString(this->_kmerSize); command += " " + string(STR_URI_INPUT) + " " + this->_inputFilename; command += " " + string("-out-tmp-simka") + " " + this->_outputDirTemp; command += " -partition-id " + SimkaAlgorithm<>::toString(i); command += " " + string(STR_MAX_MEMORY) + " " + SimkaAlgorithm<>::toString(this->_maxMemory / this->_nbCores); command += " " + string(STR_NB_CORES) + " " + SimkaAlgorithm<>::toString(_coresPerMergeJob); command += " " + string(STR_SIMKA_MIN_KMER_SHANNON_INDEX) + " " + Stringify::format("%f", this->_minKmerShannonIndex); command += " -verbose " + Stringify::format("%d", this->_options->getInt(STR_VERBOSE)); if(this->_computeSimpleDistances) command += " " + string(STR_SIMKA_COMPUTE_ALL_SIMPLE_DISTANCES); if(this->_computeComplexDistances) command += " " + string(STR_SIMKA_COMPUTE_ALL_COMPLEX_DISTANCES); command += " >> " + logFilename + " 2>&1"; //SimkaDistanceParam distanceParams(this->_options); //if(distanceParams._computeBrayCurtis) command += " " + STR_SIMKA_DISTANCE_BRAYCURTIS + " "; //if(distanceParams._computeCanberra) command += " " + STR_SIMKA_DISTANCE_CANBERRA + " "; //if(distanceParams._computeChord) command += " " + STR_SIMKA_DISTANCE_CHORD + " "; //if(distanceParams._computeHellinger) command += " " + STR_SIMKA_DISTANCE_HELLINGER + " "; //if(distanceParams._computeKulczynski) command += " " + STR_SIMKA_DISTANCE_KULCZYNSKI + " "; string str = "Merging partition " + SimkaAlgorithm<>::toString(i) + "\n"; str += "\t" + command + "\n\n\n"; system(("echo \"" + str + "\" > " + logFilename).c_str()); if(_isClusterMode){ string jobFilename = this->_outputDirTemp + "/job_merge/job_merge_" + SimkaAlgorithm<>::toString(i) + ".bash"; IFile* jobFile = System::file().newFile(jobFilename.c_str(), "w"); system(("chmod 755 " + jobFilename).c_str()); string jobCommand = _jobMergeContents + '\n' + '\n'; jobCommand += command; jobFile->fwrite(jobCommand.c_str(), jobCommand.size(), 1); jobFile->flush(); string submitCommand = _jobMergeCommand + " " + jobFile->getPath(); delete jobFile; system(submitCommand.c_str()); } else{ command += " &"; system(command.c_str()); } nbJobs += 1; } if(nbJobs >= _maxJobMerge){ while(true){ bool isJobAvailbale = false; for(size_t j=0; j_outputDirTemp + "/merge_synchro/" + filenameQueue[j] + ".ok"; if(System::file().doesExist(finishFilename2)){ filenameQueueToRemove.push_back(filenameQueue[j]); isJobAvailbale = true; nbJobs -= 1; _progress->inc(1); } } 
	//u_int64_t _maxMemory;
	//size_t _nbCores;
	size_t _memoryPerJob;
	size_t _coresPerJob;
	size_t _coresPerMergeJob;

	//IBank* _banks;
	//IProperties* _options;
	//string _inputFilename;
	//vector<string> _bankNames;
	//vector<size_t> _nbBankPerDataset;
	size_t _nbPartitions;
	//size_t _nbBanks;
	//vector<u_int64_t> _nbReadsPerDataset;
	//string _banksInputFilename;
	//vector<string> _tempFilenamesToDelete;
	//u_int64_t _maxNbReads;
	//IBank* _sampleBank;

	string _execDir;
	bool _isClusterMode;
	size_t _maxJobCount;
	size_t _maxJobMerge;
	string _jobCountFilename;
	string _jobMergeFilename;
	string _jobCountCommand;
	string _jobMergeCommand;
	//u_int64_t _nbAskedPartitions;

	string _jobCountContents;
	string _jobMergeContents;

	IteratorListener* _progress;
};

class SimkaPotara : public Tool{
public:

	SimkaPotara(const string& execFilename);
	void execute();

	string _execFilename;
};

#endif
simka-1.5.3/src/core/000077500000000000000000000000001377312000000143345ustar00rootroot00000000000000simka-1.5.3/src/core/KmerCountCompressor.hpp000077500000000000000000000462251377312000000210420ustar00rootroot00000000000000
#ifndef _GATB_CORE_KMER_IMPL_KMER_COUNT_COMPRESSOR_HPP_
#define
_GATB_CORE_KMER_IMPL_KMER_COUNT_COMPRESSOR_HPP_ /********************************************************************************/ #include #include #include #include #include #include #include #include #include using namespace std; using namespace gatb::core::system; using namespace gatb::core::system::impl; const u_int64_t MAX_MEMORY_PER_BLOCK = 100000; //#define INDEXING #define MONO_BANK /********************************************************************************/ namespace gatb { namespace core { namespace kmer { namespace impl { /********************************************************************************/ /********************************************************************* * ** KmerCountCoder *********************************************************************/ class KmerCountCoder { public: KmerCountCoder(int nbBanks, int partitionIndex) //_bankCountDeltaModel(3) { _nbBanks = nbBanks; _partitionIndex = partitionIndex; _nbKmers = 0; /* for(int i=0; i models; vector bankModels; for(int j=0; j models; vector bankModels; for(int j=0; j _kmerModel; vector > _bankModels; vector > _abundanceModels; //vector _bankDeltaModels; //vector _deltaModels; //vector _lastAbundances; //vector _lastBanks; u_int64_t _lastKmerValue; u_int64_t _lastNbBankCount; //vector _blockSizes; vector _bankCountModel; //Order0Model _bankCountDeltaModel; }; /********************************************************************* * ** KmerCountCompressorPartition *********************************************************************/ template class KmerCountCompressorPartition : public KmerCountCoder { public: /** Shortcuts. */ typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; typedef typename Kmer::ModelCanonical::Kmer Kmer; KmerCountCompressorPartition(const string& outputDir, int partitionIndex, int nbBanks) : KmerCountCoder(nbBanks, partitionIndex) { string filename = outputDir + "/part_" + KmerCountCoder::toString(_partitionIndex); _outputFile = System::file().newFile(filename.c_str(), "wb"); } ~KmerCountCompressorPartition(){ //cout << _rangeEncoder.getBufferSize() << endl; /* string path = _outputFile->getPath(); cout << "Partition " << _partitionIndex << endl; cout << "\tNb kmers: " << _nbKmers << endl; cout << "\tCompressed size: " << System::file().getSize(path) << endl; cout << "\tByte per kmer count: " << System::file().getSize(path) / (float)_nbKmers<< endl;*/ delete _outputFile; } void flush(){ _rangeEncoder.flush(); writeBlock(); clear(); _rangeEncoder.clear(); CompressionUtils::encodeNumeric(_rangeEncoder, _kmerModel, _nbKmers); _rangeEncoder.flush(); //for(u_int64_t blockSize : _blockSizes){ // CompressionUtils::encodeNumeric(_rangeEncoder, _kmerModel, blockSize); //} _outputFile->fwrite((const char*) _rangeEncoder.getBuffer(true), _rangeEncoder.getBufferSize(), 1); _outputFile->flush(); } void insert(const Type& kmer, const CountVector& abundancePerBank){ _nbKmers += 1; u_int64_t kmerValue = kmer.getVal(); CompressionUtils::encodeNumeric(_rangeEncoder, _kmerModel, kmerValue - _lastKmerValue); _lastKmerValue = kmerValue; if(abundancePerBank.size() == 1){ CompressionUtils::encodeNumeric(_rangeEncoder, _bankCountModel, abundancePerBank[0]); } else{ //u_int64_t deltaValue; //u_int8_t deltaType; int modelIndex = 0; _banks.clear(); //int nbBankCount; for(size_t bankId=0; bankId 0){ _banks.push_back(bankId); //nbBankCount += 1; } } //deltaType = CompressionUtils::getDeltaValue(nbBankCount, _lastNbBankCount, &deltaValue); //_rangeEncoder.encode(_bankCountDeltaModel, deltaType); 
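		// The live code below stores each kmer's counts sparsely: first the
		// number of banks with a non-zero count, then for each such bank its
		// id encoded as a delta from the previous id, then its abundance, each
		// under its own Order0Model. Illustrative walk-through (made-up
		// numbers, not from a real run): for abundancePerBank = {0, 3, 0, 7, 1},
		// the non-zero banks are {1, 3, 4}, so the encoder emits 3 (the bank
		// count), then bank-id deltas 1, 2, 1 interleaved with abundances
		// 3, 7, 1. The deltas stay small because bank ids are visited in
		// increasing order, which is exactly what the adaptive models
		// compress well.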
//CompressionUtils::encodeNumeric(_rangeEncoder, _bankCountModel, deltaValue); //_lastNbBankCount = nbBankCount; CompressionUtils::encodeNumeric(_rangeEncoder, _bankCountModel, _banks.size()); int lastBankId = 0; for(size_t i=0; i<_banks.size(); i++){ int bankId = _banks[i]; u_int16_t abundance = abundancePerBank[bankId]; //if(abundance == 0){ //} //else{ if(modelIndex >= _bankModels.size()){ addField(); } //deltaType = CompressionUtils::getDeltaValue(bankId, _lastBanks[modelIndex], &deltaValue); //_rangeEncoder.encode(_bankDeltaModels[modelIndex], deltaType); //CompressionUtils::encodeNumeric(_rangeEncoder, _bankModels[modelIndex], deltaValue); //_lastBanks[modelIndex] = bankId; CompressionUtils::encodeNumeric(_rangeEncoder, _bankModels[modelIndex], bankId - lastBankId); lastBankId = bankId; //deltaType = CompressionUtils::getDeltaValue(abundance, _lastAbundances[modelIndex], &deltaValue); //_rangeEncoder.encode(_deltaModels[modelIndex], deltaType); //CompressionUtils::encodeNumeric(_rangeEncoder, _abundanceModels[modelIndex], deltaValue); //_lastAbundances[modelIndex] = abundance; CompressionUtils::encodeNumeric(_rangeEncoder, _abundanceModels[modelIndex], abundance); modelIndex += 1; //} } } if(_rangeEncoder.getBufferSize() >= MAX_MEMORY_PER_BLOCK){ writeBlock(); } } void writeBlock(){ if(_rangeEncoder.getBufferSize() > 0){ //_rangeEncoder.flush(); //_blockSizes.push_back(_rangeEncoder.getBufferSize()); _outputFile->fwrite((const char*) _rangeEncoder.getBuffer(), _rangeEncoder.getBufferSize(), 1); } _rangeEncoder.clearBuffer(); //_rangeEncoder.clear(); //clear(); } u_int64_t getSizeByte(){ //cout << _outputFile->getPath() << endl; //cout << System::file().getSize(_outputFile->getPath()) << endl; return System::file().getSize(_outputFile->getPath()); } private: RangeEncoder _rangeEncoder; IFile* _outputFile; vector _banks; }; /********************************************************************* * ** KmerCountCompressor *********************************************************************/ template class KmerCountCompressor { public: /** Shortcuts. 
*/ typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; typedef typename Kmer::ModelCanonical::Kmer Kmer; /** */ KmerCountCompressor(const string& outputDir, int nbPartitions, int nbBanks){ _rootDir = outputDir; // + "/dsk_output/"; _nbPartitions = nbPartitions; _nbBanks = nbBanks; System::file().rmdir(_rootDir); System::file().mkdir(_rootDir, -1); cout << _rootDir << endl; //cout << nbPartitions << endl; for(int i=0; i* comp = new KmerCountCompressorPartition(_rootDir, i, nbBanks); _partitionCompressors.push_back(comp); } } ~KmerCountCompressor(){ u_int64_t nbKmers = 0; u_int64_t size = 0; for(size_t i=0; i<_partitionCompressors.size(); i++){ KmerCountCompressorPartition* comp = _partitionCompressors[i]; comp->flush(); nbKmers += comp->getNbKmers(); size += comp->getSizeByte(); delete comp; } cout << "Compression statistics " << endl; cout << "\tNb kmers: " << nbKmers << endl; cout << "\tCompressed size: " << size << "B - " << size/MBYTE << " MB" << endl; cout << "\tByte per kmer count: " << size / (float) nbKmers<< endl; IFile* outputFile = System::file().newFile(_rootDir + "/dsk_count_data", "wb"); outputFile->print("%i %i", _nbPartitions, _nbBanks); outputFile->flush(); delete outputFile; } void insert(int partitionIndex, const Type& kmer, const CountVector& abundancePerBank){ _partitionCompressors[partitionIndex]->insert(kmer, abundancePerBank); } private: string _rootDir; int _nbPartitions; int _nbBanks; vector* > _partitionCompressors; }; /********************************************************************* * ** KmerCountDecompressorPartition *********************************************************************/ template class KmerCountDecompressorPartition : KmerCountCoder { public: typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; typedef typename Kmer::ModelCanonical::Kmer Kmer; KmerCountDecompressorPartition(const string& inputDir, int partitionIndex, int nbBanks, Functor* functor, gatb::core::tools::dp::IteratorListener* progress) : KmerCountCoder(nbBanks, partitionIndex) { _progress = progress; _functor = functor; _nbDecodedKmers = 0; _nbDecodedKmersProgress = 0; string filename = inputDir + "/part_" + KmerCountCoder::toString(_partitionIndex); _inputFile = new ifstream(filename.c_str(), ios::in|ios::binary); _inputFile->seekg(0, _inputFile->end); _rangeDecoder.setInputFile(_inputFile, true); _nbKmers = CompressionUtils::decodeNumeric(_rangeDecoder, _kmerModel); //cout << _nbKmers << endl; //_rangeEncoder.flush(); //for(u_int64_t blockSize : _blockSizes){ // CompressionUtils::encodeNumeric(_rangeEncoder, _kmerModel, blockSize); //} //_outputFile->fwrite((const char*) _rangeEncoder.getBuffer(true), _rangeEncoder.getBufferSize(), 1); clear(); _rangeDecoder.clear(); _inputFile->seekg(0, _inputFile->beg); _rangeDecoder.setInputFile(_inputFile); /* for(int i=0; i models; vector bankModels; for(int j=0; jexecute(kmer, abundancePerBanks); _nbDecodedKmers += 1; _nbDecodedKmersProgress += 1; if (_nbDecodedKmersProgress > 500000) { _progress->inc (_nbDecodedKmersProgress); _nbDecodedKmersProgress = 0; } } _progress->inc (_nbDecodedKmersProgress); } private: Functor* _functor; gatb::core::tools::dp::IteratorListener* _progress; RangeDecoder _rangeDecoder; ifstream* _inputFile; u_int64_t _nbDecodedKmers; u_int64_t _nbDecodedKmersProgress; }; /********************************************************************* * ** KmerCountDecompressor *********************************************************************/ template class KmerCountDecompressor : 
public gatb::core::tools::misc::impl::Algorithm { public: /** Shortcuts. */ typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; typedef typename Kmer::ModelCanonical::Kmer Kmer; /** */ KmerCountDecompressor(const string& inputDir, int nbCores) : Algorithm("kcc", 0, 0) { //getInput()->setStr(STR_VERBOSE, "1"); _inputDir = inputDir; _nbCores = nbCores; IFile* dskCountDataFile = System::file().newFile(inputDir + "/dsk_count_data", "rb"); vector numbers; string n = ""; while(true){ u_int8_t c = dskCountDataFile->get(); if(c == ' ' || dskCountDataFile->isEOF()){ numbers.push_back(n); n.clear(); } else{ n += c; } if(dskCountDataFile->isEOF()) break; } //for(string& number: numbers){ // cout << number << endl; //} _nbPartitions = atoi(numbers[0].c_str()); _nbBanks = atoi(numbers[1].c_str()); //for(int i=0; i<_nbCores; i++) //for(int i=0; i<_nbPartitions; i++){ // KmerCountCompressorPartition decomp(inputDir, i, _nbBanks); // decomp.iterKmers(_nbCores); //} } ~KmerCountDecompressor(){ delete _progress; } void setupProgress(){ RangeDecoder rangeDecoder; vector kmerModel; for(int i=0; iseekg(0, file->end); rangeDecoder.setInputFile(file, true); u_int64_t nbKmers = CompressionUtils::decodeNumeric(rangeDecoder, kmerModel); totalKmers += nbKmers; rangeDecoder.clear(); for(int j=0; jinit (); } template static void *callMyFunction(void *object){ ((KmerCountDecompressorPartition*)object)->execute(); //object->execute(); return NULL; } void execute(){ } template void iterate (const Functor& functor, size_t groupSize=1000){ setupProgress(); pthread_t* tab_threads = new pthread_t[_nbCores]; //thread_arg_decoder * targ = new thread_arg_decoder [_nbCores]; vector* > _partitionDecompressors; for(int i=0; i<_nbPartitions;){ for(int j=0; j<_nbCores && i<_nbPartitions; j++){ Functor* func = new Functor(functor); KmerCountDecompressorPartition* decomp = new KmerCountDecompressorPartition(_inputDir, i, _nbBanks, func, _progress); _partitionDecompressors.push_back(decomp); i += 1; } //Lala * lala = new Lala(); for(int j=0; j<_partitionDecompressors.size(); j++){ //cout << "start" << endl; pthread_create(&tab_threads[j], NULL, &KmerCountDecompressor::callMyFunction, _partitionDecompressors[j]); //_partitionDecompressors[j]->execute(); //cout << "loulou" << endl; } for(int j=0; j<_partitionDecompressors.size(); j++){ pthread_join(tab_threads[j], NULL); } for(int j=0; j<_partitionDecompressors.size(); j++){ delete _partitionDecompressors[j]; } _partitionDecompressors.clear(); } delete tab_threads; _progress->finish (); } private: string _inputDir; int _nbCores; int _nbBanks; int _nbPartitions; gatb::core::tools::dp::IteratorListener* _progress; //vector* > _partitionDecompressors; }; /********************************************************************************/ } } } } /* end of namespaces. 
*/ /********************************************************************************/ #endif /* _GATB_CORE_KMER_IMPL_KMER_COUNT_COMPRESSOR__HPP_ */ simka-1.5.3/src/core/Simka.cpp000077500000000000000000000257571377312000000161270ustar00rootroot00000000000000/***************************************************************************** * Simka: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2015 INRIA * Authors: G.Benoit, C.Lemaitre, P.Peterlongo * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . *****************************************************************************/ #include "Simka.hpp" #include "SimkaAlgorithm.hpp" IOptionsParser* Simka::createOptionsParser (IOptionsParser* parent) { IOptionsParser* parser = parent; //new OptionsParser ("Simka"); //Main parser parser->push_front (new OptionNoParam (STR_SIMKA_COMPUTE_DATA_INFO, "compute (and display) information before running Simka, such as the number of reads per dataset", false)); parser->push_front (new OptionNoParam (STR_SIMKA_KEEP_TMP_FILES, "keep temporary files", false)); parser->push_front (new OptionOneParam (STR_URI_OUTPUT_TMP, "output directory for temporary files", true)); parser->push_front (new OptionOneParam (STR_URI_OUTPUT, "output directory for result files (distance matrices)", false, "./simka_results")); parser->push_front (new OptionOneParam (STR_URI_INPUT, "input file of samples. One sample per line: id1: filename1...", true)); //parser->push_back (new OptionOneParam (STR_URI_OUTPUT_TMP, "output directory for temporary files", true)); //IOptionsParser* parser = getParser(); //IOptionsParser* dskParser = SortingCountAlgorithm<>::getOptionsParser(); //parser->push_back(dskParser); //dskParser->setVisible(false); //cout << parser->getParser(STR_NB_CORES) << endl; parser->getParser(STR_NB_CORES)->setVisible(false); //parser->push_back(new OptionOneParam(parser->getParser(STR_NB_CORES)->getName(), parser->getParser(STR_NB_CORES)->getHelp(), false, "0")); //parser->push_front(dskParser->getParser (STR_URI_OUTPUT_TMP)); //dskParser->getParser (STR_URI_OUTPUT_TMP)->setMandatory //parser->push_front(dskParser->getParser (STR_URI_OUTPUT)); //parser->getParser (STR_URI_OUTPUT)->setHelp("output directory for result files (similarity matrix, heatmaps)"); //parser->push_front(dskParser->getParser (STR_URI_INPUT)); //parser->getParser(STR_URI_INPUT)->setHelp("input file of datasets. 
One dataset per line: id filename1 filename2...");

	//if (Option* p = dynamic_cast<Option*> (parser->getParser(STR_URI_OUTPUT_TMP))) { p->s; }

	//Distance parser
	IOptionsParser* distanceParser = new OptionsParser ("distance");
	distanceParser->push_back (new OptionNoParam (STR_SIMKA_COMPUTE_ALL_SIMPLE_DISTANCES, "compute all simple distances (Chord, Hellinger...)", false));
	distanceParser->push_back (new OptionNoParam (STR_SIMKA_COMPUTE_ALL_COMPLEX_DISTANCES, "compute all complex distances (Jensen-Shannon...)", false));

	//Kmer parser
	IOptionsParser* kmerParser = new OptionsParser ("kmer");
	kmerParser->push_back (new OptionOneParam (STR_KMER_SIZE, "size of a kmer", false, "21"));
	//kmerParser->push_back(dskParser->getParser (STR_KMER_SIZE));
	//kmerParser->push_back(new OptionOneParam (STR_KMER_PER_READ.c_str(), "number of selected kmers per read", false, "0"));
	//kmerParser->push_back (new OptionOneParam (STR_KMER_ABUNDANCE_MIN, "min abundance a kmer needs to be considered", false, "1"));
	kmerParser->push_back (new OptionOneParam (STR_KMER_ABUNDANCE_MIN, "min abundance a kmer needs to be considered", false, "2"));
	kmerParser->push_back (new OptionOneParam (STR_KMER_ABUNDANCE_MAX, "max abundance a kmer can have to be considered", false, "999999999"));
	//kmerParser->push_back(dskParser->getParser (STR_KMER_ABUNDANCE_MIN));
	//if (Option* p = dynamic_cast<Option*> (parser->getParser(STR_KMER_ABUNDANCE_MIN))) { p->setDefaultValue ("0"); }
	//if (Option* p = dynamic_cast<Option*> (parser->getParser(STR_SOLIDITY_KIND))) { p->setDefaultValue ("all"); }
	//kmerParser->push_back(dskParser->getParser (STR_KMER_ABUNDANCE_MAX));
	//kmerParser->push_back(dskParser->getParser (STR_SOLIDITY_KIND));
	//kmerParser->getParser (STR_SOLIDITY_KIND)->setHelp("TODO");
	//kmerParser->push_back (new OptionNoParam (STR_SIMKA_SOLIDITY_PER_DATASET.c_str(), "do not take into consideration multi-counting when determining solid kmers", false ));
	kmerParser->push_back (new OptionOneParam (STR_SIMKA_MIN_KMER_SHANNON_INDEX.c_str(), "minimal Shannon index a kmer should have to be kept. Float in [0,2]", false, "0" ));

	//Read filter parser
	IOptionsParser* readParser = new OptionsParser ("read");
	readParser->push_back (new OptionOneParam (STR_SIMKA_MAX_READS.c_str(), "maximum number of reads per sample to process. Can be -1: use all reads. Can be 0: estimate it", false, "-1" ));
	readParser->push_back (new OptionOneParam (STR_SIMKA_MIN_READ_SIZE.c_str(), "minimal size a read should have to be kept", false, "0" ));
	readParser->push_back (new OptionOneParam (STR_SIMKA_MIN_READ_SHANNON_INDEX.c_str(), "minimal Shannon index a read should have to be kept. 
Float in [0,2]", false, "0" )); //Core parser IOptionsParser* coreParser = new OptionsParser ("core"); coreParser->push_back(new OptionOneParam(STR_NB_CORES, "number of cores", false, "0")); coreParser->push_back (new OptionOneParam (STR_MAX_MEMORY, "max memory (MB)", false, "5000")); //coreParser->push_back(dskParser->getParser ()); //coreParser->push_back(dskParser->getParser (STR_MAX_DISK)); //Distances //IOptionsParser* distanceParser = new OptionsParser ("distances"); //distanceParser->push_back (new OptionNoParam (STR_SIMKA_DISTANCE_BRAYCURTIS.c_str(), "compute Bray Curtis distance")); //distanceParser->push_back (new OptionNoParam (STR_SIMKA_DISTANCE_CHORD.c_str(), "compute Chord distance")); //distanceParser->push_back (new OptionNoParam (STR_SIMKA_DISTANCE_HELLINGER.c_str(), "compute Hellinger distance")); //distanceParser->push_back (new OptionNoParam (STR_SIMKA_DISTANCE_CANBERRA.c_str(), "compute Canberra distance")); //distanceParser->push_back (new OptionNoParam (STR_SIMKA_DISTANCE_KULCZYNSKI.c_str(), "compute Kulczynski distance")); parser->push_back(distanceParser); parser->push_back(kmerParser); parser->push_back(readParser); parser->push_back(coreParser); //parser->push_back(distanceParser); IOptionsParser* dskParser = SortingCountAlgorithm<>::getOptionsParser(); if (Option* p = dynamic_cast (dskParser->getParser(STR_MINIMIZER_SIZE))) { p->setDefaultValue ("7"); } parser->push_back(dskParser); dskParser->setVisible(false); if (Option* p = dynamic_cast (parser->getParser(STR_SOLIDITY_KIND))) { p->setDefaultValue ("all"); } return parser; } Simka::Simka() : Tool ("Simka") { Simka::createOptionsParser(getParser()); //coreParser->push_back(new OptionOneParam(parser->getParser(STR_NB_CORES)->getName(), parser->getParser(STR_NB_CORES)->getHelp(), false, "0")); //if (IOptionsParser* input = dskParser->getParser (STR_KMER_ABUNDANCE_MIN_THRESHOLD)) { input->setVisible (false); } /* IOptionsParser* parser = getParser(); IOptionsParser* dskParser = SortingCountAlgorithm<>::getOptionsParser(); parser->push_back (dskParser, 1); parser->push_back(dskParser); parser->getParser (STR_URI_INPUT)->setHelp("input file of datasets and their id. 
One dataset per line: dataset_id dataset_filename"); parser->getParser (STR_KMER_ABUNDANCE_MIN_THRESHOLD)->setVisible (false); parser->getParser (STR_HISTOGRAM_MAX)->setVisible (false); parser->getParser (STR_URI_SOLID_KMERS)->setVisible (false); parser->getParser (STR_URI_OUTPUT_DIR)->setHelp("output directory for temporary files"); parser->getParser (STR_URI_OUTPUT)->setHelp("output directory for result files"); parser->getParser (STR_SOLIDITY_KIND)->setHelp("TODO"); parser->getParser (STR_MINIMIZER_TYPE)->setVisible (false); parser->getParser (STR_MINIMIZER_SIZE)->setVisible (false); parser->getParser (STR_REPARTITION_TYPE)->setVisible (false); if (Option* p = dynamic_cast (parser->getParser(STR_KMER_ABUNDANCE_MIN))) { p->setDefaultValue ("0"); } parser->push_back (new OptionNoParam (STR_SOLIDITY_PER_DATASET.c_str(), "Do not take into consideration multi-counting when determining solidity of kmers", false )); */ /* parser->push_back (new OptionOneParam (STR_URI_INPUT, "reads file", true )); parser->push_back (new OptionOneParam (STR_KMER_SIZE, "size of a kmer", false, "31" )); parser->push_back (new OptionOneParam (STR_KMER_ABUNDANCE_MIN,"min abundance threshold for solid kmers", false, "3" )); parser->push_back (new OptionOneParam (STR_KMER_ABUNDANCE_MAX,"min abundance threshold for solid kmers", false, "3" )); parser->push_back (new OptionOneParam (STR_MAX_MEMORY, "max memory (in MBytes)", false, "2000")); parser->push_back (new OptionOneParam (STR_URI_OUTPUT_DIR, "output folder for solid kmers", false)); parser->push_back (new OptionOneParam (STR_URI_OUTPUT, "output file", false)); */ //setParser (parser); } struct Parameter { //Parameter (Simka& simka, IProperties* props) : props(props) {} Parameter (IProperties* props) : _props(props) {} //Simka& _simka; IProperties* _props; /* string _inputFilename; string _outputDir; size_t _kmerSize; pair _abundanceThreshold; bool _soliditySingle;*/ }; template struct Functor { void operator () (Parameter p) { SimkaAlgorithm simkaAlgorithm (p._props); simkaAlgorithm.execute(); /* #ifdef SIMKA_MIN simkaAlgorithm.executeSimkamin(); #else #endif*/ }}; void Simka::execute () { IProperties* input = getInput(); //Parameter params(*this, getInput()); Parameter params(input); size_t kmerSize = getInput()->getInt (STR_KMER_SIZE); /* params._kmerSize = getInput()->getInt (STR_KMER_SIZE); params._inputFilename = input->getStr(STR_URI_INPUT); params._outputDir = input->get(STR_URI_OUTPUT) ? input->getStr(STR_URI_OUTPUT) : "./"; params._abundanceThreshold.first = input->getInt(STR_KMER_ABUNDANCE_MIN); params._abundanceThreshold.second = input->getInt(STR_KMER_ABUNDANCE_MAX); params._soliditySingle = input->get(Simka::STR_SOLIDITY_PER_DATASET); cout << params._soliditySingle << endl; */ /** We launch the tool with the correct Integer implementation according to the choosen kmer size. 
*/ Integer::apply (kmerSize, params); } simka-1.5.3/src/core/Simka.hpp000077500000000000000000000044031377312000000161150ustar00rootroot00000000000000/***************************************************************************** * Simka: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2015 INRIA * Authors: G.Benoit, C.Lemaitre, P.Peterlongo * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . *****************************************************************************/ #ifndef _TOOL_Simka_HPP_ #define _TOOL_Simka_HPP_ /********************************************************************************/ #include /********************************************************************************/ //using namespace gatb::core::system; //using namespace gatb::core::system::impl; //////////////////////////////////////////////////////////////////////////////// // // THIS FILE IS AUTOMATICALLY GENERATED... // // THIS IS A SIMPLE EXAMPLE HOW TO USE THE Tool CLASS. IF YOU WANT MORE FEATURES, // YOU CAN HAVE A LOOK AT THE ToyTool SNIPPET HERE: // // http://gatb-core.gforge.inria.fr/snippets_tools.html // //////////////////////////////////////////////////////////////////////////////// class Simka : public Tool { public: //typedef typename Kmer::Type Type; //typedef typename Kmer::Count Count; //typedef kmer::impl::Kmer<>::ModelDirect KmerModel; // Constructor Simka(); // Actual job done by the tool is here void execute (); static IOptionsParser* createOptionsParser (IOptionsParser* parent); //static void executeAlgorithm (Simka& simka, IProperties* props); private: }; #endif /* _TOOL_Simka_HPP_ */ simka-1.5.3/src/core/SimkaAlgorithm.cpp000077500000000000000000000367071377312000000177730ustar00rootroot00000000000000/***************************************************************************** * Simka: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2015 INRIA * Authors: G.Benoit, C.Lemaitre, P.Peterlongo * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . 
*****************************************************************************/ #include "SimkaAlgorithm.hpp" static const char* strProgressPartitionning = "Simka: Step 1: partitioning "; static const char* strProgressCounting = "Simka: Step 2: counting kmers "; template SimkaAlgorithm::SimkaAlgorithm(IProperties* options) : Algorithm("simka", -1, options) //_progress (0), _tmpPartitionsStorage(0), _tmpPartitions(0) { _options = options; _stats = 0; //_simkaDistance = 0; _banks = 0; //_processor = 0; //string maxDisk = ""; //if(_options->get(STR_MAX_DISK)){ // maxDisk = _options->getStr(STR_MAX_DISK); // cout << maxDisk << endl; //} //_multiStorage = new MultiDiskStorage(_options->getStr(STR_URI_OUTPUT_DIR), _options->getStr(STR_MAX_DISK)); // vector _tempDirMaxDisk _totalKmers = 0; /* if(_options->getInt(STR_VERBOSE) != 0){ cout << "Filter options" << endl; cout << "\tMax reads per dataset: " << _maxNbReads << endl; cout << "\tMin read size: " << _minReadSize << endl; cout << "\tMin Shannon index: " << _minShannonIndex << endl; }*/ //if(_maxNbReads == 0) // _maxNbReads = -1; //cout << _maxNbReads << endl; //cout << _soliditySingle << endl; /* string solidKindStr = _options->getStr(STR_SOLIDITY_KIND); if(solidKindStr == "range"){ _solidKind = SIMKA_SOLID_KIND::RANGE; } else if(solidKindStr == "sum"){ _solidKind = SIMKA_SOLID_KIND::SUM; } cout << solidKindStr << " " << solidKindStr << endl;*/ //_kmerSize = _options->get(STR_KMER_SIZE) ? _options->getInt(STR_KMER_SIZE) : 31; //_abundanceMin = _options->get(STR_KMER_ABUNDANCE_MIN) ? _options->getInt(STR_KMER_ABUNDANCE_MIN) : 0; //_maxMemory = props->get(STR_MAX_MEMORY) ? props->getInt(STR_MAX_MEMORY) : 2000; //_outputTempDir = props->get(STR_URI_OUTPUT_DIR) ? props->getStr(STR_URI_OUTPUT_DIR) : System::file().getDirectory(_inputFilename); //_outputFilename = props->get(STR_URI_OUTPUT) ? props->getStr(STR_URI_OUTPUT) : System::file().getDirectory(_inputFilename) + "/" + System::file().getBaseName(_inputFilename) + "_output.fasta"; //_nbCores = getInput()->getInt(STR_NB_CORES); //cout << "Input filename: " << _inputFilename << endl; //cout << "Kmer size: " << _kmerSize << endl; //cout << "Abundance min: " << _abundanceMin << endl; //cout << "Max memory: " << _maxMemory << endl; //cout << "Output temp dir: " << _outputTempDir << endl; //cout << "Output filename: " << _outputFilename << endl; //_banksInputFilename = _inputFilename + "_dsk_dataset_temp__"; } template SimkaAlgorithm::~SimkaAlgorithm() { } template void SimkaAlgorithm::execute() { /* if(!setup()) return; if(!isInputValid()) return; createBank(); count(); outputMatrix(); //outputHeatmap(); if(_options->getInt(STR_VERBOSE) != 0){ _stats->print(); print(); } clear();*/ } template bool SimkaAlgorithm::setup() { if(! createDirs() ) return false; try{ layoutInputFilename(); } catch (Exception& e){ cout << "Syntax error in input file" << endl; return false; } _nbBanks = _bankNames.size(); return true; } template void SimkaAlgorithm::parseArgs() { _computeSimpleDistances = _options->get(STR_SIMKA_COMPUTE_ALL_SIMPLE_DISTANCES); _computeComplexDistances = _options->get(STR_SIMKA_COMPUTE_ALL_COMPLEX_DISTANCES); _keepTmpFiles = _options->get(STR_SIMKA_KEEP_TMP_FILES); _maxMemory = _options->getInt(STR_MAX_MEMORY); _nbCores = _options->getInt(STR_NB_CORES); _inputFilename = _options->getStr(STR_URI_INPUT); _outputDir = _options->get(STR_URI_OUTPUT) ? _options->getStr(STR_URI_OUTPUT) : "./"; _outputDirTemp = _options->get(STR_URI_OUTPUT_TMP) ? 
_options->getStr(STR_URI_OUTPUT_TMP) : "./"; _kmerSize = _options->getInt(STR_KMER_SIZE); _abundanceThreshold.first = _options->getInt(STR_KMER_ABUNDANCE_MIN); _abundanceThreshold.second = min((u_int64_t)_options->getInt(STR_KMER_ABUNDANCE_MAX), (u_int64_t)(999999999)); //cout << _options->getInt(STR_KMER_ABUNDANCE_MAX) << endl; //cout << _abundanceThreshold.second << endl; _soliditySingle = _options->get(STR_SIMKA_SOLIDITY_PER_DATASET); //_nbMinimizers = _options->getInt(STR_KMER_PER_READ); //_maxDisk = getInput()->getInt(STR_MAX_DISK); //read filter _maxNbReads = _options->getInt(STR_SIMKA_MAX_READS); _minReadSize = _options->getInt(STR_SIMKA_MIN_READ_SIZE); _minReadShannonIndex = _options->getDouble(STR_SIMKA_MIN_READ_SHANNON_INDEX); _minReadShannonIndex = std::max(_minReadShannonIndex, 0.0); _minReadShannonIndex = std::min(_minReadShannonIndex, 2.0); _minKmerShannonIndex = _options->getDouble(STR_SIMKA_MIN_KMER_SHANNON_INDEX); _minKmerShannonIndex = std::max(_minKmerShannonIndex, 0.0); _minKmerShannonIndex = std::min(_minKmerShannonIndex, 2.0); if(!System::file().doesExist(_inputFilename)){ cerr << "ERROR: Input filename does not exist" << endl; exit(1); } } template bool SimkaAlgorithm::createDirs(){ if(!System::file().doesExist(_outputDir)){ int ok = System::file().mkdir(_outputDir, -1); if(ok != 0){ std::cout << "Error: can't create output directory (" << _outputDir << ")" << std::endl; return false; } } _outputDirTemp = _outputDirTemp; if(!System::file().doesExist(_outputDirTemp)){ int ok = System::file().mkdir(_outputDirTemp, -1); if(ok != 0){ std::cout << "Error: can't create output temp directory (" << _outputDirTemp << ")" << std::endl; return false; } } _outputDirTemp = System::file().getRealPath(_outputDirTemp); _outputDirTemp += "/simka_output_temp/"; System::file().mkdir(_outputDirTemp, -1); _options->setStr(STR_URI_OUTPUT_TMP, _outputDirTemp); System::file().mkdir(_outputDirTemp + "/input/", -1); return true; } template void SimkaAlgorithm::layoutInputFilename(){ if(_options->getInt(STR_VERBOSE) != 0){ cout << endl << "Creating input" << endl; } string inputDir = _outputDirTemp + "/input/"; ifstream inputFile(_inputFilename.c_str()); _banksInputFilename = inputDir + "__input_simka__"; //_inputFilename + "_dsk_dataset_temp__"; IFile* bankFile = System::file().newFile(_banksInputFilename, "wb"); string line; string linePart; vector lineIdDatasets; vector linepartPairedDatasets; vector linepartDatasets; string bankFileContents = ""; u_int64_t lineIndex = 0; while(getline(inputFile, line)){ line.erase(std::remove(line.begin(),line.end(),' '),line.end()); if(line == "") continue; //cout << line << endl; lineIdDatasets.clear(); linepartPairedDatasets.clear(); //vector filenames; stringstream lineStream(line); while(getline(lineStream, linePart, ':')){ lineIdDatasets.push_back(linePart); } string bankId = lineIdDatasets[0]; string linePairedDatasets = lineIdDatasets[1]; stringstream linePairedDatasetsStream(linePairedDatasets); while(getline(linePairedDatasetsStream, linePart, ';')){ linepartPairedDatasets.push_back(linePart); } string subBankFilename = inputDir + bankId; IFile* subBankFile = System::file().newFile(subBankFilename, "wb"); //cout << subBankFile->getPath() << endl; string subBankContents = ""; _nbBankPerDataset.push_back(linepartPairedDatasets.size()); for(size_t i=0; ifwrite(subBankContents.c_str(), subBankContents.size(), 1); subBankFile->flush(); delete subBankFile; bankFileContents += inputDir + "/" + bankId + "\n"; lineIndex += 1; _bankNames.push_back(bankId); 
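		// An input line parsed above follows the "id: file1 ; file2"
		// convention. Illustrative sample sheet (file names are made up):
		//
		//     sampleA: reads/A_pair1.fastq ; reads/A_pair2.fastq
		//     sampleB: reads/B.fasta
		//
		// Everything before ':' is the dataset id (bankId); the part after
		// ':' is split on ';' into that dataset's files, which are written
		// one per line to <tmp>/input/<bankId> so that GATB can later open
		// them as a single bank. Spaces are stripped beforehand, so blanks
		// around ':' and ';' are tolerated.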
	}
	inputFile.close();

	bankFileContents.erase(bankFileContents.size()-1);
	bankFile->fwrite(bankFileContents.c_str(), bankFileContents.size(), 1);
	bankFile->flush();
	delete bankFile;

	if(_options->getInt(STR_VERBOSE) != 0){
		cout << "\tNb input datasets: " << _bankNames.size() << endl;
		cout << endl;
	}
}

template<size_t span>
bool SimkaAlgorithm<span>::isInputValid(){

	string inputDir = _outputDirTemp + "/input/";

	for (size_t i=0; i<_nbBanks; i++){
		try{
			IBank* bank = Bank::open(inputDir + _bankNames[i]);
			LOCAL(bank);
		}
		catch (Exception& e){
			cerr << "ERROR: Can't open dataset: " << _bankNames[i] << endl;
			return false;
		}
	}

	return true;
}

template<size_t span>
void SimkaAlgorithm<span>::computeMaxReads(){

	string inputDir = _outputDirTemp + "/input/";

	if(_maxNbReads == 0){
		if(_options->getInt(STR_VERBOSE) != 0)
			cout << "-maxNbReads is not defined. Simka will estimate it..." << endl;
	}

	u_int64_t totalReads = 0;
	u_int64_t minReads = -1; // wraps to the maximum u_int64_t value
	u_int64_t maxReads = 0;
	u_int64_t meanReads = 0;

	if(_maxNbReads == 0 || _options->get(STR_SIMKA_COMPUTE_DATA_INFO)){

		for (size_t i=0; i<_nbBanks; i++){

			IBank* bank = Bank::open(inputDir + _bankNames[i]);
			LOCAL(bank);
			u_int64_t nbReads = bank->estimateNbItems();
			nbReads /= _nbBankPerDataset[i];

			totalReads += nbReads;
			if(nbReads < minReads){
				minReads = nbReads;
				//_smallerBankId = _bankNames[i];
			}
			if(nbReads > maxReads){
				maxReads = nbReads;
				_largerBankId = _bankNames[i];
			}
		}

		meanReads = totalReads / _nbBanks;

		if(_options->getInt(STR_VERBOSE) != 0){
			cout << "Smaller sample contains: " << minReads << " reads" << endl;
			cout << "Larger sample contains: " << maxReads << " reads" << endl;
			cout << "Whole dataset contains a mean of: " << meanReads << " reads" << endl << endl;
		}
	}

	if(_maxNbReads == 0){
		_maxNbReads = (minReads + meanReads) / 2;
		if(_options->getInt(STR_VERBOSE) != 0){
			cout << "Reads per sample used up to: " << _maxNbReads << endl << endl;
		}
	}
	else if(_maxNbReads == -1){
		if(_options->getInt(STR_VERBOSE) != 0)
			cout << "Reads per sample used: all" << endl << endl;
		_maxNbReads = 0;
	}
	else{
		if(_options->getInt(STR_VERBOSE) != 0){
			cout << "Reads per sample used up to: " << _maxNbReads << endl << endl;
		}
	}
}

/*
template<size_t span>
void SimkaAlgorithm<span>::layoutInputFilename(){

	if(_options->getInt(STR_VERBOSE) != 0){
		cout << endl << "Creating input" << endl;
	}

	_banksInputFilename = _inputFilename + "_dsk_dataset_temp__";
	ifstream inputFile(_inputFilename.c_str());
	IFile* bankFile = System::file().newFile(_banksInputFilename, "wb");

	string line;
	string linePart;
	vector<string> linePartList;
	string bankFileContents = "";

	u_int64_t lineIndex = 0;

	while(getline(inputFile, line)){

		if(line == "") continue;

		stringstream lineStream(line);
		linePartList.clear();
		while(getline(lineStream, linePart, ' ')){
			if(linePart != ""){
				linePartList.push_back(linePart);
			}
		}

		string bankId = linePartList[0];
		_bankNames.push_back(bankId);

		//ID and one filename
		if(linePartList.size() == 2){
			bankFileContents += linePartList[1] + "\n";
			_nbBankPerDataset.push_back(1);
		}
		//ID and list of filename (paired files for example)
		else{
			char buffer[200];
			snprintf(buffer, 200, "%llu", lineIndex);
			string subBankFilename = _banksInputFilename + "_" + string(buffer);
			_tempFilenamesToDelete.push_back(subBankFilename);
			IFile* subBankFile = System::file().newFile(subBankFilename, "wb");

			string subBankContents = "";
			for(size_t i=1; i<linePartList.size(); i++){
				subBankContents += linePartList[i] + "\n";
			}

			subBankFile->fwrite(subBankContents.c_str(), subBankContents.size(), 1);
			subBankFile->flush();
			delete subBankFile;

			bankFileContents += subBankFilename + "\n";
			_nbBankPerDataset.push_back(linePartList.size()
- 1); //linePartList.size() - 1 = nb sub banks //_nbReadsPerDataset.push_back(ceil(_maxNbReads / (float)())); } lineIndex += 1; } bankFileContents.erase(bankFileContents.size()-1); //bankFileContents.pop_back(); // "remove last /n bankFile->fwrite(bankFileContents.c_str(), bankFileContents.size(), 1); inputFile.close(); //delete inputFile; bankFile->flush(); delete bankFile; //for(int i=0; i<_nbBanksOfDataset.size(); i++){ // cout << i << " " << _nbBanksOfDataset[i] << endl; //} if(_options->getInt(STR_VERBOSE) != 0){ cout << "\tNb input datasets: " << _bankNames.size() << endl; } cout << endl; }*/ template void SimkaAlgorithm::createBank(){ IBank* bank = Bank::open(_banksInputFilename); SimkaSequenceFilter sequenceFilter(_minReadSize, _minReadShannonIndex); _banks = new SimkaBankFiltered(bank, sequenceFilter, _nbBankPerDataset, _maxNbReads); } template void SimkaAlgorithm::count(){ /* //SimkaDistanceParam distanceParams(_options); _stats = new SimkaStatistics(_nbBanks, _computeSimpleDistances, _computeComplexDistances, _outputDirTemp, _bankNames); SortingCountAlgorithm sortingCount (_banks, _options); // We create a custom count processor and give it to the sorting count algorithm vector dummyVec; _processor = new SimkaCountProcessor (*_stats, _nbBanks, _kmerSize, _abundanceThreshold, _solidKind, _soliditySingle, _minKmerShannonIndex); _processor->use(); sortingCount.addProcessor (_processor); // We launch the algorithm sortingCount.execute(); */ } template void SimkaAlgorithm::outputMatrix(){ _stats->outputMatrix(_outputDir, _bankNames); } template void SimkaAlgorithm::print(){ cout << "Output folder: " << _outputDir << endl; } template void SimkaAlgorithm::clear(){ if(_banks){ //_banks->finalize(); //delete _banks; } System::file().remove(_banksInputFilename); //if(_processor) _processor->forget(); for(size_t i=0; i<_tempFilenamesToDelete.size(); i++){ System::file().remove(_tempFilenamesToDelete[i]); } if(_stats) delete _stats; //if(_simkaDistance) delete _simkaDistance; //_banks->remove(); //delete _processor; } simka-1.5.3/src/core/SimkaAlgorithm.hpp000077500000000000000000000654621377312000000200000ustar00rootroot00000000000000/***************************************************************************** * Simka: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2015 INRIA * Authors: G.Benoit, C.Lemaitre, P.Peterlongo * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . 
*****************************************************************************/ #ifndef TOOLS_SIMKA_SRC_SIMKAALGORITHM_HPP_ #define TOOLS_SIMKA_SRC_SIMKAALGORITHM_HPP_ #include #include "SimkaCommons.hpp" #include #include //#define PRINT_STATS //#define CHI2_TEST //#define SIMKA_POTARA //#define BOOTSTRAP #define MAX_BOOTSTRAP 50 #define NB_BOOTSTRAP 45 //#define SIMKA_FUSION //#define MULTI_PROCESSUS //#define MULTI_DISK //#define SIMKA_MIN #include "SimkaDistance.hpp" enum SIMKA_SOLID_KIND{ RANGE, SUM, }; typedef u_int16_t bankIdType; class SimkaCounterBuilder { public: /** Constructor. * \param[in] nbBanks : number of banks parsed during kmer counting. */ SimkaCounterBuilder (size_t nbBanks=1) : _abundancePerBank(nbBanks) {} /** Get the number of banks. * \return the number of banks. */ size_t size() const { return _abundancePerBank.size(); } /** Initialization of the counting for the current kmer. This method should be called * when a kmer is seen for the first time. * \param[in] idxBank : bank index where the new current kmer has been found. */ void init (size_t idxBank=0) { for (size_t k=0; k<_abundancePerBank.size(); k++) { _abundancePerBank[k]=0; } _abundancePerBank [idxBank]= 1; } /** Increase the abundance of the current kmer for the provided bank index. * \param[in] idxBank : index of the bank */ void increase (size_t idxBank=0) { _abundancePerBank [idxBank] ++; } /** Set the abundance of the current kmer for the provided bank index. * \param[in] idxBank : index of the bank */ void set (CountNumber val, size_t idxBank=0) { _abundancePerBank [idxBank] = val; } /** Get the abundance of the current kmer for the provided bank index. * \param[in] idxBank : index of the bank * \return the abundance of the current kmer for the given bank. */ CountNumber operator[] (size_t idxBank) const { return _abundancePerBank[idxBank]; } /** */ const CountVector& get () const { return _abundancePerBank; } private: CountVector _abundancePerBank; }; /********************************************************************* * ** SimkaCountProcessor *********************************************************************/ template class SimkaCountProcessorSimple{ private: size_t _nbBanks; size_t _kmerSize; //pair _abundanceThreshold; //bool isAbundanceThreshold; SimkaStatistics* _stats; double _totalAbundance; u_int64_t _nbKmerCounted; double _minKmerShannonIndex; //vector _banksOks; vector _sharedBanks; typedef std::pair chi2val_Abundances; struct _chi2ValueSorterFunction { bool operator() (chi2val_Abundances l,chi2val_Abundances r) { return r.first < l.first; } } ; std::priority_queue< chi2val_Abundances, vector, _chi2ValueSorterFunction> _chi2ValueSorter; size_t _maxChi2Values; public: typedef typename Kmer::Type Type; //typedef typename Kmer::Count Count; SimkaCountProcessorSimple(SimkaStatistics* stats, size_t nbBanks, size_t kmerSize, const pair& abundanceThreshold, SIMKA_SOLID_KIND solidKind, bool soliditySingle, double minKmerShannonIndex) : _stats(stats) { _maxChi2Values = 1000; // We configure the vector for the N.(N+1)/2 possible pairs //_countTotal.resize (_nbBanks*(_nbBanks+1)/2); _nbBanks = nbBanks; _kmerSize = kmerSize; //_abundanceThreshold = abundanceThreshold; _minKmerShannonIndex = minKmerShannonIndex; //_localStats = new SimkaStatistics(_nbBanks, _stats._distanceParams); _nbKmerCounted = 0; //isAbundanceThreshold = _abundanceThreshold.first > 1 || _abundanceThreshold.second < 1000000; } void end(){ #ifdef CHI2_TEST size_t nbValues = _chi2ValueSorter.size(); for(size_t i=0; i::Type& kmer, 
const CountVector& counts){ //cout << kmer.toString(_kmerSize) << endl; //for(size_t i=0; i_nbDistinctKmers += 1; for(size_t i=0; i_nbKmers += abundance; _stats->_nbKmersPerBank[i] += abundance; _totalAbundance += abundance; } #endif /* A DEPLACER PENDANT LE COMPTAGE DES KMERS if(_minKmerShannonIndex != 0){ double shannonIndex = getShannonIndex(kmer); if(shannonIndex < _minKmerShannonIndex){ return; } }*/ #ifdef CHI2_TEST float X2j = 0; _totalAbundance = 0; for(size_t i=0; i_datasetNbReads[i]/_stats->_totalReads), 2) / (_stats->_datasetNbReads[i] / (_stats->_totalReads*_totalAbundance)); } //std::chi_squared_distribution distribution(_nbBanks-1); //double pvalue = chisqr(_nbBanks-1, X2j); /* if(lala> 100){ for(size_t i=0; i<_chi2ValueSorter.size(); i++){ double val = _chi2ValueSorter.top(); _chi2ValueSorter.pop(); cout << val << endl; } return; }*/ //cout << X2j << endl; if(_chi2ValueSorter.size() > _maxChi2Values){ if(X2j > _chi2ValueSorter.top().first){ _chi2ValueSorter.push(pair(X2j, counts)); _chi2ValueSorter.pop(); } } else{ _chi2ValueSorter.push(pair(X2j, counts)); } //cout << _chi2ValueSorter.size() << " " << X2j << " " << _chi2ValueSorter.top() << endl; //cout << X2j << " " << pvalue << endl; return; /* cout << kmer.toString(_kmerSize) << " ["; for(size_t i=0; i 0.01) return; #endif /* //for(size_t i=0; i<_datasetNbReads.size(); i++) // cout << i << " " << _datasetNbReads[i] << endl; //cout << _totalReads << " " << _totalAbundance << endl; //float Ri = 500000; //float Rtotal = Ri * _nbBanks; //float Ntotal = _totalAbundance; float X2j = 0; for(size_t i=0; i_nbSolidKmers += 1; // computeStats(counts); //} //else{ updateDistance(counts); //else // computeStats(counts); //_stats->_nbSolidKmers += 1; } void updateDistance(const CountVector& counts){ _sharedBanks.clear(); for(size_t i=0; i_computeSimpleDistances) updateDistanceSimple(counts); if(_stats->_computeComplexDistances) updateDistanceComplex(counts); } void updateDistanceDefault(const CountVector& counts){ for(size_t ii=0; ii<_sharedBanks.size(); ii++){ for(size_t jj=ii+1; jj<_sharedBanks.size(); jj++){ u_int16_t i = _sharedBanks[ii]; u_int16_t j = _sharedBanks[jj]; size_t symetricIndex = j + ((_nbBanks-1)*i) - (i*(i-1)/2); u_int64_t abundanceI = counts[i]; u_int64_t abundanceJ = counts[j]; _stats->_matrixNbSharedKmers[i][j] += counts[i]; _stats->_matrixNbSharedKmers[j][i] += counts[j]; _stats->_matrixNbDistinctSharedKmers[symetricIndex] += 1; //cout << i << " " << j << " " << (j + ((_nbBanks-1)*i) - (i*(i-1)/2)) << endl; _stats->_brayCurtisNumerator[symetricIndex] += min(abundanceI, abundanceJ); } } } void updateDistanceSimple(const CountVector& counts){ for(size_t ii=0; ii<_sharedBanks.size(); ii++){ for(size_t jj=ii+1; jj<_sharedBanks.size(); jj++){ u_int16_t i = _sharedBanks[ii]; u_int16_t j = _sharedBanks[jj]; u_int64_t abundanceI = counts[i]; u_int64_t abundanceJ = counts[j]; //cout << _stats->_chord_sqrt_N2[i] << endl; //_stats->_chord_NiNj[i][j] += abundanceI * abundanceJ; _stats->_chord_NiNj[i][j] += abundanceI * abundanceJ; _stats->_hellinger_SqrtNiNj[i][j] += sqrt(abundanceI * abundanceJ); _stats->_kulczynski_minNiNj[i][j] += min(abundanceI, abundanceJ); } } } void updateDistanceComplex(const CountVector& counts){ //_sharedBanks.clear(); //for(size_t i=0; i 0) double abundanceI = counts[i]; double abundanceJ = counts[j]; if(abundanceJ){ //_stats->_matrixNbSharedKmers[i][j] += abundanceI; //_stats->_matrixNbSharedKmers[j][i] += abundanceJ; //_stats->_matrixNbDistinctSharedKmers[i][j] += 1; //_stats->_chord_NiNj[i][j] 
+= abundanceI * abundanceJ; //_stats->_chord_NiNj[i][j] += (abundanceI * abundanceJ) / (_stats->_chord_sqrt_N2[i]*_stats->_chord_sqrt_N2[j]); //_stats->_hellinger_SqrtNiNj[i][j] += sqrt(abundanceI * abundanceJ); //_stats->_kulczynski_minNiNj[i][j] += min(abundanceI, abundanceJ); double yX = abundanceJ * _stats->_nbSolidKmersPerBank[i]; double xY = abundanceI * _stats->_nbSolidKmersPerBank[j]; xi = (double)abundanceI / _stats->_nbSolidKmersPerBank[i]; d1 = xi * log((2*xY) / (xY + yX)); //xY = abundanceI * _stats->_nbSolidKmersPerBank[j]; //yX = abundanceJ * _stats->_nbSolidKmersPerBank[i]; xj = (double)abundanceJ / _stats->_nbSolidKmersPerBank[j]; d2 = xj * log((2*yX) / (xY + yX)); } else{ d2 = 0; double yX = abundanceJ * _stats->_nbSolidKmersPerBank[i]; double xY = abundanceI * _stats->_nbSolidKmersPerBank[j]; xi = (double)abundanceI / _stats->_nbSolidKmersPerBank[i]; d1 = xi * log((2*xY) / (xY + yX)); } /* if(abundanceI){ double yX = abundanceJ * _stats->_nbSolidKmersPerBank[i]; double xY = abundanceI * _stats->_nbSolidKmersPerBank[j]; xi = (double)abundanceI / _stats->_nbSolidKmersPerBank[i]; d1 = xi * log((2*xY) / (xY + yX)); } else{ d1 = 0; } if(abundanceJ){ double xY = abundanceI * _stats->_nbSolidKmersPerBank[j]; double yX = abundanceJ * _stats->_nbSolidKmersPerBank[i]; xj = (double)abundanceJ / _stats->_nbSolidKmersPerBank[j]; d2 = xj * log((2*yX) / (xY + yX)); } else{ d2 = 0; }*/ _stats->_kullbackLeibler[i][j] += d1 + d2; _stats->_canberra[i][j] += abs(abundanceI - abundanceJ) / (abundanceI + abundanceJ); //_stats->_brayCurtisNumerator[i][j] += abs(abundanceI - abundanceJ); _stats->_whittaker_minNiNj[i][j] += abs((int)((u_int64_t)(abundanceI*_stats->_nbSolidKmersPerBank[j]) - (u_int64_t)(abundanceJ*_stats->_nbSolidKmersPerBank[i]))); //cout << _stats->_nbSolidKmersPerBank[i] << endl; } } else{ //Here, we know that (abundanceI == 0) for(size_t jj=0; jj<_sharedBanks.size(); jj++){ u_int16_t j = _sharedBanks[jj]; if(i > j) continue; double abundanceI = counts[i]; double abundanceJ = counts[j]; d1 = 0; double xY = abundanceI * _stats->_nbSolidKmersPerBank[j]; double yX = abundanceJ * _stats->_nbSolidKmersPerBank[i]; xj = (double)abundanceJ / _stats->_nbSolidKmersPerBank[j]; d2 = xj * log((2*yX) / (xY + yX)); _stats->_kullbackLeibler[i][j] += d1 + d2; _stats->_canberra[i][j] += abs(abundanceI - abundanceJ) / (abundanceI + abundanceJ); //_stats->_brayCurtisNumerator[i][j] += abs(abundanceI - abundanceJ); //cout << _stats->_nbSolidKmersPerBank[i] << endl; _stats->_whittaker_minNiNj[i][j] += abs((int)((u_int64_t)(abundanceI*_stats->_nbSolidKmersPerBank[j]) - (u_int64_t)(abundanceJ*_stats->_nbSolidKmersPerBank[i]))); } } } /* return; double xi = 0; double xj = 0; double d1 = 0; double d2 = 0; #ifdef PRINT_STATS int nbBanksThatHaveKmer = 0; #endif //u_int64_t totalAbundance = 0; for(size_t i=0; i_nbSolidDistinctKmersPerBank[i] += 1; //_stats->_nbSolidKmersPerBank[i] += abundanceI; //_stats->_chord_N2[i] += pow(abundanceI, 2); } #endif for(size_t j=i+1; j_matrixNbSharedKmers[i][j] += abundanceI; _stats->_matrixNbSharedKmers[j][i] += abundanceJ; _stats->_matrixNbDistinctSharedKmers[i][j] += 1; //_stats->_chord_NiNj[i][j] += abundanceI * abundanceJ; _stats->_chord_NiNj[i][j] += (abundanceI * abundanceJ) / (_stats->_chord_sqrt_N2[i]*_stats->_chord_sqrt_N2[j]); _stats->_hellinger_SqrtNiNj[i][j] += sqrt(abundanceI * abundanceJ); _stats->_kulczynski_minNiNj[i][j] += min(abundanceI, abundanceJ); _stats->_whittaker_minNiNj[i][j] += abs((int)((u_int64_t)(abundanceI*_stats->_nbSolidKmersPerBank[j]) - 
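/* The d1/d2 terms accumulated just above feed _kullbackLeibler, which
   outputMatrix() writes as "mat_abundance_jensenshannon": with
   xi = abundanceI/Ni and xj = abundanceJ/Nj the normalized abundances of the
   current k-mer (Ni, Nj = _nbSolidKmersPerBank), each k-mer contributes
   xi*log(2*xi/(xi+xj)) + xj*log(2*xj/(xi+xj)). The code clears the fractions
   first: with xY = abundanceI*Nj and yX = abundanceJ*Ni,
   2*xY/(xY+yX) == 2*xi/(xi+xj). */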
(u_int64_t)(abundanceJ*_stats->_nbSolidKmersPerBank[i]))); } if(abundanceI){ double yX = abundanceJ * _stats->_nbSolidKmersPerBank[i]; double xY = abundanceI * _stats->_nbSolidKmersPerBank[j]; xi = (double)abundanceI / _stats->_nbSolidKmersPerBank[i]; d1 = xi * log((2*xY) / (xY + yX)); } else{ d1 = 0; } if(abundanceJ){ double xY = abundanceI * _stats->_nbSolidKmersPerBank[j]; double yX = abundanceJ * _stats->_nbSolidKmersPerBank[i]; xj = (double)abundanceJ / _stats->_nbSolidKmersPerBank[j]; d2 = xj * log((2*yX) / (xY + yX)); } else{ d2 = 0; } _stats->_kullbackLeibler[i][j] += d1 + d2; _stats->_canberra[i][j] += abs(abundanceI - abundanceJ) / (abundanceI + abundanceJ); _stats->_brayCurtisNumerator[i][j] += abs(abundanceI - abundanceJ); //cout << _stats->_nbSolidKmersPerBank[i] << endl; _stats->_whittaker_minNiNj[i][j] += abs((int)((u_int64_t)(abundanceI*_stats->_nbSolidKmersPerBank[j]) - (u_int64_t)(abundanceJ*_stats->_nbSolidKmersPerBank[i]))); } } }*/ #ifdef PRINT_STATS _stats->_nbDistinctKmersSharedByBanksThreshold[nbBanksThatHaveKmer-1] += 1; _stats->_nbKmersSharedByBanksThreshold[nbBanksThatHaveKmer-1] += _totalAbundance; if(_totalAbundance == 1){ //if( == 1){ _stats->_nbErroneousKmers += 1; //} } //else if(nbBanksThatHaveKmer == counter.size()){ //} #endif } //inline bool isSolidVector(const CountVector& counts); double getShannonIndex(const Type& kmer){ float index = 0; //float freq [5]; vector _freqs(4, 0); //char* seqStr = seq.getDataBuffer(); for (size_t i=0; i<_kmerSize; i++){ _freqs[kmer[i]] += 1.0; //seq[sizeKmer-i-1] = bin2NT [(*this)[i]]; } // Frequency of each letter (A, C, G, T or N) //for(size_t i=0; i < seq.size(); i++) // _freqs[nt2binTab[(unsigned char)seq[i]]] += 1.0; // Shannon index calculation for (size_t i=0; i<_freqs.size(); i++){ _freqs[i] /= (float) _kmerSize; if (_freqs[i] != 0) index += _freqs[i] * log (_freqs[i]) / log(2); } return abs(index); } double approx_gamma(double Z) { const double RECIP_E = 0.36787944117144232159552377016147; // RECIP_E = (E^-1) = (1.0 / E) const double TWOPI = 6.283185307179586476925286766559; // TWOPI = 2.0 * PI double D = 1.0 / (10.0 * Z); D = 1.0 / ((12 * Z) - D); D = (D + Z) * RECIP_E; D = pow(D, Z); D *= sqrt(TWOPI / Z); return D; } static double igf(double S, double Z) { if(Z < 0.0) { return 0.0; } double Sc = (1.0 / S); Sc *= pow(Z, S); Sc *= exp(-Z); double Sum = 1.0; double Nom = 1.0; double Denom = 1.0; for(int I = 0; I < 200; I++) { Nom *= Z; S++; Denom *= S; Sum += (Nom / Denom); } return Sum * Sc; } double chisqr(int Dof, double Cv) { if(Cv < 0 || Dof < 1) { return 0.0; } double K = ((double)Dof) * 0.5; double X = Cv * 0.5; if(Dof == 2) { return exp(-1.0 * X); } double PValue = igf(K, X); //if(isnan(PValue) || isinf(PValue) || PValue <= 1e-8) //{ // return 1e-14; //} PValue /= approx_gamma(K); //PValue /= tgamma(K); return PValue; //return (1.0 - PValue); } }; /********************************************************************************/ /** * */ /* template class SimkaTruncateIterator : public TruncateIterator { public: SimkaTruncateIterator (Iterator* ref, u_int64_t limit, bool initRef=true) : TruncateIterator(*ref, limit, initRef), _ref2(0){ setRef(ref); } private: Iterator* _ref2; void setRef (Iterator* ref2) { SP_SETATTR(ref2); } };*/ template class SimkaBankFiltered : public BankDelegate { public: u_int64_t _refNbReads; u_int64_t _refTotalSeqSize; u_int64_t _refMaxReadSize; /** Constructor. * \param[in] ref : referred bank. * \param[in] filter : functor that filters sequence. 
*/
    SimkaBankFiltered (IBank* ref, const Filter& filter, const vector<size_t>& nbPaireds, u_int64_t maxReads) : BankDelegate (ref), _filter(filter) {
        _nbPaireds = nbPaireds;
        _maxReads = maxReads;
        _nbBanks = ref->getCompositionNb();
        ref->estimate(_refNbReads, _refTotalSeqSize, _refMaxReadSize);
        //cout << _refNbReads << endl;
        //cout << _refTotalSeqSize << endl;
        //cout << _refMaxReadSize << endl;
    }

    void estimate (u_int64_t& number, u_int64_t& totalSize, u_int64_t& maxSize){

        if(_maxReads == 0){
            number = _refNbReads;
            totalSize = _refTotalSeqSize;
            maxSize = _refMaxReadSize;
        }
        else{
            u_int64_t maxReads = 0;
            for(size_t i=0; i<_nbBanks; i++){
                maxReads += _maxReads * _nbPaireds[i];
            }
            //cout << _refNbReads << endl;
            //cout << _maxReads*_nbBanks << endl;
            maxReads = min (maxReads, _refNbReads);
            //cout << "ha " << maxReads << endl;

            if(maxReads == _refNbReads){
                number = _refNbReads;
                totalSize = _refTotalSeqSize;
                maxSize = _refMaxReadSize;
            }
            else{
                number = maxReads;
                double factor = (double)maxReads / (double)_refNbReads;
                totalSize = _refTotalSeqSize * factor;
                maxSize = _refMaxReadSize;
            }
        }

        //number = _maxReads;
        //totalSize = (_totalSizeRef*_nbReadToProcess)/_numberRef;
        //maxSize = _maxSizeRef;
        //cout << number2 << endl;
        //u_int64_t readSize = totalSize2 / number2;
        //cout << "lal:" << number2 << endl;
        //number = _maxReads;
        //number = _nbReadToProcess;
        //totalSize = _nbReadToProcess*readSize;
        //maxSize = readSize;

        cout << number << endl;
        //cout << totalSize << endl;
        //cout << maxSize << endl;
    }

    /** \copydoc tools::collections::Iterable::iterator */
    Iterator<Sequence>* iterator () {
        //cout << endl << "---" << endl;
        //cout << "lala" << endl;

        // We create one iterator from the reference
        Iterator<Sequence>* it = _ref->iterator ();

        // We get the composition for this iterator
        std::vector<Iterator<Sequence>*> iterators = it->getComposition();

        //if (iterators.size() == 1) { return new FilterIterator<Sequence,Filter> (it, _filter); }
        //else
        //{
        // We are going to create a new CompositeIterator, we won't need the one we just got from the reference
        LOCAL(it);

        // We may have to encapsulate each sub iterator with the filter.
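/* A minimal sketch of that encapsulation, assuming GATB's generic
   FilterIterator<Item,Filter> (the commented-out single-iterator case above
   uses the same pattern):

       for (size_t i = 0; i < iterators.size(); i++)
           iterators[i] = new FilterIterator<Sequence,Filter> (iterators[i], _filter);

   The loop below goes one step further: every sub iterator is wrapped in a
   SimkaInputIterator so that the paired files of each dataset (_nbPaireds[i])
   and the per-dataset read cap (_maxReads, cf. the -max-reads option) are
   enforced in addition to the sequence filter. */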
for (size_t i=0; i (iterators[i], _filter); } else{ //We create a truncated iterator that stop processing reads when _nbReadsPerDataset[i] is reached //cout << _nbReadsPerDataset[i] << endl; //CancellableIterator* truncIt = new CancellableIterator(*iterators[i]); Filter filter(_filter); //filter.setMaxReads(_nbReadsPerDataset[i]); //filter.setIt(truncIt); #ifdef BOOTSTRAP srand (time(NULL)); size_t nbBootstrap = 0; vector iSBoostrap(MAX_BOOTSTRAP); while(nbBootstrap != NB_BOOTSTRAP){ int index = rand() % iSBoostrap.size(); if(!iSBoostrap[index]){ iSBoostrap[index] = true; nbBootstrap += 1; } } filter.setBootstrap(iSBoostrap); #endif FilterIterator* filterIt = new FilterIterator (iterators[i], filter); iterators[i] = filterIt; }*/ //Iterator* it = iterators[i]; //std::vector*> iterators_ = it->getComposition(); iterators[i] = new SimkaInputIterator (iterators[i], _nbPaireds[i], _maxReads, _filter); } return new CompositeIterator (iterators); } private: vector _nbPaireds; Filter _filter; u_int64_t _maxReads; size_t _nbBanks; }; /********************************************************************* * ** SimkaAlgorithm *********************************************************************/ template class SimkaAlgorithm : public Algorithm { public: typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; typedef typename Kmer::ModelCanonical ModelCanonical; typedef typename ModelCanonical::Kmer KmerType; SimkaAlgorithm(IProperties* options); ~SimkaAlgorithm(); void execute(); void print(); //void executeSimkamin(); static string toString(u_int64_t value){ char buffer[40]; snprintf(buffer, 30, "%llu", value); return string(buffer); } protected: bool setup(); bool isInputValid(); void parseArgs(); bool createDirs(); void computeMaxReads(); void layoutInputFilename(); void createBank(); void count(); void outputMatrix(); //void dumpMatrix(const string& outputFilename, const vector >& matrix); //void outputHeatmap(); //void __outputHeatmap(const string& outputFilenamePrefix, const string& matrixPercFilename, const string& matrixNormFilename); void clear(); u_int64_t _maxMemory; size_t _nbCores; string _outputDir; string _outputDirTemp; size_t _nbBanks; string _inputFilename; size_t _kmerSize; pair _abundanceThreshold; SIMKA_SOLID_KIND _solidKind; bool _soliditySingle; int64_t _maxNbReads; size_t _minReadSize; double _minReadShannonIndex; double _minKmerShannonIndex; size_t _nbMinimizers; //size_t _nbCores; SimkaStatistics* _stats; //SimkaDistance* _simkaDistance; string _banksInputFilename; vector _tempFilenamesToDelete; IBank* _banks; IProperties* _options; vector _bankNames; //vector _nbReadsPerDataset; string _outputFilenameSuffix; u_int64_t _totalKmers; vector _nbBankPerDataset; string _largerBankId; bool _computeSimpleDistances; bool _computeComplexDistances; bool _keepTmpFiles; //string _matDksNormFilename; //string _matDksPercFilename; //string _matAksNormFilename; //string _matAksPercFilename; //string _heatmapDksFilename; //string _heatmapAksFilename; /* gatb::core::tools::dp::IteratorListener* _progress; void setProgress (gatb::core::tools::dp::IteratorListener* progress) { SP_SETATTR(progress); } size_t _nbPartitions; std::vector > _nbKmersPerPartitionPerBank; vector > _nbk_per_radix_per_part;//number of kxmer per parti per rad Storage* _tmpPartitionsStorage; void setPartitionsStorage (Storage* tmpPartitionsStorage) { SP_SETATTR(tmpPartitionsStorage); } Partition* _tmpPartitions; void setPartitions (Partition* tmpPartitions) { SP_SETATTR(tmpPartitions); } vector 
_nbKmerPerPartitions; int getSizeofPerItem () const { return Type::getSize()/8 + sizeof(bankIdType); } std::vector getNbCoresList(); //this->_local_pInfo.incKmer_and_rad (p, radix_kxmer.getVal(), kx_size); //nb of superkmer per x per parti per radix //vector _speciesAbundancePerDataset; //MultiDiskStorage* _multiStorage; //u_int64_t _maxDisk; */ }; #endif /* TOOLS_SIMKA_SRC_SIMKAALGORITHM_HPP_ */ simka-1.5.3/src/core/SimkaAlgorithmTemplate.cpp.in000077500000000000000000000026341377312000000220640ustar00rootroot00000000000000/***************************************************************************** * Simka: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2015 INRIA * Authors: G.Benoit, C.Lemaitre, P.Peterlongo * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . *****************************************************************************/ #include // since we didn't define the functions in a .h file, that trick removes linker errors, // see http://www.parashift.com/c++-faq-lite/separate-template-class-defn-from-decl.html // (last example) // also, to reduce compilation time, I'm splitting it into several (8) files that will be compiled in parallel template class SimkaAlgorithm <${KSIZE}>;simka-1.5.3/src/core/SimkaCommons.hpp000066400000000000000000000250671377312000000174570ustar00rootroot00000000000000/* * SimkaCommons.h * * Created on: 24 juin 2017 * Author: gbenoit */ #ifndef SIMKA1_4_SRC_CORE_SIMKACOMMONS_HPP_ #define SIMKA1_4_SRC_CORE_SIMKACOMMONS_HPP_ #include const string STR_SIMKA_SOLIDITY_PER_DATASET = "-solidity-single"; const string STR_SIMKA_MAX_READS = "-max-reads"; const string STR_SIMKA_MIN_READ_SIZE = "-min-read-size"; const string STR_SIMKA_MIN_READ_SHANNON_INDEX = "-min-shannon-index"; const string STR_SIMKA_MIN_KMER_SHANNON_INDEX = "-kmer-shannon-index"; const string STR_KMER_PER_READ = "-kmer-per-read"; const string STR_SIMKA_COMPUTE_ALL_SIMPLE_DISTANCES= "-simple-dist"; const string STR_SIMKA_COMPUTE_ALL_COMPLEX_DISTANCES = "-complex-dist"; const string STR_SIMKA_KEEP_TMP_FILES = "-keep-tmp"; const string STR_SIMKA_COMPUTE_DATA_INFO = "-data-info"; class SimkaCommons { public: SimkaCommons(); virtual ~SimkaCommons(); static void checkInputValidity(const string& outputDirTemp, const string& inputFilename, u_int64_t& nbDatasets){ if(!System::file().doesExist(inputFilename)){ cout << "ERROR: Input does not exists (" + inputFilename + ")" << endl; exit(1); } nbDatasets = 0; bool error = false; //string inputDir = _outputDirTemp; // + "/input/"; ifstream inputFile(inputFilename.c_str()); //ofstream outputFileIds(_outputFilenameIds.c_str(), ios::binary); //_banksInputFilename = inputDir + "__input_simka__"; //_inputFilename + "_dsk_dataset_temp__"; //IFile* bankFile = System::file().newFile(_banksInputFilename, "wb"); string line; string linePart; vector lineIdDatasets; vector linepartPairedDatasets; 
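/* Illustrative input line for the parser below (dataset id and file names
   are made up):
       A1: sampleA_R1.fastq ; sampleA_R2.fastq
   Spaces are stripped first; the token before ':' is the dataset id and ';'
   separates the paired files of that dataset. Each file list is then written
   to a temporary sub-bank which is test-opened with Bank::open() to validate
   the dataset. */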
vector linepartDatasets; //string bankFileContents = ""; u_int64_t lineIndex = 0; u_int64_t bankIdBytePos = 0; while(getline(inputFile, line)){ line.erase(std::remove(line.begin(),line.end(),' '),line.end()); if(line == "") continue; //cout << line << endl; lineIdDatasets.clear(); linepartPairedDatasets.clear(); //vector filenames; stringstream lineStream(line); while(getline(lineStream, linePart, ':')){ lineIdDatasets.push_back(linePart); } string bankId = lineIdDatasets[0]; string linePairedDatasets = lineIdDatasets[1]; stringstream linePairedDatasetsStream(linePairedDatasets); while(getline(linePairedDatasetsStream, linePart, ';')){ linepartPairedDatasets.push_back(linePart); } string subBankFilename = outputDirTemp + bankId; IFile* subBankFile = System::file().newFile(subBankFilename, "wb"); //cout << subBankFile->getPath() << endl; string subBankContents = ""; //_nbBankPerDataset.push_back(linepartPairedDatasets.size()); for(size_t i=0; ifwrite(subBankContents.c_str(), subBankContents.size(), 1); subBankFile->flush(); delete subBankFile; //bankFileContents += inputDir + "/" + bankId + "\n"; lineIndex += 1; try{ IBank* bank = Bank::open(subBankFilename); LOCAL(bank); nbDatasets += 1; } catch (Exception& e){ cerr << "ERROR: Can't open dataset: " << bankId << endl; error = true; } System::file().remove(subBankFilename); } inputFile.close(); if(error) exit(1); } }; template class SimkaInputIterator : public Iterator { public: /** Constructor. * \param[in] ref : the referred iterator * \param[in] initRef : will call 'first' on the reference if true */ SimkaInputIterator(Iterator* refs, size_t nbBanks, u_int64_t maxReads, Filter filter) : _filter(filter), _mainref(0) { setMainref(refs); _ref = _mainref->getComposition()[0]; _isDone = false; _nbDatasets = nbBanks; _nbBanks = _mainref->getComposition().size() / _nbDatasets; _maxReads = maxReads; _nbReadProcessed = 0; _currentBank = 0; _currentInternalBank = 0; _currentDataset = 0; } bool isFinished(){ if(_currentDataset == _nbDatasets){ _isDone = true; return true; } return false; } void nextDataset(){ _currentDataset += 1; if(isFinished()) return; _currentBank = _currentDataset * _nbBanks; _currentInternalBank = 0; _nbReadProcessed = 0; if(isFinished()) return; _ref = _mainref->getComposition()[_currentBank]; _isDone = false; first(); //nextBank(); } void nextBank(){ //cout << "next bank" << endl; //cout << "next bank "<< endl; _currentInternalBank += 1; if(_currentInternalBank == _nbBanks){ nextDataset(); } else{ _isDone = false; _currentBank += 1; _ref = _mainref->getComposition()[_currentBank]; first(); } } void first() { _ref->first(); while (!_ref->isDone() && _filter(_ref->item())==false) _ref->next(); _isDone = _ref->isDone(); if(!_isDone) *(this->_item) = _ref->item(); } void next(){ if(isFinished()){ _isDone = true; return; } //cout << "haha" << endl; _ref->next(); while (!_ref->isDone() && _filter(_ref->item())==false) _ref->next(); _isDone = _ref->isDone(); //cout << "haha" << endl; //if(!_isDone){ //cout << _currentBank << " " << _isDone << endl; //} //cout << _nbReadProcessed << " " << _currentBank << " " << _nbBanks << " " << _maxReads << endl; if(_isDone){ if(isFinished()){ //cout << _nbReadProcessed << endl; return; } else{ //cout << _nbReadProcessed << endl; nextBank(); if(isFinished()){ //cout << _nbReadProcessed << endl; return; } } } else{ *(this->_item) = _ref->item(); _nbReadProcessed += 1; } if(_maxReads && _nbReadProcessed >= _maxReads){ if(isFinished()) return; else nextDataset(); } } /** \copydoc Iterator::isDone */ 
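/* SimkaInputIterator above walks a flattened composition of
   _nbDatasets * _nbBanks file iterators: it exhausts the _nbBanks paired
   files of the current dataset (nextBank), then jumps to the next dataset
   (nextDataset), resetting _nbReadProcessed so that a non-zero _maxReads
   caps each dataset independently. */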
bool isDone() { return _isDone; } /** \copydoc Iterator::item */ Item& item () { return *(this->_item); } private: bool _isDone; size_t _currentBank; //vector* > _refs; Iterator* _ref; size_t _nbBanks; u_int64_t _maxReads; Filter _filter; u_int64_t _nbReadProcessed; size_t _currentInternalBank; size_t _currentDataset; size_t _nbDatasets; Iterator* _mainref; void setMainref (Iterator* mainref) { SP_SETATTR(mainref); } }; struct SimkaSequenceFilter { //u_int64_t _maxNbReads; //u_int64_t _maxNbReadsPerBank; //u_int64_t _nbReadProcessed; //CancellableIterator* _it; //int* _bankIndex; //int* _datasetIndex; SimkaSequenceFilter(size_t minReadSize, double minShannonIndex){ //_maxNbReads = 0; //_nbReadProcessed = 0; _minReadSize = minReadSize; _minShannonIndex = minShannonIndex; } #ifdef BOOTSTRAP vector _bootstraps; void setBootstrap(vector& bootstraps){ _bootstraps = bootstraps; //for(size_t i=0; i<_bootstraps.size(); i++) // cout << _bootstraps[i]; //cout << endl << endl; } #endif //void setMaxReads(u_int64_t maxReads){ // _maxNbReads = maxReads; //} //void setIt(CancellableIterator* it){ // _it = it; //} bool operator() (Sequence& seq){ //cout << seq.toString() << endl; //cout << _nbReadProcessed << endl; //if(_maxNbReads != 0){ // if(_nbReadProcessed >= _maxNbReads){ // _it->_cancel = true; // return false; // } //} //cout << seq.getIndex() << " " << _nbReadProcessed << endl; #ifdef BOOTSTRAP int readPerBootstrap = _maxNbReads / MAX_BOOTSTRAP; int bootstrapIndex = seq.getIndex() / readPerBootstrap; if(!_bootstraps[bootstrapIndex]) return false; //cout << bootstrapIndex << endl; #endif if(!isReadSizeValid(seq)) return false; if(!isShannonIndexValid(seq)) return false; //cout << _nbReadProcessed << endl; //_nbReadProcessed += 1; return true; } bool isReadSizeValid(Sequence& seq){ if(_minReadSize == 0) return true; return seq.getDataSize() >= _minReadSize; } bool isShannonIndexValid(Sequence& seq){ if(_minShannonIndex == 0) return true; return getShannonIndex(seq) >= _minShannonIndex; } float getShannonIndex(Sequence& seq){ static char nt2binTab[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, //69 0, 3, 0, 0, 0, 0, 0, 0, 4, 0, //79 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; float index = 0; //float freq [5]; vector _freqs(5, 0); char* seqStr = seq.getDataBuffer(); // Frequency of each letter (A, C, G, T or N) for(size_t i=0; i < seq.getDataSize(); i++) _freqs[nt2binTab[(unsigned char)seqStr[i]]] += 1.0; // Shannon index calculation for (size_t i=0; i<_freqs.size(); i++){ _freqs[i] /= (float) seq.getDataSize(); if (_freqs[i] != 0) index += _freqs[i] * log (_freqs[i]) / log(2); } return abs(index); } size_t _minReadSize; double _minShannonIndex; }; template class SimkaPotaraBankFiltered : public BankDelegate { public: SimkaPotaraBankFiltered (IBank* ref, const Filter& filter, u_int64_t maxReads, size_t nbDatasets) : BankDelegate (ref), _ref2(0), _filter(filter) { _maxReads = maxReads; _nbDatasets = nbDatasets; setRef2(_ref->iterator ()); } ~SimkaPotaraBankFiltered(){ std::vector*> itBanks = _ref2->getComposition(); for(size_t i=0; i setRef2(0); } Iterator* iterator () { return new SimkaInputIterator (_ref2, _nbDatasets, _maxReads, _filter); } private: Iterator* _ref2; void setRef2 (Iterator* ref2) { SP_SETATTR(ref2); } u_int64_t _maxReads; Filter _filter; u_int64_t _nbReadToProcess; size_t _datasetId; size_t 
_nbDatasets; }; #endif /* SIMKA1_4_SRC_CORE_SIMKACOMMONS_H_ */ simka-1.5.3/src/core/SimkaDistance.cpp000077500000000000000000001224651377312000000175740ustar00rootroot00000000000000/***************************************************************************** * Simka: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2015 INRIA * Authors: G.Benoit, C.Lemaitre, P.Peterlongo * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . *****************************************************************************/ #include "SimkaDistance.hpp" SimkaStatistics::SimkaStatistics(size_t nbBanks, bool computeSimpleDistances, bool computeComplexDistances, const string& tmpDir, const vector& datasetIds) { _nbBanks = nbBanks; _symetricDistanceMatrixSize = (_nbBanks*(_nbBanks+1))/2; _computeSimpleDistances = computeSimpleDistances; _computeComplexDistances = computeComplexDistances; //_nbBanks = 10000; _nbKmers = 0; _nbDistinctKmers = 0; _nbSolidKmers = 0; _nbErroneousKmers = 0; _nbSharedKmers = 0; //_abundanceMin = abundanceMin; //_mutex = mutex; //_outputDir = outputDir; _datasetNbReads.resize(_nbBanks, 0); _nbSolidDistinctKmersPerBank.resize(_nbBanks, 0); _nbSolidKmersPerBank.resize(_nbBanks, 0); _nbKmersPerBank.resize(_nbBanks, 0); //_nbDistinctKmersSharedByBanksThreshold.resize(_nbBanks, 0); //_nbKmersSharedByBanksThreshold.resize(_nbBanks, 0); _matrixNbDistinctSharedKmers.resize(_symetricDistanceMatrixSize); _matrixNbSharedKmers.resize(_nbBanks); _brayCurtisNumerator.resize(_symetricDistanceMatrixSize); for(size_t i=0; i<_nbBanks; i++){ //_matrixNbDistinctSharedKmers[i].resize(nbBanks, 0); _matrixNbSharedKmers[i].resize(nbBanks, 0); //_brayCurtisNumerator[i].resize(nbBanks, 0); //_kullbackLeibler[i].resize(nbBanks, 0); } if(_computeSimpleDistances){ //_abundance_jaccard_intersection.resize(_nbBanks); //for(size_t i=0; i<_nbBanks; i++){ // _abundance_jaccard_intersection[i].resize(nbBanks, 0); //} _chord_NiNj.resize(_nbBanks); _chord_sqrt_N2.resize(_nbBanks); //_chord_N2j.resize(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ _chord_NiNj[i].resize(nbBanks, 0); //_chord_N2i[i].resize(nbBanks, 0); //_chord_N2j[i].resize(nbBanks, 0); } _hellinger_SqrtNiNj.resize(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ _hellinger_SqrtNiNj[i].resize(nbBanks, 0); } _kulczynski_minNiNj.resize(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ _kulczynski_minNiNj[i].resize(nbBanks, 0); } } if(_computeComplexDistances){ _whittaker_minNiNj.resize(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ _whittaker_minNiNj[i].resize(nbBanks, 0); } _kullbackLeibler.resize(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ _kullbackLeibler[i].resize(nbBanks, 0); } _canberra.resize(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ _canberra[i].resize(nbBanks, 0); } } _totalReads = 0; for(size_t i=0; i<_nbBanks; i++){ string name = datasetIds[i]; string countFilename = tmpDir + "/count_synchro/" + 
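/* Presumed layout of the <dataset id>.ok files under count_synchro/, read
   just below, one value per line: (1) number of reads, (2) number of solid
   distinct k-mers, (3) number of solid k-mers, and, when simple distances
   are enabled, (4) the summed squared k-mer abundances whose square root is
   kept as _chord_sqrt_N2. */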
name + ".ok"; string line; ifstream file(countFilename.c_str()); vector lines; while(getline(file, line)){ if(line == "") continue; lines.push_back(line); } file.close(); u_int64_t nbReads = strtoull(lines[0].c_str(), NULL, 10); _datasetNbReads[i] = nbReads; _nbSolidDistinctKmersPerBank[i] = strtoull(lines[1].c_str(), NULL, 10); _nbSolidKmersPerBank[i] = strtoull(lines[2].c_str(), NULL, 10); if(_computeSimpleDistances){ _chord_sqrt_N2[i] = sqrt(strtoull(lines[3].c_str(), NULL, 10)); } _totalReads += nbReads; /* for (size_t j=0; j<_nbCores; j++){ DistanceCommand* cmd = dynamic_cast*>(_cmds[j]); cmd->_stats->_datasetNbReads[i] = nbReads; cmd->_stats->_nbSolidDistinctKmersPerBank[i] = strtoull(lines[1].c_str(), NULL, 10); cmd->_stats->_nbSolidKmersPerBank[i] = strtoull(lines[2].c_str(), NULL, 10); cmd->_stats->_chord_sqrt_N2[i] = sqrt(strtoull(lines[3].c_str(), NULL, 10)); }*/ } } SimkaStatistics& SimkaStatistics::operator+= (const SimkaStatistics& other){ _nbKmers += other._nbKmers; _nbDistinctKmers += other._nbDistinctKmers; _nbSolidKmers += other._nbSolidKmers; _nbErroneousKmers += other._nbErroneousKmers; _nbSharedKmers += other._nbSharedKmers; for(size_t i=0; i<_nbBanks; i++){ _nbKmersPerBank[i] += other._nbKmersPerBank[i]; //_nbSolidDistinctKmersPerBank[i] += other._nbSolidDistinctKmersPerBank[i]; //_nbSolidKmersPerBank[i] += other._nbSolidKmersPerBank[i]; //_nbDistinctKmersSharedByBanksThreshold[i] += other._nbDistinctKmersSharedByBanksThreshold[i]; //_nbKmersSharedByBanksThreshold[i] += other._nbKmersSharedByBanksThreshold[i]; //if(_distanceParams._computeChord) //_chord_sqrt_N2[i] += other._chord_sqrt_N2[i]; } for(size_t i=0; i<_symetricDistanceMatrixSize; i++){ _brayCurtisNumerator[i] += other._brayCurtisNumerator[i]; _matrixNbDistinctSharedKmers[i] += other._matrixNbDistinctSharedKmers[i]; } for(size_t i=0; i<_nbBanks; i++){ for(size_t j=0; j<_nbBanks; j++){ _matrixNbSharedKmers[i][j] += other._matrixNbSharedKmers[i][j]; } } if(_computeSimpleDistances){ for(size_t i=0; i<_nbBanks; i++){ for(size_t j=0; j<_nbBanks; j++){ _chord_NiNj[i][j] += other._chord_NiNj[i][j]; _hellinger_SqrtNiNj[i][j] += other._hellinger_SqrtNiNj[i][j]; _kulczynski_minNiNj[i][j] += other._kulczynski_minNiNj[i][j]; } } } if(_computeComplexDistances){ for(size_t i=0; i<_nbBanks; i++){ for(size_t j=0; j<_nbBanks; j++){ _canberra[i][j] += other._canberra[i][j]; _whittaker_minNiNj[i][j] += other._whittaker_minNiNj[i][j]; _kullbackLeibler[i][j] += other._kullbackLeibler[i][j]; } } } return *this; } void SimkaStatistics::print(){ u_int64_t nbKmers = 0; u_int64_t nbDistinctKmersAfterMerging = _nbDistinctKmers; u_int64_t nbDistinctKmers = 0; u_int64_t nbSharedDistinctKmers = _nbSharedKmers; u_int64_t nbSharedKmers = 0; double meanCoverage = 0; for(size_t i=0; i<_nbBanks; i++){ nbKmers += _nbSolidKmersPerBank[i]; nbDistinctKmers += _nbSolidDistinctKmersPerBank[i]; float coverage = (double)_nbSolidKmersPerBank[i] / (double)_nbSolidDistinctKmersPerBank[i]; //cout << coverage << endl; meanCoverage += coverage; //nbDistinctKmers += _nbDistinctKmers; //for(size_t j=i+1; j<_nbBanks; j++){ // nbSharedDistinctKmers += _matrixNbDistinctSharedKmers[i][j]; // nbSharedKmers += _matrixNbSharedKmers[i][j]; //} } meanCoverage /= _nbBanks; u_int64_t totalReads = 0; u_int64_t minReads = -1; u_int64_t maxReads = 0; for (size_t i=0; i<_nbBanks; i++){ u_int64_t nbReads = _datasetNbReads[i]; //nbReads /= _nbBankPerDataset[i]; totalReads += nbReads; if(nbReads < minReads){ minReads = nbReads; //_smallerBankId = _bankNames[i]; } if(nbReads > 
maxReads){ maxReads = nbReads; } } u_int64_t meanReads = totalReads / _nbBanks; cout << endl << "Stats" << endl; cout << "\tReads" << endl; cout << "\t\tTotal: " << totalReads << " " << totalReads/1000000 << "M" << " " << totalReads/1000000000 << "G" << endl; cout << "\t\tMin: " << minReads << " " << minReads/1000000 << "M" << " " << minReads/1000000000 << "G" << endl; cout << "\t\tMax: " << maxReads << " " << maxReads/1000000 << "M" << " " << maxReads/1000000000 << "G" << endl; cout << "\t\tAverage: " << meanReads << " " << meanReads/1000000 << "M" << " " << meanReads/1000000000 << "G" << endl; cout << "\tKmers" << endl; cout << "\t\tDistinct Kmers (before merging): " << nbDistinctKmers << " " << nbDistinctKmers/1000000 << "M" << " " << nbDistinctKmers/1000000000 << "G" << endl; cout << "\t\tDistinct Kmers (after merging): " << nbDistinctKmersAfterMerging << " " << nbDistinctKmersAfterMerging/1000000 << "M" << " " << nbDistinctKmersAfterMerging/1000000000 << "G" << endl; cout << "\t\tShared distinct Kmers: " << nbSharedDistinctKmers << " " << nbSharedDistinctKmers/1000000 << "M" << " " << nbSharedDistinctKmers/1000000000 << "G" << endl; cout << "\t\tKmers: " << nbKmers << " " << nbKmers/1000000 << "M" << " " << nbKmers/1000000000 << "G" << endl; cout << "\t\tMean k-mer coverage: " << meanCoverage << endl; //cout << "\t\tShared distinct kmers: " << (int)((long double) nbSharedDistinctKmers / (long double)nbDistinctKmers * 100) << "% " << nbSharedDistinctKmers << " " << nbSharedDistinctKmers/1000000 << "M" << " " << nbSharedDistinctKmers/1000000000 << "G" << endl; //cout << "\t\tShared kmers: " << (int)((long double) nbSharedKmers / (long double)nbKmers * 100) << "% " << nbSharedKmers << " " << nbSharedKmers/1000000 << "M" << " " << nbSharedKmers/1000000000 << "G" << endl; cout << endl; return; //cout.precision(4); cout << endl << endl; //return; u_int64_t solidAbundance = 0; //for(int i=0; i<_nbSolidKmersPerBankAbundance.size(); i++) // solidAbundance += _nbSolidKmersPerBankAbundance[i]; for(size_t i=0; i<_nbKmersSharedByBanksThreshold.size(); i++) solidAbundance += _nbKmersSharedByBanksThreshold[i]; cout << "Statistics on kmer intersections:" << endl; cout << "\tNb kmers: " << _nbKmers << " " << _nbKmers / 1000000 << " M" << " " << _nbKmers / 1000000000 << " G" << endl; cout << endl; cout << "\tNb distinct kmers: " << _nbDistinctKmers << " " << _nbDistinctKmers / 1000000 << " M" << " " << _nbDistinctKmers / 1000000000 << " G" << " " << (100*_nbDistinctKmers)/(float)_nbKmers << "%" << endl; cout << "\tNb solid kmers: " << _nbSolidKmers << " " << _nbSolidKmers / 1000000 << " M" << " " << _nbSolidKmers / 1000000000 << " G" << " " << (100*_nbSolidKmers)/(float)_nbDistinctKmers << "% distinct" << " " << (100*solidAbundance) / (double)_nbKmers << "% abundance" << endl; //for(int i=0; i<_nbBanks; i++){ //cout << "Nb kmers (M) " << i << ": " << _nbSolidKmersPerBank[i] << endl << endl; //} cout << endl; cout << "\tPotentially erroneous (Kmers appearing only one time in a single bank): " << endl; cout << "\t\t" << _nbErroneousKmers << " " << _nbErroneousKmers / 1000000 << " M" << " " << _nbErroneousKmers / 1000000000 << " G" << " " << (100*_nbErroneousKmers)/(float)_nbDistinctKmers << "% distinct" << " " << (100*_nbErroneousKmers)/(float)_nbKmers << "% abundance" << endl; cout << endl; cout << "\tKmer shared by T banks :" << endl; for(size_t i=0; i<_nbBanks; i++){ cout << "\t\tShared by " << i+1 << " banks:"; cout << endl; cout << "\t\t\tDistinct: " << _nbDistinctKmersSharedByBanksThreshold[i] << " 
"; if(_nbSolidKmers > 0){ cout << (_nbDistinctKmersSharedByBanksThreshold[i]*100) / (float)_nbSolidKmers << "%"; } else{ cout << "0%"; } cout << endl; cout << "\t\t\tAbundance: " << _nbKmersSharedByBanksThreshold[i] << " "; if(solidAbundance > 0){ cout << (_nbKmersSharedByBanksThreshold[i]*100) / (float)solidAbundance << "%"; } else{ cout << "0%"; } if(_nbDistinctKmersSharedByBanksThreshold[i] > 0){ cout << endl; cout << "\t\t\tMean abundance per bank: " << _nbKmersSharedByBanksThreshold[i] / _nbDistinctKmersSharedByBanksThreshold[i] / (float) _nbBanks; } cout << endl; } //cout << endl; //cout << "Nb kmers in all banks (max/min > 10): " << _nbKmersInCoupleBankSupRatio << " " << (_nbKmersInCoupleBankSupRatio*100) / (float)_nbSolidKmers << "%" << endl; cout << endl << endl; } void SimkaStatistics::load(const string& filename){ IterableGzFile* file = new IterableGzFile(filename); Iterator* it = file->iterator(); LOCAL(it); it->first(); //_nbBanks = it->item(); it->next(); _computeSimpleDistances = it->item(); it->next(); _computeComplexDistances = it->item(); it->next(); //cout << _computeSimpleDistances << " " << _computeComplexDistances << endl; _nbKmers = it->item(); it->next(); _nbErroneousKmers = it->item(); it->next(); _nbDistinctKmers = it->item(); it->next(); _nbSolidKmers = it->item(); it->next(); _nbSharedKmers = it->item(); it->next(); for(size_t i=0; i<_nbBanks; i++){ _nbSolidDistinctKmersPerBank[i] = it->item(); it->next();} for(size_t i=0; i<_nbBanks; i++){ _nbKmersPerBank[i] = it->item(); it->next();} for(size_t i=0; i<_nbBanks; i++){ _nbSolidKmersPerBank[i] = it->item(); it->next();} //for(size_t i=0; i<_nbBanks; i++){ _nbDistinctKmersSharedByBanksThreshold[i] = it->item(); it->next();} //for(size_t i=0; i<_nbBanks; i++){ _nbKmersSharedByBanksThreshold[i] = it->item(); it->next();} for(size_t i=0; i<_nbBanks; i++){ //cout << i << endl; //cout << _nbBanks << endl; //cout << _matrixNbDistinctSharedKmers[i].size() << endl; for(size_t j=0; j<_nbBanks; j++){ _matrixNbSharedKmers[i][j] = it->item(); it->next();} //for(size_t j=0; j<_nbBanks; j++){ _abundance_jaccard_intersection[i][j] = it->item(); it->next();} } for(size_t i=0; i<_symetricDistanceMatrixSize; i++){ _matrixNbDistinctSharedKmers[i] = it->item(); it->next(); _brayCurtisNumerator[i] = it->item(); it->next(); } if(_computeSimpleDistances){ for(size_t i=0; i<_nbBanks; i++){ _chord_sqrt_N2[i] = it->item(); it->next();} for(size_t i=0; i<_nbBanks; i++){ for(size_t j=0; j<_nbBanks; j++){ _chord_NiNj[i][j] = it->item(); it->next();} for(size_t j=0; j<_nbBanks; j++){ _hellinger_SqrtNiNj[i][j] = it->item(); it->next();} for(size_t j=0; j<_nbBanks; j++){ _kulczynski_minNiNj[i][j] = it->item(); it->next();} } } if(_computeComplexDistances){ for(size_t i=0; i<_nbBanks; i++){ for(size_t j=0; j<_nbBanks; j++){ _canberra[i][j] = it->item(); it->next();} for(size_t j=0; j<_nbBanks; j++){ _whittaker_minNiNj[i][j] = it->item(); it->next();} for(size_t j=0; j<_nbBanks; j++){ _kullbackLeibler[i][j] = it->item(); it->next();} } } delete file; /* Storage::istream is (group, "simkaStats"); //is.read ((char*)&_nbBanks, sizeof(_nbBanks)); is.read ((char*)&_nbKmers, sizeof(_nbKmers)); is.read ((char*)&_nbErroneousKmers, sizeof(_nbErroneousKmers)); is.read ((char*)&_nbDistinctKmers, sizeof(_nbDistinctKmers)); is.read ((char*)&_nbSolidKmers, sizeof(_nbSolidKmers)); is.read ((char*)_nbSolidDistinctKmersPerBank.data(), sizeof(u_int64_t)*_nbBanks); is.read ((char*)_nbKmersPerBank.data(), sizeof(u_int64_t)*_nbBanks); is.read 
((char*)_nbSolidKmersPerBank.data(), sizeof(u_int64_t)*_nbBanks); is.read ((char*)_nbDistinctKmersSharedByBanksThreshold.data(), sizeof(u_int64_t)*_nbBanks); is.read ((char*)_nbKmersSharedByBanksThreshold.data(), sizeof(u_int64_t)*_nbBanks); for(size_t i=0; i<_nbBanks; i++){ is.read ((char*)_matrixNbDistinctSharedKmers[i].data(), sizeof(u_int64_t)*_nbBanks); is.read ((char*)_matrixNbSharedKmers[i].data(), sizeof(u_int64_t)*_nbBanks); } //is.read ((char*)&_distanceParams._computeBrayCurtis, sizeof(_distanceParams._computeBrayCurtis)); //is.read ((char*)&_distanceParams._computeCanberra, sizeof(_distanceParams._computeCanberra)); //is.read ((char*)&_distanceParams._computeChord, sizeof(_distanceParams._computeChord)); //is.read ((char*)&_distanceParams._computeHellinger, sizeof(_distanceParams._computeHellinger)); //is.read ((char*)&_distanceParams._computeKulczynski, sizeof(_distanceParams._computeKulczynski)); //if(_distanceParams._computeBrayCurtis) for(size_t i=0; i<_nbBanks; i++) is.read ((char*)_brayCurtisNumerator[i].data(), sizeof(u_int64_t)*_nbBanks); //if(_distanceParams._computeCanberra) for(size_t i=0; i<_nbBanks; i++) is.read ((char*)_canberra[i].data(), sizeof(u_int64_t)*_nbBanks); //if(_distanceParams._computeChord){ is.read ((char*)_chord_N2.data(), sizeof(u_int64_t)*_nbBanks); for(size_t i=0; i<_nbBanks; i++) is.read ((char*)_chord_NiNj[i].data(), sizeof(u_int64_t)*_nbBanks); //} //if(_distanceParams._computeHellinger) for(size_t i=0; i<_nbBanks; i++) is.read ((char*)_hellinger_SqrtNiNj[i].data(), sizeof(u_int64_t)*_nbBanks); //if(_distanceParams._computeKulczynski) for(size_t i=0; i<_nbBanks; i++) is.read ((char*)_kulczynski_minNiNj[i].data(), sizeof(u_int64_t)*_nbBanks); */ } void SimkaStatistics::save (const string& filename){ BagGzFile* file = new BagGzFile(filename); //file->insert(_nbBanks); file->insert((long double)_computeSimpleDistances); file->insert((long double)_computeComplexDistances); file->insert((long double)_nbKmers); file->insert((long double)_nbErroneousKmers); file->insert((long double)_nbDistinctKmers); file->insert((long double)_nbSolidKmers); file->insert((long double)_nbSharedKmers); for(size_t i=0; i<_nbBanks; i++){ file->insert((long double)_nbSolidDistinctKmersPerBank[i]);} for(size_t i=0; i<_nbBanks; i++){ file->insert((long double)_nbKmersPerBank[i]);} for(size_t i=0; i<_nbBanks; i++){ file->insert((long double)_nbSolidKmersPerBank[i]);} //for(size_t i=0; i<_nbBanks; i++){ file->insert((long double)_nbDistinctKmersSharedByBanksThreshold[i]);} //for(size_t i=0; i<_nbBanks; i++){ file->insert((long double)_nbKmersSharedByBanksThreshold[i]);} for(size_t i=0; i<_nbBanks; i++){ //cout << i << endl; //cout << _nbBanks << endl; //cout << _matrixNbDistinctSharedKmers[i].size() << endl; for(size_t j=0; j<_nbBanks; j++){ file->insert((long double)_matrixNbSharedKmers[i][j]);} //for(size_t j=0; j<_nbBanks; j++){ file->insert((long double)_abundance_jaccard_intersection[i][j]);} } for(size_t i=0; i<_symetricDistanceMatrixSize; i++){ file->insert((long double)_matrixNbDistinctSharedKmers[i]); file->insert((long double)_brayCurtisNumerator[i]); } if(_computeSimpleDistances){ for(size_t i=0; i<_nbBanks; i++){ file->insert((long double)_chord_sqrt_N2[i]);} for(size_t i=0; i<_nbBanks; i++){ for(size_t j=0; j<_nbBanks; j++){ file->insert((long double)_chord_NiNj[i][j]);} for(size_t j=0; j<_nbBanks; j++){ file->insert((long double)_hellinger_SqrtNiNj[i][j]);} for(size_t j=0; j<_nbBanks; j++){ file->insert((long double)_kulczynski_minNiNj[i][j]);} } } 
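/* This save() (and the matching load() above) streams every counter as a
   long double record through the gzip-backed Bag/Iterable files, in a fixed
   order: the two distance-mode flags, the five global k-mer counters, the
   per-bank vectors, the full NxN _matrixNbSharedKmers, one interleaved pair
   of records (_matrixNbDistinctSharedKmers, _brayCurtisNumerator) per
   symmetric index, then the optional simple- and complex-distance
   accumulators. load() must read exactly what save() wrote, in the same
   order. */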
if(_computeComplexDistances){ for(size_t i=0; i<_nbBanks; i++){ for(size_t j=0; j<_nbBanks; j++){ file->insert((long double)_canberra[i][j]);} for(size_t j=0; j<_nbBanks; j++){ file->insert((long double)_whittaker_minNiNj[i][j]);} for(size_t j=0; j<_nbBanks; j++){ file->insert((long double)_kullbackLeibler[i][j]);} } } /* file->insert(_nbKmersPerBank, 0); file->insert(_nbSolidKmersPerBank, 0); file->insert(_nbDistinctKmersSharedByBanksThreshold, 0); file->insert(_nbKmersSharedByBanksThreshold, 0); file->insert(_chord_N2, 0); for(size_t i=0; i<_nbBanks; i++){ file->insert(_matrixNbDistinctSharedKmers[i], 0); file->insert(_matrixNbSharedKmers[i], 0); file->insert(_brayCurtisNumerator[i], 0); file->insert(_canberra[i], 0); file->insert(_chord_NiNj[i], 0); file->insert(_hellinger_SqrtNiNj[i], 0); file->insert(_whittaker_minNiNj[i], 0); //cout << _kullbackLeibler[i][j] << endl; //file->insert(_kullbackLeibler[i], 0); file->insert(_kulczynski_minNiNj[i], 0); //for(size_t j=0; j<_nbBanks; j++){ // cout << _kullbackLeibler[i][j] << endl; //} }*/ file->flush(); delete file; /* cout << "loulou2" << endl; Storage::ostream os (group, "simkaStats"); cout << "loulou3" << endl; //os.write ((const char*)&_nbBanks, sizeof(_nbBanks)); os.write ((const char*)&_nbKmers, sizeof(_nbKmers)); os.write ((const char*)&_nbErroneousKmers, sizeof(_nbErroneousKmers)); os.write ((const char*)&_nbDistinctKmers, sizeof(_nbDistinctKmers)); os.write ((const char*)&_nbSolidKmers, sizeof(_nbSolidKmers)); cout << "loulou4" << endl; os.write ((const char*)_nbSolidDistinctKmersPerBank.data(), sizeof(u_int64_t)*_nbBanks); os.write ((const char*)_nbKmersPerBank.data(), sizeof(u_int64_t)*_nbBanks); os.write ((const char*)_nbSolidKmersPerBank.data(), sizeof(u_int64_t)*_nbBanks); os.write ((const char*)_nbDistinctKmersSharedByBanksThreshold.data(), sizeof(u_int64_t)*_nbBanks); os.write ((const char*)_nbKmersSharedByBanksThreshold.data(), sizeof(u_int64_t)*_nbBanks); cout << "loulou5" << endl; for(size_t i=0; i<_nbBanks; i++){ os.write ((const char*)_matrixNbDistinctSharedKmers[i].data(), sizeof(u_int64_t)*_nbBanks); os.write ((const char*)_matrixNbSharedKmers[i].data(), sizeof(u_int64_t)*_nbBanks); } //os.write ((const char*)&_distanceParams._computeBrayCurtis, sizeof(_distanceParams._computeBrayCurtis)); //os.write ((const char*)&_distanceParams._computeCanberra, sizeof(_distanceParams._computeCanberra)); //os.write ((const char*)&_distanceParams._computeChord, sizeof(_distanceParams._computeChord)); //os.write ((const char*)&_distanceParams._computeHellinger, sizeof(_distanceParams._computeHellinger)); //os.write ((const char*)&_distanceParams._computeKulczynski, sizeof(_distanceParams._computeKulczynski)); cout << "loulou6" << endl; //if(_distanceParams._computeBrayCurtis) for(size_t i=0; i<_nbBanks; i++) os.write ((const char*)_brayCurtisNumerator[i].data(), sizeof(u_int64_t)*_nbBanks); cout << "loulou7" << endl; //if(_distanceParams._computeCanberra) for(size_t i=0; i<_nbBanks; i++) os.write ((const char*)_canberra[i].data(), sizeof(u_int64_t)*_nbBanks); cout << "loulou8" << endl; //if(_distanceParams._computeChord){ os.write ((const char*)_chord_N2.data(), sizeof(u_int64_t)*_nbBanks); for(size_t i=0; i<_nbBanks; i++) os.write ((const char*)_chord_NiNj[i].data(), sizeof(u_int64_t)*_nbBanks); //} cout << "loulou9" << endl; //if(_distanceParams._computeHellinger) for(size_t i=0; i<_nbBanks; i++) os.write ((const char*)_hellinger_SqrtNiNj[i].data(), sizeof(u_int64_t)*_nbBanks); cout << "loulou10" << endl; 
//if(_distanceParams._computeKulczynski) for(size_t i=0; i<_nbBanks; i++) os.write ((const char*)_kulczynski_minNiNj[i].data(), sizeof(u_int64_t)*_nbBanks); cout << "loulou11" << endl; os.flush();*/ } void SimkaStatistics::outputMatrix(const string& outputDir, const vector& bankNames){ SimkaDistance _simkaDistance(*this); _outputFilenameSuffix = ""; char buffer[200]; //string strKmerSize = "_k"; //snprintf(buffer,200,"%llu",_kmerSize); //strKmerSize += string(buffer); //_outputFilenameSuffix += strKmerSize; dumpMatrix(outputDir, bankNames, "mat_presenceAbsence_chord", _simkaDistance._matrix_presenceAbsence_chordHellinger()); dumpMatrix(outputDir, bankNames, "mat_presenceAbsence_whittaker", _simkaDistance._matrix_presenceAbsence_Whittaker()); dumpMatrix(outputDir, bankNames, "mat_presenceAbsence_kulczynski", _simkaDistance._matrix_presenceAbsence_kulczynski()); dumpMatrix(outputDir, bankNames, "mat_presenceAbsence_braycurtis", _simkaDistance._matrix_presenceAbsence_sorensenBrayCurtis()); dumpMatrix(outputDir, bankNames, "mat_presenceAbsence_jaccard", _simkaDistance._matrix_presenceAbsence_jaccardCanberra()); dumpMatrix(outputDir, bankNames, "mat_presenceAbsence_simka-jaccard", _simkaDistance._matrix_presenceAbsence_jaccard_simka()); dumpMatrix(outputDir, bankNames, "mat_presenceAbsence_simka-jaccard_asym", _simkaDistance._matrix_presenceAbsence_jaccard_simka_asym()); dumpMatrix(outputDir, bankNames, "mat_presenceAbsence_ochiai", _simkaDistance._matrix_presenceAbsence_ochiai()); dumpMatrix(outputDir, bankNames, "mat_abundance_simka-jaccard", _simkaDistance._matrixSymJaccardAbundance()); dumpMatrix(outputDir, bankNames, "mat_abundance_simka-jaccard_asym", _simkaDistance._matrixAsymJaccardAbundance()); dumpMatrix(outputDir, bankNames, "mat_abundance_ab-ochiai", _simkaDistance._matrixOchiai()); dumpMatrix(outputDir, bankNames, "mat_abundance_ab-sorensen", _simkaDistance._matrixSorensen()); dumpMatrix(outputDir, bankNames, "mat_abundance_ab-jaccard", _simkaDistance._matrixJaccardAbundance()); const vector >& matrix = _simkaDistance._matrixBrayCurtis(); dumpMatrix(outputDir, bankNames, "mat_abundance_braycurtis", matrix); dumpMatrix(outputDir, bankNames, "mat_abundance_jaccard", _simkaDistance.computeJaccardDistanceFromBrayCurtis(matrix)); if(_computeSimpleDistances){ //dumpMatrix(outputDir, bankNames, "mat_abundance_braycurtis-simple", _simkaDistance._matrixJaccardIntersection); dumpMatrix(outputDir, bankNames, "mat_abundance_chord", _simkaDistance._matrixChord()); dumpMatrix(outputDir, bankNames, "mat_abundance_hellinger", _simkaDistance._matrixHellinger()); dumpMatrix(outputDir, bankNames, "mat_abundance_kulczynski", _simkaDistance._matrixKulczynski()); } if(_computeComplexDistances){ dumpMatrix(outputDir, bankNames, "mat_abundance_whittaker", _simkaDistance._matrixWhittaker()); dumpMatrix(outputDir, bankNames, "mat_abundance_jensenshannon", _simkaDistance._matrixKullbackLeibler()); dumpMatrix(outputDir, bankNames, "mat_abundance_canberra", _simkaDistance._matrixCanberra()); } } void SimkaStatistics::dumpMatrix(const string& outputDir, const vector& bankNames, const string& outputFilename, const vector >& matrix){ string filename = outputDir + "/" + outputFilename + ".csv"; gzFile out = gzopen((filename + ".gz").c_str(),"wb"); //char buffer[200]; string str; for(size_t i=0; ifwrite(str.c_str(), str.size(), 1); //file->flush(); //delete file; } SimkaDistance::SimkaDistance(SimkaStatistics& stats) : _stats(stats){ _nbBanks = _stats._nbBanks; //AnB is symetrical //for(size_t i=0; i<_nbBanks; i++) 
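/* Everything commented out below, down to the end of this constructor, is
   the old eager computation that filled every distance matrix here. It is
   kept as reference only: outputMatrix() above now builds each matrix on
   demand through the _matrix*() methods declared in SimkaDistance.hpp. */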
// for(size_t j=i+1; j<_nbBanks; j++) // _stats._matrixNbDistinctSharedKmers[j][i] = _stats._matrixNbDistinctSharedKmers[i][j]; /* u_int64_t a, b, c; u_int64_t b; u_int64_t c; _matrixJaccardAbundance = createSquaredMatrix(_nbBanks); _matrixBrayCurtis = createSquaredMatrix(_nbBanks); //_matrixJaccardIntersection = createSquaredMatrix(_nbBanks); _matrixSymJaccardAbundance = createSquaredMatrix(_nbBanks); _matrixAsymJaccardAbundance = createSquaredMatrix(_nbBanks); _matrixOchiai = createSquaredMatrix(_nbBanks); _matrixSorensen = createSquaredMatrix(_nbBanks); _matrix_presenceAbsence_sorensenBrayCurtis = createSquaredMatrix(_nbBanks); _matrix_presenceAbsence_Whittaker = createSquaredMatrix(_nbBanks); _matrix_presenceAbsence_kulczynski = createSquaredMatrix(_nbBanks); _matrix_presenceAbsence_ochiai = createSquaredMatrix(_nbBanks); _matrix_presenceAbsence_chordHellinger = createSquaredMatrix(_nbBanks); _matrix_presenceAbsence_jaccardCanberra = createSquaredMatrix(_nbBanks); _matrix_presenceAbsence_jaccard_simka = createSquaredMatrix(_nbBanks); _matrix_presenceAbsence_jaccard_simka_asym = createSquaredMatrix(_nbBanks); double dist = 0; for(size_t i=0; i<_nbBanks; i++){ //SpeciesAbundanceVectorType& X_i = _stats._speciesAbundancePerDataset[i]; //for(size_t j=0; j<_nbBanks; j++){ for(size_t j=i+1; j<_nbBanks; j++){ //SpeciesAbundanceVectorType& X_j = _stats._speciesAbundancePerDataset[j]; get_abc(i, j, a, b ,c); //PresenceAbsence chord hellinger dist = distance_presenceAbsence_chordHellinger(a, b, c); _matrix_presenceAbsence_chordHellinger[i][j] = dist; _matrix_presenceAbsence_chordHellinger[j][i] = dist; //Presence Absence Ochiai dist = distance_presenceAbsence_ochiai(a, b, c); _matrix_presenceAbsence_ochiai[i][j] = dist; _matrix_presenceAbsence_ochiai[j][i] = dist; //PresenceAbsence Jaccard Canberra dist = distance_presenceAbsence_jaccardCanberra(a, b, c); _matrix_presenceAbsence_jaccardCanberra[i][j] = dist; _matrix_presenceAbsence_jaccardCanberra[j][i] = dist; //PresenceAbsence Jaccard Simka dist = distance_presenceAbsence_jaccard_simka(i, j, SYMETRICAL); _matrix_presenceAbsence_jaccard_simka[i][j] = dist; _matrix_presenceAbsence_jaccard_simka[j][i] = dist; _matrix_presenceAbsence_jaccard_simka_asym[i][j] = distance_presenceAbsence_jaccard_simka(i, j, ASYMETRICAL); _matrix_presenceAbsence_jaccard_simka_asym[j][i] = distance_presenceAbsence_jaccard_simka(j, i, ASYMETRICAL); //PresenceAbsence Sorensen BrayCurtis dist = distance_presenceAbsence_sorensenBrayCurtis(a, b, c); _matrix_presenceAbsence_sorensenBrayCurtis[i][j] = dist; _matrix_presenceAbsence_sorensenBrayCurtis[j][i] = dist; //PresenceAbsence Whittaker dist = distance_presenceAbsence_whittaker(a, b, c); _matrix_presenceAbsence_Whittaker[i][j] = dist; _matrix_presenceAbsence_Whittaker[j][i] = dist; //PresenceAbsence kulczynski dist = distance_presenceAbsence_kulczynski(a, b, c); _matrix_presenceAbsence_kulczynski[i][j] = dist; _matrix_presenceAbsence_kulczynski[j][i] = dist; //Abundance Ochiai dist = distance_abundance_ochiai(i, j); _matrixOchiai[i][j] = dist; _matrixOchiai[j][i] = dist; //Abundance Sorensen dist = distance_abundance_sorensen(i, j); _matrixSorensen[i][j] = dist; _matrixSorensen[j][i] = dist; //Abundance Jaccard dist = distance_abundance_jaccard(i, j); _matrixJaccardAbundance[i][j] = dist; _matrixJaccardAbundance[j][i] = dist; //Abundance Jaccard Simka dist = distance_abundance_jaccard_simka(i, j, SYMETRICAL); _matrixSymJaccardAbundance[i][j] = dist; _matrixSymJaccardAbundance[j][i] = dist; 
_matrixAsymJaccardAbundance[i][j] = distance_abundance_jaccard_simka(i, j, ASYMETRICAL); _matrixAsymJaccardAbundance[j][i] = distance_abundance_jaccard_simka(j, i, ASYMETRICAL); //Abundance bray-curtis dist = distance_abundance_brayCurtis(i,j); _matrixBrayCurtis[i][j] = dist; _matrixBrayCurtis[j][i] = dist; //Abundance Jaccard Intersection //dist = distance_abundance_jaccard_intersection(i, j); //_matrixJaccardIntersection[i][j] = dist; //_matrixJaccardIntersection[j][i] = dist; } } if(_stats._computeSimpleDistances){ _matrixChord = createSquaredMatrix(_nbBanks); _matrixHellinger = createSquaredMatrix(_nbBanks); _matrixKulczynski = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ //Abundance Chord dist = distance_abundance_chord(i, j); _matrixChord[i][j] = dist; _matrixChord[j][i] = dist; //Abundance Hellinger dist = distance_abundance_hellinger(i, j); _matrixHellinger[i][j] = dist; _matrixHellinger[j][i] = dist; //Abundance Kulczynski dist = distance_abundance_kulczynski(i, j); _matrixKulczynski[i][j] = dist; _matrixKulczynski[j][i] = dist; } } } if(_stats._computeComplexDistances){ _matrixCanberra = createSquaredMatrix(_nbBanks); _matrixWhittaker = createSquaredMatrix(_nbBanks); _matrixKullbackLeibler = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ //Abundance Whittaker dist = distance_abundance_whittaker(i, j); _matrixWhittaker[i][j] = dist; _matrixWhittaker[j][i] = dist; //Abundance Kullback Leibler dist = distance_abundance_kullbackLeibler(i, j); _matrixKullbackLeibler[i][j] = dist; _matrixKullbackLeibler[j][i] = dist; //Abundance Canberra dist = distance_abundance_canberra(i, j, a, b, c); _matrixCanberra[i][j] = dist; _matrixCanberra[j][i] = dist; } } } */ } vector > SimkaDistance::createSquaredMatrix(size_t n){ vector > matrix; matrix.resize(n); for(size_t i=0; i. 
*****************************************************************************/ #ifndef TOOLS_SIMKA_SRC_SIMKADISTANCE_HPP_ #define TOOLS_SIMKA_SRC_SIMKADISTANCE_HPP_ #include const string STR_SIMKA_DISTANCE_BRAYCURTIS = "-bray-curtis"; const string STR_SIMKA_DISTANCE_CHORD = "-chord"; const string STR_SIMKA_DISTANCE_HELLINGER = "-hellinger"; const string STR_SIMKA_DISTANCE_CANBERRA = "-canberra"; const string STR_SIMKA_DISTANCE_KULCZYNSKI = "-kulczynski"; typedef vector SpeciesAbundanceVectorType; enum SIMKA_MATRIX_TYPE{ SYMETRICAL, ASYMETRICAL, }; /* class SimkaDistanceParam{ public: SimkaDistanceParam(){} SimkaDistanceParam(IProperties* params){ //_computeBrayCurtis = true; //_computeChord = true; //_computeHellinger = true; //_computeCanberra = true; //_computeKulczynski = true; //_computeBrayCurtis = params->get(STR_SIMKA_DISTANCE_BRAYCURTIS); //_computeChord = params->get(STR_SIMKA_DISTANCE_CHORD); //_computeHellinger = params->get(STR_SIMKA_DISTANCE_HELLINGER); //_computeCanberra = params->get(STR_SIMKA_DISTANCE_CANBERRA); //_computeKulczynski = params->get(STR_SIMKA_DISTANCE_KULCZYNSKI); } //bool _computeBrayCurtis; //bool _computeChord; //bool _computeHellinger; //bool _computeCanberra; //bool _computeKulczynski; };*/ class SimkaStatistics{ public: SimkaStatistics(size_t nbBanks, bool computeSimpleDistances, bool computeComplexDistances, const string& tmpDir, const vector& datasetIds); SimkaStatistics& operator+= (const SimkaStatistics& other); void print(); void load(const string& filename); void save(const string& filename); void outputMatrix(const string& outputDir, const vector& _bankNames); size_t _nbBanks; size_t _symetricDistanceMatrixSize; bool _computeSimpleDistances; bool _computeComplexDistances; double _totalReads; vector _nbSolidDistinctKmersPerBank; vector _nbSolidKmersPerBank; vector _nbDistinctKmersSharedByBanksThreshold; vector _nbKmersSharedByBanksThreshold; vector _matrixNbDistinctSharedKmers; vector > _matrixNbSharedKmers; vector _brayCurtisNumerator; //vector > _brayCurtisNumerator; //vector > _kullbackLeibler; //Abundance Chord vector > _chord_NiNj; vector _chord_sqrt_N2; //Abundance Hellinger vector > _hellinger_SqrtNiNj; vector > _whittaker_minNiNj; vector > _kullbackLeibler; vector > _abundance_jaccard_intersection; //Abundance Canberra vector > _canberra; //Abundance Kulczynski vector > _kulczynski_minNiNj; //string _outputDir; u_int64_t _nbKmers; vector _nbKmersPerBank; u_int64_t _nbErroneousKmers; u_int64_t _nbDistinctKmers; u_int64_t _nbSolidKmers; u_int64_t _nbSharedKmers; u_int64_t _nbDistinctSharedKmers; //SimkaDistanceParam _distanceParams; vector _datasetNbReads; //u_int64_t _nbKmersInCoupleBankSupRatio; //unordered_map _histos; private: void dumpMatrix(const string& outputDir, const vector& _bankNames, const string& outputFilename, const vector >& matrix); string _outputFilenameSuffix; }; class SimkaDistance { public: SimkaDistance(SimkaStatistics& stats); //virtual ~SimkaDistance(); //vector > getMatrixSorensen(SIMKA_MATRIX_TYPE type); //vector > getMatrixJaccard(); //vector > getMatrixAKS(SIMKA_MATRIX_TYPE type); //vector > getMatrixBrayCurtis(); //vector > getMatrixKullbackLeibler(); vector > _matrixJaccardAbundance(){ vector > matrix = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ double dist = distance_abundance_jaccard(i, j); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrixBrayCurtis(){ vector > matrix = createSquaredMatrix(_nbBanks); for(size_t i=0; 
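/* _matrixBrayCurtis() presumably applies the classic abundance Bray-Curtis
   form: with W = _brayCurtisNumerator[symetricIndex], the per-pair sum of
   min(abundanceI, abundanceJ) accumulated during merging, the dissimilarity
   is 1 - 2*W/(Ni+Nj), Ni and Nj being the banks' solid k-mer totals
   (distance_abundance_brayCurtis itself is defined in SimkaDistance.cpp).
   computeJaccardDistanceFromBrayCurtis() further down converts it with
   J = 2*B/(1+B). */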
i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ double dist = distance_abundance_brayCurtis(i, j, j + ((_nbBanks-1)*i) - (i*(i-1)/2)); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrixChord(){ vector > matrix = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ double dist = distance_abundance_chord(i, j); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrixHellinger(){ vector > matrix = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ double dist = distance_abundance_hellinger(i, j); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrixWhittaker(){ vector > matrix = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ double dist = distance_abundance_whittaker(i, j); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrixKullbackLeibler(){ vector > matrix = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ double dist = distance_abundance_kullbackLeibler(i, j); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrixCanberra(){ vector > matrix = createSquaredMatrix(_nbBanks); u_int64_t a, b, c; for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ get_abc(i, j, j + ((_nbBanks-1)*i) - (i*(i-1)/2), a, b ,c); double dist = distance_abundance_canberra(i, j, a, b, c); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrixKulczynski(){ vector > matrix = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ double dist = distance_abundance_kulczynski(i, j); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrixSymJaccardAbundance(){ vector > matrix = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ double dist = distance_abundance_jaccard_simka(i, j, SYMETRICAL); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrixAsymJaccardAbundance(){ vector > matrix = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ matrix[i][j] = distance_abundance_jaccard_simka(i, j, ASYMETRICAL); matrix[j][i] = distance_abundance_jaccard_simka(j, i, ASYMETRICAL); } } return matrix; } vector > _matrixOchiai(){ vector > matrix = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ double dist = distance_abundance_ochiai(i, j); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrixSorensen(){ vector > matrix = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ double dist = distance_abundance_sorensen(i, j); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrix_presenceAbsence_sorensenBrayCurtis(){ vector > matrix = createSquaredMatrix(_nbBanks); u_int64_t a, b, c; for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ get_abc(i, j, j + ((_nbBanks-1)*i) - (i*(i-1)/2), a, b ,c); double dist = distance_presenceAbsence_sorensenBrayCurtis(a, b, c); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrix_presenceAbsence_Whittaker(){ vector > matrix = createSquaredMatrix(_nbBanks); u_int64_t a, b, c; for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ 
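// The expression j + ((_nbBanks-1)*i) - (i*(i-1)/2), used below and in the other
// matrix builders, maps the pair (i, j) with i <= j to its offset in a flat,
// row-major upper triangle that includes the diagonal (n*(n+1)/2 cells for n
// banks). A minimal sketch of the mapping, with a hypothetical helper name that
// is not part of the original code:
//
//     // offset of pair (i, j), i <= j, among the n*(n+1)/2 stored pairs
//     size_t condensedIndex(size_t i, size_t j, size_t n){
//         return j + (n-1)*i - (i*(i-1))/2;
//     }
//
// For n = 4 the pairs map to contiguous offsets 0..9:
// (0,0)=0 (0,1)=1 (0,2)=2 (0,3)=3 (1,1)=4 (1,2)=5 (1,3)=6 (2,2)=7 (2,3)=8 (3,3)=9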
get_abc(i, j, j + ((_nbBanks-1)*i) - (i*(i-1)/2), a, b ,c); double dist = distance_presenceAbsence_whittaker(a, b, c); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrix_presenceAbsence_kulczynski(){ vector > matrix = createSquaredMatrix(_nbBanks); u_int64_t a, b, c; for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ get_abc(i, j, j + ((_nbBanks-1)*i) - (i*(i-1)/2), a, b ,c); double dist = distance_presenceAbsence_kulczynski(a, b, c); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrix_presenceAbsence_ochiai(){ vector > matrix = createSquaredMatrix(_nbBanks); u_int64_t a, b, c; for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ get_abc(i, j, j + ((_nbBanks-1)*i) - (i*(i-1)/2), a, b ,c); double dist = distance_presenceAbsence_ochiai(a, b, c); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrix_presenceAbsence_chordHellinger(){ vector > matrix = createSquaredMatrix(_nbBanks); u_int64_t a, b, c; for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ get_abc(i, j, j + ((_nbBanks-1)*i) - (i*(i-1)/2), a, b ,c); double dist = distance_presenceAbsence_chordHellinger(a, b, c); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrix_presenceAbsence_jaccardCanberra(){ vector > matrix = createSquaredMatrix(_nbBanks); u_int64_t a, b, c; for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ get_abc(i, j, j + ((_nbBanks-1)*i) - (i*(i-1)/2), a, b ,c); double dist = distance_presenceAbsence_jaccardCanberra(a, b, c); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrix_presenceAbsence_jaccard_simka(){ vector > matrix = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ double dist = distance_presenceAbsence_jaccard_simka(i, j, j + ((_nbBanks-1)*i) - (i*(i-1)/2), SYMETRICAL); matrix[i][j] = dist; matrix[j][i] = dist; } } return matrix; } vector > _matrix_presenceAbsence_jaccard_simka_asym(){ vector > matrix = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=i+1; j<_nbBanks; j++){ matrix[i][j] = distance_presenceAbsence_jaccard_simka(i, j, j + ((_nbBanks-1)*i) - (i*(i-1)/2), ASYMETRICAL); matrix[j][i] = distance_presenceAbsence_jaccard_simka(j, i, j + ((_nbBanks-1)*i) - (i*(i-1)/2), ASYMETRICAL); } } return matrix; } vector > computeJaccardDistanceFromBrayCurtis(const vector >& brayDistanceMatrix){ vector > jaccardDistanceMatrix = createSquaredMatrix(_nbBanks); for(size_t i=0; i<_nbBanks; i++){ for(size_t j=0; j<_nbBanks; j++){ double B = brayDistanceMatrix[i][j]; double J = (2*B) / (1+B); jaccardDistanceMatrix[i][j] = J; } } return jaccardDistanceMatrix; } private: vector > createSquaredMatrix(size_t n); void get_abc(size_t bank1, size_t bank2, size_t symetricIndex, u_int64_t& a, u_int64_t& b, u_int64_t& c); double distance_abundance_brayCurtis(size_t bank1, size_t bank2, size_t symetricIndex); double distance_abundance_chord(size_t i, size_t j); double distance_abundance_hellinger(size_t i, size_t j); //double distance_abundance_jaccard_intersection(size_t i, size_t j); double distance_abundance_whittaker(size_t i, size_t j); double distance_abundance_kullbackLeibler(size_t i, size_t j); double distance_abundance_canberra(size_t i, size_t j, u_int64_t& ua, u_int64_t& ub, u_int64_t& uc); double distance_abundance_kulczynski(size_t i, size_t j); double distance_abundance_ochiai(size_t i, size_t j); double 
distance_abundance_sorensen(size_t i, size_t j);
	double distance_abundance_jaccard(size_t i, size_t j);
	double distance_abundance_jaccard_simka(size_t i, size_t j, SIMKA_MATRIX_TYPE type);

	double distance_presenceAbsence_chordHellinger(u_int64_t& ua, u_int64_t& ub, u_int64_t& uc);
	double distance_presenceAbsence_hellinger(u_int64_t& ua, u_int64_t& ub, u_int64_t& uc);
	double distance_presenceAbsence_whittaker(u_int64_t& ua, u_int64_t& ub, u_int64_t& uc);
	double distance_presenceAbsence_canberra(u_int64_t& ua, u_int64_t& ub, u_int64_t& uc);
	double distance_presenceAbsence_kulczynski(u_int64_t& ua, u_int64_t& ub, u_int64_t& uc);
	double distance_presenceAbsence_ochiai(u_int64_t& ua, u_int64_t& ub, u_int64_t& uc);
	double distance_presenceAbsence_sorensenBrayCurtis(u_int64_t& ua, u_int64_t& ub, u_int64_t& uc);
	double distance_presenceAbsence_jaccardCanberra(u_int64_t& ua, u_int64_t& ub, u_int64_t& uc);
	double distance_presenceAbsence_jaccard_simka(size_t i, size_t j, size_t symetricIndex, SIMKA_MATRIX_TYPE type);

	SimkaStatistics& _stats;
	//SimkaDistanceParam _distanceParams;
	size_t _nbBanks;

};

#endif /* TOOLS_SIMKA_SRC_SIMKADISTANCE_HPP_ */
simka-1.5.3/src/core/main.cpp000077500000000000000000000030131377312000000157640ustar00rootroot00000000000000/*****************************************************************************
 *   Simka: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets
 *   A tool from the GATB (Genome Assembly Tool Box)
 *   Copyright (C) 2015  INRIA
 *   Authors: G.Benoit, C.Lemaitre, P.Peterlongo
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as
 *  published by the Free Software Foundation, either version 3 of the
 *  License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *****************************************************************************/

// We include the header file for the tool
#include "Simka.hpp"

/********************************************************************************/

int main (int argc, char* argv[])
{
    try
    {
        // We run the tool with the provided command line arguments.
        Simka().run (argc, argv);
    }
    catch (Exception& e)
    {
        std::cout << "EXCEPTION: " << e.getMessage() << std::endl;
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}
simka-1.5.3/src/minikc/000077500000000000000000000000001377312000000146565ustar00rootroot00000000000000simka-1.5.3/src/minikc/MiniKC.hpp000066400000000000000000000167461377312000000165140ustar00rootroot00000000000000/*
 * MiniKC.hpp
 *
 *  Created on: 16 June 2016
 *      Author: gbenoit
 */

#ifndef GATB_SIMKA_SRC_MINIKC_MINIKC_HPP_
#define GATB_SIMKA_SRC_MINIKC_MINIKC_HPP_

#include <gatb/gatb_core.hpp>
//#include "../SimkaCount.cpp"

//typedef u_int16_t CountType;

template<size_t span>
class SimkaCompressedProcessor : public CountProcessorAbstract<span>{
public:

	typedef typename Kmer<span>::Type  Type;
	typedef typename Kmer<span>::Count Count;

	struct Kmer_BankId_Count{
		Type _type;
		u_int32_t _bankId;
		u_int64_t _count;

		Kmer_BankId_Count(){
		}

		Kmer_BankId_Count(Type type, u_int64_t bankId, u_int64_t count){
			_type = type;
			_bankId = bankId;
			_count = count;
		}
	};

	//SimkaCompressedProcessor(vector<Bag<Kmer_BankId_Count>* >& bags, vector<vector<Kmer_BankId_Count> >& caches, vector<size_t>& cacheIndexes, CountNumber abundanceMin, CountNumber abundanceMax) : _bags(bags), _caches(caches), _cacheIndexes(cacheIndexes)
	SimkaCompressedProcessor(vector<Bag<Kmer_BankId_Count>* >& bags, vector<u_int64_t>& nbKmerPerParts, vector<u_int64_t>& nbDistinctKmerPerParts, vector<u_int64_t>& chordPerParts, CountNumber abundanceMin, CountNumber abundanceMax, size_t bankIndex) :
		_bags(bags), _nbDistinctKmerPerParts(nbDistinctKmerPerParts), _nbKmerPerParts(nbKmerPerParts), _chordPerParts(chordPerParts)
	{
		_abundanceMin = abundanceMin;
		_abundanceMax = abundanceMax;
		_bankIndex = bankIndex;
	}

	~SimkaCompressedProcessor(){}

	CountProcessorAbstract<span>* clone () { return new SimkaCompressedProcessor (_bags, _nbKmerPerParts, _nbDistinctKmerPerParts, _chordPerParts, _abundanceMin, _abundanceMax, _bankIndex); }
	//CountProcessorAbstract<span>* clone () { return new SimkaCompressedProcessor (_bags, _caches, _cacheIndexes, _abundanceMin, _abundanceMax); }
	void finishClones (vector<ICountProcessor<span>*>& clones){}

	bool process (size_t partId, const typename Kmer<span>::Type& kmer, const CountVector& count, CountNumber sum){

		if(count[0] < _abundanceMin || count[0] > _abundanceMax) return false;

		Kmer_BankId_Count item(kmer, _bankIndex, count[0]);
		_bags[partId]->insert(item);
		_nbDistinctKmerPerParts[partId] += 1;
		_nbKmerPerParts[partId] += count[0];
		_chordPerParts[partId] += pow(count[0], 2);

		/*
		size_t index = _cacheIndexes[partId];
		_caches[partId][index] = item;
		index += 1;
		if(index == NB_COUNT_CACHE){
			_bags[partId]->insert(_caches[partId], index);
			_cacheIndexes[partId] = 0;
		}
		else{
			_cacheIndexes[partId] = index;
		}*/

		return true;
	}

	vector<Bag<Kmer_BankId_Count>* >& _bags;
	vector<u_int64_t>& _nbDistinctKmerPerParts;
	vector<u_int64_t>& _nbKmerPerParts;
	vector<u_int64_t>& _chordPerParts;
	CountNumber _abundanceMin;
	CountNumber _abundanceMax;
	size_t _bankIndex;
	//_stats->_chord_N2[i] += pow(abundanceI, 2);
	//vector<vector<Kmer_BankId_Count> >& _caches;
	//vector<size_t>& _cacheIndexes;
};

/*
class SimkaCompressedProcessor_Mini{
public:

	typedef typename Kmer<>::Type  Type;
	typedef typename Kmer<>::Count Count;

	//SimkaCompressedProcessor(vector<Bag<Kmer_BankId_Count>* >& bags, vector<vector<Kmer_BankId_Count> >& caches, vector<size_t>& cacheIndexes, CountNumber abundanceMin, CountNumber abundanceMax) : _bags(bags), _caches(caches), _cacheIndexes(cacheIndexes)
	SimkaCompressedProcessor_Mini(vector<Bag<Count>* >& bags, vector<u_int64_t>& nbKmerPerParts, vector<u_int64_t>& nbDistinctKmerPerParts, vector<u_int64_t>& chordPerParts, CountNumber abundanceMin, CountNumber abundanceMax) :
		_bags(bags), _nbDistinctKmerPerParts(nbDistinctKmerPerParts), _nbKmerPerParts(nbKmerPerParts), _chordPerParts(chordPerParts)
	{
		_abundanceMin = abundanceMin;
		_abundanceMax = abundanceMax;
	}

	bool process (size_t
partId, const Type& kmer, CountType count){ if(count < _abundanceMin || count > _abundanceMax) return false; Count item(kmer, count); _bags[partId]->insert(item); _nbDistinctKmerPerParts[partId] += 1; _nbKmerPerParts[partId] += count; _chordPerParts[partId] += pow(count, 2); return true; } vector* >& _bags; vector& _nbDistinctKmerPerParts; vector& _nbKmerPerParts; vector& _chordPerParts; CountNumber _abundanceMin; CountNumber _abundanceMax; //_stats->_chord_N2[i] += pow(abundanceI, 2); //vector >& _caches; //vector& _cacheIndexes; };*/ template class MiniKC : public Algorithm{ public: typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; typedef typename Kmer::ModelCanonical Model; typedef typename Kmer::ModelCanonical::Iterator ModelIt; //typedef Kmer<>::ModelCanonical ModelCanon; //typedef Kmer<>::ModelMinimizer ModelMini; typedef typename Kmer::template ModelMinimizer ModelMinimizer; //typedef typename Kmer::ModelCanonical ModelCanonical; //typedef typename ModelCanonical::Kmer KmerType; /* typedef Kmer::Count Count; typedef Kmer::Type Type; typedef Kmer::ModelCanonical ModelCanon; typedef Kmer::ModelMinimizer Model;*/ IBank* _bank; size_t _kmerSize; CountVector* _counts; Repartitor& _repartition; SimkaCompressedProcessor* _proc; u_int64_t _nbReads; MiniKC(IProperties* options, size_t kmerSize, IBank* bank, Repartitor& repartition, SimkaCompressedProcessor* proc): Algorithm("minikc", -1, options), _repartition(repartition) { _bank = bank; _kmerSize = kmerSize; _proc = proc; u_int64_t nbCounts = pow(4, _kmerSize); cout << "Nb distinct kmers (canonical): " << nbCounts << endl; _counts = new CountVector(nbCounts, 0); } void execute(){ count(); dump(); } void count(){ _nbReads = 0; Iterator* itSeq = createIterator(_bank->iterator(), _bank->estimateNbItems(), "Counting"); //Model definition of a kmer iterator (this one put kmer in cannonical form) //ModelCanonical _model(_kmerSize); //Model:: //Model _kmerIt(_model); Model model (_kmerSize); // We declare an iterator on a given sequence. 
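// Memory note on the dense table allocated in the constructor: _counts holds
// 4^k entries (one per possible k-mer), so this direct-indexing counter is only
// practical for small k. Assuming CountNumber is an 8-byte integer (an
// assumption about the GATB typedef), for example:
//
//     k = 13  ->  4^13 =    67,108,864 entries  ~  512 MiB
//     k = 15  ->  4^15 = 1,073,741,824 entries  ~    8 GiB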
ModelIt _kmerIt (model); Sequence* sequence; for (itSeq->first(); !itSeq->isDone(); itSeq->next()){ _nbReads += 1; sequence = &itSeq->item(); _kmerIt.setData (sequence->getData()); for (_kmerIt.first(); !_kmerIt.isDone(); _kmerIt.next()){ //u_int64_t kmer = min(_kmerIt->value(), revcomp(_kmerIt->value(), _kmerSize)).getVal(); //Kmer<> canonicalkmer = min(_kmerIt.item(), revcomp(_kmerIt->value())); //cout << _kmerIt->value().toString(kmerSize) << endl; u_int64_t kmer = _kmerIt->value().getVal(); //cout << _model.toString(kmer) << endl; //cout << kmer << endl; (*_counts)[kmer] += 1; //cout << kmer << endl; } } } void dump(){ ModelMinimizer model (_kmerSize, 7); Type kmer; //Kmer<>::ModelCanonical _model(_kmerSize); CountVector vec(1, 0); for(size_t i=0; i<_counts->size(); i++){ CountNumber count = (*_counts)[i]; if(count == 0) continue; kmer.setVal(i); //cout << i << " " << model.toString(kmer) << endl; //Type kmer(i); u_int64_t mini = model.getMinimizerValue(kmer); size_t p = this->_repartition (mini); vec[0] = count; _proc->process(p, kmer, vec, count); } } }; #endif /* GATB_SIMKA_SRC_MINIKC_MINIKC_HPP_ */ simka-1.5.3/src/minikc/SimkaCountProcess.cpp000066400000000000000000000011221377312000000207720ustar00rootroot00000000000000 #include /* printf */ #include /* system, NULL, EXIT_FAILURE */ #include #include using namespace std; int main (int argc, char* argv[]) { string command = "nohup "; for (int i = 1; i < argc; ++i) { //std::cout << argv[i] << std::endl; command += string(argv[i]) + " "; } //cout << command << endl; //cout << argc << " " << argv << endl; int ret=1; int nbTries = 0; while(ret != 0){ ret = system(command.c_str()); nanosleep((const struct timespec[]){{0, 10000000L}}, NULL); if(nbTries > 3) exit(1); nbTries += 1; } } simka-1.5.3/src/simkaMin/000077500000000000000000000000001377312000000151545ustar00rootroot00000000000000simka-1.5.3/src/simkaMin/MurmurHash3.cpp000066400000000000000000000163761377312000000200530ustar00rootroot00000000000000#include "MurmurHash3.h" //----------------------------------------------------------------------------- // Platform-specific functions and macros // Microsoft Visual Studio #if defined(_MSC_VER) #define FORCE_INLINE __forceinline #include #define ROTL32(x,y) _rotl(x,y) #define ROTL64(x,y) _rotl64(x,y) #define BIG_CONSTANT(x) (x) // Other compilers #else // defined(_MSC_VER) #define FORCE_INLINE inline __attribute__((always_inline)) inline uint32_t rotl32 ( uint32_t x, int8_t r ) { return (x << r) | (x >> (32 - r)); } inline uint64_t rotl64 ( uint64_t x, int8_t r ) { return (x << r) | (x >> (64 - r)); } #define ROTL32(x,y) rotl32(x,y) #define ROTL64(x,y) rotl64(x,y) #define BIG_CONSTANT(x) (x##LLU) #endif // !defined(_MSC_VER) //----------------------------------------------------------------------------- // Block read - if your platform needs to do endian-swapping or can only // handle aligned reads, do the conversion here FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i ) { return p[i]; } FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i ) { return p[i]; } //----------------------------------------------------------------------------- // Finalization mix - force all bits of a hash block to avalanche FORCE_INLINE uint32_t fmix32 ( uint32_t h ) { h ^= h >> 16; h *= 0x85ebca6b; h ^= h >> 13; h *= 0xc2b2ae35; h ^= h >> 16; return h; } //---------- FORCE_INLINE uint64_t fmix64 ( uint64_t k ) { k ^= k >> 33; k *= BIG_CONSTANT(0xff51afd7ed558ccd); k ^= k >> 33; k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); k ^= k >> 33; return 
k; } //----------------------------------------------------------------------------- void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ) { const uint8_t * data = (const uint8_t*)key; const int nblocks = len / 4; uint32_t h1 = seed; const uint32_t c1 = 0xcc9e2d51; const uint32_t c2 = 0x1b873593; //---------- // body const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); for(int i = -nblocks; i; i++) { uint32_t k1 = getblock32(blocks,i); k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; h1 = ROTL32(h1,13); h1 = h1*5+0xe6546b64; } //---------- // tail const uint8_t * tail = (const uint8_t*)(data + nblocks*4); uint32_t k1 = 0; switch(len & 3) { case 3: k1 ^= tail[2] << 16; case 2: k1 ^= tail[1] << 8; case 1: k1 ^= tail[0]; k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; }; //---------- // finalization h1 ^= len; h1 = fmix32(h1); *(uint32_t*)out = h1; } //----------------------------------------------------------------------------- void MurmurHash3_x86_128 ( const void * key, const int len, uint32_t seed, void * out ) { const uint8_t * data = (const uint8_t*)key; const int nblocks = len / 16; uint32_t h1 = seed; uint32_t h2 = seed; uint32_t h3 = seed; uint32_t h4 = seed; const uint32_t c1 = 0x239b961b; const uint32_t c2 = 0xab0e9789; const uint32_t c3 = 0x38b34ae5; const uint32_t c4 = 0xa1e38b93; //---------- // body const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); for(int i = -nblocks; i; i++) { uint32_t k1 = getblock32(blocks,i*4+0); uint32_t k2 = getblock32(blocks,i*4+1); uint32_t k3 = getblock32(blocks,i*4+2); uint32_t k4 = getblock32(blocks,i*4+3); k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; } //---------- // tail const uint8_t * tail = (const uint8_t*)(data + nblocks*16); uint32_t k1 = 0; uint32_t k2 = 0; uint32_t k3 = 0; uint32_t k4 = 0; switch(len & 15) { case 15: k4 ^= tail[14] << 16; case 14: k4 ^= tail[13] << 8; case 13: k4 ^= tail[12] << 0; k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; case 12: k3 ^= tail[11] << 24; case 11: k3 ^= tail[10] << 16; case 10: k3 ^= tail[ 9] << 8; case 9: k3 ^= tail[ 8] << 0; k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; case 8: k2 ^= tail[ 7] << 24; case 7: k2 ^= tail[ 6] << 16; case 6: k2 ^= tail[ 5] << 8; case 5: k2 ^= tail[ 4] << 0; k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; case 4: k1 ^= tail[ 3] << 24; case 3: k1 ^= tail[ 2] << 16; case 2: k1 ^= tail[ 1] << 8; case 1: k1 ^= tail[ 0] << 0; k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; }; //---------- // finalization h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; h1 += h2; h1 += h3; h1 += h4; h2 += h1; h3 += h1; h4 += h1; h1 = fmix32(h1); h2 = fmix32(h2); h3 = fmix32(h3); h4 = fmix32(h4); h1 += h2; h1 += h3; h1 += h4; h2 += h1; h3 += h1; h4 += h1; ((uint32_t*)out)[0] = h1; ((uint32_t*)out)[1] = h2; ((uint32_t*)out)[2] = h3; ((uint32_t*)out)[3] = h4; } //----------------------------------------------------------------------------- void MurmurHash3_x64_128 ( const void * key, const int len, const uint32_t seed, void * out ) { const uint8_t * data = (const uint8_t*)key; const int nblocks = len / 16; uint64_t h1 = seed; uint64_t h2 = seed; const uint64_t c1 = 
BIG_CONSTANT(0x87c37b91114253d5); const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); //---------- // body const uint64_t * blocks = (const uint64_t *)(data); for(int i = 0; i < nblocks; i++) { uint64_t k1 = getblock64(blocks,i*2+0); uint64_t k2 = getblock64(blocks,i*2+1); k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; } //---------- // tail const uint8_t * tail = (const uint8_t*)(data + nblocks*16); uint64_t k1 = 0; uint64_t k2 = 0; switch(len & 15) { case 15: k2 ^= ((uint64_t)tail[14]) << 48; case 14: k2 ^= ((uint64_t)tail[13]) << 40; case 13: k2 ^= ((uint64_t)tail[12]) << 32; case 12: k2 ^= ((uint64_t)tail[11]) << 24; case 11: k2 ^= ((uint64_t)tail[10]) << 16; case 10: k2 ^= ((uint64_t)tail[ 9]) << 8; case 9: k2 ^= ((uint64_t)tail[ 8]) << 0; k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; case 8: k1 ^= ((uint64_t)tail[ 7]) << 56; case 7: k1 ^= ((uint64_t)tail[ 6]) << 48; case 6: k1 ^= ((uint64_t)tail[ 5]) << 40; case 5: k1 ^= ((uint64_t)tail[ 4]) << 32; case 4: k1 ^= ((uint64_t)tail[ 3]) << 24; case 3: k1 ^= ((uint64_t)tail[ 2]) << 16; case 2: k1 ^= ((uint64_t)tail[ 1]) << 8; case 1: k1 ^= ((uint64_t)tail[ 0]) << 0; k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; }; //---------- // finalization h1 ^= len; h2 ^= len; h1 += h2; h2 += h1; h1 = fmix64(h1); h2 = fmix64(h2); h1 += h2; h2 += h1; ((uint64_t*)out)[0] = h1; ((uint64_t*)out)[1] = h2; } //----------------------------------------------------------------------------- simka-1.5.3/src/simkaMin/MurmurHash3.h000066400000000000000000000021261377312000000175040ustar00rootroot00000000000000//----------------------------------------------------------------------------- // MurmurHash3 was written by Austin Appleby, and is placed in the public // domain. The author hereby disclaims copyright to this source code. #ifndef _MURMURHASH3_H_ #define _MURMURHASH3_H_ //----------------------------------------------------------------------------- // Platform-specific functions and macros // Microsoft Visual Studio #if defined(_MSC_VER) && (_MSC_VER < 1600) typedef unsigned char uint8_t; typedef unsigned int uint32_t; typedef unsigned __int64 uint64_t; // Other compilers #else // defined(_MSC_VER) #include #endif // !defined(_MSC_VER) //----------------------------------------------------------------------------- void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); //----------------------------------------------------------------------------- #endif // _MURMURHASH3_H_ simka-1.5.3/src/simkaMin/SimkaMin.cpp000066400000000000000000000074261377312000000174010ustar00rootroot00000000000000/***************************************************************************** * SimkaMin: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2019 INRIA * Authors: G.Benoit * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . *****************************************************************************/ #include "SimkaMinCount.hpp" #include "SimkaMinDistance.hpp" #include "SimkaMinDistanceMatrixExporter.hpp" #include "SimkaMinDistanceMatrixMerger.hpp" #include "SimkaMinInfos.hpp" #include "SimkaMinAppend.hpp" void displayHelp(){ cout << "Usage: ./simkaMin [option]" << endl; cout << endl << "[Distance computation options]" << endl; cout << "\tsketch : transform datasets in small sketches of k-mers and their abundance" << endl; cout << "\tdistance : compute Jaccard and Bray-Curtis distances between sketches" << endl; cout << endl << "[Distance matrix manipulation options]" << endl; cout << "\texport : export distance matrices stored in binary format" << endl; //cout << "\tmatrix-update : update existing distance matrices" << endl; cout << endl << "[Sketch options]" << endl; cout << "\tappend : merge multiple sketch files into a single one" << endl; cout << "\tinfo : list datasets contained in a sketch file" << endl; cout << endl; } int main (int argc, char* argv[]) { try { if(argc < 2){ displayHelp(); } else{ //std::vector args; vector argsTemp( argv, argv + argc ); argsTemp.erase(argsTemp.begin()+1); //std::transform(argsTemp.begin(), argsTemp.end(), std::back_inserter(vc), convert); char** args = &argsTemp[0]; //char* args[]; //for(string& arg: argsTemp){ //} //rArray = new char*[argc+1]; //for(int i=0; i <= argc; i++) { // rArray[i] = argv[i]; //} // use rArray //delete [] rArray; //char* args = new char*[argc-1]; //vector test; //for(size_t i=0; i args(argv); string programName = string(argv[1]); if(programName == "sketch"){ Simka2ComputeKmerSpectrum().run (argc, args); } else if(programName == "append"){ SimkaMinAppend().run(argc, args); } else if(programName == "distance"){ SimkaMinDistance().run(argc, args); } else if(programName == "export"){ SimkaMinDistanceMatrixExporter().run(argc, args); } else if(programName == "matrix-update"){ //Hidden feature SimkaMinDistanceMatrixMerger().run(argc, args); } else if(programName == "info"){ SimkaMinInfos().run(argc, args); } else{ displayHelp(); } } //cout << argc << endl; //cout << argv[0] << endl; //cout << argv[1] << endl; //cout << argv[2] << endl; // } catch (Exception& e) { std::cout << "EXCEPTION: " << e.getMessage() << std::endl; return EXIT_FAILURE; } return EXIT_SUCCESS; } simka-1.5.3/src/simkaMin/SimkaMinAppend.hpp000066400000000000000000000130311377312000000205230ustar00rootroot00000000000000/***************************************************************************** * SimkaMin: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2019 INRIA * Authors: G.Benoit * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. 
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *****************************************************************************/

#ifndef SIMKA1_4_SRC_SIMKAMIN_SIMKAMINAPPEND_HPP_
#define SIMKA1_4_SRC_SIMKAMIN_SIMKAMINAPPEND_HPP_

#include "SimkaMinCommons.hpp"

/*
 * The header and the sketches of the first sketch file (-in1) are kept in place.
 * Overwriting of the first file starts right after its sketches, where its dataset ids start.
 * The sketches of the second file (-in2) are written after the sketches of the first file (-in1).
 * Then the ids of the first file are written back, followed by the ids of the second file.
 * Finally, the number of datasets stored in the header is updated.
 */

class SimkaMinAppendAlgorithm : public Algorithm
{
public:

	IProperties* _options;
	string _inputFilename1;
	string _inputFilename2;
	u_int32_t _nbDatasets;
	u_int32_t _sketchSize;
	ofstream _outputFile;
	ifstream _inputFile2;

	SimkaMinAppendAlgorithm(IProperties* options):
		Algorithm("simkaMinAppendAlgorithm", -1, options)
	{
	}

	void execute(){
		parseArgs();
		append();
	}

	void parseArgs(){

		_options = getInput();

		_inputFilename1 = _options->getStr(STR_SIMKA_URI_INPUT_1);
		_inputFilename2 = _options->getStr(STR_SIMKA_URI_INPUT_2);

		if(!System::file().doesExist(_inputFilename1)){
			std::cerr << "Error: input does not exist (" << _inputFilename1 << ")" << std::endl;
			exit(1);
		}
		if(!System::file().doesExist(_inputFilename2)){
			std::cerr << "Error: input does not exist (" << _inputFilename2 << ")" << std::endl;
			exit(1);
		}
	}

	void append(){

		u_int8_t kmerSize1, kmerSize2;
		u_int32_t sketchSize1, sketchSize2, seed1, seed2, nbDatasets1, nbDatasets2;
		SimkaMinCommons::getKmerInfos(_inputFilename1, kmerSize1, sketchSize1, seed1, nbDatasets1);
		SimkaMinCommons::getKmerInfos(_inputFilename2, kmerSize2, sketchSize2, seed2, nbDatasets2);

		if(kmerSize1 != kmerSize2){
			std::cerr << "Error: can't merge sketches with different kmer sizes (" << (int)kmerSize1 << " vs " << (int)kmerSize2 << ")" << std::endl;
			exit(1);
		}
		if(sketchSize1 != sketchSize2){
			std::cerr << "Error: can't merge sketches with different sketch sizes (" << sketchSize1 << " vs " << sketchSize2 << ")" << std::endl;
			exit(1);
		}
		if(seed1 != seed2){
			std::cerr << "Error: can't merge sketches with different seeds (" << seed1 << " vs " << seed2 << ")" << std::endl;
			exit(1);
		}

		u_int32_t nbDatasets = nbDatasets1 + nbDatasets2;

		vector<string> id1, id2;
		SimkaMinCommons::readIds(_inputFilename1, id1);
		SimkaMinCommons::readIds(_inputFilename2, id2);

		//open first file to be overwritten (but without rewriting all its sketches)
		_outputFile.open(_inputFilename1, ios::binary|ios::in);
		_inputFile2.open(_inputFilename2, ios::binary);

		//Update number of datasets in the header
		_outputFile.seekp(SimkaMinCommons::getFilePosition_nbDatasets());
		_outputFile.write((const char*)&nbDatasets, sizeof(nbDatasets));

		appendSketches(nbDatasets1, sketchSize1, nbDatasets2);
		appendIds(id1);
		appendIds(id2);

		_inputFile2.close();
		_outputFile.close();
	}

	void appendSketches(u_int32_t nbDatasets1, u_int32_t sketchSize1, u_int32_t nbDatasets2){

		_outputFile.seekp(SimkaMinCommons::getFilePosition_sketchIds(nbDatasets1, sketchSize1));
		_inputFile2.seekg(KMER_SPECTRUM_HEADER_SIZE);

		u_int64_t dataToTransfer = nbDatasets2*sketchSize1*sizeof(KmerAndCountType);

		const u_int64_t bufferSize = 1024;
		char buffer[bufferSize];

		/* copy from input to output */
		while
(dataToTransfer > 0) { u_int64_t size = min(bufferSize, dataToTransfer); _inputFile2.read(buffer, size); _outputFile.write(buffer, size); dataToTransfer -= size; } //fclose(infile); //fclose(outfile); } void appendIds(vector& ids){ for(size_t i=0; ipush_front (new OptionOneParam (STR_SIMKA_URI_INPUT_2, "second sketch file to merge (this file will be appended to the first one)", true)); parser->push_front (new OptionOneParam (STR_SIMKA_URI_INPUT_1, "first sketch file to merge (this file will be overwritten)", true)); parser->getParser (STR_NB_CORES)->setVisible (false); parser->getParser (STR_VERBOSE)->setVisible (false); } void execute () { IProperties* args = getInput(); SimkaMinAppendAlgorithm* algo = new SimkaMinAppendAlgorithm(args); algo->execute(); delete algo; } }; #endif /* SIMKA1_4_SRC_SIMKAMIN_SIMKAMINAPPEND_HPP_ */ simka-1.5.3/src/simkaMin/SimkaMinCommons.hpp000066400000000000000000000113261377312000000207340ustar00rootroot00000000000000/***************************************************************************** * SimkaMin: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2019 INRIA * Authors: G.Benoit * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . 
*****************************************************************************/ #ifndef SIMKA1_4_SRC_SIMKAMIN_SIMKAMINCOMMONS_HPP_ #define SIMKA1_4_SRC_SIMKAMIN_SIMKAMINCOMMONS_HPP_ #include #include #define KMER_SPECTRUM_HEADER_SIZE (1+4+4+4) //At the begining of the .kmers file we store the size of the kmer (on 1 byte), the sketch size (on 4 bytes), the seed used by Murmurhash3 (4 bytes), the number of datasets in the sketch file (4 bytes) const string STR_SIMKA_SEED = "-seed"; const string STR_SIMKA_SKETCH_SIZE = "-nb-kmers"; const string STR_SIMKA_URI_INPUT_1 = "-in1"; const string STR_SIMKA_URI_INPUT_2 = "-in2"; const string STR_SIMKA_INPUT_IDS = "-in-ids"; const string STR_SIMKA_ABUNDANCE_FILTER = "-filter"; //const string STR_SIMKA2_DATASET_ID = "-id"; typedef u_int32_t KmerCountType; typedef unordered_map KmerCountDictionaryType; typedef float DistanceValueType; struct KmerAndCountType{ public: u_int64_t _kmer; KmerCountType _count; KmerAndCountType(){ } KmerAndCountType(u_int64_t kmer, KmerCountType count){ _kmer = kmer; _count = count; } }; struct PairwiseDistance{ u_int64_t _i; u_int64_t _j; DistanceValueType _distance; PairwiseDistance(){ _i = -1; _j = -1; _distance = -1; } void set(u_int64_t i, u_int64_t j, DistanceValueType distance){ _i = i; _j = j; _distance = distance; } }; class SimkaMinCommons { public: //SimkaMinCommons(); //virtual ~SimkaMinCommons(); static void writeString(const string& s, ofstream& file){ u_int8_t size = s.size(); file.write((char const*)(&size), sizeof(size)); file.write(s.c_str(), size); } static void readString(string& s, ifstream& file){ u_int8_t size; file.read((char*)(&size), sizeof(size)); std::vector buffer(size); file.read(&buffer[0], buffer.size()); s.assign(buffer.begin(), buffer.end()); //return string linkedDatasetID( buffer.begin(), buffer.end() ); } static void readIds(const string& filename, vector& datasetIds){ u_int8_t kmerSize; u_int32_t sketchSize, seed, nbDatasets; getKmerInfos(filename, kmerSize, sketchSize, seed, nbDatasets); ifstream file(filename.c_str(), ios::binary); file.seekg(SimkaMinCommons::getFilePosition_sketchIds(nbDatasets, sketchSize)); //u_int32_t nbDatasets; //file.read((char*)(&nbDatasets), sizeof(nbDatasets)); string datasetId; for(size_t i=0; i. *****************************************************************************/ #ifndef SIMKA1_4_SRC_SIMKAMIN_SIMKAMINCOUNT_HPP_ #define SIMKA1_4_SRC_SIMKAMIN_SIMKAMINCOUNT_HPP_ /* * Simka2ComputeKmerSpectrum.hpp * * Created on: 4 nov. 
2016 * Author: gbenoit */ //#include "../core/SimkaUtils.hpp" //#include "Simka2Utils.hpp" //#include "../minikc/MiniKC.hpp" //#include //#include //#include "../utils/SimkaIoUtils.hpp" //#include "SimkaAlgorithm.hpp" //#include "SimkaAlgorithm.hpp" #include "SimkaMinCommons.hpp" #include "SimkaCommons.hpp" #include "MurmurHash3.h" #include //#include "../../thirdparty/KMC/kmc_api/kmc_file.h" //#include "../../thirdparty/KMC/kmc_api/kmer_defs.h" //#include "../utils/MurmurHash3.h" //#define MERGE_BUFFER_SIZE 10000 //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- //------------------------------------------------------------------------------- 
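//-------------------------------------------------------------------------------
// SelectKmersCommand below builds a "bottom-k" MinHash sketch: it keeps the
// _sketchSize smallest MurmurHash3 values seen so far in a max-heap
// (std::priority_queue), evicting the current maximum whenever a smaller hash
// arrives. A minimal, self-contained sketch of that selection logic
// (illustrative only; these names are not part of the original code):
//
//     #include <cstdint>
//     #include <queue>
//
//     // Keep the k smallest hash values of a stream (bottom-k MinHash).
//     struct BottomKSketch {
//         size_t k;
//         std::priority_queue<uint64_t> heap; // max-heap: top() is the largest kept hash
//
//         void add(uint64_t h){
//             if(heap.size() < k){ heap.push(h); }
//             else if(h < heap.top()){ heap.pop(); heap.push(h); } // evict current max
//         }
//     };
//
// Because every dataset is hashed with the same seed, two such sketches are
// directly comparable: the fraction of shared hashes estimates the Jaccard
// similarity of the underlying k-mer sets. The real command additionally keeps
// a hash->count map, so repeated occurrences update a count instead of
// re-entering the heap.
//-------------------------------------------------------------------------------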
//------------------------------------------------------------------------------- template class SelectKmersCommand { public: typedef typename Kmer::ModelCanonical ModelCanonical; typedef typename Kmer::ModelCanonical::Iterator ModelCanonicalIterator; typedef typename Kmer::Type KmerType; typedef typename Kmer::ModelCanonical::Kmer KmerCanonicalType; //typedef typename ModelCanonical::Kmer Lol; size_t _kmerSize; size_t _sketchSize; u_int32_t _seed; //vector _minHashValues; //vector _minHashKmers; ModelCanonical _model; ModelCanonicalIterator _itKmer; u_int64_t _hash_otpt[2]; bool _isMaster; //size_t _bufferIndex; //size_t _partitionId; //vector _bufferKmers; //vector _bufferCounts; //vector _minHashValues; //vector& _minHashValuesSynchronized; //vector _minHashKmers; //vector _minHashKmersCounts; struct KmerCountSorter{ bool operator() (u_int64_t l, u_int64_t r) { return r > l; } }; //typedef typename KmerCountSorter KmerSorter; std::priority_queue< u_int64_t, vector, KmerCountSorter> _kmerCountSorter; vector _kmers; //std::priority_queue< u_int64_t, vector, KmerCountSorter>& _kmerCountSorterSynch; //KmerCountDictionaryType& _kmerCountsSynch; Bloom* _bloomFilter; u_int64_t _nbInsertedKmersInBloom; vector& _hashedKmers; KmerCountDictionaryType& _kmerCounts; //ofstream _outputFile; bool _useAbundanceFilter; SelectKmersCommand(size_t kmerSize, size_t sketchSize, u_int32_t seed, Bloom* bloomFilter, vector& kmers, KmerCountDictionaryType& kmerCounts, bool useAbundanceFilter) : _model(kmerSize), _itKmer(_model), _bloomFilter(bloomFilter), _hashedKmers(kmers), _kmerCounts(kmerCounts) { _kmerSize = kmerSize; _sketchSize = sketchSize; _seed = seed; _isMaster = true; _nbInsertedKmersInBloom = 0; _useAbundanceFilter = useAbundanceFilter; } SelectKmersCommand(const SelectKmersCommand& copy) : _model(copy._kmerSize), _itKmer(_model), _bloomFilter(copy._bloomFilter), _hashedKmers(copy._hashedKmers), _kmerCounts(copy._kmerCounts) { _kmerSize = copy._kmerSize; _sketchSize = copy._sketchSize; _seed = copy._seed; _isMaster = false; _nbInsertedKmersInBloom = 0; _useAbundanceFilter = copy._useAbundanceFilter; } ~SelectKmersCommand(){ if(_isMaster) return; if(_kmerCountSorter.size() == 0) return; //cout << "deleteeeeee" << endl; size_t sketchSize = _kmerCountSorter.size(); //cout << sketchSize << endl; for(size_t i=0; i=0; i++){ if(seq[i] == 'A'){ rev += 'T'; } else if(seq[i] == 'C'){ rev += 'G'; } else if(seq[i] == 'G'){ rev += 'C'; } else if(seq[i] == 'T'){ rev += 'A'; } } return rev; } */ //void minRevComp(string& kmer){ //string revKmer = //} void operator()(Sequence& sequence){ _model.build(sequence.getData(), _kmers); //_itKmer.setData(sequence.getData()); //cout << sequence.toString() << endl; //size_t len = sequence.getDataSize() - _kmerSize + 1; /// _kmerSize; //char* data = sequence.getDataBuffer(); for(size_t i=0; i<_kmers.size(); i++){ KmerCanonicalType& kmer = _kmers[i]; // We iterate the sequence data by block of size kmerSize //for (size_t i=0; ivalue(); //KmerType kmerRev = revcomp(kmer.value(), _kmerSize); //string kmerStr = kmer.value().toString(_kmerSize); //string kmerStrRev = kmerRev.toString(_kmerSize); //if(kmerStrRev < kmerStr){ // kmerStr = kmerStrRev; //} u_int64_t kmerValue = kmer.value().getVal(); u_int64_t kmerHashed; MurmurHash3_x64_128 ((const char*)&kmerValue, sizeof(kmerValue), _seed, &_hash_otpt); kmerHashed = _hash_otpt[0]; //cout << kmerStr << ": " << kmerHashed << endl; //todo: verifier dabord si le kmer peut etre insérer, plus rapide que els accès au table de hachage 
(bloom et selected) //cout << _useAbundanceFilter << endl; if(_useAbundanceFilter){ processFiltered(kmer.value(), kmerHashed); } else{ processUnfiltered(kmerHashed); } //cout << kmer.isValid() << endl; // We update the occurrences number for this kmer value //distrib [kmer.value().toInt()] += 1; } /* for(_itKmer.first(); !_itKmer.isDone(); _itKmer.next()){ //cout << _itKmer->value().toString(_kmerSize) << endl; Lol kkaka = _itKmer->value().value(); //cout << _itKmer->value().isValid() << endl; KmerType kmer = _itKmer->value(); KmerType kmerRev = revcomp(kmer, _kmerSize); string kmerStr = kmer.toString(_kmerSize); string kmerStrRev = kmerRev.toString(_kmerSize); if(kmerStrRev < kmerStr){ kmerStr = kmerStrRev; } //u_int64_t kmerValue = kmer.getVal(); u_int64_t kmerHashed; MurmurHash3_x64_128 ( kmerStr.c_str(), _kmerSize, 42, &_hash_otpt); kmerHashed = _hash_otpt[0]; if(kmerHashed == 66908235404){ //cout << kmer.value().isValid() << endl; cout << sequence.toString() << endl; cout << kmerStr << endl; } //cout << kmerStr << ": " << kmerHashed << endl; //todo: verifier dabord si le kmer peut etre insérer, plus rapide que els accès au table de hachage (bloom et selected) //cout << _useAbundanceFilter << endl; if(_useAbundanceFilter){ processFiltered(kmer, kmerHashed); } else{ processUnfiltered(kmerHashed); } }*/ } inline void processUnfiltered(u_int64_t kmerHashed){ if(_kmerCountSorter.size() < _sketchSize){ if(_kmerCounts.find(kmerHashed) == _kmerCounts.end()){ _kmerCountSorter.push(kmerHashed); _kmerCounts[kmerHashed] = 1; //cout << _kmerCountSorter.size() << endl; } else{ _kmerCounts[kmerHashed] += 1; } } else{ if(kmerHashed < _kmerCountSorter.top()){ if(_kmerCounts.find(kmerHashed) == _kmerCounts.end()){ //cout << kmer << " " << _kmerCounts.size() << endl; u_int64_t greaterValue = _kmerCountSorter.top(); _kmerCounts.erase(greaterValue); _kmerCountSorter.pop(); _kmerCountSorter.push(kmerHashed); _kmerCounts[kmerHashed] = 1; } else{ _kmerCounts[kmerHashed] += 1; } } } } inline void processFiltered(const KmerType& kmer, u_int64_t kmerHashed){ if(_kmerCountSorter.size() < _sketchSize){ if(_bloomFilter->contains(kmer)){ //Filling the queue with first elements if(_kmerCounts.find(kmerHashed) == _kmerCounts.end()){ _kmerCountSorter.push(kmerHashed); _kmerCounts[kmerHashed] = 2; //cout << _kmerCountSorter.size() << endl; } else{ _kmerCounts[kmerHashed] += 1; } } else{ _bloomFilter->insert(kmer); _nbInsertedKmersInBloom += 1; } } else{ if(kmerHashed < _kmerCountSorter.top()){ if(_bloomFilter->contains(kmer)){ if(_kmerCounts.find(kmerHashed) == _kmerCounts.end()){ //cout << kmer << " " << _kmerCounts.size() << endl; u_int64_t greaterValue = _kmerCountSorter.top(); _kmerCounts.erase(greaterValue); _kmerCountSorter.pop(); _kmerCountSorter.push(kmerHashed); _kmerCounts[kmerHashed] = 2; } else{ _kmerCounts[kmerHashed] += 1; } } else{ _bloomFilter->insert(kmer); _nbInsertedKmersInBloom += 1; } } } } }; /* size_t _kmerSize; size_t _sketchSize; vector _minHashValues; vector _minHashKmers; ModelCanonical _model; ModelCanonicalIterator _itKmer; //ModelCanonical model (kmerSize); //ModelCanonical::Kmer kmer = model.codeSeed (seq, Data::ASCII); pthread_mutex_t* _mutex; vector& _minHashValuesSynchronized; vector& _minHashKmersSynchronized; MinhashSketcher(size_t kmerSize, size_t sketchSize, pthread_mutex_t* mutex, vector& minHashValuesSynchronized, vector& minHashKmersSynchronized) : _model(kmerSize), _itKmer(_model), _mutex(mutex), _minHashValuesSynchronized(minHashValuesSynchronized), 
_minHashKmersSynchronized(minHashKmersSynchronized) { _kmerSize = kmerSize; _sketchSize = sketchSize; ModelCanonical _model(_kmerSize); _minHashValues = vector(_sketchSize, -1); _minHashKmers = vector(_sketchSize, 0); } MinhashSketcher(const MinhashSketcher& copy) : _model(copy._kmerSize), _itKmer(_model), _mutex(copy._mutex), _minHashValuesSynchronized(copy._minHashValuesSynchronized), _minHashKmersSynchronized(copy._minHashKmersSynchronized) { _kmerSize = copy._kmerSize; _sketchSize = copy._sketchSize; _minHashValues = vector(_sketchSize, -1); _minHashKmers = vector(_sketchSize, 0); } ~MinhashSketcher(){ //cout << "deleteeeeee" << endl; pthread_mutex_lock(_mutex); for(size_t i=0; i<_sketchSize; i++){ if(_minHashValues[i] < _minHashValuesSynchronized[i]){ _minHashValuesSynchronized[i] = _minHashValues[i]; _minHashKmersSynchronized[i] = _minHashKmers[i]; //cout << _minHashKmers[i] << endl; } } pthread_mutex_unlock(_mutex); } */ /* template class StorageItKmerCount { public: typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; StorageItKmerCount(Iterator* it){ _it = it; } ~StorageItKmerCount(){ delete _it; } bool next(){ _it->next(); return !_it->isDone(); } Count& item(){ return _it->item(); } Iterator* _it; }; template class SimkaPartitionWriter { public: typedef typename Kmer::Type Type; typedef typename Kmer::Count Count; //typedef tuple*> KmerCount_It; struct Kmer_BankId_Count{ Type _type; u_int32_t _bankId; u_int16_t _count; Kmer_BankId_Count(){ } Kmer_BankId_Count(Type type, u_int64_t bankId, u_int64_t count){ _type = type; _bankId = bankId; _count = count; } }; string _outputDir; size_t _nbPartitions; vector _nbKmerPerParts; vector _nbDistinctKmerPerParts; vector _chordNiPerParts; vector* > _bags; vector* > _cachedBags; SimkaPartitionWriter(const string& oututDir, size_t nbPartitions){ _outputDir = oututDir; _nbPartitions = nbPartitions; _nbKmerPerParts = vector(_nbPartitions, 0); _nbDistinctKmerPerParts = vector(_nbPartitions, 0); _chordNiPerParts = vector(_nbPartitions, 0); //vector* > bags; //vector* > cachedBags; for(size_t i=0; i<_nbPartitions; i++){ //string outputFilename = _outputDir + "/" + _datasetID + "_" + Stringify::format("%i", i) + ".gz"; string outputFilename = _outputDir + "/" + Stringify::format("%i", i) + ".gz"; Bag* bag = new BagGzFile(outputFilename); Bag* cachedBag = new BagCache(bag, 10000); _cachedBags.push_back(cachedBag); //BagCache bagCache(*bag, 10000); _bags.push_back(bag); } } void insert(u_int64_t kmer, u_int64_t bankId, u_int64_t abundance){ //kmer.to_long(kmer_bin); size_t part = korenXor(kmer) % _nbPartitions; //hash_kmer(kmer_bin) % _nbPartitions; Type type; //(kmer_bin[0]); //type.setVal(kmer_bin[0]); type.setVal(kmer); //size_t part = oahash(kmer) % _nbPartitions; _cachedBags[part]->insert(Kmer_BankId_Count(type, bankId, abundance)); _nbDistinctKmerPerParts[part] += 1; _nbKmerPerParts[part] += abundance; _chordNiPerParts[part] += pow(abundance, 2); } void end(){ for(size_t i=0; i<_nbPartitions; i++){ //bags[i]->flush(); //cachedBags[i]->flush(); delete _cachedBags[i]; //delete bags[i]; } for(size_t i=0; i<_nbPartitions; i++){ string outputFilename = _outputDir + "/" + Stringify::format("%i", i) + ".gz"; checkGzFile(outputFilename); } } //There is a bug in simka, sometimes a gz file is erroneous at the end //It's really rare and I can't find it //My bad solution is to read the whole gz file as soon as it is close and a segfault will occur if it has a bad format //Of course it's a bad solution because it has a impact on simka 
performances... void checkGzFile(const string& filename){ IterableGzFile* gzFile = new IterableGzFile(filename, 10000); Iterator* it = gzFile->iterator(); it->first(); while(!it->isDone()){ it->next(); } delete it; delete gzFile; } inline u_int64_t korenXor(u_int64_t x)const{ x ^= (x << 21); x ^= (x >> 35); x ^= (x << 4); return x; } }; */ /********************************************************************* * ** SimkaAlgorithm *********************************************************************/ template class Simka2ComputeKmerSpectrumAlgorithm : public Algorithm { public: typedef typename Kmer::Type KmerType; typedef typename Kmer::ModelCanonical ModelCanonical; typedef typename Kmer::ModelCanonical::Iterator ModelCanonicalIterator; struct KmerCountSorter{ bool operator() (u_int64_t l, u_int64_t r) { return r > l; } }; //struct kxpcomp { bool operator() (KmerCount_It& l,KmerCount_It& r) { return (r._count.value < l._count.value); } } ; //u_int64_t _nbReads; //size_t _nbPartitions; u_int64_t _maxMemory; size_t _nbCores; string _outputDir; string _outputDirTemp; //size_t _nbBanks; string _inputFilename; //string _datasetID; u_int8_t _kmerSize; //pair _abundanceThreshold; //SIMKA_SOLID_KIND _solidKind; //bool _soliditySingle; int64_t _maxNbReads; size_t _minReadSize; double _minReadShannonIndex; //double _minKmerShannonIndex; //size_t _nbMinimizers; //size_t _nbCores; //SimkaStatistics* _stats; //SimkaDistance* _simkaDistance; //string _banksInputFilename; //string _h5Filename; //vector _tempFilenamesToDelete; //IBank* _banks; IProperties* _options; //size_t _localNbPartitions; //vector _bankNames; //vector _nbReadsPerDataset; //string _outputFilenameSuffix; //u_int64_t _totalKmers; //vector _nbBankPerDataset; //size_t _nbBankPerDataset; //string _largerBankId; //bool _computeSimpleDistances; //bool _computeComplexDistances; //bool _keepTmpFiles; //string _kmerDatataseFilename; //vector _cmds; //SimkaPartitionWriter* _partitionWriter; u_int32_t _seed; u_int32_t _sketchSize; bool _useAbundanceFilter; u_int32_t _nbDatasets; //pthread_mutex_t _mutex; //typedef typename SelectKmersCommand::KmerCountSorter KmerCountSorter; //std::priority_queue< u_int64_t, vector, KmerCountSorter> _kmerCountSorter; //KmerCountDictionaryType _kmerCounts; //size_t _nbBanks; //vector _bankNames; //vector _nbBankPerDataset; vector _threads; size_t _maxRunningThreads; vector _runningThreadIds; size_t _nbRunningThreads; vector _finishedThreads; mutex countKmersMutex; ofstream _outputFile; //string _outputFilenameKmers; //string _outputFilenameIds; IteratorListener* _progress; u_int64_t _progress_nbDatasetsToProcess; u_int64_t _progress_nbDatasetsProcessed; string _progress_text; Simka2ComputeKmerSpectrumAlgorithm(IProperties* options): Algorithm("simka", -1, options) { } void execute(){ //pthread_mutex_init(&_mutex, NULL); parseArgs(); createDirs(); cout << endl << "Checking input file validity..." 
<< endl; SimkaCommons::checkInputValidity(_outputDirTemp, _inputFilename, _progress_nbDatasetsToProcess); _progress = this->createIteratorListener (_progress_nbDatasetsToProcess, ""); //new ProgressSynchro ( //this->createIteratorListener (_progress_nbDatasetsToProcess, ""), //System::thread().newSynchronizer()); _progress->setMessage (Stringify::format (_progress_text.c_str(), _progress_nbDatasetsProcessed, _progress_nbDatasetsToProcess)); _progress->init (); countDatasets(); string command = "rm -rf " + _outputDirTemp; system(command.c_str()); cout << "Output results: " << _outputDir << endl; } void parseArgs(){ _options = getInput(); _seed = _options->getInt(STR_SIMKA_SEED); _sketchSize = _options->getInt(STR_SIMKA_SKETCH_SIZE); _useAbundanceFilter = _options->get(STR_SIMKA_ABUNDANCE_FILTER); _maxMemory = _options->getInt(STR_MAX_MEMORY); _nbCores = _options->getInt(STR_NB_CORES); _inputFilename = _options->getStr(STR_URI_INPUT); //_datasetID = _options->getStr(STR_SIMKA2_DATASET_ID); _outputDir = _options->getStr(STR_URI_OUTPUT); // ? _options->getStr(STR_URI_OUTPUT) : "./"; if(_outputDir.empty()) _outputDir = "./simkaMin_kmers.bin"; _outputDirTemp = System::file().getDirectory(_outputDir) + "/__simkaMin_temp__/"; //cout << "outputdir temp to check: " << _outputDirTemp << endl; //_outputDirTemp = _options->get(STR_URI_OUTPUT_TMP) ? _options->getStr(STR_URI_OUTPUT_TMP) : "./"; _kmerSize = _options->getInt(STR_KMER_SIZE); //_abundanceThreshold.first = _options->getInt(STR_KMER_ABUNDANCE_MIN); //_abundanceThreshold.second = min((u_int64_t)_options->getInt(STR_KMER_ABUNDANCE_MAX), (u_int64_t)(999999999)); //_nbPartitions = _options->getInt(STR_SIMKA2_NB_PARTITION); //cout << _options->getInt(STR_KMER_ABUNDANCE_MAX) << endl; //cout << _abundanceThreshold.second << endl; //_soliditySingle = _options->get(STR_SIMKA_SOLIDITY_PER_DATASET); //_nbMinimizers = _options->getInt(STR_KMER_PER_READ); //_maxDisk = getInput()->getInt(STR_MAX_DISK); //read filter _maxNbReads = _options->getInt(STR_SIMKA_MAX_READS); _minReadSize = _options->getInt(STR_SIMKA_MIN_READ_SIZE); _minReadShannonIndex = _options->getDouble(STR_SIMKA_MIN_READ_SHANNON_INDEX); _minReadShannonIndex = std::max(_minReadShannonIndex, 0.0); _minReadShannonIndex = std::min(_minReadShannonIndex, 2.0); if(!System::file().doesExist(_inputFilename)){ std::cerr << "Error: input does not exist (" << _inputFilename << ")" << std::endl; exit(1); } if(System::file().doesExist(_outputDir)){ std::cerr << "Error: output file already exist (" << _outputDir << ")" << std::endl; exit(1); } _progress_text = "Sketching datasets (%d/%d)"; //_nbBankPerDataset = _options->getInt("-nb-dataset"); //_minKmerShannonIndex = _options->getDouble(STR_SIMKA_MIN_KMER_SHANNON_INDEX); //_minKmerShannonIndex = std::max(_minKmerShannonIndex, 0.0); //_minKmerShannonIndex = std::min(_minKmerShannonIndex, 2.0); //if(!System::file().doesExist(_inputFilename)){ // cerr << "ERROR: Input filename does not exist" << endl; // exit(1); //} //if(!System::file().doesExist(_outputDir)){ // std::cerr << "Error: can't create output directory (" << _outputDir << ")" << std::endl; // exit(EXIT_FAILURE); /* int ok = System::file().mkdir(_outputDir, -1); if(ok != 0){ std::cerr << "Error: can't create output directory (" << _outputDir << ")" << std::endl; exit(1); }*/ //} //_outputDirTemp = _outputDirTemp; //if(!System::file().doesExist(_outputDirTemp)){ //std::cerr << "Error: can't create output temp directory (" << _outputDirTemp << ")" << std::endl; //exit(EXIT_FAILURE); /* int ok = 
System::file().mkdir(_outputDirTemp, -1); if(ok != 0){ std::cerr << "Error: can't create output temp directory (" << _outputDirTemp << ")" << std::endl; exit(1); }*/ //} //_outputDirTemp = System::file().getRealPath(_outputDirTemp) + "/"; //cout << _outputDirTemp << endl; //_outputDirTemp += "/" + _datasetID + "_temp" + "/"; //System::file().mkdir(_outputDirTemp, -1); //_options->setStr(STR_URI_OUTPUT_TMP, _outputDirTemp); //System::file().mkdir(_outputDirTemp + "/input/", -1); //_maxMemory = _maxMemory / 1000; //_maxMemory = max(_maxMemory, (u_int64_t) 1); /* if(_outputDir.empty()){ _outputDir = "./simkaMin_kmers.bin"; } else if (_outputDir.find(".") == std::string::npos){ _outputDir += ".bin"; } _outputDir = System::file().getBaseName(_outputDir); cout << endl << endl; cout << _outputDir << endl; vector fields; stringstream outputFilenameStream(_outputDir); string field; while(std::getline(outputFilenameStream, field, '.')) { cout << field << endl; fields.push_back(field); } string prefix = fields[0]; string extension = ""; for(size_t i=1; isetStr(STR_URI_OUTPUT_TMP, _outputDirTemp); //System::file().mkdir(_outputDirTemp + "/input/", -1); } void countDatasets(){ //cout << endl << endl; //cout << "Sketching..." << endl; _outputFile.open(_outputDir, ios::binary); //Save sketch info //u_int8_t kmerSize = _kmerSize; //u_int32_t sketchSize = _sketchSize; //u_int32_t seed = _seed; _nbDatasets = 0; _outputFile.write((const char*)&_kmerSize, sizeof(_kmerSize)); _outputFile.write((const char*)&_sketchSize, sizeof(_sketchSize)); _outputFile.write((const char*)&_seed, sizeof(_seed)); _outputFile.write((const char*)&_nbDatasets, sizeof(_nbDatasets)); //cout << _maxRunningThreads << endl; size_t threadId = 0; //vector threads; //(_nbCores); //_isThreadRunning = vector(_nbCores); _nbRunningThreads = 0; _maxRunningThreads = _nbCores; string inputDir = _outputDirTemp; // + "/input/"; ifstream inputFile(_inputFilename.c_str()); //ofstream outputFileIds(_outputFilenameIds.c_str(), ios::binary); //_banksInputFilename = inputDir + "__input_simka__"; //_inputFilename + "_dsk_dataset_temp__"; //IFile* bankFile = System::file().newFile(_banksInputFilename, "wb"); string line; string linePart; vector lineIdDatasets; vector linepartPairedDatasets; vector linepartDatasets; //string bankFileContents = ""; size_t datasetId = 0; u_int64_t lineIndex = 0; u_int64_t bankIdBytePos = 0; while(getline(inputFile, line)){ line.erase(std::remove(line.begin(),line.end(),' '),line.end()); if(line == "") continue; //cout << line << endl; lineIdDatasets.clear(); linepartPairedDatasets.clear(); //vector filenames; stringstream lineStream(line); while(getline(lineStream, linePart, ':')){ lineIdDatasets.push_back(linePart); } string bankId = lineIdDatasets[0]; string linePairedDatasets = lineIdDatasets[1]; stringstream linePairedDatasetsStream(linePairedDatasets); while(getline(linePairedDatasetsStream, linePart, ';')){ linepartPairedDatasets.push_back(linePart); } string subBankFilename = inputDir + bankId; IFile* subBankFile = System::file().newFile(subBankFilename, "wb"); //cout << subBankFile->getPath() << endl; string subBankContents = ""; size_t nbBankPerDataset = linepartPairedDatasets.size(); for(size_t i=0; ifwrite(subBankContents.c_str(), subBankContents.size(), 1); subBankFile->flush(); delete subBankFile; //bankFileContents += inputDir + "/" + bankId + "\n"; lineIndex += 1; startNewThread(datasetId, subBankFilename, nbBankPerDataset); //count(); //_bankNames.push_back(bankId); datasetId += 1; _nbDatasets += 1; } 
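		// Layout of the sketch file being written here (reconstructed from the
		// reads/writes in this class; the getFilePosition_* helpers in
		// SimkaMinCommons rely on these offsets):
		//
		//   [u_int8_t  kmerSize  ]
		//   [u_int32_t sketchSize]
		//   [u_int32_t seed      ]
		//   [u_int32_t nbDatasets]   <- patched in afterwards by writeIds()
		//   [dataset 0: sketchSize * sizeof(KmerAndCountType) bytes]  (fixed-size block)
		//   [dataset 1: sketchSize * sizeof(KmerAndCountType) bytes]
		//   ...
		//   [ids: per dataset, u_int8_t idSize followed by idSize bytes]
		//
		// Because every per-dataset block has the same fixed size, each worker
		// thread can seek independently to
		//   datasetId * sketchSize * sizeof(KmerAndCountType) + KMER_SPECTRUM_HEADER_SIZE
		// and write its sketch without coordinating with the other threads.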
//bankFileContents.erase(bankFileContents.size()-1); //bankFile->fwrite(bankFileContents.c_str(), bankFileContents.size(), 1); //bankFile->flush(); //delete bankFile; joinThreads(); _progress->finish(); inputFile.close(); writeIds(); //outputFileIds.seekp(0); //outputFileIds.write((const char*)&nbDatasets, sizeof(nbDatasets)); _outputFile.close(); //outputFileIds.close(); } void writeIds(){ _outputFile.seekp(SimkaMinCommons::getFilePosition_nbDatasets()); _outputFile.write((const char*)&_nbDatasets, sizeof(_nbDatasets)); _outputFile.seekp(SimkaMinCommons::getFilePosition_sketchIds(_nbDatasets, _sketchSize)); ifstream inputFile(_inputFilename.c_str()); string line; string linePart; vector lineIdDatasets; while(getline(inputFile, line)){ line.erase(std::remove(line.begin(),line.end(),' '),line.end()); if(line == "") continue; lineIdDatasets.clear(); stringstream lineStream(line); while(getline(lineStream, linePart, ':')){ lineIdDatasets.push_back(linePart); } string bankId = lineIdDatasets[0]; u_int8_t idSize = bankId.size(); _outputFile.write((const char*)& idSize, sizeof(idSize)); _outputFile.write(bankId.c_str(), bankId.size()); } inputFile.close(); } void startNewThread(size_t datasetId, const string& inputFilename, size_t nbBankPerDataset){ //for (size_t i=0; i<_nbBanks; i++){ // cout << i << endl; thread* t = new thread(&Simka2ComputeKmerSpectrumAlgorithm::countKmersOfDataset, this, datasetId, inputFilename, nbBankPerDataset); _threads.push_back(t); _runningThreadIds.push_back(datasetId); //threadId += 1; _nbRunningThreads += 1; //_isThreadRunning[threadId] = true; //_nbRunningThreads[i] += 1; if(_nbRunningThreads >= _maxRunningThreads){ waitThreads(); } //} //string filename = _outputDirTemp + "/selectedKmers.bin"; //ofstream selectKmersFile(filename.c_str(), ios::binary); //cout << _selectedKmerSorter.size() << " " << _nbUsedKmers << endl; //_selectedKmerSorter.pop(); //there is always one extra element because of a >= optimization... //cout << _selectedKmerSorter.size() << " " << _nbUsedKmers << endl; //u_int64_t size = _selectedKmerSorter.size(); //for(size_t i=0; i > _skecthCounts; //unordered_map > _; void countKmersOfDataset(size_t datasetId, const string& inputFilename, size_t nbBankPerDataset){ //TODO lock probably not required //countKmersMutex.lock(); //cout << "start: " << inputFilename << endl; //countKmersMutex.unlock(); IBank* bank = Bank::open(inputFilename); LOCAL(bank); SimkaSequenceFilter sequenceFilter(_minReadSize, _minReadShannonIndex); IBank* filteredBank = new SimkaPotaraBankFiltered(bank, sequenceFilter, _maxNbReads, nbBankPerDataset); LOCAL(filteredBank); Iterator* itSeq = filteredBank->iterator(); LOCAL(itSeq); //Iterator* itSeq = createIterator ( // filteredBank->iterator(), // filteredBank->estimateNbItems(), // "Computing minhash sketch and counting" //); //LOCAL(itSeq); IDispatcher* dispatcher = new SerialDispatcher(); Bloom* bloomFilter = 0; if(_useAbundanceFilter){ u_int64_t bloomMemoryBits = (_maxMemory * MBYTE * 8) / _maxRunningThreads; bloomMemoryBits = max(bloomMemoryBits, (u_int64_t) 10000); bloomFilter = new BloomCacheCoherent(bloomMemoryBits, 7); } //mutex commandMutex; //std::priority_queue< u_int64_t, vector, KmerCountSorter> kmerCountSorter; //unordered_map kmerCounts; vector kmers(_sketchSize, 0); //TODO only used for reversing kmers not really optimized... 
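		// SelectKmersCommand (defined elsewhere) fills `kmers` with the dataset's
		// bottom sketch: conceptually, every canonical k-mer is hashed with the
		// shared seed and only the _sketchSize smallest hash values are kept,
		// with counts tracked alongside for the abundance-based distances.
		// Illustrative bottom-k selection only, not the actual implementation:
		//
		//   std::priority_queue<u_int64_t> sketch;          // max-heap on hash values
		//   for (u_int64_t h : kmerHashes) {
		//       if (sketch.size() < _sketchSize) sketch.push(h);
		//       else if (h < sketch.top()) { sketch.pop(); sketch.push(h); }
		//   }
		//
		// Since all datasets use the same hash function and seed, any two
		// sketches stay comparable and their overlap estimates the Jaccard
		// index of the full k-mer sets.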
KmerCountDictionaryType _kmerCounts; { SelectKmersCommand command(_kmerSize, _sketchSize, _seed, bloomFilter, kmers, _kmerCounts, _useAbundanceFilter); dispatcher->iterate (itSeq, command, 1000); } /* ModelCanonical model; ModelCanonicalIterator itKmer(model); u_int64_t _hash_otpt[2]; u_int64_t _nbInsertedKmersInBloom = 0; for(itSeq->first(); !itSeq->isDone(); itSeq->next()){ Sequence& sequence = itSeq->item(); } */ delete dispatcher; delete bloomFilter; countKmersMutex.lock(); u_int64_t filePos = (datasetId * _sketchSize * sizeof(KmerAndCountType)) + KMER_SPECTRUM_HEADER_SIZE; //cout << "DATASTE ID: " << datasetId << " " << filePos << endl; _outputFile.seekp(filePos); //_kmerCountSorter.pop(); //Discard greater element because queue size is always equal to (_sketchSize + 1) because of an optimization //cout << "----------" << endl; for(size_t i=0; iinsert(kmerCount._kmer, _datasetIDbin, kmerCount._count); //_kmerCountSorter.pop(); //_partitionWriter->insert(_minHashKmers[i], _datasetIDbin, _minHashKmersCounts[i] ); //cout << _minHashKmers[i] << " " << _minHashKmersCounts[i] << endl; } System::file().remove(inputFilename); _progress_nbDatasetsProcessed += 1; _progress->setMessage (Stringify::format (_progress_text.c_str(), _progress_nbDatasetsProcessed, _progress_nbDatasetsToProcess)); _progress->inc(1); //cout << "end: " << inputFilename << endl; _finishedThreads.push_back(datasetId); countKmersMutex.unlock(); } void waitThreads(){ while(1){ bool isThreadAvailbale = false; countKmersMutex.lock(); for(size_t i=0; i<_finishedThreads.size(); i++){ size_t threadId = _finishedThreads[i]; //_runningThreadIds.erase(std::remove(_runningThreadIds.begin(), _runningThreadIds.end(), threadId), _runningThreadIds.end()); auto it = find(_runningThreadIds.begin(), _runningThreadIds.end(), threadId); int pos = distance(_runningThreadIds.begin(), it); //cout << "\t removing thread " << threadId << " (pos: " << pos << ")" << endl; _runningThreadIds.erase(_runningThreadIds.begin()+pos); _threads[pos]->join(); delete _threads[pos]; _threads.erase(_threads.begin()+pos); _nbRunningThreads -= 1; isThreadAvailbale = true; } _finishedThreads.clear(); countKmersMutex.unlock(); if(isThreadAvailbale){ //cout << _runningThreadIds.size() << " " << _threads.size() << endl; //countKmersMutex.unlock(); break; } sleep(1); } } void joinThreads(){ while(_nbRunningThreads > 0) waitThreads(); } }; class Simka2ComputeKmerSpectrum : public Tool{ public: Simka2ComputeKmerSpectrum(): Tool ("SimkaMin-ComputeKmerSpectrum"){ IOptionsParser* parser = getParser();//new OptionsParser ("Simka2 - Compute Kmer Spectrum"); //Main parser //parser->push_front (new OptionNoParam (STR_SIMKA_COMPUTE_DATA_INFO, "compute (and display) information before running Simka, such as the number of reads per dataset", false)); //parser->push_front (new OptionNoParam (STR_SIMKA_KEEP_TMP_FILES, "keep temporary files", false)); //parser->push_front (new OptionOneParam (STR_URI_OUTPUT_TMP, "output directory for temporary files", true)); parser->push_front (new OptionOneParam (STR_SIMKA_SEED, "seed used for random k-mer selection", false, "100")); parser->push_front (new OptionOneParam (STR_URI_OUTPUT, "output filename for kmer spectrum", false, "./simkaMin_kmers.bin")); parser->push_front (new OptionOneParam (STR_URI_INPUT, "input filename | TODO SPECIF", true)); //parser->push_front (new OptionOneParam (STR_SIMKA2_DATASET_ID, "identifier of the input dataset", true)); //parser->push_back (new OptionOneParam (STR_URI_OUTPUT_TMP, "output directory for temporary 
files", true)); //IOptionsParser* parser = getParser(); //IOptionsParser* dskParser = SortingCountAlgorithm<>::getOptionsParser(); //parser->push_back(dskParser); //dskParser->setVisible(false); //cout << parser->getParser(STR_NB_CORES) << endl; // //parser->push_back(new OptionOneParam(parser->getParser(STR_NB_CORES)->getName(), parser->getParser(STR_NB_CORES)->getHelp(), false, "0")); //parser->push_front(dskParser->getParser (STR_URI_OUTPUT_TMP)); //dskParser->getParser (STR_URI_OUTPUT_TMP)->setMandatory //parser->push_front(dskParser->getParser (STR_URI_OUTPUT)); //parser->getParser (STR_URI_OUTPUT)->setHelp("output directory for result files (similarity matrix, heatmaps)"); //parser->push_front(dskParser->getParser (STR_URI_INPUT)); //parser->getParser(STR_URI_INPUT)->setHelp("input file of datasets. One dataset per line: id filename1 filename2..."); //if (Option* p = dynamic_cast (parser->getParser(STR_URI_OUTPUT_TMP))) { p->s; } //Distance parser //IOptionsParser* distanceParser = new OptionsParser ("distance"); //distanceParser->push_back (new OptionNoParam (STR_SIMKA_COMPUTE_ALL_SIMPLE_DISTANCES, "compute all simple distances (Chord, Hellinger...)", false)); //distanceParser->push_back (new OptionNoParam (STR_SIMKA_COMPUTE_ALL_COMPLEX_DISTANCES, "compute all complex distances (Jensen-Shannon...)", false)); //Kmer parser IOptionsParser* kmerParser = new OptionsParser ("kmer"); kmerParser->push_back (new OptionOneParam (STR_KMER_SIZE, "size of a kmer", false, "21")); kmerParser->push_back (new OptionOneParam (STR_SIMKA_SKETCH_SIZE, "number of kmers used to compute distances", false, "100000")); kmerParser->push_back (new OptionNoParam (STR_SIMKA_ABUNDANCE_FILTER, "filter out k-mer seen one time (potentially erroneous)", false)); //kmerParser->push_back(dskParser->getParser (STR_KMER_SIZE)); //kmerParser->push_back(new OptionOneParam (STR_KMER_PER_READ.c_str(), "number of selected kmers per read", false, "0")); //kmerParser->push_back (new OptionOneParam (STR_KMER_ABUNDANCE_MIN, "min abundance a kmer need to be considered", false, "1")); //kmerParser->push_back (new OptionOneParam (STR_KMER_ABUNDANCE_MIN, "min abundance a kmer need to be considered", false, "2")); //KmerCountType maxAbundance = -1; //kmerParser->push_back (new OptionOneParam (STR_KMER_ABUNDANCE_MAX, "max abundance a kmer can have to be considered", false, Stringify::format("%i", maxAbundance))); //kmerParser->push_back(dskParser->getParser (STR_KMER_ABUNDANCE_MIN)); //if (Option* p = dynamic_cast (parser->getParser(STR_KMER_ABUNDANCE_MIN))) { p->setDefaultValue ("0"); } //if (Option* p = dynamic_cast (parser->getParser(STR_SOLIDITY_KIND))) { p->setDefaultValue ("all"); } //kmerParser->push_back(dskParser->getParser (STR_KMER_ABUNDANCE_MAX)); //kmerParser->push_back(dskParser->getParser (STR_SOLIDITY_KIND)); //kmerParser->getParser (STR_SOLIDITY_KIND)->setHelp("TODO"); //kmerParser->push_back (new OptionNoParam (STR_SIMKA_SOLIDITY_PER_DATASET.c_str(), "do not take into consideration multi-counting when determining solid kmers", false )); //kmerParser->push_back (new OptionOneParam (STR_SIMKA_MIN_KMER_SHANNON_INDEX.c_str(), "minimal Shannon index a kmer should have to be kept. Float in [0,2]", false, "0" )); //Read filter parser IOptionsParser* readParser = new OptionsParser ("read"); readParser->push_back (new OptionOneParam (STR_SIMKA_MAX_READS.c_str(), "maximum number of reads to process. 
Set to 0 to use all reads", false, "0" )); readParser->push_back (new OptionOneParam (STR_SIMKA_MIN_READ_SIZE.c_str(), "minimal size a read should have to be kept", false, "0" )); readParser->push_back (new OptionOneParam (STR_SIMKA_MIN_READ_SHANNON_INDEX.c_str(), "minimal Shannon index a read should have to be kept. Float in [0,2]", false, "0" )); //readParser->push_back (new OptionOneParam ("-nb-dataset", "nb paired datasets", true)); //Core parser IOptionsParser* coreParser = new OptionsParser ("core"); coreParser->push_back(new OptionOneParam(STR_NB_CORES, "number of cores", false, "0")); coreParser->push_back (new OptionOneParam (STR_MAX_MEMORY, "max memory (MB). Only used if -filter is enabled", false, "8000")); //coreParser->push_back (new OptionOneParam (STR_SIMKA2_NB_PARTITION, "nb partitions", true)); //coreParser->push_back(dskParser->getParser ()); //coreParser->push_back(dskParser->getParser (STR_MAX_DISK)); //Distances //IOptionsParser* distanceParser = new OptionsParser ("distances"); //distanceParser->push_back (new OptionNoParam (STR_SIMKA_DISTANCE_BRAYCURTIS.c_str(), "compute Bray Curtis distance")); //distanceParser->push_back (new OptionNoParam (STR_SIMKA_DISTANCE_CHORD.c_str(), "compute Chord distance")); //distanceParser->push_back (new OptionNoParam (STR_SIMKA_DISTANCE_HELLINGER.c_str(), "compute Hellinger distance")); //distanceParser->push_back (new OptionNoParam (STR_SIMKA_DISTANCE_CANBERRA.c_str(), "compute Canberra distance")); //distanceParser->push_back (new OptionNoParam (STR_SIMKA_DISTANCE_KULCZYNSKI.c_str(), "compute Kulczynski distance")); //parser->push_back(distanceParser); parser->push_back(kmerParser); parser->push_back(readParser); parser->push_back(coreParser); //parser->push_back(distanceParser); //IOptionsParser* dskParser = SortingCountAlgorithm<>::getOptionsParser(); //if (Option* p = dynamic_cast (dskParser->getParser(STR_MINIMIZER_SIZE))) { p->setDefaultValue ("7"); } //parser->push_back(dskParser); //if (Option* p = dynamic_cast (dskParser->getParser(STR_MINIMIZER_SIZE))) { p->setDefaultValue ("7"); } //dskParser->setVisible(false); parser->getParser(STR_NB_CORES)->setVisible(false); //getParser()->push_back(parser); //if (Option* p = dynamic_cast (parser->getParser(STR_SOLIDITY_KIND))) { p->setDefaultValue ("all"); } //return parser; } ~Simka2ComputeKmerSpectrum(){ } struct Parameter { //Parameter (Simka& simka, IProperties* props) : props(props) {} Parameter (IProperties* props) : _props(props) {} //Simka& _simka; IProperties* _props; }; template struct Functor { void operator () (Parameter p){ Simka2ComputeKmerSpectrumAlgorithm* algo = new Simka2ComputeKmerSpectrumAlgorithm(p._props); algo->execute(); delete algo; } }; void execute () { IProperties* input = getInput(); //Parameter params(*this, getInput()); Parameter params(input); size_t kmerSize = getInput()->getInt (STR_KMER_SIZE); Integer::apply (kmerSize, params); } }; #endif /* SIMKA1_4_SRC_SIMKAMIN_SIMKAMINCOUNT_HPP_ */ simka-1.5.3/src/simkaMin/SimkaMinDistance.hpp000066400000000000000000000734601377312000000210620ustar00rootroot00000000000000/***************************************************************************** * SimkaMin: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2019 INRIA * Authors: G.Benoit * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software 
Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . *****************************************************************************/ #ifndef SIMKA1_4_SRC_SIMKAMIN_SIMKAMINDISTANCE_HPP_ #define SIMKA1_4_SRC_SIMKAMIN_SIMKAMINDISTANCE_HPP_ #include "SimkaMinCommons.hpp" #include class KmerSpectrumIterator{ public: //FILE * _is; //ifstream _kmerSpectrumFile; size_t _sketchSize; bool _isDone; u_int64_t _nbItems; size_t _datasetId; vector >& _kmercountSketches; size_t _nbDatasetOffset; //u_int8_t* _buffer; //u_int64_t _bufferSize; //KmerAndCountType* _buffer; //vector > _buffers; //vector _buffers_isInit; KmerSpectrumIterator(const string& filename, vector >& kmercountSketches, size_t nbDatasetOffset) : _kmercountSketches(kmercountSketches) { //_buffer = 0; //_is = fopen((filename).c_str(), "rb"); //_kmerSpectrumFile.open(filename + ".kmers", ios::binary); //_sketchSize = sketchSize; _nbDatasetOffset = nbDatasetOffset; //cout << nbDatasetOffset << endl; //_buffer = (KmerAndCountType*) MALLOC (sizeof(KmerAndCountType) * _sketchSize); } ~KmerSpectrumIterator(){ //fclose(_is); //_kmerSpectrumFile.close(); //if(_buffer){FREE (_buffer);} } void first(size_t datasetId){ //cout << datasetId << " " << _nbDatasetOffset << endl; _datasetId = datasetId-_nbDatasetOffset; //cout << datasetId << " " << _nbDatasetOffset << " " << _datasetId<< endl; //if(_buffer){FREE (_buffer);} //u_int64_t pos = KMER_SPECTRUM_HEADER_SIZE + (datasetId*_sketchSize*sizeof(KmerAndCountType)); //fseek(_is, pos, SEEK_SET); _nbItems = 0; _sketchSize = _kmercountSketches[_datasetId].size(); //cout << sizeof(KmerAndCountType) << endl; //_kmerSpectrumFile.read((char*)_buffer, 10*_sketchSize); //int res = fread(_buffer, sizeof(KmerAndCountType), _sketchSize, _is); } inline bool isDone(){ return _nbItems >= _sketchSize; } inline void next(u_int64_t& kmer, KmerCountType& count){ //KmerAndCountType kmerCount; // = _buffer[_nbItems]; //memcpy(&kmerCount, &_buffer[_nbItems*10], 10); KmerAndCountType& kmerCount = _kmercountSketches[_datasetId][_nbItems]; //cout << _datasetId << " " << _nbDatasetOffset << " " << _kmercountSketches[_datasetId].size() << endl; kmer = kmerCount._kmer; count = kmerCount._count; //cout << kmer << " " << count << endl; //_kmerSpectrumFile.read((char*)(&kmer), sizeof(kmer)); //_kmerSpectrumFile.read((char*)(&count), sizeof(count)); _nbItems += 1; } }; class ComputeDistanceManager{ public: ofstream& _distanceMatrixJaccard; ofstream& _distanceMatrixBrayCurtis; //size_t _sketchSize; KmerSpectrumIterator* _kmerSpectrumiterator1; KmerSpectrumIterator* _kmerSpectrumiterator2; bool _isSymmetrical; u_int64_t _nbDistinctKmers; u_int64_t _nbDistinctSharedKmers; u_int64_t _nbKmers; u_int64_t _nbSharedKmers; size_t _nbDatasets1; size_t _nbDatasets2; vector _jaccardDistances; vector _braycurtisDistances; u_int64_t _jaccardDistances_nb; mutex& _mutex; //u_int64_t nbLala; ComputeDistanceManager(const string& filename1, const string& filename2, ofstream& distanceMatrixJaccard, ofstream& distanceMatrixBrayCurtis, bool isSymmetrical, size_t nbDatasets1, size_t nbDatasets2, mutex& mutex, size_t main_start_i, size_t main_start_j, size_t n_i, 
size_t n_j, vector >& _kmercountSketches_i, vector >& _kmercountSketches_j) : _distanceMatrixJaccard(distanceMatrixJaccard), _distanceMatrixBrayCurtis(distanceMatrixBrayCurtis), _mutex(mutex) { //_sketchSize = sketchSize; _kmerSpectrumiterator1 = new KmerSpectrumIterator(filename1, _kmercountSketches_i, main_start_i); _kmerSpectrumiterator2 = new KmerSpectrumIterator(filename2, _kmercountSketches_j, main_start_j); _isSymmetrical = isSymmetrical; _nbDatasets1 = nbDatasets1; _nbDatasets2 = nbDatasets2; _jaccardDistances.resize(1000); _braycurtisDistances.resize(1000); _jaccardDistances_nb = 0; //nbLala = 0; } ~ComputeDistanceManager(){ delete _kmerSpectrumiterator1; delete _kmerSpectrumiterator2; if(_jaccardDistances_nb > 0){ writeDistances(); /* _mutex.lock(); for(size_t i=0; i<_jaccardDistances.size() ; i++){ PairwiseDistance& jaccard = _jaccardDistances[i]; PairwiseDistance& braycurtis = _braycurtisDistances[i]; u_int64_t pos = jaccard._i*_nbDatasets2*sizeof(DistanceValueType) + (jaccard._j*sizeof(DistanceValueType)); _distanceMatrixJaccard.seekp(pos); _distanceMatrixBrayCurtis.seekp(pos); _distanceMatrixJaccard.write((const char*)&jaccard._distance, sizeof(jaccard._distance)); _distanceMatrixBrayCurtis.write((const char*)&braycurtis._distance, sizeof(braycurtis._distance)); if(_isSymmetrical){ u_int64_t pos = jaccard._j*_nbDatasets1*sizeof(DistanceValueType) + (jaccard._i*sizeof(DistanceValueType)); _distanceMatrixJaccard.seekp(pos); _distanceMatrixBrayCurtis.seekp(pos); _distanceMatrixJaccard.write((const char*)&jaccard._distance, sizeof(jaccard._distance)); _distanceMatrixBrayCurtis.write((const char*)&braycurtis._distance, sizeof(braycurtis._distance)); } } _mutex.unlock(); _jaccardDistances_nb = 0; //_braycurtisDistances.clear(); //_jaccardDistances.clear(); */ } //cout << nbLala << endl; } void computeDistance_unsynch(size_t i, size_t j){ //nbLala += 1; //_mutex.lock(); //lala += 1; //_mutex.unlock(); _nbDistinctSharedKmers = 0; _nbDistinctKmers = 0; _nbKmers = 0; _nbSharedKmers = 0; _kmerSpectrumiterator1->first(i); _kmerSpectrumiterator2->first(j); u_int64_t sketchSize = min(_kmerSpectrumiterator1->_sketchSize, _kmerSpectrumiterator2->_sketchSize); u_int64_t kmer1; u_int64_t kmer2; KmerCountType count1; KmerCountType count2; _kmerSpectrumiterator1->next(kmer1, count1); _kmerSpectrumiterator2->next(kmer2, count2); while(_nbDistinctKmers < sketchSize){ //_nbDistinctKmers < _sketchSize && (!_kmerSpectrumiterator1->isDone()) && (!_kmerSpectrumiterator2->isDone()) ){ //cout << kmer1 << " " << kmer2 << endl; if(kmer1 > kmer2){ _nbDistinctKmers += 1; _nbKmers += count2; if(_kmerSpectrumiterator2->isDone()) break; _kmerSpectrumiterator2->next(kmer2, count2); } else if(kmer1 < kmer2){ _nbDistinctKmers += 1; _nbKmers += count1; if(_kmerSpectrumiterator1->isDone()) break; _kmerSpectrumiterator1->next(kmer1, count1); } else{ _nbDistinctKmers += 1; _nbKmers += count1 + count2; _nbDistinctSharedKmers += 1; _nbSharedKmers += min(count1, count2); if(_kmerSpectrumiterator2->isDone() || _kmerSpectrumiterator1->isDone()) break; _kmerSpectrumiterator1->next(kmer1, count1); _kmerSpectrumiterator2->next(kmer2, count2); } } DistanceValueType jaccard; DistanceValueType braycurtis; if(_nbDistinctKmers == 0){ jaccard = 1; } else{ jaccard = 1 - (long double) _nbDistinctSharedKmers / (long double) _nbDistinctKmers; } if(_nbKmers == 0){ braycurtis = 1; }else{ braycurtis = 1 - (long double) (2*_nbSharedKmers) / (long double) _nbKmers; } //_mutex.lock(); //cout << i << " " << j << " " << braycurtis << endl; 
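		// Estimators computed above, on the merged stream of the two bottom
		// sketches (the merge stops after `sketchSize` distinct k-mers):
		//
		//   jaccard    = 1 - |A ∩ B| / |A ∪ B|                (presence/absence)
		//   braycurtis = 1 - 2 * Σ min(c1,c2) / Σ (c1 + c2)   (abundance-weighted)
		//
		// The zero-denominator edge cases return a distance of 1 (maximally
		// distant) when a sketch is empty, so the matrix code downstream never
		// divides by zero.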
//_mutex.unlock(); _jaccardDistances[_jaccardDistances_nb].set(i, j, jaccard); _braycurtisDistances[_jaccardDistances_nb].set(i, j, braycurtis); _jaccardDistances_nb += 1; if(_jaccardDistances_nb == _jaccardDistances.size()){ writeDistances(); //_braycurtisDistances.clear(); //_jaccardDistances.clear(); } //cout << "NB DISTINCT KMERS: " << _nbDistinctKmers << endl; //cout << "NB SHARED DISTINCT KMERS: " << _nbDistinctSharedKmers << endl; //cout << "JACCARD: " << << endl; //cout << "BRAY CURTIS: " << 1 - (long double) (2*_nbSharedKmers) / (long double) _nbKmers << endl; } void writeDistances(){ _mutex.lock(); for(size_t i=0; i<_jaccardDistances_nb ; i++){ PairwiseDistance& jaccard = _jaccardDistances[i]; PairwiseDistance& braycurtis = _braycurtisDistances[i]; u_int64_t pos = jaccard._i*_nbDatasets2*sizeof(DistanceValueType) + (jaccard._j*sizeof(DistanceValueType)); _distanceMatrixJaccard.seekp(pos); _distanceMatrixBrayCurtis.seekp(pos); _distanceMatrixJaccard.write((const char*)&jaccard._distance, sizeof(jaccard._distance)); _distanceMatrixBrayCurtis.write((const char*)&braycurtis._distance, sizeof(braycurtis._distance)); if(_isSymmetrical){ u_int64_t pos = jaccard._j*_nbDatasets1*sizeof(DistanceValueType) + (jaccard._i*sizeof(DistanceValueType)); _distanceMatrixJaccard.seekp(pos); _distanceMatrixBrayCurtis.seekp(pos); _distanceMatrixJaccard.write((const char*)&jaccard._distance, sizeof(jaccard._distance)); _distanceMatrixBrayCurtis.write((const char*)&braycurtis._distance, sizeof(braycurtis._distance)); } } _mutex.unlock(); _jaccardDistances_nb = 0; } }; class SimkaMinDistanceAlgorithm : public Algorithm { public: size_t _nbCores; string _outputDir; string _inputFilename1; string _inputFilename2; //pair _abundanceThreshold; //SIMKA_SOLID_KIND _solidKind; //bool _soliditySingle; //int64_t _maxNbReads; //size_t _minReadSize; //double _minReadShannonIndex; //double _minKmerShannonIndex; //size_t _nbMinimizers; //size_t _nbCores; //SimkaStatistics* _stats; //SimkaDistance* _simkaDistance; //string _banksInputFilename; //string _h5Filename; //vector _tempFilenamesToDelete; //IBank* _banks; IProperties* _options; //size_t _localNbPartitions; //vector _bankNames; //vector _nbReadsPerDataset; //string _outputFilenameSuffix; //u_int64_t _totalKmers; //vector _nbBankPerDataset; //size_t _nbBankPerDataset; //string _largerBankId; //bool _computeSimpleDistances; //bool _computeComplexDistances; //bool _keepTmpFiles; //string _kmerDatataseFilename; //vector _cmds; //SimkaPartitionWriter* _partitionWriter; //vector> _bufferKmers; //vector> _bufferCounts; //vector _bufferIndex; //vector _minHashValues; //vector _minHashKmers; //vector _minHashKmersCounts; u_int32_t _sketchSize_1, _sketchSize_2; u_int32_t _seed; //pthread_mutex_t _mutex; //typedef typename SelectKmersCommand::KmerCountSorter KmerCountSorter; //std::priority_queue< u_int64_t, vector, KmerCountSorter> _kmerCountSorter; //KmerCountDictionaryType _kmerCounts; size_t _nbBanks; //vector _bankNames; //vector _nbBankPerDataset; vector _threads; size_t _maxRunningThreads; vector _runningThreadIds; size_t _nbRunningThreads; vector _finishedThreads; mutex countKmersMutex; //vector _datasetIds1; //vector _datasetIds2; u_int32_t _nbDataset1; u_int32_t _nbDataset2; ofstream _distanceMatrixJaccard; ofstream _distanceMatrixBrayCurtis; mutex _mutex; IteratorListener* _progress; u_int64_t _progress_distanceStep; //u_int64_t _progress_nbDistancesToCompute; u_int64_t _progress_nbDistancesComputed; //string _progress_text; size_t _start_i, _start_j; size_t 
_n_i, _n_j; vector > _kmercountSketches_i; vector > _kmercountSketches_j; SimkaMinDistanceAlgorithm(IProperties* options): Algorithm("simkaMinDistanceAlgorithm", -1, options) { } void execute(){ //pthread_mutex_init(&_mutex, NULL); parseArgs(); readInfos(); loadSketches(); distance(); //createDirs(); //SimkaCommons::checkInputValidity(_outputDirTemp, _inputFilename); //countDatasets(); //string command = "rm -rf " + _outputDirTemp; //system(command.c_str()); cout << "Output results: " << _outputDir << endl; } void parseArgs(){ _options = getInput(); //_sketchSize = _options->getInt(STR_SIMKA_SKETCH_SIZE); _nbCores = _options->getInt(STR_NB_CORES); _inputFilename1 = _options->getStr(STR_SIMKA_URI_INPUT_1); _inputFilename2 = _options->getStr(STR_SIMKA_URI_INPUT_2); _outputDir = _options->getStr(STR_URI_OUTPUT); _start_i = _options->getInt("-start-i"); _start_j = _options->getInt("-start-j"); _n_i = _options->getInt("-n-i"); _n_j = _options->getInt("-n-j"); //_kmerSize = _options->getInt(STR_KMER_SIZE); if(!System::file().doesExist(_outputDir)){ int ok = System::file().mkdir(_outputDir, -1); if(ok != 0){ std::cerr << "Error: can't create output directory (" << _outputDir << ")" << std::endl; exit(1); } } } void readInfos(){ //_nbDataset1 = SimkaMinCommons::readNbDatasets(_inputFilename1); //_nbDataset2 = SimkaMinCommons::readNbDatasets(_inputFilename2); //u_int32_t sketchSize1; //u_int32_t sketchSize2; u_int8_t kmerSizeDummy; SimkaMinCommons::getKmerInfos(_inputFilename1, kmerSizeDummy, _sketchSize_1, _seed, _nbDataset1); SimkaMinCommons::getKmerInfos(_inputFilename2, kmerSizeDummy, _sketchSize_2, _seed, _nbDataset2); //_sketchSize = min(sketchSize1, sketchSize2); if(_sketchSize_1 != _sketchSize_2){ cout << "WARNING: both spectrums have different sizes (" << _sketchSize_1 << " and " << _sketchSize_2 << "), will use " << min(_sketchSize_1, _sketchSize_2) << " k-mers" << endl; } if(_n_i == 0){ _n_i = _nbDataset1; } if(_n_j == 0){ _n_j = _nbDataset2; } //_nbdatasetsToProcess = min(_nbdatasetsToProcess, ) //cout << _nbDataset1 << " " << _nbDataset2 << endl; //cout << _sketchSize << endl; //cout << _seed << endl; } /* void createDirs(){ //if(!System::file().doesExist(_outputDir)){ //int ok = System::file().mkdir(_outputDir, -1); //if(ok != 0){ // std::cerr << "Error: can't create output directory (" << _outputDir << ")" << std::endl; // exit(1); //} //} if(!System::file().doesExist(_outputDirTemp)){ int ok = System::file().mkdir(_outputDirTemp, -1); if(ok != 0){ std::cerr << "Error: can't create output temp directory (" << _outputDirTemp << ")" << std::endl; exit(1); } } _outputDirTemp = System::file().getRealPath(_outputDirTemp) + "/"; //_outputDirTemp += "/simka_output_temp/"; //System::file().mkdir(_outputDirTemp, -1); //_args->setStr(STR_URI_OUTPUT_TMP, _outputDirTemp); //System::file().mkdir(_outputDirTemp + "/input/", -1); }*/ void loadSketches(){ ifstream sketchFile_1; sketchFile_1.open(_inputFilename1.c_str(), ios::binary); ifstream sketchFile_2; sketchFile_2.open(_inputFilename2.c_str(), ios::binary); _kmercountSketches_i.resize(_n_i); _kmercountSketches_j.resize(_n_j); u_int32_t sketchSize = min(_sketchSize_1, _sketchSize_2); size_t index = 0; for(size_t i=_start_i; i<_start_i+_n_i; i++){ u_int64_t pos = KMER_SPECTRUM_HEADER_SIZE + (i*_sketchSize_1*sizeof(KmerAndCountType)); sketchFile_1.seekg(pos); _kmercountSketches_i[index].resize(sketchSize); //for(size_t k=0; k<_sketchSize; k++){ sketchFile_1.read((char*)&(_kmercountSketches_i[index][0]), sizeof(KmerAndCountType)*sketchSize); //} 
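			// One bulk read loads the entire fixed-size sketch block of dataset i.
			// Peak memory for this process is therefore roughly
			//   (_n_i + _n_j) * sketchSize * sizeof(KmerAndCountType)
			// which is what makes the -n-i / -n-j chunking options useful when
			// the matrix is computed piecewise on large collections.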
index += 1; } index = 0; for(size_t j=_start_j; j<_start_j+_n_j; j++){ u_int64_t pos = KMER_SPECTRUM_HEADER_SIZE + (j*_sketchSize_2*sizeof(KmerAndCountType)); sketchFile_2.seekg(pos); _kmercountSketches_j[index].resize(sketchSize); sketchFile_2.read((char*)&(_kmercountSketches_j[index][0]), sizeof(KmerAndCountType)*sketchSize); //for(size_t k=0; k<_sketchSize; k++){ //sketchFile_2.read(&_kmercountSketches_j[index][k], sizeof(KmerAndCountType)); //} index += 1; } sketchFile_1.close(); sketchFile_2.close(); for(size_t i=0; i<_kmercountSketches_i.size(); i++){ u_int64_t start = 0; for(size_t j=0; j<_sketchSize_1; j++){ if(_kmercountSketches_i[i][j]._kmer == 0){ start += 1; } } _kmercountSketches_i[i].erase(_kmercountSketches_i[i].begin(), _kmercountSketches_i[i].begin()+start); } for(size_t i=0; i<_kmercountSketches_j.size(); i++){ u_int64_t start = 0; for(size_t j=0; j<_kmercountSketches_j[i].size(); j++){ if(_kmercountSketches_j[i][j]._kmer == 0){ start += 1; } } _kmercountSketches_j[i].erase(_kmercountSketches_j[i].begin(), _kmercountSketches_j[i].begin()+start); } } void distance(){ if(System::file().doesExist(_outputDir + "/mat_presenceAbsence_jaccard.bin")){ _distanceMatrixJaccard.open((_outputDir + "/mat_presenceAbsence_jaccard.bin").c_str(), ios::binary | ios::in); _distanceMatrixBrayCurtis.open((_outputDir + "/mat_abundance_braycurtis.bin").c_str(), ios::binary | ios::in); } else{ _distanceMatrixJaccard.open((_outputDir + "/mat_presenceAbsence_jaccard.bin").c_str(), ios::binary); _distanceMatrixBrayCurtis.open((_outputDir + "/mat_abundance_braycurtis.bin").c_str(), ios::binary); } bool isSymmetrical = false; if(_inputFilename1 == _inputFilename2 && _start_i == _start_j){ computeDistanceSymetrical(); isSymmetrical = true; } else{ computeDistanceRectangle(); } for(size_t i=0; i<_threads.size(); i++){ _threads[i]->join(); delete _threads[i]; //cout << i << endl; } _progress->finish(); //Fill diagonal with 0 if(isSymmetrical){ for(size_t i=_start_i; i<_start_i+_n_i; i++){ size_t j=i; u_int64_t pos = i*_nbDataset1*sizeof(DistanceValueType) + (j*sizeof(DistanceValueType)); _distanceMatrixJaccard.seekp(pos); _distanceMatrixBrayCurtis.seekp(pos); DistanceValueType nullDist = 0; _distanceMatrixJaccard.write((const char*)&nullDist, sizeof(nullDist)); _distanceMatrixBrayCurtis.write((const char*)&nullDist, sizeof(nullDist)); } } _distanceMatrixJaccard.close(); _distanceMatrixBrayCurtis.close(); //string command = "cp " + string(_inputFilename1+".ids") + " " + _outputDir + "/matrix_infos.ids "; //cout << command << endl; //system(command.c_str()); } void computeDistanceSymetrical(){ //cout << "compute symetrical distances" << endl; u_int64_t nbDistancesToCompute = (_n_i*(_n_i-1)) / 2; //u_int64_t nbDistancesToCompute = _nbDataset1*_nbDataset1; //(_nbDataset1*(_nbDataset1-1)) / 2; u_int64_t nbDistancePerThreads = nbDistancesToCompute / _nbCores; u_int64_t nbDistancesRemaining = nbDistancesToCompute-(nbDistancePerThreads*_nbCores); //vector startDistanceI; //vector startDistanceJ; //size_t si=0; //size_t sj=0; //cout << "NB CORES: " << _nbCores << endl; //cout << "NB DISTANCES: " << nbDistancesToCompute << endl; //cout << "NB DISTANCES PER CORE: " << nbDistancePerThreads << endl; _progress = this->createIteratorListener (nbDistancesToCompute, "Computing distances"); _progress->init (); _progress_distanceStep = max((u_int64_t)1, (u_int64_t) (nbDistancePerThreads / 100)); u_int64_t nbDistances = 0; size_t nbRunnedThreads = 0; size_t i=_start_i; size_t j=i+1; size_t maxDatasets = 
_start_i+_n_i;//min((u_int64_t)_start_i+_nbdatasetsToProcess, (u_int64_t)_nbDataset1); //_computeDistanceManagers.push_back(); thread* t = new thread(&SimkaMinDistanceAlgorithm::computeDistances_unsynch, this, i, j, nbDistancePerThreads, nbRunnedThreads); _threads.push_back(t); //computeDistances_unsynch(i, j, nbDistancePerThreads, true); bool done = false; nbRunnedThreads += 1; for(; i= nbDistancePerThreads){ //cout << i << " " << j << endl; //cout << "lol: " << nbRunnedThreads << " " << nbDistancesRemaining << endl; if(nbRunnedThreads == _nbCores-1){ //Last threads compute remaining distances //cout << " LOL " << endl;); thread* t = new thread(&SimkaMinDistanceAlgorithm::computeDistances_unsynch, this, i, j, nbDistancePerThreads+nbDistancesRemaining, nbRunnedThreads); _threads.push_back(t); //computeDistances_unsynch(i, j, nbDistancePerThreads+nbDistancesRemaining, true); done = true; //nbDistances -= nbDistancesRemaining; } else{ thread* t = new thread(&SimkaMinDistanceAlgorithm::computeDistances_unsynch, this, i, j, nbDistancePerThreads, nbRunnedThreads); _threads.push_back(t); //computeDistances_unsynch(i, j, nbDistancePerThreads, true); } nbRunnedThreads += 1; nbDistances = 0; } nbDistances += 1; } } /* uint64_t iFloor = nbDistancePerThreads / _nbDataset1; uint64_t iMod = nbDistancePerThreads % _nbDataset1; for ( uint64_t i = 0, j = 0; i < _nbDataset1; i += iFloor, j += iMod ) { if ( j >= _nbDataset1 ) { if ( i == _nbDataset1 - 1 ) { break; } i++; j -= _nbDataset1; } cout << i << " " << j << endl; //thread* t = new thread(&SimkaMinDistanceAlgorithm::computeDistances_unsynch, this, i, j); //_threads.push_back(t); } */ /* while(true){ startDistanceI.push_back(si); startDistanceI.push_back(sj); u_int64_t nbDistances = 0; while(nbDistances < nbDistancesToCompute){ //for(size_t i=0; i<) } } for(size_t i=1; i<_nbDataset1; i++){ for(size_t j=(i+1); j<_nbDataset1; j++){ } }*/ } void computeDistanceRectangle(){ //cout << "compute rectangle distances" << endl; u_int64_t nbDistancesToCompute = _n_i*_n_j; //u_int64_t nbDistancesToCompute = _nbDataset1*_nbDataset1; //(_nbDataset1*(_nbDataset1-1)) / 2; u_int64_t nbDistancePerThreads = nbDistancesToCompute / _nbCores; u_int64_t nbDistancesRemaining = nbDistancesToCompute-(nbDistancePerThreads*_nbCores); //vector startDistanceI; //vector startDistanceJ; //size_t si=0; //size_t sj=0; //cout << "NB CORES: " << _nbCores << endl; //cout << "NB DISTANCES: " << nbDistancesToCompute << endl; //cout << "NB DISTANCES PER CORE: " << nbDistancePerThreads << endl; //cout << "NB DISTANCES REMAINING: " << nbDistancesRemaining << endl; _progress = this->createIteratorListener (nbDistancesToCompute, "Computing distances"); _progress->init (); u_int64_t nbDistances = 0; size_t nbRunnedThreads = 0; size_t i=_start_i; size_t j=_start_j; thread* t = new thread(&SimkaMinDistanceAlgorithm::computeDistances_rectanglular_unsynch, this, i, j, nbDistancePerThreads, nbRunnedThreads); _threads.push_back(t); bool done = false; nbRunnedThreads += 1; //size_t maxDatasetsI = min((u_int64_t)_start_i+_nbdatasetsToProcess, (u_int64_t)_nbDataset1); for(i=_start_i; i<_start_i+_n_i; i++){ if(done) break; //size_t maxDatasetsJ = min((u_int64_t)_start_j+_nbdatasetsToProcess, (u_int64_t)_nbDataset2); for(j=_start_j; j<_start_j+_n_j; j++){ if(done) break; if(nbDistances >= nbDistancePerThreads){ //cout << i << " " << j << endl; //cout << "lol: " << nbRunnedThreads << " " << nbDistancesRemaining << endl; if(nbRunnedThreads == _nbCores-1){ //Last threads compute remaining distances //cout << 
" LOL " << endl;); //cout << i << " " << j << endl; thread* t = new thread(&SimkaMinDistanceAlgorithm::computeDistances_rectanglular_unsynch, this, i, j, nbDistancePerThreads+nbDistancesRemaining, nbRunnedThreads); _threads.push_back(t); //computeDistances_unsynch(i, j, nbDistancePerThreads+nbDistancesRemaining, true); done = true; //nbDistances -= nbDistancesRemaining; } else{ //cout << i << " " << j << endl; thread* t = new thread(&SimkaMinDistanceAlgorithm::computeDistances_rectanglular_unsynch, this, i, j, nbDistancePerThreads, nbRunnedThreads); _threads.push_back(t); //computeDistances_unsynch(i, j, nbDistancePerThreads, true); } nbRunnedThreads += 1; nbDistances = 0; } //cout << nbDistances << " " << nbDistancePerThreads << endl; nbDistances += 1; } } } void computeDistances_unsynch(size_t si, size_t sj, size_t nbDistancesToCompute, size_t id){ ComputeDistanceManager computeDistanceManager(_inputFilename1, _inputFilename2, _distanceMatrixJaccard, _distanceMatrixBrayCurtis, true, _nbDataset1, _nbDataset2, _mutex, _start_i, _start_j, _n_i, _n_j, _kmercountSketches_i, _kmercountSketches_j); //cout << "-------------------" << endl; u_int64_t progress_nbComputedistances = 0; u_int64_t nbComputedDistances = 0; //size_t maxDatasetsI = min((u_int64_t)si+_nbdatasetsToProcess, (u_int64_t)_nbDataset1); for(size_t j=sj; j<_start_i+_n_i; j++){ //cout << j << endl; //cout << si << " " << j << endl; computeDistanceManager.computeDistance_unsynch(si, j); nbComputedDistances += 1; progress_nbComputedistances += 1; if(nbComputedDistances >= nbDistancesToCompute) break; } si += 1; if(nbComputedDistances < nbDistancesToCompute){ //cout << "lala2" << endl; for(size_t i=si; i<_start_i+_n_i; i++){ for(size_t j=i+1; j<_start_i+_n_i; j++){ //cout << i << " " << j << endl; computeDistanceManager.computeDistance_unsynch(i, j); nbComputedDistances += 1; progress_nbComputedistances += 1; //_mutex.lock(); //cout << progress_nbComputedistances << " " << _progress_distanceStep << endl; //_mutex.unlock(); if(progress_nbComputedistances > _progress_distanceStep){ _mutex.lock(); _progress->inc(progress_nbComputedistances); _mutex.unlock(); progress_nbComputedistances = 0; } if(nbComputedDistances >= nbDistancesToCompute) break; } if(nbComputedDistances >= nbDistancesToCompute) break; } } _mutex.lock(); _progress->inc(progress_nbComputedistances); _mutex.unlock(); } void computeDistances_rectanglular_unsynch(size_t si, size_t sj, size_t nbDistancesToCompute, size_t id){ //isSymetrical set to false ComputeDistanceManager computeDistanceManager(_inputFilename1, _inputFilename2, _distanceMatrixJaccard, _distanceMatrixBrayCurtis, false, _nbDataset1, _nbDataset2, _mutex, _start_i, _start_j, _n_i, _n_j, _kmercountSketches_i, _kmercountSketches_j); //_mutex.lock(); //cout << "------------------- " << si << " " << sj << endl; //_mutex.unlock(); u_int64_t nbComputedDistances = 0; u_int64_t progress_nbComputedistances = 0; //size_t maxDatasetsI = min((u_int64_t)si+_nbdatasetsToProcess, (u_int64_t)_nbDataset1); //size_t maxDatasetsJ = min((u_int64_t)sj+_nbdatasetsToProcess, (u_int64_t)_nbDataset2); for(size_t j=sj; j<_start_j+_n_j; j++){ //cout << si << " " << j << endl; computeDistanceManager.computeDistance_unsynch(si, j); nbComputedDistances += 1; progress_nbComputedistances += 1; if(nbComputedDistances >= nbDistancesToCompute) break; } si += 1; if(nbComputedDistances < nbDistancesToCompute){ for(size_t i=si; i<_start_i+_n_i; i++){ for(size_t j=_start_j; j<_start_j+_n_j; j++){ // (0 instead of i+1) //cout << i << " " << j << 
endl; computeDistanceManager.computeDistance_unsynch(i, j); nbComputedDistances += 1; progress_nbComputedistances += 1; if(progress_nbComputedistances > _progress_distanceStep){ _mutex.lock(); _progress->inc(progress_nbComputedistances); _mutex.unlock(); progress_nbComputedistances = 0; } if(nbComputedDistances >= nbDistancesToCompute) break; } if(nbComputedDistances >= nbDistancesToCompute) break; } } _mutex.lock(); //cout << nbComputedDistances << " " << nbDistancesToCompute << endl; _progress->inc(progress_nbComputedistances); _mutex.unlock(); } }; class SimkaMinDistance : public Tool{ public: SimkaMinDistance(): Tool ("SimkaMin-Distance"){ IOptionsParser* parser = getParser();//new OptionsParser ("Simka2 - Compute Kmer Spectrum"); parser->push_front (new OptionOneParam (STR_URI_OUTPUT, "output dir for distance matrices", false, "./simkaMin_results")); parser->push_front (new OptionOneParam (STR_SIMKA_URI_INPUT_2, "filename to a sketch file to compare with -in1", true)); parser->push_front (new OptionOneParam (STR_SIMKA_URI_INPUT_1, "filename to a sketch file to compare with -in2", true)); parser->push_back (new OptionOneParam ("-start-i", "start i (row)", false, "0")); parser->push_back (new OptionOneParam ("-start-j", "start j (column)", false, "0")); parser->push_back (new OptionOneParam ("-n-i", "Nb datasets to process (row)", false, "0")); parser->push_back (new OptionOneParam ("-n-j", "Nb datasets to process (column)", false, "0")); } void execute () { IProperties* args = getInput(); u_int32_t seed1; u_int32_t seed2; u_int32_t dummy; u_int8_t kmerSize1; u_int8_t kmerSize2; string inputFilename1 = args->getStr(STR_SIMKA_URI_INPUT_1); string inputFilename2 = args->getStr(STR_SIMKA_URI_INPUT_2); SimkaMinCommons::getKmerInfos(inputFilename1, kmerSize1, dummy, seed1, dummy); SimkaMinCommons::getKmerInfos(inputFilename2, kmerSize2, dummy, seed2, dummy); //size_t kmerSize = getInput()->getInt (STR_KMER_SIZE); if(kmerSize1 != kmerSize2){ cerr << "ERROR: can't compare both sketches because of different kmer sizes (" << kmerSize1 << " and " << kmerSize2 << ")" << endl; exit(1); } if(seed1 != seed2){ cerr << "ERROR: can't compare both sketches because of different seeds (" << seed1 << " and " << seed2 << ")" << endl; exit(1); } //cout << seed1 << " " << seed2 << endl; SimkaMinDistanceAlgorithm* algo = new SimkaMinDistanceAlgorithm(args); algo->execute(); delete algo; } }; #endif /* SIMKA1_4_SRC_SIMKAMIN_SIMKAMINDISTANCE_HPP_ */ simka-1.5.3/src/simkaMin/SimkaMinDistanceMatrixExporter.hpp000066400000000000000000000330031377312000000237650ustar00rootroot00000000000000/***************************************************************************** * SimkaMin: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2019 INRIA * Authors: G.Benoit * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . 
*****************************************************************************/ #ifndef SIMKA1_4_SRC_SIMKAMIN_SIMKAMINDISTANCEMATRIXEXPORTER_HPP_ #define SIMKA1_4_SRC_SIMKAMIN_SIMKAMINDISTANCEMATRIXEXPORTER_HPP_ #include "SimkaMinCommons.hpp" class SimkaDistanceMatrixBinary { public: static void loadRow(size_t rowIndex, ifstream& matrixBinaryFile, vector& resultRow){ matrixBinaryFile.seekg(rowIndex*resultRow.size()*sizeof(float), ios_base::beg); matrixBinaryFile.read((char*)resultRow.data(), sizeof(float)*resultRow.size()); } static void mergeMatrices(const string& existingMatrixFilename, const string& newMatrixFilename_existingVsNew, const string& newMatrixFilename_newVsNew, u_int32_t nbDatasets_existing, u_int32_t nbDatasets_new){ ifstream existingMatrixFile; existingMatrixFile.open(existingMatrixFilename.c_str(), ios::binary); vector existingRowData(nbDatasets_existing, 0); ifstream matrixFile_existingVsNew; matrixFile_existingVsNew.open(newMatrixFilename_existingVsNew.c_str(), ios::binary); ifstream matrixFile_newVsNew; matrixFile_newVsNew.open(newMatrixFilename_newVsNew.c_str(), ios::binary); vector newRowData(nbDatasets_new, 0); string tempOutputFilename = existingMatrixFilename + ".temp"; ofstream tempOutputFile; tempOutputFile.open(tempOutputFilename.c_str(), ios::binary); //Write existing distance + matrixFile_existingVsNew (right part) for(size_t i=0; i >& distanceMatrix_rectangular, const vector >& distanceMatrix_squaredHalf){ //string distanceMatrixDir = distanceMatricesDir + "/" + distanceName; //if(System::file().doesExist(distanceMatrixDir)){ //} //else{ // System::file().mkdir(distanceMatrixDir, -1); //} //string distanceMatrixDir = outputDirTemp + "/distance_matrix"; string filename = distanceMatricesDir + "/" + distanceName + ".bin"; ofstream outputFile(filename.c_str(), ios::binary); u_int64_t nbOldBanks = 0; if(distanceMatrix_rectangular.size() > 0){ nbOldBanks = distanceMatrix_rectangular[0].size(); } u_int64_t nbNewBanks = distanceMatrix_squaredHalf.size() + 1; u_int64_t nbBanks = nbOldBanks + nbNewBanks; if(nbOldBanks > 0){ if(nbNewBanks > 1){ for(size_t i=0; i >& distanceMatrix_rectangular){ //cout << endl; //cout << distanceMatrix_rectangular.size() << " " << distanceMatrix_rectangular[i].size() << endl; //cout << endl; //for(size_t j=0; j >& distanceMatrix_squaredHalf){ u_int64_t nbNewBanks = distanceMatrix_squaredHalf.size() + 1; //for(size_t i=0; i _ids1; vector _ids2; //vector _wantedIds; //vector _wantedIdsIndex_1; //vector _wantedIdsIndex_2; //unordered_map _idToIndex_1; //unordered_map _idToIndex_2; size_t _inputMatrixSize_1; size_t _inputMatrixSize_2; //size_t _outputMatrixSize; SimkaMinDistanceMatrixExporterAlgorithm(IProperties* options): Algorithm("simkaMinDistanceExporterAlgorithm", -1, options) { } void execute(){ _inputFilenameIds = ""; parseArgs(); createWantedIds(); //createIdsIndex(); writeMatrices(); } void parseArgs(){ _options = getInput(); _inputDir = _options->getStr(STR_URI_INPUT); _inputSketchFilename1 = _options->getStr(STR_SIMKA_URI_INPUT_1); _inputSketchFilename2 = _options->getStr(STR_SIMKA_URI_INPUT_2); _outputDir = _options->getStr(STR_URI_OUTPUT); if(getInput()->get(STR_SIMKA_INPUT_IDS)){ _inputFilenameIds = getInput()->getStr(STR_SIMKA_INPUT_IDS); } if(!System::file().doesExist(_outputDir)){ int ok = System::file().mkdir(_outputDir, -1); if(ok != 0){ std::cerr << "Error: can't create output directory (" << _outputDir << ")" << std::endl; exit(1); } } } void createWantedIds(){ SimkaMinCommons::readIds(_inputSketchFilename1, _ids1); 
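		// readIds() seeks past the header and the nbDatasets * sketchSize sketch
		// blocks (see SimkaMinCommons::getFilePosition_sketchIds) and decodes one
		// length-prefixed id per dataset; _ids1/_ids2 then label the rows and
		// columns of the exported CSV matrices.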
SimkaMinCommons::readIds(_inputSketchFilename2, _ids2); /* if(_inputFilenameIds.empty()){ _wantedIds = vector(_ids1); _wantedIds.insert(_wantedIds.end(), _ids2.begin(), _ids2.end()); } else{ string line; ifstream inputFile(_inputFilenameIds.c_str()); while(getline(inputFile, line)){ line.erase(std::remove(line.begin(),line.end(),' '),line.end()); if(line == "") continue; _wantedIds.push_back(line); } } */ _inputMatrixSize_1 = _ids1.size(); _inputMatrixSize_2 = _ids2.size(); //_outputMatrixSize = _wantedIds.size(); cout << "Matrix size: " << _inputMatrixSize_1 << " x " << _inputMatrixSize_2 << endl; } /* void createIdsIndex(){ for(size_t i=0; i<_ids1.size(); i++){ _idToIndex_1[_ids1[i]] = i; } for(size_t i=0; i<_ids2.size(); i++){ _idToIndex_2[_ids2[i]] = i; } for(size_t i=0; i<_wantedIds.size(); i++){ if(_idToIndex_1.find(_wantedIds[i]) == _idToIndex_1.end()){ cout << "ID not found in distance matrix: " << _wantedIds[i] << endl; } else{ _wantedIdsIndex.push_back(_idToIndex[_wantedIds[i]]); } } //_wantedIdsIndex.resize(_outputMatrixSize); //for(size_t i=0; i<_outputMatrixSize; i++){ //} _outputMatrixSize = _wantedIdsIndex.size(); cout << "output matrix size: " << _outputMatrixSize << endl; } */ void writeMatrices(){ vector matrixFilenames = System::file().listdir(_inputDir); for(size_t i=0; i rowData(_ids2.size(), 0); ifstream binaryMatrixFile(binaryMatrixFilename.c_str(), ios::binary); string filename = _outputDir + "/" + distanceName + ".csv"; gzFile out = gzopen((filename + ".gz").c_str(),"wb"); string str = ""; for(size_t i=0; i<_ids2.size(); i++){ str += ";" + _ids2[i]; //_ids[_wantedIdsIndex[i]]; } str += '\n'; gzwrite(out, str.c_str(), str.size()); for(size_t i=0; i<_ids1.size(); i++){ str = ""; str += _ids1[i] + ";"; //[_wantedIdsIndex[i]] + ";"; //size_t rowIndex = _wantedIdsIndex[i]; SimkaDistanceMatrixBinary::loadRow(i, binaryMatrixFile, rowData); for(size_t j=0; j<_ids2.size(); j++){ //str += Stringify::format("%f", rowData[_wantedIdsIndex[j]]) + ";"; str += Stringify::format("%f", rowData[j]) + ";"; } str.erase(str.size()-1); str += '\n'; gzwrite(out, str.c_str(), str.size()); } gzclose(out); binaryMatrixFile.close(); } }; class SimkaMinDistanceMatrixExporter : public Tool{ public: SimkaMinDistanceMatrixExporter(): Tool ("SimkaMin-DistanceMatrixExporter"){ IOptionsParser* parser = getParser();//new OptionsParser ("Simka2 - Compute Kmer Spectrum"); parser->push_front (new OptionOneParam (STR_URI_OUTPUT, "output dir for distance matrices", false, "./simkaMin_results")); //parser->push_front (new OptionOneParam (STR_SIMKA_INPUT_IDS, "filename of ids in the result matrix (one id per line). 
Do not used this option to used all ids.", false)); parser->push_front (new OptionOneParam (STR_SIMKA_URI_INPUT_2, "second used sketch file (-in2 argument of ./simkaMin distance)", true)); parser->push_front (new OptionOneParam (STR_SIMKA_URI_INPUT_1, "first used sketch file (-in1 argument of ./simkaMin distance)", true)); parser->push_front (new OptionOneParam (STR_URI_INPUT, "input dir containing distance matrices in binary format (-out argument of ./simkaMin distance)", true)); } void execute (){ IProperties* args = getInput(); SimkaMinDistanceMatrixExporterAlgorithm* algo = new SimkaMinDistanceMatrixExporterAlgorithm(args); algo->execute(); delete algo; } }; #endif /* SIMKA1_4_SRC_SIMKAMIN_SIMKAMINDISTANCEMATRIXEXPORTER_HPP_ */ simka-1.5.3/src/simkaMin/SimkaMinDistanceMatrixMerger.hpp000066400000000000000000000130031377312000000233740ustar00rootroot00000000000000/***************************************************************************** * SimkaMin: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2019 INRIA * Authors: G.Benoit * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . 
*****************************************************************************/ #ifndef SIMKA1_4_SRC_SIMKAMIN_SIMKAMINDISTANCEMATRIXMERGER_HPP_ #define SIMKA1_4_SRC_SIMKAMIN_SIMKAMINDISTANCEMATRIXMERGER_HPP_ #include "SimkaMinCommons.hpp" #include "SimkaMinDistanceMatrixExporter.hpp" class SimkaMinDistanceMatrixMergerAlgorithm : public Algorithm{ public: IProperties* _options; string _inputDir; //string _outputDir; string _inputSketchFilename_existingDatasets; string _inputSketchFilename_newDatasets; //vector _ids1; //vector _ids2; //vector _wantedIds; //vector _wantedIdsIndex_1; //vector _wantedIdsIndex_2; //unordered_map _idToIndex_1; //unordered_map _idToIndex_2; //size_t _inputMatrixSize_1; //size_t _inputMatrixSize_2; //size_t _outputMatrixSize; SimkaMinDistanceMatrixMergerAlgorithm(IProperties* options): Algorithm("simkaMinDistanceMatrixMergerAlgorithm", -1, options) { } void execute(){ parseArgs(); mergeMatrices(); } void parseArgs(){ _options = getInput(); _inputDir = _options->getStr(STR_URI_INPUT) + "/"; _inputSketchFilename_existingDatasets = _options->getStr(STR_SIMKA_URI_INPUT_1); _inputSketchFilename_newDatasets = _options->getStr(STR_SIMKA_URI_INPUT_2); //_outputDir = _options->getStr(STR_URI_OUTPUT); //if(getInput()->get(STR_SIMKA_INPUT_IDS)){ // _inputFilenameIds = getInput()->getStr(STR_SIMKA_INPUT_IDS); //} //if(!System::file().doesExist(_outputDir)){ // int ok = System::file().mkdir(_outputDir, -1); // if(ok != 0){ // std::cerr << "Error: can't create output directory (" << _outputDir << ")" << std::endl; // exit(1); // } //} } void mergeMatrices(){ u_int32_t dummy; u_int8_t dummy_k; u_int32_t nbDatasets_existing, nbDatasets_new; SimkaMinCommons::getKmerInfos(_inputSketchFilename_existingDatasets, dummy_k, dummy, dummy, nbDatasets_existing); SimkaMinCommons::getKmerInfos(_inputSketchFilename_newDatasets, dummy_k, dummy, dummy, nbDatasets_new); vector matrixFilenames = System::file().listdir(_inputDir); for(size_t i=0; ipush_front (new OptionOneParam (STR_URI_OUTPUT, "output dir for distance matrices", false, "./simkaMin_results")); //parser->push_front (new OptionOneParam (STR_SIMKA_INPUT_IDS, "filename of ids in the result matrix (one id per line). Do not used this option to used all ids.", false)); parser->push_front (new OptionOneParam (STR_SIMKA_URI_INPUT_2, "sketch file of new datasets", true)); parser->push_front (new OptionOneParam (STR_SIMKA_URI_INPUT_1, "sketch file of existing datasets", true)); parser->push_front (new OptionOneParam (STR_URI_INPUT, "input dir containing existing simka results", true)); } void execute (){ IProperties* args = getInput(); SimkaMinDistanceMatrixMergerAlgorithm* algo = new SimkaMinDistanceMatrixMergerAlgorithm(args); algo->execute(); delete algo; } }; #endif /* SIMKA1_4_SRC_SIMKAMIN_SIMKAMINDISTANCEMATRIXMERGER_HPP_ */ simka-1.5.3/src/simkaMin/SimkaMinInfos.hpp000066400000000000000000000065751377312000000204110ustar00rootroot00000000000000/***************************************************************************** * SimkaMin: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets * A tool from the GATB (Genome Assembly Tool Box) * Copyright (C) 2019 INRIA * Authors: G.Benoit * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. 
simka-1.5.3/src/simkaMin/SimkaMinInfos.hpp

/*****************************************************************************
 *   SimkaMin: Fast kmer-based method for estimating the similarity between numerous metagenomic datasets
 *   A tool from the GATB (Genome Assembly Tool Box)
 *   Copyright (C) 2019  INRIA
 *   Authors: G.Benoit
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as
 *  published by the Free Software Foundation, either version 3 of the
 *  License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *****************************************************************************/

#ifndef SIMKA1_4_SRC_SIMKAMIN_SIMKAMININFOS_HPP_
#define SIMKA1_4_SRC_SIMKAMIN_SIMKAMININFOS_HPP_

#include "SimkaMinCommons.hpp"

class SimkaMinInfosAlgorithm : public Algorithm {

public:

	IProperties* _options;
	string _inputFilename;
	u_int32_t _nbDatasets;
	u_int32_t _sketchSize;

	SimkaMinInfosAlgorithm(IProperties* options):
		Algorithm("simkaMinInfosAlgorithm", -1, options)
	{
	}

	void execute(){
		parseArgs();
		printInfos();
	}

	void parseArgs(){

		_options = getInput();

		_inputFilename = _options->getStr(STR_URI_INPUT);

		if(!System::file().doesExist(_inputFilename)){
			std::cerr << "Error: input does not exist (" << _inputFilename << ")" << std::endl;
			exit(1);
		}
	}

	void printInfos(){

		//vector<string> datasetIds;
		//SimkaMinCommons::readIds(_inputFilename, datasetIds);

		u_int32_t seed;
		u_int8_t kmerSize;
		SimkaMinCommons::getKmerInfos(_inputFilename, kmerSize, _sketchSize, seed, _nbDatasets);

		cout << "Sketch info: " << _inputFilename << endl;
		cout << endl;
		cout << "k-mer size : " << (u_int32_t) kmerSize << endl;
		cout << "Sketch size : " << _sketchSize << endl;
		cout << "Seed : " << seed << endl;
		cout << endl;
		cout << "Nb Datasets: " << _nbDatasets << endl;
		printIds();
		cout << endl;
	}

	void printIds(){

		ifstream file(_inputFilename.c_str(), ios::binary);
		file.seekg(SimkaMinCommons::getFilePosition_sketchIds(_nbDatasets, _sketchSize));

		//u_int32_t nbDatasets;
		//file.read((char*)(&nbDatasets), sizeof(nbDatasets));

		string datasetId;
		for(size_t i=0; i<_nbDatasets; i++){
			SimkaMinCommons::readString(datasetId, file);
			cout << datasetId << endl;
			//datasetIds.push_back(datasetId);
		}

		file.close();
	}

};


class SimkaMinInfos : public Tool{
public:

	SimkaMinInfos(): Tool ("SimkaMin-Infos"){

		IOptionsParser* parser = getParser(); //new OptionsParser ("Simka2 - Compute Kmer Spectrum");

		parser->push_front (new OptionOneParam (STR_URI_INPUT, "filename of a sketch file", true));
		parser->getParser (STR_NB_CORES)->setVisible (false);
		parser->getParser (STR_VERBOSE)->setVisible (false);
	}

	void execute () {

		IProperties* args = getInput();

		SimkaMinInfosAlgorithm* algo = new SimkaMinInfosAlgorithm(args);
		algo->execute();
		delete algo;
	}
};

#endif /* SIMKA1_4_SRC_SIMKAMIN_SIMKAMININFOS_HPP_ */
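SimkaMinInfos is a plain reader: it pulls the header fields (k-mer size, sketch size, seed, dataset count) through SimkaMinCommons::getKmerInfos, then seeks to the id block and prints one dataset id per line. A usage sketch follows; the `infos` subcommand name is an assumption based on the Tool name, since only `sketch`, `append`, `distance` and `export` appear verbatim in the tests below.

# Print the header and dataset ids of a sketch file. The "infos" subcommand
# name is assumed from the Tool name "SimkaMin-Infos"; adjust if the binary
# registers it differently.
import os

os.system("../../build/bin/simkaMinCore infos -in merged_sketch.bin")
# Expected output shape (values illustrative):
#   k-mer size : 21
#   Sketch size : 100
#   Seed : 42
#   Nb Datasets: 2
#   <one dataset id per line>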
command += " -in " + input_filename command += " -out " + outputPrefix + "/" + outputDir command += " -nb-cores " + str(nb_cores) command += " -max-memory 100 " command += " -kmer-size " + str(k) command += " -nb-kmers " + str(nb_kmers) command += " -bin ../../build/bin/simkaMinCore " command += " -max-reads " + str(nb_reads) command += " " + filter + " " return command, outputDir def create_command_update(scriptFilename, outputPrefix, k, filter, nb_reads, nb_kmers, nb_cores, input_filename): outputDir = "k" + str(k) + "_" + filter.replace("-", "") + "_" + str(nb_reads) + "-" + str(nb_kmers) + "_n" + str(nb_cores) command = "python " + scriptFilename command += " -in " + input_filename command += " -in-to-update " + outputPrefix + "/" + outputDir + "/simkamin" command += " -nb-cores " + str(nb_cores) command += " -max-memory 100 " #command += " -kmer-size " + str(k) #command += " -nb-kmers " + str(nb_kmers) command += " -bin ../../build/bin/simkaMinCore " command += " -max-reads " + str(nb_reads) command += " " + filter + " " return command, outputDir def create_truth(): for k in K: for filter in FILTER: for nb_reads in NB_READS: for nb_kmers in NB_KMERS: for nb_cores in NB_CORES: command, outputDir = create_command("../../simkaMin/simkaMin.py", "truth_simkaMin_symetrical", k, filter, nb_reads, nb_kmers, nb_cores, " ../../example/simka_input.txt ") print (command) ret = os.system(command) if ret != 0: exit(1) #create_truth() #exit(1) def clear(testdir="__results__"): #if os.path.exists("temp_output"): # shutil.rmtree("temp_output") if os.path.exists(testdir): shutil.rmtree(testdir) def decompress_simka_results(dir): result_filenames = glob.glob(os.path.join(dir, '*.csv.gz')) for filename_gz in result_filenames: os.system("gunzip "+filename_gz) #filename_gz = result_dir + "/" + filename # with gzip.open(filename_gz, 'rb') as f: # outFile = open(filename_gz[:-3], "w") # outFile.write(str(f.read())) # outFile.close() # os.remove(filename_gz) def __test_matrices(result_dir, truth_dir): ok = True # print(result_dir + " " + truth_dir) decompress_simka_results(result_dir) result_filenames = glob.glob(os.path.join(result_dir, '*.csv')) if len(result_filenames) == 0: print("Error: no results") exit(1) decompress_simka_results(truth_dir) truth_filenames = glob.glob(os.path.join(truth_dir, '*.csv')) #if simka_vs_truth: # truth_filenames = glob.glob(os.path.join(truth_dir, '*.csv')) #else: #simka vs simka # #if result_dir+"/mat_abundance_jaccard.csv" in truth_filenames: #comparing simka results vs simka results # #truth_filenames.remove(result_dir+"/mat_abundance_jaccard.csv") #This distance is computed from Bray Curtis distance # # truth_filenames = glob.glob(os.path.join(truth_dir, '*.csv')) truth_filenames.sort() result_filenames.sort() for result_filename in result_filenames: distanceName = os.path.split(result_filename)[1] for truth_filename in truth_filenames: distanceName2 = os.path.split(truth_filename)[1] if distanceName != distanceName2: continue res_file = open(result_filename, "r") truth_file = open(truth_filename, "r") # print (res_file, truth_file) res_str = res_file.read() truth_str = truth_file.read() res_file.close() truth_file.close() if(res_str != truth_str): print("\t- TEST ERROR: " + distanceName) print("res") print(res_str) print("truth") print(truth_str) ok = False sys.exit(1) return ok def test_dists(dir): if(__test_matrices("__results__/" + dir, "truth_simkaMin_symetrical/" + dir)): print("\tOK") else: print("\tFAILED") exit(1) 
#----------------------------------------------------------------
#----------------------------------------------------------------
#----------------------------------------------------------------
def test():
    clear()
    os.mkdir(dir)

    for k in K:
        for filter in FILTER:
            for nb_reads in NB_READS:
                for nb_kmers in NB_KMERS:
                    for nb_cores in NB_CORES:
                        command, outputDir = create_command("../../simkaMin/simkaMin.py", dir, k, filter, nb_reads, nb_kmers, nb_cores, " ../../example/simka_input.txt ")
                        print (command)
                        ret = os.system(command + suffix)
                        if ret != 0: exit(1)
                        test_dists(outputDir)

    clear()

#----------------------------------------------------------------
#----------------------------------------------------------------
#----------------------------------------------------------------
def test_append():
    print("Test append command")
    out_dir = "./test_append"
    clear(out_dir)
    os.mkdir(out_dir)

    merged_sketch_filename = os.path.join(out_dir, "merged_sketch.bin")

    filename = "../../example/simka_input.txt"
    for line in open(filename):
        line = line.strip()
        if len(line) == 0: continue

        filename_temp = os.path.join("../../example/test_simkaMin_input_temp.txt")
        f = open(filename_temp, "w")
        f.write(line)
        f.close()

        sketch_filename = os.path.join(out_dir, "sketch.bin")
        command = "../../build/bin/simkaMinCore sketch -in " + filename_temp + " -out " + sketch_filename + " -nb-kmers 100 -kmer-size 21 -nb-cores 4"
        print(command)
        ret = os.system(command + suffix)
        if ret != 0: exit(1)

        if os.path.exists(merged_sketch_filename):
            command = "../../build/bin/simkaMinCore append -in1 " + merged_sketch_filename + " -in2 " + sketch_filename
            print(command)
            ret = os.system(command + suffix)
            if ret != 0: exit(1)
            os.remove(sketch_filename)
        else:
            shutil.move(sketch_filename, merged_sketch_filename)

        os.remove(filename_temp)

    command = "../../build/bin/simkaMinCore distance -in1 " + merged_sketch_filename + " -in2 " + merged_sketch_filename + " -out " + dir + " -nb-cores 4 "
    print(command)
    ret = os.system(command + suffix)
    if ret != 0: exit(1)

    command = "../../build/bin/simkaMinCore export -in " + dir + " -in1 " + merged_sketch_filename + " -in2 " + merged_sketch_filename + " -out " + dir
    print(command)
    ret = os.system(command + suffix)
    if ret != 0: exit(1)

    if(__test_matrices(dir, "truth_simkaMin_symetrical/k21__0-100_n0")):
        print("\tOK")
    else:
        print("\tFAILED")
        exit(1)

    clear(out_dir)

#----------------------------------------------------------------
#----------------------------------------------------------------
#----------------------------------------------------------------
def test_matrix_update():
    print("Test update command")
    out_dir = "./test_matrix_update"
    clear(out_dir)
    os.mkdir(out_dir)

    filename = "../../example/simka_input.txt"

    filename_temp1 = os.path.join("../../example/test_simkaMin_input_temp1.txt")
    f1 = open(filename_temp1, "w")
    filename_temp2 = os.path.join("../../example/test_simkaMin_input_temp2.txt")
    f2 = open(filename_temp2, "w")

    N = 2  # where to split the file
    i = 0
    for line in open(filename):
        if len(line) == 0: continue
        if i < N:
            f1.write(line)
        else:
            f2.write(line)
        i += 1
    # ... (the remainder of test_matrix_update was lost in extraction; see the
    # hedged reconstruction below)
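# A hedged reconstruction of how test_matrix_update() plausibly continues
# (the original tail was lost): close the two halves, run the normal pipeline
# on the first half, feed the second half to the update wrapper, then compare
# the updated matrices against the truth computed on the full input. The
# wrapper name "simkaMin_update.py" and the comparison target are assumptions;
# the flags and parameters come from create_command_update() and test_append().
def _finish_matrix_update(out_dir, f1, f2, filename_temp1, filename_temp2):
    f1.close()
    f2.close()

    # full pipeline on the first half of the datasets
    command, outputDir = create_command("../../simkaMin/simkaMin.py", out_dir,
                                        21, "", "0", "100", 0, filename_temp1)
    if os.system(command + suffix) != 0: exit(1)

    # add the second half to the existing result (-in-to-update)
    command, _ = create_command_update("../../simkaMin/simkaMin_update.py", out_dir,
                                       21, "", "0", "100", 0, filename_temp2)
    if os.system(command + suffix) != 0: exit(1)

    os.remove(filename_temp1)
    os.remove(filename_temp2)

    # the updated matrices should match the truth computed on the full input
    if __test_matrices(os.path.join(out_dir, outputDir), "truth_simkaMin_symetrical/k21__0-100_n0"):
        print("\tOK")
    else:
        print("\tFAILED")
        exit(1)

    clear(out_dir)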