pax_global_header00006660000000000000000000000064140077633760014527gustar00rootroot0000000000000052 comment=464ec7d0bcf3f3f9d4734b943cc237432f6bbfdf spoa-4.0.8/000077500000000000000000000000001400776337600125025ustar00rootroot00000000000000spoa-4.0.8/.gitignore000066400000000000000000000000371400776337600144720ustar00rootroot00000000000000# Compiled Object files build/ spoa-4.0.8/.travis.yml000066400000000000000000000016271400776337600146210ustar00rootroot00000000000000language: cpp matrix: include: - name: "GCC 4.8 (Linux)" # GCC 4.8.5 & CMake 3.12.4 os: linux dist: xenial addons: apt: sources: - ubuntu-toolchain-r-test packages: - g++-4.8 env: - SET_COMPILER="export CC=gcc-4.8 && export CXX=g++-4.8" - name: "Clang 3.5 (Linux)" # Clang 3.5.0 & CMake 3.12.4 os: linux dist: xenial addons: apt: packages: - clang-3.5 env: - SET_COMPILER="export CC=clang-3.5 && export CXX=clang++-3.5" - name: "Clang Xcode 9.4 (OSX)" # Clang 9.4.1 & CMake 3.15.5 os: osx osx_image: xcode9.4 before_install: - eval "${SET_COMPILER}" install: - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release .. 
&& make script: - ./bin/spoa --version - ./bin/spoa_test notifications: email: on_failure: always spoa-4.0.8/CMakeLists.txt000066400000000000000000000202651400776337600152470ustar00rootroot00000000000000cmake_minimum_required(VERSION 3.12) project(spoa VERSION 4.0.8 LANGUAGES CXX DESCRIPTION "Spoa is a c++ library (and tool) for SIMD vectorized partial order alignment.") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pedantic") set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) if (${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") if (${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL "5.0.0") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-aligned-new") endif () if (${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL "6.1.0") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-ignored-attributes") endif () endif () set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin) set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build all libraries as shared") include(FetchContent) include(GNUInstallDirs) include(CMakePackageConfigHelpers) if (CMAKE_SOURCE_DIR STREQUAL PROJECT_SOURCE_DIR) set(spoa_main_project ON) endif () option(spoa_install "Generate library install target" ${spoa_main_project}) option(spoa_build_exe "Build executable" ${spoa_main_project}) option(spoa_build_tests "Build unit tests" ${spoa_main_project}) option(spoa_use_cereal "Use cereal library" OFF) option(spoa_optimize_for_native "Build with -march=native" ON) option(spoa_optimize_for_portability "Build with -msse4.1" OFF) option(spoa_use_simde "Use SIMDe library for porting vectorized code" OFF) option(spoa_use_simde_nonvec "Use SIMDe library for nonvectorized code" OFF) option(spoa_use_simde_openmp "Use SIMDe support for OpenMP SIMD" OFF) option(spoa_generate_dispatch "Use SIMDe to generate x86 dispatch" OFF) if (NOT spoa_generate_dispatch) if 
(spoa_optimize_for_portability) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.1") elseif (spoa_optimize_for_native) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") endif () endif () if (spoa_use_simde OR spoa_use_simde_nonvec OR spoa_use_simde_openmp OR spoa_generate_dispatch) FetchContent_Declare( simde GIT_REPOSITORY https://github.com/simd-everywhere/simde GIT_TAG v0.7.0) FetchContent_GetProperties(simde) if (NOT simde_POPULATED) FetchContent_Populate(simde) endif () add_compile_definitions(SPOA_USE_SIMDE SIMDE_ENABLE_NATIVE_ALIASES) if (spoa_use_simde_nonvec) add_compile_definitions(SIMDE_NO_NATIVE) endif () if (spoa_use_simde_openmp) add_compile_definitions(SIMDE_ENABLE_OPENMP) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp-simd") endif () if (spoa_generate_dispatch) find_package(CpuFeatures 0.6.0 QUIET) if (NOT CpuFeatures_FOUND) if (spoa_install) message(FATAL_ERROR "Missing package CpuFeatures!") endif () FetchContent_Declare( cpu_features GIT_REPOSITORY https://github.com/google/cpu_features GIT_TAG v0.6.0) FetchContent_GetProperties(cpu_features) if (NOT cpu_features_POPULATED) FetchContent_Populate(cpu_features) add_subdirectory( ${cpu_features_SOURCE_DIR} ${cpu_features_BINARY_DIR} EXCLUDE_FROM_ALL) add_library(CpuFeatures::cpu_features ALIAS cpu_features) endif () endif () add_compile_definitions(SPOA_GENERATE_DISPATCH) endif () endif () if (spoa_use_cereal) find_package(cereal 1.3.0 QUIET) if (NOT cereal_FOUND) if (spoa_install) message(FATAL_ERROR "Missing package cereal!") endif () FetchContent_Declare( cereal GIT_REPOSITORY https://github.com/USCiLab/cereal GIT_TAG v1.3.0) FetchContent_GetProperties(cereal) if (NOT cereal_POPULATED) FetchContent_Populate(cereal) add_subdirectory( ${cereal_SOURCE_DIR} ${cereal_BINARY_DIR} EXCLUDE_FROM_ALL) add_library(cereal::cereal ALIAS cereal) endif () endif () endif () if (spoa_build_exe OR spoa_build_tests) find_package(bioparser 3.0.13 QUIET) find_package(biosoup 0.10.0 QUIET) if (NOT 
bioparser_FOUND) FetchContent_Declare( bioparser GIT_REPOSITORY https://github.com/rvaser/bioparser GIT_TAG 3.0.13) FetchContent_GetProperties(bioparser) if (NOT bioparser_POPULATED) FetchContent_Populate(bioparser) add_subdirectory( ${bioparser_SOURCE_DIR} ${bioparser_BINARY_DIR} EXCLUDE_FROM_ALL) endif () endif () if (NOT biosoup_FOUND) FetchContent_Declare( biosoup GIT_REPOSITORY https://github.com/rvaser/biosoup GIT_TAG 0.10.0) FetchContent_GetProperties(biosoup) if (NOT biosoup_POPULATED) FetchContent_Populate(biosoup) add_subdirectory( ${biosoup_SOURCE_DIR} ${biosoup_BINARY_DIR} EXCLUDE_FROM_ALL) endif () endif () endif () if (spoa_build_tests) find_package(GTest 1.10.0 QUIET) if (NOT GTest_FOUND) FetchContent_Declare( googletest GIT_REPOSITORY https://github.com/google/googletest GIT_TAG release-1.10.0) FetchContent_GetProperties(googletest) if (NOT googletest_POPULATED) FetchContent_Populate(googletest) add_subdirectory( ${googletest_SOURCE_DIR} ${googletest_BINARY_DIR} EXCLUDE_FROM_ALL) add_library(GTest::Main ALIAS gtest_main) endif () endif () endif () add_library(spoa src/alignment_engine.cpp src/graph.cpp src/simd_alignment_engine_dispatcher.cpp src/sisd_alignment_engine.cpp) add_library(spoa::spoa ALIAS spoa) target_include_directories(spoa PUBLIC $ $ $) if (spoa_use_cereal) target_link_libraries(spoa cereal::cereal) target_compile_definitions(spoa PUBLIC SPOA_USE_CEREAL) endif () if (BUILD_SHARED_LIBS) set_property(TARGET spoa PROPERTY SOVERSION "7.0.0") endif () if (spoa_generate_dispatch) list(APPEND ARCHITECTURES avx2 sse4.1 sse2) foreach(arch IN LISTS ARCHITECTURES) add_library(spoa_${arch} OBJECT src/simd_alignment_engine_dispatch.cpp) target_include_directories(spoa_${arch} PUBLIC $ $ $) set_property(TARGET spoa_${arch} PROPERTY COMPILE_FLAGS "-m${arch}") if (BUILD_SHARED_LIBS) set_property(TARGET spoa_${arch} PROPERTY POSITION_INDEPENDENT_CODE ON) endif () endforeach () add_dependencies(spoa spoa_avx2 spoa_sse4.1 spoa_sse2) 
target_link_libraries(spoa CpuFeatures::cpu_features) endif () if (spoa_install) configure_package_config_file( ${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}) write_basic_package_version_file( ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake COMPATIBILITY SameMajorVersion) install( TARGETS spoa EXPORT ${PROJECT_NAME}Targets DESTINATION ${CMAKE_INSTALL_LIBDIR}) install( DIRECTORY include/spoa DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) install( EXPORT ${PROJECT_NAME}Targets NAMESPACE ${PROJECT_NAME}:: DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}) install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}) endif () if (spoa_build_exe) add_executable(spoa_exe src/main.cpp) target_link_libraries(spoa_exe spoa bioparser::bioparser biosoup::biosoup) target_compile_definitions(spoa_exe PRIVATE VERSION="${PROJECT_VERSION}") set_property(TARGET spoa_exe PROPERTY OUTPUT_NAME spoa) install(TARGETS spoa_exe DESTINATION ${CMAKE_INSTALL_BINDIR}) endif () if (spoa_build_tests) add_executable(spoa_test test/spoa_test.cpp) target_link_libraries(spoa_test spoa bioparser::bioparser biosoup::biosoup GTest::Main) target_compile_definitions(spoa_test PRIVATE TEST_DATA="${PROJECT_SOURCE_DIR}/test/data/sample.fastq.gz") endif () spoa-4.0.8/Config.cmake.in000066400000000000000000000004401400776337600153140ustar00rootroot00000000000000@PACKAGE_INIT@ include(CMakeFindDependencyMacro) if (@spoa_use_cereal@) find_dependency(cereal) endif () if (@spoa_generate_dispatch@) find_dependency(CpuFeatures) endif () include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake") check_required_components("@PROJECT_NAME@") spoa-4.0.8/LICENSE000066400000000000000000000020671400776337600135140ustar00rootroot00000000000000The MIT 
License (MIT) Copyright (c) 2016 Robert Vaser Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. spoa-4.0.8/README.md000066400000000000000000000131221400776337600137600ustar00rootroot00000000000000# Spoa [![Latest GitHub release](https://img.shields.io/github/release/rvaser/spoa.svg)](https://github.com/rvaser/spoa/releases/latest) [![Build status for gcc/clang](https://travis-ci.com/rvaser/spoa.svg?branch=master)](https://travis-ci.com/rvaser/spoa) [![Published in Genome Research](https://img.shields.io/badge/published%20in-Genome%20Research-blue.svg)](https://doi.org/10.1101/gr.214270.116) Spoa (SIMD POA) is a c++ implementation of the partial order alignment (POA) algorithm (as described in 10.1093/bioinformatics/18.3.452) which is used to generate consensus sequences (as described in 10.1093/bioinformatics/btg109). It supports three alignment modes: local (Smith-Waterman), global (Needleman-Wunsch) and semi-global alignment (overlap), and three gap modes: linear, affine and convex (piecewise affine). 
It also supports Intel SSE4.1+ and AVX2 vectorization (marginally faster due to high latency shifts), [SIMDe](https://github.com/simd-everywhere/simde) and dispatching. ## Usage To build spoa run the following commands: ```bash git clone https://github.com/rvaser/spoa && cd spoa && mkdir build && cd build cmake -DCMAKE_BUILD_TYPE=Release .. && make ``` which will create spoa library, executable and unit tests. Running the executable will display the following usage: ```bash usage: spoa [options ...] # default output is stdout input file in FASTA/FASTQ format (can be compressed with gzip) options: -m default: 5 score for matching bases -n default: -4 score for mismatching bases -g default: -8 gap opening penalty (must be non-positive) -e default: -6 gap extension penalty (must be non-positive) -q default: -10 gap opening penalty of the second affine function (must be non-positive) -c default: -4 gap extension penalty of the second affine function (must be non-positive) -l, --algorithm default: 0 alignment mode: 0 - local (Smith-Waterman) 1 - global (Needleman-Wunsch) 2 - semi-global -r, --result (option can be used multiple times) default: 0 result mode: 0 - consensus (FASTA) 1 - multiple sequence alignment (FASTA) 2 - 0 & 1 (FASTA) 3 - partial order graph (GFA) 4 - 0 & 3 (GFA) -d, --dot output file for the partial order graph in DOT format -s, --strand-ambiguous for each sequence pick the strand with the better alignment --version prints the version number -h, --help prints the usage gap mode: linear if g >= e affine if g <= q or e >= c convex otherwise (default) ``` Running `make install` will install the library and the executable. If you choose to build with cereal or want to generate the dispatcher, cereal and cpu_features (see Dependencies) need to be installed beforehand, respectively. 
Once the library is installed, with or without additional options, a package will be copied to your system that can be searched and linked with: ```cmake find_package(spoa) target_link_libraries( spoa::spoa) ``` On the other hand, you can include spoa as a submodule and add it to your project with the following: ```cmake if (NOT TARGET spoa) add_subdirectory(/spoa EXCLUDE_FROM_ALL) endif () target_link_libraries( spoa::spoa) ``` #### Build options - `spoa_install`: generate library install target - `spoa_build_exe`: build executable - `spoa_build_tests`: build unit tests - `spoa_optimize_for_native`: build with `-march=native` - `spoa_optimize_for_portability`: build with `-msse4.1` - `spoa_use_cereal`: use cereal library - `spoa_use_simde`: build with SIMDe for porting vectorized code - `spoa_use_simde_nonvec`: use SIMDe library for nonvectorized code - `spoa_use_simde_openmp`: use SIMDe support for OpenMP SIMD - `spoa_generate_dispatch`: use SIMDe to generate x86 dispatch #### Dependencies - gcc 4.8+ | clang 3.5+ - cmake 3.12+ - (spoa_exe)(spoa_test) zlib 1.2.8+ ###### Hidden - (optional) USCiLab/cereal 1.3.0 - (optional) simd-everywhere/simde 0.7.0 - (optional) google/cpu_features 0.6.0 - (spoa_exe)(spoa_test) rvaser/bioparser 3.0.13 - (spoa_exe)(spoa_test) rvaser/biosoup 0.10.0 - (spoa_test) google/googletest 1.10.0 ## Examples ```cpp #include #include "spoa/spoa.hpp" int main(int argc, char** argv) { std::vector sequences = { "CATAAAAGAACGTAGGTCGCCCGTCCGTAACCTGTCGGATCACCGGAAAGGACCCGTAAAGTGATAATGAT", "ATAAAGGCAGTCGCTCTGTAAGCTGTCGATTCACCGGAAAGATGGCGTTACCACGTAAAGTGATAATGATTAT", "ATCAAAGAACGTGTAGCCTGTCCGTAATCTAGCGCATTTCACACGAGACCCGCGTAATGGG", "CGTAAATAGGTAATGATTATCATTACATATCACAACTAGGGCCGTATTAATCATGATATCATCA", "GTCGCTAGAGGCATCGTGAGTCGCTTCCGTACCGCAAGGATGACGAGTCACTTAAAGTGATAAT", "CCGTAACCTTCATCGGATCACCGGAAAGGACCCGTAAATAGACCTGATTATCATCTACAT" }; auto alignment_engine = spoa::AlignmentEngine::Create( spoa::AlignmentType::kNW, 3, -5, -3); // linear gaps spoa::Graph 
graph{}; for (const auto& it : sequences) { auto alignment = alignment_engine->Align(it, graph); graph.AddAlignment(alignment, it); } auto consensus = graph.GenerateConsensus(); std::cerr << ">Consensus LN:i:" << consensus.size() << std::endl << consensus << std::endl; auto msa = graph.GenerateMultipleSequenceAlignment(); for (const auto& it : msa) { std::cerr << it << std::endl; } return 0; } ``` ## Acknowledgement This work has been supported in part by Croatian Science Foundation under projects UIP-11-2013-7353 and IP-2018-01-5886. spoa-4.0.8/include/000077500000000000000000000000001400776337600141255ustar00rootroot00000000000000spoa-4.0.8/include/spoa/000077500000000000000000000000001400776337600150675ustar00rootroot00000000000000spoa-4.0.8/include/spoa/alignment_engine.hpp000066400000000000000000000042231400776337600211040ustar00rootroot00000000000000// Copyright (c) 2020 Robert Vaser #ifndef SPOA_ALIGNMENT_ENGINE_HPP_ #define SPOA_ALIGNMENT_ENGINE_HPP_ #pragma once #include #include #include #include #include namespace spoa { enum class AlignmentType { kSW, // Smith Waterman kNW, // Needleman Wunsch kOV // Overlap }; enum class AlignmentSubtype { kLinear, // g * i kAffine, // g + (i - 1) * e kConvex // min(g1 + (i - 1) * e1, g2 + (i - 1) * e2) }; class Graph; using Alignment = std::vector>; class AlignmentEngine { public: virtual ~AlignmentEngine() = default; static std::unique_ptr Create( AlignmentType type, std::int8_t m, // match std::int8_t n, // mismatch std::int8_t g); // gap static std::unique_ptr Create( AlignmentType type, std::int8_t m, std::int8_t n, std::int8_t g, // gap open std::int8_t e); // gap extend static std::unique_ptr Create( AlignmentType type, std::int8_t m, std::int8_t n, std::int8_t g, std::int8_t e, std::int8_t q, // gap open of second affine std::int8_t c); // gap extend of second affine virtual void Prealloc( std::uint32_t max_sequence_len, std::uint8_t alphabet_size) = 0; Alignment Align( const std::string& sequence, const Graph& 
graph, std::int32_t* score = nullptr); virtual Alignment Align( const char* sequence, std::uint32_t sequence_len, const Graph& graph, std::int32_t* score = nullptr) = 0; protected: AlignmentEngine( AlignmentType type, AlignmentSubtype subtype, std::int8_t m, std::int8_t n, std::int8_t g, std::int8_t e, std::int8_t q, std::int8_t c); std::int64_t WorstCaseAlignmentScore( std::int64_t sequence_len, std::int64_t graph_len) const; AlignmentType type_; AlignmentSubtype subtype_; std::int8_t m_; std::int8_t n_; std::int8_t g_; std::int8_t e_; std::int8_t q_; std::int8_t c_; }; } // namespace spoa #endif // SPOA_ALIGNMENT_ENGINE_HPP_ spoa-4.0.8/include/spoa/architectures.hpp000066400000000000000000000004051400776337600204440ustar00rootroot00000000000000// Copyright (c) 2020 Mario Brcic, Robert Vaser #ifndef SPOA_ARCHITECTURES_HPP_ #define SPOA_ARCHITECTURES_HPP_ namespace spoa { enum class Architecture { kAVX2, kSSE4_1, kSSE2, kAutomatic }; } // namespace spoa #endif // SPOA_ARCHITECTURES_HPP_ spoa-4.0.8/include/spoa/graph.hpp000066400000000000000000000167451400776337600167160ustar00rootroot00000000000000// Copyright (c) 2020 Robert Vaser #ifndef SPOA_GRAPH_HPP_ #define SPOA_GRAPH_HPP_ #include #include #include #include #include #include #include #ifdef SPOA_USE_CEREAL #include "cereal/access.hpp" #include "cereal/types/memory.hpp" #include "cereal/types/vector.hpp" #include "cereal/types/utility.hpp" #endif namespace spoa { using Alignment = std::vector>; class Graph { public: Graph(); Graph(const Graph&) = delete; Graph& operator=(const Graph&) = delete; Graph(Graph&&) = default; Graph& operator=(Graph&&) = default; ~Graph() = default; struct Node; struct Edge; struct Node { public: Node(std::uint32_t id, std::uint32_t code); Node(const Node&) = default; Node& operator=(const Node&) = default; Node(Node&&) = default; Node& operator=(Node&&) = default; Node* Successor(std::uint32_t label) const; std::uint32_t Coverage() const; std::uint32_t id; std::uint32_t code; 
std::vector inedges; std::vector outedges; std::vector aligned_nodes; private: #ifdef SPOA_USE_CEREAL Node() = default; template void serialize(Archive& archive) { // NOLINT archive(CEREAL_NVP(id), CEREAL_NVP(code)); } friend cereal::access; #endif }; struct Edge { public: Edge(Node* tail, Node* head, std::uint32_t label, std::uint32_t weight); Edge(const Edge&) = default; Edge& operator=(const Edge&) = default; Edge(Edge&&) = default; Edge& operator=(Edge&&) = default; void AddSequence(std::uint32_t label, std::uint32_t weight = 1); Node* tail; Node* head; std::vector labels; std::int64_t weight; private: #ifdef SPOA_USE_CEREAL Edge() = default; template void serialize(Archive& archive) { // NOLINT archive(labels, weight); } friend cereal::access; #endif }; const std::vector>& nodes() const { return nodes_; } const std::vector>& edges() const { return edges_; } const std::vector& rank_to_node() const { return rank_to_node_; } const std::vector& sequences() const { return sequences_; } std::uint32_t num_codes() const { return num_codes_; } std::uint8_t coder(std::uint8_t c) const { return coder_[c]; } std::uint8_t decoder(std::uint8_t code) const { return decoder_[code]; } const std::vector& consensus() const { return consensus_; } void AddAlignment( const Alignment& alignment, const std::string& sequence, std::uint32_t weight = 1); void AddAlignment( const Alignment& alignment, const std::string& sequence, const std::vector& weights); void AddAlignment( const Alignment& alignment, const std::string& sequence, const std::string& quality); void AddAlignment( const Alignment& alignment, const char* sequence, std::uint32_t sequence_len, std::uint32_t weight = 1); void AddAlignment( const Alignment& alignment, const char* sequence, std::uint32_t sequence_len, const std::vector& weights); void AddAlignment( const Alignment& alignment, const char* sequence, std::uint32_t sequence_len, const char* quality, std::uint32_t quality_len); std::vector 
GenerateMultipleSequenceAlignment( bool include_consensus = false); std::string GenerateConsensus(); std::string GenerateConsensus( std::vector* summary, bool verbose = false); Graph Subgraph( std::uint32_t begin, std::uint32_t end, std::vector* subgraph_to_graph) const; void UpdateAlignment( const std::vector& subgraph_to_graph, Alignment* alignment) const; // print with Graphviz void PrintDot(const std::string& path) const; void Clear(); #ifdef SPOA_USE_CEREAL template void save(Archive& archive) const { // NOLINT std::vector sequences; for (const auto& it : sequences_) { sequences.emplace_back(it->id); } std::vector> connections; for (const auto& it : edges_) { connections.emplace_back(it->tail->id, it->head->id); } std::vector> aligned_nodes; for (const auto& it : nodes_) { for (const auto& jt : it->aligned_nodes) { if (it->id < jt->id) { aligned_nodes.emplace_back(it->id, jt->id); } } } std::vector rank_to_node_id; for (const auto& it : rank_to_node_) { rank_to_node_id.emplace_back(it->id); } std::vector consensus; for (const auto& it : consensus_) { consensus.emplace_back(it->id); } archive( num_codes_, coder_, decoder_, nodes_, edges_, sequences, connections, aligned_nodes, rank_to_node_id, consensus); } template void load(Archive& archive) { // NOLINT std::vector sequences; std::vector> connections; std::vector> aligned_nodes; std::vector rank_to_node_id; std::vector consensus; archive( num_codes_, coder_, decoder_, nodes_, edges_, sequences, connections, aligned_nodes, rank_to_node_id, consensus); for (const auto& it : sequences) { sequences_.emplace_back(nodes_[it].get()); } for (std::uint32_t i = 0; i < connections.size(); ++i) { edges_[i]->tail = nodes_[connections[i].first].get(); edges_[i]->head = nodes_[connections[i].second].get(); edges_[i]->tail->outedges.emplace_back(edges_[i].get()); edges_[i]->head->inedges.emplace_back(edges_[i].get()); } for (const auto& it : aligned_nodes) { 
nodes_[it.first]->aligned_nodes.emplace_back(nodes_[it.second].get()); nodes_[it.second]->aligned_nodes.emplace_back(nodes_[it.first].get()); } for (const auto& it : rank_to_node_id) { rank_to_node_.emplace_back(nodes_[it].get()); } for (const auto& it : consensus) { consensus_.emplace_back(nodes_[it].get()); } } #endif private: Node* AddNode(std::uint32_t code); void AddEdge(Node* tail, Node* head, std::uint32_t weight); Node* AddSequence( const char* sequence, const std::vector& weights, std::uint32_t begin, std::uint32_t end); void TopologicalSort(); bool IsTopologicallySorted() const; void TraverseHeaviestBundle(); Node* BranchCompletion( std::uint32_t rank, std::vector* scores, std::vector* predecessors); std::vector ExtractSubgraph(const Node* begin, const Node* end) const; std::vector InitializeMultipleSequenceAlignment( std::uint32_t* row_size = nullptr) const; std::uint32_t num_codes_; std::vector coder_; std::vector decoder_; std::vector sequences_; std::vector> nodes_; std::vector> edges_; std::vector rank_to_node_; std::vector consensus_; }; } // namespace spoa #endif // SPOA_GRAPH_HPP_ spoa-4.0.8/include/spoa/spoa.hpp000066400000000000000000000002431400776337600165410ustar00rootroot00000000000000// Copyright (c) 2020 Robert Vaser #ifndef SPOA_SPOA_HPP_ #define SPOA_SPOA_HPP_ #include "graph.hpp" #include "alignment_engine.hpp" #endif // SPOA_SPOA_HPP_ spoa-4.0.8/src/000077500000000000000000000000001400776337600132715ustar00rootroot00000000000000spoa-4.0.8/src/alignment_engine.cpp000066400000000000000000000053061400776337600173040ustar00rootroot00000000000000// Copyright (c) 2020 Robert Vaser #include "spoa/alignment_engine.hpp" #include #include #include #include #include "sisd_alignment_engine.hpp" #include "simd_alignment_engine.hpp" namespace spoa { std::unique_ptr AlignmentEngine::Create( AlignmentType type, std::int8_t m, std::int8_t n, std::int8_t g) { return Create(type, m, n, g, g); } std::unique_ptr AlignmentEngine::Create( AlignmentType type, 
std::int8_t m, std::int8_t n, std::int8_t g, std::int8_t e) { return Create(type, m, n, g, e, g, e); } std::unique_ptr AlignmentEngine::Create( AlignmentType type, std::int8_t m, std::int8_t n, std::int8_t g, std::int8_t e, std::int8_t q, std::int8_t c) { if (type != AlignmentType::kSW && type != AlignmentType::kNW && type != AlignmentType::kOV) { throw std::invalid_argument( "[spoa::AlignmentEngine::Create] error: invalid alignment type!"); } if (g > 0 || q > 0) { throw std::invalid_argument( "[spoa::AlignmentEngine::Create] error: " "gap opening penalty must be non-positive!"); } if (e > 0 || c > 0) { throw std::invalid_argument( "[spoa::AlignmentEngine::Create] error: " "gap extension penalty must be non-positive!"); } AlignmentSubtype subtype = g >= e ? AlignmentSubtype::kLinear : (g <= q || e >= c ? AlignmentSubtype::kAffine : AlignmentSubtype::kConvex); if (subtype == AlignmentSubtype::kLinear) { e = g; } else if (subtype == AlignmentSubtype::kAffine) { q = g; c = e; } auto dst = CreateSimdAlignmentEngine(type, subtype, m, n, g, e, q, c); if (!dst) { return SisdAlignmentEngine::Create(type, subtype, m, n, g, e, q, c); } return dst; } AlignmentEngine::AlignmentEngine( AlignmentType type, AlignmentSubtype subtype, std::int8_t m, std::int8_t n, std::int8_t g, std::int8_t e, std::int8_t q, std::int8_t c) : type_(type), subtype_(subtype), m_(m), n_(n), g_(g), e_(e), q_(q), c_(c) { } Alignment AlignmentEngine::Align( const std::string& sequence, const Graph& graph, std::int32_t* score) { return Align(sequence.c_str(), sequence.size(), graph, score); } std::int64_t AlignmentEngine::WorstCaseAlignmentScore( std::int64_t i, std::int64_t j) const { auto gap_score = [&] (std::int64_t len) -> std::int64_t { return len == 0 ? 
0 : std::min(g_ + (len - 1) * e_, q_ + (len - 1) * c_); }; return std::min( -1 * (m_ * std::min(i, j) + gap_score(std::abs(i - j))), gap_score(i) + gap_score(j)); } } // namespace spoa spoa-4.0.8/src/graph.cpp000066400000000000000000000423061400776337600151030ustar00rootroot00000000000000// Copyright (c) 2020 Robert Vaser #include "spoa/graph.hpp" #include #include #include #include #include #include namespace spoa { Graph::Node::Node(std::uint32_t id, std::uint32_t code) : id(id), code(code), inedges(), outedges(), aligned_nodes() { } Graph::Node* Graph::Node::Successor(std::uint32_t label) const { for (const auto& it : outedges) { auto jt = std::find(it->labels.begin(), it->labels.end(), label); if (jt != it->labels.end()) { return it->head; } } return nullptr; } std::uint32_t Graph::Node::Coverage() const { std::unordered_set labels; for (const auto& it : inedges) { std::copy( it->labels.begin(), it->labels.end(), std::inserter(labels, labels.end())); } for (const auto& it : outedges) { std::copy( it->labels.begin(), it->labels.end(), std::inserter(labels, labels.end())); } return labels.size(); } Graph::Edge::Edge( Node* tail, Node* head, std::uint32_t label, std::uint32_t weight) : tail(tail), head(head), labels(1, label), weight(weight) { } void Graph::Edge::AddSequence(std::uint32_t label, std::uint32_t w) { labels.emplace_back(label); weight += w; } Graph::Graph() : num_codes_(0), coder_(256, -1), decoder_(256, -1), sequences_(), nodes_(), edges_(), rank_to_node_(), consensus_() { } Graph::Node* Graph::AddNode(std::uint32_t code) { nodes_.emplace_back(new Node(nodes_.size(), code)); return nodes_.back().get(); } void Graph::AddEdge(Node* tail, Node* head, std::uint32_t weight) { for (const auto& it : tail->outedges) { if (it->head == head) { it->AddSequence(sequences_.size(), weight); return; } } edges_.emplace_back(new Edge(tail, head, sequences_.size(), weight)); tail->outedges.emplace_back(edges_.back().get()); 
head->inedges.emplace_back(edges_.back().get()); } Graph::Node* Graph::AddSequence( const char* sequence, const std::vector& weights, std::uint32_t begin, std::uint32_t end) { if (begin == end) { return nullptr; } Node* prev = nullptr; for (std::uint32_t i = begin; i < end; ++i) { auto curr = AddNode(coder_[sequence[i]]); if (prev) { // both nodes contribute to the weight AddEdge(prev, curr, weights[i - 1] + weights[i]); } prev = curr; } return nodes_[nodes_.size() - (end - begin)].get(); } void Graph::AddAlignment( const Alignment& alignment, const std::string& sequence, std::uint32_t weight) { AddAlignment(alignment, sequence.c_str(), sequence.size(), weight); } void Graph::AddAlignment( const Alignment& alignment, const char* sequence, std::uint32_t sequence_len, std::uint32_t weight) { std::vector weights(sequence_len, weight); AddAlignment(alignment, sequence, sequence_len, weights); } void Graph::AddAlignment( const Alignment& alignment, const std::string& sequence, const std::string& quality) { AddAlignment( alignment, sequence.c_str(), sequence.size(), quality.c_str(), quality.size()); } void Graph::AddAlignment( const Alignment& alignment, const char* sequence, std::uint32_t sequence_len, const char* quality, std::uint32_t quality_len) { std::vector weights; for (std::uint32_t i = 0; i < quality_len; ++i) { weights.emplace_back(quality[i] - 33); // Phred quality } AddAlignment(alignment, sequence, sequence_len, weights); } void Graph::AddAlignment( const Alignment& alignment, const std::string& sequence, const std::vector& weights) { AddAlignment(alignment, sequence.c_str(), sequence.size(), weights); } void Graph::AddAlignment( const Alignment& alignment, const char* sequence, std::uint32_t sequence_len, const std::vector& weights) { if (sequence_len == 0) { return; } if (sequence_len != weights.size()) { throw std::invalid_argument( "[spoa::Graph::AddAlignment] error: " "sequence and weights are of unequal size!"); } for (std::uint32_t i = 0; i < 
sequence_len; ++i) { if (coder_[sequence[i]] == -1) { coder_[sequence[i]] = num_codes_; decoder_[num_codes_++] = sequence[i]; } } if (alignment.empty()) { sequences_.emplace_back(AddSequence(sequence, weights, 0, sequence_len)); TopologicalSort(); return; } std::vector valid; for (const auto& it : alignment) { if (it.second != -1) { if (it.second < 0 || it.second >= static_cast(sequence_len)) { // NOLINT throw std::invalid_argument( "[spoa::Graph::AddAlignment] error: invalid alignment"); } valid.emplace_back(it.second); } } if (valid.empty()) { throw std::invalid_argument( "[spoa::Graph::AddAlignment] error: missing sequence in alignment"); } // add unaligned bases Node* begin = AddSequence(sequence, weights, 0, valid.front()); Node* prev = begin ? nodes_.back().get() : nullptr; Node* last = AddSequence(sequence, weights, valid.back() + 1, sequence_len); // add aligned bases for (const auto& it : alignment) { if (it.second == -1) { continue; } std::uint32_t code = coder_[sequence[it.second]]; Node* curr = nullptr; if (it.first == -1) { curr = AddNode(code); } else { auto jt = nodes_[it.first].get(); if (jt->code == code) { curr = jt; } else { for (const auto& kt : jt->aligned_nodes) { if (kt->code == code) { curr = kt; break; } } if (!curr) { curr = AddNode(code); for (const auto& kt : jt->aligned_nodes) { kt->aligned_nodes.emplace_back(curr); curr->aligned_nodes.emplace_back(kt); } jt->aligned_nodes.emplace_back(curr); curr->aligned_nodes.emplace_back(jt); } } } if (!begin) { begin = curr; } if (prev) { // both nodes contribute to weight AddEdge(prev, curr, weights[it.second - 1] + weights[it.second]); } prev = curr; } if (last) { AddEdge(prev, last, weights[valid.back()] + weights[valid.back() + 1]); } sequences_.emplace_back(begin); TopologicalSort(); } void Graph::TopologicalSort() { rank_to_node_.clear(); std::vector marks(nodes_.size(), 0); std::vector ignored(nodes_.size(), 0); std::stack stack; for (const auto& it : nodes_) { if (marks[it->id] != 0) { 
continue; } stack.push(it.get()); while (!stack.empty()) { auto curr = stack.top(); bool is_valid = true; if (marks[curr->id] != 2) { for (const auto& jt : curr->inedges) { if (marks[jt->tail->id] != 2) { stack.push(jt->tail); is_valid = false; } } if (!ignored[curr->id]) { for (const auto& jt : curr->aligned_nodes) { if (marks[jt->id] != 2) { stack.push(jt); ignored[jt->id] = true; is_valid = false; } } } assert((is_valid || marks[curr->id] != 1) && "Graph is not a DAG"); if (is_valid) { marks[curr->id] = 2; if (!ignored[curr->id]) { rank_to_node_.emplace_back(curr); for (const auto& jt : curr->aligned_nodes) { rank_to_node_.emplace_back(jt); } } } else { marks[curr->id] = 1; } } if (is_valid) { stack.pop(); } } } assert(IsTopologicallySorted() && "Graph is not topologically sorted"); } bool Graph::IsTopologicallySorted() const { assert(nodes_.size() == rank_to_node_.size() && "Topological sort not called "); // NOLINT std::vector visited(nodes_.size(), 0); for (const auto& it : rank_to_node_) { for (const auto& jt : it->inedges) { if (!visited[jt->tail->id]) { return false; } } visited[it->id] = 1; } return true; } std::vector Graph::InitializeMultipleSequenceAlignment( std::uint32_t* row_size) const { std::vector dst(nodes_.size()); std::uint32_t j = 0; for (std::uint32_t i = 0; i < rank_to_node_.size(); ++i, ++j) { auto it = rank_to_node_[i]; dst[it->id] = j; for (const auto& jt : it->aligned_nodes) { dst[jt->id] = j; ++i; } } if (row_size) { *row_size = j; } return dst; } std::vector Graph::GenerateMultipleSequenceAlignment( bool include_consensus) { std::uint32_t row_size = 0; auto node_id_to_column = InitializeMultipleSequenceAlignment(&row_size); std::vector dst; for (std::uint32_t i = 0; i < sequences_.size(); ++i) { std::string row(row_size, '-'); auto it = sequences_[i]; while (true) { row[node_id_to_column[it->id]] = decoder_[it->code]; if (!(it = it->Successor(i))) { break; } } dst.emplace_back(row); } if (include_consensus) { TraverseHeaviestBundle(); 
std::string row(row_size, '-'); for (const auto& it : consensus_) { row[node_id_to_column[it->id]] = decoder_[it->code]; } dst.emplace_back(row); } return dst; } std::string Graph::GenerateConsensus() { TraverseHeaviestBundle(); std::string dst{}; for (const auto& it : consensus_) { dst += decoder_[it->code]; } return dst; } std::string Graph::GenerateConsensus( std::vector* summary, bool verbose) { if (!summary) { throw std::invalid_argument( "[spoa::Graph::GenerateConsensus] error: invalid ptr to summary"); } auto dst = GenerateConsensus(); summary->clear(); if (!verbose) { for (const auto& it : consensus_) { summary->emplace_back(0); summary->back() += it->Coverage(); for (const auto& jt : it->aligned_nodes) { summary->back() += jt->Coverage(); } } } else { summary->resize((num_codes_ + 1) * consensus_.size(), 0); auto node_id_to_column = InitializeMultipleSequenceAlignment(); for (std::uint32_t i = 0; i < sequences_.size(); ++i) { Node* it = sequences_[i]; std::uint32_t c = 0, p, column = node_id_to_column[it->id]; bool is_gap = false; while (true) { for (; c < consensus_.size(); ++c) { if (node_id_to_column[consensus_[c]->id] < column) { continue; } else { if (node_id_to_column[consensus_[c]->id] == column) { if (is_gap) { for (std::uint32_t j = p + 1; j < c; ++j) { ++(*summary)[num_codes_ * consensus_.size() + j]; } } is_gap = true; p = c; ++(*summary)[it->code * consensus_.size() + c]; } break; } } if (c == consensus_.size() || !(it = it->Successor(i))) { break; } column = node_id_to_column[it->id]; } } } return dst; } void Graph::TraverseHeaviestBundle() { if (rank_to_node_.empty()) { return; } std::vector predecessors(nodes_.size(), nullptr); std::vector scores(nodes_.size(), -1); Node* max = nullptr; for (const auto& it : rank_to_node_) { for (const auto& jt : it->inedges) { if ((scores[it->id] < jt->weight) || (scores[it->id] == jt->weight && scores[predecessors[it->id]->id] <= scores[jt->tail->id])) { // NOLINT scores[it->id] = jt->weight; 
predecessors[it->id] = jt->tail; } } if (predecessors[it->id]) { scores[it->id] += scores[predecessors[it->id]->id]; } if (!max || scores[max->id] < scores[it->id]) { max = it; } } if (!max->outedges.empty()) { std::vector node_id_to_rank(nodes_.size(), 0); for (std::uint32_t i = 0; i < rank_to_node_.size(); ++i) { node_id_to_rank[rank_to_node_[i]->id] = i; } while (!max->outedges.empty()) { max = BranchCompletion(node_id_to_rank[max->id], &scores, &predecessors); } } // traceback consensus_.clear(); while (predecessors[max->id]) { consensus_.emplace_back(max); max = predecessors[max->id]; } consensus_.emplace_back(max); std::reverse(consensus_.begin(), consensus_.end()); } Graph::Node* Graph::BranchCompletion( std::uint32_t rank, std::vector* scores, std::vector* predecessors) { auto start = rank_to_node_[rank]; for (const auto& it : start->outedges) { for (const auto& jt : it->head->inedges) { if (jt->tail != start) { (*scores)[jt->tail->id] = -1; } } } Node* max = nullptr; for (std::uint32_t i = rank + 1; i < rank_to_node_.size(); ++i) { auto it = rank_to_node_[i]; (*scores)[it->id] = -1; (*predecessors)[it->id] = nullptr; for (const auto& jt : it->inedges) { if ((*scores)[jt->tail->id] == -1) { continue; } if (((*scores)[it->id] < jt->weight) || ((*scores)[it->id] == jt->weight && (*scores)[(*predecessors)[it->id]->id] <= (*scores)[jt->tail->id])) { // NOLINT (*scores)[it->id] = jt->weight; (*predecessors)[it->id] = jt->tail; } } if ((*predecessors)[it->id]) { (*scores)[it->id] += (*scores)[(*predecessors)[it->id]->id]; } if (!max || (*scores)[max->id] < (*scores)[it->id]) { max = it; } } return max; } std::vector Graph::ExtractSubgraph(const Node* begin, const Node* end) const { // NOLINT std::vector dst(nodes_.size(), false); std::stack stack; stack.push(begin); while (!stack.empty()) { auto curr = stack.top(); stack.pop(); if (!dst[curr->id] && curr->id >= end->id) { for (const auto& it : curr->inedges) { stack.push(it->tail); } for (const auto& it : 
curr->aligned_nodes) { stack.push(it); } dst[curr->id] = true; } } return dst; } Graph Graph::Subgraph( std::uint32_t begin, std::uint32_t end, std::vector* subgraph_to_graph) const { if (!subgraph_to_graph) { throw std::invalid_argument( "[spoa::Graph::Subgraph] error: invalid ptr to subgraph_to_graph"); } auto is_in_subgraph = ExtractSubgraph(nodes_[end].get(), nodes_[begin].get()); // init subgraph Graph subgraph{}; subgraph.num_codes_ = num_codes_; subgraph.coder_ = coder_; subgraph.decoder_ = decoder_; // subgraph.sequences_ = TODO(rvaser) maybe add sequences // create a map from subgraph nodes to graph nodes and vice versa subgraph_to_graph->clear(); subgraph_to_graph->resize(nodes_.size(), nullptr); std::vector graph_to_subgraph(nodes_.size(), nullptr); for (const auto& it : nodes_) { if (!is_in_subgraph[it->id]) { continue; } subgraph.AddNode(it->code); graph_to_subgraph[it->id] = subgraph.nodes_.back().get(); (*subgraph_to_graph)[subgraph.nodes_.back()->id] = it.get(); } // connect nodes for (const auto& it : nodes_) { if (!is_in_subgraph[it->id]) { continue; } auto jt = graph_to_subgraph[it->id]; for (const auto& kt : it->inedges) { if (graph_to_subgraph[kt->tail->id]) { subgraph.AddEdge(graph_to_subgraph[kt->tail->id], jt, kt->weight); } } for (const auto& kt : it->aligned_nodes) { if (graph_to_subgraph[kt->id]) { jt->aligned_nodes.emplace_back(graph_to_subgraph[kt->id]); } } } subgraph.TopologicalSort(); return subgraph; } void Graph::UpdateAlignment( const std::vector& subgraph_to_graph, Alignment* alignment) const { for (auto& it : *alignment) { if (it.first != -1) { it.first = subgraph_to_graph[it.first]->id; } } } void Graph::PrintDot(const std::string& path) const { if (path.empty()) { return; } std::ofstream os(path); std::vector consensus_rank(nodes_.size(), -1); std::int32_t rank = 0; for (const auto& it : consensus_) { consensus_rank[it->id] = rank++; } os << "digraph " << sequences_.size() << " {" << std::endl << " graph [rankdir = LR]" << 
std::endl; for (const auto& it : nodes_) { os << " " << it->id << "[label = \"" << it->id << " - " << static_cast(decoder_[it->code]) << "\""; if (consensus_rank[it->id] != -1) { os << ", style = filled, fillcolor = goldenrod1"; } os << "]" << std::endl; for (const auto& jt : it->outedges) { os << " " << it->id << " -> " << jt->head->id << " [label = \"" << jt->weight << "\""; if (consensus_rank[it->id] + 1 == consensus_rank[jt->head->id]) { os << ", color = goldenrod1"; } os << "]" << std::endl; } for (const auto& jt : it->aligned_nodes) { if (jt->id > it->id) { os << " " << it->id << " -> " << jt->id << " [style = dotted, arrowhead = none]" << std::endl; } } } os << "}" << std::endl; os.close(); } void Graph::Clear() { num_codes_ = 0; std::fill(coder_.begin(), coder_.end(), -1); std::fill(decoder_.begin(), decoder_.end(), -1); sequences_.clear(); nodes_.clear(); edges_.clear(); rank_to_node_.clear(); consensus_.clear(); } } // namespace spoa spoa-4.0.8/src/main.cpp000066400000000000000000000250141400776337600147230ustar00rootroot00000000000000// Copyright (c) 2020 Robert Vaser #include #include "bioparser/fasta_parser.hpp" #include "bioparser/fastq_parser.hpp" #include "biosoup/sequence.hpp" #include "spoa/spoa.hpp" std::atomic biosoup::Sequence::num_objects{0}; namespace { static struct option options[] = { {"algorithm", required_argument, nullptr, 'l'}, {"result", required_argument, nullptr, 'r'}, {"dot", required_argument, nullptr, 'd'}, {"strand-ambiguous", no_argument, nullptr, 's'}, {"version", no_argument, nullptr, 'v'}, {"help", no_argument, nullptr, 'h'}, {nullptr, 0, nullptr, 0} }; std::unique_ptr> CreateParser( const std::string& path) { auto is_suffix = [] (const std::string& str, const std::string& suff) { return str.size() < suff.size() ? 
false : str.compare(str.size() - suff.size(), suff.size(), suff) == 0; }; if (is_suffix(path, ".fasta") || is_suffix(path, ".fasta.gz") || is_suffix(path, ".fna") || is_suffix(path, ".fna.gz") || is_suffix(path, ".faa") || is_suffix(path, ".faa.gz") || is_suffix(path, ".fa") || is_suffix(path, ".fa.gz")) { try { return bioparser::Parser::Create(path); // NOLINT } catch (const std::invalid_argument& exception) { std::cerr << exception.what() << std::endl; return nullptr; } } if (is_suffix(path, ".fastq") || is_suffix(path, ".fastq.gz") || is_suffix(path, ".fq") || is_suffix(path, ".fq.gz")) { try { return bioparser::Parser::Create(path); // NOLINT } catch (const std::invalid_argument& exception) { std::cerr << exception.what() << std::endl; return nullptr; } } std::cerr << "[spoa::CreateParser] error: file " << path << " has unsupported format extension (valid extensions: .fasta, " << ".fasta.gz, .fna, .fna.gz, .faa, .faa.gz, .fa, .fa.gz, .fastq, " << ".fastq.gz, .fq, .fq.gz)" << std::endl; return nullptr; } void Help() { std::cout << "usage: spoa [options ...] 
\n" "\n" " # default output is stdout\n" " \n" " input file in FASTA/FASTQ format (can be compressed with gzip)\n" "\n" " options:\n" " -m \n" " default: 5\n" " score for matching bases\n" " -n \n" " default: -4\n" " score for mismatching bases\n" " -g \n" " default: -8\n" " gap opening penalty (must be non-positive)\n" " -e \n" " default: -6\n" " gap extension penalty (must be non-positive)\n" " -q \n" " default: -10\n" " gap opening penalty of the second affine function\n" " (must be non-positive)\n" " -c \n" " default: -4\n" " gap extension penalty of the second affine function\n" " (must be non-positive)\n" " -l, --algorithm \n" " default: 0\n" " alignment mode:\n" " 0 - local (Smith-Waterman)\n" " 1 - global (Needleman-Wunsch)\n" " 2 - semi-global\n" " -r, --result (option can be used multiple times)\n" " default: 0\n" " result mode:\n" " 0 - consensus (FASTA)\n" " 1 - multiple sequence alignment (FASTA)\n" " 2 - 0 & 1 (FASTA)\n" " 3 - partial order graph (GFA)\n" " 4 - 0 & 3 (GFA)\n" " -d, --dot \n" " output file for the partial order graph in DOT format\n" " -s, --strand-ambiguous\n" " for each sequence pick the strand with the better alignment\n" " --version\n" " prints the version number\n" " -h, --help\n" " prints the usage\n" "\n" " gap mode:\n" " linear if g >= e\n" " affine if g <= q or e >= c\n" " convex otherwise (default)\n"; } void PrintGfa( const spoa::Graph& graph, const std::vector& headers, const std::vector& is_reversed, bool include_consensus = false) { if (headers.size() < graph.sequences().size()) { std::cerr << "[spoa::PrintGfa] error: missing header(s)" << std::endl; return; } if (!is_reversed.empty() && is_reversed.size() < graph.sequences().size()) { std::cerr << "[spoa::PringGfa] error: missing reversion flag(s)" << std::endl; // NOLINT return; } std::vector is_consensus_node(graph.nodes().size(), false); for (const auto& it : graph.consensus()) { is_consensus_node[it->id] = true; } std::cout << "H\tVN:Z:1.0" << std::endl; for (const 
auto& it : graph.nodes()) { std::cout << "S\t" << it->id + 1 << "\t" << static_cast(graph.decoder(it->code)); if (is_consensus_node[it->id]) { std::cout << "\tic:Z:true"; } std::cout << std::endl; for (const auto& jt : it->outedges) { std::cout << "L\t" << it->id + 1 << "\t" << "+\t" << jt->head->id + 1 << "\t" << "+\t" << "OM\t" << "ew:f:" << jt->weight; if (is_consensus_node[it->id] && is_consensus_node[jt->head->id]) { std::cout << "\tic:Z:true"; } std::cout << std::endl; } } for (std::uint32_t i = 0; i < graph.sequences().size(); ++i) { std::cout << "P\t" << headers[i] << "\t"; std::vector path; auto curr = graph.sequences()[i]; while (true) { path.emplace_back(curr->id + 1); if (!(curr = curr->Successor(i))) { break; } } bool ir = !is_reversed.empty() && is_reversed[i]; if (ir) { std::reverse(path.begin(), path.end()); } for (std::uint32_t j = 0; j < path.size(); ++j) { if (j != 0) { std::cout << ","; } std::cout << path[j] << (ir ? "-" : "+"); } std::cout << "\t*" << std::endl; } if (include_consensus) { std::cout << "P\tConsensus\t"; for (std::uint32_t i = 0; i < graph.consensus().size(); ++i) { if (i != 0) { std::cout << ","; } std::cout << graph.consensus()[i]->id + 1 << "+"; } std::cout << "\t*" << std::endl; } } } // namespace int main(int argc, char** argv) { std::int8_t m = 5; std::int8_t n = -4; std::int8_t g = -8; std::int8_t e = -6; std::int8_t q = -10; std::int8_t c = -4; std::uint8_t algorithm = 0; std::vector results = { 0 }; std::string dot_path{}; bool is_strand_ambiguous = false; std::string optstr = "m:n:g:e:q:c:l:r:d:sh"; int opt; while ((opt = getopt_long(argc, argv, optstr.c_str(), options, nullptr)) != -1) { // NOLINT switch (opt) { case 'm': m = atoi(optarg); break; case 'n': n = atoi(optarg); break; case 'g': g = atoi(optarg); break; case 'e': e = atoi(optarg); break; case 'q': q = atoi(optarg); break; case 'c': c = atoi(optarg); break; case 'l': algorithm = atoi(optarg); break; case 'r': results.emplace_back(atoi(optarg)); break; case 
'd': dot_path = optarg; break; case 's': is_strand_ambiguous = true; break; case 'v': std::cout << VERSION << std::endl; return 0; case 'h': Help(); return 0; default: return 1; } } if (results.size() > 1) { results.erase(results.begin()); } if (optind >= argc) { std::cerr << "[spoa::] error: missing input file!" << std::endl; Help(); return 1; } auto sparser = CreateParser(argv[optind]); if (sparser == nullptr) { return 1; } std::unique_ptr alignment_engine; try { alignment_engine = spoa::AlignmentEngine::Create( static_cast(algorithm), m, n, g, e, q, c); } catch(std::invalid_argument& exception) { std::cerr << exception.what() << std::endl; return 1; } std::vector> sequences; sequences = sparser->Parse(-1); std::size_t max_sequence_len = 0; for (const auto& it : sequences) { max_sequence_len = std::max(max_sequence_len, it->data.size()); } try { alignment_engine->Prealloc(max_sequence_len, 4); } catch (std::invalid_argument& exception) { std::cerr << exception.what() << std::endl; return 1; } spoa::Graph graph{}; std::vector is_reversed; for (const auto& it : sequences) { std::int32_t score = 0; spoa::Alignment alignment; try { alignment = alignment_engine->Align(it->data, graph, &score); } catch (std::invalid_argument& exception) { std::cerr << exception.what() << std::endl; return 1; } if (is_strand_ambiguous) { it->ReverseAndComplement(); std::int32_t score_rev = 0; spoa::Alignment alignment_rev; try { alignment_rev = alignment_engine->Align(it->data, graph, &score_rev); } catch (std::invalid_argument& exception) { std::cerr << exception.what() << std::endl; return 1; } if (score >= score_rev) { it->ReverseAndComplement(); is_reversed.push_back(false); } else { alignment = alignment_rev; is_reversed.push_back(true); } } try { if (it->quality.empty()) { graph.AddAlignment(alignment, it->data); } else { graph.AddAlignment(alignment, it->data, it->quality); } } catch(std::invalid_argument& exception) { std::cerr << exception.what() << std::endl; return 1; } } for 
(const auto& it : results) { switch (it) { case 0: { auto consensus = graph.GenerateConsensus(); std::cout << ">Consensus LN:i:" << consensus.size() << std::endl << consensus << std::endl; break; } case 1: case 2: { auto msa = graph.GenerateMultipleSequenceAlignment(it == 2); for (std::uint32_t i = 0; i < msa.size(); ++i) { std::string name = i < sequences.size() ? sequences[i]->name : "Consensus"; // NOLINT std::cout << ">" << name << std::endl << msa[i] << std::endl; } break; } case 3: case 4: { std::vector headers; for (const auto& it : sequences) { headers.emplace_back(it->name); } graph.GenerateConsensus(); PrintGfa(graph, headers, is_reversed, it == 4); break; } default: break; } } graph.PrintDot(dot_path); return 0; } spoa-4.0.8/src/simd_alignment_engine.hpp000066400000000000000000000052751400776337600203320ustar00rootroot00000000000000// Copyright (c) 2020 Robert Vaser #ifndef SIMD_ALIGNMENT_ENGINE_HPP_ #define SIMD_ALIGNMENT_ENGINE_HPP_ #include #include #include #include #include "spoa/alignment_engine.hpp" #include "spoa/architectures.hpp" namespace spoa { class Graph; std::unique_ptr CreateSimdAlignmentEngine( // for dispatcher AlignmentType type, AlignmentSubtype subtype, std::int8_t m, std::int8_t n, std::int8_t g, std::int8_t e, std::int8_t q, std::int8_t c); template class SimdAlignmentEngine: public AlignmentEngine { public: SimdAlignmentEngine(const SimdAlignmentEngine&) = delete; SimdAlignmentEngine& operator=(const SimdAlignmentEngine&) = delete; SimdAlignmentEngine(SimdAlignmentEngine&&) = default; SimdAlignmentEngine& operator=(SimdAlignmentEngine&&) = delete; ~SimdAlignmentEngine() = default; static std::unique_ptr Create( AlignmentType type, AlignmentSubtype subtype, std::int8_t m, std::int8_t n, std::int8_t g, std::int8_t e, std::int8_t q, std::int8_t c); void Prealloc( std::uint32_t max_sequence_len, std::uint8_t alphabet_size) override; Alignment Align( const char* sequence, std::uint32_t sequence_len, const Graph& graph, std::int32_t* 
score) override; friend std::unique_ptr CreateSimdAlignmentEngine( AlignmentType type, AlignmentSubtype subtype, std::int8_t m, std::int8_t n, std::int8_t g, std::int8_t e, std::int8_t q, std::int8_t c); private: SimdAlignmentEngine( AlignmentType type, AlignmentSubtype subtype, std::int8_t m, std::int8_t n, std::int8_t g, std::int8_t e, std::int8_t q, std::int8_t c); template Alignment Linear( std::uint32_t sequence_len, const Graph& graph, std::int32_t* score) noexcept; template Alignment Affine( std::uint32_t sequence_len, const Graph& graph, std::int32_t* score) noexcept; template Alignment Convex( std::uint32_t sequence_len, const Graph& graph, std::int32_t* score) noexcept; void Realloc( std::uint64_t matrix_width, std::uint64_t matrix_height, std::uint8_t num_codes); template void Initialize( const char* sequence, const Graph& graph, std::uint64_t normal_matrix_width, std::uint64_t matrix_width, std::uint64_t matrix_height) noexcept; struct Implementation; std::unique_ptr pimpl_; }; } // namespace spoa #endif // SIMD_ALIGNMENT_ENGINE_HPP_ spoa-4.0.8/src/simd_alignment_engine_dispatch.cpp000066400000000000000000000005411400776337600221730ustar00rootroot00000000000000// Copyright (c) 2020 Mario Brcic, Robert Vaser #include "simd_alignment_engine_implementation.hpp" #if defined(__AVX2__) #define ARCH Architecture::kAVX2 #elif defined(__SSE4_1__) #define ARCH Architecture::kSSE4_1 #else #define ARCH Architecture::kSSE2 #endif namespace spoa { template class SimdAlignmentEngine; } // namespace spoa spoa-4.0.8/src/simd_alignment_engine_dispatcher.cpp000066400000000000000000000022251400776337600225230ustar00rootroot00000000000000// Copyright (c) 2020 Mario Brcic, Robert Vaser #include "simd_alignment_engine_implementation.hpp" #ifdef SPOA_GENERATE_DISPATCH #include "cpuinfo_x86.h" // NOLINT static const cpu_features::X86Features features = cpu_features::GetX86Info().features; #endif namespace spoa { #ifndef SPOA_GENERATE_DISPATCH template class SimdAlignmentEngine; 
#endif std::unique_ptr CreateSimdAlignmentEngine( AlignmentType type, AlignmentSubtype subtype, std::int8_t m, std::int8_t n, std::int8_t g, std::int8_t e, std::int8_t q, std::int8_t c) { #ifdef SPOA_GENERATE_DISPATCH if (features.avx2) { return SimdAlignmentEngine::Create( type, subtype, m, n, g, e, q, c); } else if (features.sse4_1) { return SimdAlignmentEngine::Create( type, subtype, m, n, g, e, q, c); } else { return SimdAlignmentEngine::Create( type, subtype, m, n, g, e, q, c); } #else return SimdAlignmentEngine::Create( type, subtype, m, n, g, e, q, c); #endif } } // namespace spoa spoa-4.0.8/src/simd_alignment_engine_implementation.hpp000066400000000000000000002125461400776337600234400ustar00rootroot00000000000000// Copyright (c) 2020 Robert Vaser #ifndef SIMD_ALIGNMENT_ENGINE_IMPLEMENTATION_HPP_ #define SIMD_ALIGNMENT_ENGINE_IMPLEMENTATION_HPP_ #include "simd_alignment_engine.hpp" #include #include #include #include #include #include #include extern "C" { #ifdef SPOA_USE_SIMDE #if defined(__AVX2__) #include "simde/x86/avx2.h" #else #include "simde/x86/sse4.1.h" // SSE4.1 is covered better #endif #elif defined(__AVX2__) || defined(__SSE4_1__) #include // AVX2 and lower #endif } #include "spoa/graph.hpp" namespace spoa { // Taken from https://gcc.gnu.org/viewcvs/gcc?view=revision&revision=216149 inline void* align( std::size_t __align, std::size_t __size, void*& __ptr, // NOLINT std::size_t& __space) noexcept { // NOLINT const auto __intptr = reinterpret_cast(__ptr); const auto __aligned = (__intptr - 1u + __align) & -__align; const auto __diff = __aligned - __intptr; if ((__size + __diff) > __space) { return nullptr; } else { __space -= __diff; return __ptr = reinterpret_cast(__aligned); } } template T* AllocateAlignedMemory( T** storage, std::size_t size, std::size_t alignment) { *storage = new T[size + alignment - 1]; void* ptr = static_cast(*storage); std::size_t storage_size = (size + alignment - 1) * sizeof(T); return static_cast(align(alignment, size * 
sizeof(T), ptr, storage_size)); } template struct InstructionSet; #if defined(__AVX2__) constexpr std::uint32_t kRegisterSize = 256; using __mxxxi = __m256i; inline __mxxxi _mmxxx_load_si(__mxxxi const* mem_addr) { return _mm256_load_si256(mem_addr); } inline void _mmxxx_store_si(__mxxxi* mem_addr, const __mxxxi& a) { _mm256_store_si256(mem_addr, a); } inline __mxxxi _mmxxx_or_si(const __mxxxi& a, const __mxxxi& b) { return _mm256_or_si256(a, b); } #define _mmxxx_slli_si(a, n) n < 16 ? \ _mm256_alignr_epi8(a, _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 2, 0)), 16 - n) : \ _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 2, 0)) #define _mmxxx_srli_si(a, n) \ _mm256_srli_si256(_mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), n - 16) // NOLINT template struct InstructionSet { using type = std::int16_t; static constexpr std::uint32_t kNumVar = kRegisterSize / (8 * sizeof(type)); static constexpr std::uint32_t kLogNumVar = 4; static constexpr std::uint32_t kLSS = 2; // Left Shift Size static constexpr std::uint32_t kRSS = 30; // Right Shift Size static inline __mxxxi _mmxxx_add_epi(const __mxxxi& a, const __mxxxi& b) { return _mm256_add_epi16(a, b); } static inline __mxxxi _mmxxx_sub_epi(const __mxxxi& a, const __mxxxi& b) { return _mm256_sub_epi16(a, b); } static inline __mxxxi _mmxxx_min_epi(const __mxxxi& a, const __mxxxi& b) { return _mm256_min_epi16(a, b); } static inline __mxxxi _mmxxx_max_epi(const __mxxxi& a, const __mxxxi& b) { return _mm256_max_epi16(a, b); } static inline __mxxxi _mmxxx_set1_epi(type a) { return _mm256_set1_epi16(a); } static inline void _mmxxx_prefix_max( __mxxxi& a, // NOLINT const __mxxxi* masks, const __mxxxi* penalties) { a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[0], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[0]), 2))); // NOLINT a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[1], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[1]), 4))); // NOLINT a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[2], _mmxxx_slli_si(_mmxxx_add_epi(a, 
penalties[2]), 8))); // NOLINT a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[3], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[3]), 16))); // NOLINT } }; template struct InstructionSet { using type = std::int32_t; static constexpr std::uint32_t kNumVar = kRegisterSize / (8 * sizeof(type)); static constexpr std::uint32_t kLogNumVar = 3; static constexpr std::uint32_t kLSS = 4; static constexpr std::uint32_t kRSS = 28; static inline __mxxxi _mmxxx_add_epi(const __mxxxi& a, const __mxxxi& b) { return _mm256_add_epi32(a, b); } static inline __mxxxi _mmxxx_sub_epi(const __mxxxi& a, const __mxxxi& b) { return _mm256_sub_epi32(a, b); } static inline __mxxxi _mmxxx_min_epi(const __mxxxi& a, const __mxxxi& b) { return _mm256_min_epi32(a, b); } static inline __mxxxi _mmxxx_max_epi(const __mxxxi& a, const __mxxxi& b) { return _mm256_max_epi32(a, b); } static inline __mxxxi _mmxxx_set1_epi(type a) { return _mm256_set1_epi32(a); } static inline void _mmxxx_prefix_max( __mxxxi& a, // NOLINT const __mxxxi* masks, const __mxxxi* penalties) { a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[0], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[0]), 4))); // NOLINT a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[1], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[1]), 8))); // NOLINT a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[2], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[2]), 16))); // NOLINT } }; #elif defined(__SSE4_1__) || defined(SPOA_USE_SIMDE) constexpr std::uint32_t kRegisterSize = 128; using __mxxxi = __m128i; inline __mxxxi _mmxxx_load_si(__mxxxi const* mem_addr) { return _mm_load_si128(mem_addr); } inline void _mmxxx_store_si(__mxxxi* mem_addr, const __mxxxi& a) { _mm_store_si128(mem_addr, a); } inline __mxxxi _mmxxx_or_si(const __mxxxi& a, const __mxxxi& b) { return _mm_or_si128(a, b); } #define _mmxxx_slli_si(a, n) \ _mm_slli_si128(a, n) #define _mmxxx_srli_si(a, n) \ _mm_srli_si128(a, n) template struct InstructionSet { using type = std::int16_t; static constexpr std::uint32_t kNumVar = kRegisterSize / (8 
* sizeof(type)); static constexpr std::uint32_t kLogNumVar = 3; static constexpr std::uint32_t kLSS = 2; static constexpr std::uint32_t kRSS = 14; static inline __mxxxi _mmxxx_add_epi(const __mxxxi& a, const __mxxxi& b) { return _mm_add_epi16(a, b); } static inline __mxxxi _mmxxx_sub_epi(const __mxxxi& a, const __mxxxi& b) { return _mm_sub_epi16(a, b); } static inline __mxxxi _mmxxx_min_epi(const __mxxxi& a, const __mxxxi& b) { return _mm_min_epi16(a, b); } static inline __mxxxi _mmxxx_max_epi(const __mxxxi& a, const __mxxxi& b) { return _mm_max_epi16(a, b); } static inline __mxxxi _mmxxx_set1_epi(type a) { return _mm_set1_epi16(a); } static inline void _mmxxx_prefix_max( __mxxxi& a, // NOLINT const __mxxxi* masks, const __mxxxi* penalties) { a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[0], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[0]), 2))); // NOLINT a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[1], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[1]), 4))); // NOLINT a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[2], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[2]), 8))); // NOLINT } }; template struct InstructionSet { using type = std::int32_t; static constexpr std::uint32_t kNumVar = kRegisterSize / (8 * sizeof(type)); static constexpr std::uint32_t kLogNumVar = 2; static constexpr std::uint32_t kLSS = 4; static constexpr std::uint32_t kRSS = 12; static inline __mxxxi _mmxxx_add_epi(const __mxxxi& a, const __mxxxi& b) { return _mm_add_epi32(a, b); } static inline __mxxxi _mmxxx_sub_epi(const __mxxxi& a, const __mxxxi& b) { return _mm_sub_epi32(a, b); } static inline __mxxxi _mmxxx_min_epi(const __mxxxi& a, const __mxxxi& b) { return _mm_min_epi32(a, b); } static inline __mxxxi _mmxxx_max_epi(const __mxxxi& a, const __mxxxi& b) { return _mm_max_epi32(a, b); } static inline __mxxxi _mmxxx_set1_epi(type a) { return _mm_set1_epi32(a); } static inline void _mmxxx_prefix_max( __mxxxi& a, // NOLINT const __mxxxi* masks, const __mxxxi* penalties) { a = _mmxxx_max_epi(a, 
_mmxxx_or_si(masks[0], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[0]), 4))); // NOLINT a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[1], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[1]), 8))); // NOLINT } }; #endif #if defined(__AVX2__) || defined(__SSE4_1__) || defined(SPOA_USE_SIMDE) template void _mmxxx_print(const __mxxxi& a) { __attribute__((aligned(kRegisterSize / 8))) typename T::type unpacked[T::kNumVar]; // NOLINT _mmxxx_store_si(reinterpret_cast<__mxxxi*>(unpacked), a); for (std::uint32_t i = 0; i < T::kNumVar; i++) { std::cout << unpacked[i] << " "; } } template typename T::type _mmxxx_max_value(const __mxxxi& a) { typename T::type max_score = 0; __attribute__((aligned(kRegisterSize / 8))) typename T::type unpacked[T::kNumVar]; // NOLINT _mmxxx_store_si(reinterpret_cast<__mxxxi*>(unpacked), a); for (std::uint32_t i = 0; i < T::kNumVar; i++) { max_score = std::max(max_score, unpacked[i]); } return max_score; } template typename T::type _mmxxx_value_at(const __mxxxi& a, std::uint32_t i) { __attribute__((aligned(kRegisterSize / 8))) typename T::type unpacked[T::kNumVar]; // NOLINT _mmxxx_store_si(reinterpret_cast<__mxxxi*>(unpacked), a); return unpacked[i]; } template std::int32_t _mmxxx_index_of( const __mxxxi* row, std::uint32_t row_width, typename T::type value) { for (std::uint32_t i = 0; i < row_width; ++i) { __attribute__((aligned(kRegisterSize / 8))) typename T::type unpacked[T::kNumVar]; // NOLINT _mmxxx_store_si(reinterpret_cast<__mxxxi*>(unpacked), row[i]); for (std::uint32_t j = 0; j < T::kNumVar; j++) { if (unpacked[j] == value) { return i * T::kNumVar + j; } } } return -1; } #endif template std::unique_ptr SimdAlignmentEngine::Create( AlignmentType type, AlignmentSubtype subtype, std::int8_t m, std::int8_t n, std::int8_t g, std::int8_t e, std::int8_t q, std::int8_t c) { #if defined(__AVX2__) || defined(__SSE4_1__) || defined(SPOA_USE_SIMDE) return std::unique_ptr( new SimdAlignmentEngine(type, subtype, m, n, g, e, q, c)); #else (void) type; (void) 
subtype; (void) m; (void) n; (void) g; (void) e; (void) q; (void) c; return nullptr; #endif } template struct SimdAlignmentEngine::Implementation { #if defined(__AVX2__) || defined(__SSE4_1__) || defined(SPOA_USE_SIMDE) std::vector node_id_to_rank; std::unique_ptr<__mxxxi[]> sequence_profile_storage; std::uint64_t sequence_profile_size; __mxxxi* sequence_profile; std::vector first_column; std::unique_ptr<__mxxxi[]> M_storage; std::uint64_t M_size; __mxxxi* H; __mxxxi* F; __mxxxi* E; __mxxxi* O; __mxxxi* Q; std::unique_ptr<__mxxxi[]> masks_storage; std::uint32_t masks_size; __mxxxi* masks; std::unique_ptr<__mxxxi[]> penalties_storage; std::uint32_t penalties_size; __mxxxi* penalties; Implementation() : node_id_to_rank(), sequence_profile_storage(nullptr), sequence_profile_size(0), sequence_profile(nullptr), first_column(), M_storage(nullptr), M_size(0), H(nullptr), F(nullptr), E(nullptr), O(nullptr), Q(nullptr), masks_storage(nullptr), masks_size(0), masks(nullptr), penalties_storage(nullptr), penalties_size(0), penalties(nullptr) { } #endif }; template SimdAlignmentEngine::SimdAlignmentEngine( AlignmentType type, AlignmentSubtype subtype, std::int8_t m, std::int8_t n, std::int8_t g, std::int8_t e, std::int8_t q, std::int8_t c) : AlignmentEngine(type, subtype, m, n, g, e, q, c), pimpl_(new Implementation()) { } template void SimdAlignmentEngine::Prealloc( std::uint32_t max_sequence_len, std::uint8_t alphabet_size) { if (max_sequence_len > std::numeric_limits::max()) { throw std::invalid_argument( "[spoa::SimdAlignmentEngine::Prealloc] error: too large sequence!"); } #if defined(__AVX2__) || defined(__SSE4_1__) || defined(SPOA_USE_SIMDE) std::int64_t worst_case_score = WorstCaseAlignmentScore( static_cast(max_sequence_len) + 8, static_cast(max_sequence_len) * alphabet_size); if (worst_case_score < std::numeric_limits::min() + 1024) { return; } else if (worst_case_score < std::numeric_limits::min() + 1024) { // NOLINT try { Realloc( (max_sequence_len / 
InstructionSet::kNumVar) + 1, static_cast(max_sequence_len) * alphabet_size, alphabet_size); } catch (std::bad_alloc& ba) { throw std::invalid_argument( "[spoa::SimdAlignmentEngine::Prealloc] error: insufficient memory!"); } } else { try { Realloc( (max_sequence_len / InstructionSet::kNumVar) + 1, static_cast(max_sequence_len) * alphabet_size, alphabet_size); } catch (std::bad_alloc& ba) { throw std::invalid_argument( "[spoa::SimdAlignmentEngine::Prealloc] error: insufficient memory!"); } } #endif (void) alphabet_size; } template void SimdAlignmentEngine::Realloc( std::uint64_t matrix_width, std::uint64_t matrix_height, std::uint8_t num_codes) { #if defined(__AVX2__) || defined(__SSE4_1__) || defined(SPOA_USE_SIMDE) if (pimpl_->node_id_to_rank.size() < matrix_height - 1) { pimpl_->node_id_to_rank.resize(matrix_height - 1, 0); } if (pimpl_->sequence_profile_size < num_codes * matrix_width) { __mxxxi* storage = nullptr; pimpl_->sequence_profile_size = num_codes * matrix_width; pimpl_->sequence_profile = AllocateAlignedMemory( &storage, pimpl_->sequence_profile_size, kRegisterSize / 8); pimpl_->sequence_profile_storage.reset(); pimpl_->sequence_profile_storage = std::unique_ptr<__mxxxi[]>(storage); } if (subtype_ == AlignmentSubtype::kLinear) { if (pimpl_->first_column.size() < matrix_height) { pimpl_->first_column.resize(matrix_height, 0); } if (pimpl_->M_size < matrix_height * matrix_width) { __mxxxi* storage = nullptr; pimpl_->M_size = matrix_height * matrix_width; pimpl_->H = AllocateAlignedMemory( &storage, pimpl_->M_size, kRegisterSize / 8); pimpl_->M_storage.reset(); pimpl_->M_storage = std::unique_ptr<__mxxxi[]>(storage); } } else if (subtype_ == AlignmentSubtype::kAffine) { if (pimpl_->first_column.size() < 2 * matrix_height) { pimpl_->first_column.resize(2 * matrix_height, 0); } if (pimpl_->M_size < 3 * matrix_height * matrix_width) { __mxxxi* storage = nullptr; pimpl_->M_size = 3 * matrix_height * matrix_width; pimpl_->H = AllocateAlignedMemory( &storage, 
pimpl_->M_size, kRegisterSize / 8); pimpl_->F = pimpl_->H + matrix_height * matrix_width; pimpl_->E = pimpl_->F + matrix_height * matrix_width; pimpl_->M_storage.reset(); pimpl_->M_storage = std::unique_ptr<__mxxxi[]>(storage); } } else if (subtype_ == AlignmentSubtype::kConvex) { if (pimpl_->first_column.size() < 3 * matrix_height) { pimpl_->first_column.resize(3 * matrix_height, 0); } if (pimpl_->M_size < 5 * matrix_height * matrix_width) { __mxxxi* storage = nullptr; pimpl_->M_size = 5 * matrix_height * matrix_width; pimpl_->H = AllocateAlignedMemory( &storage, pimpl_->M_size, kRegisterSize / 8); pimpl_->F = pimpl_->H + matrix_height * matrix_width; pimpl_->E = pimpl_->F + matrix_height * matrix_width; pimpl_->O = pimpl_->E + matrix_height * matrix_width; pimpl_->Q = pimpl_->O + matrix_height * matrix_width; pimpl_->M_storage.reset(); pimpl_->M_storage = std::unique_ptr<__mxxxi[]>(storage); } } if (pimpl_->masks_size < InstructionSet::kLogNumVar + 1) { __mxxxi* storage = nullptr; pimpl_->masks_size = InstructionSet::kLogNumVar + 1; pimpl_->masks = AllocateAlignedMemory( &storage, pimpl_->masks_size, kRegisterSize / 8); pimpl_->masks_storage.reset(); pimpl_->masks_storage = std::unique_ptr<__mxxxi[]>(storage); } if (pimpl_->penalties_size < 2 * InstructionSet::kLogNumVar) { // NOLINT __mxxxi* storage = nullptr; pimpl_->penalties_size = 2 * InstructionSet::kLogNumVar; pimpl_->penalties = AllocateAlignedMemory( &storage, pimpl_->penalties_size, kRegisterSize / 8); pimpl_->penalties_storage.reset(); pimpl_->penalties_storage = std::unique_ptr<__mxxxi[]>(storage); } #endif (void) matrix_width; (void) matrix_height; (void) num_codes; } template template void SimdAlignmentEngine::Initialize( const char* sequence, const Graph& graph, std::uint64_t normal_matrix_width, std::uint64_t matrix_width, std::uint64_t matrix_height) noexcept { #if defined(__AVX2__) || defined(__SSE4_1__) || defined(SPOA_USE_SIMDE) std::int32_t padding_penatly = -1 * std::max( std::max(abs(m_), 
abs(n_)), std::max(abs(g_), abs(q_))); __attribute__((aligned(kRegisterSize / 8))) typename T::type unpacked[T::kNumVar] = {}; // NOLINT for (std::uint32_t i = 0; i < graph.num_codes(); ++i) { char c = graph.decoder(i); for (std::uint32_t j = 0; j < matrix_width; ++j) { for (std::uint32_t k = 0; k < T::kNumVar; ++k) { unpacked[k] = (j * T::kNumVar + k) < normal_matrix_width ? (c == sequence[j * T::kNumVar + k] ? m_ : n_) : padding_penatly; } pimpl_->sequence_profile[i * matrix_width + j] = _mmxxx_load_si(reinterpret_cast(unpacked)); } } const auto& rank_to_node = graph.rank_to_node(); for (std::uint32_t i = 0; i < rank_to_node.size(); ++i) { pimpl_->node_id_to_rank[rank_to_node[i]->id] = i; } typename T::type kNegativeInfinity = std::numeric_limits::min() + 1024; __mxxxi negative_infinities = T::_mmxxx_set1_epi(kNegativeInfinity); __mxxxi zeroes = T::_mmxxx_set1_epi(0); // initialize secondary matrices switch (subtype_) { case AlignmentSubtype::kConvex: for (std::uint32_t j = 0; j < matrix_width; ++j) { pimpl_->O[j] = negative_infinities; pimpl_->Q[j] = T::_mmxxx_set1_epi(q_ + j * T::kNumVar * c_); __mxxxi c = T::_mmxxx_set1_epi(c_); for (std::uint32_t k = 1; k < T::kNumVar; ++k) { c = _mmxxx_slli_si(c, T::kLSS); pimpl_->Q[j] = T::_mmxxx_add_epi(pimpl_->Q[j], c); } } pimpl_->first_column[2 * matrix_height] = 0; for (std::uint32_t i = 1; i < matrix_height; ++i) { const auto& edges = rank_to_node[i - 1]->inedges; std::int32_t penalty = edges.empty() ? 
q_ - c_ : kNegativeInfinity; for (const auto& it : edges) { std::uint32_t pred_i = pimpl_->node_id_to_rank[it->tail->id] + 1; penalty = std::max(penalty, pimpl_->first_column[2 * matrix_height + pred_i]); // NOLINT } pimpl_->first_column[2 * matrix_height + i] = penalty + c_; } // fall through case AlignmentSubtype::kAffine: for (std::uint32_t j = 0; j < matrix_width; ++j) { pimpl_->F[j] = negative_infinities; pimpl_->E[j] = T::_mmxxx_set1_epi(g_ + j * T::kNumVar * e_); __mxxxi e = T::_mmxxx_set1_epi(e_); for (std::uint32_t k = 1; k < T::kNumVar; ++k) { e = _mmxxx_slli_si(e, T::kLSS); pimpl_->E[j] = T::_mmxxx_add_epi(pimpl_->E[j], e); } } pimpl_->first_column[matrix_height] = 0; for (std::uint32_t i = 1; i < matrix_height; ++i) { const auto& edges = rank_to_node[i - 1]->inedges; std::int32_t penalty = edges.empty() ? g_ - e_ : kNegativeInfinity; for (const auto& it : edges) { std::uint32_t pred_i = pimpl_->node_id_to_rank[it->tail->id] + 1; penalty = std::max(penalty, pimpl_->first_column[matrix_height + pred_i]); // NOLINT } pimpl_->first_column[matrix_height + i] = penalty + e_; } // fall through case AlignmentSubtype::kLinear: break; default: break; } // initialize primary matrix switch (type_) { case AlignmentType::kSW: for (std::uint32_t j = 0; j < matrix_width; ++j) { pimpl_->H[j] = zeroes; } for (std::uint32_t i = 0; i < matrix_height; ++i) { pimpl_->first_column[i] = 0; } break; case AlignmentType::kNW: switch (subtype_) { case AlignmentSubtype::kConvex: for (std::uint32_t i = 0; i < matrix_height; ++i) { pimpl_->first_column[i] = std::max( pimpl_->first_column[matrix_height + i], pimpl_->first_column[2 * matrix_height + i]); } for (std::uint32_t j = 0; j < matrix_width; ++j) { pimpl_->H[j] = T::_mmxxx_max_epi(pimpl_->E[j], pimpl_->Q[j]); } break; case AlignmentSubtype::kAffine: for (std::uint32_t i = 0; i < matrix_height; ++i) { pimpl_->first_column[i] = pimpl_->first_column[matrix_height + i]; } for (std::uint32_t j = 0; j < matrix_width; ++j) { 
pimpl_->H[j] = pimpl_->E[j]; } break; case AlignmentSubtype::kLinear: pimpl_->first_column[0] = 0; for (std::uint32_t i = 1; i < matrix_height; ++i) { const auto& edges = rank_to_node[i - 1]->inedges; std::int32_t penalty = edges.empty() ? 0 : kNegativeInfinity; for (const auto& it : edges) { std::uint32_t pred_i = pimpl_->node_id_to_rank[it->tail->id] + 1; penalty = std::max(penalty, pimpl_->first_column[pred_i]); } pimpl_->first_column[i] = penalty + g_; } for (std::uint32_t j = 0; j < matrix_width; ++j) { pimpl_->H[j] = T::_mmxxx_set1_epi(g_ + j * T::kNumVar * g_); __mxxxi g = T::_mmxxx_set1_epi(g_); for (std::uint32_t k = 1; k < T::kNumVar; ++k) { g = _mmxxx_slli_si(g, T::kLSS); pimpl_->H[j] = T::_mmxxx_add_epi(pimpl_->H[j], g); } } default: break; } break; case AlignmentType::kOV: switch (subtype_) { case AlignmentSubtype::kConvex: for (std::uint32_t j = 0; j < matrix_width; ++j) { pimpl_->H[j] = T::_mmxxx_max_epi(pimpl_->E[j], pimpl_->Q[j]); } break; case AlignmentSubtype::kAffine: for (std::uint32_t j = 0; j < matrix_width; ++j) { pimpl_->H[j] = pimpl_->E[j]; } break; case AlignmentSubtype::kLinear: for (std::uint32_t j = 0; j < matrix_width; ++j) { pimpl_->H[j] = T::_mmxxx_set1_epi(g_ + j * T::kNumVar * g_); __mxxxi g = T::_mmxxx_set1_epi(g_); for (std::uint32_t k = 1; k < T::kNumVar; ++k) { g = _mmxxx_slli_si(g, T::kLSS); pimpl_->H[j] = T::_mmxxx_add_epi(pimpl_->H[j], g); } } break; default: break; } for (std::uint32_t i = 0; i < matrix_height; ++i) { pimpl_->first_column[i] = 0; } break; default: break; } #endif (void) sequence; (void) graph; (void) normal_matrix_width; (void) matrix_width; (void) matrix_height; } template Alignment SimdAlignmentEngine::Align( const char* sequence, std::uint32_t sequence_len, const Graph& graph, std::int32_t* score) { if (sequence_len > std::numeric_limits::max()) { throw std::invalid_argument( "[spoa::SimdAlignmentEngine::Align] error: too large sequence!"); } if (graph.nodes().empty() || sequence_len == 0) { return 
Alignment(); } #if defined(__AVX2__) || defined(__SSE4_1__) || defined(SPOA_USE_SIMDE) std::int64_t worst_case_score = WorstCaseAlignmentScore( sequence_len + 8, graph.nodes().size()); if (worst_case_score < std::numeric_limits::min() + 1024) { throw std::invalid_argument( "[spoa::SimdAlignmentEngine::Align] error: possible overflow!"); } else if (worst_case_score < std::numeric_limits::min() + 1024) { // NOLINT try { Realloc( std::ceil(static_cast(sequence_len) / InstructionSet::kNumVar), // NOLINT graph.nodes().size() + 1, graph.num_codes()); } catch (std::bad_alloc& ba) { throw std::invalid_argument( "[spoa::SimdAlignmentEngine::Align] error: insufficient memory!"); } Initialize>( sequence, graph, sequence_len, std::ceil(static_cast(sequence_len) / InstructionSet::kNumVar), // NOLINT graph.nodes().size() + 1); if (subtype_ == AlignmentSubtype::kLinear) { return Linear>(sequence_len, graph, score); // NOLINT } else if (subtype_ == AlignmentSubtype::kAffine) { return Affine>(sequence_len, graph, score); // NOLINT } else if (subtype_ == AlignmentSubtype::kConvex) { return Convex>(sequence_len, graph, score); // NOLINT } } else { try { Realloc( std::ceil(static_cast(sequence_len) / InstructionSet::kNumVar), // NOLINT graph.nodes().size() + 1, graph.num_codes()); } catch (std::bad_alloc& ba) { throw std::invalid_argument( "[spoa::SimdAlignmentEngine::Align] error: insufficient memory!"); } Initialize>( sequence, graph, sequence_len, std::ceil(static_cast(sequence_len) / InstructionSet::kNumVar), // NOLINT graph.nodes().size() + 1); if (subtype_ == AlignmentSubtype::kLinear) { return Linear>(sequence_len, graph, score); // NOLINT } else if (subtype_ == AlignmentSubtype::kAffine) { return Affine>(sequence_len, graph, score); // NOLINT } else if (subtype_ == AlignmentSubtype::kConvex) { return Convex>(sequence_len, graph, score); // NOLINT } } #endif (void) sequence; (void) score; return Alignment(); } template template Alignment SimdAlignmentEngine::Linear( 
std::uint32_t sequence_len, const Graph& graph, std::int32_t* score) noexcept { #if defined(__AVX2__) || defined(__SSE4_1__) || defined(SPOA_USE_SIMDE) std::uint64_t normal_matrix_width = sequence_len; std::uint64_t matrix_width = std::ceil(static_cast(sequence_len) / T::kNumVar); const auto& rank_to_node = graph.rank_to_node(); typename T::type kNegativeInfinity = std::numeric_limits::min() + 1024; __attribute__((aligned(kRegisterSize / 8))) typename T::type unpacked[T::kNumVar] = {0}; // NOLINT for (std::uint32_t i = 0, j = 0; i < T::kNumVar && j < T::kLogNumVar; ++i) { unpacked[i] = kNegativeInfinity; if ((i & (i + 1)) == 0) { pimpl_->masks[j++] = _mmxxx_load_si( reinterpret_cast(unpacked)); } } pimpl_->masks[T::kLogNumVar] = _mmxxx_slli_si( T::_mmxxx_set1_epi(kNegativeInfinity), T::kLSS); pimpl_->penalties[0] = T::_mmxxx_set1_epi(g_); for (std::uint32_t i = 1; i < T::kLogNumVar; ++i) { pimpl_->penalties[i] = T::_mmxxx_add_epi( pimpl_->penalties[i - 1], pimpl_->penalties[i - 1]); } typename T::type max_score = type_ == AlignmentType::kSW ? 0 : kNegativeInfinity; // NOLINT std::int32_t max_i = -1; std::int32_t max_j = -1; std::uint32_t last_column_id = (normal_matrix_width - 1) % T::kNumVar; __mxxxi zeroes = T::_mmxxx_set1_epi(0); __mxxxi g = T::_mmxxx_set1_epi(g_); // alignment for (const auto& it : rank_to_node) { __mxxxi* char_profile = &(pimpl_->sequence_profile[it->code * matrix_width]); // NOLINT std::uint32_t i = pimpl_->node_id_to_rank[it->id] + 1; std::uint32_t pred_i = it->inedges.empty() ? 
0 : pimpl_->node_id_to_rank[it->inedges[0]->tail->id] + 1; __mxxxi* H_row = &(pimpl_->H[i * matrix_width]); __mxxxi* H_pred_row = &(pimpl_->H[pred_i * matrix_width]); __mxxxi x = _mmxxx_srli_si( T::_mmxxx_set1_epi(pimpl_->first_column[pred_i]), T::kRSS); for (std::uint64_t j = 0; j < matrix_width; ++j) { // get diagonal __mxxxi t1 = _mmxxx_srli_si(H_pred_row[j], T::kRSS); H_row[j] = _mmxxx_or_si( _mmxxx_slli_si(H_pred_row[j], T::kLSS), x); x = t1; // update M H_row[j] = T::_mmxxx_max_epi( T::_mmxxx_add_epi(H_row[j], char_profile[j]), T::_mmxxx_add_epi(H_pred_row[j], g)); } // check other predecessors for (std::uint32_t p = 1; p < it->inedges.size(); ++p) { pred_i = pimpl_->node_id_to_rank[it->inedges[p]->tail->id] + 1; H_pred_row = &(pimpl_->H[pred_i * matrix_width]); x = _mmxxx_srli_si( T::_mmxxx_set1_epi(pimpl_->first_column[pred_i]), T::kRSS); for (std::uint64_t j = 0; j < matrix_width; ++j) { // get diagonal __mxxxi t1 = _mmxxx_srli_si(H_pred_row[j], T::kRSS); __mxxxi m = _mmxxx_or_si( _mmxxx_slli_si(H_pred_row[j], T::kLSS), x); x = t1; // updage M H_row[j] = T::_mmxxx_max_epi( H_row[j], T::_mmxxx_max_epi( T::_mmxxx_add_epi(m, char_profile[j]), T::_mmxxx_add_epi(H_pred_row[j], g))); } } __mxxxi score = T::_mmxxx_set1_epi(kNegativeInfinity); x = _mmxxx_srli_si( T::_mmxxx_add_epi( T::_mmxxx_set1_epi(pimpl_->first_column[i]), g), T::kRSS); for (std::uint64_t j = 0; j < matrix_width; ++j) { // add last element of previous vector into this one H_row[j] = T::_mmxxx_max_epi( H_row[j], _mmxxx_or_si(x, pimpl_->masks[T::kLogNumVar])); T::_mmxxx_prefix_max(H_row[j], pimpl_->masks, pimpl_->penalties); x = _mmxxx_srli_si( T::_mmxxx_add_epi(H_row[j], g), T::kRSS); if (type_ == AlignmentType::kSW) { H_row[j] = T::_mmxxx_max_epi(H_row[j], zeroes); } score = T::_mmxxx_max_epi(score, H_row[j]); } if (type_ == AlignmentType::kSW) { std::int32_t max_row_score = _mmxxx_max_value(score); if (max_score < max_row_score) { max_score = max_row_score; max_i = i; } } else if (type_ == 
AlignmentType::kOV) { if (it->outedges.empty()) { std::int32_t max_row_score = _mmxxx_max_value(score); if (max_score < max_row_score) { max_score = max_row_score; max_i = i; } } } else if (type_ == AlignmentType::kNW) { if (it->outedges.empty()) { std::int32_t max_row_score = _mmxxx_value_at( H_row[matrix_width - 1], last_column_id); if (max_score < max_row_score) { max_score = max_row_score; max_i = i; } } } } if (max_i == -1 && max_j == -1) { return Alignment(); } if (score) { *score = max_score; } if (type_ == AlignmentType::kSW) { max_j = _mmxxx_index_of( &(pimpl_->H[max_i * matrix_width]), matrix_width, max_score); } else if (type_ == AlignmentType::kOV) { if (rank_to_node[max_i - 1]->outedges.empty()) { max_j = _mmxxx_index_of( &(pimpl_->H[max_i * matrix_width]), matrix_width, max_score); } else { max_j = normal_matrix_width - 1; } } else if (type_ == AlignmentType::kNW) { max_j = normal_matrix_width - 1; } // backtrack std::uint32_t max_num_predecessors = 1; for (std::uint32_t i = 0; i < static_cast(max_i); ++i) { max_num_predecessors = std::max( max_num_predecessors, static_cast(rank_to_node[i]->inedges.size())); } typename T::type* backtrack_storage = nullptr; typename T::type* H = AllocateAlignedMemory( &backtrack_storage, 3 * T::kNumVar + 2 * T::kNumVar * max_num_predecessors, kRegisterSize / 8); typename T::type* H_pred = H + T::kNumVar; typename T::type* H_diag_pred = H_pred + T::kNumVar * max_num_predecessors; typename T::type* H_left_pred = H_diag_pred + T::kNumVar * max_num_predecessors; // NOLINT typename T::type* profile = H_left_pred + T::kNumVar; std::vector predecessors; std::int32_t i = max_i; std::int32_t j = max_j; std::int32_t prev_i = 0, prev_j = 0; std::uint32_t j_div = j / T::kNumVar; std::uint32_t j_mod = j % T::kNumVar; bool load_next_segment = true; Alignment alignment; do { // check stop condition if (j == -1 || i == 0) { break; } const auto& it = rank_to_node[i - 1]; // load everything if (load_next_segment) { predecessors.clear(); 
// load current cells _mmxxx_store_si( reinterpret_cast<__mxxxi*>(H), pimpl_->H[i * matrix_width + j_div]); // load predecessors cells if (it->inedges.empty()) { predecessors.emplace_back(0); _mmxxx_store_si(reinterpret_cast<__mxxxi*>(H_pred), pimpl_->H[j_div]); } else { std::uint32_t store_pos = 0; for (const auto& jt : it->inedges) { predecessors.emplace_back(pimpl_->node_id_to_rank[jt->tail->id] + 1); _mmxxx_store_si( reinterpret_cast<__mxxxi*>(&H_pred[store_pos * T::kNumVar]), pimpl_->H[predecessors.back() * matrix_width + j_div]); ++store_pos; } } // load query profile cells _mmxxx_store_si( reinterpret_cast<__mxxxi*>(profile), pimpl_->sequence_profile[it->code * matrix_width + j_div]); } // check stop condition if (type_ == AlignmentType::kSW && H[j_mod] == 0) { break; } if (j_mod == 0) { // border case if (j_div > 0) { _mmxxx_store_si( reinterpret_cast<__mxxxi*>(H_left_pred), pimpl_->H[i * matrix_width + j_div - 1]); for (std::uint32_t p = 0; p < predecessors.size(); ++p) { _mmxxx_store_si( reinterpret_cast<__mxxxi*>(&H_diag_pred[p * T::kNumVar]), pimpl_->H[predecessors[p] * matrix_width + (j_div - 1)]); } } else { H_left_pred[T::kNumVar - 1] = pimpl_->first_column[i]; for (std::uint32_t p = 0; p < predecessors.size(); ++p) { H_diag_pred[(p + 1) * T::kNumVar - 1] = pimpl_->first_column[predecessors[p]]; } } } // find best predecessor cell bool predecessor_found = false; if (i != 0) { for (std::uint32_t p = 0; p < predecessors.size(); ++p) { if ((j_mod == 0 && H[j_mod] == H_diag_pred[(p + 1) * T::kNumVar - 1] + profile[j_mod]) || // NOLINT (j_mod != 0 && H[j_mod] == H_pred[p * T::kNumVar + j_mod - 1] + profile[j_mod])) { // NOLINT prev_i = predecessors[p]; prev_j = j - 1; predecessor_found = true; break; } } } if (!predecessor_found && i != 0) { for (std::uint32_t p = 0; p < predecessors.size(); ++p) { if (H[j_mod] == H_pred[p * T::kNumVar + j_mod] + g_) { prev_i = predecessors[p]; prev_j = j; predecessor_found = true; break; } } } if (!predecessor_found) { 
if ((j_mod == 0 && H[j_mod] == H_left_pred[T::kNumVar - 1] + g_) || (j_mod != 0 && H[j_mod] == H[j_mod - 1] + g_)) { prev_i = i; prev_j = j - 1; predecessor_found = true; } } alignment.emplace_back( i == prev_i ? -1 : rank_to_node[i - 1]->id, j == prev_j ? -1 : j); // update for next round load_next_segment = (i == prev_i ? false : true) || (j != prev_j && prev_j % T::kNumVar == T::kNumVar - 1 ? true : false); i = prev_i; j = prev_j; j_div = j / T::kNumVar; j_mod = j % T::kNumVar; } while (true); delete[] backtrack_storage; // update alignment for NW (backtrack stops on first row or column) if (type_ == AlignmentType::kNW) { while (i == 0 && j != -1) { alignment.emplace_back(-1, j); --j; } while (i != 0 && j == -1) { alignment.emplace_back(rank_to_node[i - 1]->id, -1); const auto& it = rank_to_node[i - 1]; if (it->inedges.empty()) { i = 0; } else { for (const auto& jt : it->inedges) { std::uint32_t pred_i = pimpl_->node_id_to_rank[jt->tail->id] + 1; if (pimpl_->first_column[i] == pimpl_->first_column[pred_i] + g_) { i = pred_i; break; } } } } } std::reverse(alignment.begin(), alignment.end()); return alignment; #else (void) sequence_len; (void) graph; (void) score; return Alignment(); #endif } template template Alignment SimdAlignmentEngine::Affine( std::uint32_t sequence_len, const Graph& graph, std::int32_t* score) noexcept { #if defined(__AVX2__) || defined(__SSE4_1__) || defined(SPOA_USE_SIMDE) std::uint64_t normal_matrix_width = sequence_len; std::uint64_t matrix_width = std::ceil(static_cast(sequence_len) / T::kNumVar); const auto& rank_to_node = graph.rank_to_node(); typename T::type kNegativeInfinity = std::numeric_limits::min() + 1024; typename T::type max_score = type_ == AlignmentType::kSW ? 
0 : kNegativeInfinity; // NOLINT std::int32_t max_i = -1; std::int32_t max_j = -1; std::uint32_t last_column_id = (normal_matrix_width - 1) % T::kNumVar; __mxxxi zeroes = T::_mmxxx_set1_epi(0); __mxxxi g = T::_mmxxx_set1_epi(g_ - e_); __mxxxi e = T::_mmxxx_set1_epi(e_); __attribute__((aligned(kRegisterSize / 8))) typename T::type unpacked[T::kNumVar] = {0}; // NOLINT for (std::uint32_t i = 0, j = 0; i < T::kNumVar && j < T::kLogNumVar; ++i) { unpacked[i] = kNegativeInfinity; if ((i & (i + 1)) == 0) { pimpl_->masks[j++] = _mmxxx_load_si( reinterpret_cast(unpacked)); } } pimpl_->masks[T::kLogNumVar] = _mmxxx_slli_si( T::_mmxxx_set1_epi(kNegativeInfinity), T::kLSS); pimpl_->penalties[0] = T::_mmxxx_set1_epi(e_); for (std::uint32_t i = 1; i < T::kLogNumVar; ++i) { pimpl_->penalties[i] = T::_mmxxx_add_epi( pimpl_->penalties[i - 1], pimpl_->penalties[i - 1]); } // alignment for (const auto& it : rank_to_node) { __mxxxi* char_profile = &(pimpl_->sequence_profile[it->code * matrix_width]); // NOLINT std::uint32_t i = pimpl_->node_id_to_rank[it->id] + 1; __mxxxi* H_row = &(pimpl_->H[i * matrix_width]); __mxxxi* F_row = &(pimpl_->F[i * matrix_width]); std::uint32_t pred_i = it->inedges.empty() ? 
0 : pimpl_->node_id_to_rank[it->inedges[0]->tail->id] + 1; __mxxxi* H_pred_row = &(pimpl_->H[pred_i * matrix_width]); __mxxxi* F_pred_row = &(pimpl_->F[pred_i * matrix_width]); __mxxxi x = _mmxxx_srli_si( T::_mmxxx_set1_epi(pimpl_->first_column[pred_i]), T::kRSS); for (std::uint64_t j = 0; j < matrix_width; ++j) { // update F F_row[j] = T::_mmxxx_add_epi( T::_mmxxx_max_epi( T::_mmxxx_add_epi(H_pred_row[j], g), F_pred_row[j]), e); // update H H_row[j] = T::_mmxxx_add_epi( _mmxxx_or_si( _mmxxx_slli_si(H_pred_row[j], T::kLSS), x), char_profile[j]); x = _mmxxx_srli_si(H_pred_row[j], T::kRSS); } // check other predecessors for (std::uint32_t p = 1; p < it->inedges.size(); ++p) { pred_i = pimpl_->node_id_to_rank[it->inedges[p]->tail->id] + 1; H_pred_row = &(pimpl_->H[pred_i * matrix_width]); F_pred_row = &(pimpl_->F[pred_i * matrix_width]); x = _mmxxx_srli_si( T::_mmxxx_set1_epi(pimpl_->first_column[pred_i]), T::kRSS); for (std::uint64_t j = 0; j < matrix_width; ++j) { // update F F_row[j] = T::_mmxxx_max_epi( F_row[j], T::_mmxxx_add_epi( T::_mmxxx_max_epi( T::_mmxxx_add_epi(H_pred_row[j], g), F_pred_row[j]), e)); // update H H_row[j] = T::_mmxxx_max_epi( H_row[j], T::_mmxxx_add_epi( _mmxxx_or_si( _mmxxx_slli_si(H_pred_row[j], T::kLSS), x), char_profile[j])); x = _mmxxx_srli_si(H_pred_row[j], T::kRSS); } } __mxxxi* E_row = &(pimpl_->E[i * matrix_width]); __mxxxi score = zeroes; x = T::_mmxxx_set1_epi(pimpl_->first_column[i]); for (std::uint64_t j = 0; j < matrix_width; ++j) { H_row[j] = T::_mmxxx_max_epi(H_row[j], F_row[j]); E_row[j] = T::_mmxxx_add_epi( T::_mmxxx_add_epi( _mmxxx_or_si( _mmxxx_slli_si(H_row[j], T::kLSS), _mmxxx_srli_si(x, T::kRSS)), g), e); T::_mmxxx_prefix_max(E_row[j], pimpl_->masks, pimpl_->penalties); H_row[j] = T::_mmxxx_max_epi(H_row[j], E_row[j]); x = T::_mmxxx_max_epi( H_row[j], T::_mmxxx_sub_epi(E_row[j], g)); if (type_ == AlignmentType::kSW) { H_row[j] = T::_mmxxx_max_epi(H_row[j], zeroes); } score = T::_mmxxx_max_epi(score, H_row[j]); } if 
(type_ == AlignmentType::kSW) { std::int32_t max_row_score = _mmxxx_max_value(score); if (max_score < max_row_score) { max_score = max_row_score; max_i = i; } } else if (type_ == AlignmentType::kOV) { if (it->outedges.empty()) { std::int32_t max_row_score = _mmxxx_max_value(score); if (max_score < max_row_score) { max_score = max_row_score; max_i = i; } } } else if (type_ == AlignmentType::kNW) { if (it->outedges.empty()) { std::int32_t max_row_score = _mmxxx_value_at( H_row[matrix_width - 1], last_column_id); if (max_score < max_row_score) { max_score = max_row_score; max_i = i; } } } } if (max_i == -1 && max_j == -1) { return Alignment(); } if (score) { *score = max_score; } if (type_ == AlignmentType::kSW) { max_j = _mmxxx_index_of( &(pimpl_->H[max_i * matrix_width]), matrix_width, max_score); } else if (type_ == AlignmentType::kOV) { if (rank_to_node[max_i - 1]->outedges.empty()) { max_j = _mmxxx_index_of( &(pimpl_->H[max_i * matrix_width]), matrix_width, max_score); } else { max_j = normal_matrix_width - 1; } } else if (type_ == AlignmentType::kNW) { max_j = normal_matrix_width - 1; } // backtrack std::uint32_t max_num_predecessors = 1; for (std::uint32_t i = 0; i < static_cast(max_i); ++i) { max_num_predecessors = std::max( max_num_predecessors, static_cast(rank_to_node[i]->inedges.size())); } typename T::type* backtrack_storage = nullptr; typename T::type* H = AllocateAlignedMemory( &backtrack_storage, 6 * T::kNumVar + 3 * T::kNumVar * max_num_predecessors, kRegisterSize / 8); typename T::type* H_pred = H + T::kNumVar; typename T::type* H_diag_pred = H_pred + T::kNumVar * max_num_predecessors; typename T::type* H_left = H_diag_pred + T::kNumVar * max_num_predecessors; typename T::type* F = H_left + T::kNumVar; typename T::type* F_pred = F + T::kNumVar; typename T::type* E = F_pred + T::kNumVar * max_num_predecessors; typename T::type* E_left = E + T::kNumVar; typename T::type* profile = E_left + T::kNumVar; std::vector predecessors; std::int32_t i = max_i; 
std::int32_t j = max_j; std::int32_t prev_i = 0, prev_j = 0; std::uint32_t j_div = j / T::kNumVar; std::uint32_t j_mod = j % T::kNumVar; bool load_next_segment = true; Alignment alignment; do { // check stop condition if (j == -1 || i == 0) { break; } const auto& it = rank_to_node[i - 1]; // load everything if (load_next_segment) { predecessors.clear(); // load current cells _mmxxx_store_si( reinterpret_cast<__mxxxi*>(H), pimpl_->H[i * matrix_width + j_div]); _mmxxx_store_si( reinterpret_cast<__mxxxi*>(E), pimpl_->E[i * matrix_width + j_div]); // load predecessors cells if (it->inedges.empty()) { predecessors.emplace_back(0); _mmxxx_store_si(reinterpret_cast<__mxxxi*>(H_pred), pimpl_->H[j_div]); _mmxxx_store_si(reinterpret_cast<__mxxxi*>(F_pred), pimpl_->F[j_div]); } else { std::uint32_t store_pos = 0; for (const auto& jt : it->inedges) { predecessors.emplace_back(pimpl_->node_id_to_rank[jt->tail->id] + 1); _mmxxx_store_si( reinterpret_cast<__mxxxi*>(&H_pred[store_pos * T::kNumVar]), pimpl_->H[predecessors.back() * matrix_width + j_div]); _mmxxx_store_si( reinterpret_cast<__mxxxi*>(&F_pred[store_pos * T::kNumVar]), pimpl_->F[predecessors.back() * matrix_width + j_div]); ++store_pos; } } // load query profile cells _mmxxx_store_si( reinterpret_cast<__mxxxi*>(profile), pimpl_->sequence_profile[it->code * matrix_width + j_div]); } // check stop condition if (type_ == AlignmentType::kSW && H[j_mod] == 0) { break; } if (j_mod == 0) { // border case if (j_div > 0) { for (std::uint32_t p = 0; p < predecessors.size(); ++p) { _mmxxx_store_si( reinterpret_cast<__mxxxi*>(&H_diag_pred[p * T::kNumVar]), pimpl_->H[predecessors[p] * matrix_width + (j_div - 1)]); } _mmxxx_store_si( reinterpret_cast<__mxxxi*>(H_left), pimpl_->H[i * matrix_width + j_div - 1]); _mmxxx_store_si( reinterpret_cast<__mxxxi*>(E_left), pimpl_->E[i * matrix_width + j_div - 1]); } else { for (std::uint32_t p = 0; p < predecessors.size(); ++p) { H_diag_pred[(p + 1) * T::kNumVar - 1] = 
pimpl_->first_column[predecessors[p]]; } H_left[T::kNumVar - 1] = pimpl_->first_column[i]; E_left[T::kNumVar - 1] = pimpl_->first_column[i]; } } // find best predecessor cell bool predecessor_found = false, extend_left = false, extend_up = false; if (i != 0) { for (std::uint32_t p = 0; p < predecessors.size(); ++p) { if ((j_mod == 0 && H[j_mod] == H_diag_pred[(p + 1) * T::kNumVar - 1] + profile[j_mod]) || // NOLINT (j_mod != 0 && H[j_mod] == H_pred[p * T::kNumVar + j_mod - 1] + profile[j_mod])) { // NOLINT prev_i = predecessors[p]; prev_j = j - 1; predecessor_found = true; break; } } } if (!predecessor_found && i != 0) { for (std::uint32_t p = 0; p < predecessors.size(); ++p) { if ((extend_up = H[j_mod] == F_pred[p * T::kNumVar + j_mod] + e_) || H[j_mod] == H_pred[p * T::kNumVar + j_mod] + g_) { prev_i = predecessors[p]; prev_j = j; predecessor_found = true; break; } } } if (!predecessor_found) { if ((j_mod != 0 && ((extend_left = H[j_mod] == E[j_mod - 1] + e_) || H[j_mod] == H[j_mod - 1] + g_)) || (j_mod == 0 && ((extend_left = H[j_mod] == E_left[T::kNumVar - 1] + e_ ) || // NOLINT H[j_mod] == H_left[T::kNumVar - 1] + g_))) { // NOLINT prev_i = i; prev_j = j - 1; predecessor_found = true; } } alignment.emplace_back( i == prev_i ? -1 : rank_to_node[i - 1]->id, j == prev_j ? -1 : j); // update for next round load_next_segment = (i == prev_i ? false : true) || (j != prev_j && prev_j % T::kNumVar == T::kNumVar - 1 ? 
true : false); i = prev_i; j = prev_j; j_div = j / T::kNumVar; j_mod = j % T::kNumVar; if (extend_left) { while (true) { // load if (j_mod == T::kNumVar - 1) { _mmxxx_store_si( reinterpret_cast<__mxxxi*>(E), pimpl_->E[i * matrix_width + j_div]); } else if (j_mod == 0) { // boarder case if (j_div > 0) { _mmxxx_store_si( reinterpret_cast<__mxxxi*>(E_left), pimpl_->E[i * matrix_width + j_div - 1]); } } alignment.emplace_back(-1, j); --j; j_div = j / T::kNumVar; j_mod = j % T::kNumVar; if ((j == -1) || (j_mod != T::kNumVar - 1 && E[j_mod] + e_ != E[j_mod + 1]) || (j_mod == T::kNumVar - 1 && E_left[j_mod] + e_ != E[0])) { break; } } load_next_segment = true; } else if (extend_up) { while (true) { // load _mmxxx_store_si( reinterpret_cast<__mxxxi*>(F), pimpl_->F[i * matrix_width + j_div]); prev_i = 0; predecessors.clear(); std::uint32_t store_pos = 0; for (const auto& it : rank_to_node[i - 1]->inedges) { predecessors.emplace_back(pimpl_->node_id_to_rank[it->tail->id] + 1); _mmxxx_store_si( reinterpret_cast<__mxxxi*>(&H_pred[store_pos * T::kNumVar]), pimpl_->H[predecessors.back() * matrix_width + j_div]); _mmxxx_store_si( reinterpret_cast<__mxxxi*>(&F_pred[store_pos * T::kNumVar]), pimpl_->F[predecessors.back() * matrix_width + j_div]); ++store_pos; } bool stop = false; for (std::uint32_t p = 0; p < predecessors.size(); ++p) { if ((stop = F[j_mod] == H_pred[p * T::kNumVar + j_mod] + g_) || F[j_mod] == F_pred[p * T::kNumVar + j_mod] + e_) { prev_i = predecessors[p]; break; } } alignment.emplace_back(rank_to_node[i - 1]->id, -1); i = prev_i; if (stop || i == 0) { break; } } } } while (true); delete[] backtrack_storage; // update alignment for NW (backtrack stops on first row or column) if (type_ == AlignmentType::kNW) { while (i == 0 && j != -1) { alignment.emplace_back(-1, j); --j; } while (i != 0 && j == -1) { alignment.emplace_back(rank_to_node[i - 1]->id, -1); const auto& it = rank_to_node[i - 1]; if (it->inedges.empty()) { i = 0; } else { for (const auto& jt : 
it->inedges) {
          std::uint32_t pred_i = pimpl_->node_id_to_rank[jt->tail->id] + 1;
          if (pimpl_->first_column[i] == pimpl_->first_column[pred_i] + e_) {
            i = pred_i;
            break;
          }
        }
      }
    }
  }

  std::reverse(alignment.begin(), alignment.end());
  return alignment;
#else
  (void) sequence_len;
  (void) graph;
  (void) score;
  return Alignment();
#endif
}

// Vectorized alignment of a sequence against the graph using two gap penalty
// pairs: (g_, e_) and (q_, c_).
//
// Five matrices are filled row by row (one row per graph node in rank order):
//   F, O - gaps reached from predecessor rows (penalties g_/e_ and q_/c_),
//   E, Q - gaps reached from the left in the same row (g_/e_ and q_/c_),
//   H    - best score, the maximum of the diagonal move and the four above.
// The best cell is then selected according to type_ (kSW / kOV / kNW) and the
// path is recovered by a scalar backtrack over unpacked vector segments.
//
// sequence_len - number of sequence characters (matrix columns)
// graph        - graph to align to; nodes are visited via rank_to_node()
// score        - optional output, receives the best score when non-null
// Returns the alignment as (node id, sequence position) pairs with -1 marking
// a gap on either side; returns an empty Alignment if no row qualified or if
// no SIMD backend is compiled in.
//
// NOTE(review): `template template`, `static_cast(...)`, `reinterpret_cast(...)`
// without a type, `std::numeric_limits::min()` and `std::vector predecessors`
// look like their angle-bracket arguments were lost in extraction - restore
// them from the upstream file before compiling.
template template
Alignment SimdAlignmentEngine::Convex(
    std::uint32_t sequence_len,
    const Graph& graph,
    std::int32_t* score) noexcept {
#if defined(__AVX2__) || defined(__SSE4_1__) || defined(SPOA_USE_SIMDE)
  // normal_matrix_width counts scalar columns, matrix_width counts vectors of
  // T::kNumVar lanes needed to hold them
  std::uint64_t normal_matrix_width = sequence_len;
  std::uint64_t matrix_width =
      std::ceil(static_cast(sequence_len) / T::kNumVar);
  std::uint64_t matrix_height = graph.nodes().size() + 1;
  const auto& rank_to_node = graph.rank_to_node();

  typename T::type kNegativeInfinity =
      std::numeric_limits::min() + 1024;

  typename T::type max_score = type_ == AlignmentType::kSW ? 0 : kNegativeInfinity;  // NOLINT
  std::int32_t max_i = -1;
  std::int32_t max_j = -1;
  // lane of the last vector that holds the final sequence column
  std::uint32_t last_column_id = (normal_matrix_width - 1) % T::kNumVar;
  __mxxxi zeroes = T::_mmxxx_set1_epi(0);
  // g and q hold (open - extend) so that max(H + g, F) + e equals
  // max(H + g_, F + e_), and likewise for q/c
  __mxxxi g = T::_mmxxx_set1_epi(g_ - e_);
  __mxxxi e = T::_mmxxx_set1_epi(e_);
  __mxxxi q = T::_mmxxx_set1_epi(q_ - c_);
  __mxxxi c = T::_mmxxx_set1_epi(c_);

  __attribute__((aligned(kRegisterSize / 8))) typename T::type unpacked[T::kNumVar] = {0};  // NOLINT

  // masks[j] is a snapshot of `unpacked` taken when i is 0, 1, 3, 7, ..., i.e.
  // with the first 2^j lanes set to kNegativeInfinity and the rest zero
  for (std::uint32_t i = 0, j = 0; i < T::kNumVar && j < T::kLogNumVar; ++i) {
    unpacked[i] = kNegativeInfinity;
    if ((i & (i + 1)) == 0) {
      pimpl_->masks[j++] = _mmxxx_load_si(
          reinterpret_cast(unpacked));
    }
  }
  pimpl_->masks[T::kLogNumVar] = _mmxxx_slli_si(
      T::_mmxxx_set1_epi(kNegativeInfinity),
      T::kLSS);

  // penalties[0 .. kLogNumVar) hold e_, 2 * e_, 4 * e_, ... (each entry is
  // the previous one added to itself)
  pimpl_->penalties[0] = T::_mmxxx_set1_epi(e_);
  for (std::uint32_t i = 1; i < T::kLogNumVar; ++i) {
    pimpl_->penalties[i] = T::_mmxxx_add_epi(
        pimpl_->penalties[i - 1],
        pimpl_->penalties[i - 1]);
  }
  // penalties[kLogNumVar .. 2 * kLogNumVar) hold the same doubling for c_
  pimpl_->penalties[T::kLogNumVar] = T::_mmxxx_set1_epi(c_);
  for (std::uint32_t i = T::kLogNumVar + 1; i < 2 * T::kLogNumVar; ++i) {
    pimpl_->penalties[i] = T::_mmxxx_add_epi(
        pimpl_->penalties[i - 1],
        pimpl_->penalties[i - 1]);
  }

  // alignment
  for (const auto& it : rank_to_node) {
    __mxxxi* char_profile = &(pimpl_->sequence_profile[it->code * matrix_width]);  // NOLINT

    // row 0 is the virtual start row, so node rank r lives in row r + 1
    std::uint32_t i = pimpl_->node_id_to_rank[it->id] + 1;

    __mxxxi* H_row = &(pimpl_->H[i * matrix_width]);
    __mxxxi* F_row = &(pimpl_->F[i * matrix_width]);
    __mxxxi* O_row = &(pimpl_->O[i * matrix_width]);

    // nodes without incoming edges use row 0 as their only predecessor
    std::uint32_t pred_i = it->inedges.empty() ? 0 :
        pimpl_->node_id_to_rank[it->inedges[0]->tail->id] + 1;

    __mxxxi* H_pred_row = &(pimpl_->H[pred_i * matrix_width]);
    __mxxxi* F_pred_row = &(pimpl_->F[pred_i * matrix_width]);
    __mxxxi* O_pred_row = &(pimpl_->O[pred_i * matrix_width]);

    // x is OR-ed into the left-shifted predecessor vector below; it starts
    // from the predecessor's first_column value and is then refreshed from
    // each processed vector (presumably carrying one lane across the vector
    // boundary - depends on kLSS/kRSS, confirm)
    __mxxxi x = _mmxxx_srli_si(
        T::_mmxxx_set1_epi(pimpl_->first_column[pred_i]),
        T::kRSS);

    for (std::uint64_t j = 0; j < matrix_width; ++j) {
      // update F
      F_row[j] = T::_mmxxx_add_epi(
          T::_mmxxx_max_epi(
              T::_mmxxx_add_epi(H_pred_row[j], g),
              F_pred_row[j]),
          e);

      // update O
      O_row[j] = T::_mmxxx_add_epi(
          T::_mmxxx_max_epi(
              T::_mmxxx_add_epi(H_pred_row[j], q),
              O_pred_row[j]),
          c);

      // update H
      H_row[j] = T::_mmxxx_add_epi(
          _mmxxx_or_si(
              _mmxxx_slli_si(H_pred_row[j], T::kLSS),
              x),
          char_profile[j]);
      x = _mmxxx_srli_si(H_pred_row[j], T::kRSS);
    }
    // check other predecessors: same recurrences, merged in with max
    for (std::uint32_t p = 1; p < it->inedges.size(); ++p) {
      pred_i = pimpl_->node_id_to_rank[it->inedges[p]->tail->id] + 1;

      H_pred_row = &(pimpl_->H[pred_i * matrix_width]);
      F_pred_row = &(pimpl_->F[pred_i * matrix_width]);
      O_pred_row = &(pimpl_->O[pred_i * matrix_width]);

      x = _mmxxx_srli_si(
          T::_mmxxx_set1_epi(pimpl_->first_column[pred_i]),
          T::kRSS);

      for (std::uint64_t j = 0; j < matrix_width; ++j) {
        // update F
        F_row[j] = T::_mmxxx_max_epi(
            F_row[j],
            T::_mmxxx_add_epi(
                T::_mmxxx_max_epi(
                    T::_mmxxx_add_epi(H_pred_row[j], g),
                    F_pred_row[j]),
                e));

        // update O
        O_row[j] = T::_mmxxx_max_epi(
            O_row[j],
            T::_mmxxx_add_epi(
                T::_mmxxx_max_epi(
                    T::_mmxxx_add_epi(H_pred_row[j], q),
                    O_pred_row[j]),
                c));

        // update H
        H_row[j] = T::_mmxxx_max_epi(
            H_row[j],
            T::_mmxxx_add_epi(
                _mmxxx_or_si(
                    _mmxxx_slli_si(H_pred_row[j], T::kLSS),
                    x),
                char_profile[j]));

        x = _mmxxx_srli_si(H_pred_row[j], T::kRSS);
      }
    }

    __mxxxi* E_row = &(pimpl_->E[i * matrix_width]);
    __mxxxi* Q_row = &(pimpl_->Q[i * matrix_width]);

    // x and y play the same role for E and Q respectively; both start from
    // this row's first_column value
    x = T::_mmxxx_set1_epi(pimpl_->first_column[i]);
    __mxxxi y = T::_mmxxx_set1_epi(pimpl_->first_column[i]);

    // per-row running maximum; note that this local shadows the `score`
    // output parameter for the rest of the loop body
    __mxxxi score = zeroes;

    for (std::uint64_t j = 0; j < matrix_width; ++j) {
      H_row[j] = T::_mmxxx_max_epi(
          H_row[j],
          T::_mmxxx_max_epi(F_row[j], O_row[j]));

      E_row[j] = T::_mmxxx_add_epi(
          T::_mmxxx_add_epi(
              _mmxxx_or_si(
                  _mmxxx_slli_si(H_row[j], T::kLSS),
                  _mmxxx_srli_si(x, T::kRSS)),
              g),
          e);
      // uses the e_-based half of the penalties table
      T::_mmxxx_prefix_max(E_row[j], pimpl_->masks, pimpl_->penalties);

      Q_row[j] = T::_mmxxx_add_epi(
          T::_mmxxx_add_epi(
              _mmxxx_or_si(
                  _mmxxx_slli_si(H_row[j], T::kLSS),
                  _mmxxx_srli_si(y, T::kRSS)),
              q),
          c);
      // uses the c_-based half of the penalties table
      T::_mmxxx_prefix_max(Q_row[j], pimpl_->masks, &pimpl_->penalties[T::kLogNumVar]);  // NOLINT

      H_row[j] = T::_mmxxx_max_epi(
          H_row[j],
          T::_mmxxx_max_epi(E_row[j], Q_row[j]));

      x = T::_mmxxx_max_epi(
          H_row[j],
          T::_mmxxx_sub_epi(E_row[j], g));
      y = T::_mmxxx_max_epi(
          H_row[j],
          T::_mmxxx_sub_epi(Q_row[j], q));

      if (type_ == AlignmentType::kSW) {
        // kSW clamps every cell at zero
        H_row[j] = T::_mmxxx_max_epi(H_row[j], zeroes);
      }
      score = T::_mmxxx_max_epi(score, H_row[j]);
    }

    // remember the best row: any row for kSW, only rows of nodes without
    // outgoing edges for kOV (row maximum) and kNW (last sequence column)
    if (type_ == AlignmentType::kSW) {
      std::int32_t max_row_score = _mmxxx_max_value(score);
      if (max_score < max_row_score) {
        max_score = max_row_score;
        max_i = i;
      }
    } else if (type_ == AlignmentType::kOV) {
      if (it->outedges.empty()) {
        std::int32_t max_row_score = _mmxxx_max_value(score);
        if (max_score < max_row_score) {
          max_score = max_row_score;
          max_i = i;
        }
      }
    } else if (type_ == AlignmentType::kNW) {
      if (it->outedges.empty()) {
        std::int32_t max_row_score = _mmxxx_value_at(
            H_row[matrix_width - 1],
            last_column_id);
        if (max_score < max_row_score) {
          max_score = max_row_score;
          max_i = i;
        }
      }
    }
  }

  // max_j has not been assigned yet, so this is true exactly when no row
  // improved on the initial max_score
  if (max_i == -1 && max_j == -1) {
    return Alignment();
  }
  if (score) {
    *score = max_score;
  }

  // locate the column of the best score inside the chosen row
  if (type_ == AlignmentType::kSW) {
    max_j = _mmxxx_index_of(
        &(pimpl_->H[max_i * matrix_width]),
        matrix_width, max_score);
  } else if (type_ == AlignmentType::kOV) {
    if (rank_to_node[max_i - 1]->outedges.empty()) {
      max_j = _mmxxx_index_of(
          &(pimpl_->H[max_i * matrix_width]),
          matrix_width, max_score);
    } else {
      max_j = normal_matrix_width - 1;
    }
  } else if (type_ == AlignmentType::kNW) {
    max_j = normal_matrix_width - 1;
  }

  // backtrack
  std::uint32_t max_num_predecessors = 1;
  for (std::uint32_t i = 0; i < static_cast(max_i); ++i) {
    max_num_predecessors = std::max(
        max_num_predecessors,
        static_cast(rank_to_node[i]->inedges.size()));
  }

  // one allocation carved into scalar scratch buffers: nine single-vector
  // buffers (H, H_left, F, O, E, E_left, Q, Q_left, profile) plus four
  // per-predecessor buffers (H_pred, H_diag_pred, F_pred, O_pred)
  typename T::type* backtrack_storage = nullptr;
  typename T::type* H = AllocateAlignedMemory(
      &backtrack_storage,
      9 * T::kNumVar + 4 * T::kNumVar * max_num_predecessors,
      kRegisterSize / 8);
  typename T::type* H_pred = H + T::kNumVar;
  typename T::type* H_diag_pred = H_pred + T::kNumVar * max_num_predecessors;
  typename T::type* H_left = H_diag_pred + T::kNumVar * max_num_predecessors;
  typename T::type* F = H_left + T::kNumVar;
  typename T::type* F_pred = F + T::kNumVar;
  typename T::type* O = F_pred + T::kNumVar * max_num_predecessors;
  typename T::type* O_pred = O + T::kNumVar;
  typename T::type* E = O_pred + T::kNumVar * max_num_predecessors;
  typename T::type* E_left = E + T::kNumVar;
  typename T::type* Q = E_left + T::kNumVar;
  typename T::type* Q_left = Q + T::kNumVar;
  typename T::type* profile = Q_left + T::kNumVar;

  std::vector predecessors;

  // (i, j) is the current cell; j_div selects the vector, j_mod the lane
  std::int32_t i = max_i;
  std::int32_t j = max_j;
  std::int32_t prev_i = 0, prev_j = 0;

  std::uint32_t j_div = j / T::kNumVar;
  std::uint32_t j_mod = j % T::kNumVar;

  bool load_next_segment = true;

  Alignment alignment;

  do {
    // check stop condition
    if (j == -1 || i == 0) {
      break;
    }

    const auto& it = rank_to_node[i - 1];
    // load everything
    if (load_next_segment) {
      predecessors.clear();

      // load current cells
      _mmxxx_store_si(
          reinterpret_cast<__mxxxi*>(H),
          pimpl_->H[i * matrix_width + j_div]);
      _mmxxx_store_si(
          reinterpret_cast<__mxxxi*>(E),
          pimpl_->E[i * matrix_width + j_div]);
      _mmxxx_store_si(
          reinterpret_cast<__mxxxi*>(Q),
          pimpl_->Q[i * matrix_width + j_div]);

      // load predecessors cells
      if (it->inedges.empty()) {
        // no incoming edges: row 0 is the only predecessor
        predecessors.emplace_back(0);
        _mmxxx_store_si(reinterpret_cast<__mxxxi*>(H_pred), pimpl_->H[j_div]);
        _mmxxx_store_si(reinterpret_cast<__mxxxi*>(F_pred), pimpl_->F[j_div]);
        _mmxxx_store_si(reinterpret_cast<__mxxxi*>(O_pred), pimpl_->O[j_div]);
      } else {
        std::uint32_t store_pos = 0;
        for (const auto& jt : it->inedges) {
          predecessors.emplace_back(pimpl_->node_id_to_rank[jt->tail->id] + 1);
          _mmxxx_store_si(
              reinterpret_cast<__mxxxi*>(&H_pred[store_pos * T::kNumVar]),
              pimpl_->H[predecessors.back() * matrix_width + j_div]);
          _mmxxx_store_si(
              reinterpret_cast<__mxxxi*>(&F_pred[store_pos * T::kNumVar]),
              pimpl_->F[predecessors.back() * matrix_width + j_div]);
          _mmxxx_store_si(
              reinterpret_cast<__mxxxi*>(&O_pred[store_pos * T::kNumVar]),
              pimpl_->O[predecessors.back() * matrix_width + j_div]);
          ++store_pos;
        }
      }

      // load query profile cells
      _mmxxx_store_si(
          reinterpret_cast<__mxxxi*>(profile),
          pimpl_->sequence_profile[it->code * matrix_width + j_div]);
    }

    // check stop condition
    if (type_ == AlignmentType::kSW && H[j_mod] == 0) {
      break;
    }

    if (j_mod == 0) {
      // border case: the left/diagonal neighbour is in the previous vector,
      // or in first_column when this is the first vector of the row
      if (j_div > 0) {
        for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
          _mmxxx_store_si(
              reinterpret_cast<__mxxxi*>(&H_diag_pred[p * T::kNumVar]),
              pimpl_->H[predecessors[p] * matrix_width + (j_div - 1)]);
        }
        _mmxxx_store_si(
            reinterpret_cast<__mxxxi*>(H_left),
            pimpl_->H[i * matrix_width + j_div - 1]);
        _mmxxx_store_si(
            reinterpret_cast<__mxxxi*>(E_left),
            pimpl_->E[i * matrix_width + j_div - 1]);
        _mmxxx_store_si(
            reinterpret_cast<__mxxxi*>(Q_left),
            pimpl_->Q[i * matrix_width + j_div - 1]);
      } else {
        for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
          H_diag_pred[(p + 1) * T::kNumVar - 1] = pimpl_->first_column[predecessors[p]];  // NOLINT
        }
        H_left[T::kNumVar - 1] = pimpl_->first_column[i];
        E_left[T::kNumVar - 1] = pimpl_->first_column[i];
        Q_left[T::kNumVar - 1] = pimpl_->first_column[i];
      }
    }

    // find best predecessor cell
    bool predecessor_found = false, extend_left = false, extend_up = false;

    // 1) diagonal move from any predecessor row
    if (i != 0) {
      for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
        if ((j_mod == 0 && H[j_mod] == H_diag_pred[(p + 1) * T::kNumVar - 1] + profile[j_mod]) ||  // NOLINT
            (j_mod != 0 && H[j_mod] == H_pred[p * T::kNumVar + j_mod - 1] + profile[j_mod])) {  // NOLINT
          prev_i = predecessors[p];
          prev_j = j - 1;
          predecessor_found = true;
          break;
        }
      }
    }

    // 2) move from a predecessor row in the same column; extend_up ends up
    //    holding the result of the last F/O comparison that was evaluated
    if (!predecessor_found && i != 0) {
      for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
        if ((extend_up = H[j_mod] == F_pred[p * T::kNumVar + j_mod] + e_) ||
                         H[j_mod] == H_pred[p * T::kNumVar + j_mod] + g_  ||
            (extend_up = H[j_mod] == O_pred[p * T::kNumVar + j_mod] + c_) ||
                         H[j_mod] == H_pred[p * T::kNumVar + j_mod] + q_) {
          prev_i = predecessors[p];
          prev_j = j;
          predecessor_found = true;
          break;
        }
      }
    }

    // 3) move from the left in the same row; extend_left is set the same way
    //    from the E/Q comparisons
    if (!predecessor_found) {
      if ((j_mod != 0 && ((extend_left = H[j_mod] == E[j_mod - 1] + e_) ||
                                         H[j_mod] == H[j_mod - 1] + g_  ||
                          (extend_left = H[j_mod] == Q[j_mod - 1] + c_) ||
                                         H[j_mod] == H[j_mod - 1] + q_)) ||
          (j_mod == 0 && ((extend_left = H[j_mod] == E_left[T::kNumVar - 1] + e_) ||  // NOLINT
                                         H[j_mod] == H_left[T::kNumVar - 1] + g_  ||  // NOLINT
                          (extend_left = H[j_mod] == Q_left[T::kNumVar - 1] + c_) ||  // NOLINT
                                         H[j_mod] == H_left[T::kNumVar - 1] + q_))) {  // NOLINT
        prev_i = i;
        prev_j = j - 1;
        predecessor_found = true;
      }
    }

    // -1 on either side marks a gap (row or column did not change)
    alignment.emplace_back(
        i == prev_i ? -1 : rank_to_node[i - 1]->id,
        j == prev_j ? -1 : j);

    // update for next round: reload when the row changed or when the column
    // moved into the last lane of the previous vector
    load_next_segment =
        (i == prev_i ? false : true) ||
        (j != prev_j && prev_j % T::kNumVar == T::kNumVar - 1 ? true : false);

    i = prev_i;
    j = prev_j;
    j_div = j / T::kNumVar;
    j_mod = j % T::kNumVar;

    if (extend_left) {
      // keep emitting gap columns while the stop test below does not fire
      while (true) {
        // load
        if (j_mod == T::kNumVar - 1) {
          _mmxxx_store_si(
              reinterpret_cast<__mxxxi*>(E),
              pimpl_->E[i * matrix_width + j_div]);
          _mmxxx_store_si(
              reinterpret_cast<__mxxxi*>(Q),
              pimpl_->Q[i * matrix_width + j_div]);
        } else if (j_mod == 0) {  // border case
          if (j_div > 0) {
            _mmxxx_store_si(
                reinterpret_cast<__mxxxi*>(E_left),
                pimpl_->E[i * matrix_width + j_div - 1]);
            _mmxxx_store_si(
                reinterpret_cast<__mxxxi*>(Q_left),
                pimpl_->Q[i * matrix_width + j_div - 1]);
          }
        }

        alignment.emplace_back(-1, j);
        --j;
        j_div = j / T::kNumVar;
        j_mod = j % T::kNumVar;
        // stops at column -1 or as soon as either the E chain or the Q chain
        // is not a pure extension between columns j and j + 1
        if ((j == -1) ||
            (j_mod != T::kNumVar - 1 &&      E[j_mod] + e_ != E[j_mod + 1]) ||
            (j_mod == T::kNumVar - 1 && E_left[j_mod] + e_ != E[0]) ||
            (j_mod != T::kNumVar - 1 &&      Q[j_mod] + c_ != Q[j_mod + 1]) ||
            (j_mod == T::kNumVar - 1 && Q_left[j_mod] + c_ != Q[0])) {
          break;
        }
      }
      load_next_segment = true;
    } else if (extend_up) {
      // keep emitting gap rows while some predecessor extends F or O
      while (true) {
        // load
        _mmxxx_store_si(
            reinterpret_cast<__mxxxi*>(F),
            pimpl_->F[i * matrix_width + j_div]);
        _mmxxx_store_si(
            reinterpret_cast<__mxxxi*>(O),
            pimpl_->O[i * matrix_width + j_div]);

        predecessors.clear();
        std::uint32_t store_pos = 0;
        for (const auto& it : rank_to_node[i - 1]->inedges) {
          predecessors.emplace_back(pimpl_->node_id_to_rank[it->tail->id] + 1);
          _mmxxx_store_si(
              reinterpret_cast<__mxxxi*>(&H_pred[store_pos * T::kNumVar]),
              pimpl_->H[predecessors.back() * matrix_width + j_div]);
          _mmxxx_store_si(
              reinterpret_cast<__mxxxi*>(&F_pred[store_pos * T::kNumVar]),
              pimpl_->F[predecessors.back() * matrix_width + j_div]);
          _mmxxx_store_si(
              reinterpret_cast<__mxxxi*>(&O_pred[store_pos * T::kNumVar]),
              pimpl_->O[predecessors.back() * matrix_width + j_div]);
          ++store_pos;
        }

        // stop stays true unless a predecessor continues the gap
        bool stop = true;
        prev_i = 0;
        for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
          if (F[j_mod] == F_pred[p * T::kNumVar + j_mod] + e_ ||
              O[j_mod] == O_pred[p * T::kNumVar + j_mod] + c_) {
            prev_i = predecessors[p];
            stop
= false;
            break;
          }
        }
        if (stop == true) {
          // no predecessor extends the gap: look for the one that opened it
          for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
            if (F[j_mod] == H_pred[p * T::kNumVar + j_mod] + g_ ||
                O[j_mod] == H_pred[p * T::kNumVar + j_mod] + q_) {
              prev_i = predecessors[p];
              break;
            }
          }
        }

        alignment.emplace_back(rank_to_node[i - 1]->id, -1);
        i = prev_i;
        if (stop || i == 0) {
          break;
        }
      }
    }
  } while (true);

  delete[] backtrack_storage;

  // update alignment for NW (backtrack stops on first row or column)
  if (type_ == AlignmentType::kNW) {
    // reached row 0: the remaining sequence positions are unaligned
    while (i == 0 && j != -1) {
      alignment.emplace_back(-1, j);
      --j;
    }
    // reached column -1: walk the remaining nodes through first_column, which
    // is indexed here at offsets matrix_height (compared with e_) and
    // 2 * matrix_height (compared with c_)
    while (i != 0 && j == -1) {
      alignment.emplace_back(rank_to_node[i - 1]->id, -1);

      const auto& it = rank_to_node[i - 1];
      if (it->inedges.empty()) {
        i = 0;
      } else {
        for (const auto& jt : it->inedges) {
          std::uint32_t pred_i = pimpl_->node_id_to_rank[jt->tail->id] + 1;
          if (pimpl_->first_column[matrix_height + i] == pimpl_->first_column[matrix_height + pred_i] + e_ ||  // NOLINT
              pimpl_->first_column[2 * matrix_height + i] == pimpl_->first_column[2 * matrix_height + pred_i] + c_ ) {  // NOLINT
            i = pred_i;
            break;
          }
        }
      }
    }
  }

  // entries were collected from the end of the path towards its start
  std::reverse(alignment.begin(), alignment.end());
  return alignment;
#else
  (void) sequence_len;
  (void) graph;
  (void) score;
  return Alignment();
#endif
}

}  // namespace spoa

#endif  // SIMD_ALIGNMENT_ENGINE_IMPLEMENTATION_HPP_
// NOTE(review): the next line starts with what looks like a tar archive member
// header fused onto the first line of sisd_alignment_engine.cpp; it is not
// C++ and is kept only because this text is a raw archive dump.
spoa-4.0.8/src/sisd_alignment_engine.cpp000066400000000000000000000723251400776337600203330ustar00rootroot00000000000000// Copyright (c) 2020 Robert Vaser

#include "sisd_alignment_engine.hpp"

// NOTE(review): the three bare #include directives below and the missing
// template arguments further down (std::numeric_limits, std::unique_ptr,
// std::vector, static_cast) appear to have lost their <...> text in
// extraction - restore from the upstream file.
#include
#include
#include

#include "spoa/graph.hpp"

namespace spoa {

// Sentinel for unreachable cells; offset by 1024 from the type minimum.
constexpr std::int32_t kNegativeInfinity =
    std::numeric_limits::min() + 1024;

// Factory: allocates a SisdAlignmentEngine configured with the alignment type,
// gap subtype and the six scoring parameters, and returns it in a
// std::unique_ptr.
std::unique_ptr SisdAlignmentEngine::Create(
    AlignmentType type,
    AlignmentSubtype subtype,
    std::int8_t m,
    std::int8_t n,
    std::int8_t g,
    std::int8_t e,
    std::int8_t q,
    std::int8_t c) {
  return std::unique_ptr(
      new SisdAlignmentEngine(type, subtype, m, n, g, e, q, c));
}

// Private state of the engine.
struct SisdAlignmentEngine::Implementation {
  // node id -> index of the node in graph.rank_to_node()
  std::vector node_id_to_rank;
  // one row per character code: score of matching that code against each
  // sequence position (column 0 is unused and set to 0)
  std::vector sequence_profile;
  // backing storage for all score matrices
  std::vector M;
  // views into M set up by Realloc; F/E are used by the affine and convex
  // subtypes, O/Q only by the convex subtype
  std::int32_t* H;
  std::int32_t* F;
  std::int32_t* E;
  std::int32_t* O;
  std::int32_t* Q;

  Implementation()
      : node_id_to_rank(),
        sequence_profile(),
        M(),
        H(nullptr),
        F(nullptr),
        E(nullptr),
        O(nullptr),
        Q(nullptr) {
  }
};

// Forwards all parameters to the AlignmentEngine base and creates empty
// private state; matrices are allocated later by Prealloc/Align.
SisdAlignmentEngine::SisdAlignmentEngine(
    AlignmentType type,
    AlignmentSubtype subtype,
    std::int8_t m,
    std::int8_t n,
    std::int8_t g,
    std::int8_t e,
    std::int8_t q,
    std::int8_t c)
    : AlignmentEngine(type, subtype, m, n, g, e, q, c),
      pimpl_(new Implementation()) {
}

// Reserves memory up front for sequences up to max_sequence_len.
//
// The matrices are sized for max_sequence_len + 1 columns and
// max_sequence_len * alphabet_size + alphabet_size rows.
// Throws std::invalid_argument when max_sequence_len exceeds the
// numeric_limits maximum checked below or when allocation fails
// (std::bad_alloc is translated).
void SisdAlignmentEngine::Prealloc(
    std::uint32_t max_sequence_len,
    std::uint8_t alphabet_size) {
  if (max_sequence_len > std::numeric_limits::max()) {
    throw std::invalid_argument(
        "[spoa::SisdAlignmentEngine::Prealloc] error: too large sequence!");
  }
  try {
    Realloc(
        static_cast(max_sequence_len) + 1,
        static_cast(max_sequence_len) * alphabet_size + alphabet_size,  // NOLINT
        alphabet_size);
  } catch (std::bad_alloc& ba) {
    throw std::invalid_argument(
        "[spoa::SisdAlignmentEngine::Prealloc] error: insufficient memory!");
  }
}

// Grows the internal buffers so they can hold a matrix_width x matrix_height
// problem with num_codes distinct character codes. Buffers never shrink.
//
// M is split into 1 (linear), 3 (affine) or 5 (convex) consecutive planes of
// matrix_width * matrix_height cells. The plane pointers are only recomputed
// inside the branch that grows M, so a later call with smaller dimensions
// keeps the offsets of the larger layout.
void SisdAlignmentEngine::Realloc(
    std::uint64_t matrix_width,
    std::uint64_t matrix_height,
    std::uint8_t num_codes) {
  // one rank slot per node (row 0 of the matrix has no node)
  if (pimpl_->node_id_to_rank.size() < matrix_height - 1) {
    pimpl_->node_id_to_rank.resize(matrix_height - 1, 0);
  }
  if (pimpl_->sequence_profile.size() < num_codes * matrix_width) {
    pimpl_->sequence_profile.resize(num_codes * matrix_width, 0);
  }
  if (subtype_ == AlignmentSubtype::kLinear) {
    if (pimpl_->M.size() < matrix_height * matrix_width) {
      pimpl_->M.resize(matrix_width * matrix_height, 0);
      pimpl_->H = pimpl_->M.data();
      pimpl_->F = nullptr;
      pimpl_->E = nullptr;
    }
  } else if (subtype_ == AlignmentSubtype::kAffine) {
    if (pimpl_->M.size() < 3 * matrix_height * matrix_width) {
      pimpl_->M.resize(3 * matrix_width * matrix_height, 0);
      pimpl_->H = pimpl_->M.data();
      pimpl_->F = pimpl_->H + matrix_width * matrix_height;
      pimpl_->E = pimpl_->F + matrix_width * matrix_height;
    }
  } else if (subtype_ == AlignmentSubtype::kConvex) {
    if (pimpl_->M.size()
< 5 * matrix_height * matrix_width) {
      // convex layout: five consecutive planes H, F, E, O, Q
      pimpl_->M.resize(5 * matrix_width * matrix_height, 0);
      pimpl_->H = pimpl_->M.data();
      pimpl_->F = pimpl_->H + matrix_width * matrix_height;
      pimpl_->E = pimpl_->F + matrix_width * matrix_height;
      pimpl_->O = pimpl_->E + matrix_width * matrix_height;
      pimpl_->Q = pimpl_->O + matrix_width * matrix_height;
    }
  }
}

// Prepares the per-alignment state: the sequence profile, the node id -> rank
// table and the first row / first column of every matrix in use.
//
// sequence, sequence_len - query sequence
// graph                  - graph that will be aligned against
// Expects the buffers to be large enough already (Align calls Realloc first).
void SisdAlignmentEngine::Initialize(
    const char* sequence, std::uint32_t sequence_len,
    const Graph& graph) noexcept {
  std::uint32_t matrix_width = sequence_len + 1;
  std::uint32_t matrix_height = graph.nodes().size() + 1;

  // profile row i, column j + 1: m_ when code i decodes to sequence[j],
  // n_ otherwise; column 0 is a placeholder
  for (std::uint32_t i = 0; i < graph.num_codes(); ++i) {
    char c = graph.decoder(i);
    pimpl_->sequence_profile[i * matrix_width] = 0;
    for (std::uint32_t j = 0; j < sequence_len; ++j) {
      pimpl_->sequence_profile[i * matrix_width + (j + 1)] =
          (c == sequence[j] ? m_ : n_);
    }
  }

  const auto& rank_to_node = graph.rank_to_node();
  for (std::uint32_t i = 0; i < rank_to_node.size(); ++i) {
    pimpl_->node_id_to_rank[rank_to_node[i]->id] = i;
  }

  // initialize secondary matrices
  // The cases deliberately cascade: convex also runs the affine and linear
  // initialization, affine also runs the linear one.
  switch (subtype_) {
    case AlignmentSubtype::kConvex:
      pimpl_->O[0] = 0;
      pimpl_->Q[0] = 0;
      // first row: O is unreachable, Q is a gap of length j under (q_, c_)
      for (std::uint32_t j = 1; j < matrix_width; ++j) {
        pimpl_->O[j] = kNegativeInfinity;
        pimpl_->Q[j] = q_ + (j - 1) * c_;
      }
      // first column: O extends the best predecessor by c_; a node without
      // incoming edges starts at q_ - c_ so that adding c_ yields q_
      for (std::uint32_t i = 1; i < matrix_height; ++i) {
        const auto& edges = rank_to_node[i - 1]->inedges;
        std::int32_t penalty = edges.empty() ? q_ - c_ : kNegativeInfinity;
        for (const auto& it : edges) {
          std::uint32_t pred_i = pimpl_->node_id_to_rank[it->tail->id] + 1;
          penalty = std::max(penalty, pimpl_->O[pred_i * matrix_width]);
        }
        pimpl_->O[i * matrix_width] = penalty + c_;
        pimpl_->Q[i * matrix_width] = kNegativeInfinity;
      }
      // fall through
    case AlignmentSubtype::kAffine:
      pimpl_->F[0] = 0;
      pimpl_->E[0] = 0;
      // same scheme as above for F/E with (g_, e_)
      for (std::uint32_t j = 1; j < matrix_width; ++j) {
        pimpl_->F[j] = kNegativeInfinity;
        pimpl_->E[j] = g_ + (j - 1) * e_;
      }
      for (std::uint32_t i = 1; i < matrix_height; ++i) {
        const auto& edges = rank_to_node[i - 1]->inedges;
        std::int32_t penalty = edges.empty() ? g_ - e_ : kNegativeInfinity;
        for (const auto& it : edges) {
          std::uint32_t pred_i = pimpl_->node_id_to_rank[it->tail->id] + 1;
          penalty = std::max(penalty, pimpl_->F[pred_i * matrix_width]);
        }
        pimpl_->F[i * matrix_width] = penalty + e_;
        pimpl_->E[i * matrix_width] = kNegativeInfinity;
      }
      // fall through
    case AlignmentSubtype::kLinear:
      pimpl_->H[0] = 0;
      break;
    default:
      break;
  }

  // initialize primary matrix
  switch (type_) {
    case AlignmentType::kSW:
      // first row and first column are all zero
      for (std::uint32_t j = 1; j < matrix_width; ++j) {
        pimpl_->H[j] = 0;
      }
      for (std::uint32_t i = 1; i < matrix_height; ++i) {
        pimpl_->H[i * matrix_width] = 0;
      }
      break;
    case AlignmentType::kNW:
      // borders carry the gap scores prepared in the secondary matrices
      switch (subtype_) {
        case AlignmentSubtype::kConvex:
          for (std::uint32_t j = 1; j < matrix_width; ++j) {
            pimpl_->H[j] = std::max(pimpl_->Q[j], pimpl_->E[j]);
          }
          for (std::uint32_t i = 1; i < matrix_height; ++i) {
            pimpl_->H[i * matrix_width] = std::max(
                pimpl_->O[i * matrix_width],
                pimpl_->F[i * matrix_width]);
          }
          break;
        case AlignmentSubtype::kAffine:
          for (std::uint32_t j = 1; j < matrix_width; ++j) {
            pimpl_->H[j] = pimpl_->E[j];
          }
          for (std::uint32_t i = 1; i < matrix_height; ++i) {
            pimpl_->H[i * matrix_width] = pimpl_->F[i * matrix_width];
          }
          break;
        case AlignmentSubtype::kLinear:
          for (std::uint32_t j = 1; j < matrix_width; ++j) {
            pimpl_->H[j] = j * g_;
          }
          // first column: best predecessor plus one gap of cost g_
          for (std::uint32_t i = 1; i < matrix_height; ++i) {
            const auto& edges = rank_to_node[i - 1]->inedges;
            std::int32_t penalty = edges.empty() ? 0 : kNegativeInfinity;
            for (const auto& it : edges) {
              std::uint32_t pred_i = pimpl_->node_id_to_rank[it->tail->id] + 1;
              penalty = std::max(penalty, pimpl_->H[pred_i * matrix_width]);
            }
            pimpl_->H[i * matrix_width] = penalty + g_;
          }
          // no break here: control runs into `default`, which only breaks
        default:
          break;
      }
      break;
    case AlignmentType::kOV:
      // first row carries gap scores, first column is zero
      switch (subtype_) {
        case AlignmentSubtype::kConvex:
          for (std::uint32_t j = 1; j < matrix_width; ++j) {
            pimpl_->H[j] = std::max(pimpl_->Q[j], pimpl_->E[j]);
          }
          break;
        case AlignmentSubtype::kAffine:
          for (std::uint32_t j = 1; j < matrix_width; ++j) {
            pimpl_->H[j] = pimpl_->E[j];
          }
          break;
        case AlignmentSubtype::kLinear:
          for (std::uint32_t j = 1; j < matrix_width; ++j) {
            pimpl_->H[j] = j * g_;
          }
          break;
        default:
          break;
      }
      for (std::uint32_t i = 1; i < matrix_height; ++i) {
        pimpl_->H[i * matrix_width] = 0;
      }
      break;
    default:
      break;
  }
}

// Aligns `sequence` to `graph` and returns the alignment.
//
// sequence, sequence_len - query sequence
// graph                  - target graph
// score                  - optional output passed on to the subtype routine
// Returns an empty Alignment for an empty graph or an empty sequence.
// Throws std::invalid_argument if the sequence is too long, if
// WorstCaseAlignmentScore() falls below kNegativeInfinity, or if the matrices
// cannot be allocated.
Alignment SisdAlignmentEngine::Align(
    const char* sequence, std::uint32_t sequence_len,
    const Graph& graph,
    std::int32_t* score) {
  if (sequence_len > std::numeric_limits::max()) {
    throw std::invalid_argument(
        "[spoa::SisdAlignmentEngine::Align] error: too large sequence!");
  }

  if (graph.nodes().empty() || sequence_len == 0) {
    return Alignment();
  }

  if (WorstCaseAlignmentScore(sequence_len, graph.nodes().size()) < kNegativeInfinity) {  // NOLINT
    throw std::invalid_argument(
        "[spoa::SisdAlignmentEngine::Align] error: possible overflow!");
  }

  try {
    // one extra column and one extra row for the matrix borders
    Realloc(sequence_len + 1, graph.nodes().size() + 1, graph.num_codes());
  } catch (std::bad_alloc& ba) {
    throw std::invalid_argument(
        "[spoa::SisdAlignmentEngine::Align] error: insufficient memory!");
  }
  Initialize(sequence, sequence_len, graph);

  // dispatch on the gap model
  if (subtype_ == AlignmentSubtype::kLinear) {
    return Linear(sequence_len, graph, score);
  } else if (subtype_ == AlignmentSubtype::kAffine) {
    return Affine(sequence_len, graph, score);
  } else if (subtype_ == AlignmentSubtype::kConvex) {
    return Convex(sequence_len, graph, score);
  }
  return Alignment();
}

Alignment
// Linear: alignment with a single gap penalty g_ per gap position.
//
// Fills H row by row (one row per node in rank order, row 0 / column 0 are
// the borders set up beforehand), tracks the best cell according to type_ and
// backtracks from it.
//
// sequence_len - number of sequence characters
// graph        - target graph
// score        - optional output, receives the best score when non-null
// Returns (node id, sequence position) pairs, -1 marking a gap; empty when
// the best cell stayed at (0, 0).
SisdAlignmentEngine::Linear(
    std::uint32_t sequence_len,
    const Graph& graph,
    std::int32_t* score) noexcept {
  std::uint64_t matrix_width = sequence_len + 1;
  const auto& rank_to_node = graph.rank_to_node();

  std::int32_t max_score = type_ == AlignmentType::kSW ? 0 : kNegativeInfinity;
  std::uint32_t max_i = 0;
  std::uint32_t max_j = 0;
  // records cell (i, j) when it strictly beats the current maximum
  auto update_max_score = [&max_score, &max_i, &max_j] (
      std::int32_t* H_row,
      std::uint32_t i,
      std::uint32_t j) -> void {
    if (max_score < H_row[j]) {
      max_score = H_row[j];
      max_i = i;
      max_j = j;
    }
    return;
  };

  // alignment
  for (const auto& it : rank_to_node) {
    const auto& char_profile =
        &(pimpl_->sequence_profile[it->code * matrix_width]);

    std::uint32_t i = pimpl_->node_id_to_rank[it->id] + 1;
    // nodes without incoming edges use row 0 as predecessor
    std::uint32_t pred_i = it->inedges.empty() ?
        0 : pimpl_->node_id_to_rank[it->inedges[0]->tail->id] + 1;

    std::int32_t* H_row = &(pimpl_->H[i * matrix_width]);
    std::int32_t* H_pred_row = &(pimpl_->H[pred_i * matrix_width]);

    // update H
    for (std::uint64_t j = 1; j < matrix_width; ++j) {
      H_row[j] = std::max(
          H_pred_row[j - 1] + char_profile[j],
          H_pred_row[j] + g_);
    }
    // check other predecessors
    for (std::uint32_t p = 1; p < it->inedges.size(); ++p) {
      pred_i = pimpl_->node_id_to_rank[it->inedges[p]->tail->id] + 1;

      H_pred_row = &(pimpl_->H[pred_i * matrix_width]);

      for (std::uint64_t j = 1; j < matrix_width; ++j) {
        H_row[j] = std::max(
            H_pred_row[j - 1] + char_profile[j],
            std::max(
                H_row[j],
                H_pred_row[j] + g_));
      }
    }

    // horizontal gaps need the finished left neighbour, hence a second pass
    for (std::uint64_t j = 1; j < matrix_width; ++j) {
      H_row[j] = std::max(H_row[j - 1] + g_, H_row[j]);

      if (type_ == AlignmentType::kSW) {
        // kSW clamps at zero and may end anywhere
        H_row[j] = std::max(H_row[j], 0);
        update_max_score(H_row, i, j);
      } else if (type_ == AlignmentType::kNW &&
          it->outedges.empty() && j == matrix_width - 1) {
        // kNW ends in the last column of a node without outgoing edges
        update_max_score(H_row, i, j);
      } else if (type_ == AlignmentType::kOV && it->outedges.empty()) {
        // kOV ends in any column of a node without outgoing edges
        update_max_score(H_row, i, j);
      }
    }
  }

  if (max_i == 0 && max_j == 0) {
    return Alignment();
  }
  if (score) {
    *score = max_score;
  }

  // backtrack
  Alignment alignment;
  std::uint32_t i = max_i;
  std::uint32_t j = max_j;

  // each lambda returns true while the backtrack should continue
  auto sw_condition = [this, &i, &j, &matrix_width] () -> bool {
    return (pimpl_->H[i * matrix_width + j] == 0) ? false : true;
  };
  auto nw_condition = [&i, &j] () -> bool {
    return (i == 0 && j == 0) ? false : true;
  };
  auto ov_condition = [&i, &j] () -> bool {
    return (i == 0 || j == 0) ? false : true;
  };

  std::uint32_t prev_i = 0;
  std::uint32_t prev_j = 0;

  while ((type_ == AlignmentType::kSW && sw_condition()) ||
         (type_ == AlignmentType::kNW && nw_condition()) ||
         (type_ == AlignmentType::kOV && ov_condition())) {
    auto H_ij = pimpl_->H[i * matrix_width + j];
    bool predecessor_found = false;

    // 1) diagonal move from some predecessor row
    if (i != 0 && j != 0) {
      const auto& it = rank_to_node[i - 1];
      std::int32_t match_cost =
          pimpl_->sequence_profile[it->code * matrix_width + j];

      std::uint32_t pred_i = it->inedges.empty() ?
          0 : pimpl_->node_id_to_rank[it->inedges[0]->tail->id] + 1;

      if (H_ij == pimpl_->H[pred_i * matrix_width + (j - 1)] + match_cost) {
        prev_i = pred_i;
        prev_j = j - 1;
        predecessor_found = true;
      } else {
        for (std::uint32_t p = 1; p < it->inedges.size(); ++p) {
          std::uint32_t pred_i =
              pimpl_->node_id_to_rank[it->inedges[p]->tail->id] + 1;

          if (H_ij == pimpl_->H[pred_i * matrix_width + (j - 1)] + match_cost) {
            prev_i = pred_i;
            prev_j = j - 1;
            predecessor_found = true;
            break;
          }
        }
      }
    }

    // 2) move from a predecessor row in the same column
    if (!predecessor_found && i != 0) {
      const auto& it = rank_to_node[i - 1];

      std::uint32_t pred_i = it->inedges.empty() ?
          0 : pimpl_->node_id_to_rank[it->inedges[0]->tail->id] + 1;

      if (H_ij == pimpl_->H[pred_i * matrix_width + j] + g_) {
        prev_i = pred_i;
        prev_j = j;
        predecessor_found = true;
      } else {
        for (std::uint32_t p = 1; p < it->inedges.size(); ++p) {
          std::uint32_t pred_i =
              pimpl_->node_id_to_rank[it->inedges[p]->tail->id] + 1;

          if (H_ij == pimpl_->H[pred_i * matrix_width + j] + g_) {
            prev_i = pred_i;
            prev_j = j;
            predecessor_found = true;
            break;
          }
        }
      }
    }

    // 3) move from the left neighbour in the same row
    // NOTE(review): unlike the other two steps this one does not test j != 0
    // before reading column j - 1 - confirm it cannot be reached with j == 0.
    if (!predecessor_found && H_ij == pimpl_->H[i * matrix_width + j - 1] + g_) {  // NOLINT
      prev_i = i;
      prev_j = j - 1;
      predecessor_found = true;
    }

    // matrix column j corresponds to sequence position j - 1
    alignment.emplace_back(
        i == prev_i ? -1 : rank_to_node[i - 1]->id,
        j == prev_j ? -1 : j - 1);

    i = prev_i;
    j = prev_j;
  }

  // entries were collected from the end of the path towards its start
  std::reverse(alignment.begin(), alignment.end());
  return alignment;
}

// Alignment with affine gaps: opening costs g_, every further gap position
// costs e_.
//
// Fills F (gap reached from a predecessor row), E (gap reached from the left)
// and H (best score) row by row, tracks the best cell according to type_ and
// backtracks from it.
//
// sequence_len - number of sequence characters
// graph        - target graph
// score        - optional output, receives the best score when non-null
// Returns (node id, sequence position) pairs, -1 marking a gap; empty when
// the best cell stayed at (0, 0).
Alignment SisdAlignmentEngine::Affine(
    std::uint32_t sequence_len,
    const Graph& graph,
    std::int32_t* score) noexcept {
  std::uint64_t matrix_width = sequence_len + 1;
  const auto& rank_to_node = graph.rank_to_node();

  std::int32_t max_score = type_ == AlignmentType::kSW ? 0 : kNegativeInfinity;
  std::uint32_t max_i = 0;
  std::uint32_t max_j = 0;
  // records cell (i, j) when it strictly beats the current maximum
  auto update_max_score = [&max_score, &max_i, &max_j] (
      std::int32_t* H_row,
      std::uint32_t i,
      std::uint32_t j) -> void {
    if (max_score < H_row[j]) {
      max_score = H_row[j];
      max_i = i;
      max_j = j;
    }
    return;
  };

  // alignment
  for (const auto& it : rank_to_node) {
    const auto& char_profile =
        &(pimpl_->sequence_profile[it->code * matrix_width]);

    std::uint32_t i = pimpl_->node_id_to_rank[it->id] + 1;
    std::uint32_t pred_i = it->inedges.empty() ?
0 : pimpl_->node_id_to_rank[it->inedges[0]->tail->id] + 1;

    std::int32_t* H_row = &(pimpl_->H[i * matrix_width]);
    std::int32_t* H_pred_row = &(pimpl_->H[pred_i * matrix_width]);

    std::int32_t* F_row = &(pimpl_->F[i * matrix_width]);
    std::int32_t* F_pred_row = &(pimpl_->F[pred_i * matrix_width]);

    // update F and H
    // F: open a gap from H (g_) or extend one from F (e_) in the predecessor
    // row; H starts with the diagonal move only
    for (std::uint64_t j = 1; j < matrix_width; ++j) {
      F_row[j] = std::max(
          H_pred_row[j] + g_,
          F_pred_row[j] + e_);
      H_row[j] = H_pred_row[j - 1] + char_profile[j];
    }
    // check other predecessors
    for (std::uint32_t p = 1; p < it->inedges.size(); ++p) {
      pred_i = pimpl_->node_id_to_rank[it->inedges[p]->tail->id] + 1;

      H_pred_row = &(pimpl_->H[pred_i * matrix_width]);
      F_pred_row = &(pimpl_->F[pred_i * matrix_width]);

      for (std::uint64_t j = 1; j < matrix_width; ++j) {
        F_row[j] = std::max(
            F_row[j],
            std::max(
                H_pred_row[j] + g_,
                F_pred_row[j] + e_));
        H_row[j] = std::max(
            H_row[j],
            H_pred_row[j - 1] + char_profile[j]);
      }
    }

    // update E and H
    // E depends on the finished left neighbour, so it is computed in a
    // separate left-to-right pass together with the final H
    std::int32_t* E_row = &(pimpl_->E[i * matrix_width]);
    for (std::uint64_t j = 1; j < matrix_width; ++j) {
      E_row[j] = std::max(H_row[j - 1] + g_, E_row[j - 1] + e_);
      H_row[j] = std::max(H_row[j], std::max(F_row[j], E_row[j]));

      if (type_ == AlignmentType::kSW) {
        // kSW clamps at zero and may end anywhere
        H_row[j] = std::max(H_row[j], 0);
        update_max_score(H_row, i, j);
      } else if (type_ == AlignmentType::kNW &&
          (it->outedges.empty() && j == matrix_width - 1)) {
        // kNW ends in the last column of a node without outgoing edges
        update_max_score(H_row, i, j);
      } else if (type_ == AlignmentType::kOV && (it->outedges.empty())) {
        // kOV ends in any column of a node without outgoing edges
        update_max_score(H_row, i, j);
      }
    }
  }

  if (max_i == 0 && max_j == 0) {
    return Alignment();
  }
  if (score) {
    *score = max_score;
  }

  // backtrack
  Alignment alignment;
  std::uint32_t i = max_i;
  std::uint32_t j = max_j;

  // each lambda returns true while the backtrack should continue
  auto sw_condition = [this, &i, &j, &matrix_width] () -> bool {
    return (pimpl_->H[i * matrix_width + j] == 0) ? false : true;
  };
  auto nw_condition = [&i, &j] () -> bool {
    return (i == 0 && j == 0) ? false : true;
  };
  auto ov_condition = [&i, &j] () -> bool {
    return (i == 0 || j == 0) ? false : true;
  };

  std::uint32_t prev_i = 0;
  std::uint32_t prev_j = 0;

  while ((type_ == AlignmentType::kSW && sw_condition()) ||
         (type_ == AlignmentType::kNW && nw_condition()) ||
         (type_ == AlignmentType::kOV && ov_condition())) {
    auto H_ij = pimpl_->H[i * matrix_width + j];
    bool predecessor_found = false, extend_left = false, extend_up = false;

    // 1) diagonal move from some predecessor row
    if (i != 0 && j != 0) {
      const auto& it = rank_to_node[i - 1];
      std::int32_t match_cost =
          pimpl_->sequence_profile[it->code * matrix_width + j];

      std::uint32_t pred_i = it->inedges.empty() ?
          0 : pimpl_->node_id_to_rank[it->inedges[0]->tail->id] + 1;

      if (H_ij == pimpl_->H[pred_i * matrix_width + (j - 1)] + match_cost) {
        prev_i = pred_i;
        prev_j = j - 1;
        predecessor_found = true;
      } else {
        for (std::uint32_t p = 1; p < it->inedges.size(); ++p) {
          pred_i = pimpl_->node_id_to_rank[it->inedges[p]->tail->id] + 1;

          if (H_ij == pimpl_->H[pred_i * matrix_width + (j - 1)] + match_cost) {
            prev_i = pred_i;
            prev_j = j - 1;
            predecessor_found = true;
            break;
          }
        }
      }
    }

    // 2) move from a predecessor row in the same column; extend_up is set
    //    when the score came from F + e_ (gap continues) rather than H + g_
    if (!predecessor_found && i != 0) {
      const auto& it = rank_to_node[i - 1];

      std::uint32_t pred_i = it->inedges.empty() ? 0 :
          pimpl_->node_id_to_rank[it->inedges[0]->tail->id] + 1;

      if ((extend_up = H_ij == pimpl_->F[pred_i * matrix_width + j] + e_) ||
                       H_ij == pimpl_->H[pred_i * matrix_width + j] + g_) {
        prev_i = pred_i;
        prev_j = j;
        predecessor_found = true;
      } else {
        for (std::uint32_t p = 1; p < it->inedges.size(); ++p) {
          pred_i = pimpl_->node_id_to_rank[it->inedges[p]->tail->id] + 1;

          if ((extend_up = H_ij == pimpl_->F[pred_i * matrix_width + j] + e_) ||
                           H_ij == pimpl_->H[pred_i * matrix_width + j] + g_) {
            prev_i = pred_i;
            prev_j = j;
            predecessor_found = true;
            break;
          }
        }
      }
    }

    // 3) move from the left; extend_left is set when the score came from
    //    E + e_ rather than H + g_
    if (!predecessor_found && j != 0) {
      if ((extend_left = H_ij == pimpl_->E[i * matrix_width + j - 1] + e_) ||
                         H_ij == pimpl_->H[i * matrix_width + j - 1] + g_) {
        prev_i = i;
        prev_j = j - 1;
        predecessor_found = true;
      }
    }

    // matrix column j corresponds to sequence position j - 1
    alignment.emplace_back(
        i == prev_i ? -1 : rank_to_node[i - 1]->id,
        j == prev_j ? -1 : j - 1);

    i = prev_i;
    j = prev_j;

    if (extend_left) {
      // emit gap columns until E stops being a pure extension
      while (true) {
        alignment.emplace_back(-1, j - 1);
        --j;
        if (pimpl_->E[i * matrix_width + j] + e_ !=
            pimpl_->E[i * matrix_width + j + 1]) {
          break;
        }
      }
    } else if (extend_up) {
      // emit gap rows; `stop` becomes true at the predecessor where the gap
      // was opened (F == H + g_)
      while (true) {
        bool stop = false;
        prev_i = 0;
        for (const auto& it : rank_to_node[i - 1]->inedges) {
          std::uint32_t pred_i = pimpl_->node_id_to_rank[it->tail->id] + 1;

          if ((stop = pimpl_->F[i * matrix_width + j] == pimpl_->H[pred_i * matrix_width + j] + g_) ||  // NOLINT
                      pimpl_->F[i * matrix_width + j] == pimpl_->F[pred_i * matrix_width + j] + e_) {  // NOLINT
            prev_i = pred_i;
            break;
          }
        }

        alignment.emplace_back(rank_to_node[i - 1]->id, -1);
        i = prev_i;
        if (stop || i == 0) {
          break;
        }
      }
    }
  }

  // entries were collected from the end of the path towards its start
  std::reverse(alignment.begin(), alignment.end());
  return alignment;
}

Alignment SisdAlignmentEngine::Convex(
    std::uint32_t sequence_len,
    const Graph& graph,
    std::int32_t* score) noexcept {
  std::uint64_t matrix_width = sequence_len + 1;
  const auto& rank_to_node = graph.rank_to_node();

  std::int32_t max_score = type_ == AlignmentType::kSW ? 0 : kNegativeInfinity;
  std::uint32_t max_i = 0;
  std::uint32_t max_j = 0;
  auto update_max_score = [&max_score, &max_i, &max_j] (
      std::int32_t* H_row,
      std::uint32_t i,
      std::uint32_t j) -> void {
    if (max_score < H_row[j]) {
      max_score = H_row[j];
      max_i = i;
      max_j = j;
    }
    return;
  };

  // alignment
  for (const auto& it : rank_to_node) {
    const auto& char_profile =
        &(pimpl_->sequence_profile[it->code * matrix_width]);

    std::uint32_t i = pimpl_->node_id_to_rank[it->id] + 1;
    std::uint32_t pred_i = it->inedges.empty() ?
0 : pimpl_->node_id_to_rank[it->inedges[0]->tail->id] + 1; std::int32_t* H_row = &(pimpl_->H[i * matrix_width]); std::int32_t* H_pred_row = &(pimpl_->H[pred_i * matrix_width]); std::int32_t* F_row = &(pimpl_->F[i * matrix_width]); std::int32_t* F_pred_row = &(pimpl_->F[pred_i * matrix_width]); std::int32_t* O_row = &(pimpl_->O[i * matrix_width]); std::int32_t* O_pred_row = &(pimpl_->O[pred_i * matrix_width]); // update F, O and H for (std::uint64_t j = 1; j < matrix_width; ++j) { F_row[j] = std::max(H_pred_row[j] + g_, F_pred_row[j] + e_); O_row[j] = std::max(H_pred_row[j] + q_, O_pred_row[j] + c_); H_row[j] = H_pred_row[j - 1] + char_profile[j]; } // check other predeccessors for (std::uint32_t p = 1; p < it->inedges.size(); ++p) { pred_i = pimpl_->node_id_to_rank[it->inedges[p]->tail->id] + 1; H_pred_row = &(pimpl_->H[pred_i * matrix_width]); F_pred_row = &(pimpl_->F[pred_i * matrix_width]); O_pred_row = &(pimpl_->O[pred_i * matrix_width]); for (std::uint64_t j = 1; j < matrix_width; ++j) { F_row[j] = std::max( F_row[j], std::max( H_pred_row[j] + g_, F_pred_row[j] + e_)); O_row[j] = std::max( O_row[j], std::max( H_pred_row[j] + q_, O_pred_row[j] + c_)); H_row[j] = std::max(H_row[j], H_pred_row[j - 1] + char_profile[j]); } } // update E, Q and H std::int32_t* E_row = &(pimpl_->E[i * matrix_width]); std::int32_t* Q_row = &(pimpl_->Q[i * matrix_width]); for (std::uint64_t j = 1; j < matrix_width; ++j) { E_row[j] = std::max(H_row[j - 1] + g_, E_row[j - 1] + e_); Q_row[j] = std::max(H_row[j - 1] + q_, Q_row[j - 1] + c_); H_row[j] = std::max( H_row[j], std::max( std::max(F_row[j], E_row[j]), std::max(O_row[j], Q_row[j]))); if (type_ == AlignmentType::kSW) { H_row[j] = std::max(H_row[j], 0); update_max_score(H_row, i, j); } else if (type_ == AlignmentType::kNW && (it->outedges.empty() && j == matrix_width - 1)) { update_max_score(H_row, i, j); } else if (type_ == AlignmentType::kOV && it->outedges.empty()) { update_max_score(H_row, i, j); } } } if (max_i == 0 && max_j 
== 0) { return Alignment(); } if (score) { *score = max_score; } // backtrack Alignment alignment; std::uint32_t i = max_i; std::uint32_t j = max_j; auto sw_condition = [this, &i, &j, &matrix_width] () -> bool { return (pimpl_->H[i * matrix_width + j] == 0) ? false : true; }; auto nw_condition = [&i, &j] () -> bool { return (i == 0 && j == 0) ? false : true; }; auto ov_condition = [&i, &j] () -> bool { return (i == 0 || j == 0) ? false : true; }; std::uint32_t prev_i = 0; std::uint32_t prev_j = 0; while ((type_ == AlignmentType::kSW && sw_condition()) || (type_ == AlignmentType::kNW && nw_condition()) || (type_ == AlignmentType::kOV && ov_condition())) { auto H_ij = pimpl_->H[i * matrix_width + j]; bool predecessor_found = false, extend_left = false, extend_up = false; if (i != 0 && j != 0) { const auto& it = rank_to_node[i - 1]; std::int32_t match_cost = pimpl_->sequence_profile[it->code * matrix_width + j]; std::uint32_t pred_i = it->inedges.empty() ? 0 : pimpl_->node_id_to_rank[it->inedges[0]->tail->id] + 1; if (H_ij == pimpl_->H[pred_i * matrix_width + (j - 1)] + match_cost) { prev_i = pred_i; prev_j = j - 1; predecessor_found = true; } else { for (std::uint32_t p = 1; p < it->inedges.size(); ++p) { pred_i = pimpl_->node_id_to_rank[it->inedges[p]->tail->id] + 1; if (H_ij == pimpl_->H[pred_i * matrix_width + (j - 1)] + match_cost) { prev_i = pred_i; prev_j = j - 1; predecessor_found = true; break; } } } } if (!predecessor_found && i != 0) { const auto& it = rank_to_node[i - 1]; std::uint32_t pred_i = it->inedges.empty() ? 
0 : pimpl_->node_id_to_rank[it->inedges[0]->tail->id] + 1; if ((extend_up |= H_ij == pimpl_->F[pred_i * matrix_width + j] + e_) || H_ij == pimpl_->H[pred_i * matrix_width + j] + g_ || (extend_up |= H_ij == pimpl_->O[pred_i * matrix_width + j] + c_) || H_ij == pimpl_->H[pred_i * matrix_width + j] + q_) { prev_i = pred_i; prev_j = j; predecessor_found = true; } else { for (std::uint32_t p = 1; p < it->inedges.size(); ++p) { pred_i = pimpl_->node_id_to_rank[it->inedges[p]->tail->id] + 1; if ((extend_up |= H_ij == pimpl_->F[pred_i * matrix_width + j] + e_) || // NOLINT H_ij == pimpl_->H[pred_i * matrix_width + j] + g_ || // NOLINT (extend_up |= H_ij == pimpl_->O[pred_i * matrix_width + j] + c_) || // NOLINT H_ij == pimpl_->H[pred_i * matrix_width + j] + q_) { prev_i = pred_i; prev_j = j; predecessor_found = true; break; } } } } if (!predecessor_found && j != 0) { if ((extend_left |= H_ij == pimpl_->E[i * matrix_width + j - 1] + e_) || H_ij == pimpl_->H[i * matrix_width + j - 1] + g_ || (extend_left |= H_ij == pimpl_->Q[i * matrix_width + j - 1] + c_) || H_ij == pimpl_->H[i * matrix_width + j - 1] + q_) { prev_i = i; prev_j = j - 1; predecessor_found = true; } } alignment.emplace_back( i == prev_i ? -1 : rank_to_node[i - 1]->id, j == prev_j ? 
-1 : j - 1); i = prev_i; j = prev_j; if (extend_left) { while (true) { alignment.emplace_back(-1, j - 1); --j; if (pimpl_->E[i * matrix_width + j] + e_ != pimpl_->E[i * matrix_width + j + 1] && // NOLINT pimpl_->Q[i * matrix_width + j] + c_ != pimpl_->Q[i * matrix_width + j + 1]) { // NOLINT break; } } } else if (extend_up) { while (true) { bool stop = true; prev_i = 0; for (const auto& it : rank_to_node[i - 1]->inedges) { std::uint32_t pred_i = pimpl_->node_id_to_rank[it->tail->id] + 1; if (pimpl_->F[i * matrix_width + j] == pimpl_->F[pred_i * matrix_width + j] + e_ || // NOLINT pimpl_->O[i * matrix_width + j] == pimpl_->O[pred_i * matrix_width + j] + c_) { // NOLINT prev_i = pred_i; stop = false; break; } } if (stop == true) { for (const auto& it : rank_to_node[i - 1]->inedges) { std::uint32_t pred_i = pimpl_->node_id_to_rank[it->tail->id] + 1; if (pimpl_->F[i * matrix_width + j] == pimpl_->H[pred_i * matrix_width + j] + g_ || // NOLINT pimpl_->O[i * matrix_width + j] == pimpl_->H[pred_i * matrix_width + j] + q_) { // NOLINT prev_i = pred_i; break; } } } alignment.emplace_back(rank_to_node[i - 1]->id, -1); i = prev_i; if (stop || i == 0) { break; } } } } std::reverse(alignment.begin(), alignment.end()); return alignment; } } // namespace spoa spoa-4.0.8/src/sisd_alignment_engine.hpp000066400000000000000000000037171400776337600203370ustar00rootroot00000000000000// Copyright (c) 2020 Robert Vaser #ifndef SISD_ALIGNMENT_ENGINE_HPP_ #define SISD_ALIGNMENT_ENGINE_HPP_ #include #include #include #include #include "spoa/alignment_engine.hpp" namespace spoa { class SisdAlignmentEngine: public AlignmentEngine { public: SisdAlignmentEngine(const SisdAlignmentEngine&) = delete; SisdAlignmentEngine& operator=(const SisdAlignmentEngine&) = delete; SisdAlignmentEngine(SisdAlignmentEngine&&) = default; SisdAlignmentEngine& operator=(SisdAlignmentEngine&&) = default; ~SisdAlignmentEngine() = default; static std::unique_ptr Create( AlignmentType type, AlignmentSubtype subtype, 
std::int8_t m, std::int8_t n, std::int8_t g, std::int8_t e, std::int8_t q, std::int8_t c); void Prealloc( std::uint32_t max_sequence_len, std::uint8_t alphabet_size) override; Alignment Align( const char* sequence, std::uint32_t sequence_len, const Graph& graph, std::int32_t* score) override; private: SisdAlignmentEngine( AlignmentType type, AlignmentSubtype subtype, std::int8_t m, std::int8_t n, std::int8_t g, std::int8_t e, std::int8_t q, std::int8_t c); Alignment Linear( std::uint32_t sequence_len, const Graph& graph, std::int32_t* score) noexcept; Alignment Affine( std::uint32_t sequence_len, const Graph& graph, std::int32_t* score) noexcept; Alignment Convex( std::uint32_t sequence_len, const Graph& graph, std::int32_t* score) noexcept; void Realloc( std::uint64_t matrix_width, std::uint64_t matrix_height, std::uint8_t num_codes); void Initialize( const char* sequence, std::uint32_t sequence_len, const Graph& graph) noexcept; struct Implementation; std::unique_ptr pimpl_; }; } // namespace spoa #endif // SISD_ALIGNMENT_ENGINE_HPP_ spoa-4.0.8/test/000077500000000000000000000000001400776337600134615ustar00rootroot00000000000000spoa-4.0.8/test/data/000077500000000000000000000000001400776337600143725ustar00rootroot00000000000000spoa-4.0.8/test/data/sample.fastq.gz000066400000000000000000000353451400776337600173440ustar00rootroot00000000000000@Eb_sample.fastq}[ȮbJ1+?-pw(zd1]%Q$EÁU?my[uٶmm~}^W{YyY<϶,Kkf;7cdavjG }Ê׆ ҎslYPnf V;u^c탙wgWڵ.vFvbjmv{Å:w<2| 9ٍ,F~pUvݣ=kZp+OWϋogBf=>~>~3A- 3Cs] u C^x6m3}qDY.{\` 5j,`~G 9s& Wȿȷ[fLD@{̀F^ \xq¬$mNnoGa51ms𮡾ԒXk-c朓w)SɩZ"Q{߶?M%G.Il mjbK}\RN}>hrh>o pjSn)mC{5жX;_;~h{rJҶ gƙ;>eE{6-Kۻab^>ۃ(|vӞBpk|j~<_+|<0Sn=jCp4/^5}nnn ~~ }b-@13[jX ; < wdݯ,;X[I1{`c0R RNQ=";a}mml7sZO6Rpǯ~ey`y}w0S(>7!ّ{$;  g|cM$+y cXmlTFT/˜M6 8{W0qtjc;YjpdiRF& 6\T/,9jV{9"[l,0#&v+^` /Hpٚ3EExmk4 mb1}Xg`dO$l Qo0/EWx@xxf3&fDnk|kNHsbMAu ;0'?q֨W0^CZ<- y`g֫N@ge/aj:c;S, qv(V m!ƱE@< Gyp*#,a65?=ž, ā9+(A`T&T璟oC+N H˙` 
#0,ۘmmms.ižӲ5b8eX}ڧ;,lfO2v_sr[H?"{Oޛ7KV'>pϺ7ſR[ݩ>^@@b!gE_`~Fc`AG0͛`6#a@mr ơ` }b`q/m a.GCO'P5\ qhqt?0fq_2lбl4^Gk6;W4 Op4/þ$>ud" j5bvK=(0E{!_}q5͏>Lcl A` 1*#d Gl$T4*GvP|Ό:"9ro;.@KbM plf:0ZVt@ $"A!u'r H  [R+I旁bi|b6sNI@LܼL` 𢁨^}~%~?ś|S;O}NɸDi$J8E~ti6Vǭ-rpnGy(zeWӶY|qw7 sm=ʀ:`u5:jBr]v).ip◎rh0WW`t7.bؙhL1c_D,h㲍vK%fH&b&H!xn @HѼtn.r Sy-+Xu+|BN` _q!sV$ ^r5܎:'b?5 XW@ CdDSlg434~~G{}2>?>çh i6Am'v<ь ~5D6\m4|D^)a) [rC`;|8}qd{SH6=!rQvRyt"{ĸ^ >l~b8;,SBƳ;J=㨞0͈+8Ȏ+L@|C)B۔D3)Xvv43O39*]\E|;Ey'& ṅN¦jSW shA5ٵs1rr'YdY!)lVG&3R25XDԃw3`.5levc.1$ Um߂vwSCކ+CGye^3} n \v΀-*kj*?0"ais`M7H$Hഄ g59VEĒь+H?'%w"/nc;2zx>d1yeJCHOt%N1v3΋ sDUsA2-p* XoN ,f89nM5~z@_]tk3X-;$x/v.!nn]>DBO.H7W[̉J.\j B`ܣtY5/k`tN-DWӤڨaO^,ӑRH!aO|e6* > QhQł^.[ *q)9^X/nG"?*M0,%55"`6x4X6{d``qI)@I%Gp;8HXkcNRC*h'c6@1$iX3mRGzmeaЏ80mHZn,#Z=l[dPb@[ѷCǗJm969uޛ'm$SdaP}Ld+@#!} L/p0W6^Yì+{dw# ?725uۍ'^ \\QJݞ# )s`fmh?uY ,W:=6"'G{ڡ >ȝ2!@JTQ-):4f"ևVa|KdA4[)Ta0PMj$D򅐺Z^d$ 0dPMyu mV\bD5%j%}Z9)X:zdEF޿zYU=J-137^Txd!mѕ͂ÍڌlϜ3ܢfzW)n`[F4;"$iI-M,$Q (GP"3_k!&=nHSH' ]O09|v_Nu8D$B;RTI3 \r/͐B+X ]ȅ2scå+uAXJp>jgN6y,K*@~mQ!H e 2ODۍپp*8ȁI ?2܏Gv{2H#!o^ɌY&w:,>4]mnH,+x5.I,:<'KEF$PJɠsR$T"!\$+#*sAڰL2S]Vfz22$˄h>(Ye5#U5oyDEzP!$$QI>df,LFnM4 VıOj6L LϨPٶG:7H-pFm!aJ\5I"P ,~~% ۥ.EVޣg8͓q2 *ixj>4IД.؇؂T $W-eĿ5M_ƅ- VDJCW%nT5*\.%=NTr'8 G0.}Y>]B\ 2'GvlctFQWUġ DjN~syscwVSJ3[ jFѴ5`%M.ګgY n{u Dwnu<ۋ\ %% Ӡ#Nj/0߫~q]1o;C,;<דּH2)\8|z}{cSxe Vaf\-IK|O_BR=ENM8|?NIO~J W^ӟ%+jhS9`CtɟjH WmP`B xac؁Kr$M+#튐=AVQ T)$JgcGpVjb1qHqoA\׹63N&4MgŁ,o2}$G?>8.@l4%I̘%Jԙdp$[0ƚ"l~j.=;h|Qj #FdTBᙛp 3G Ft[BO=X5Za,׽~a f O9/;1pc@0|Awt;#_YuZَi1goVnbՍ-7![] zfn-Lc2c$WxYpˉ0XC2+}dܠ^aά dȀg'EXN2,04^ZIj̨H\)JfwU 5#]roRߞsGR26{t cQ( p<āŤbfs'輶k{:Ty]-=n/==7}s]=6rbro-!{\$)B* t5t;wq͞6vctQeCZj`W)3P5ߑ6dc[8M)"B S+"ʘbr)hQjh%_1zhFEh50$ d7Tg +VXBBg>|Ғ@lM ;Z&MM^sUղ)HU[]*dVnՕkY$xbw^Z˝\dHFv2QKwJCj7w\fvcx˕um u:`޿PTಧ:In8$eqʔ.!u;qD{9mЫޛY_>Y Ai}7>bo V4;˵:1=oFWvY&u}d;zSeW&)h¶lxEƲ$JhFP6bxd/ C>Č3 Kbi-.1]* nmnOcmlPdbr(' (2= -&Yd 3řl>M@Qd٦&.B6yC ֊1h@G+cZ#UdiRV^y_?HEAײyIjL*)(Т_kܴ0;de^ ҬFtfRё>hMcf lJ1XPz@dwuSj#.` 
PF}?Pm>%)W0̙eđ".a3Cr/aL#u$P-@p@igCz;$tBd]ay+ 0'$/+GqŀAcʛŵߵݜ&eW5ż!4Nc /a^K]#t7f9ìSSX*FIl\2+4 ,{-P{YKH"(팒Z(ߕ*˘A=ʤRcJu6Oo}r6D\M06:*'QOjSn!ן񮔰NT5ٮ:$1;Ơ}?HS<+KA2zM`r]::u1˃z]k[_z~{~+./~,󥌊{*hom/F@{νsB1Y^DwҧQC0L$#IݡY)iVXh3/v2߾RW\S H RVԋ1hb xG`,VhlzM/k=UQ-$g*YĬԺpU|^[msg6x2p J#dd+O W%axRdw e6S$Y_D`ڑm"iLƋbokLdwÌ2o%yZIesRn% 1a Y?.T7b3Y YMiŨb[dLGUO^,~#LR:;ŏIRi =21,0 0cPbٙ w3CEF zL9W[@-6Z.R>mUEkRb7$˦bV[+^EɏQNV%A^#Rݽwa%k6|[o)&̮讗Nr6>mZu&]%l$UTkGV PL/Q.~˼X]01Fo*ZMXaҟԸ#*c! ҧid~gu3 ]L/.rF̩Yh]gS&#C6@ {Hg>-@gG(ZJ*5wW#q`!ɮz:vNm~{- 'eV2TvJed\`qB ײ^0y}@ y>jo,2Yr6x~nhT0ژHi0a6"!z˃ԊVZݽ]ƈ2Lg: ZZϚF(zf(^TS4H!qdILB%@D9QfF8j|EYn(.̢X:93Ԅ4z 9ZEP&sXFl(ζ5:&84|퇶:׏7]e|{Wyy"O94;?y:ܽC-n/]p^upbIMP ט]qdG7#D;  `V1`S%?%z/䠝NU`pAa_fY_e*WBY_J3EꊫsgEj=[g GrĺI)!em+BŕY&` )艤s&H L.9h%̅>4n?l]J^#I|XADȔ.|%C%لfPKwռ~b)a0H2=!abr!vApwxpXob6DY'TyѨ ɮԥ+o4WާsQ8+_x8OԣB7Gδדpjf~vdBY9ʘ1Ɖz&>Bο^}\at=H s$Jf{Ի:dZ!kܽvӨ vNF傼MWWbs*ن91C\<Ć| WvqdXڃuSzk;w@=rҘ4|[{ϳ˽eOb+JSTI$5ZV'i zTIWbR IJr+JCPkAŜaHc =prAM [D25voar 9]3y+-jAj&K1_G)t?/tR;eOK Jble afj.R( \/cNJ38=Tx1e=)XASu<]tYrIk{@@kg5L!NODEq0Ip^ *G`>0o*^tg476;aɪU=${CyҮ$8#{N-=zH&.0e6Wrߦ=+mW8R((pDœAKwrX0TC|hxFHQ-%-JXwFzH6U!ll(xAҪ8!P^d CˮDA$Jc(MD.U0QMwSrS]{"YLTA6:FF̒l1jxv$Nz;.XݞZ!՛#A=w^l;.&u]zM q2:M^8^k(j^ =0 {@rtɕسrBdqG\'꾉NRЋTqf)`o~N,h\ӉaTa[6ڜP/S`X]ư_Bo/3օd{["NIvگe<[%N/[2aUYt =ϴa˗Q8eko 1+] `]PBs} Ty'WW8"k+η`Q'*gգ@ŠNXceEe@Fr>G\Phzsmfh jtF ;eP  ] .f,/OUbgf. ne  )LD,nZΫ0Ex9[*%Ҁ*pDCɞd*$fZZ:=FF+Ԡw k)S>AE*{\)׷w5Zylsj! D2/Tc|{Ӌ>89ggg87zed=hk 4ĠO6ͅ-nqNj9nA$yf-zf$pFTZ'_L]?*0ukJƔkX c25ؙnNp+6. 
l^NͻGU/\m^O@WZ#?Z  ^°w/IG cL3'__Y*P΁ mr߶]JaQZ -zso*fVl-y|j$sG]z3`]R/E҇5 䭵,g%dZm4$*3iI:H|kr[BG15"_lQ2x("Z^7?’ 6= b42 ko!eud*+4}}1qX{L<ha܆ҌB,+4ħ"qSXF% :^ Kh@{TD"r,XEu0\^YZᭆwftܞq=|g6E^ .oy=スpUڻh:2JGUi7,Z`:H1ɞGV!&&L @g̢QMl^pbgfN~]3Nĕ^ZTK&,Vh+Z**z!Jt#9ymdz=NDK(*br]( wN=p!#)bd%Di#}ԝ0f!\Y(~~_[[)#J'~nW9?&.wch$ nE Ҏqej)'6%f)9; 2Zr/>*ke1hE\^,Щ9qK=Fϗ%QdƑ9h;\"+0jxtYu#= +Rɢ]ZŒUL"l׳b.xޭ ?wPtRn{g}HH.غG#ܮ o.;c]ԆP-_a4>hxjtj{ö9R}AF6B/KBqXfVp&U 'nnf`On0 Cvб^fngzft Zs%Q=Z* X,]P #\%@7DJboRiCң.x"OT]\̰C6ĖFLY \ӫ4t߯ ߅t_ [cw7VXᩖōoAU rf_q{o}=vx -m&@zádfz0T֪p-ȚǫBqIsԨ(ήdS H ]Y,oY:U`5Z S5+ŅE+ࢥn(WfmzN#+y>4GvQ;.c ؕIV,} ܷrRK7n%3 v@]qA=HTu$j}ݨ8X+@3ٓB>^. csS jPh25%oa6 {dwٱvT@V)tFH:uRԟƃd\-& 0!N9nrŁR?TeQUIz- N+H![߿ѿk~cv,~!Gfmy.  iQ2HÈזW/O}@;;zGcyO7DyUr3<&j*Z6DDX^Q +:NIy`8RlD48Bʾl!Q\i8HRe"Kz2ɠ|,q=,.x]fQ,h56B&V4<Ӝ6S)Jּpdnu=4&֥n܏'?_=gR+)ێZi$w='&yy/m1LHK!ҫ`74Pޖw^J)F./:fX9I Y# [5[" E oS@< %' \|Zey?X[5FZQ-"5'=-3 4GV+dhK5e ̜>*Js 8@pcAJXKY_Q;Xb#F&R+76]⼊Uh) XGC9o(z])_r1;`„1Q!V=uGP5 #ʚU]2/;pTѽv3vŵŞ]j6,o4L$m`o&׍{XpE$?O0DD2G]iPhN'r1x:@[=-NjӍ&yFO3ݒ+Gyiir_אj/UAS (ܺ؛ƛL,tx͍(ρpI<./.w q c6E|[l/&nke4pax lxQq&TK*`{><`sSHr pGW eETyds9wZifl#DO0#Wlb1k ɻ5$ }l[ 0HyѠԑzR̄Ax1:Aʉ"TA*`kئ qU,= ʼ r DKc!??~i ɅϏO_'_4_\33:`ҭ@Փ@z> o~ y+{zEwJHsMa@&^f፭  H"Z +/h롏ʀ' U'+bž"bNUўQ_lz Ś)cp!.+#Ъ$kp+ġA7Xq!IR 6Otf0Qд$E2-XᤚP]T&2YUXL"![͈L|t*a XV=^5= )o Y#puXj]X">#Fjdh\nCQpSJc(}_6Yc״'.;A}ʍFfspoa-4.0.8/test/spoa_test.cpp000066400000000000000000000373751400776337600162050ustar00rootroot00000000000000// Copyright (c) 2020 Robert Vaser #include #include "bioparser/fastq_parser.hpp" #include "biosoup/sequence.hpp" #ifdef SPOA_USE_CEREAL #include "cereal/archives/binary.hpp" #endif #include "gtest/gtest.h" #include "spoa/spoa.hpp" std::atomic biosoup::Sequence::num_objects{0}; namespace spoa { namespace test { class SpoaTest: public ::testing::Test { public: void Setup( AlignmentType type, std::int8_t m, std::int8_t n, std::int8_t g, std::int8_t e, std::int8_t q, std::int8_t c, bool quality) { auto p = bioparser::Parser::Create(TEST_DATA); // NOLINT s = 
p->Parse(-1); EXPECT_EQ(55, s.size()); ae = AlignmentEngine::Create(type, m, n, g, e, q, c); gr = Graph(); iq = quality; } void Align() { std::size_t ms = 0; for (const auto& it : s) { ms = std::max(ms, it->data.size()); } ae->Prealloc(ms, 4); for (const auto& it : s) { auto a = ae->Align(it->data, gr); if (iq) { gr.AddAlignment(a, it->data, it->quality); } else { gr.AddAlignment(a, it->data); } } } void Check(const std::string& c) { EXPECT_EQ(c, gr.GenerateConsensus()); auto msa = gr.GenerateMultipleSequenceAlignment(); EXPECT_EQ(s.size(), msa.size()); std::size_t rs = msa.front().size(); std::vector gc(rs, 0); for (const auto& it : msa) { EXPECT_EQ(rs, it.size()); for (std::size_t i = 0; i < rs; ++i) { gc[i] += it[i] == '-' ? 1 : 0; } } for (const auto& it : gc) { EXPECT_GT(msa.size(), it); } for (std::uint32_t i = 0; i < msa.size(); ++i) { msa[i].erase(std::remove(msa[i].begin(), msa[i].end(), '-'), msa[i].end()); // NOLINT EXPECT_EQ(msa[i], s[i]->data); } } std::vector> s; std::unique_ptr ae; Graph gr; bool iq; }; TEST(SpoaAlignmentTest, TypeError) { try { auto ae = AlignmentEngine::Create(static_cast(4), 1, -1, -1); } catch(std::invalid_argument& exception) { EXPECT_STREQ( exception.what(), "[spoa::AlignmentEngine::Create] error: invalid alignment type!"); } } TEST(SpoaAlignmentTest, EmptyInput) { auto ae = AlignmentEngine::Create(AlignmentType::kSW, 1, -1, -1); Graph g{}; auto a = ae->Align("", g); EXPECT_TRUE(a.empty()); } TEST(SpoaAlignmentTest, LargeInput) { auto ae = AlignmentEngine::Create(AlignmentType::kSW, 1, -1, -1); try { ae->Prealloc(-1, 1); } catch (std::invalid_argument& exception) { EXPECT_EQ( std::string(exception.what()).substr(11), "AlignmentEngine::Prealloc] error: too large sequence!"); } try { ae->Prealloc((1ULL << 31) - 1, -1); } catch (std::invalid_argument& exception) { EXPECT_EQ( std::string(exception.what()).substr(11), "AlignmentEngine::Prealloc] error: insufficient memory!"); } } TEST_F(SpoaTest, Clear) { Setup(AlignmentType::kSW, 
5, -4, -8, -8, -8, -8, false); Align(); auto c = gr.GenerateConsensus(); gr.Clear(); Align(); EXPECT_EQ(c, gr.GenerateConsensus()); } #ifdef SPOA_USE_CEREAL TEST_F(SpoaTest, Archive) { Setup(AlignmentType::kNW, 2, -5, -2, -2, -2, -2, true); { std::ofstream os("spoa.test.cereal"); cereal::BinaryOutputArchive archive(os); archive(gr); } auto c = gr.GenerateConsensus(); gr = {}; { std::ifstream is("spoa.test.cereal"); cereal::BinaryInputArchive archive(is); archive(gr); } EXPECT_EQ(c, gr.GenerateConsensus()); } #endif TEST_F(SpoaTest, Local) { Setup(AlignmentType::kSW, 5, -4, -8, -8, -8, -8, false); Align(); std::string c = "AATGATGCGCTTTGTTGGCGCGGTGGCTTGATGCAGGGGCTAATCGACCTCTGGCAACCACTTTTCCATGAC" "AGGAGTTGAATATGGCATTCAGTAATCCCTTCGATGATCCGCAGGGAGCGTTTTACATATTGCGCAATGCGC" "AGGGGCAATTCAGTCTGTGGCCGCAACAATGCGTCTTACCGGCAGGCTGGGACATTGTGTGTCAGCCGCAGT" "CACAGGCGTCCTGCCAGCAGTGGCTGGAAGCCCACTGGCGTACTCTGACACCGACGAATTTTACCCAGTTGC" "AGGAGGCACAATGAGCCAGCATTTACCTTTGGTCGCCGCACAGCCCGGCATCTGGATGGCAGAAAAACTGTC" "AGAATTACCCTCCGCCTGGAGCGTGGCGCATTACGTTGAGTTAACCGGAGAGGTTGATTCGCCATTACTGGC" "CCGCGCGGTGGTTGCCGGACTAGCGCAAGCAGATACGCTTTACACGCGCAACCAAGGATTTCGG"; Check(c); } TEST_F(SpoaTest, LocalAffine) { Setup(AlignmentType::kSW, 5, -4, -8, -6, -8, -6, false); Align(); std::string c = "AATGATGCGCTTTGTTGGCGCGGTGGCTTGATGCAGGGGCTAATCGACCTCTGGCAACCACTTTTCCATGAC" "AGGAGTTGAATATGGCATTCAGTAATCCCTTCGATGATCCGCAGGGAGCGTTTTACATATTGCGCAATGCGC" "AGGGGCAATTCAGTCTGTGGCCGCAACAATGCGTCTTACCGGCAGGCTGGGACATTGTGTGTCAGCCGCAGT" "CACAGGCGTCCTGCCAGCAGTGGCTGGAAGCCCACTGGCGTACTCTGACACCGACGAATTTTACCCAGTTGC" "AGGAGGCACAATGAGCCAGCATTTACCTTTGGTCGCCGCACAGCCCGGCATCTGGATGGCAGAAAAACTGTC" "AGAATTACCCTCCGCCTGGAGCGTGGCGCATTACGTTGAGTTAACCGGAGAGGTTGATTCGCCATTACTGGC" "CCGCGCGGTGGTTGCCGGACTAGCGCAAGCAGATACGCTTTACACGCGCAACCAAGGATTTCGG"; Check(c); } TEST_F(SpoaTest, LocalConvex) { Setup(AlignmentType::kSW, 5, -4, -8, -6, -10, -2, false); Align(); std::string c = "AATGATGCGCTTTGTTGGCGCGGTGGCTTGATGCAGGGGCTAATCGACCTCTGGCAACCACTTTTCCATGAC" 
"AGGAGTTGAATATGGCATTCAGTAATCCCTTCGATGATCCGCAGGGAGCGTTTTACATATTGCGCAATGCGC" "AGGGGCAATTCAGTCTGTGGCCGCAACAATGCGTCTTACCGGCAGGCTGGGACATTGTGTGTCAGCCGCAGT" "CACAGGCGTCCTGCCAGCAGTGGCTGGAAGCCCACTGGCGTACTCTGACACCGACGAATTTTACCCAGTTGC" "AGGGAGGCACAATGAGCCAGCATTTACCTTTGGTCGCCGCACAGCCCGGCATCTGGATGGCAGAAAAACTGT" "CAGAATTACCCTCCGCCTGGAGCGTGGCGCATTACGTTGAGTTAACCGGAGAGGTTGATTCGCCATTACTGG" "CCCGCGCGGTGGTTGCCGGACTAGCGCAAAGCAGATACGCTG"; Check(c); } TEST_F(SpoaTest, LocalWithQualities) { Setup(AlignmentType::kSW, 5, -4, -8, -8, -8, -8, true); Align(); std::string c = "AATGATGCGCTTTGTTGGCGCGGTGGCTTGATGCAGGGGCTAATCGACCTCTGGCAACCACTTTTCCATGAC" "AGGAGTTGAATATGGCATTCAGTAATCCCTTCGATGATCCGCAGGGAGCGTTTTACATATTGCGCAATGCGC" "AGGGGCAATTCAGTCTGTGGCCGCAACAATGCGTCTTACCGGCAGGCTGGGACATTGTGTGTCAGCCGCAGT" "CACAGGCGTCCTGCCAGCAGTGGCTGGAAGCCCACTGGCGTACTCTGACACCGACGAATTTTACCCAGTTGC" "AGGAGGCACAATGAGCCAGCATTTACCTTTGGTCGCCGCACAGCCCGGCATCTGGATGGCAGAAAAACTGTC" "AGAATTACCCTCCGCCTGGAGCGTGGCGCATTACGTTGAGTTAACCGGAGAGGTTGATTCGCCATTACTGGC" "CCGCGCGGTGGTTGCCGGACTAGCGCAAGCAGATACGCTTTACACGCGCAACCAAGGATTTCGG"; Check(c); } TEST_F(SpoaTest, LocalAffineWithQualities) { Setup(AlignmentType::kSW, 5, -4, -8, -6, -8, -6, true); Align(); std::string c = "AATGATGCGCTTTGTTGGCGCGGTGGCTTGATGCAGGGGCTAATCGACCTCTGGCAACCACTTTTCCATGAC" "AGGAGTTGAATATGGCATTCAGTAATCCCTTCGATGATCCGCAGGGAGCGTTTTACATATTGCGCAATGCGC" "AGGGGCAATTCAGTCTGTGGCCGCAACAATGCGTCTTACCGGCAGGCTGGGACATTGTGTGTCAGCCGCAGT" "CACAGGCGTCCTGCCAGCAGTGGCTGGAAGCCCACTGGCGTACTCTGACACCGACGAATTTTACCCAGTTGC" "AGGAGGCACAATGAGCCAGCATTTACCTTTGGTCGCCGCACAGCCCGGCATCTGGATGGCAGAAAAACTGTC" "AGAATTACCCTCCGCCTGGAGCGTGGCGCATTACGTTGAGTTAACCGGAGAGGTTGATTCGCCATTACTGGC" "CCGCGCGGTGGTTGCCGGACTAGCGCAAGCAGATACGCTTTACACGCGCAACCAAGGATTTCGG"; Check(c); } TEST_F(SpoaTest, LocalConvexWithQualities) { Setup(AlignmentType::kSW, 5, -4, -8, -6, -10, -2, true); Align(); std::string c = "AATGATGCGCTTTGTTGGCGCGGTGGCTTGATGCAGGGGCTAATCGACCTCTGGCAACCACTTTTCCATGAC" 
"AGGAGTTGAATATGGCATTCAGTAATCCCTTCGATGATCCGCAGGGAGCGTTTTACATATTGCGCAATGCGC" "AGGGGCAATTCAGTCTGTGGCCGCAACAATGCGTCTTACCGGCAGGCTGGGACATTGTGTGTCAGCCGCAGT" "CACAGGCGTCCTGCCAGCAGTGGCTGGAAGCCCACTGGCGTACTCTGACACCGACGAATTTTACCCAGTTGC" "AGGGAGGCACAATGAGCCAGCATTTACCTTTGGTCGCCGCACAGCCCGGCATCTGGATGGCAGAAAAACTGT" "CAGAATTACCCTCCGCCTGGAGCGTGGCGCATTACGTTGAGTTAACCGGAGAGGTTGATTCGCCATTACTGG" "CCCGCGCGGTGGTTGCCGGACTAGCGCAAGCAGATACGCTG"; Check(c); } TEST_F(SpoaTest, Global) { Setup(AlignmentType::kNW, 5, -4, -8, -8, -8, -8, false); Align(); std::string c = "ATGATGCGCTTTGTTGGCGCGGTGGCTTGATGCAGGGGCTAATCGACCTCTGGCAACCACTTTTCCATGACA" "GGAGTTGAATATGGCATTCAGTAATCCCTTCGATGATCCGCAGGGAGCGTTTTACATATTGCGCAATGCGCA" "GGGGCAATTCAGTCTGTGGCCGCAACAATGCGTCTTACCGGCAGGCTGGGACATTGTGTGTCAGCCGCAGTC" "ACAGGCGTCCTGCCAGCAGTGGCTGGAAGCCCACTGGCGTACTCTGACACCGACGAATTTTACCCAGTTGCA" "GGAGGCACAATGAGCCAGCATTTACCTTTGGTCGCCGCACAGCCCGGCATCTGGATGGCAGAAAAACTGTCA" "GAATTACCCTCCGCCTGGAGCGTGGCGCATTACGTTGAGTTAACCGGAGAGGTTGATTCGCCATTACTGGCC" "CGCGCGGTGGTTGCCGGACTAGCGCAAGCAGATACGC"; Check(c); } TEST_F(SpoaTest, GlobalAffine) { Setup(AlignmentType::kNW, 5, -4, -8, -6, -8, -6, false); Align(); std::string c = "ATGATGCGCTTTGTTGGCGCGGTGGCTTGATGCAGGGGCTAATCGACCTCTGGCAACCACTTTTCCATGACA" "GGAGTTGAATATGGCATTCAGTAATCCCTTCGATGATCCGCAGGGAGCGTTTTACATATTGCGCAATGCGCA" "GGGGCAATTCAGTCTGTGGCCGCAACAATGCGTCTTACCGGCAGGCTGGGACATTGTGTGTCAGCCGCAGTC" "ACAGGCGTCCTGCCAGCAGTGGCTGGAAGCCCACTGGCGTACTCTGACACCGACGAATTTTACCCAGTTGCA" "GGAGGCACAATGAGCCAGCATTTACCTTTGGTCGCCGCACAGCCCGGCATCTGGATGGCAGAAAAACTGTCA" "GAATTACCCTCCGCCTGGAGCGTGGCGCATTACGTTGAGTTAACCGGAGAGGTTGATTCGCCATTACTGGCC" "CGCGCGGTGGTTGCCGGACTAGCGCAAGCAGATACGC"; Check(c); } TEST_F(SpoaTest, GlobalConvex) { Setup(AlignmentType::kNW, 5, -4, -8, -6, -10, -2, false); Align(); std::string c = "ATGATGCGCTTTGTTGGCGCGGTGGCTTGATGCAGGGGCTAATCGACCTCTGGCAACCACTTTTCCATGACA" "GGAGTTGAATATGGCATTCAGTAATCCCTTCGATGATCCGCAGGGAGCGTTTTACATATTGCGCAATGCGCA" 
"GGGGCAATTCAGTCTGTGGCCGCAACAATGCGTCTTACCGGCAGGCTGGGACATTGTGTGTCAGCCGCAGTC" "ACAGGCGTCCTGCCAGCAGTGGCTGGAAGCCCACTGGCGTACTCTGACACCGACGAATTTTACCCAGTTGCA" "GGGAGGCACAATGAGCCAGCATTTACCTTTGGTCGCCGCACAGCCCGGCATCTGGATGGCAGAAAAACTGTC" "AGAATTACCCTCCGCCTGGAGCGTGGCGCATTACGTTGAGTTAACCGGAGAGGTTGATTCGCCATTACTGGC" "CCGCGCGGTGGTTGCCGGACTAGCGCAAAGCAGATACGC"; Check(c); } TEST_F(SpoaTest, GlobalWithQualities) { Setup(AlignmentType::kNW, 5, -4, -8, -8, -8, -8, true); Align(); std::string c = "ATGATGCGCTTTGTTGGCGCGGTGGCTTGATGCAGGGGCTAATCGACCTCTGGCAACCACTTTTCCATGACA" "GGAGTTGAATATGGCATTCAGTAATCCCTTCGATGATCCGCAGGGAGCGTTTTACATATTGCGCAATGCGCA" "GGGGCAATTCAGTCTGTGGCCGCAACAATGCGTCTTACCGGCAGGCTGGGACATTGTGTGTCAGCCGCAGTC" "ACAGGCGTCCTGCCAGCAGTGGCTGGAAGCCCACTGGCGTACTCTGACACCGACGAATTTTACCCAGTTGCA" "GGAGGCACAATGAGCCAGCATTTACCTTTGGTCGCCGCACAGCCCGGCATCTGGATGGCAGAAAAACTGTCA" "GAATTACCCTCCGCCTGGAGCGTGGCGCATTACGTTGAGTTAACCGGAGAGGTTGATTCGCCATTACTGGCC" "CGCGCGGTGGTTGCCGGACTAGCGCAAGCAGATACGC"; Check(c); } TEST_F(SpoaTest, GlobalAffineWithQualities) { Setup(AlignmentType::kNW, 5, -4, -8, -6, -8, -6, true); Align(); std::string c = "ATGATGCGCTTTGTTGGCGCGGTGGCTTGATGCAGGGGCTAATCGACCTCTGGCAACCACTTTTCCATGACA" "GGAGTTGAATATGGCATTCAGTAATCCCTTCGATGATCCGCAGGGAGCGTTTTACATATTGCGCAATGCGCA" "GGGGCAATTCAGTCTGTGGCCGCAACAATGCGTCTTACCGGCAGGCTGGGACATTGTGTGTCAGCCGCAGTC" "ACAGGCGTCCTGCCAGCAGTGGCTGGAAGCCCACTGGCGTACTCTGACACCGACGAATTTTACCCAGTTGCA" "GGAGGCACAATGAGCCAGCATTTACCTTTGGTCGCCGCACAGCCCGGCATCTGGATGGCAGAAAAACTGTCA" "GAATTACCCTCCGCCTGGAGCGTGGCGCATTACGTTGAGTTAACCGGAGAGGTTGATTCGCCATTACTGGCC" "CGCGCGGTGGTTGCCGGACTAGCGCAAGCAGATACGC"; Check(c); } TEST_F(SpoaTest, GlobalConvexWithQualities) { Setup(AlignmentType::kNW, 5, -4, -8, -6, -10, -2, true); Align(); std::string c = "ATGATGCGCTTTGTTGGCGCGGTGGCTTGATGCAGGGGCTAATCGACCTCTGGCAACCACTTTTCCATGACA" "GGAGTTGAATATGGCATTCAGTAATCCCTTCGATGATCCGCAGGGAGCGTTTTACATATTGCGCAATGCGCA" "GGGGCAATTCAGTCTGTGGCCGCAACAATGCGTCTTACCGGCAGGCTGGGACATTGTGTGTCAGCCGCAGTC" 
"ACAGGCGTCCTGCCAGCAGTGGCTGGAAGCCCACTGGCGTACTCTGACACCGACGAATTTTACCCAGTTGCA" "GGGAGGCACAATGAGCCAGCATTTACCTTTGGTCGCCGCACAGCCCGGCATCTGGATGGCAGAAAAACTGTC" "AGAATTACCCTCCGCCTGGAGCGTGGCGCATTACGTTGAGTTAACCGGAGAGGTTGATTCGCCATTACTGGC" "CCGCGCGGTGGTTGCCGGACTAGCGCAAAGCAGATACGC"; Check(c); } TEST_F(SpoaTest, SemiGlobal) { Setup(AlignmentType::kOV, 5, -4, -8, -8, -8, -8, false); Align(); std::string c = "ACATGATGCGCTTTGTTGGCGCGGTGGCTTGATGCAGGGGCTAATCGACCTCTGGCAACCACTTTTCCATGA" "CAGGAGTTGAATATGGCATTCAGTAATCCCTTCGATGATCCGCAGGGAGCGTTTTACATATTGCGCAATGCG" "CAGGGGCAATTCAGTCTGTGGCCGCAACAATGCGTCTTACCGGCAGGCTGGGACATTGTGTGTCAGCCGCAG" "TCACAGGCGTCCTGCCAGCAGTGGCTGGAAGCCCACTGGCGTACTCTGACACCGACGAATTTTACCCAGTTG" "CAGGAGGCACAATGAGCCAGCATTTACCTTTGGTCGCCGCACAGCCCGGCATCTGGATGGCAGAAAAACTGT" "CAGAATTACCCTCCGCCTGGAGCGTGGCGCATTACGTTGAGTTAACCGGAGAGGTTGATTCGCCATTACTGG" "CCCGCGCGGTGGTTGCCGGACTAGCGCAAGCAGATACGCGTTTTACACGCGCAACCAAGGATTTCGG"; Check(c); } TEST_F(SpoaTest, SemiGlobalAffine) { Setup(AlignmentType::kOV, 5, -4, -8, -6, -8, -6, false); Align(); std::string c = "GTATGATGCGCTTTGTTGGCGCGGTGGCTTGATGCAGGGGCTAATCGACCTCTGGCAACCACTTTTCCATGA" "CAGGAGTTGAATATGGCATTCAGTAATCCCTTCGATGATCCGCAGGGAGCGTTTTACATATTGCGCAATGCG" "CAGGGGCAATTCAGTCTGTGGCCGCAACAATGCGTCTTACCGGCAGGCTGGGACATTGTGTGTCAGCCGCAG" "TCACAGGCGTCCTGCCAGCAGTGGCTGGAAGCCCACTGGCGTACTCTGACACCGACGAATTTTACCCAGTTG" "CAGGAGGCACAATGAGCCAGCATTTACCTTTGGTCGCCGCACAGCCCGGCATCTGGATGGCAGAAAAACTGT" "CAGAATTACCCTCCGCCTGGAGCGTGGCGCATTACGTTGAGTTAACCGGAGAGGTTGATTCGCCATTACTGG" "CCCGCGCGGTGGTTGCCGGACTAGCGCAAGCAGATACGCGTTTTACACGCGCAACCAAGGATTTCGG"; Check(c); } TEST_F(SpoaTest, SemiGlobalConvex) { Setup(AlignmentType::kOV, 5, -4, -8, -6, -10, -2, false); Align(); std::string c = "GTATGATGCGCTTTGTTGGCGCGGTGGCTTGATGCAGGGGCTAATCGACCTCTGGCAACCACTTTTCCATGA" "CAGGAGTTGAATATGGCATTCAGTAATCCCTTCGATGATCCGCAGGGAGCGTTTTACATATTGCGCAATGCG" "CAGGGGCAATTCAGTCTGTGGCCGCAACAATGCGTCTTACCGGCAGGCTGGGACATTGTGTGTCAGCCGCAG" "TCACAGGCGTCCTGCCAGCAGTGGCTGGAAGCCCACTGGCGTACTCTGACACCGACGAATTTTACCCAGTTG" 
"CAGGGAGGCACAATGAGCCAGCATTTACCTTTGGTCGCCGCACAGCCCGGCATCTGGATGGCAGAAAAACTG" "TCAGAATTACCCTCCGCCTGGAGCGTGGCGCATTACGTTGAGTTAACCGGAGAGGTTGATTCGCCATTACTG" "GCCCGCGCGGTGGTTGCCGGACTAGCGCAAAGCAGATACGC"; Check(c); } TEST_F(SpoaTest, SemiGlobalWithQualities) { Setup(AlignmentType::kOV, 5, -4, -8, -8, -8, -8, true); Align(); std::string c = "ACATGATGCGCTTTGTTGGCGCGGTGGCTTGATGCAGGGGCTAATCGACCTCTGGCAACCACTTTTCCATGA" "CAGGAGTTGAATATGGCATTCAGTAATCCCTTCGATGATCCGCAGGGAGCGTTTTACATATTGCGCAATGCG" "CAGGGGCAATTCAGTCTGTGGCCGCAACAATGCGTCTTACCGGCAGGCTGGGACATTGTGTGTCAGCCGCAG" "TCACAGGCGTCCTGCCAGCAGTGGCTGGAAGCCCACTGGCGTACTCTGACACCGACGAATTTTACCCAGTTG" "CAGGAGGCACAATGAGCCAGCATTTACCTTTGGTCGCCGCACAGCCCGGCATCTGGATGGCAGAAAAACTGT" "CAGAATTACCCTCCGCCTGGAGCGTGGCGCATTACGTTGAGTTAACCGGAGAGGTTGATTCGCCATTACTGG" "CCCGCGCGGTGGTTGCCGGACTAGCGCAAGCAGATACGCGTTTTACACGCGCAACCAAGGATTTCGG"; Check(c); } TEST_F(SpoaTest, SemiGlobalAffineWithQualities) { Setup(AlignmentType::kOV, 5, -4, -8, -6, -8, -6, true); Align(); std::string c = "ACATGATGCGCTTTGTTGGCGCGGTGGCTTGATGCAGGGGCTAATCGACCTCTGGCAACCACTTTTCCATGA" "CAGGAGTTGAATATGGCATTCAGTAATCCCTTCGATGATCCGCAGGGAGCGTTTTACATATTGCGCAATGCG" "CAGGGGCAATTCAGTCTGTGGCCGCAACAATGCGTCTTACCGGCAGGCTGGGACATTGTGTGTCAGCCGCAG" "TCACAGGCGTCCTGCCAGCAGTGGCTGGAAGCCCACTGGCGTACTCTGACACCGACGAATTTTACCCAGTTG" "CAGGAGGCACAATGAGCCAGCATTTACCTTTGGTCGCCGCACAGCCCGGCATCTGGATGGCAGAAAAACTGT" "CAGAATTACCCTCCGCCTGGAGCGTGGCGCATTACGTTGAGTTAACCGGAGAGGTTGATTCGCCATTACTGG" "CCCGCGCGGTGGTTGCCGGACTAGCGCAAGCAGATACGCGTTTTACACGCGCAACCAAGGATTTCGG"; Check(c); } TEST_F(SpoaTest, SemiGlobalConvexWithQualities) { Setup(AlignmentType::kOV, 5, -4, -8, -6, -10, -2, true); Align(); std::string c = "GTATGATGCGCTTTGTTGGCGCGGTGGCTTGATGCAGGGGCTAATCGACCTCTGGCAACCACTTTTCCATGA" "CAGGAGTTGAATATGGCATTCAGTAATCCCTTCGATGATCCGCAGGGAGCGTTTTACATATTGCGCAATGCG" "CAGGGGCAATTCAGTCTGTGGCCGCAACAATGCGTCTTACCGGCAGGCTGGGACATTGTGTGTCAGCCGCAG" "TCACAGGCGTCCTGCCAGCAGTGGCTGGAAGCCCACTGGCGTACTCTGACACCGACGAATTTTACCCAGTTG" 
"CAGGGAGGCACAATGAGCCAGCATTTACCTTTGGTCGCCGCACAGCCCGGCATCTGGATGGCAGAAAAACTG" "TCAGAATTACCCTCCGCCTGGAGCGTGGCGCATTACGTTGAGTTAACCGGAGAGGTTGATTCGCCATTACTG" "GCCCGCGCGGTGGTTGCCGGACTAGCGCAAAGCAGATACGC"; Check(c); } } // namespace test } // namespace spoa