pax_global_header00006660000000000000000000000064133572021430014512gustar00rootroot0000000000000052 comment=b838b972e869f8ae8acbeb6825d20b6244c29639 SKESA-2.3.0/000077500000000000000000000000001335720214300123625ustar00rootroot00000000000000SKESA-2.3.0/DBGraph.hpp000066400000000000000000000633721335720214300143550ustar00rootroot00000000000000/*=========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. 
* * =========================================================================== * */ #ifndef _DeBruijn_Graph_ #define _DeBruijn_Graph_ #include #include #include "counter.hpp" #include "concurrenthash.hpp" // This file contains classes which facilitate basic operation of storing reads, counting kmers, // and creating and traversing a de Bruijn graph using namespace std; namespace DeBruijn { // Implementation of de Bruijn graph based on TKmerCount which stores kmer (smaller in the bit encoding of self and its reverse // complement), its count, fraction of times the stored kmer was seen as self, and information for presence/absence in graph // for each of the eight possible extensions to which this kmer can be connected // Allows basic traversing operations such as find kmer and its abundance (count) or find successors for a kmer // We use a node-centric definition of de Bruijn graph in which nodes of the graph are kmers class CDBGraph { public: // Construct graph from counted kmers and histogram // is_stranded indicates if count include reliable direction information (PlusFraction() and MinusFraction() could be used) CDBGraph(const TKmerCount& kmers, const TBins& bins, bool is_stranded) : m_graph_kmers(kmers.KmerLen()), m_bins(bins), m_is_stranded(is_stranded) { m_graph_kmers.PushBackElementsFrom(kmers); string max_kmer(m_graph_kmers.KmerLen(), bin2NT[3]); m_max_kmer = TKmer(max_kmer); m_visited.resize(GraphSize(), 0); } // Construct graph from temporary containers CDBGraph(TKmerCount&& kmers, TBins&& bins, bool is_stranded) : m_graph_kmers(kmers.KmerLen()), m_is_stranded(is_stranded) { m_graph_kmers.Swap(kmers); m_bins.swap(bins); string max_kmer(m_graph_kmers.KmerLen(), bin2NT[3]); m_max_kmer = TKmer(max_kmer); m_visited.resize(GraphSize(), 0); } // Load from a file CDBGraph(istream& in) { string tag; if(!getline(in, tag) || tag != "Sorted Graph") throw runtime_error("Wrong format of graph file"); m_graph_kmers.Load(in); string max_kmer(m_graph_kmers.KmerLen(), 
bin2NT[3]); m_max_kmer = TKmer(max_kmer); int bin_num; if(!in.read(reinterpret_cast(&bin_num), sizeof bin_num)) throw runtime_error("Error in CDBGraph read"); for(int i = 0; i < bin_num; ++i) { pair bin; if(!in.read(reinterpret_cast(&bin), sizeof bin)) throw runtime_error("Error in CDBGraph read"); m_bins.push_back(bin); } if(!in.read(reinterpret_cast(&m_is_stranded), sizeof m_is_stranded)) throw runtime_error("Error in CDBGraph read"); m_visited.resize(GraphSize(), 0); } // Save in a file void Save(ostream& out) const { out << "Sorted Graph\n"; m_graph_kmers.Save(out); int bin_num = m_bins.size(); out.write(reinterpret_cast(&bin_num), sizeof bin_num); out.write(reinterpret_cast(&m_bins[0]), bin_num*(sizeof m_bins[0])); out.write(reinterpret_cast(&m_is_stranded), sizeof m_is_stranded); if(!out) throw runtime_error("Error in CDBGraph write"); } class Node { public: explicit Node(size_t node = 0) : m_node(node) {} bool isValid() const { return m_node > 0; } bool isPlus() const { return (m_node%2 == 0); } bool isMinus() const { return (m_node%2 != 0); } Node ReverseComplement() const { if(m_node != 0) return Node(m_node%2 == 0 ? m_node+1 : m_node-1); else return Node(0); } Node DropStrand() const { return Node(2*(m_node/2)); } bool operator == (const Node& other) const { return m_node == other.m_node; } bool operator != (const Node& other) const { return m_node != other.m_node; } bool operator < (const Node& other) const { return m_node < other.m_node; } bool operator > (const Node& other) const { return m_node > other.m_node; } struct Hash { size_t operator()(const Node& node) const { return std::hash()(node.m_node); } }; private: friend class CDBGraph; size_t Index() const { return m_node/2-1; } size_t m_node; }; class Iterator : public Node { public: Iterator& operator++() { m_node += 2; return *this; } private: friend class CDBGraph; explicit Iterator(size_t node) : Node(node) {} }; Iterator Begin() const { return Iterator(GraphSize() > 0 ? 
2 : 0); } Iterator End() const { return Iterator(GraphSize() > 0 ? 2*(GraphSize()+1) : 0); } vector Chunks(int desired_num) { vector chunks; size_t step = GraphSize()/desired_num+1; for(size_t index = 0; index < GraphSize(); ++index) { if(index%step == 0) chunks.push_back(Iterator(2*(index+1))); } if(!chunks.empty()) chunks.push_back(End()); return chunks; } // These two functions map kmers to integer indexes which could be used to retrieve kmer properties // 0 is returned for kmers not present in the graph // positive even numbers are for stored kmers // positive odd numbers are for reverse complement of stored kmers Node GetNode(const TKmer& kmer) const { // finds kmer in graph TKmer rkmer = revcomp(kmer, KmerLen()); if(kmer < rkmer) { size_t index = m_graph_kmers.Find(kmer); return Node(index == GraphSize() ? 0 : 2*(index+1)); } else { size_t index = m_graph_kmers.Find(rkmer); return Node(index == GraphSize() ? 0 : 2*(index+1)+1); } } Node GetNode(const string& kmer_seq) const { // finds kmer in graph if(kmer_seq.find_first_not_of("ACGT") != string::npos || (int)kmer_seq.size() != KmerLen()) // invalid kmer return Node(0); TKmer kmer(kmer_seq); return GetNode(kmer); } // for all access with Node there is NO check that node is in range !!!!!!!! int Abundance(const Node& node) const { // total count for a kmer if(!node.isValid()) return 0; else return m_graph_kmers.GetKmerCount(node.Index()).second; // automatically clips out branching information! 
} // 32 bit count; 8 bit branching; 8 bit not used yet; 16 bit +/- double MinusFraction(const Node& node) const { // fraction of the times kmer was seen in - direction double plusf = PlusFraction(node); return min(plusf,1-plusf); } double PlusFraction(const Node& node) const { // fraction of the times kmer was seen in + direction double plusf = double(m_graph_kmers.GetKmerCount(node.Index()).second >> 48)/numeric_limits::max(); if(node.isMinus()) plusf = 1-plusf; return plusf; } TKmer GetNodeKmer(const Node& node) const { // returns kmer as TKmer if(node.isPlus()) return m_graph_kmers.GetKmerCount(node.Index()).first; else return revcomp(m_graph_kmers.GetKmerCount(node.Index()).first, KmerLen()); } string GetNodeSeq(const Node& node) const { // returnd kmer as string return GetNodeKmer(node).toString(KmerLen()); } const uint64_t* getPointer(const Node& node) { return m_graph_kmers.getPointer(node.Index()); } // multithread safe way to set visited value; returns true if value was as expected before and has been successfully changed // 1 is used for permanent holding; 2 is used for temporary holding; 3 for multi contig bool SetVisited(const Node& node, uint8_t value=1, uint8_t expected=0) { return m_visited[node.Index()].Set(value, expected); } void SetTempHolding(const Node& node) { SetVisited(node, 2, 1); } void SetMultContig(const Node& node) { SetVisited(node, 3, 1); } bool ClearVisited(const Node& node) { // multithread safe way to clear visited value; returns true if value was set before return m_visited[node.Index()].Set(0, 1) || m_visited[node.Index()].Set(0, 2) || m_visited[node.Index()].Set(0, 3); } uint8_t IsVisited(const Node& node) const { // returns visited value return m_visited[node.Index()]; } bool IsMultContig(const Node& node) const { return IsVisited(node) == 3; } void ClearHoldings() { // clears temporary holdings for(auto& v : m_visited) if(v == 2) v = 0; } void ClearAllVisited() { // clears all visited for(auto& v : m_visited) v = 0; } void 
SetColor(const Node& node, uint8_t mask) { m_visited[node.Index()].m_atomic |= mask; } uint8_t GetColor(const Node& node) const { return IsVisited(node); } struct Successor { Successor(const Node& node, char c) : m_node(node), m_nt(c) {} Node m_node; char m_nt; bool operator == (const Successor& other) const { return m_node == other.m_node; } bool operator != (const Successor& other) const { return m_node != other.m_node; } bool operator < (const Successor& other) const { return m_node < other.m_node; } }; // Returns successors of a node // These are nodes representing kmers produced by extending the right end of the kmer for // this node by one base and removing the leftmost base of the kmer // Each successor stores the successor's node and the extra base // Finding predecessors is done by finding successors of reverse complement of the kmer for the node vector GetNodeSuccessors(const Node& node) const { vector successors; if(!node.isValid()) return successors; uint8_t branch_info = (m_graph_kmers.GetCount(node.Index()) >> 32); bitset<4> branches(node.isMinus() ? 
(branch_info >> 4) : branch_info); if(branches.count()) { TKmer shifted_kmer = (GetNodeKmer(node) << 2) & m_max_kmer; for(int nt = 0; nt < 4; ++nt) { if(branches[nt]) { Node successor = GetNode(shifted_kmer + TKmer(KmerLen(), nt)); successors.push_back(Successor(successor, bin2NT[nt])); } } } return successors; } // Revese complement node static Node ReverseComplement(Node node) { return node.ReverseComplement(); } int KmerLen() const { return m_graph_kmers.KmerLen(); } // returns kmer length size_t GraphSize() const { return m_graph_kmers.Size(); } // returns total number of elements size_t ElementSize() const { return m_graph_kmers.ElementSize(); } // element size in bytes size_t MemoryFootprint() const { // reserved memory in bytes return m_graph_kmers.MemoryFootprint()+m_visited.capacity()+sizeof(TBins::value_type)*m_bins.capacity(); } bool GraphIsStranded() const { return m_is_stranded; } // indicates if graph contains stranded information // returns minimum position for stored histogram int HistogramMinimum() const { pair r = HistogramRange(m_bins); if(r.first < 0) return 0; else return m_bins[r.first].first; } // useus simple heuristic to evaluate the genome size size_t GenomeSize() const { return CalculateGenomeSize(m_bins); } // returns histogram const TBins& GetBins() const { return m_bins; } // average count of kmers in the histogram with the main peak double AverageCount() const { return GetAverageCount(m_bins); } private: TKmerCount m_graph_kmers; // only the minimal kmers are stored TKmer m_max_kmer; // contains 1 in all kmer_len bit positions TBins m_bins; vector> m_visited; bool m_is_stranded; }; class CDBHashGraph { public: // Construct graph from temporary containers CDBHashGraph(CKmerHashCount&& kmers, bool is_stranded) : m_graph_kmers(kmers.KmerLen()), m_is_stranded(is_stranded) { m_graph_kmers.Swap(kmers); m_bins = m_graph_kmers.GetBins(); string max_kmer(m_graph_kmers.KmerLen(), bin2NT[3]); m_max_kmer = TKmer(max_kmer); m_graph_size = 0; 
for(auto& bin : m_bins) m_graph_size += bin.second; } // Load from a file CDBHashGraph(istream& in) { string tag; if(!getline(in, tag) || tag != "Hash Graph") throw runtime_error("Wrong format of graph file"); m_graph_kmers.Load(in); string max_kmer(m_graph_kmers.KmerLen(), bin2NT[3]); m_max_kmer = TKmer(max_kmer); int bin_num; if(!in.read(reinterpret_cast(&bin_num), sizeof bin_num)) throw runtime_error("Error in CDBHashGraph read"); for(int i = 0; i < bin_num; ++i) { pair bin; if(!in.read(reinterpret_cast(&bin), sizeof bin)) throw runtime_error("Error in CDBHashGraph read"); m_bins.push_back(bin); } m_graph_size = 0; for(auto& bin : m_bins) m_graph_size += bin.second; if(!in.read(reinterpret_cast(&m_is_stranded), sizeof m_is_stranded)) throw runtime_error("Error in CDBHashGraph read"); ClearAllVisited(); } // Save in a file void Save(ostream& out) const { out << "Hash Graph\n"; m_graph_kmers.Save(out); int bin_num = m_bins.size(); out.write(reinterpret_cast(&bin_num), sizeof bin_num); out.write(reinterpret_cast(&m_bins[0]), bin_num*(sizeof m_bins[0])); out.write(reinterpret_cast(&m_is_stranded), sizeof m_is_stranded); if(!out) throw runtime_error("Error in CDBHashGraph write"); } class Node : public CKmerHashCount::Index { public: enum Status : int8_t { eMinus = -1, eNotValid = 0, ePlus = 1 }; Node() : Index(), m_status(eNotValid) {} Node(CKmerHashCount::Index index, Status status) : Index(index), m_status(status) {} Node(CKmerHashCount::Iterator iter) : Index(iter), m_status(ePlus) {} bool isValid() const { return m_status != eNotValid; } bool isPlus() const { return m_status > 0; } bool isMinus() const { return m_status < 0; } Node DropStrand() const { Node node = *this; if(isMinus()) node.m_status = ePlus; return node; } Node ReverseComplement() const { Node node = *this; switch(m_status) { case eMinus : node.m_status = ePlus; return node; case eNotValid : return node; case ePlus: node.m_status = eMinus; return node; } return node; } bool operator==(const Node& 
other) const { return Index::operator==(other) && m_status == other.m_status; } bool operator!=(const Node& other) const { return !operator==(other); } bool operator<(const Node& other) const { if(Index::operator==(other)) return m_status < other.m_status; else return Index::operator<(other); } bool operator>(const Node& other) const { if(Index::operator==(other)) return m_status > other.m_status; else return Index::operator>(other); } struct Hash { size_t operator()(const Node& node) const { return Index::Hash()(node)^std::hash()(node.m_status); } }; private: Status m_status; }; typedef CKmerHashCount::Iterator Iterator; Iterator Begin() { return m_graph_kmers.Begin(); } Iterator End() { return m_graph_kmers.End(); } vector Chunks(int desired_num) { return m_graph_kmers.Chunks(desired_num); } Node GetNode(const TKmer& kmer) const { // finds kmer in graph TKmer rkmer = revcomp(kmer, KmerLen()); CKmerHashCount::Index end = m_graph_kmers.EndIndex(); if(kmer < rkmer) { CKmerHashCount::Index index = const_cast(m_graph_kmers).FindIndex(kmer); return Node(index, index == end ? Node::eNotValid : Node::ePlus); } else { CKmerHashCount::Index index = const_cast(m_graph_kmers).FindIndex(rkmer); return Node(index, index == end ? Node::eNotValid : Node::eMinus); } } Node GetNode(const string& kmer_seq) { // finds kmer in graph if(kmer_seq.find_first_not_of("ACGT") != string::npos || (int)kmer_seq.size() != KmerLen()) // invalid kmer return Node(); TKmer kmer(kmer_seq); return GetNode(kmer); } // Revese complement node static Node ReverseComplement(const Node& node) { return node.ReverseComplement(); } // for all access with Node there is NO check that node is in range !!!!!!!! int Abundance(const Node& node) const { // total count for a kmer if(!node.isValid()) return 0; else return node.GetMapped(m_graph_kmers)->m_data; // automatically clips out branching information! 
} // 32 bit count; 8 bit branching; 8 bit visited control; 16 bit +/- double MinusFraction(const Node& node) const { // fraction of the times kmer was seen in - direction double plusf = PlusFraction(node); return min(plusf,1-plusf); } double PlusFraction(const Node& node) const { // fraction of the times kmer was seen in + direction double plusf = double(node.GetMapped(m_graph_kmers)->m_data >> 48)/numeric_limits::max(); if(node.isMinus()) plusf = 1-plusf; return plusf; } TKmer GetNodeKmer(const Node& node) const { // returns kmer as TKmer if(node.isPlus()) return node.GetElement(m_graph_kmers).first; else return revcomp(node.GetElement(m_graph_kmers).first, KmerLen()); } string GetNodeSeq(const Node& node) const { // returnd kmer as string return GetNodeKmer(node).toString(KmerLen()); } const uint64_t* getPointer(const Node& node) const { return node.GetKeyPointer(m_graph_kmers); } enum Visited : uint64_t {eNull = 0, eVisited = 0x10000000000, eTemp = 0x20000000000, eMulti = 0x40000000000, eAll = 0xFF0000000000 }; // multithread safe way to set visited value; returns true if value was as expected before and has been successfully changed // 1 is used for permanent holding; 2 is used for temporary holding; 4 for multi contig bool SetVisited(const Node& node, Visited value = eVisited, Visited expected = eNull) { // we assume that other bits are const auto& count = node.GetMapped(m_graph_kmers)->m_data; uint64_t other_bits = (~eAll)&count.Load(); return count.Set(other_bits|value, other_bits|expected); } void SetTempHolding(const Node& node) { SetVisited(node, eTemp, eVisited); } void SetMultContig(const Node& node) { SetVisited(node, eMulti, eVisited); } void ClearVisited(const Node& node) { node.GetMapped(m_graph_kmers)->m_data.m_atomic &= ~eAll; } uint64_t IsVisited(const Node& node) const { return eAll&node.GetMapped(m_graph_kmers)->m_data; } bool IsMultContig(const Node& node) const { return eMulti&node.GetMapped(m_graph_kmers)->m_data; } void ClearHoldings() { // 
clears temporary holdings for(auto it = m_graph_kmers.Begin(); it != m_graph_kmers.End(); ++it) { auto& count = it.GetMapped()->m_data; if(eTemp&count) count.m_atomic &= ~eAll; } } void ClearAllVisited() { // clears all visited for(auto it = m_graph_kmers.Begin(); it != m_graph_kmers.End(); ++it) it.GetMapped()->m_data .m_atomic &= ~eAll; } void SetColor(const Node& node, uint8_t mask) { auto& count = node.GetMapped(m_graph_kmers)->m_data; count.m_atomic |= (uint64_t(mask) << 40); } uint8_t GetColor(const Node& node) const { auto& count = node.GetMapped(m_graph_kmers)->m_data; return (count.m_atomic&eAll) >> 40; } struct Successor { Successor(const Node& node, char c) : m_node(node), m_nt(c) {} Node m_node; char m_nt; bool operator == (const Successor& other) const { return m_node == other.m_node; } bool operator != (const Successor& other) const { return m_node != other.m_node; } bool operator < (const Successor& other) const { return m_node < other.m_node; } }; // Returns successors of a node // These are nodes representing kmers produced by extending the right end of the kmer for // this node by one base and removing the leftmost base of the kmer // Each successor stores the successor's node and the extra base // Finding predecessors is done by finding successors of reverse complement of the kmer for the node vector GetNodeSuccessors(const Node& node) const { vector successors; if(!node.isValid()) return successors; uint8_t branch_info = node.GetMapped(m_graph_kmers)->m_data >> 32; bitset<4> branches(node.isMinus() ? 
(branch_info >> 4) : branch_info); if(branches.count()) { TKmer shifted_kmer = (GetNodeKmer(node) << 2) & m_max_kmer; for(int nt = 0; nt < 4; ++nt) { if(branches[nt]) { Node successor = GetNode(shifted_kmer + TKmer(KmerLen(), nt)); successors.push_back(Successor(successor, bin2NT[nt])); } } } return successors; } bool GraphIsStranded() const { return m_is_stranded; } // indicates if graph contains stranded information int KmerLen() const { return m_graph_kmers.KmerLen(); } // returns kmer length // returns minimum position for stored histogram int HistogramMinimum() const { pair r = HistogramRange(m_bins); if(r.first < 0) return 0; else return m_bins[r.first].first; } // useus simple heuristic to evaluate the genome size size_t GenomeSize() const { return CalculateGenomeSize(m_bins); } // returns histogram const TBins& GetBins() const { return m_bins; } // returns histogram // average count of kmers in the histogram with the main peak double AverageCount() const { return GetAverageCount(m_bins); } size_t GraphSize() const { return m_graph_size; } private: CKmerHashCount m_graph_kmers; TKmer m_max_kmer; // contains 1 in all kmer_len bit positions TBins m_bins; size_t m_graph_size; bool m_is_stranded; }; }; // namespace #endif /* _DeBruijn_Graph_ */ SKESA-2.3.0/Integer.hpp000066400000000000000000000477241335720214300145060ustar00rootroot00000000000000/*=========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. 
* * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== * */ /***************************************************************************** * GATB : Genome Assembly Tool Box * Copyright (C) 2014 INRIA * Authors: R.Chikhi, G.Rizk, E.Drezen * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . 
*****************************************************************************/ /** \file Integer.hpp * \date 01/03/2013 * \author edrezen * \brief Entry point class for large integer usage */ #ifndef _GATB_CORE_TOOLS_MATH_INTEGER_HPP_ #define _GATB_CORE_TOOLS_MATH_INTEGER_HPP_ /********************************************************************************/ #include "LargeInt.hpp" #include "Model.hpp" #include "KmerInit.hpp" #include #include /********************************************************************************/ namespace DeBruijn { /********************************************************************************/ /** \brief Class for large integers calculus * * The IntegerTemplate is implemented as a boost variant, which means that it can act like T1, T2, T3... type * according to the configuration. * * The IntegerTemplate should be specialized with 4 different LargeInt implementation * classes. * * All the methods are implemented through a boost variant visitor. * * According to the INTEGER_KIND compilation flag, we define the Integer class * as an alias of one from several possible implementations. * * Note that we have 2 possible native implementations (NativeInt64 and NativeInt128) * that rely on native types uint64_t and __uint128_t. * * For larger integer, a multi-precision LargeInt is used. * * From the user point of view, [s]he has just to include this file and use the Integer * class. 
* */ class IntegerTemplate { public: typedef TLargeIntN Type; operator Type() const { return v; } IntegerTemplate() : v(LargeInt(0)) {} static int MaxKmer() { return 32*MaxPrec; } IntegerTemplate(int kmer_len, uint64_t n) { int p = (kmer_len+31)/32; v = CreateVariant(p); *this = *this + n; } IntegerTemplate(std::string::const_iterator begin, std::string::const_iterator end) : IntegerTemplate(end-begin, 0) { for(auto i = begin; i != end; ++i) { *this = (*this) << 2; *this = *this + (std::find(bin2NT.begin(), bin2NT.end(), *i) - bin2NT.begin()); } } IntegerTemplate(const std::string& kmer) : IntegerTemplate(kmer.begin(), kmer.end()) {} //TODO remove duplication IntegerTemplate(std::deque::const_iterator begin, std::deque::const_iterator end) : IntegerTemplate(end-begin, 0) { for(auto i = begin; i != end; ++i) { *this = (*this) << 2; *this = *this + (std::find(bin2NT.begin(), bin2NT.end(), *i) - bin2NT.begin()); } } /**Construct from a different size IntegerTemplate Will clip (or add) extra nucs on the LEFT of the string */ IntegerTemplate(IntegerTemplate other, int kmer_len) : IntegerTemplate(kmer_len, 0) { // construct correct type uint64_t* p = getPointer(); uint64_t* other_p = other.getPointer(); size_t prec = (kmer_len+31)/32; // number of 8-byte words significant for new kmer size_t other_prec = other.getSize()/64; // number of 8-byte words in old kmer (could have some extra 0s) copy(other_p, other_p+std::min(prec,other_prec), p); int partial_part_bits = 2*(kmer_len%32); if(partial_part_bits > 0) { uint64_t mask = (uint64_t(1) << partial_part_bits) - 1; p[prec-1] &= mask; } } /** Copy constructor. Relies on the copy constructor of boost variant * \param[in] t : the object to be used for initialization */ template explicit IntegerTemplate (const T& t) : v (t) {} /** Affectation operator. Relies on the affectation operator of boost variant * \param[in] t : object to be copied * \return the current object. 
*/ template IntegerTemplate& operator=(const T& t) { v = t; return *this; } /** Get the name of the class used by the variant (ie. one of the Ti template class parameters) * \return the class name. */ const char* getName () const { return boost::apply_visitor (Integer_name(), *(*this)); } /** Get the size of an instance of the class used by the variant (ie. one of the Ti template class parameters) * \return the size of an object (in bits). */ const size_t getSize () const { return boost::apply_visitor (Integer_size(), *(*this)); } /** Operator + * \param[in] a : first operand * \param[in] b : second operand * \return sum of the two operands. */ inline friend IntegerTemplate operator+ (const IntegerTemplate& a, const IntegerTemplate& b) { return boost::apply_visitor (Integer_plus(), *a, *b); } inline friend IntegerTemplate operator+ (const IntegerTemplate& a, uint64_t b) { return boost::apply_visitor (Number_plus(b), *a); } /** Operator - * \param[in] a : first operand * \param[in] b : second operand * \return substraction of the two operands. */ inline friend IntegerTemplate operator- (const IntegerTemplate& a, const IntegerTemplate& b) { return boost::apply_visitor (Integer_minus(), *a, *b); } /** Operator | * \param[in] a : first operand * \param[in] b : second operand * \return 'or' of the two operands. */ inline friend IntegerTemplate operator| (const IntegerTemplate& a, const IntegerTemplate& b) { return boost::apply_visitor (Integer_or(), *a, *b); } /** Operator ^ * \param[in] a : first operand * \param[in] b : second operand * \return 'xor' of the two operands. */ inline friend IntegerTemplate operator^ (const IntegerTemplate& a, const IntegerTemplate& b) { return boost::apply_visitor (Integer_xor(), *a, *b); } /** Operator & * \param[in] a : first operand * \param[in] b : second operand * \return 'and' of the two operands. 
*/ inline friend IntegerTemplate operator& (const IntegerTemplate& a, const IntegerTemplate& b) { return boost::apply_visitor (Integer_and(), *a, *b); } /** Operator ~ * \param[in] a : operand * \return negation of the operand */ inline friend IntegerTemplate operator~ (const IntegerTemplate& a) { Integer_compl v; return boost::apply_visitor (v, *a); } /** Operator == * \param[in] a : first operand * \param[in] b : second operand * \return equality of the two operands. */ inline friend bool operator== (const IntegerTemplate& a, const IntegerTemplate& b) { return boost::apply_visitor (Integer_equals(), *a, *b); } /** Operator != * \param[in] a : first operand * \param[in] b : second operand * \return inequality of the two operands. */ inline friend bool operator!= (const IntegerTemplate& a, const IntegerTemplate& b) { return ! (*a==*b); } /** Operator < * \param[in] a : first operand * \param[in] b : second operand * \return '<' of the two operands. */ inline friend bool operator< (const IntegerTemplate& a, const IntegerTemplate& b) { return boost::apply_visitor (Integer_less(), *a, *b); } /** Operator <= * \param[in] a : first operand * \param[in] b : second operand * \return '<=' of the two operands. */ inline friend bool operator<= (const IntegerTemplate& a, const IntegerTemplate& b) { return boost::apply_visitor (Integer_lesseq(), *a, *b); } /** Operator / * \param[in] a : first operand * \param[in] c : second operand * \return division of the two operands. */ inline friend IntegerTemplate operator/ (const IntegerTemplate& a, const u_int32_t& c) { return boost::apply_visitor (Integer_div(c), *a); } /** Operator % * \param[in] a : first operand * \param[in] c : second operand * \return modulo of the two operands. 
*/ inline friend u_int32_t operator% (const IntegerTemplate& a, const u_int32_t& c) { return boost::apply_visitor (Integer_mod(c), *a); } /** Operator >> * \param[in] a : first operand * \param[in] c : second operand * \return right shift of the two operands. */ inline friend IntegerTemplate operator>> (const IntegerTemplate& a, const int& c) { return boost::apply_visitor (Integer_shiftLeft(c), *a); } /** Operator << * \param[in] a : first operand * \param[in] c : second operand * \return left shift of the two operands. */ inline friend IntegerTemplate operator<< (const IntegerTemplate& a, const int& c) { return boost::apply_visitor (Integer_shiftRight(c), *a); } /** Operator += * \param[in] a : first operand * \return addition and affectation. */ IntegerTemplate & operator+= (const IntegerTemplate& a) { boost::apply_visitor (Integer_plusaffect(), *(*this), *a); return *this; } /** Operator ^= * \param[in] a : first operand * \return xor and affectation. */ IntegerTemplate & operator^= (const IntegerTemplate& a) { boost::apply_visitor (Integer_xoraffect(), *(*this), *a); return *this; } /** Operator[] access the ith nucleotide in the given integer. For instance a[4] get the 5th nucleotide of * a kmer encoded as an Integer object. * \param[in] idx : index of the nucleotide to be retrieved * \return the nucleotide value as follow: A=0, C=1, T=2 and G=3 */ u_int8_t operator[] (size_t idx) const { return boost::apply_visitor (Integer_value_at(idx), *(*this)); } /** Get the reverse complement of a kmer encoded as an IntegerTemplate object. Note that the kmer size must be known. 
* \param[in] a : kmer value to be reversed-complemented * \param[in] sizeKmer : size of the kmer * \return the reverse complement kmer as a IntegerTemplate value */ friend IntegerTemplate revcomp (const IntegerTemplate& a, size_t sizeKmer) { return boost::apply_visitor (Integer_revomp(sizeKmer), *a); } /** Get an ASCII string representation of a kmer encoded as a IntegerTemplate object * \param[in] sizeKmer : size of the kmer * \return the ASCII representation of the kmer. */ std::string toString (size_t sizeKmer) const { return boost::apply_visitor (Integer_toString(sizeKmer), *(*this)); } /** Output stream operator for the IntegerTemplate class * \param[in] s : the output stream to be used. * \param[in] a : the object to output * \return the modified output stream. */ friend std::ostream & operator<<(std::ostream & s, const IntegerTemplate& a) { s << *a; return s; } /** Get the value of the IntegerTemplate object as a U type, U being one of the T1,T2,T3,T4 * template class parameters. This method can be seen as a converter from the IntegerTemplate class * to a specific U type (given as a template parameter of this method). * \return the converted value as a U type. */ template const U& get () const { return * boost::get(&v); } /** Get pointer to the actual data **/ uint64_t* getPointer() { return boost::apply_visitor (Pointer(), v); } /** Get a hash value on 64 bits for a given IntegerTemplate object. * \return the hash value on 64 bits. 
*/ u_int64_t oahash() const { return boost::apply_visitor (Integer_oahash(), *(*this)); } protected: Type v; Type& operator *() { return v; } const Type& operator *() const { return v; } private: struct Integer_oahash : public boost::static_visitor { template u_int64_t operator() (const T& a) const { return a.oahash(); }}; struct Pointer : public boost::static_visitor { template uint64_t* operator() (T& a) const { return a.getPointer(); }}; struct Integer_name : public boost::static_visitor { template const char* operator() (const T& a) const { return a.getName(); }}; struct Integer_size : public boost::static_visitor { template const size_t operator() (const T& a) const { return a.getSize(); }}; struct Integer_plus : public boost::static_visitor { template IntegerTemplate operator() (const T& a, const T& b) const { return IntegerTemplate(a + b); } template IntegerTemplate operator() (const T& a, const U& b) const { return IntegerTemplate();} }; struct Number_plus : public boost::static_visitor { Number_plus(uint64_t n) : num(n) {} template IntegerTemplate operator() (const T& a) const { return IntegerTemplate(a + T(num));} uint64_t num; }; struct Integer_minus : public boost::static_visitor { template IntegerTemplate operator() (const T& a, const T& b) const { return IntegerTemplate(a - b); } template IntegerTemplate operator() (const T& a, const U& b) const { return IntegerTemplate(); } }; struct Integer_or : public boost::static_visitor { template IntegerTemplate operator() (const T& a, const T& b) const { return IntegerTemplate(a | b); } template IntegerTemplate operator() (const T& a, const U& b) const { return IntegerTemplate(); } }; struct Integer_xor : public boost::static_visitor { template IntegerTemplate operator() (const T& a, const T& b) const { return IntegerTemplate(a ^ b); } template IntegerTemplate operator() (const T& a, const U& b) const { return IntegerTemplate(); } }; struct Integer_and : public boost::static_visitor { template IntegerTemplate 
operator() (const T& a, const T& b) const { return IntegerTemplate(a & b); } template IntegerTemplate operator() (const T& a, const U& b) const { return IntegerTemplate(); } }; struct Integer_less : public boost::static_visitor { template bool operator() (const T& a, const T& b) const { return a < b; } template bool operator() (const T& a, const U& b) const { return false; } }; struct Integer_lesseq : public boost::static_visitor { template bool operator() (const T& a, const T& b) const { return a <= b; } template bool operator() (const T& a, const U& b) const { return false; } }; struct Integer_equals : public boost::static_visitor { template bool operator() (const T& a, const T& b) const { return a == b; } template bool operator() (const T& a, const U& b) const { return false; } }; struct Integer_plusaffect : public boost::static_visitor<> { template void operator() ( T& a, const T& b) const { a += b; } template void operator() ( T& a, const U& b) const { } }; struct Integer_xoraffect : public boost::static_visitor<> { template void operator() ( T& a, const T& b) const { a ^= b; } template void operator() ( T& a, const U& b) const { } }; struct Integer_compl : public boost::static_visitor { template IntegerTemplate operator() (const T& a) { return IntegerTemplate(~a); }}; template struct Visitor : public boost::static_visitor { Visitor (Arg a=Arg()) : arg(a) {} Arg arg; }; struct Integer_div : public Visitor { Integer_div (const u_int32_t& c) : Visitor(c) {} template IntegerTemplate operator() (const T& a) const { return IntegerTemplate(a/this->arg); }}; struct Integer_mod : public Visitor { Integer_mod (const u_int32_t& c) : Visitor(c) {} template u_int32_t operator() (const T& a) const { return (a%this->arg); }}; struct Integer_shiftLeft : public Visitor { Integer_shiftLeft (const int& c) : Visitor(c) {} template IntegerTemplate operator() (const T& a) const { return IntegerTemplate (a >> this->arg); }}; struct Integer_shiftRight : public Visitor { 
Integer_shiftRight (const int& c) : Visitor(c) {} template IntegerTemplate operator() (const T& a) const { return IntegerTemplate (a << this->arg); }}; struct Integer_revomp : public Visitor { Integer_revomp (const size_t& c) : Visitor(c) {} template IntegerTemplate operator() (const T& a) const { return IntegerTemplate (revcomp(a,this->arg)); }}; struct Integer_value_at : public Visitor { Integer_value_at (size_t idx) : Visitor(idx) {} template u_int8_t operator() (const T& a) const { return a[this->arg]; }}; struct Integer_toString : public Visitor { Integer_toString (size_t c) : Visitor(c) {} template std::string operator() (const T& a) const { return a.toString(this->arg); }}; }; /********************************************************************************/ typedef IntegerTemplate TKmer; /********************************************************************************/ }; /********************************************************************************/ #endif /* _GATB_CORE_TOOLS_MATH_INTEGER_HPP_ */ SKESA-2.3.0/KmerInit.hpp000066400000000000000000000076551335720214300146320ustar00rootroot00000000000000/*=========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. 
* Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== * */ #ifndef _KmerInit_ #define _KmerInit_ #include "LargeInt.hpp" #include #include /****************** This is the only place where we manipulate boost::variant directly. The rest of the code MUST use these definitions and boost::visitor *******************/ using namespace std; namespace DeBruijn { #define MaxPrec 16 // kmer up to 512 template class BoundedType, typename... V> using BoostVariant = boost::variant, BoundedType<2,V...>, BoundedType<3,V...>, BoundedType<4,V...>, BoundedType<5,V...>, BoundedType<6,V...>, BoundedType<7,V...>, BoundedType<8,V...>, BoundedType<9,V...>, BoundedType<10,V...>, BoundedType<11,V...>, BoundedType<12,V...>, BoundedType<13,V...>, BoundedType<14,V...>, BoundedType<15,V...>, BoundedType<16,V...>>; // for TKmer typedef BoostVariant TLargeIntN; // for TKmerCount template using TLargeIntVec = vector,size_t>>; typedef BoostVariant TKmerCountN; // for TKmerMap struct SKmerHash { template size_t operator() (const T& kmer) const { return kmer.oahash(); } }; template using TLargeIntMap = unordered_map,V,SKmerHash>; template using TKmerMapN = BoostVariant; // This variadic template could be used in construsctors of all boost::variants used in this code template class BoundedType, typename... 
Params> Variant CreateVariant(int p) { switch(p) { case 1 : return BoundedType<1, Params...>(); case 2 : return BoundedType<2, Params...>(); case 3 : return BoundedType<3, Params...>(); case 4 : return BoundedType<4, Params...>(); case 5 : return BoundedType<5, Params...>(); case 6 : return BoundedType<6, Params...>(); case 7 : return BoundedType<7, Params...>(); case 8 : return BoundedType<8, Params...>(); case 9 : return BoundedType<9, Params...>(); case 10 : return BoundedType<10, Params...>(); case 11 : return BoundedType<11, Params...>(); case 12 : return BoundedType<12, Params...>(); case 13 : return BoundedType<13, Params...>(); case 14 : return BoundedType<14, Params...>(); case 15 : return BoundedType<15, Params...>(); case 16 : return BoundedType<16, Params...>(); default : throw runtime_error("Not supported kmer length"); } } }; // namespace #endif /* _KmerInit_ */ SKESA-2.3.0/LICENSE000066400000000000000000000053021335720214300133670ustar00rootroot00000000000000CONTENTS Public Domain Notice Exceptions (for bundled 3rd-party code) Copyright F.A.Q. ============================================================== PUBLIC DOMAIN NOTICE National Center for Biotechnology Information With the exception of certain third-party files summarized below, this software is a "United States Government Work" under the terms of the United States Copyright Act. It was written as part of the authors' official duties as United States Government employees and thus cannot be copyrighted. This software is freely available to the public for use. The National Library of Medicine and the U.S. Government have not placed any restriction on its use or reproduction. Although all reasonable efforts have been taken to ensure the accuracy and reliability of the software and data, the NLM and the U.S. Government do not and cannot warrant the performance or results that may be obtained by using this software or data. The NLM and the U.S. 
Government disclaim all warranties, express or implied, including warranties of performance, merchantability or fitness for any particular purpose. Please cite the authors in any work or product based on this material. ============================================================== EXCEPTIONS (in all cases excluding NCBI-written makefiles): Files: Integer.hpp LargeInt.hpp LargeInt1.hpp LargeInt2.hpp Type: Large integer implementation Authors: R.Chikhi, G.Rizk, E.Drezen License: GNU Affero General Public License version 3 ============================================================== Copyright F.A.Q. -------------------------------------------------------------- Q. Our product makes use of the NCBI source code, and we made changes and additions to that version of the NCBI code to better fit it to our needs. Can we copyright the code, and how? A. You can copyright only the *changes* or the *additions* you made to the NCBI source code. You should identify unambiguously those sections of the code that were modified, e.g. by commenting any changes you made in the code you distribute. Therefore, your license has to make clear to users that your product is a combination of code that is public domain within the U.S. (but may be subject to copyright by the U.S. in foreign countries) and code that has been created or modified by you. -------------------------------------------------------------- Q. Can we (re)license all or part of the NCBI source code? A. No, you cannot license or relicense the source code written by NCBI since you cannot claim any copyright in the software that was developed at NCBI as a 'government work' and consequently is in the public domain within the U.S. 
SKESA-2.3.0/LargeInt.hpp000066400000000000000000000506111335720214300146030ustar00rootroot00000000000000/*=========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== * */ /***************************************************************************** * GATB : Genome Assembly Tool Box * Copyright (C) 2014 INRIA * Authors: R.Chikhi, G.Rizk, E.Drezen * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. 
* * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . *****************************************************************************/ /** \file LargeInt.hpp * \date 01/03/2013 * \author edrezen * \brief Class that manages large integers * * arbitrary-precision integer library * very limited: only does what minia needs (but not what minia deserves) * This file holds interfaces related to the Design Pattern Observer. */ #ifndef _GATB_CORE_TOOLS_MATH_LARGEINT_HPP_ #define _GATB_CORE_TOOLS_MATH_LARGEINT_HPP_ /********************************************************************************/ #include #include #include #include #include #include "config.hpp" /********************************************************************************/ namespace DeBruijn { /********************************************************************************/ extern std::array revcomp_4NT; extern std::array bin2NT; inline static u_int64_t oahash64 (u_int64_t elem) { // return std::hash()(elem); u_int64_t code = elem; code = code ^ (code >> 14); //supp code = (~code) + (code << 18); code = code ^ (code >> 31); code = code * 21; code = code ^ (code >> 11); code = code + (code << 6); code = code ^ (code >> 22); return code; } /** \brief Large integer class * * The LargeInt class provides methods for integer calculus. It has a template parameter * 'precision' giving the number of bits used the integer representation. For instance: * - LargeInt<1> : representation of integers up to 2^64 * - LargeInt<2> : representation of integers up to 2^128 * - etc * * This template class has a specialization for precision=1. In this case, native 64 bits * integers are used. * * This template class may have a specialization for precision=2. If the used operating * system allows it, native 128 bits integers are used. * * In the other cases, the LargeInt provides a generic integer calculus class. 
Note that * such an implementation could be optimized in several ways, including direct assembly * code for maximum speed. * * The LargeInt class is hugely used throughout the GATB project since it encodes kmers values. * * The LargeInt class is mainly used with the IntegerTemplate class, where 4 specializations * of LargeInt are used as template types of IntegerTemplate. * * \see IntegerTemplate */ template class LargeInt { public: /** Get the name of the class used by the variant (ie. one of the Ti template class parameters) * \return the class name. */ static const char* getName () { static char buffer[256]; static bool first = true; if (first) { first = false; snprintf (buffer, sizeof(buffer), "LargeInt<%d>", precision); } return buffer; } /** Get the 64 less significant bits of the LargeInt object as a native integer type. * \return (part of) the LargeInt object as a native integer type. */ u_int64_t getVal() const { return this->value[0]; } /** Get the size of an instance of the class * \return the size of an object (in bits). */ static const size_t getSize () { return 8*sizeof(u_int64_t)*precision; } /********************************************************************************/ /** Constructor. * \param[in] val : initial value of the large integer. */ LargeInt(const u_int64_t& val = 0) noexcept { value[0] = val; for (int i = 1; i < precision; i++) value[i] = 0; } LargeInt(const std::string& kmer) noexcept : LargeInt(0) { int sizeKmer = kmer.size(); for (int i = 0; i < sizeKmer; i++) { operator<<=(2); value[0] += std::find(bin2NT.begin(), bin2NT.end(), kmer[i]) - bin2NT.begin(); } } template LargeInt(const T& a, const T& b) noexcept : LargeInt(0) { for(T i = a; i < b; ++i) { operator<<=(2); value[0] += std::find(bin2NT.begin(), bin2NT.end(), *i) - bin2NT.begin(); } } /********************************************************************************/ /** Operator + * \param[in] other : operand * \return sum of object and the operand. 
*/ LargeInt operator+ (const LargeInt& other) const { LargeInt result; int carry = 0; for (int i = 0 ; i < precision ; i++) { result.value[i] = this->value[i] + other.value[i] + carry; carry = (result.value[i] < this->value[i]) ? 1 : 0; } return result; } /********************************************************************************/ /** Operator - * \param[in] other : operand * \return subtraction of object and the operand. */ LargeInt operator- (const LargeInt& other) const { LargeInt result; int carry = 0; for (int i = 0 ; i < precision ; i++) { result.value[i] = this->value[i] - other.value[i] - carry; carry = (result.value[i] > this->value[i]) ? 1 : 0; } return result; } /********************************************************************************/ /** Operator / * \param[in] divisor : operand * \return division of the object by the divisor. */ LargeInt operator/(const uint32_t& divisor) const { LargeInt result; std::fill( result.value, result.value + precision, 0 ); // inspired by Divide32() from http://subversion.assembla.com/svn/pxcode/RakNet/Source/BigInt.cpp u_int64_t r = 0; uint32_t mask32bits = ~0; for (int i = precision-1; i >= 0; --i) { for (int j = 1; j >= 0; --j) // [j=1: high-32 bits, j=0: low-32 bits] of array[i] { u_int64_t n = (r << 32) | ((this->value[i] >> (32*j)) & mask32bits ); result.value[i] = result.value[i] | (((n / divisor) & mask32bits) << (32*j)); r = n % divisor; } } return result; } /********************************************************************************/ /** Operator % * \param[in] divisor : operand * \return modulo of the object by the operand. 
*/ uint32_t operator%(const uint32_t& divisor) const { u_int64_t r = 0; uint32_t mask32bits = ~0; for (int i = precision-1; i >= 0; --i) { for (int j = 1; j >= 0; --j) // [j=1: high-32 bits, j=0: low-32 bits] of array[i] { u_int64_t n = (r << 32) | ((this->value[i] >> (32*j)) & mask32bits ); r = n % divisor; } } return (uint32_t)r; } /********************************************************************************/ /** Operator ^ * \param[in] other : operand * \return operator^ of the object by the operand. */ LargeInt operator^(const LargeInt& other) const { LargeInt result; for (int i=0 ; i < precision ; i++) result.value[i] = this->value[i] ^ other.value[i]; return result; } /********************************************************************************/ /** Operator | * \param[in] other : operand * \return operator| of the object by the operand. */ LargeInt operator|(const LargeInt& other) const { LargeInt result; for (int i=0 ; i < precision ; i++) result.value[i] = this->value[i] | other.value[i]; return result; } /********************************************************************************/ /** Operator & * \param[in] other : operand * \return operator& of the object by the operand. */ LargeInt operator&(const LargeInt& other) const { LargeInt result; for (int i=0 ; i < precision ; i++) result.value[i] = this->value[i] & other.value[i]; return result; } /********************************************************************************/ /** Operator & * \param[in] other : operand * \return operator& of the object by the operand. 
*/ LargeInt operator&(const char& other) const { LargeInt result; result.value[0] = this->value[0] & other; return result; } /********************************************************************************/ /** Operator ~ * \return negation of the object */ LargeInt operator~() const { LargeInt result; for (int i=0 ; i < precision ; i++) result.value[i] = ~this->value[i]; return result; } /********************************************************************************/ /** Operator <<. Note: this method is likely to be hugely used when we want to get * neighbors of a given kmer encoded as a LargeInt object. * \param[in] coeff : operand * \return left shift of the object */ LargeInt operator<<(const int& coeff) const { LargeInt result (0); int large_shift = coeff / 64; int small_shift = coeff % 64; for (int i = large_shift ; i < precision-1; i++) { result.value[i] = result.value[i] | (this->value[i-large_shift] << small_shift); if (small_shift == 0) // gcc "bug".. u_int64_t x; x>>64 == 1<<63, x<<64 == 1 { result.value[i+1] = 0; } else { result.value[i+1] = this->value[i-large_shift] >> (64 - small_shift); } } result.value[precision-1] = result.value[precision-1] | (this->value[precision-1-large_shift] << small_shift); return result; } /********************************************************************************/ /** Operator >>. Note: this method is likely to be hugely used when we want to get * neighbors of a given kmer encoded as a LargeInt object. * \param[in] coeff : operand * \return right shift of the object */ LargeInt operator>>(const int& coeff) const { LargeInt result (0); int large_shift = coeff / 64; int small_shift = coeff % 64; result.value[0] = (this->value[large_shift] >> small_shift); for (int i = 1 ; i < precision - large_shift ; i++) { result.value[i] = (this->value[i+large_shift] >> small_shift); if (small_shift == 0) // gcc "bug".. 
u_int64_t x; x>>64 == 1<<63, x<<64 == 1 { result.value[i-1] = result.value[i-1]; } else { result.value[i-1] = result.value[i-1] | (this->value[i+large_shift] << (64 - small_shift)); } } return result; } /********************************************************************************/ /** Operator != * \param[in] c : operand * \return inequality */ bool operator!=(const LargeInt& c) const { for (int i = 0 ; i < precision ; i++) if( this->value[i] != c.value[i] ) return true; return false; } /********************************************************************************/ /** Operator == * \param[in] c : operand * \return equality */ bool operator==(const LargeInt& c) const { for (int i = 0 ; i < precision ; i++) if( this->value[i] != c.value[i] ) return false; return true; } /********************************************************************************/ /** Operator < * \param[in] c : operand */ bool operator<(const LargeInt& c) const { for (int i = precision-1 ; i>=0 ; --i) if( this->value[i] != c.value[i] ) return this->value[i] < c.value[i]; return false; } /********************************************************************************/ /** Operator <= * \param[in] c : operand */ bool operator<=(const LargeInt& c) const { return operator==(c) || operator<(c); } /********************************************************************************/ /** Operator += * \param[in] other : operand * \return addition and affectation */ LargeInt& operator+= (const LargeInt& other) { // NOT so easy to optimize because of the carry *this = *this + other; return *this; } /********************************************************************************/ /** Operator ^= * \param[in] other : operand * \return xor and affectation */ LargeInt& operator^= (const LargeInt& other) { for (int i=0 ; i < precision ; i++) { this->value[i] ^= other.value[i]; } return *this; } /********************************************************************************/ /** Operator &= * 
\param[in] other : operand * \return and and affectation */ LargeInt& operator&= (const LargeInt& other) { for (int i=0 ; i < precision ; i++) { this->value[i] &= other.value[i]; } return *this; } /********************************************************************************/ /** Operator |= * \param[in] other : operand * \return or and affectation */ LargeInt& operator|= (const LargeInt& other) { for (int i=0 ; i < precision ; i++) { this->value[i] |= other.value[i]; } return *this; } /********************************************************************************/ /** Operator <<= * \param[in] coeff : operand * \return left shift and affectation */ LargeInt& operator<<= (const int& coeff) { *(this) = (*this) << coeff; return *this; } /********************************************************************************/ /** Operator >>= * \param[in] coeff : operand * \return right shift and affectation */ LargeInt& operator>>= (const int& coeff) { *(this) = (*this) >> coeff; return *this; } /********************************************************************************/ /** Output stream operator for the IntegerTemplate class * \param[in] s : the output stream to be used. * \param[in] l : the object to output * \return the modified output stream. */ friend std::ostream & operator<<(std::ostream & s, const LargeInt & l) { int i=0; /** We want to display the number in hexa (easier to do...) */ s << std::hex; /** We skip leading 0. */ for (i=precision-1; i>=0 && l.value[i]==0; i--) {} /** We dump the different parts of the large integer. */ for ( ; i>=0 ; i--) { s << l.value[i]; if (i>=1) { s << "."; } } /** We go back to decimal format. */ s << std::dec; /** We return the output stream. */ return s; } /********************************************************************************/ /** Computes a kmer value as polynom. We may have conversion from the data buffer to * a nucleotide code. This is done through the provided functor. 
* \param[in] data : kmer given as a buffer of nucleotides * \param[in] size : size of the kmer * \param[in] fct : convert the ith entry in the buffer into a nucleotide code (A=0, C=1, T=2 and G=3) */ template static LargeInt polynom (const char* data, size_t size, Map fct) { LargeInt res (0); for (size_t i=0; ivalue[idx/32] >> (2*idx%64)) & 3; } u_int64_t oahash() const { // hash = XOR_of_series[hash(i-th chunk iof 64 bits)] u_int64_t result = 0, chunk, mask = ~0; LargeInt intermediate = *this; for (size_t i=0;i> 64; result ^= oahash64 (chunk); } return result; } u_int64_t* getPointer() { return value; } private: template friend LargeInt revcomp (const LargeInt& i, size_t sizeKmer); u_int64_t value[precision]; }; /********************************************************************************/ template inline LargeInt revcomp (const LargeInt& x, size_t sizeKmer) { const LargeInt res = x; unsigned char* kmerrev = (unsigned char *) (&(res.value[0])); unsigned char* kmer = (unsigned char *) (&(x.value[0])); for (size_t i=0; i<8*precision; ++i) { kmerrev[8*precision-1-i] = revcomp_4NT [kmer[i]]; } return (res >> (2*( 32*precision - sizeKmer)) ) ; } /********************************************************************************/ /******************** SPECIALIZATION FOR precision=1 ********************/ /********************************************************************************/ #include "LargeInt1.hpp" /********************************************************************************/ /******************** SPECIALIZATION FOR precision=2 ********************/ /********************************************************************************/ #include "LargeInt2.hpp" /********************************************************************************/ } /* end of namespace */ /********************************************************************************/ #endif /* _GATB_CORE_TOOLS_MATH_LARGEINT_HPP_ */ 
SKESA-2.3.0/LargeInt1.hpp000066400000000000000000000173711335720214300146720ustar00rootroot00000000000000/*=========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== * */ /***************************************************************************** * GATB : Genome Assembly Tool Box * Copyright (C) 2014 INRIA * Authors: R.Chikhi, G.Rizk, E.Drezen * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. 
* * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . *****************************************************************************/ /** \file LargeInt<1>.hpp * \date 01/03/2013 * \author edrezen * \brief Integer class relying on native u_int64_t type */ template<> class LargeInt<1> { public: /** Constructor. * \param[in] c : initial value of the large integer. */ LargeInt<1>(const u_int64_t& c=0) noexcept { value[0] = c; } LargeInt<1>(const std::string& kmer) noexcept : LargeInt<1>(0) { int sizeKmer = kmer.size(); for (int i = 0; i < sizeKmer; i++) { operator<<=(2); value[0] += std::find(bin2NT.begin(), bin2NT.end(), kmer[i]) - bin2NT.begin(); } } template LargeInt<1>(const T& a, const T& b) noexcept : LargeInt<1>(0) { for(T i = a; i < b; ++i) { operator<<=(2); value[0] += std::find(bin2NT.begin(), bin2NT.end(), *i) - bin2NT.begin(); } } u_int64_t getVal () const { return *value; } static const char* getName () { return "LargeInt<1>"; } static const size_t getSize () { return 8*sizeof(u_int64_t); } /** Returns lower 64 bits */ u_int64_t toInt () const { return value[0]; } LargeInt<1> operator+ (const LargeInt<1>& other) const { return value[0] + other.value[0]; } LargeInt<1> operator- (const LargeInt<1>& other) const { return value[0] - other.value[0]; } LargeInt<1> operator| (const LargeInt<1>& other) const { return value[0] | other.value[0]; } LargeInt<1> operator* (const int& coeff) const { return value[0] * coeff; } LargeInt<1> operator/ (const u_int32_t& divisor) const { return value[0] / divisor; } u_int32_t operator% (const u_int32_t& divisor) const { return value[0] % divisor; } LargeInt<1> operator^ (const LargeInt<1>& other) const { return value[0] ^ other.value[0]; } LargeInt<1> operator& (const LargeInt<1>& other) const { return value[0] & other.value[0]; } LargeInt<1> operator& (const char& other) const { return value[0] & other; } LargeInt<1> operator~ () const { return ~value[0]; } 
LargeInt<1> operator<< (const int& coeff) const { return value[0] << coeff; } LargeInt<1> operator>> (const int& coeff) const { return value[0] >> coeff; } bool operator!= (const LargeInt<1>& c) const { return value[0] != c.value[0]; } bool operator== (const LargeInt<1>& c) const { return value[0] == c.value[0]; } bool operator< (const LargeInt<1>& c) const { return value[0] < c.value[0]; } bool operator<= (const LargeInt<1>& c) const { return value[0] <= c.value[0]; } LargeInt<1>& operator+= (const LargeInt<1>& other) { value[0] += other.value[0]; return *this; } LargeInt<1>& operator^= (const LargeInt<1>& other) { value[0] ^= other.value[0]; return *this; } LargeInt<1>& operator<<= (const int& coeff) { value[0] <<= coeff; return *this; } LargeInt<1>& operator>>= (const int& coeff) { value[0] >>= coeff; return *this; } u_int8_t operator[] (size_t idx) const { return (value[0] >> (2*idx)) & 3; } /********************************************************************************/ friend std::ostream & operator<<(std::ostream & s, const LargeInt<1> & l) { s << std::hex << l.value[0] << std::dec; return s; } /********************************************************************************/ /** Print corresponding kmer in ASCII * \param[in] sizeKmer : kmer size (def=32). 
*/ std::string toString (size_t sizeKmer) const { int i; u_int64_t temp = value[0]; std::string seq(sizeKmer,'A'); for (i=sizeKmer-1; i>=0; i--) { seq[i] = bin2NT[ temp&3 ]; temp = temp>>2; } return seq; } u_int64_t oahash() const { return oahash64(value[0]); } /********************************************************************************/ inline static u_int64_t revcomp64 (const u_int64_t& x, size_t sizeKmer) { u_int64_t res = x; // OLD VERSION (with lookup table) // unsigned char* kmerrev = (unsigned char *) (&(res)); // unsigned char* kmer = (unsigned char *) (&(x)); // for (size_t i=0; i<8; ++i) { kmerrev[8-1-i] = revcomp_4NT [kmer[i]]; } res = ((res>> 2 & 0x3333333333333333) | (res & 0x3333333333333333) << 2); res = ((res>> 4 & 0x0F0F0F0F0F0F0F0F) | (res & 0x0F0F0F0F0F0F0F0F) << 4); res = ((res>> 8 & 0x00FF00FF00FF00FF) | (res & 0x00FF00FF00FF00FF) << 8); res = ((res>>16 & 0x0000FFFF0000FFFF) | (res & 0x0000FFFF0000FFFF) << 16); res = ((res>>32 & 0x00000000FFFFFFFF) | (res & 0x00000000FFFFFFFF) << 32); res = res ^ 0xAAAAAAAAAAAAAAAA; return (res >> (2*(32-sizeKmer))) ; } /********************************************************************************/ template static LargeInt<1> polynom (const char* data, size_t size, Map fct) { LargeInt<1> res (0); for (size_t i=0; i revcomp (const LargeInt<1>& i, size_t sizeKmer); u_int64_t value[1]; }; /********************************************************************************/ inline LargeInt<1> revcomp (const LargeInt<1>& x, size_t sizeKmer) { return LargeInt<1>::revcomp64 (x.value[0], sizeKmer); } SKESA-2.3.0/LargeInt2.hpp000066400000000000000000000213171335720214300146660ustar00rootroot00000000000000/*=========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. 
It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== * */ /***************************************************************************** * GATB : Genome Assembly Tool Box * Copyright (C) 2014 INRIA * Authors: R.Chikhi, G.Rizk, E.Drezen * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . 
*****************************************************************************/ /** \file LargeInt<2>.hpp * \date 01/03/2013 * \author edrezen * \brief Integer class relying on native u_int64_t type */ /********************************************************************************/ #if INT128_FOUND == 1 /********************************************************************************/ u_int64_t revcomp64 (const u_int64_t& x, size_t sizeKmer) { u_int64_t res = x; unsigned char* kmerrev = (unsigned char *) (&(res)); unsigned char* kmer = (unsigned char *) (&(x)); for (size_t i=0; i<8; ++i) { kmerrev[8-1-i] = revcomp_4NT [kmer[i]]; } return (res >> (2*( 32 - sizeKmer))) ; } template<> class LargeInt<2> { public: /** Constructor. * \param[in] c : initial value of the large integer. */ LargeInt<2>(const __uint128_t& c=0) noexcept { value[0] = c; } LargeInt<2>(const std::string& kmer) noexcept : LargeInt<2>(0) { int sizeKmer = kmer.size(); for (int i = 0; i < sizeKmer; i++) { operator<<=(2); value[0] += std::find(bin2NT.begin(), bin2NT.end(), kmer[i]) - bin2NT.begin(); } } template LargeInt<2>(const T& a, const T& b) noexcept : LargeInt<2>(0) { for(T i = a; i < b; ++i) { operator<<=(2); value[0] += std::find(bin2NT.begin(), bin2NT.end(), *i) - bin2NT.begin(); } } u_int64_t getVal () const { return *value; } static const char* getName () { return "LargeInt<2>"; } static const size_t getSize () { return 8*sizeof(__uint128_t); } LargeInt<2> operator+ (const LargeInt<2>& other) const { return value[0] + other.value[0]; } LargeInt<2> operator- (const LargeInt<2>& other) const { return value[0] - other.value[0]; } LargeInt<2> operator| (const LargeInt<2>& other) const { return value[0] | other.value[0]; } LargeInt<2> operator* (const int& coeff) const { return value[0] * coeff; } LargeInt<2> operator/ (const u_int32_t& divisor) const { return value[0] / divisor; } u_int32_t operator% (const u_int32_t& divisor) const { return value[0] % divisor; } LargeInt<2> operator^ (const 
LargeInt<2>& other) const { return value[0] ^ other.value[0]; } LargeInt<2> operator& (const LargeInt<2>& other) const { return value[0] & other.value[0]; } LargeInt<2> operator& (const char& other) const { return value[0] & other; } LargeInt<2> operator~ () const { return ~value[0]; } LargeInt<2> operator<< (const int& coeff) const { return value[0] << coeff; } LargeInt<2> operator>> (const int& coeff) const { return value[0] >> coeff; } bool operator!= (const LargeInt<2>& c) const { return value[0] != c.value[0]; } bool operator== (const LargeInt<2>& c) const { return value[0] == c.value[0]; } bool operator< (const LargeInt<2>& c) const { return value[0] < c.value[0]; } bool operator<= (const LargeInt<2>& c) const { return value[0] <= c.value[0]; } LargeInt<2>& operator+= (const LargeInt<2>& other) { value[0] += other.value[0]; return *this; } LargeInt<2>& operator^= (const LargeInt<2>& other) { value[0] ^= other.value[0]; return *this; } LargeInt<2>& operator<<= (const int& coeff) { value[0] <<= coeff; return *this; } LargeInt<2>& operator>>= (const int& coeff) { value[0] >>= coeff; return *this; } u_int8_t operator[] (size_t idx) const { return (value[0] >> (2*idx)) & 3; } /** Output stream overload. NOTE: for easier process, dump the value in hexadecimal. * \param[in] os : the output stream * \param[in] in : the integer value to be output. * \return the output stream. */ friend std::ostream & operator<<(std::ostream & os, const LargeInt<2> & in) { __uint128_t x = in.value[0]; u_int64_t high_nucl = (u_int64_t) (x>>64); u_int64_t low_nucl = (u_int64_t)(x&((((__uint128_t)1)<<64)-1)); if (high_nucl == 0) { os << std::hex << low_nucl << std::dec; } else { os << std::hex << high_nucl << "." << low_nucl << std::dec; } return os; } /********************************************************************************/ /** Print corresponding kmer in ASCII * \param[in] sizeKmer : kmer size (def=32). 
*/ std::string toString (size_t sizeKmer) const { std::string seq(sizeKmer,'A'); for (size_t i=0; i>64)) ^ oahash64 ((u_int64_t)(value[0]&((((__uint128_t)1)<<64)-1))); } /********************************************************************************/ template static LargeInt<2> polynom (const char* data, size_t size, Map fct) { LargeInt<2> res (0); for (size_t i=0; i(value); } private: friend LargeInt<2> revcomp (const LargeInt<2>& i, size_t sizeKmer); __uint128_t value[1]; }; /********************************************************************************/ inline LargeInt<2> revcomp (const LargeInt<2>& in, size_t sizeKmer) { // ---64bits-- ---64bits-- // original kmer: [__high_nucl__|__low_nucl___] // // ex: [ AC | .......TG ] // //revcomp: [ CA | .......GT ] // \_low_nucl__/\high_nucl/ const __uint128_t& x = in.value[0]; u_int64_t high_nucl = (u_int64_t)(x>>64); int nb_high_nucl = sizeKmer>32?sizeKmer - 32:0; __uint128_t revcomp_high_nucl = revcomp64 (high_nucl, nb_high_nucl); if (sizeKmer<=32) revcomp_high_nucl = 0; // srsly dunno why this is needed. gcc bug? u_int64_t x ---> (x>>64) != 0 u_int64_t low_nucl = (u_int64_t)(x&((((__uint128_t)1)<<64)-1)); int nb_low_nucl = sizeKmer>32?32:sizeKmer; __uint128_t revcomp_low_nucl = revcomp64 (low_nucl, nb_low_nucl); return (revcomp_low_nucl<<(2*nb_high_nucl)) + revcomp_high_nucl; } /********************************************************************************/ #endif //INT128_FOUND /********************************************************************************/ SKESA-2.3.0/Makefile000066400000000000000000000061501335720214300140240ustar00rootroot00000000000000# =========================================================================== # # PUBLIC DOMAIN NOTICE # National Center for Biotechnology Information # # This software/database is a "United States Government Work" under the # terms of the United States Copyright Act. 
It was written as part of # the author's official duties as a United States Government employee and # thus cannot be copyrighted. This software/database is freely available # to the public for use. The National Library of Medicine and the U.S. # Government have not placed any restriction on its use or reproduction. # # Although all reasonable efforts have been taken to ensure the accuracy # and reliability of the software and data, the NLM and the U.S. # Government do not and cannot warrant the performance or results that # may be obtained by using this software or data. The NLM and the U.S. # Government disclaim all warranties, express or implied, including # warranties of performance, merchantability or fitness for any particular # purpose. # # Please cite the author in any work or product based on this material. # # =========================================================================== #setenv BOOST_PATH /netopt/ncbi_tools64/boost-1.62.0-ncbi1 ifdef BOOST_PATH BOOST_INCL := -I $(BOOST_PATH)/include BOOST_LIB := -L $(BOOST_PATH)/lib endif NGS_DIR := $(CURDIR)/NGS VDB_PATH := $(NGS_DIR)/vdb_out NGS_PATH := $(NGS_DIR)/ngs_out BUILD_PATH := $(NGS_DIR)/build VDB_INCL := -I $(VDB_PATH)/include VDB_LIB := -L $(VDB_PATH)/lib64 NGS_INCL := -I $(NGS_PATH)/include NGS_LIB := -L $(NGS_PATH)/lib64 CC = c++ -std=c++11 CFLAGS = -Wall -Wno-format-y2k -pthread -fPIC -O3 -finline-functions -fstrict-aliasing \ -fomit-frame-pointer -msse4.2 $(BOOST_INCL) $(NGS_INCL) $(VDB_INCL) LIBS = $(VDB_LIB) -lncbi-ngs-c++-static -lncbi-vdb-static \ $(NGS_LIB) -lngs-c++-static \ -Wl,-Bstatic $(BOOST_LIB) \ -lboost_program_options \ -lboost_iostreams \ -lboost_regex \ -lboost_timer \ -lboost_chrono \ -lboost_system \ -Wl,-Bdynamic -lrt -ldl -lm -lpthread -lz %.o: %.cpp $(CC) -c -o $@ $< $(CFLAGS) all: skesa glb_align.o: glb_align.hpp Makefile skesa.o: common_util.hpp concurrenthash.hpp readsgetter.hpp ngs_includes.hpp counter.hpp graphdigger.hpp assembler.hpp KmerInit.hpp DBGraph.hpp 
Integer.hpp LargeInt.hpp LargeInt1.hpp LargeInt2.hpp Model.hpp config.hpp Makefile $(NGS_DIR)/ngs.done skesa: skesa.o glb_align.o $(CC) -o $@ $^ $(LIBS) $(NGS_DIR)/ngs.done: rm -fr $(NGS_DIR) mkdir -p $(NGS_DIR)/ngs mkdir $(BUILD_PATH) mkdir $(NGS_PATH) mkdir $(VDB_PATH) cd $(NGS_DIR)/ngs; git init; git remote add -f origin https://github.com/ncbi/ngs.git; git config core.sparseCheckout true; echo "ngs-sdk" >> .git/info/sparse-checkout; git pull origin master cd $(NGS_DIR)/ngs/ngs-sdk; ./configure --prefix=$(NGS_PATH) --build-prefix=$(BUILD_PATH); make; make install cd $(NGS_DIR); git clone https://github.com/ncbi/ncbi-vdb.git cd $(NGS_DIR)/ncbi-vdb; ./configure --prefix=$(VDB_PATH) --build-prefix=$(BUILD_PATH); make; make install touch $@ SKESA-2.3.0/Makefile.nongs000066400000000000000000000042261335720214300151510ustar00rootroot00000000000000# =========================================================================== # # PUBLIC DOMAIN NOTICE # National Center for Biotechnology Information # # This software/database is a "United States Government Work" under the # terms of the United States Copyright Act. It was written as part of # the author's official duties as a United States Government employee and # thus cannot be copyrighted. This software/database is freely available # to the public for use. The National Library of Medicine and the U.S. # Government have not placed any restriction on its use or reproduction. # # Although all reasonable efforts have been taken to ensure the accuracy # and reliability of the software and data, the NLM and the U.S. # Government do not and cannot warrant the performance or results that # may be obtained by using this software or data. The NLM and the U.S. # Government disclaim all warranties, express or implied, including # warranties of performance, merchantability or fitness for any particular # purpose. # # Please cite the author in any work or product based on this material. 
# # =========================================================================== #setenv BOOST_PATH /netopt/ncbi_tools64/boost-1.62.0-ncbi1 ifdef BOOST_PATH BOOST_INCL := -I $(BOOST_PATH)/include BOOST_LIB := -L $(BOOST_PATH)/lib endif CC = c++ -std=c++11 CFLAGS = -D NO_NGS -Wall -Wno-format-y2k -pthread -fPIC -O3 -finline-functions -fstrict-aliasing \ -fomit-frame-pointer -msse4.2 $(BOOST_INCL) \ LIBS = -Wl,-Bstatic $(BOOST_LIB) \ -lboost_program_options \ -lboost_iostreams \ -lboost_regex \ -lboost_timer \ -lboost_chrono \ -lboost_system \ -Wl,-Bdynamic -lrt -ldl -lm -lpthread -lz %.o: %.cpp $(CC) -c -o $@ $< $(CFLAGS) all: skesa glb_align.o: glb_align.hpp Makefile.nongs skesa.o: common_util.hpp concurrenthash.hpp readsgetter.hpp ngs_includes.hpp counter.hpp graphdigger.hpp assembler.hpp KmerInit.hpp DBGraph.hpp Integer.hpp LargeInt.hpp LargeInt1.hpp LargeInt2.hpp Model.hpp config.hpp Makefile.nongs skesa: skesa.o glb_align.o $(CC) -o $@ $^ $(LIBS) SKESA-2.3.0/Model.hpp000066400000000000000000000153631335720214300141430ustar00rootroot00000000000000/*=========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. 
* Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== * */ #ifndef _Model_ #define _Model_ #include #include #include #include using namespace std; namespace DeBruijn { inline char Complement(char c) { switch(toupper(c)) { case 'A' : return 'T'; case 'C' : return 'G'; case 'G' : return 'C'; case 'T' : return 'A'; case 'K' : return 'M'; case 'M' : return 'K'; case 'R' : return 'Y'; case 'Y' : return 'R'; case 'D' : return 'H'; case 'V' : return 'B'; case 'H' : return 'D'; case 'B' : return 'V'; case 'N' : return 'N'; default : return c; } } map ToAmbiguousIUPAC = {{"A",'A'}, {"C",'C'}, {"G",'G'}, {"T",'T'}, {"CT",'Y'}, {"AG",'R'}, {"AT",'W'}, {"CG",'S'}, {"GT",'K'}, {"AC",'M'}, {"AGT",'D'}, {"ACG",'V'}, {"ACT",'H'}, {"CGT",'B'}, {"ACGT",'N'}}; map FromAmbiguousIUPAC = {{'A',"A"}, {'C',"C"}, {'G',"G"}, {'T',"T"}, {'Y',"CT"}, {'R',"AG"}, {'W',"AT"}, {'S',"CG"}, {'K',"GT"}, {'M',"AC"}, {'D',"AGT"}, {'V',"ACG"}, {'H',"ACT"}, {'B',"CGT"}, {'N',"ACGT"}}; string AmbiguousString(string s) { sort(s.begin(), s.end()); s.erase(unique(s.begin(),s.end()), s.end()); return s; } bool MatchWithAmbiguousDNA(char a, char b) { string aa = FromAmbiguousIUPAC[a]; string bb = FromAmbiguousIUPAC[b]; return (aa.find(bb) != string::npos) || (bb.find(aa) != string::npos); } template void ReverseComplementSeq(const BidirectionalIterator& first, const BidirectionalIterator& last) { for (BidirectionalIterator i( first ); i != last; ++i) *i = Complement(*i); reverse(first, last); } //complement of one NT array comp_NT = { 2,3,0,1 }; array bin2NT = {'A','C','T','G'}; array binrev = {2,3,0,1}; //reverse complement of 4NT, ie one byte array revcomp_4NT = { 0xaa, 0xea, 0x2a, 0x6a, 0xba, 0xfa, 0x3a, 0x7a, 0x8a, 0xca, 0xa, 0x4a, 0x9a, 0xda, 0x1a, 0x5a, 
0xae, 0xee, 0x2e, 0x6e, 0xbe, 0xfe, 0x3e, 0x7e, 0x8e, 0xce, 0xe, 0x4e, 0x9e, 0xde, 0x1e, 0x5e, 0xa2, 0xe2, 0x22, 0x62, 0xb2, 0xf2, 0x32, 0x72, 0x82, 0xc2, 0x2, 0x42, 0x92, 0xd2, 0x12, 0x52, 0xa6, 0xe6, 0x26, 0x66, 0xb6, 0xf6, 0x36, 0x76, 0x86, 0xc6, 0x6, 0x46, 0x96, 0xd6, 0x16, 0x56, 0xab, 0xeb, 0x2b, 0x6b, 0xbb, 0xfb, 0x3b, 0x7b, 0x8b, 0xcb, 0xb, 0x4b, 0x9b, 0xdb, 0x1b, 0x5b, 0xaf, 0xef, 0x2f, 0x6f, 0xbf, 0xff, 0x3f, 0x7f, 0x8f, 0xcf, 0xf, 0x4f, 0x9f, 0xdf, 0x1f, 0x5f, 0xa3, 0xe3, 0x23, 0x63, 0xb3, 0xf3, 0x33, 0x73, 0x83, 0xc3, 0x3, 0x43, 0x93, 0xd3, 0x13, 0x53, 0xa7, 0xe7, 0x27, 0x67, 0xb7, 0xf7, 0x37, 0x77, 0x87, 0xc7, 0x7, 0x47, 0x97, 0xd7, 0x17, 0x57, 0xa8, 0xe8, 0x28, 0x68, 0xb8, 0xf8, 0x38, 0x78, 0x88, 0xc8, 0x8, 0x48, 0x98, 0xd8, 0x18, 0x58, 0xac, 0xec, 0x2c, 0x6c, 0xbc, 0xfc, 0x3c, 0x7c, 0x8c, 0xcc, 0xc, 0x4c, 0x9c, 0xdc, 0x1c, 0x5c, 0xa0, 0xe0, 0x20, 0x60, 0xb0, 0xf0, 0x30, 0x70, 0x80, 0xc0, 0x0, 0x40, 0x90, 0xd0, 0x10, 0x50, 0xa4, 0xe4, 0x24, 0x64, 0xb4, 0xf4, 0x34, 0x74, 0x84, 0xc4, 0x4, 0x44, 0x94, 0xd4, 0x14, 0x54, 0xa9, 0xe9, 0x29, 0x69, 0xb9, 0xf9, 0x39, 0x79, 0x89, 0xc9, 0x9, 0x49, 0x99, 0xd9, 0x19, 0x59, 0xad, 0xed, 0x2d, 0x6d, 0xbd, 0xfd, 0x3d, 0x7d, 0x8d, 0xcd, 0xd, 0x4d, 0x9d, 0xdd, 0x1d, 0x5d, 0xa1, 0xe1, 0x21, 0x61, 0xb1, 0xf1, 0x31, 0x71, 0x81, 0xc1, 0x1, 0x41, 0x91, 0xd1, 0x11, 0x51, 0xa5, 0xe5, 0x25, 0x65, 0xb5, 0xf5, 0x35, 0x75, 0x85, 0xc5, 0x5, 0x45, 0x95, 0xd5, 0x15, 0x55 }; }; // namespace #endif /* _Model_ */ SKESA-2.3.0/README.md000066400000000000000000000240101335720214300136360ustar00rootroot00000000000000# SKESA - Strategic Kmer Extension for Scrupulous Assemblies Version 2.3 For questions regarding SKESA, please contact Alexandre Souvorov (souvorov@ncbi.nlm.nih.gov) Richa Agarwala (agarwala@ncbi.nlm.nih.gov) Please [cite](#citation) our paper. 
## Compilation Download current source code for SKESA $ git clone https://github.com/ncbi/SKESA Alternatively, download last stable release from https://github.com/ncbi/SKESA/releases Releases also include test data and precompiled binary. Test data is available in example subdirectory that has the command in file run.test for generating the SKESA assembly using the test data. Do following: $ cd SKESA If you would like to build NGS library for accessing reads from SRA, then do $ make Otherwise, if reading inputs only from files, do $ make -f Makefile.nongs BOOST install is expected by makefiles in the SKESA release. If you do not have BOOST on the system path, please specify BOOST_PATH using a command like setenv BOOST_PATH /netopt/ncbi_tools64/boost-1.62.0-ncbi1 before running make. These make files have been tested with BOOST v 1.62.0 and gcc v 4.9. ## Synopsis Running skesa or skesa -h or skesa --help gives information about options and produces the following: -------------------------------------------------------------- General options: -h [ --help ] Produce help message -v [ --version ] Print version --cores arg (=0) Number of cores to use (default all) [integer] --memory arg (=32) Memory available (GB, only for sorted counter) [integer] --hash_count Use hash counter [flag] --estimated_kmers arg (=100) Estimated number of unique kmers for bloom filter (M, only for hash counter) [integer] --skip_bloom_filter Don't do bloom filter; use --estimated_kmers as the hash table size (only for hash counter) [flag] Input/output options : at least one input providing reads for assembly must be specified: --fasta arg Input fasta file(s) (could be used multiple times for different runs) [string] --fastq arg Input fastq file(s) (could be used multiple times for different runs) [string] --use_paired_ends Indicates that a single (not comma separated) fasta/fastq file contains paired reads [flag] --sra_run arg Input sra run accession (could be used multiple times for different 
runs) [string] --contigs_out arg Output file for contigs (stdout if not specified) [string] Assembly options: --kmer arg (=21) Minimal kmer length for assembly [integer] --min_count arg Minimal count for kmers retained for comparing alternate choices [integer] --max_kmer_count arg Minimum acceptable average count for estimating the maximal kmer length in reads [integer] --vector_percent arg (=0.05) Count for vectors as a fraction of the read number (1. disables) [float (0,1]] --insert_size arg Expected insert size for paired reads (if not provided, it will be estimated) [integer] --steps arg (=11) Number of assembly iterations from minimal to maximal kmer length in reads [integer] --fraction arg (=0.1) Maximum noise to signal ratio acceptable for extension [float [0,1)] --max_snp_len arg (=150) Maximal snp length [integer] --min_contig arg (=200) Minimal contig length reported in output [integer] --allow_snps Allow additional step for snp discovery [flag] Debugging options: --force_single_ends Don't use paired-end information [flag] --seeds arg Input file with seeds [string] --all arg Output fasta for each iteration [string] --dbg_out arg Output kmer file [string] --hist arg File for histogram [string] --connected_reads arg File for connected paired reads [string] -------------------------------------------------------------- Note that --sra_run option is not available if SKESA is built using Makefile.nongs ## Short description SKESA is a de-novo sequence read assembler for microbial genomes based on DeBruijn graphs. It uses conservative heuristics and is designed to create breaks at repeat regions in the genome. This leads to excellent sequence quality. Using kmers longer than mate length and up to insert size also allows SKESA to attain good contiguity as determined by the N50 statistic. It is a multi-threaded application that scales well with the number of processors. 
For different runs with the same inputs, including the order of reads, the order and orientation of contigs in the output is deterministic. SKESA can process read information by accessing reads from SRA (option --sra_run) or from files in fasta (option --fasta) or fastq (option --fastq) format. Any combination of input streams is allowed. Files could be gzipped, which is recognized automatically. When accessing reads from SRA SKESA automatically determines if the read set consists of paired-end or single-end reads. For fasta/fastq input of paired reads with separate files for each mate, filenames separated by a comma for first mate followed by the second mate are listed and in this case, the order of reads is expected to be same in files for both mates. Alternatively, a single file with both mates could be specified. In this case the reads are expected to be interleaved with first mate followed by the second, and the option --use_paired_ends must be used. A limitation of the current release is that in case multiple streams of paired reads are provided, it is assumed that all streams have the same insert size. User can explicitly specify expected insert size for the reads (option --insert_size). Otherwise, a sample of input reads is used to estimate the expected insert size. This sampling may lead to very small differences in assembly of the same read set if the order of reads is different and selected sample gives a difference in expected insert size. Two additional options users may wish to specify depending on the resources available to them are as follows: 1. the number of cores (option --cores) and 2. total amount of memory in Gb (option --memory) Remaining options are for debugging or modifying algorithm parameters. Output of assembly is contigs in fasta format. 
The definition line for contig has format Contig__ where is consecutive integers starting from one for numbering the contigs and is the average count of kmers in the contig using minimal kmer length used in the assembly. Contigs are ordered lexicographically. Limitations: 1. SKESA is designed for haploid genomes. If it is used for diploid genomes or RNAseq reads, it should create breaks at all heterozygous sites in the genome and sites for alternative splicing, respectively. The allow_snps option can be used to make some joins at well separated heterozygous sites. 2. SKESA is designed for ILLUMINA reads that do not have systematic homopolymer errors. The assembly for reads that do not have properties similar to ILLUMINA is likely to be quite fragmented. 3. Forward-reverse orientation for paired reads is assumed at this time. If this is not true, steps using paired reads are unlikely to change/improve the assembly. 4. Requesting expected insert size to be estimated using a sample is guaranteed to give the same result, including the order of contigs, for the same order of reads but may give very small differences if read order is changed and insert size estimate is different. ## Usage examples In all the examples below, we are providing 4 cores and have 48 Gb of memory. 
Example of an assembly that directly accesses SRA for an unpaired read set SRR867211 is: $ skesa --sra_run SRR867211 --cores 4 --memory 48 > SRR867211.skesa.fa Example of an assembly that directly accesses SRA for a paired read set SRR1960353 is: $ skesa --sra_run SRR1960353 --cores 4 --memory 48 > SRR1960353.skesa.fa Example of an assembly that uses separate fastq files for each mate of SRR1703350 is: $ skesa --fastq SRR1703350_1.fq,SRR1703350_2.fq --cores 4 --memory 48 > SRR1703350.skesa.fa Example of an assembly that uses interleaved mates for SRR1703350 as fastq input is: $ skesa --fastq SRR1703350.fq --use_paired_ends --cores 4 --memory 48 > SRR1703350.skesa.fa Example of an assembly that uses reads from SRA for SRR1695624 and gzipped fasta for SRR1745628 is: $ skesa --sra_run SRR1695624 --fasta SRR1745628.fa.gz --use_paired_ends --cores 4 --memory 48 > SAMN03218571.skesa.fa Example of the same assembly as above done with both runs accessed from SRA is: $ skesa --sra_run SRR1695624 --sra_run SRR1745628 --cores 4 --memory 48 > SAMN03218571.skesa.fa ## Citation Alexandre Souvorov, Richa Agarwala and David J. Lipman. **SKESA: strategic k-mer extension for scrupulous assemblies.** *Genome Biology* 2018 **19**:153. [doi.org/10.1186/s13059-018-1540-z](https://doi.org/10.1186/s13059-018-1540-z) SKESA-2.3.0/assembler.hpp000066400000000000000000001506221335720214300150560ustar00rootroot00000000000000/*=========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. 
* Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== * */ #ifndef _DBGAssembler_ #define _DBGAssembler_ #include #include "DBGraph.hpp" #include "counter.hpp" #include "graphdigger.hpp" namespace DeBruijn { /****************************** General description CDBGAssembler implements the SKESA assembling algorithm. 1. It uses the counts for kmers with the minimal kmer length specified (default 21 bp) to estimate the maximal kmer length (starting from average mate length) that has sufficient coverage requested in maxkmercount. If reads are paired and insert size isn't specified, it estimates the insert size by assembling between mates for a sample of the reads. 2. It assembles iteratively starting from minimal to maximal kmer length in a specified number of steps. Each step builds a de Bruijn graph for the kmer size for that iteration and uses it to improve previously assembled contigs. After each assembly iteration, the reads already used in the contigs are removed from further consideration. 3. If reads are paired, it uses the reads that are not marked as used and the set of de Bruijn graphs built in 2) to connect the mate pairs. 4. Using the paired reads connected in 3), it performs three additional assembly iterations with the kmer size up to the insert size. 
*******************************/
// NOTE(review): this text was recovered from a damaged extraction — the contents of
// all angle brackets (template argument lists, #include targets) appear to have been
// stripped, e.g. "list>" was presumably "list<array<CReadHolder,2>>" and
// "map m_graphs" presumably "map<int, DBGraph*>". Verify every template argument
// against the original repository before compiling. Comments below document intent
// only where the surviving tokens establish it.
//
// CDBGAssembler drives the iterative SKESA assembly: build a de Bruijn graph per
// kmer length, improve contigs with it, remove used reads, and (for paired input)
// connect mates and run extra iterations with kmers up to the insert size.
template class CDBGAssembler {
public:
    // fraction - Maximal noise to signal ratio of counts acceptable for extension
    // jump - minimal length of accepted dead ends; i.e. dead ends shorter than this length are ignored
    // low_count - minimal count for kmers in a contig
    // steps - number of assembly iterations from minimal to maximal kmer size in reads
    // min_count - minimal kmer count to be included in a de Bruijn graph
    // min_kmer - the minimal kmer size for the main steps
    // max_kmer_paired - insert size (0 if not known)
    // maxkmercount - the minimal average count for estimating the maximal kmer
    // memory - the upper bound for memory use (GB)
    // ncores - number of threads
    // raw_reads - reads (for effective multithreading, number of elements in the list should be >= ncores)
    typedef typename DBGraph::Node Node;
    using GraphDigger = CDBGraphDigger;

    // Constructor performs the whole assembly; results are read back via the accessors.
    template CDBGAssembler(double fraction, int jump, int low_count, int steps, int min_count, int min_kmer,
                           bool forcesinglereads, int max_kmer_paired, int maxkmercount, int ncores,
                           list>& raw_reads, TStrList seeds, bool allow_snps, bool estimate_min_count,
                           GraphArgs... gargs) :
        m_fraction(fraction), m_jump(jump), m_low_count(low_count), m_steps(steps), m_min_count(min_count), m_min_kmer(min_kmer),
        m_max_kmer_paired(max_kmer_paired), m_maxkmercount(maxkmercount), m_ncores(ncores),
        m_average_count(0), m_raw_reads(raw_reads) {

        m_max_kmer = m_min_kmer;
        m_insert_size = 0;
        for(auto& reads : m_raw_reads) {
            m_raw_pairs.push_back({reads[0], CReadHolder(false)});
        }
        m_connected_reads.resize(m_raw_reads.size(), {CReadHolder(false), CReadHolder(true)});

        // gather totals; optionally demote all paired reads to unpaired
        double total_seq = 0;
        size_t total_reads = 0;
        size_t paired = 0;
        for(auto& reads : m_raw_reads) {
            if(forcesinglereads) {
                for(CReadHolder::string_iterator is = reads[0].sbegin(); is != reads[0].send(); ++is)
                    reads[1].PushBack(is);
                reads[0].Clear();
            }
            total_seq += reads[0].TotalSeq()+reads[1].TotalSeq();
            total_reads += reads[0].ReadNum()+reads[1].ReadNum();
            paired += reads[0].ReadNum();
        }
        bool usepairedends = paired > 0;

        //graph for minimal kmer
        double average_count = GetGraph(m_min_kmer, m_raw_reads, true, estimate_min_count ? total_seq : 0, gargs...);
        if(average_count == 0)
            throw runtime_error("Reads are too short for selected minimal kmer length");
        m_average_count = average_count;

        // estimate genome
        int read_len = total_seq/total_reads+0.5;
        cerr << endl << "Average read length: " << read_len << endl;
        size_t genome_size = m_graphs[m_min_kmer]->GenomeSize();
        cerr << "Genome size estimate: " << genome_size << endl << endl;

        {// first iteration
            if(!seeds.empty()) {
                // convert seed strings (possibly with IUPAC ambiguity codes) into contigs with variant chunks
                m_contigs.push_back(TContigSequenceList());
                for(string& seed : seeds) {
                    m_contigs.back().emplace_back(); // empty contig
                    auto& contig = m_contigs.back().back();
                    contig.InsertNewChunk();
                    contig.InsertNewVariant(); // one empty list
                    for(char c : seed) {
                        string ambigs = FromAmbiguousIUPAC[c];
                        if(ambigs.size() == 1) {
                            contig.ExtendTopVariant(c);
                        } else {
                            contig.InsertNewChunk();
                            for(char c : ambigs)
                                contig.InsertNewVariant(c);
                            contig.InsertNewChunk();
                            contig.InsertNewVariant(); // one empty list
                        }
                    }
                }
                /*TODO do we need to restore this?
                for(auto& contig : m_contigs.back()) {
                    int replen = contig.LenMax()-1;
                    contig.m_left_repeat = replen;
                    contig.m_right_repeat = replen;
                }
                */
                CombineSimilarContigs(m_contigs.back());
                m_seeds = m_contigs.back();
                cerr << "Seeds: " << m_contigs.back().size() << endl;

                // report each seed and its variants to stderr
                int num = 0;
                for(auto& contig : m_contigs.back()) {
                    string first_variant;
                    for(auto& lst : contig)
                        first_variant.insert(first_variant.end(), lst.front().begin(), lst.front().end());
                    cerr << ">Seed_" << ++num << endl << first_variant << endl;
                    int pos = 0;
                    for(unsigned chunk = 0; chunk < contig.size(); ++chunk) { //output variants
                        int chunk_len = contig[chunk].front().size();
                        if(contig.VariableChunk(chunk)) {
                            // up to 100 bp of flanking sequence on each side of the variable chunk
                            int left = 0;
                            if(chunk > 0)
                                left = min(100,(int)contig[chunk-1].front().size());
                            int right = 0;
                            if(chunk < contig.size()-1)
                                right = min(100,(int)contig[chunk+1].front().size());
                            int var = 0;
                            auto it = contig[chunk].begin();
                            for(++it; it != contig[chunk].end(); ++it) {
                                auto& variant = *it;
                                cerr << ">Variant_" << ++var << "_for_Seed_" << num << ":" << pos-left+1 << "_" << pos+chunk_len+right << "\n";
                                if(chunk > 0) {
                                    for(int l = left ; l > 0; --l)
                                        cerr << *(contig[chunk-1].front().end()-l);
                                }
                                for(char c : variant)
                                    cerr << c;
                                if(chunk < contig.size()-1) {
                                    for(int r = 0; r < right; ++r)
                                        cerr << contig[chunk+1].front()[r];
                                }
                                cerr << endl;
                            }
                        }
                        pos += chunk_len;
                    }
                }
            }
            ImproveContigs(m_min_kmer, false);
            if(m_contigs.back().empty())
                throw runtime_error("Was not able to assemble anything");
        }

        //estimate max_kmer
        if(m_steps > 1 && average_count > m_maxkmercount) {
            // linear interpolation between min_kmer and read length based on desired average count
            m_max_kmer = read_len+1-double(m_maxkmercount)/average_count*(read_len-min_kmer+1);
            m_max_kmer = min(TKmer::MaxKmer(), m_max_kmer);
            EstimateMaxKmer(read_len, gargs...);
        }
        cerr << endl << "Average count: " << average_count << " Max kmer: " << m_max_kmer << endl;

        //estimate insert size
        if(steps > 1 || usepairedends) {
            if(m_max_kmer_paired == 0 && usepairedends) {
                size_t mates = 0;
                for(auto& rh : m_raw_reads)
                    mates += rh[0].ReadNum();
                unsigned sample_size = 10000; // use 10000 reads for connecting to estimate insert size
                unordered_set selection;
                if(mates/2 > 2*sample_size) { // make random choice for reads
                    default_random_engine generator;
                    uniform_int_distribution distribution(0,mates/2-1);
                    for(unsigned s = 0; s < sample_size; ) {
                        if(selection.insert(distribution(generator)).second)
                            ++s;
                    }
                } else if(mates/2 > 0) { // too few paired reads so using all : may be > sample_size but <= twice that size
                    for(size_t i = 0; i <= mates/2-1; ++i)
                        selection.insert(i);
                }

                if(!selection.empty()) {
                    CStopWatch timer;
                    timer.Restart();
                    list> mate_pairs;
                    size_t mp = 0;
                    int sub_sample = sample_size/m_ncores;
                    size_t num = 0;
                    // copy the selected pairs, sharded into ~ncores holders for parallel connection
                    for(auto& reads : m_raw_reads) {
                        for(CReadHolder::string_iterator is = reads[0].sbegin(); is != reads[0].send(); ++is, ++mp) {
                            if(selection.count(mp)) {
                                if((num++)%sub_sample == 0)
                                    mate_pairs.push_back({CReadHolder(true), CReadHolder(false)});
                                mate_pairs.back()[0].PushBack(is);
                                mate_pairs.back()[0].PushBack(++is);
                            } else {
                                ++is;
                            }
                        }
                    }

                    int long_insert_size = 2000; // we don't expect inserts to be longer than 2000 bp for this program
                    GraphDigger graph_digger(*m_graphs[min_kmer], m_fraction, m_jump, m_low_count);
                    list> connected_mate_pairs = graph_digger.ConnectPairs(mate_pairs, long_insert_size, m_ncores, false);
                    CReadHolder connected_mates(false);
                    for(auto& mp : connected_mate_pairs) {
                        for(CReadHolder::string_iterator is = mp[0].sbegin(); is != mp[0].send(); ++is)
                            connected_mates.PushBack(is);
                    }

                    m_max_kmer_paired = connected_mates.N50();
                    cerr << endl << "N50 for inserts: " << m_max_kmer_paired << endl << endl;
                }
            }
            m_max_kmer_paired = min(m_max_kmer_paired,TKmer::MaxKmer());
            m_insert_size = 3*m_max_kmer_paired; // we don't expect spread of histogram to go beyond three times expected insert
            CleanReads();
        }

        //main iterations
        if(m_steps > 1) {
            if(m_max_kmer > 1.5*m_min_kmer) {
                double alpha = double(m_max_kmer-m_min_kmer)/(steps-1); // find desired distance between consecutive kmers
                for(int step = 1; step < m_steps; ++step) {
                    int kmer_len = min_kmer+step*alpha+0.5; // round to integer
                    kmer_len -= 1-kmer_len%2; // get odd kmer
                    if(GetGraph(kmer_len, m_raw_reads, true, 0, gargs...) == 0) {
                        cerr << "Empty graph for kmer length: " << kmer_len << " skipping this and longer kmers" << endl;
                        break;
                    }
                    ImproveContigs(kmer_len, false);
                    CleanReads();
                }
            } else {
                cerr << "WARNING: iterations are disabled" << endl;
            }
        }

        // three additional iterations with kmers (usually) longer than read length and up to insert size
        if(usepairedends && m_insert_size > 0 && m_max_kmer_paired > 1.5*m_max_kmer) {
            ConnectPairsIteratively();

            array long_kmers;
            long_kmers[0] = 1.25*m_max_kmer;
            long_kmers[2] = m_max_kmer_paired;
            long_kmers[1] = (long_kmers[0]+long_kmers[2])/2;
            for(int kmer_len : long_kmers) {
                kmer_len -= 1-kmer_len%2;
                if(GetGraph(kmer_len, m_connected_reads, false, 0, gargs...) == 0) {
                    cerr << "Empty graph for kmer length: " << kmer_len << " skipping this and longer kmers" << endl;
                    break;
                }
                ImproveContigs(kmer_len, false);
            }
        }

        if(allow_snps) { // snp discovery
            for(auto it = m_graphs.rbegin(); it != m_graphs.rend(); ++it) {
                int kmer_len = it->first;
                ImproveContigs (kmer_len, true);
            }
        }
    }

    // accessors for assembly results
    map& Graphs() { return m_graphs; }
    TContigSequenceList& Contigs() { return m_contigs.back(); }          // final contigs
    vector& AllIterations() { return m_contigs; }                        // contigs from every iteration
    TContigSequenceList& ShortContigs() { return m_short_contigs; };
    CReadHolder ConnectedReads() const {
        CReadHolder connected_reads(false);
        for(const auto& cr : m_connected_reads) {
            for(CReadHolder::string_iterator is = cr[0].sbegin(); is != cr[0].send(); ++is)
                connected_reads.PushBack(is);
        }
        return connected_reads;
    }

    virtual ~CDBGAssembler() {
        // graphs are owned raw pointers created in GetGraph()
        for(auto& graph : m_graphs)
            delete graph.second;
    }

private:
    // connects paired reads using all constructed de Bruijn graphs
    void ConnectPairsIteratively() {
        for(auto& gr : m_graphs) {
            int kmer_len = gr.first;
            cerr << endl << "Connecting mate pairs using kmer length: " << kmer_len << endl;
            GraphDigger graph_digger(*gr.second, m_fraction, m_jump, m_low_count);
            list> connected_reads_temp = graph_digger.ConnectPairs(m_raw_pairs, m_insert_size, m_ncores, true);
            list>::iterator pairedi = m_connected_reads.begin();
            list>::iterator rawi = m_raw_pairs.begin();
            for(auto& pr : connected_reads_temp) {
                swap((*rawi)[0], pr[1]); // keep still not connected
                for(CReadHolder::string_iterator is = pr[0].sbegin(); is != pr[0].send(); ++is) // add new connected reads
                    (*pairedi)[0].PushBack(*is);
                ++rawi;
                ++pairedi;
            }
        }

        size_t connected = 0;
        for(auto& rh : m_connected_reads)
            connected += rh[0].ReadNum();
        cerr << "Totally connected: " << connected << endl;

        // reads longer than the maximal kmer are useful even if their mate was never connected
        size_t added = 0;
        list>::iterator pairedi = m_connected_reads.begin();
        for(auto& reads : m_raw_pairs) {
            for(CReadHolder::string_iterator is = reads[0].sbegin(); is != reads[0].send(); ++is) {
                if((int)is.ReadLen() > m_max_kmer) {
                    (*pairedi)[0].PushBack(*is);
                    ++added;
                }
            }
            ++pairedi;
        }
        cerr << "Added notconnected: " << added << endl;
    }

    // scans kmers for all assembled contigs and creates a map
    // the key is the smaller of two possible kmer directions
    // the value is a tuple:
    // int - position on contig
    // bool - the same as the key or reverse complemented
    // CContigSequence* - pointer to the contig
    typedef CKmerHashMap, 8> TKmerToContig;
    // typedef CKmerMap> TKmerToContig;

    // Builds the kmer->contig map for all long single-chunk contigs, excluding kmers
    // that belong to user-provided seeds; work is fanned out to AssembledKmersJob().
    TKmerToContig GetAssembledKmers() {
        int kmer_len = m_graphs.rbegin()->first;

        // collect all kmers present in seeds (including across variant chunk junctions)
        CKmerMap seed_kmers(kmer_len);
        for(auto& seed : m_seeds) {
            if((int)seed.LenMin() < kmer_len)
                continue;
            seed.RemoveShortUniqIntervals(kmer_len);
            for(int i = seed.size()-1; i >= 0; i -= 2) {
                if(i == (int)seed.size()-1) {
                    if((int)seed.ChunkLenMax(i) >= kmer_len) { // last chunk could be short
                        CReadHolder rh(false);
                        rh.PushBack(seed.back().front());
                        for(CReadHolder::kmer_iterator ik = rh.kbegin(kmer_len) ; ik != rh.kend(); ++ik) {
                            TKmer kmer = *ik;
                            TKmer rkmer = revcomp(kmer, kmer_len);
                            ++seed_kmers[kmer < rkmer ? kmer : rkmer];
                        }
                    }
                } else { // all uniq chunks in the middle >= kmer_len; first/last could be short
                    if((int)seed.ChunkLenMax(i) >= kmer_len) {
                        TVariation seq(seed[i].front().begin(), seed[i].front().end());
                        CReadHolder rh(false);
                        rh.PushBack(seq);
                        for(CReadHolder::kmer_iterator ik = rh.kbegin(kmer_len) ; ik != rh.kend(); ++ik) {
                            TKmer kmer = *ik;
                            TKmer rkmer = revcomp(kmer, kmer_len);
                            ++seed_kmers[kmer < rkmer ? kmer : rkmer];
                        }
                    }
                    // kmers spanning each variant and up to kmer_len-1 bases of its flanks
                    for(auto& variant : seed[i+1]) {
                        TVariation seq;
                        if((int)seed.ChunkLenMax(i) >= kmer_len-1)
                            seq.insert(seq.end(), seed[i].front().end()-kmer_len+1, seed[i].front().end());
                        else
                            seq.insert(seq.end(), seed[i].front().begin(), seed[i].front().end());
                        seq.insert(seq.end(), variant.begin(), variant.end());
                        if((int)seed.ChunkLenMax(i+2) >= kmer_len-1)
                            seq.insert(seq.end(), seed[i+2].front().begin(), seed[i+2].front().begin()+kmer_len-1);
                        else
                            seq.insert(seq.end(), seed[i+2].front().begin(), seed[i+2].front().end());
                        CReadHolder rh(false);
                        rh.PushBack(seq);
                        for(CReadHolder::kmer_iterator ik = rh.kbegin(kmer_len) ; ik != rh.kend(); ++ik) {
                            TKmer kmer = *ik;
                            TKmer rkmer = revcomp(kmer, kmer_len);
                            ++seed_kmers[kmer < rkmer ? kmer : rkmer];
                        }
                    }
                }
            }
        }
        cerr << "Seed kmers: " << seed_kmers.Size() << endl;

        // only long, unambiguous (single chunk) contigs are used for read removal
        int min_len = max(m_max_kmer_paired, m_max_kmer);
        size_t knum = 0;
        list>> contigs;
        for(auto& contig : m_contigs.back()) {
            if((int)contig.LenMin() >= min_len && contig.size() == 1) {
                contigs.emplace_back(&contig, 0);
                knum += contig.LenMin()+2*(kmer_len-1); // overestimation for reserve
            }
        }

        TKmerToContig assembled_kmers(kmer_len, knum);
        list> jobs;
        for(int thr = 0; thr < m_ncores; ++thr)
            jobs.push_back(bind(&CDBGAssembler::AssembledKmersJob, this, ref(contigs), ref(assembled_kmers), ref(seed_kmers)));
        RunThreads(m_ncores, jobs);

        return assembled_kmers;
    }

    // one-thread worker for GetAssembledKmers(); contigs are claimed atomically via SAtomic::Set
    void AssembledKmersJob(list>>& contigs, TKmerToContig& assembled_kmers, CKmerMap& seed_kmers) const {
        for(auto& pr : contigs) {
            if(!pr.second.Set(1))
                continue; // another thread took this contig
            auto& contig = *pr.first;
            int kmer_len = m_graphs.rbegin()->first;
            auto& graphp = m_graphs.rbegin()->second;
            int pos = contig.ChunkLenMax(0)-kmer_len;
            CReadHolder rh(false);
            if(contig.m_circular) {
                auto cc = contig[0].front();
                cc.insert(cc.end(), contig[0].front().begin(), contig[0].front().begin()+kmer_len-1); // add kmer-1 bases to get all kmers
                rh.PushBack(cc);
                pos = contig.ChunkLenMax(0)-1;
            } else {
                rh.PushBack(contig[0].front());
            }
            bool found_repeat = false;
            list>> contig_kmers;
            for(CReadHolder::kmer_iterator ik = rh.kbegin(kmer_len) ; ik != rh.kend(); ++ik, --pos) { // iteration from last kmer to first
                TKmer kmer = *ik;
                auto node = graphp->GetNode(kmer);
                if(graphp->Abundance(node)*m_fraction > m_average_count)
                    continue; // skip suspiciously abundant kmers
                if(node.isValid() && graphp->IsMultContig(node)) {
                    found_repeat = true; // contig contains a repeat - don't use any of its kmers
                    break;
                }
                TKmer rkmer = revcomp(kmer, kmer_len);
                TKmer* kmerp = &kmer;
                bool direct = true;
                if(rkmer < kmer) {
                    kmerp = &rkmer;
                    direct = false;
                }
                contig_kmers.emplace_back(*kmerp, make_tuple(pos, direct, &contig));
            }
            if(!found_repeat) {
                for(auto& kmer : contig_kmers) {
                    if(seed_kmers.Find(kmer.first) == nullptr)
                        *assembled_kmers.FindOrInsert(kmer.first) = kmer.second;
                }
            }
        }
    }

    // finds if a read belongs to any of the contigs
    // return tuple:
    // int - position on the contig (-1 if not found)
    // int - +1 if in positive strand; -1 if in negative strand
    // CContigSequence* - pointer to the contig
    static tuple FindMatchForRead(const CReadHolder::string_iterator& is, TKmerToContig& assembled_kmers) {
        int rlen = is.ReadLen();
        int kmer_len = assembled_kmers.KmerLen();

        int plus = 1;
        tuple* rsltp = nullptr;
        int knum = rlen-kmer_len+1;
        // scan read kmers until the first hit in the assembled-kmer map
        for(CReadHolder::kmer_iterator ik = is.KmersForRead(kmer_len); rsltp == nullptr && knum > 0; --knum, ++ik) {
            TKmer kmer = *ik;
            TKmer rkmer = revcomp(kmer, kmer_len);
            TKmer* kmerp = &kmer;
            plus = 1;
            if(rkmer < kmer) {
                kmerp = &rkmer;
                plus = -plus;
            }
            rsltp = assembled_kmers.Find(*kmerp);
            if(rsltp != nullptr && get<0>(*rsltp) < 0)
                rsltp = nullptr;
        }

        int pos = -1; // position on contig of the 'outer' read end (aka insert end)
        const CContigSequence* sp = nullptr;
        if(rsltp != nullptr) {
            sp = get<2>(*rsltp); // pointer to the contig
            if(!get<1>(*rsltp))
                plus = -plus;
            if(plus > 0) {
                pos = get<0>(*rsltp)-knum;
                if(pos < 0 && sp->m_circular)
                    pos += sp->LenMax();
            } else {
                pos = get<0>(*rsltp)+kmer_len-1+knum;
                if(pos >= (int)sp->LenMax() && sp->m_circular)
                    pos -= sp->LenMax();
            }
        }

        return make_tuple(pos, plus, sp);
    }

    // removes reads if they belong to already assembled contigs
    // using contig sequence creates artificial connected pairs when both mates are placed
    //
    // assembled_kmers - a map of all kmers in already assembled contigs
    // margin - the minimal distance from an edge of a contig for a read to be removed
    // insert_size - the upper limit for insert size
    // raw_reads - reads
    // connected_reads - pointer to connected reads (nullptr if not used)
    static void RemoveUsedReadsJob(TKmerToContig& assembled_kmers, int margin, int insert_size, array& raw_reads, CReadHolder* connected_reads) {
        int kmer_len = assembled_kmers.KmerLen();

        {
            // paired reads: iterate mates two at a time
            CReadHolder cleaned_reads(true);
            CReadHolder::string_iterator is1 = raw_reads[0].sbegin();
            CReadHolder::string_iterator is2 = raw_reads[0].sbegin();
            ++is2;
            for( ; is2 != raw_reads[0].send(); ++is1, ++is1, ++is2, ++is2) {
                if((int)min(is1.ReadLen(), is2.ReadLen()) < kmer_len) {
                    if(connected_reads) { // keep short pairs for connection
                        cleaned_reads.PushBack(is1);
                        cleaned_reads.PushBack(is2);
                    } else { // give chance to be used as unpaired
                        raw_reads[1].PushBack(is1);
                        raw_reads[1].PushBack(is2);
                    }
                    continue;
                }

                tuple rslt1 = FindMatchForRead(is1, assembled_kmers);
                int pos1 = get<0>(rslt1);
                int plus1 = get<1>(rslt1);
                const CContigSequence* sp1 = get<2>(rslt1);
                int clen1 = 0;
                int left_flank1 = 0;
                int right_flank1 = 0;
                if(pos1 >= 0) {
                    left_flank1 = sp1->m_left_repeat;
                    right_flank1 = sp1->m_right_repeat;
                    clen1 = sp1->LenMax();
                    // drop the pair if the whole insert fits deep inside the contig
                    if(sp1->m_circular || (plus1 > 0 && pos1 >= margin+left_flank1 && pos1+insert_size-1 < clen1-margin-right_flank1) ||
                       (plus1 < 0 && pos1-insert_size+1 >= margin+left_flank1 && pos1 < clen1-margin-right_flank1))
                        continue;
                }

                // check for second mate in case first mate was of bad quality and not found in contigs
                tuple rslt2 = FindMatchForRead(is2, assembled_kmers);
                int pos2 = get<0>(rslt2);
                int plus2 = get<1>(rslt2);
                const CContigSequence* sp2 = get<2>(rslt2);
                if(pos2 >= 0) {
                    int left_flank2 = sp2->m_left_repeat;
                    int right_flank2 = sp2->m_right_repeat;
                    int clen2 = sp2->LenMax();
                    if(sp2->m_circular || (plus2 > 0 && pos2 >= margin+left_flank2 && pos2+insert_size-1 < clen2-margin-right_flank2) ||
                       (plus2 < 0 && pos2-insert_size+1 >= margin+left_flank2 && pos2 < clen2-margin-right_flank2))
                        continue;
                }

                if(pos1 >= 0 && pos2 >= 0 && sp1 == sp2 && plus1 != plus2) { // same contig, different strands
                    if((plus1 > 0 && pos1 >= margin+left_flank1 && pos2 < clen1-margin-right_flank1) ||
                       (plus1 < 0 && pos2 >= margin+left_flank1 && pos1 < clen1-margin-right_flank1)) { // deep inside
                        continue;
                    } else if(connected_reads) {
                        if((plus1 > 0 && pos1 >= 0 && pos2 < clen1) || (plus1 < 0 && pos2 >= 0 && pos1 < clen1)) { // inside but not deep
                            // replace the pair by the contig sequence spanning both mates
                            int a = min(pos1,pos2);
                            int b = max(pos1,pos2);
                            if(b < (int)sp1->ChunkLenMax(0)) { // in first uniq chunk
                                TVariation seq(sp1->front().front().begin()+a, sp1->front().front().begin()+b+1);
                                connected_reads->PushBack(seq);
                                continue;
                            } else if(clen1-a <= (int)sp1->ChunkLenMax(sp1->size()-1)) { // in last uniq chunk
                                TVariation seq(sp1->back().front().end()-clen1+a, sp1->back().front().end()-clen1+b+1);
                                connected_reads->PushBack(seq);
                                continue;
                            }
                        }
                    }
                }

                cleaned_reads.PushBack(is1);
                cleaned_reads.PushBack(is2);
            }
            cleaned_reads.Swap(raw_reads[0]);
        }

        if(!connected_reads) {
            // unpaired reads: drop a read if it lies deep inside a contig
            CReadHolder cleaned_reads(false);
            for(CReadHolder::string_iterator is = raw_reads[1].sbegin() ;is != raw_reads[1].send(); ++is) {
                int rlen = is.ReadLen();
                if(rlen < kmer_len)
                    continue;
                tuple rslt = FindMatchForRead(is, assembled_kmers);
                int pos = get<0>(rslt);
                int plus = get<1>(rslt);
                const CContigSequence* sp = get<2>(rslt);
                if(pos >= 0) {
                    int left_flank = sp->m_left_repeat;
                    int right_flank = sp->m_right_repeat;
                    int clen = sp->LenMax();
                    if(sp->m_circular || (plus > 0 && pos >= margin+left_flank && pos+rlen-1 < clen-margin-right_flank) ||
                       (plus < 0 && pos-rlen+1 >= margin+left_flank && pos < clen-margin-right_flank))
                        continue;
                }
                cleaned_reads.PushBack(is);
            }
            cleaned_reads.Swap(raw_reads[1]);
        }
    }

    // removes used reads from the read set used for de Bruijn graphs
    // assembled_kmers - a map of all kmers in already assembled contigs
    // margin - the minimal distance from an edge of a contig for a read to be removed
    // insert_size - the upper limit for insert size
    // ncores - number of threads
    // raw_reads - reads
    static void RemoveUsedReads(TKmerToContig& assembled_kmers, int margin, int insert_size, int ncores, list>& raw_reads) {
        list> jobs;
        for(auto& job_input : raw_reads) {
            jobs.push_back(bind(RemoveUsedReadsJob, ref(assembled_kmers), margin, insert_size, ref(job_input), (CReadHolder*)0));
        }
        RunThreads(ncores, jobs);
    }

    // removes used reads from the read set used for pair connection and from already connected (by contig sequence) reads
    // assembled_kmers - a map of all kmers in already assembled contigs
    // margin - the minimal distance from an edge of a contig for a read to be removed
    // insert_size - the upper limit for insert size
    // ncores - number of threads
    // raw_reads - reads
    // connected_reads - already connected by contig sequence reads
    static void RemoveUsedPairs(TKmerToContig& assembled_kmers, int margin, int insert_size, int ncores, list>& raw_reads, list>& connected_reads) {
        list> jobs;
        auto icr = connected_reads.begin();
        for(auto& job_input : raw_reads) {
            jobs.push_back(bind(RemoveUsedReadsJob, ref(assembled_kmers), margin, insert_size, ref(job_input), &(*icr++)[1]));
        }
        RunThreads(ncores, jobs);
    }

    // removes used reads from the read set used for de Bruijn graphs and from the read set used for pair connection
    // removes paired reads not needed as they are already connected by contig sequence reads
    // creates new set of reads to use
    void CleanReads() {
        CStopWatch timer;
        timer.Restart();
        TKmerToContig assembled_kmers = GetAssembledKmers();
        if(assembled_kmers.TableSize() > 0) {
            int jump = 50; //TODO reconcile with what used in filterneighbors
            RemoveUsedReads(assembled_kmers, m_max_kmer+jump, m_insert_size, m_ncores, m_raw_reads);
            RemoveUsedReads(assembled_kmers, jump, m_insert_size, m_ncores, m_connected_reads);
            RemoveUsedPairs(assembled_kmers, jump, m_insert_size, m_ncores, m_raw_pairs, m_connected_reads);
        }

        size_t reads = 0;
        for(auto& rh : m_raw_reads)
            reads += rh[0].ReadNum()+rh[1].ReadNum();
        cerr << "Cleaned reads: " << reads << endl;
        reads = 0;
        for(auto& rh : m_raw_pairs)
            reads += rh[0].ReadNum()+rh[1].ReadNum();
        cerr << "Reads for connection: " << reads << endl;
        reads = 0;
        for(auto& rh : m_connected_reads)
            reads += rh[0].ReadNum()+rh[1].ReadNum();
        cerr << "Internal reads: " << reads << endl;
        cerr << "Reads cleaned in " << timer.Elapsed();
    }

    // improves previously assembled contigs using a longer kmer
    void ImproveContigs (int kmer_len, bool allow_snps) {
        DBGraph& graph = *m_graphs[kmer_len];
        int jump = m_jump;
        if(allow_snps)
            jump += kmer_len;
        GraphDigger graph_digger(graph, m_fraction, jump, m_low_count, allow_snps);
        cerr << "Kmer: " << kmer_len << " Graph size: " << graph.GraphSize() << " Contigs in: " << (m_contigs.empty() ? 0 : m_contigs.back().size()) << endl;
        cerr << "Valley: " << graph_digger.HistMin() << endl;

        CStopWatch total;
        total.Restart();
        CStopWatch timer;
        timer.Restart();

        //convert strings to SContig and mark visited kmers
        if(allow_snps)
            graph.ClearAllVisited();
        TContigList scontigs = ConverToSContigAndMarkVisited(graph_digger);
        cerr << endl << "Mark used kmers in " << timer.Elapsed();

        if(allow_snps)
            graph_digger.CheckRepeats(scontigs);

        size_t singl = 0;
        size_t multipl = 0;
        for(auto it = graph.Begin(); it != graph.End(); ++it) {
            if(graph.IsMultContig(it))
                ++multipl;
            else if(graph.IsVisited(it))
                ++singl;
        }
        cerr << "Kmers in multiple/single contigs: " << multipl << " " << singl << endl;

        // connect overlapping contigs if we had seeds
        if(!m_seeds.empty() && !allow_snps) {
            timer.Restart();
            graph_digger.CheckRepeats(scontigs);
            cerr << "Check repeats in " << timer.Elapsed();
            timer.Restart();
            graph_digger.ConnectOverlappingContigs(scontigs);
            cerr << "Connect overlapping contigs in " << timer.Elapsed();
        }

        timer.Restart();
        //create new contigs using not yet included kmers
        GraphDigger graph_digger_no_jump(graph, m_fraction, 0, m_low_count);
        unsigned min_len_for_new_seeds = 3*kmer_len; // short ones are likely to be noise
        GraphDigger test_graphdigger(*m_graphs[m_min_kmer], m_fraction, 0, m_low_count);
        GraphDigger* test_graphdiggerp = nullptr;
        if(kmer_len != m_min_kmer)
            test_graphdiggerp = &test_graphdigger;
        TContigList new_seeds = graph_digger_no_jump.GenerateNewSeeds(min_len_for_new_seeds, m_ncores, test_graphdiggerp);
        cerr << "New seeds: " << new_seeds.size() << endl;
        //add new seeds
        scontigs.splice(scontigs.end(), new_seeds);
        cerr << "New seeds in " << timer.Elapsed();

        timer.Restart();
        graph_digger.ConnectAndExtendContigs(scontigs, m_ncores);

        // convert back to CContigSequence
        m_contigs.push_back(TContigSequenceList());
        for(auto& contig : scontigs) {
            m_contigs.back().push_back(contig.m_seq);
        }
        m_contigs.back().sort();

        // report N50/L50 statistics for this iteration
        vector contigs_len;
        size_t genome_len = 0;
        for(auto& contig : m_contigs.back()) {
            contigs_len.push_back(contig.LenMax());
            genome_len += contigs_len.back();
        }
        sort(contigs_len.begin(), contigs_len.end());
        size_t n50 = 0;
        int l50 = 0;
        size_t len = 0;
        for(int j = (int)contigs_len.size()-1; j >= 0 && len < 0.5*genome_len; --j) {
            ++l50;
            n50 = contigs_len[j];
            len += contigs_len[j];
        }
        cerr << "Connections and extensions in " << timer.Elapsed();
        cerr << "Contigs out: " << contigs_len.size() << " Genome: " << genome_len << " N50: " << n50 << " L50: " << l50 << endl;
        cerr << "Assembled in " << total.Elapsed() << endl;
    }

    // converts contigs from the previous iteration into SContig and marks visited the nodes in the graph
    TContigList ConverToSContigAndMarkVisited(GraphDigger& graph_digger) {
        if(m_contigs.empty())
            return TContigList();

        int kmer_len = graph_digger.Graph().KmerLen();
        for(auto& contig : m_contigs.back()) { //remove short snps
            if(!contig.m_circular) {
                if(contig.size() > 1 && (int)contig.ChunkLenMax(0) < kmer_len) {
                    contig.m_left_repeat = 0;
                    contig.pop_front();
                    contig.pop_front();
                }
                if(contig.size() > 1 && (int)contig.ChunkLenMax(contig.size()-1) < kmer_len) {
                    contig.m_right_repeat = 0;
                    contig.pop_back();
                    contig.pop_back();
                }
            }
            if((int)contig.LenMin() < kmer_len)
                m_short_contigs.push_back(contig); // too short for this kmer - keep aside
        }

        TContigList scontigs;
        vector>> contig_is_taken;
        for(const auto& contig : m_contigs.back())
            contig_is_taken.push_back(make_pair(&contig,SAtomic(0)));
        vector> scontigs_for_threads(m_ncores);
        list> jobs;
        for(auto& sc : scontigs_for_threads)
            jobs.push_back(bind(&CDBGAssembler::ConverToSContigAndMarkVisitedJob, this, ref(contig_is_taken), ref(sc), ref(graph_digger)));
        RunThreads(m_ncores, jobs);

        for(auto& sc : scontigs_for_threads)
            scontigs.splice(scontigs.end(), sc);

        return scontigs;
    }

    // one-thread worker for ConverToSContigAndMarkVisited()
    void ConverToSContigAndMarkVisitedJob(vector>>& contig_is_taken, TContigList& scontigs, GraphDigger& graph_digger) {
        DBGraph& graph = graph_digger.Graph();
        int kmer_len = graph.KmerLen();
        for(auto& cnt : contig_is_taken) {
            if(!cnt.second.Set(1))
                continue; // another thread claimed this contig
            const CContigSequence& contig = *cnt.first;
            int contig_len = contig.LenMin();
            if(contig_len >= kmer_len)
                scontigs.push_back(SContig(contig, graph)); // constructor sets visited in graph
        }
    }

    // estimates available memory
    int64_t AvailableMemory(int memory) const {
        int64_t GB = 1000000000;
        int64_t mem_available = GB*memory;
        int64_t mem_used = 0;
        for(const auto& reads : m_raw_reads)
            mem_used += reads[0].MemoryFootprint()+reads[1].MemoryFootprint();
        for(const auto& reads : m_raw_pairs)
            mem_used += reads[0].MemoryFootprint()+reads[1].MemoryFootprint();
        for(const auto& reads : m_connected_reads)
            mem_used += reads[0].MemoryFootprint()+reads[1].MemoryFootprint();
        for(auto& graph : m_graphs)
            mem_used += graph.second->MemoryFootprint();
        return mem_available-mem_used;
    }

    // primary template is never instantiated - only the explicit specializations below are valid
    template void EstimateMaxKmer(int read_len, GraphArgs... gargs) {
        static_assert(sizeof(DBGraph) != sizeof(DBGraph), "Unknown specialization of CDBGAssembler");
    }

    // counts kmers and build a de Bruijn graph; returns average count of kmers in the graph
    // kmer_len - the size of the kmer
    // reads - reads from input or connected internally
    // is_stranded - whether or not stranded information is meaningful
    template double GetGraph(int kmer_len, const list>& reads, bool is_stranded, double total_seq, GraphArgs... gargs) {
        static_assert(sizeof(DBGraph) != sizeof(DBGraph), "Unknown specialization of CDBGAssembler");
        return 0;
    }

    double m_fraction; // Maximal noise to signal ratio of counts acceptable for extension
    int m_jump; // minimal length of accepted dead ends
    int m_low_count; // minimal kmer count to be included in a contig
    int m_steps; // number of main steps
    int m_min_count; // minimal kmer count to be included in a de Bruijn graph
    int m_min_kmer; // the minimal kmer size for the main steps
    int m_max_kmer_paired; // insert size
    int m_insert_size; // upper bound for the insert size
    int m_maxkmercount; // the minimal average count for estimating the maximal kmer
    int m_ncores; // number of threads
    int m_max_kmer; // maximal kmer size for the main steps
    double m_average_count; // average count for minimal kmers
    list>& m_raw_reads; // original reads - will be reduced gradually
    list> m_raw_pairs; // paired original reads for connection - will be reduced gradually
    list> m_connected_reads; // connected pairs (long reads)
    map m_graphs; // De Bruijn graphs for multiple kmers
    vector m_contigs; // assembled contigs for each iteration
    TContigSequenceList m_short_contigs; // contigs shorter than the current kmer, set aside
    TContigSequenceList m_seeds; // contigs created from user-provided seed sequences
};

// Sorted-vector graph flavor: shrink m_max_kmer until the kmer set is non-trivial
// and its average count reaches m_maxkmercount.
template<> template<> // one for graph, the other for args
void CDBGAssembler::EstimateMaxKmer(int read_len, int memory) {
    while(m_max_kmer > m_min_kmer) {
        m_max_kmer -= 1-m_max_kmer%2; // odd kmers desired
        CKmerCounter kmer_counter(m_raw_reads, m_max_kmer, m_min_count, true, AvailableMemory(memory), m_ncores);
        if(kmer_counter.Kmers().Size() < 100) { // find a kmer length with at least 100 distinct kmers at that length
            m_max_kmer -= read_len/25; // reduce maximal kmer length by a small amount based on read length
            continue;
        }
        double average_count_for_max_kmer = kmer_counter.AverageCount();
        if(average_count_for_max_kmer >= m_maxkmercount)
            break;
        else
            m_max_kmer -= read_len/25;
    }
    m_max_kmer = max(m_max_kmer, m_min_kmer);
}

// Hash-graph flavor of the same estimate.
template<> template<> // one for graph, the other for args
void CDBGAssembler::EstimateMaxKmer(int read_len, int estimated_kmer_num, bool skip_bloom_filter) {
    int64_t M = 1000000;
    while(m_max_kmer > m_min_kmer) {
        m_max_kmer -= 1-m_max_kmer%2; // odd kmers desired
        CKmerHashCounter kmer_counter(m_raw_reads, m_max_kmer, m_min_count, M*estimated_kmer_num, true, m_ncores, skip_bloom_filter);
        if(kmer_counter.KmerNum() < 100) { // find a kmer length with at least 100 distinct kmers at that length
            m_max_kmer -= read_len/25; // reduce maximal kmer length by a small amount based on read length
            continue;
        }
        double average_count_for_max_kmer = GetAverageCount(kmer_counter.Kmers().GetBins());
        if(average_count_for_max_kmer >= m_maxkmercount)
            break;
        else
            m_max_kmer -= read_len/25;
    }
    m_max_kmer = max(m_max_kmer, m_min_kmer);
}

// Sorted-vector graph flavor: count kmers, optionally auto-raise --min_count for
// high-coverage data, then build and store the graph.
template<> template<> // one for graph, the other for args
double CDBGAssembler::GetGraph(int kmer_len, const list>& reads, bool is_stranded, double total_seq, int memory) {
    CKmerCounter kmer_counter(reads, kmer_len, m_min_count, is_stranded, AvailableMemory(memory), m_ncores);
    if(kmer_counter.Kmers().Size() == 0)
        return 0;
    TKmerCount& sorted_kmers = kmer_counter.Kmers();
    if(total_seq > 0) {
        map hist;
        for(size_t index = 0; index < sorted_kmers.Size(); ++index) {
            ++hist[sorted_kmers.GetCount(index)]; // count clipped to integer automatically
        }
        TBins bins(hist.begin(), hist.end());
        int genome_size = CalculateGenomeSize(bins);
        if(genome_size > 0) {
            int new_min_count = total_seq/genome_size/50+0.5;
            if(new_min_count > m_min_count) {
                int new_maxkmercount = max(10, int(total_seq/genome_size/10+0.5));
                cerr << "WARNING: --min_count changed from " << m_min_count << " to " << new_min_count << " because of high coverage for genome size " << genome_size << endl;
                // NOTE(review): this message omits "changed from", unlike the CDBHashGraph specialization - likely an oversight; confirm before changing user-visible output
                cerr << "WARNING: --max_kmer_count " << m_maxkmercount << " to " << new_maxkmercount << " because of high coverage for genome size " << genome_size << endl;
                m_min_count = new_min_count;
                m_low_count = m_min_count;
                m_maxkmercount = new_maxkmercount;
                sorted_kmers.RemoveLowCountKmers(m_min_count);
            }
        }
    }
    if(kmer_counter.Kmers().Size() == 0)
        return 0;
    double average_count = kmer_counter.AverageCount();
    kmer_counter.GetBranches();
    map hist;
    for(size_t index = 0; index < sorted_kmers.Size(); ++index) {
        ++hist[sorted_kmers.GetCount(index)]; // count clipped to integer automatically
    }
    TBins bins(hist.begin(), hist.end());
    m_graphs[kmer_len] = new CDBGraph(move(sorted_kmers), move(bins), is_stranded); // owned; deleted in ~CDBGAssembler
    return average_count;
}

// Hash-graph flavor of GetGraph.
template<> template<> // one for graph, the other for args
double CDBGAssembler::GetGraph(int kmer_len, const list>& reads, bool is_stranded, double total_seq, int estimated_kmer_num, bool skip_bloom_filter) {
    int64_t M = 1000000;
    CKmerHashCounter kmer_counter(reads, kmer_len, m_min_count, M*estimated_kmer_num, is_stranded, m_ncores, skip_bloom_filter);
    if(kmer_counter.KmerNum() == 0)
        return 0;
    if(total_seq > 0) {
        TBins bins = kmer_counter.Kmers().GetBins();
        int genome_size = CalculateGenomeSize(bins);
        if(genome_size > 0) {
            int new_min_count = total_seq/genome_size/50+0.5;
            if(new_min_count > m_min_count) {
                int new_maxkmercount = max(10, int(total_seq/genome_size/10+0.5));
                cerr << "WARNING: --min_count changed from " << m_min_count << " to " << new_min_count << " because of high coverage for genome size " << genome_size << endl;
                cerr << "WARNING: --max_kmer_count changed from " << m_maxkmercount << " to " << new_maxkmercount << " because of high coverage for genome size " << genome_size << endl;
                m_min_count = new_min_count;
                m_low_count = m_min_count;
                m_maxkmercount = new_maxkmercount;
                kmer_counter.RemoveLowCountKmers(m_min_count);
            }
        }
    }
    if(kmer_counter.KmerNum() == 0)
        return 0;
    kmer_counter.GetBranches();
    m_graphs[kmer_len] = new CDBHashGraph(move(kmer_counter.Kmers()), is_stranded); // owned; deleted in ~CDBGAssembler
    return m_graphs[kmer_len]->AverageCount();
}

}; // namespace
#endif /* _DBGAssembler_ */
SKESA-2.3.0/common_util.hpp000066400000000000000000000676701335720214300154400ustar00rootroot00000000000000/*=========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== * */ #ifndef _common_util_ #define _common_util_ #include #include #include #include #include using namespace std; namespace DeBruijn { // Wraps around atomic<> to make it possible to use in containers // IMPORTANT: don't concurrently create or modify containers of SAtomic!!!!! 
template struct SAtomic { typedef T Type; SAtomic(T t = 0) { m_atomic.store(t); } SAtomic(const atomic &a) { m_atomic.store(a.load()); } SAtomic(const SAtomic &other) { m_atomic.store(other.m_atomic.load()); } SAtomic& operator=(const SAtomic &other) { m_atomic.store(other.m_atomic.load()); return *this; } SAtomic& operator=(T t) { m_atomic.store(t); return *this; } bool Set(T value, T expected = 0) { return m_atomic.compare_exchange_strong(expected, value); } operator T() const { return m_atomic.load(); } T Load() const { return m_atomic.load(); } atomic m_atomic; }; class CStopWatch : public boost::timer::cpu_timer { public: void Restart() { start(); } string Elapsed() const { return format(); } void Stop() { stop (); } void Resume() { resume(); } }; // runs ncores threads until all jobs are exhausted void RunThreads(int ncores, list>& jobs) { typedef list> ThreadsStatus; ThreadsStatus active_threads_status; // int total_jobs = jobs.size(); // cerr << "Remaining " << total_jobs << " jobs from " << total_jobs << endl; //create ncores threads for(int i = 0; i < ncores && !jobs.empty(); ++i) { active_threads_status.push_front(async(launch::async, jobs.front())); jobs.pop_front(); } //for each finished thread create a new one until done chrono::milliseconds span (1); while(!active_threads_status.empty()) { for(auto iloop = active_threads_status.begin(); iloop != active_threads_status.end(); ) { auto done = iloop++; if(done->wait_for(span) == future_status::timeout) // not ready continue; done->get(); active_threads_status.erase(done); if(!jobs.empty()) { active_threads_status.push_front(async(launch::async, jobs.front())); jobs.pop_front(); } // cerr << "Remaining jobs " << jobs.size()+active_threads_status.size() << " from " << total_jobs << endl; } } // cerr << endl; } // Stores DNA sequences using 4 letter alphabet // The sequences and kmers could be accessed sequentially using iterator-type classes // class CReadHolder { public: CReadHolder(bool contains_paired) 
: m_total_seq(0), m_front_shift(0), m_contains_paired(contains_paired) {}; // inserts read at the end // void PushBack(const string& read) { template void PushBack(const Container& read) { int shift = (m_total_seq*2 + m_front_shift)%64; for(int i = (int)read.size()-1; i >= 0; --i) { // put backward for kmer compatibility if(shift == 0) m_storage.push_back(0); m_storage.back() += ((find(bin2NT.begin(), bin2NT.end(), read[i]) - bin2NT.begin()) << shift); shift = (shift+2)%64; } m_read_length.push_back(read.size()); m_total_seq += read.size(); } template void PushBack(RandomIterator begin, uint32_t len) { int shift = (m_total_seq*2 + m_front_shift)%64; for(RandomIterator it = begin+len-1; ; --it) { if(shift == 0) m_storage.push_back(0); m_storage.back() += ((find(bin2NT.begin(), bin2NT.end(), *it) - bin2NT.begin()) << shift); shift = (shift+2)%64; if(it == begin) break; } m_read_length.push_back(len); m_total_seq += len; } // insert sequence from other container class string_iterator; void PushBack(const string_iterator& is) { size_t read_len = is.ReadLen(); m_read_length.push_back(read_len); size_t destination_first_bit = m_front_shift+2*m_total_seq; m_total_seq += read_len; m_storage.resize((m_front_shift+2*m_total_seq+63)/64); const CReadHolder& other_holder = *is.m_readholderp; size_t bit_from = is.m_readholderp->m_front_shift+is.m_position; size_t bit_to = bit_from+2*read_len; other_holder.CopyBits(bit_from, bit_to, m_storage, destination_first_bit, m_storage.size()); } // removes first sequence void PopFront() { m_total_seq -= m_read_length.front(); if(m_total_seq == 0) { Clear(); } else { int nextp = m_front_shift+2*m_read_length.front(); m_read_length.pop_front(); m_front_shift = nextp%64; for(int num = nextp/64; num > 0; --num) m_storage.pop_front(); } } // swaps contents with other void Swap(CReadHolder& other) { swap(m_storage, other.m_storage); swap(m_read_length, other.m_read_length); swap(m_total_seq, other.m_total_seq); swap(m_front_shift, other. 
m_front_shift); } // deletes all sequences and releases memory void Clear() { CReadHolder(m_contains_paired).Swap(*this); } // Total nucleotide count of the sequnce size_t TotalSeq() const { return m_total_seq; } // Maximal length of included sequences size_t MaxLength() const { if(m_read_length.empty()) return 0; else return *max_element(m_read_length.begin(), m_read_length.end()); } // the number of kmers of give length that could be generated size_t KmerNum(unsigned kmer_len) const { size_t num = 0; if(m_read_length.empty()) return num; for(auto l : m_read_length) { if(l >= kmer_len) num += l-kmer_len+1; } return num; } // total number of sequences size_t ReadNum() const { return m_read_length.size(); } size_t MemoryFootprint() const { return 8*m_storage.size()+4*m_read_length.size(); } // memory in bytes // shortest sequence length at xx% of total length size_t NXX(double xx) const { vector read_length(m_read_length.begin(), m_read_length.end()); sort(read_length.begin(), read_length.end()); size_t nxx = 0; size_t len = 0; for(int j = (int)read_length.size()-1; j >= 0 && len < xx*m_total_seq; --j) { nxx = read_length[j]; len += read_length[j]; } return nxx; } // shortest sequence length at 50% of total length size_t N50() const { return NXX(0.5); } // iterator-type clas to access kmers class kmer_iterator; kmer_iterator kend() const { return kmer_iterator(0, *this, 2*m_total_seq); } kmer_iterator kbegin(int kmer_len) const { return kmer_iterator(kmer_len, *this); } class kmer_iterator { public: // dereference operator; returns value! 
TKmer operator*() const { TKmer kmer(m_kmer_len, 0); uint64_t* guts = kmer.getPointer(); size_t bit_from = m_readholderp->m_front_shift+m_position; size_t bit_to = bit_from+2*m_kmer_len; m_readholderp->CopyBits(bit_from, bit_to, guts, 0, (2*m_kmer_len+63)/64); return kmer; } // iterator advance kmer_iterator& operator++() { if(m_position == 2*(m_readholderp->m_total_seq-m_kmer_len)) { m_position = 2*m_readholderp->m_total_seq; return *this; } m_position += 2; if(++m_position_in_read == m_readholderp->m_read_length[m_read]-m_kmer_len+1) { m_position += 2*(m_kmer_len-1); ++m_read; m_position_in_read = 0; SkipShortReads(); } return *this; } // doesn't check read boundaries - should be used only if landing in the SAME read kmer_iterator& operator+=(int l) { m_position += 2*l; m_position_in_read += l; return *this; } friend bool operator==(kmer_iterator const& li, kmer_iterator const& ri) { return li.m_position == ri.m_position && li.m_readholderp == ri.m_readholderp; } friend bool operator!=(kmer_iterator const& li, kmer_iterator const& ri) { return li.m_position != ri.m_position || li.m_readholderp != ri.m_readholderp; } friend class CReadHolder; private: kmer_iterator(int kmer_len, const CReadHolder& rholder, size_t position = 0, size_t position_in_read = 0, size_t read = 0) : m_readholderp(&rholder), m_read(read), m_position(position), m_kmer_len(kmer_len), m_position_in_read(position_in_read) { SkipShortReads(); } void SkipShortReads() { while(m_position < 2*m_readholderp->m_total_seq && m_read < m_readholderp->m_read_length.size() && m_readholderp->m_read_length[m_read] < m_kmer_len) m_position += 2*m_readholderp->m_read_length[m_read++]; } const CReadHolder* m_readholderp; size_t m_read; // read number size_t m_position; // BIT num in concatenated sequence uint32_t m_kmer_len; uint32_t m_position_in_read; // SYMBOL in read }; // iterator-type clas to access reads string_iterator send() const { return string_iterator(*this, 2*m_total_seq, m_read_length.size()); } 
string_iterator sbegin() const { return string_iterator(*this); } enum {eSingle = 0, eFirstMate = 1, eSecondMate = 2}; class string_iterator { public: string_iterator() : m_readholderp(nullptr), m_position(0), m_read(0) {} string operator*() const { int read_length = m_readholderp->m_read_length[m_read]; string read; read.reserve(read_length); size_t position = m_position+m_readholderp->m_front_shift+2*(read_length-1); for(int i = 0; i < read_length; ++i) { read.push_back(bin2NT[(m_readholderp->m_storage[position/64] >> position%64) & 3]); position -= 2; } return read; } // returns inversed binary sequence (not complemented) // assumes that destination is extended properly and filled with 0s void BSeq(int shift, uint64_t* destination) const { size_t position = m_position+m_readholderp->m_front_shift+2*shift; size_t len = 2*(ReadLen()-shift); m_readholderp->CopyBits(position, position+len, destination, 0, (len+63)/64); } // returns clipped binary sequence in correct order // assumes that destination is extended properly and filled with 0s // left/right refer to the original sequence void TrueBSeq(size_t left_clip, size_t right_clip, bool reverse_complement, uint64_t* destination) const { auto Reverse = [](uint64_t& word) { word = ((word & 0x3333333333333333) << 2) | ((word >> 2) & 0x3333333333333333); // swap adjacent pairs word = ((word & 0x0F0F0F0F0F0F0F0F) << 4) | ((word >> 4) & 0x0F0F0F0F0F0F0F0F); // swap nibbles word = ((word & 0x00FF00FF00FF00FF) << 8) | ((word >> 8) & 0x00FF00FF00FF00FF); // swap bytes word = ((word & 0x0000FFFF0000FFFF) << 16) | ((word >> 16) & 0x0000FFFF0000FFFF); // swap 16 bit chunks word = ((word & 0x00000000FFFFFFFF) << 32) | ((word >> 32) & 0x00000000FFFFFFFF); // swap 32 bit chunks }; size_t position = m_position+m_readholderp->m_front_shift+2*right_clip; // sequence stored reversed size_t len = 2*(ReadLen()-right_clip-left_clip); size_t destination_size = (len+63)/64; if(reverse_complement) { m_readholderp->CopyBits(position, 
position+len, destination, 0, destination_size); // already reversed; not complemented for(size_t p = 0; p < destination_size; ++p) // complement (will also convert trailing As into Ts) destination[p] ^= 0xAAAAAAAAAAAAAAAA; int partial_bits = len%64; if(partial_bits > 0) // remove trailing Ts destination[destination_size-1] &= (1ULL << partial_bits) - 1; } else { int shift_to_right_end = 64*destination_size-len; m_readholderp->CopyBits(position, position+len, destination, shift_to_right_end, destination_size); // reversed and shifted to the end of the destination for(size_t p = 0; p < destination_size/2; ++p) { swap(destination[p], destination[destination_size-1-p]); Reverse(destination[p]); Reverse(destination[destination_size-1-p]); } if(destination_size%2) Reverse(destination[destination_size/2]); } } // returns number of equal nucleotides (2bit) from the beginning // could be longer than actual sequence length if sequence is not multiple of 32 static size_t CommomSeqLen(const uint64_t* seq1p, const uint64_t* seq2p, size_t word_len) { auto last = seq1p+word_len; auto mism = mismatch(seq1p, last, seq2p); size_t extend = 32*(mism.first-seq1p); if(mism.first != last) extend += (ffsll(*mism.first ^ *mism.second)-1)/2; // after ^ all matches are 0s; ffs returns 1-based position of the first bit set to 1 return extend; } string_iterator& operator++() { if(m_read == m_readholderp->m_read_length.size()) return *this; m_position += 2*m_readholderp->m_read_length[m_read++]; return *this; } size_t ReadLen() const { return m_readholderp->m_read_length[m_read]; } kmer_iterator KmersForRead(int kmer_len) const { if(kmer_len <= (int)m_readholderp->m_read_length[m_read]) return kmer_iterator(kmer_len, *m_readholderp, m_position, 0, m_read); else return m_readholderp->kend(); } size_t Hash() const { hash h1; hash h2; return h1(m_readholderp)^h2(m_position); } struct SHash { size_t operator()(const string_iterator& is) const { return is.Hash(); } }; bool HasMate() const { return 
m_readholderp->m_contains_paired; } int PairType() const { if(!m_readholderp->m_contains_paired) return eSingle; else if(m_read%2) // odd return eSecondMate; else // even return eFirstMate; } string_iterator GetMate() const { // undefined behavior if not paired container if(m_read%2) // odd return string_iterator(*m_readholderp, m_position-2*m_readholderp->m_read_length[m_read-1], m_read-1); else // even return string_iterator(*m_readholderp, m_position+2*m_readholderp->m_read_length[m_read], m_read+1); } friend bool operator==(const string_iterator& li, const string_iterator& ri) { return li.m_read == ri.m_read && li.m_readholderp == ri.m_readholderp; } friend bool operator!=(const string_iterator& li, const string_iterator& ri) { return li.m_read != ri.m_read || li.m_readholderp != ri.m_readholderp; } friend class CReadHolder; private: string_iterator(const CReadHolder& rholder, size_t position = 0, size_t read = 0) : m_readholderp(&rholder), m_position(position), m_read(read) {} const CReadHolder* m_readholderp; size_t m_position; size_t m_read; }; private: // efficiently copies sequence to destination without converting it to string // assumes that destination is extended properly and filled with 0; destination_size - number of 'used' 8-byte words in destination after copy template void CopyBits(size_t bit_from, size_t bit_to, Dest& destination, size_t destination_bit_from, size_t destination_size) const { if(bit_to <= bit_from) return; size_t word = bit_from/64; size_t last_word = (bit_to-1)/64; unsigned shift = bit_from%64; size_t destination_word = destination_bit_from/64; unsigned destination_shift = destination_bit_from%64; if(shift > 0) { // first word partial uint64_t chunk = (m_storage[word++] >> shift); if(destination_shift > 0) { // destination word partial destination[destination_word] += (chunk << destination_shift); if(shift <= destination_shift) // we used all remaining destination word ++destination_word; if(shift < destination_shift && 
destination_word < destination_size) // first word spills out destination[destination_word] += (chunk >> (64-destination_shift)); } else { // desination word is not partial - it is bigger than chunk destination[destination_word] = chunk; } destination_shift = (destination_shift+64-shift)%64; } for( ; word <= last_word; ++word, ++destination_word) { if(destination_shift > 0) { destination[destination_word] += (m_storage[word] << destination_shift); if(destination_word+1 < destination_size) destination[destination_word+1] += (m_storage[word] >> (64-destination_shift)); } else { destination[destination_word] = m_storage[word]; } } int partial_bits = (destination_bit_from+bit_to-bit_from)%64; if(partial_bits > 0) { uint64_t mask = (1ULL << partial_bits) - 1; destination[destination_size-1] &= mask; } } deque m_storage; deque m_read_length; size_t m_total_seq; int m_front_shift; bool m_contains_paired; }; typedef vector> TBins; // pair of position,count // simple heuristic to find a valley/peak in a histogram int FindValleyAndPeak(const TBins& bins, int rlimit) { int SLOPE_LEN = 5; int peak = min(rlimit,(int)bins.size()-SLOPE_LEN-1); while(peak >= SLOPE_LEN) { bool maxim = true; for(int i = 1; i <= SLOPE_LEN && maxim; ++i) maxim = bins[peak+i].second < bins[peak].second; for(int i = 1; i <= SLOPE_LEN && maxim; ++i) maxim = bins[peak-i].second < bins[peak].second; if(maxim) break; --peak; } if(peak < SLOPE_LEN) return -1; int valley = 0; for(int i = 1; i <= peak; ++i) { if(bins[i].second < bins[valley].second) valley = i; } if(valley == peak) return -1; for(int i = valley; i < (int)bins.size(); ++i) { if(bins[i].second > bins[peak].second) peak = i; } if(bins[valley].second < 0.7*bins[peak].second) return valley; else return -1; } // a simple heuristic to find main range in a histogram pair HistogramRange(const TBins& bins) { // returns ; valley == -1 if not found unsigned MIN_NUM = 100; size_t gsize = 0; for(auto& bin : bins) { if(bin.second >= MIN_NUM) gsize += 
bin.first*bin.second; } // step back over repeats and plasmids that are not likely to be more than 20 percent of the genome int rl = 0; size_t gs = 0; for(auto& bin : bins) { gs += bin.first*bin.second; if(rl < (int)bins.size()-1) ++rl; if(gs > 0.8*gsize) break; } // find histogram portion with biggest volume and estimate genome size as number of kmers in the portion int valley = -1; int rlimit = rl; size_t genome = 0; size_t genome_vol = 0; while(true) { int v = FindValleyAndPeak(bins, rl); size_t g = 0; size_t g_vol = 0; for(int i = max(0, v); i <= rl; ++i) { g_vol += (bins[i].first*bins[i].second); g += bins[i].second; } if((v >= 0 && g > genome) || g_vol > genome_vol) { valley = v; rlimit = rl; genome = g; genome_vol = g_vol; // cerr << valley << " " << rlimit << " " << genome << endl; } if(v < 0) break; rl = v; } return make_pair(valley, rlimit); } double GetAverageCount(const TBins& bins) { pair grange = HistogramRange(bins); if(grange.first < 0) grange.first = 0; size_t genome = 0; size_t kmers = 0; for(int i = grange.first; i <= grange.second; ++i) { genome += bins[i].second; kmers += bins[i].first*bins[i].second; } if(genome > 0) return double(kmers)/genome; else return 0.; } size_t CalculateGenomeSize(const TBins& bins) { pair grange = HistogramRange(bins); if(grange.first < 0) grange.first = 0; size_t genome = 0; for(int i = grange.first; i <= grange.second; ++i) genome += bins[i].second; return genome; } template class CKmerMap { // A hash with kmer as a key // Implemented using a boost::variant of unordered_map<,V> with maximal N = 16 which allows kmer size up to 512 public: typedef V MappedType; typedef TKmerMapN Type; CKmerMap(int kmer_len = 0) : m_kmer_len(kmer_len) { if(m_kmer_len > 0) m_container = CreateVariant, TLargeIntMap, V>((m_kmer_len+31)/32); } size_t Size() const { return apply_visitor(container_size(), m_container); } // number of elements in the container void Reserve(size_t rsrv) { apply_visitor(reserve(rsrv), m_container); } // 
reserves hash table for rsrv elements void Clear() { apply_visitor(clear(), m_container); } // clear hash table V& operator[] (const TKmer& kmer) { if(m_kmer_len == 0) throw runtime_error("Can't insert in uninitialized container"); return apply_visitor(mapper(kmer), m_container); } V* Find(const TKmer& kmer) { return apply_visitor(find(kmer), m_container); } // returns nullptr if not found int KmerLen() const { return m_kmer_len; } template void GetInfo(Prob& prob) { apply_visitor(get_info(prob), m_container); } // scans the containier and calls prob(k, v) for each mapped element private: template struct get_info : public boost::static_visitor<> { get_info(Prob& p) : prob(p) {} template void operator()(T& v) const { for(auto& val : v) prob(TKmer(val.first), val.second); } Prob& prob; }; struct container_size : public boost::static_visitor { template size_t operator()(const T& v) const { return v.size();} }; struct clear : public boost::static_visitor<> { template void operator()(const T& v) const { v.clear();} }; struct reserve : public boost::static_visitor<> { reserve(size_t r) : rsrv(r) {} template void operator() (T& v) const { v.reserve(rsrv); } size_t rsrv; }; struct mapper : public boost::static_visitor { mapper(const TKmer& k) : kmer(k) {} template V& operator()(T& v) const { typedef typename T::key_type large_t; return v[kmer.get()]; } const TKmer& kmer; }; struct find : public boost::static_visitor { find(const TKmer& k) : kmer(k) {} template V* operator()(T& v) const { typedef typename T::key_type large_t; typename T::iterator it = v.find(kmer.get()); if(it != v.end()) return &(it->second); else return 0; } const TKmer& kmer; }; Type m_container; int m_kmer_len; }; template using TKmerMap = CKmerMap; // for compatibility with previous code }; // namespace #endif /* _common_util_ */ 
SKESA-2.3.0/concurrenthash.hpp000066400000000000000000002017151335720214300161270ustar00rootroot00000000000000/*=========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. 
* * =========================================================================== * */ #ifndef _Concurrent_Hash_ #define _Concurrent_Hash_ #include "Integer.hpp" #include "common_util.hpp" // This file contains classes which facilitate basic operation of storing reads, counting kmers, // and creating and traversing a de Bruijn graph using namespace std; namespace DeBruijn { template // in bytes class CConcurrentBlockedBloomFilter { public: enum EInsertResult {eNewKmer = 0, eAboveThresholdKmer = 1, eExistingKmer = 2}; // table_size - number of counting elements in bloom filter // counter_bit_size - number of bith per counter (2, 4, 8) // hash_num - number of has functions (generated from two) CConcurrentBlockedBloomFilter(size_t table_size, int counter_bit_size, int hash_num, int min_count) { Reset(table_size, counter_bit_size, hash_num, min_count); } void Reset(size_t table_size, int counter_bit_size, int hash_num, int min_count) { assert(counter_bit_size <= 8); m_counter_bit_size = counter_bit_size; m_hash_num = hash_num; m_max_element = (1 << m_counter_bit_size) - 1; m_min_count = min(min_count, m_max_element); m_elements_in_block = 8*BlockSize/m_counter_bit_size; m_blocks = ceil((double)table_size/m_elements_in_block); m_table_size = m_blocks*m_elements_in_block; m_count_table.clear(); m_status.clear(); m_count_table.resize(m_blocks); m_status.resize(m_blocks); } EInsertResult Insert(size_t hashp, size_t hashm) { size_t ind = hashp%m_blocks; auto& block = m_count_table[ind]; if(Test(hashp, hashm) >= m_min_count) return eExistingKmer; while(!m_status[ind].Set(1)); int count = Test(hashp, hashm); if(count >= m_min_count) { m_status[ind] = 0; return eExistingKmer; } for(int h = 1; h < m_hash_num; ++h) { hashp += hashm; size_t pos = (hashp&(m_elements_in_block-1))*m_counter_bit_size; // bit position of the counting element in block auto& cell = block.m_data[pos >> m_bits_in_cell_log]; int shift = pos&(m_bits_in_cell-1); int cnt = (cell >> shift)&m_max_element; if(cnt 
<= count) cell += ((TCell)1 << shift); } m_status[ind] = 0; if(count == 0) return eNewKmer; else if(count == m_min_count-1) return eAboveThresholdKmer; else return eExistingKmer; } int Test(size_t hashp, size_t hashm) const { auto& block = m_count_table[hashp%m_blocks]; int count = m_max_element; for(int h = 1; h < m_hash_num; ++h) { hashp += hashm; size_t pos = (hashp&(m_elements_in_block-1))*m_counter_bit_size; // bit position of the counting element in block auto& cell = block.m_data[pos >> m_bits_in_cell_log]; int shift = pos&(m_bits_in_cell-1); int cnt = (cell >> shift)&m_max_element; if(cnt < count) count = cnt; } return count; } int MaxElement() const { return m_max_element; } int HashNum() const { return m_hash_num; } size_t TableSize() const { return m_table_size; } // number of counters size_t TableFootPrint() const { return (sizeof(SBloomBlock)+1)*m_count_table.size(); } // bytes private: typedef uint64_t TCell; struct alignas(64) SBloomBlock { SBloomBlock() { memset(m_data.data(), 0, BlockSize); } array m_data; }; vector m_count_table; vector> m_status; size_t m_elements_in_block; size_t m_blocks; size_t m_table_size; int m_counter_bit_size; int m_hash_num; int m_min_count; int m_max_element; int m_bits_in_cell = 8*sizeof(TCell); int m_bits_in_cell_log = log(m_bits_in_cell)/log(2); }; typedef CConcurrentBlockedBloomFilter<128> TConcurrentBlockedBloomFilter; // minimalistic multithread safe forward list // allows reading and inserts in the beginning // reading thread will not see new entries inserted after reading started template class CForwardList { public: struct SNode { E m_data = E(); SNode* m_next = nullptr; }; template class iterator : public std::iterator { public: template struct choose; template struct choose { typedef IsTrue type; }; template struct choose { typedef IsFalse type; }; typedef typename choose::type reference; typedef typename choose::type pointer; typedef typename choose::type node_pointer; iterator(SNode* node = nullptr) : 
m_node(node) {}; iterator& operator++() { m_node = m_node->m_next; return *this; } reference& operator*() { return m_node->m_data; } pointer operator->() { return &m_node->m_data; } node_pointer NodePointer() { return m_node; } bool operator!=(const iterator& i) const { return i.m_node != m_node; } private: friend class CForwardList; SNode* m_node; }; iterator begin() { return iterator(m_head.load()); } iterator end() { return iterator(); } iterator begin() const { return iterator(m_head.load()); } iterator end() const { return iterator(); } CForwardList() { m_head.store(nullptr); } // not mutithread safe CForwardList& operator=(const CForwardList& other) { Clear(); for(auto it = other.begin(); it != other.end(); ++it) PushFront(*it); return *this; } CForwardList(const CForwardList& other) { m_head.store(nullptr); for(auto it = other.begin(); it != other.end(); ++it) PushFront(*it); } ~CForwardList() { Clear(); } E& front() { return m_head.load()->m_data; } SNode* Head() const { return m_head; } SNode* NewNode(const E& e) { SNode* p = new SNode; p->m_data = e; p->m_next = m_head; return p; } SNode* NewNode() { SNode* p = new SNode; p->m_next = m_head; return p; } bool TryPushFront(SNode* nodep) { return m_head.compare_exchange_strong(nodep->m_next, nodep); } E* Emplace() { SNode* p = NewNode(); while (!m_head.compare_exchange_strong(p->m_next, p)); return &(p->m_data); } void PushFront(const E& e) { SNode* p = NewNode(e); while (!m_head.compare_exchange_strong(p->m_next, p)); } // not mutithread safe template void remove_if(const P& prob) { while(m_head.load() != nullptr && prob(m_head.load()->m_data)) { SNode* p = m_head; m_head = p->m_next; delete p; } for(SNode* p = m_head; p != nullptr; ) { SNode* after = p->m_next; if(after != nullptr && prob(after->m_data)) { p->m_next = after->m_next; delete after; } else { p = p->m_next; } } } void Save(ostream& os) const { size_t vsize = sizeof(E); os.write(reinterpret_cast(&vsize), sizeof vsize); size_t elements = 
distance(begin(), end()); os.write(reinterpret_cast(&elements), sizeof elements); for(auto& elem : *this) os.write(reinterpret_cast(&elem), sizeof elem); if(!os) throw runtime_error("Error in CForwardList write"); } void Load(istream& is) { Clear(); size_t vsize; if(!is.read(reinterpret_cast(&vsize), sizeof vsize)) throw runtime_error("Error in CForwardList read"); if(vsize != sizeof(E)) throw runtime_error("Wrong format for CForwardList load"); size_t elements; if(!is.read(reinterpret_cast(&elements), sizeof elements)) throw runtime_error("Error in CForwardList read"); for( ; elements > 0; --elements) { E* p = Emplace(); if(!is.read(reinterpret_cast(p), sizeof *p)) throw runtime_error("Error in CForwardList read"); } } void Clear() { for(SNode* p = m_head; p != nullptr; ) { auto tmp = p->m_next; delete p; p = tmp; } m_head.store(nullptr); } void Init() { m_head.store(nullptr); } private: atomic m_head; }; // minimalistic deque-type container allowing multithread initialization of a large hash_table template class CDeque { public: typedef E value_type; CDeque(size_t size = 0) : m_chunks(1) { m_data.resize(m_chunks); Reset(size, m_chunks); } ~CDeque() { if(m_chunks > 1) { list> jobs; for(unsigned chunk = 0; chunk < m_chunks; ++chunk) jobs.push_back(bind(&CDeque::ReleaseChunk, this, chunk)); RunThreads(m_chunks, jobs); } } E& operator[](size_t index) { return m_data[index/m_chunk_size][index%m_chunk_size]; } const E& operator[](size_t index) const { return m_data[index/m_chunk_size][index%m_chunk_size]; } size_t Size() const { return m_size; } void Reset(size_t size, size_t chunks) { m_chunks = chunks; m_data.resize(m_chunks); m_size = size; m_chunk_size = (size+m_chunks-1)/m_chunks; if(m_chunks == 1) { ResetChunk(0, m_chunk_size); } else { list> jobs; for(unsigned chunk = 0; chunk < m_chunks; ++chunk) { size_t chunk_size = min(m_chunk_size, size); jobs.push_back(bind(&CDeque::ResetChunk, this, chunk, chunk_size)); size -= min(m_chunk_size, size); } 
RunThreads(m_chunks, jobs); } } void Swap(CDeque& other) { swap(m_chunks, other.m_chunks); swap(m_size, other.m_size); swap(m_chunk_size, other.m_chunk_size); swap(m_data, other.m_data); } void Save(ostream& os) const { size_t vsize = sizeof(E); os.write(reinterpret_cast(&vsize), sizeof vsize); os.write(reinterpret_cast(&m_chunks), sizeof m_chunks); os.write(reinterpret_cast(&m_size), sizeof m_size); os.write(reinterpret_cast(&m_chunk_size), sizeof m_chunk_size); for(auto& chunk : m_data) { size_t num = chunk.size(); os.write(reinterpret_cast(&num), sizeof num); os.write(reinterpret_cast(chunk.data()), num*vsize); } if(!os) throw runtime_error("Error in CDeque write"); } void Load(istream& is) { size_t vsize; if(!is.read(reinterpret_cast(&vsize), sizeof vsize)) throw runtime_error("Error in CDeque read"); if(vsize != sizeof(E)) throw runtime_error("Wrong format for CDeque load"); if(!is.read(reinterpret_cast(&m_chunks), sizeof m_chunks)) throw runtime_error("Error in CDeque read"); if(!is.read(reinterpret_cast(&m_size), sizeof m_size)) throw runtime_error("Error in CDeque read"); if(!is.read(reinterpret_cast(&m_chunk_size), sizeof m_chunk_size)) throw runtime_error("Error in CDeque read"); m_data.clear(); m_data.resize(m_chunks); for(auto& chunk : m_data) { size_t num; if(!is.read(reinterpret_cast(&num), sizeof num)) throw runtime_error("Error in CDeque read"); chunk.resize(num); if(!is.read(reinterpret_cast(chunk.data()), num*vsize)) throw runtime_error("Error in CDeque read"); } } private: void ResetChunk(size_t chunk, size_t chunk_size) { m_data[chunk].clear(); m_data[chunk].resize(chunk_size); } void ReleaseChunk(size_t chunk) { vector().swap(m_data[chunk]); } size_t m_chunks = 0; size_t m_size = 0; size_t m_chunk_size = 0; vector> m_data; }; // BucketBlock <= 32 // Moderate value of BucketBlock will improve memory cache use // Larger values will reduce the number of entries in the spillover lists but eventually will increase the search time // 0 (all entries 
in the lists) is permitted and could be used for reduction of the table size for large sizeof(V) template class CKmerHashMap { static_assert(BucketBlock <= 32, ""); public: CKmerHashMap(int kmer_len = 0, size_t size = 0) : m_kmer_len(kmer_len) { if(m_kmer_len > 0) m_hash_table = CreateVariant, THashBlockVec, MappedV>((m_kmer_len+31)/32); Reset(size, 1); } void Reset(size_t size, size_t chunks) { size_t blocks = size/max(1,BucketBlock); if(size%max(1,BucketBlock)) ++blocks; m_table_size = max(1,BucketBlock)*blocks; apply_visitor(resize(blocks, chunks), m_hash_table); } class Index { public: Index(size_t ind = 0, void* ptr = nullptr) : m_index(ind), m_lstp(ptr) {} void Advance(CKmerHashMap& hash) { apply_visitor(CKmerHashMap::index_advance(*this), hash.m_hash_table); } pair GetElement(CKmerHashMap& hash) const { return apply_visitor(CKmerHashMap::index_get(*this), hash.m_hash_table); }; pair GetElement(const CKmerHashMap& hash) const { return apply_visitor(CKmerHashMap::index_get(*this), const_cast(hash).m_hash_table); }; MappedV* GetMapped(CKmerHashMap& hash) const { return apply_visitor(CKmerHashMap::index_get_mapped(*this), hash.m_hash_table); }; const MappedV* GetMapped(const CKmerHashMap& hash) const { return apply_visitor(CKmerHashMap::index_get_mapped(*this), const_cast(hash).m_hash_table); }; const uint64_t* GetKeyPointer(const CKmerHashMap& hash) const { return apply_visitor(CKmerHashMap::index_get_keyp(*this), const_cast(hash).m_hash_table); }; bool operator==(const Index& other) const { return m_index == other.m_index && m_lstp == other.m_lstp; } bool operator!=(const Index& other) const { return !operator==(other); } bool operator<(const Index& other) const { if(m_index == other.m_index) return m_lstp < other.m_lstp; else return m_index < other.m_index; } bool operator>(const Index& other) const { if(m_index == other.m_index) return m_lstp > other.m_lstp; else return m_index > other.m_index; } struct Hash { size_t operator()(const Index& index) const { 
return std::hash()(index.m_index)^std::hash()(index.m_lstp); } }; protected: friend class CKmerHashMap; size_t m_index; // index; list considered a single entry void* m_lstp; // pointer to list element }; Index EndIndex() const { return Index((BucketBlock+1)*BucketsNum(), nullptr); } class Iterator : public Index { public: Iterator(const Index& index, CKmerHashMap* hp) : Index(index), hashp(hp) {} Iterator(size_t ind, void* ptr, CKmerHashMap* hp) : Index(ind, ptr), hashp(hp) {} Iterator& operator++() { this->Advance(*hashp); return *this; } pair GetElement() { return Index::GetElement(*hashp); } MappedV* GetMapped() { return Index::GetMapped(*hashp); } const uint64_t* GetKeyPointer() { return Index::GetKeyPointer(*hashp); } private: CKmerHashMap* hashp; }; Iterator Begin() { return Iterator(apply_visitor(hash_begin(0), m_hash_table), this); } Iterator End() { return Iterator((BucketBlock+1)*BucketsNum(), nullptr, this); } Iterator FirstForBucket(size_t bucket) { return Iterator(apply_visitor(hash_begin(bucket), m_hash_table), this); } vector Chunks(int desired_num) { vector chunks; if(BucketsNum() == 0) return chunks; size_t step = BucketsNum()/desired_num+1; for(size_t bucket = 0; bucket < BucketsNum(); ) { chunks.push_back(FirstForBucket(bucket)); bucket = chunks.back().m_index/(BucketBlock+1)+step; } if(chunks.back() != End()) chunks.push_back(End()); return chunks; } //returns pointer to mapped value if exists, otherwise nullptr MappedV* Find(const TKmer& kmer) { if(m_table_size == 0) return nullptr; else return apply_visitor(find(kmer), m_hash_table); } //returns index in hash table Index FindIndex(const TKmer& kmer) { if(m_table_size == 0) return EndIndex(); else return apply_visitor(find_index(kmer), m_hash_table); } // if kmer already included returns pointer to mapped value // if not inserts a new entry and returns pointer to default value // caller MUST update the mapped value // assumes that any updates will be atomic MappedV* FindOrInsert(const TKmer& 
kmer) { size_t index = kmer.oahash()%m_table_size; return FindOrInsertInBucket(kmer, index); } MappedV* FindOrInsertInBucket(const TKmer& kmer, size_t index) {return apply_visitor(find_or_insert(kmer,index), m_hash_table); } void Swap(CKmerHashMap& other) { apply_visitor(swap_with_other(), m_hash_table, other.m_hash_table); swap(m_table_size, other.m_table_size); swap(m_kmer_len, other.m_kmer_len); } int KmerLen() const { return m_kmer_len; } size_t TableSize() const { return m_table_size; } size_t TableFootPrint() const { return apply_visitor(hash_footprint(), m_hash_table); } size_t BucketsNum() const { return m_table_size/max(1,BucketBlock); } void Info() const { apply_visitor(info(), m_hash_table); } void Save(ostream& os) const { os.write(reinterpret_cast(&m_table_size), sizeof m_table_size); os.write(reinterpret_cast(&m_kmer_len), sizeof m_kmer_len); apply_visitor(save(os), m_hash_table); if(!os) throw runtime_error("Error in CKmerHashMap write"); } void Load(istream& is) { if(!is.read(reinterpret_cast(&m_table_size), sizeof m_table_size)) throw runtime_error("Error in CKmerHashMap read"); if(!is.read(reinterpret_cast(&m_kmer_len), sizeof m_kmer_len)) throw runtime_error("Error in CKmerHashMap read"); m_hash_table = CreateVariant, THashBlockVec, MappedV>((m_kmer_len+31)/32); apply_visitor(load(is), m_hash_table); } protected: friend class Index; template struct SHashBlock { typedef LargeInt large_t; typedef V mapped_t; typedef pair element_t; typedef CForwardList list_t; typedef typename list_t::SNode snode_t; enum States : uint64_t {eAssigned = 1, eKeyExists = 2}; enum { eBucketBlock = BucketBlock }; SHashBlock() : m_status(0) {} SHashBlock(const SHashBlock& other) : m_data(other.m_data), m_extra(other.m_extra), m_status(other.m_status.load()) {} // used for table initialisation only SHashBlock& operator=(const SHashBlock& other) { m_data = other.m_data; m_extra = other.m_extra; m_status.store(other.m_status.load()); return *this; } pair Find(const large_t& 
k, int hint) { if(BucketBlock > 0) { //try exact position first if(isEmpty(hint)) { return make_pair(BucketBlock+1, nullptr); } else { Wait(hint); if(m_data[hint].first == k) return make_pair(hint, nullptr); } //scan array for(int shift = 0; shift < BucketBlock; ++shift) { if(shift != hint) { if(isEmpty(shift)) { return make_pair(BucketBlock+1, nullptr); } else { Wait(shift); if(m_data[shift].first == k) return make_pair(shift, nullptr); } } } } //scan spillover list for(auto it = m_extra.begin(); it != m_extra.end(); ++it) { auto& cell = *it; if(cell.first == k) return make_pair(BucketBlock, it.NodePointer()); } return make_pair(BucketBlock+1, nullptr); } // 1. Try to put to exact position prescribed by hash // 2. Put in the lowest available array element // 3. Put in the spillover list mapped_t* FindOrInsert(const large_t& k, int hint) { auto TryCell = [&](int shift) { auto& cell = m_data[shift]; //try to grab if(Lock(shift, k)) return &cell.second; //already assigned to some kmer //wait if kmer is not stored yet Wait(shift); if(cell.first == k) // kmer matches return &cell.second; else return (mapped_t*)nullptr; // other kmer }; if(BucketBlock > 0) { //try exact position first auto rslt = TryCell(hint); if(rslt != nullptr) return rslt; //scan remaining array for(int shift = 0; shift < BucketBlock; ++shift) { if(shift != hint) { auto rslt = TryCell(shift); if(rslt != nullptr) return rslt; } } } //scan spillover list auto existing_head = m_extra.Head(); for(auto p = existing_head; p != nullptr; p = p->m_next) { if(p->m_data.first == k) return &(p->m_data.second); } typename list_t::SNode* nodep = new typename list_t::SNode; nodep->m_data.first = k; nodep->m_next = existing_head; while(!m_extra.TryPushFront(nodep)) { //check if a new elemet matches for(auto p = nodep->m_next; p != existing_head; p = p->m_next) { if(p->m_data.first == k) { delete nodep; return &(p->m_data.second); } } existing_head = nodep->m_next; } return &(nodep->m_data.second); } element_t* 
IndexGet(int shift, void* lstp) { if(shift < BucketBlock) { // array element return &m_data[shift]; } else { //list element snode_t* ptr = reinterpret_cast(lstp); return &(ptr->m_data); } } bool Lock(int shift, const large_t& kmer) { uint64_t assigned = eAssigned << 2*shift; uint64_t expected = m_status; do { if(expected&assigned) return false; } while(!m_status.compare_exchange_strong(expected, expected|assigned)); m_data[shift].first = kmer; m_status |= eKeyExists << 2*shift; return true; } void Wait(int shift) { uint64_t keyexists = eKeyExists << 2*shift; while(!(m_status&keyexists)); } bool isEmpty(int shift) const { uint64_t assigned = eAssigned << 2*shift; return (!(m_status&assigned)); } void Move(element_t& cell, int to) { m_data[to] = cell; m_status |= (eAssigned|eKeyExists) << 2*to; cell.second = V(); } void Move(int from, int to) { Move(m_data[from], to); m_status &= ~((eAssigned|eKeyExists) << 2*from); // clear bits } void Clear(int shift) { m_data[shift].second = V(); m_status &= ~((uint64_t)(eAssigned|eKeyExists) << 2*shift); // clear bits } array m_data; list_t m_extra; atomic m_status; }; template using THashBlockVec = CDeque>; template using TKmerHashTable = BoostVariant; struct save : public boost::static_visitor { save(ostream& out) : os(out) {} template void operator()(const T& v) const { v.Save(os); size_t list_num = 0; for(size_t i = 0; i < v.Size(); ++i) { if(v[i].m_extra.Head() != nullptr) ++list_num; } os.write(reinterpret_cast(&list_num), sizeof list_num); for(size_t i = 0; i < v.Size(); ++i) { if(v[i].m_extra.Head() != nullptr) { os.write(reinterpret_cast(&i), sizeof i); v[i].m_extra.Save(os); } } } ostream& os; }; struct load : public boost::static_visitor { load(istream& in) : is(in) {} template void operator()(T& v) const { v.Load(is); size_t list_num; if(!is.read(reinterpret_cast(&list_num), sizeof list_num)) throw runtime_error("Error in CKmerHashMap read"); for( ; list_num > 0; --list_num) { size_t i; 
if(!is.read(reinterpret_cast(&i), sizeof i)) throw runtime_error("Error in CKmerHashMap read"); v[i].m_extra.Init(); v[i].m_extra.Load(is); } } istream& is; }; struct swap_with_other : public boost::static_visitor<> { template void operator() (T& a, T& b) const { a.Swap(b); } template void operator() (T& a, U& b) const { throw runtime_error("Can't swap different type containers"); } }; struct index_get : public boost::static_visitor> { index_get(const Index& ind) : index(ind) {} template pair operator()(T& v) const { auto elemp = v[index.m_index/(BucketBlock+1)].IndexGet(index.m_index%(BucketBlock+1), index.m_lstp); return make_pair(TKmer(elemp->first), &(elemp->second)); } const Index& index; }; struct index_get_mapped : public boost::static_visitor { index_get_mapped(const Index& ind) : index(ind) {} template MappedV* operator()(T& v) const { auto elemp = v[index.m_index/(BucketBlock+1)].IndexGet(index.m_index%(BucketBlock+1), index.m_lstp); return &(elemp->second); } const Index& index; }; struct index_get_keyp : public boost::static_visitor { index_get_keyp(const Index& ind) : index(ind) {} template const uint64_t* operator()(T& v) const { auto elemp = v[index.m_index/(BucketBlock+1)].IndexGet(index.m_index%(BucketBlock+1), index.m_lstp); return elemp->first.getPointer(); } const Index& index; }; template static Index next_available(T& v, size_t from) { for(size_t i = from; i < v.Size(); ++i) { auto& bucket = v[i]; for(int shift = 0; shift < BucketBlock; ++shift) { if(!bucket.isEmpty(shift)) return Index(i*(BucketBlock+1)+shift, nullptr); } if(bucket.m_extra.Head() != nullptr) return Index(i*(BucketBlock+1)+BucketBlock, bucket.m_extra.Head()); } return Index((BucketBlock+1)*v.Size(), nullptr); } struct index_advance : public boost::static_visitor<> { index_advance(Index& ind) : index(ind) {} template void operator()(T& v) const { typedef typename T::value_type::list_t::SNode snode_t; size_t ind = index.m_index; size_t i = ind/(BucketBlock+1); auto& bucket = 
v[i]; int shift = ind%(BucketBlock+1); if(shift < BucketBlock-1) { // not last array element - check all next elements while(++shift < BucketBlock) { if(!bucket.isEmpty(shift)) { index.m_index = i*(BucketBlock+1)+shift; return; } } } else if(shift == BucketBlock-1) { // last array element - check spillover list if(bucket.m_extra.Head() != nullptr) { ++index.m_index; index.m_lstp = bucket.m_extra.Head(); return; } } else { // spillover list - check next list element snode_t* ptr = reinterpret_cast(index.m_lstp); if(ptr->m_next != nullptr) { index.m_lstp = ptr->m_next; return; } } index = next_available(v, i+1); // look for next bucket } Index& index; }; struct hash_begin : public boost::static_visitor { hash_begin(size_t fr) : from(fr) {} template Index operator()(T& v) const { return next_available(v, from); } size_t from; }; struct hash_footprint : public boost::static_visitor { template size_t operator()(T& v) const { return sizeof(typename T::value_type)*v.Size(); } }; //returns pointer to mapped value if exists, otherwise nullptr struct find : public boost::static_visitor { find(const TKmer& k) : kmer(k) {} template MappedV* operator()(T& v) const { auto& k = kmer.get(); size_t pos = k.oahash()%(v.Size()*max(1,BucketBlock)); auto& bucket = v[pos/max(1,BucketBlock)]; int hint = pos%max(1,BucketBlock); auto rslt = bucket.Find(k, hint); if(rslt.first < BucketBlock) // found in array return &bucket.m_data[rslt.first].second; else if(rslt.first == BucketBlock) // found in list return &rslt.second->m_data.second; else // not found return nullptr; } const TKmer& kmer; }; //returns Index struct find_index : public boost::static_visitor { find_index(const TKmer& k) : kmer(k) {} template Index operator()(T& v) const { typedef typename T::value_type::large_t large_t; const large_t& k = kmer.get(); size_t pos = k.oahash()%(v.Size()*max(1,BucketBlock)); size_t bucket_num = pos/max(1,BucketBlock); int hint = pos%max(1,BucketBlock); auto rslt = v[bucket_num].Find(k, hint); 
if(rslt.first <= BucketBlock) // found in array return Index(bucket_num*(BucketBlock+1)+rslt.first, rslt.second); else // not found return Index((BucketBlock+1)*v.Size(), nullptr); } const TKmer& kmer; }; // if kmer already included returns pointer to mapped value // if not inserts a new entry and returns pointer to default value // caller MUST update the mapped value // assumes that any updated will be atomic struct find_or_insert : public boost::static_visitor { find_or_insert(const TKmer& k, size_t i) : kmer(k), index(i) {} template MappedV* operator()(T& v) const { typedef typename T::value_type::large_t large_t; const large_t& k = kmer.get(); size_t bucket_num = index/max(1,BucketBlock); int hint = index%max(1,BucketBlock); return v[bucket_num].FindOrInsert(k, hint); } const TKmer& kmer; size_t index; }; struct info : public boost::static_visitor<> { template void operator()(T& v) const { map numbers; for(size_t i = 0; i < v.Size(); ++i) { auto& bucket= v[i]; int num = distance(bucket.m_extra.begin(), bucket.m_extra.end()); for(int shift = 0; shift < BucketBlock; ++shift) { if(!bucket.isEmpty(shift)) ++num; } ++numbers[num]; } for(auto& rslt : numbers) cerr << "Bucket:\t" << rslt.first << "\t" << rslt.second << endl; } }; struct resize : public boost::static_visitor<> { resize(size_t s, size_t c) : size(s), chunks(c) {} template void operator()(T& v) const { v.Reset(size, chunks); } size_t size; size_t chunks; }; TKmerHashTable m_hash_table; size_t m_table_size; int m_kmer_len; }; struct SKmerCounter { SKmerCounter() : m_data(0) {} bool operator==(const SKmerCounter& kc) const { return kc.m_data == m_data; } uint32_t Increment(bool is_plus) { return (m_data.m_atomic += (is_plus ? 
0x100000001 : 1)); } uint32_t Count() const { return m_data; } // clips plus part SAtomic m_data; }; class CKmerHashCount : public CKmerHashMap { public: CKmerHashCount(int kmer_len = 0, size_t size = 0) : CKmerHashMap(kmer_len, size) {} // returns true if kmer was new bool UpdateCount(const TKmer& kmer, bool is_plus) { size_t index = kmer.oahash()%m_table_size; return (FindOrInsertInBucket(kmer, index)->Increment(is_plus) == 1); } size_t UpdateCounts(const CReadHolder::string_iterator& is, const TConcurrentBlockedBloomFilter& bloom, int min_count) { return apply_visitor(update_counts(is, bloom, min_count, m_kmer_len), m_hash_table); } // rehash bucket from other container void RehashOtherBuckets(CKmerHashCount& other, size_t bucket_from, size_t bucket_to) { apply_visitor(rehash_bucket(bucket_from, bucket_to, *this), m_hash_table, other.m_hash_table); } //remove false positives size_t CleanBuckets(int min_count, size_t bucket_from, size_t bucket_to) { return apply_visitor(clean_buckets(min_count, bucket_from, bucket_to, TableSize()), m_hash_table); } TBins GetBins() { map hist; for(auto index = Begin(); index != End(); ++index) { ++hist[index.GetMapped()->Count()]; } return TBins(hist.begin(), hist.end()); } private: struct update_counts : public boost::static_visitor { update_counts(const CReadHolder::string_iterator& i, const TConcurrentBlockedBloomFilter& bl, int mc, unsigned kl) : is(i), bloom(bl), min_count(mc), kmer_len(kl) {} template size_t operator() (T& v) const { if(v.Size() == 0) return 0; typedef typename T::value_type::large_t large_t; size_t read_len = is.ReadLen(); if(read_len < kmer_len) return 0; unsigned kmer_bytes = (2*kmer_len+7)/8; //number of whole bytes in kmer unsigned kmer_size = (2*kmer_len+63)/64; //number of whole 8-byte words in kmer int partial_bits = (2*kmer_len)%64; //number of used bits in partial 8 byte word (if any) uint64_t mask = numeric_limits::max(); if(partial_bits > 0) mask = (uint64_t(1) << partial_bits) - 1; size_t 
buf_size = (2*read_len+63)/64+1; uint64_t* read_buf = new uint64_t[buf_size]; //(enough + 1) 8-byte words for read (one extra because we'll copy kmers using whole bytes which can go beyond the sequence) size_t new_kmers = 0; large_t kmer(0); for(int shift = 0; shift < 4 && read_len-shift >= kmer_len; ++shift) { memset(read_buf, 0, 8*buf_size); is.BSeq(shift, read_buf); for(unsigned k = 0; k <= read_len-shift-kmer_len; k += 4) { // every 4th kmer on the byte boundary memcpy(kmer.getPointer(), (uint8_t*)read_buf+k/4, kmer_bytes); kmer.getPointer()[kmer_size-1] &= mask; large_t rkmer = revcomp(kmer, kmer_len); large_t* min_kmerp = &rkmer; bool is_plus = false; size_t hashp = rkmer.oahash(); size_t hashm = kmer.oahash(); if(kmer < rkmer) { min_kmerp = &kmer; is_plus = true; swap(hashp, hashm); } int bucket_block = T::value_type::eBucketBlock; size_t pos = hashp%(v.Size()*max(1,bucket_block)); size_t bucket_num = pos/max(1,bucket_block); int hint = pos%max(1,bucket_block); auto& bucket = v[bucket_num]; auto rslt = bucket.Find(*min_kmerp, hint); if(rslt.first < bucket_block) { // found in array if(bucket.m_data[rslt.first].second.Increment(is_plus) == 1) ++new_kmers; continue; } else if(rslt.first == bucket_block) { // found in list if(rslt.second->m_data.second.Increment(is_plus) == 1) ++new_kmers; continue; } if(min_count > 1 && bloom.Test(hashp, hashm) < min(min_count, bloom.MaxElement())) continue; if(bucket.FindOrInsert(*min_kmerp, hint)->Increment(is_plus) == 1) ++new_kmers; } } delete[] read_buf; return new_kmers; } const CReadHolder::string_iterator& is; const TConcurrentBlockedBloomFilter& bloom; int min_count; unsigned kmer_len; }; struct rehash_bucket : public boost::static_visitor<> { rehash_bucket(size_t bf, size_t bt, CKmerHashCount& h) : bucket_from(bf), bucket_to(bt), hash(h) {} template void operator() (T& a, T& b) const { typedef typename T::value_type::element_t element_t; for(size_t indexb = bucket_from; indexb <= bucket_to; ++indexb) { auto& bucket_b 
= b[indexb]; for(auto& cell : bucket_b.m_data) { if(cell.second.Count() != 0) { auto& kmer = cell.first; size_t indexa = kmer.oahash()%hash.TableSize(); *hash.FindOrInsertInBucket(TKmer(kmer), indexa) = cell.second; } } for(element_t& cell : bucket_b.m_extra) { auto& kmer = cell.first; size_t indexa = kmer.oahash()%hash.TableSize(); *hash.FindOrInsertInBucket(TKmer(kmer), indexa) = cell.second; } } } template void operator() (T& a, U& b) const { throw runtime_error("Can't rehash from different type container"); } size_t bucket_from; size_t bucket_to; CKmerHashCount& hash; }; struct clean_buckets : public boost::static_visitor { clean_buckets(int mc, size_t bf, size_t bt, size_t tb) : min_count(mc), bucket_from(bf), bucket_to(bt), table_size(tb) {} template size_t operator()(T& v) const { typedef typename T::value_type::element_t element_t; size_t num = 0; for(size_t bind = bucket_from; bind <= bucket_to; ++bind) { auto& bucket = v[bind]; int empty_cells = 0; auto Reposition = [&bucket, this](element_t& cell, unsigned limit) { size_t index = cell.first.oahash()%table_size; size_t orig_shift = index%bucket.m_data.size(); if(orig_shift == limit) return false; if(bucket.m_data[orig_shift].second.Count() < min_count) { if(limit < bucket.m_data.size()) bucket.Move(limit, orig_shift); else bucket.Move(cell, orig_shift); return orig_shift > limit; } for(unsigned shift = 0; shift < limit; ++shift) { if(shift != orig_shift && bucket.m_data[shift].second.Count() < min_count) { if(limit < bucket.m_data.size()) bucket.Move(limit, shift); else bucket.Move(cell, shift); return false; } } return false; }; for(unsigned shift = 0; shift < bucket.m_data.size(); ++shift) { auto& cell = bucket.m_data[shift]; auto count = cell.second.Count(); if(count < min_count) { ++empty_cells; if(count > 0) bucket.Clear(shift); } else { if(Reposition(cell, shift)) ++empty_cells; // moved down and created new empty cell (will be counted later) else ++num; // stayed or moved up } } for(auto& cell : 
bucket.m_extra) { if(cell.second.Count() >= min_count) { ++num; if(empty_cells > 0) { Reposition(cell, bucket.m_data.size()); --empty_cells; } } } bucket.m_extra.remove_if([this](const element_t& elem) {return elem.second.Count() < min_count;}); } return num; } unsigned min_count; size_t bucket_from; size_t bucket_to; size_t table_size; }; }; class CKmerHashCounter { public: CKmerHashCounter(const list>& reads, int kmer_len, int min_count, size_t estimated_kmer_num, bool is_stranded, int ncores, bool skip_bloom) : m_kmer_len(kmer_len), m_min_count(min_count), m_is_stranded(is_stranded), m_ncores(ncores), m_skip_bloom(skip_bloom), m_hash_table(m_kmer_len), m_estimated_table_size(0), m_estimated_uniq_kmers(0), m_kmer_num(0), m_kmer_num_raw(0), m_kmer_count(0), m_rehash_status(false) { m_kmer_step = max(1., 0.1*m_hash_table.TableSize()/m_ncores); for(auto& rholder : reads) m_start_position.push_back(make_pair(0, rholder[0].sbegin())); CStopWatch timer; TConcurrentBlockedBloomFilter bloom(0, 2, 1, m_min_count); if(!m_skip_bloom) { m_estimated_uniq_kmers.store(estimated_kmer_num); while(true) { timer.Restart(); int counter_bit_size = 2; for( ; counter_bit_size <= 8 && (1 << counter_bit_size)-1 < m_min_count; counter_bit_size *= 2); double false_positive_rate = 0.03; // size_t bloom_table_size = -1.5*(double)m_estimated_uniq_kmers.load()*log(false_positive_rate)/log(2.)/log(2.); // 50% extra because blocked size_t bloom_table_size = -(double)m_estimated_uniq_kmers.load()*log(false_positive_rate)/log(2.)/log(2.); // 50% extra because blocked int hash_num = ceil(-log(false_positive_rate)/log(2.)); bloom.Reset(bloom_table_size, counter_bit_size, hash_num, m_min_count); cerr << "\nBloom table size: " << bloom.TableSize() << "(" << 0.1*(bloom.TableFootPrint()/100000) << "MB)" << " Counter bit size: " << counter_bit_size << " Hash num: " << hash_num << endl; m_estimated_table_size.store(0); m_estimated_uniq_kmers.store(0); list> jobs; for(auto& job_input : reads) { 
if(job_input[0].ReadNum() > 0 || job_input[1].ReadNum() > 0) { // not empty jobs.push_back(bind(&CKmerHashCounter::InsertInBloomJob, this, ref(job_input), ref(bloom))); } } RunThreads(m_ncores, jobs); if(m_min_count == 1) m_estimated_table_size.store(m_estimated_uniq_kmers.load()); double kmers = m_estimated_uniq_kmers.load(); false_positive_rate = pow(1.-exp(-hash_num*kmers/bloom_table_size), hash_num); cerr << "Estimated kmers above threshold: " << m_estimated_table_size.load() << " Estimated uniq kmers: " << m_estimated_uniq_kmers.load() << " Estimated bloom false positive rate " << false_positive_rate << endl; cerr << "Bloom filter in " << timer.Elapsed(); if(false_positive_rate < 0.15) break; cerr << "\nBloom filter false positive rate is too high - increasing the bloom filter size and recalculating" << endl; } } else { m_estimated_table_size.store(estimated_kmer_num); } timer.Restart(); m_hash_table.Reset(1.5*m_estimated_table_size.load(), m_ncores); while(m_hash_table.TableSize() > 0) { { CStopWatch timer; timer.Restart(); list> jobs; auto start_pos = m_start_position.begin(); for(auto& job_input : reads) { if(job_input[0].ReadNum() > 0 || job_input[1].ReadNum() > 0) { // not empty jobs.push_back(bind(&CKmerHashCounter::CountKmersJob, this, ref(job_input), ref(*start_pos), ref(bloom))); } ++start_pos; } RunThreads(m_ncores, jobs); } if(!m_rehash_status.load()) break; //Rehash { CStopWatch timer; timer.Restart(); size_t new_size = m_hash_table.TableSize()*m_increase_factor; cerr << "Rehash new size: " << new_size << endl; CKmerHashCount hash_table_tmp(m_kmer_len); hash_table_tmp.Reset(new_size, m_ncores); swap(m_hash_table, hash_table_tmp); m_kmer_step = max(1., 0.1*m_hash_table.TableSize()/m_ncores); m_rehash_status.store(false); list> jobs; size_t step = ceil((double)hash_table_tmp.BucketsNum()/m_ncores); for(int thr = 0; thr < m_ncores; ++thr) { size_t from = step*thr; size_t to = min(hash_table_tmp.BucketsNum()-1,from+step-1); if(to >= from) 
jobs.push_back(bind(&CKmerHashCounter::RehashJob, this, ref(hash_table_tmp), from, to)); } RunThreads(m_ncores, jobs); cerr << "Rehashing in " << timer.Elapsed(); } } cerr << "Create hash in " << timer.Elapsed(); timer.Restart(); //Remove false positives RemoveLowCountKmers(m_min_count); cerr << "Clean hash in " << timer.Elapsed(); timer.Restart(); cerr << "Initial kmers: " << m_kmer_num_raw.load() << " Kmers above threshold: " << m_kmer_num.load() << " Total kmers: " << m_kmer_count.load() << " Hash table size: " << m_hash_table.TableSize() << "(" << 0.1*(m_hash_table.TableFootPrint()/100000) << "MB)" << endl; } void RemoveLowCountKmers(int min_count) { m_min_count = min_count; if(m_hash_table.TableSize() > 0) { list> jobs; size_t step = ceil((double)m_hash_table.BucketsNum()/m_ncores); for(int thr = 0; thr < m_ncores; ++thr) { size_t from = step*thr; size_t to = min(m_hash_table.BucketsNum()-1,from+step-1); if(to >= from) jobs.push_back(bind(&CKmerHashCounter::CleanJob, this, from, to)); } RunThreads(m_ncores, jobs); } } // prepares kmer counts to be used in de Bruijn graph void GetBranches() { CStopWatch timer; timer.Restart(); list> jobs; size_t step = ceil((double)m_hash_table.BucketsNum()/m_ncores); for(int thr = 0; thr < m_ncores; ++thr) { size_t from = step*thr; size_t to = min(m_hash_table.BucketsNum()-1,from+step-1); if(to >= from) jobs.push_back(bind(&CKmerHashCounter::GetBranchesJob, this, from, to)); } RunThreads(m_ncores, jobs); cerr << "Kmers branching in " << timer.Elapsed(); } void Info() const { m_hash_table.Info(); } CKmerHashCount& Kmers() { return m_hash_table; } size_t KmerNum() const { return m_kmer_num; } private: void GetBranchesJob(size_t bucket_from, size_t bucket_to) { TKmer max_kmer(string(m_kmer_len, bin2NT[3])); for(size_t bucket = bucket_from; bucket <= bucket_to; ++bucket) { for(auto index = m_hash_table.FirstForBucket(bucket); index != m_hash_table.FirstForBucket(bucket+1); ++index) { uint64_t branches = 0; pair kmer_count = 
index.GetElement(); //direct TKmer shifted_kmer = (kmer_count.first << 2) & max_kmer; //inverse TKmer shifted_rkmer = (revcomp(kmer_count.first, m_kmer_len) << 2) & max_kmer; for(int nt = 0; nt < 4; ++nt) { TKmer k = shifted_kmer + TKmer(m_kmer_len, nt); SKmerCounter* nbrp = m_hash_table.Find(min(k, revcomp(k, m_kmer_len))); // New kmer is a neighbor if it exists in reads and is not same as current kmer if(nbrp != nullptr && nbrp != kmer_count.second) branches |= (1 << nt); k = shifted_rkmer + TKmer(m_kmer_len, nt); nbrp = m_hash_table.Find(min(k, revcomp(k, m_kmer_len))); if(nbrp != nullptr && nbrp != kmer_count.second) branches |= (1 << (nt+4)); } uint64_t count = kmer_count.second->m_data; uint32_t total_count = count; uint32_t plus_count = (count >> 32); uint64_t plusf = uint16_t(double(plus_count)/total_count*numeric_limits::max()+0.5); kmer_count.second->m_data = (plusf << 48)+(branches << 32)+total_count; } } } void CleanJob(size_t bucket_from, size_t bucket_to) { m_kmer_num += m_hash_table.CleanBuckets(m_min_count, bucket_from, bucket_to); } class CBloomInserter : public TKmer { public: CBloomInserter(int kmer_len) : TKmer(kmer_len, 0), m_kmer_len(kmer_len) {} pair InsertInBloom(const CReadHolder::string_iterator& is, TConcurrentBlockedBloomFilter& bloom) { return apply_visitor(insert_in_bloom(is, bloom, m_kmer_len), v); } private: unsigned m_kmer_len; struct insert_in_bloom : public boost::static_visitor> { insert_in_bloom(const CReadHolder::string_iterator& i, TConcurrentBlockedBloomFilter& bl, unsigned kl) : is(i), bloom(bl), kmer_len(kl) {} template pair operator() (large_t& kmer) const { size_t above_threshold_kmers = 0; size_t uniq_kmers = 0; size_t read_len = is.ReadLen(); if(read_len < kmer_len) return make_pair(above_threshold_kmers, uniq_kmers); unsigned kmer_bytes = (2*kmer_len+7)/8; //number of whole bytes in kmer unsigned kmer_size = (2*kmer_len+63)/64; //number of whole 8-byte words in kmer int partial_bits = (2*kmer_len)%64; //number of used 
bits in partial 8 byte word (if any) uint64_t mask = numeric_limits::max(); if(partial_bits > 0) mask = (uint64_t(1) << partial_bits) - 1; size_t buf_size = (2*read_len+63)/64+1; uint64_t* read_buf = new uint64_t[buf_size]; //(enough + 1) 8-byte words for read (one extra because we'll copy kmers using whole bytes which can go beyond the sequence) for(int shift = 0; shift < 4 && read_len-shift >= kmer_len; ++shift) { memset(read_buf, 0, 8*buf_size); is.BSeq(shift, read_buf); for(unsigned k = 0; k <= read_len-shift-kmer_len; k += 4) { // every 4th kmer on the byte boundary memcpy(kmer.getPointer(), (uint8_t*)read_buf+k/4, kmer_bytes); kmer.getPointer()[kmer_size-1] &= mask; large_t rkmer = revcomp(kmer, kmer_len); size_t hashp = rkmer.oahash(); size_t hashm = kmer.oahash(); if(kmer < rkmer) swap(hashp, hashm); switch(bloom.Insert(hashp, hashm)) { case TConcurrentBlockedBloomFilter::eNewKmer : ++uniq_kmers; continue; case TConcurrentBlockedBloomFilter::eAboveThresholdKmer : ++above_threshold_kmers; continue; default : continue; } } } delete[] read_buf; return make_pair(above_threshold_kmers, uniq_kmers); } const CReadHolder::string_iterator& is; TConcurrentBlockedBloomFilter& bloom; unsigned kmer_len; }; }; void InsertInBloomJob(const array& rholder, TConcurrentBlockedBloomFilter& bloom) { size_t above_threshold_kmers = 0; size_t uniq_kmers = 0; CBloomInserter bloom_inserter(m_kmer_len); for(int p = 0; p < 2; ++p) { for(CReadHolder::string_iterator is = rholder[p].sbegin(); is != rholder[p].send(); ++is) { auto rslt = bloom_inserter.InsertInBloom(is, bloom); above_threshold_kmers += rslt.first; uniq_kmers += rslt.second; } } m_estimated_table_size += above_threshold_kmers; m_estimated_uniq_kmers += uniq_kmers; } void RehashJob(CKmerHashCount& other_hash_table, size_t bucket_from, size_t bucket_to) { m_hash_table.RehashOtherBuckets(other_hash_table, bucket_from, bucket_to); } void CountKmersJob(const array& rholder, pair& start_pos, const TConcurrentBlockedBloomFilter& 
bloom) { size_t kmer_num = 0; size_t kmer_count = 0; for(int p = start_pos.first; p < 2; ++p) { CReadHolder::string_iterator from = start_pos.second; if(p != start_pos.first) from = rholder[p].sbegin(); for(CReadHolder::string_iterator is = from; is != rholder[p].send(); ++is) { size_t read_len = is.ReadLen(); if(read_len >= (unsigned)m_kmer_len) kmer_count += read_len-m_kmer_len+1; else continue; kmer_num += m_hash_table.UpdateCounts(is, bloom, m_skip_bloom ? 0 : m_min_count); if(kmer_num >= m_kmer_step) { m_kmer_num_raw += kmer_num; m_kmer_count += kmer_count; kmer_num = 0; kmer_count = 0; if(m_kmer_num_raw.load() > m_hash_table.TableSize()*m_max_load_factor) m_rehash_status.store(true); if(m_rehash_status.load()) { start_pos.first = p; ++is; start_pos.second = is; return; } } } } m_kmer_num_raw += kmer_num; m_kmer_count += kmer_count; start_pos.first = 2; } int m_kmer_len; int m_min_count; bool m_is_stranded; int m_ncores; bool m_skip_bloom; CKmerHashCount m_hash_table; atomic m_estimated_table_size; atomic m_estimated_uniq_kmers; atomic m_kmer_num; atomic m_kmer_num_raw; atomic m_kmer_count; atomic m_rehash_status; size_t m_kmer_step; double m_max_load_factor = 1; int m_increase_factor = 2; list> m_start_position; }; }; // namespace #endif /*_Concurrent_Hash_*/ SKESA-2.3.0/config.hpp000066400000000000000000000025701335720214300143440ustar00rootroot00000000000000/*=========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. 
* * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== * */ // It appears that int128 is not faster than int64[2] and wastes some memory because of additional padding // disabled for now #define INT128_FOUND 0 SKESA-2.3.0/counter.hpp000066400000000000000000000674041335720214300145650ustar00rootroot00000000000000/*=========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. 
* * =========================================================================== * */ #ifndef _KmerCounter_ #define _KmerCounter_ #include "Integer.hpp" #include "common_util.hpp" namespace DeBruijn { class CKmerCount { // Class for kmer counting and searching implemented using a boost::variant of vector,size_t>> // Currently, maximum N defined in config.hpp is 16 that allows kmers of length at most 512 to be stored. // Only smaller (in the bit encoding) of a kmer and its reverse complement is stored // // When vector is sorted, binary search on the first element of the pair that represents a kmer can be used for retrieving // the information stored for the kmer in the second element of the pair. // First 32 bits of the second element stores total count for kmer (self and reverse complement) // Remaining 32 bits store count for kmer for self only during the counting operations but are modified to additionally store // branching information when used inside CDBGraph public: typedef TKmerCountN Type; CKmerCount(int kmer_len = 0) : m_kmer_len(kmer_len) { if(m_kmer_len > 0) m_container = CreateVariant((m_kmer_len+31)/32); } size_t Size() const { return apply_visitor(container_size(), m_container); } // number of elements in the container void Reserve(size_t rsrv) { apply_visitor(reserve(rsrv), m_container); } // reserves memory for rsrv elements void Clear() { apply_visitor(clear(), m_container); } // clears container (doesn't release memory) size_t Capacity() const { return apply_visitor(container_capacity(), m_container); } // tells how many elements could be stored in reserved memory size_t ElementSize() const { return apply_visitor(element_size(), m_container); } // size of one vector element in bytes size_t MemoryFootprint() const { return Capacity()*ElementSize(); } // reserved memory in bytes void PushBack(const TKmer& kmer, size_t count) { // push back one element if(m_kmer_len == 0) throw runtime_error("Can't insert in uninitialized container"); 
apply_visitor(push_back(kmer, count), m_container); } void PushBackElementsFrom(const CKmerCount& other) { // push back elements from other container if(m_kmer_len == 0) throw runtime_error("Can't insert in uninitialized container"); apply_visitor(push_back_elements(), m_container, other.m_container); } size_t Find(const TKmer& kmer) const { return apply_visitor(find_kmer(kmer), m_container); } // finds index for a kmer (returns Size() if not found) void UpdateCount(size_t count, size_t index) { apply_visitor(update_count(count, index), m_container); } // updates count at the index position size_t GetCount(size_t index) const { return apply_visitor(get_count(index), m_container); } // gets count at the index position pair GetKmerCount(size_t index) const { return apply_visitor(get_kmer_count(index), m_container); } // gets kmer and count at the index position const uint64_t* getPointer(size_t index) { return apply_visitor(get_pointer(index), m_container); } // gets access to binary kmer sequence int KmerLen() const { return m_kmer_len; } void Sort() { apply_visitor(container_sort(), m_container); } void SortAndExtractUniq(int min_count, CKmerCount& uniq) { // sorts container, aggregates counts, copies elements with count >= min_count into uniq uniq = CKmerCount(m_kmer_len); // init Sort(); apply_visitor(extract_uniq(min_count), m_container, uniq.m_container); } void SortAndUniq(int min_count) { // sorts container, aggregate counts, keeps elements with count >= min_count Sort(); apply_visitor(uniq(min_count), m_container); } void RemoveLowCountKmers(int min_count) { apply_visitor(remove_low_count(min_count), m_container); } void MergeTwoSorted(const CKmerCount& other) { // merges with other assuming both sorted if(m_kmer_len != other.KmerLen()) throw runtime_error("Can't merge kmers of different lengths"); apply_visitor(merge_sorted(), m_container, other.m_container); } void Swap(CKmerCount& other) { // swaps with other swap(m_kmer_len, other.m_kmer_len); 
apply_visitor(swap_with_other(), m_container, other.m_container); } void Save(ostream& out) const { out.write(reinterpret_cast(&m_kmer_len), sizeof(m_kmer_len)); apply_visitor(save(out), m_container); if(!out) throw runtime_error("Error in counter write"); } void Load(istream& in) { if(!in.read(reinterpret_cast(&m_kmer_len), sizeof(m_kmer_len))) throw runtime_error("Error in counter read"); m_container = CreateVariant((m_kmer_len+31)/32); apply_visitor(load(in), m_container); } private: struct find_kmer : public boost::static_visitor { find_kmer(const TKmer& k) : kmer(k) {} template size_t operator()(const T& v) const { typedef typename T::value_type pair_t; typedef typename pair_t::first_type large_t; auto it = lower_bound(v.begin(), v.end(), kmer.get(), [](const pair_t& element, const large_t& target){ return element.first < target; }); if(it == v.end() || it->first != kmer.get()) return v.size(); else return it-v.begin(); } const TKmer& kmer; }; struct reserve : public boost::static_visitor<> { reserve(size_t r) : rsrv(r) {} template void operator() (T& v) const { v.reserve(rsrv); } size_t rsrv; }; struct container_size : public boost::static_visitor { template size_t operator()(const T& v) const { return v.size();} }; struct container_capacity : public boost::static_visitor { template size_t operator()(const T& v) const { return v.capacity();} }; struct element_size : public boost::static_visitor { template size_t operator()(const T& v) const { return sizeof(typename T::value_type);} }; struct clear : public boost::static_visitor<> { template void operator()(T& v) const { v.clear();} }; struct push_back : public boost::static_visitor<> { push_back(const TKmer& k, size_t c) : kmer(k), count(c) {} template void operator() (T& v) const { typedef typename T::value_type::first_type large_t; v.push_back(make_pair(kmer.get(), count)); } const TKmer& kmer; size_t count; }; struct push_back_elements : public boost::static_visitor<> { template void operator() (T& a, 
const T& b) const { a.insert(a.end(), b.begin(), b.end()); } template void operator() (T& a, const U& b) const { throw runtime_error("Can't copy from different type container"); } }; struct merge_sorted : public boost::static_visitor<> { template void operator() (T& a, const T& b) const { T merged; merged.reserve(a.size()+b.size()); merge(a.begin(), a.end(), b.begin(), b.end(), back_inserter(merged)); merged.swap(a); } template void operator() (T& a, const U& b) const { throw runtime_error("Can't merge different type containers"); } }; struct update_count : public boost::static_visitor<> { update_count(size_t c, size_t i) : count(c), index(i) {} template void operator() (T& v) const { v[index].second = count; } size_t count; size_t index; }; struct get_count : public boost::static_visitor { get_count(size_t i) : index(i) {} template size_t operator() (T& v) const { return v[index].second; } size_t index; }; struct get_kmer_count : public boost::static_visitor> { get_kmer_count(size_t i) : index(i) {} template pair operator() (T& v) const { return make_pair(TKmer(v[index].first), v[index].second); } size_t index; }; struct get_pointer : public boost::static_visitor { get_pointer(size_t i) : index(i) {} template const uint64_t* operator() (T& v) const { return v[index].first.getPointer(); } size_t index; }; struct container_sort : public boost::static_visitor<> { template void operator() (T& v) const { sort(v.begin(), v.end()); }}; struct swap_with_other : public boost::static_visitor<> { template void operator() (T& a, T& b) const { a.swap(b); } template void operator() (T& a, U& b) const { throw runtime_error("Can't swap different type containers"); } }; struct remove_low_count : public boost::static_visitor<> { remove_low_count(int mc) : min_count(mc) {} template void operator() (T& v) const { v.erase(remove_if(v.begin(), v.end(), [this](const typename T::value_type& pair) { return (uint32_t)pair.second < this->min_count; }), v.end()); } unsigned min_count; }; 
struct uniq : public boost::static_visitor<> { uniq(int mc) : min_count(mc) {} template void operator() (T& v) const { typedef typename T::iterator iter_t; iter_t nextp = v.begin(); for(iter_t ip = v.begin(); ip != v.end(); ) { iter_t workp = ip; while(++ip != v.end() && workp->first == ip->first) workp->second += ip->second; // accumulate all 8 bytes; we assume that count will not spill into higher half if((uint32_t)workp->second >= min_count) *nextp++ = *workp; } v.erase(nextp, v.end()); } unsigned min_count; }; struct extract_uniq : public boost::static_visitor<> { extract_uniq(int mc) : min_count(mc) {} template void operator() (T& a, T& b) const { if(a.empty()) return; size_t num = 1; uint32_t count = a[0].second; // count only 4 bytes!!!!!! for(size_t i = 1; i < a.size(); ++i) { if(a[i-1].first < a[i].first) { if(count >= min_count) ++num; count = a[i].second; } else { count += a[i].second; } } if(count < min_count) --num; b.reserve(num+1); b.push_back(a[0]); for(size_t i = 1; i < a.size(); ++i) { if(b.back().first < a[i].first) { if((uint32_t)b.back().second < min_count) b.pop_back(); b.push_back(a[i]); } else { b.back().second += a[i].second; // accumulate all 8 bytes; we assume that count will not spill into higher half } } if((uint32_t)b.back().second < min_count) b.pop_back(); } template void operator() (T& a, U& b) const { throw runtime_error("Can't extract into different type container"); } unsigned min_count; }; struct save : public boost::static_visitor<> { save(ostream& out) : os(out) {} template void operator() (T& v) const { size_t num = v.size(); os.write(reinterpret_cast(&num), sizeof num); if(num > 0) os.write(reinterpret_cast(&v[0]), num*sizeof(v[0])); } ostream& os; }; struct load : public boost::static_visitor<> { load(istream& in) : is(in) {} template void operator() (T& v) const { size_t num; if(!is.read(reinterpret_cast(&num), sizeof num)) throw runtime_error("Error in counter read"); if(num > 0) { v.resize(num); 
if(!is.read(reinterpret_cast(&v[0]), num*sizeof(v[0]))) throw runtime_error("Error in counter read"); } } istream& is; }; Type m_container; int m_kmer_len; }; typedef CKmerCount TKmerCount; // for compatibility with previous code // CKmerCounter counts kmers in reads using multiple threads and stores them in TKmerCount // It also finds neighbors (in GetBranches) if a user wants to use this class to build a CDBGraph (de Bruijn graph) // As Kmer counting could be memory expensive, CKmerCounter accepts an upper limit for the memory available and will // subdivide the task, if needed. // If the number of subtasks exceeds 10, it will throw an exception asking for more memory. class CKmerCounter { public: // reads - raw reads (ncores or more elements in the list) // kmer_len - size of kmer // min_count - minimal count for accepted kmers // is_stranded - flag indicating whether kmers are from input reads where strand is informative or from connected paired // reads generated internally by the program where strand is not a meaningful observation // mem_available - allowed memory in bytes // ncores - number of cores CKmerCounter(const list>& reads, int kmer_len, int min_count, bool is_stranded, int64_t mem_available, int ncores) : m_kmer_len(kmer_len), m_min_count(min_count), m_is_stranded(is_stranded), m_mem_available(mem_available), m_ncores(ncores), m_reads(reads) { cerr << endl << "Kmer len: " << m_kmer_len << endl; CStopWatch timer; timer.Restart(); int64_t raw_kmer_num = 0; for(const auto& reads : m_reads) raw_kmer_num += reads[0].KmerNum(m_kmer_len)+reads[1].KmerNum(m_kmer_len); int64_t GB = 1000000000; int kmer_size = TKmerCount(m_kmer_len).ElementSize(); int64_t mem_needed = 1.2*raw_kmer_num*kmer_size; int max_cycles = 10; // maximum cycles allowed int64_t mbuf = 2*GB; // memory buffer for allocation uncertainity if(mem_needed >= max_cycles*(mem_available-mbuf)) { throw runtime_error("Memory provided is insufficient to do runs in 10 cycles for the read coverage. 
We find that 16 Gb for 20x coverage of a 5 Mb genome is usually sufficient"); } int cycles = ceil(double(mem_needed)/(mem_available-mbuf)); cerr << "Raw kmers: " << raw_kmer_num << " Memory needed (GB): " << double(mem_needed)/GB << " Memory available (GB): " << double(mem_available-mbuf)/GB << " " << cycles << " cycle(s) will be performed" << endl; int njobs = 8*m_reads.size(); // many buckets reduce short-lived memory overhead spike in SortAndMergeJob int kmer_buckets = cycles*njobs; for(int cycl = 0; cycl < cycles; ++cycl) { pair bucket_range(cycl*njobs, (cycl+1)*njobs-1); list> raw_kmers; list> jobs; for(auto& job_input : m_reads) { if(job_input[0].ReadNum() > 0 || job_input[1].ReadNum() > 0) { // not empty raw_kmers.push_back(vector()); jobs.push_back(bind(&CKmerCounter::SpawnKmersJob, this, ref(job_input), kmer_buckets, bucket_range, ref(raw_kmers.back()))); } } RunThreads(ncores, jobs); // size_t total = 0; // for(auto& v : raw_kmers) { // for(auto& tc : v) // total += tc.MemoryFootprint(); // } SortAndMergeKmers(raw_kmers); } size_t utotal = 0; for(auto& c : m_uniq_kmers) utotal += c.Size(); cerr << "Distinct kmers: " << utotal << endl; cerr << "Kmer count in " << timer.Elapsed(); MergeSortedKmers(); if(m_uniq_kmers.empty()) m_uniq_kmers.push_back(TKmerCount(m_kmer_len)); } virtual ~CKmerCounter() {} // reference to counted kmers TKmerCount& Kmers() { return m_uniq_kmers.front(); } const TKmerCount& Kmers() const { return m_uniq_kmers.front(); } // average count of kmers in the histogram with the main peak double AverageCount() const { map bins; for(size_t index = 0; index < Kmers().Size(); ++index) { ++bins[Kmers().GetCount(index)]; // count clipped to integer automatically } TBins hist(bins.begin(), bins.end()); return GetAverageCount(hist); } // prepares kmer counts to be used in CDBGraph (de Bruijn graph) // runs multiple instances of GetBranchesJob void GetBranches() { CStopWatch timer; timer.Restart(); if(Kmers().Size() > 0) { vector 
branches(Kmers().Size()); size_t bucket_size = Kmers().Size()/m_ncores+1; list> jobs; for(int i = 0; i < m_ncores; ++i) { pair range(bucket_size*i,min(bucket_size*(i+1)-1,Kmers().Size()-1)); if(range.second >= range.first) jobs.push_back(bind(&CKmerCounter::GetBranchesJob, this, range, ref(branches))); } RunThreads(m_ncores, jobs); for(size_t index = 0; index < Kmers().Size(); ++index) { size_t b = branches[index]; size_t count = Kmers().GetCount(index); uint32_t total_count = count; uint32_t plus_count = (count >> 32); size_t plusf = uint16_t(double(plus_count)/total_count*numeric_limits::max()+0.5); Kmers().UpdateCount((plusf << 48)+(b << 32)+total_count, index); // we put strand info and branching in the high half of the count!!!!! } } cerr << "Kmers branching in " << timer.Elapsed(); } bool IsStranded() const { return m_is_stranded; } // indicates if contains stranded information private: // one-thread worker producing kmers and putting them in multiple non-overlapping buckets // rholder - input reads // buckets - total number of buckets // bucket_range - range of buckets used by this worker // kmers - output kmers void SpawnKmersJob(const array& rholder, int buckets, pair bucket_range, vector& kmers) { size_t total = rholder[0].KmerNum(m_kmer_len)+rholder[1].KmerNum(m_kmer_len); size_t reserve = 1.1*total/buckets; int active_buckets = bucket_range.second-bucket_range.first+1; kmers.resize(active_buckets, TKmerCount(m_kmer_len)); for(auto& k : kmers) k.Reserve(reserve); for(int p = 0; p < 2; ++p) { for(CReadHolder::kmer_iterator itk = rholder[p].kbegin(m_kmer_len); itk != rholder[p].kend(); ++itk) { TKmer kmer = *itk; TKmer rkmer = revcomp(kmer, m_kmer_len); size_t count = 1; TKmer* min_kmerp = &rkmer; if(kmer < rkmer) { min_kmerp = &kmer; count += (size_t(1) << 32); } int bucket = min_kmerp->oahash()%buckets; if(bucket < bucket_range.first || bucket > bucket_range.second) continue; // good to go int ind = bucket - bucket_range.first; if(kmers[ind].Size() == 
kmers[ind].Capacity()) { //expensive plan B for the case of failed hash uniformity // cerr << "Warning: Hash uniformity problem" << endl; TKmerCount bigger(m_kmer_len); bigger.Reserve(kmers[ind].Size()*1.2); bigger.PushBackElementsFrom(kmers[ind]); bigger.Swap(kmers[ind]); } kmers[ind].PushBack(*min_kmerp, count); } } } //SortAndMergeJob briefly doubles the input memory - should be executed in small chunks!!!!!! // one-thread worker which accepts all containers for a given bucket and merges, sorts and counts them // group - list of containers // ukmers - counted kmers typedef list TContainerPList; void SortAndMergeJob(TContainerPList group, TKmerCount& ukmers) { TKmerCount all_kmers(group.front()->KmerLen()); if(group.size() == 1) { all_kmers.Swap(*group.front()); } else { size_t total = 0; for(auto p : group) total += p->Size(); all_kmers.Reserve(total); // doubles the input memory!!!! for(auto p : group) { all_kmers.PushBackElementsFrom(*p); TKmerCount(p->KmerLen()).Swap(*p); } } all_kmers.SortAndExtractUniq(m_min_count, ukmers); } // runs multiple instances of SortAndMergeJob and stores results in m_uniq_kmers // raw_kmers - input kmers void SortAndMergeKmers(list>& raw_kmers) { list> jobs; int bucken_num = raw_kmers.front().size(); for(int bucket = 0; bucket < bucken_num; ++bucket) { TContainerPList job_input; for(auto& vec : raw_kmers) job_input.push_back(&vec[bucket]); m_uniq_kmers.push_back(TKmerCount()); jobs.push_back(bind(&CKmerCounter::SortAndMergeJob, this, job_input, ref(m_uniq_kmers.back()))); } RunThreads(m_ncores, jobs); } // one-thread worker which merges two sorted buckets static void MergeSortedJob(TKmerCount& akmers, TKmerCount& bkmers) { akmers.MergeTwoSorted(bkmers); TKmerCount(bkmers.KmerLen()).Swap(bkmers); // release bkmers memory } // runs multiple instances of MergeSortedJob // at the end m_uniq_kmers has only one element with final kmers void MergeSortedKmers() { CStopWatch timer; timer.Restart(); while(m_uniq_kmers.size() > 1) { list> 
jobs; for(list::iterator first = m_uniq_kmers.begin(); first != m_uniq_kmers.end(); ++first) { list::iterator second = first; if(++second != m_uniq_kmers.end()) { jobs.push_back(bind(MergeSortedJob, ref(*first), ref(*second))); first = second; } } RunThreads(m_ncores, jobs); for(auto iloop = m_uniq_kmers.begin(); iloop != m_uniq_kmers.end(); ) { auto it = iloop++; if(it->Size() == 0) m_uniq_kmers.erase(it); } } cerr << "Uniq kmers merging in " << timer.Elapsed(); } // one-thread worker which calculates the branching information (neighbors) for a range of kmers // range - from,to indexes for kmers // branches - vector of branching information (one bit is used for each of the eight possible neighbors) void GetBranchesJob(pair range, vector& branches) { TKmer max_kmer(string(m_kmer_len, bin2NT[3])); for(size_t index = range.first; index <= range.second; ++index) { pair kmer_count = Kmers().GetKmerCount(index); //direct TKmer shifted_kmer = (kmer_count.first << 2) & max_kmer; //inverse TKmer shifted_rkmer = (revcomp(kmer_count.first, m_kmer_len) << 2) & max_kmer; for(int nt = 0; nt < 4; ++nt) { TKmer k = shifted_kmer + TKmer(m_kmer_len, nt); size_t new_index = Kmers().Find(min(k, revcomp(k, m_kmer_len))); // New kmer is a neighbor if it exists in reads and is not same as current kmer if(new_index != Kmers().Size() && new_index != index) branches[index] |= (1 << nt); k = shifted_rkmer + TKmer(m_kmer_len, nt); new_index = Kmers().Find(min(k, revcomp(k, m_kmer_len))); if(new_index != Kmers().Size() && new_index != index) branches[index] |= (1 << (nt+4)); } } } int m_kmer_len; int m_min_count; bool m_is_stranded; size_t m_mem_available; int m_ncores; const list>& m_reads; list m_uniq_kmers; // storage for kmer buckets; at the end will have one element which is the result }; }; // namespace #endif /* _KmerCounter_ */ 
SKESA-2.3.0/glb_align.cpp000066400000000000000000000636471335720214300150240ustar00rootroot00000000000000/*=========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. 
* * =========================================================================== * */ #include "glb_align.hpp" #include #include #include using namespace std; namespace DeBruijn { void CCigar::PushFront(const SElement& el) { if(el.m_type == 'M') { m_qfrom -= el.m_len; m_sfrom -= el.m_len; } else if(el.m_type == 'D') m_sfrom -= el.m_len; else m_qfrom -= el.m_len; if(m_elements.empty() || m_elements.front().m_type != el.m_type) m_elements.push_front(el); else m_elements.front().m_len += el.m_len; } void CCigar::PushFront(const CCigar other_cigar) { for(auto it = other_cigar.m_elements.rbegin(); it != other_cigar.m_elements.rend(); ++it) PushFront(*it); } void CCigar::PushBack(const SElement& el) { if(el.m_type == 'M') { m_qto += el.m_len; m_sto += el.m_len; } else if(el.m_type == 'D') m_sto += el.m_len; else m_qto += el.m_len; if(m_elements.empty() || m_elements.back().m_type != el.m_type) m_elements.push_back(el); else m_elements.back().m_len += el.m_len; } string CCigar::CigarString(int qstart, int qlen) const { string cigar; for(auto& element : m_elements) cigar += to_string(element.m_len)+element.m_type; int missingstart = qstart+m_qfrom; if(missingstart > 0) cigar = to_string(missingstart)+"S"+cigar; int missingend = qlen-1-m_qto-qstart; if(missingend > 0) cigar += to_string(missingend)+"S"; return cigar; } string CCigar::DetailedCigarString(int qstart, int qlen, const char* query, const char* subject) const { string cigar; query += m_qfrom; subject += m_sfrom; for(auto& element : m_elements) { if(element.m_type == 'M') { bool is_match = *query == *subject; int len = 0; for(int l = 0; l < element.m_len; ++l) { if((*query == *subject) == is_match) { ++len; } else { cigar += to_string(len)+ (is_match ? "=" : "X"); is_match = !is_match; len = 1; } ++query; ++subject; } cigar += to_string(len)+ (is_match ? 
"=" : "X"); } else if(element.m_type == 'D') { cigar += to_string(element.m_len)+element.m_type; subject += element.m_len; } else { cigar += to_string(element.m_len)+element.m_type; query += element.m_len; } } int missingstart = qstart+m_qfrom; if(missingstart > 0) cigar = to_string(missingstart)+"S"+cigar; int missingend = qlen-1-m_qto-qstart; if(missingend > 0) cigar += to_string(missingend)+"S"; return cigar; } TCharAlign CCigar::ToAlign(const char* query, const char* subject) const { TCharAlign align; query += m_qfrom; subject += m_sfrom; for(auto& element : m_elements) { if(element.m_type == 'M') { align.first.insert(align.first.end(), query, query+element.m_len); query += element.m_len; align.second.insert(align.second.end(), subject, subject+element.m_len); subject += element.m_len; } else if(element.m_type == 'D') { align.first.insert(align.first.end(), element.m_len, '-'); align.second.insert(align.second.end(), subject, subject+element.m_len); subject += element.m_len; } else { align.first.insert(align.first.end(), query, query+element.m_len); query += element.m_len; align.second.insert(align.second.end(), element.m_len, '-'); } } return align; } int CCigar::Matches(const char* query, const char* subject) const { int matches = 0; query += m_qfrom; subject += m_sfrom; for(auto& element : m_elements) { if(element.m_type == 'M') { for(int l = 0; l < element.m_len; ++l) { if(*query == *subject) ++matches; ++query; ++subject; } } else if(element.m_type == 'D') { subject += element.m_len; } else { query += element.m_len; } } return matches; } int CCigar::Distance(const char* query, const char* subject) const { int dist = 0; query += m_qfrom; subject += m_sfrom; for(auto& element : m_elements) { if(element.m_type == 'M') { for(int l = 0; l < element.m_len; ++l) { if(*query != *subject) ++dist; ++query; ++subject; } } else if(element.m_type == 'D') { subject += element.m_len; dist += element.m_len; } else { query += element.m_len; dist += element.m_len; } } 
return dist; } int CCigar::Score(const char* query, const char* subject, int gopen, int gapextend, const char delta[256][256]) const { int score = 0; query += m_qfrom; subject += m_sfrom; for(auto& element : m_elements) { if(element.m_type == 'M') { for(int l = 0; l < element.m_len; ++l) { score += delta[(int)*query][(int)*subject]; ++query; ++subject; } } else if(element.m_type == 'D') { subject += element.m_len; score -= gopen+gapextend*element.m_len; } else { query += element.m_len; score -= gopen+gapextend*element.m_len; } } return score; } enum{Agap = 1, Bgap = 2, Astart = 4, Bstart = 8, Zero = 16}; CCigar BackTrack(int ia, int ib, char* m, int nb) { CCigar track(ia, ib); while((ia >= 0 || ib >= 0) && !(*m&Zero)) { if(*m&Agap) { int len = 1; while(!(*m&Astart)) { ++len; --m; } --m; ib -= len; track.PushFront(CCigar::SElement(len,'D')); } else if(*m&Bgap) { int len = 1; while(!(*m&Bstart)) { ++len; m -= nb+1; } m -= nb+1; ia -= len; track.PushFront(CCigar::SElement(len,'I')); } else { track.PushFront(CCigar::SElement(1,'M')); --ia; --ib; m -= nb+2; } } return track; } CCigar BackTrackBand(int ia, int ib, char* m, int band) { CCigar track(ia, ib); while((ia >= 0 || ib >= 0) && !(*m&Zero)) { if(*m&Agap) { int len = 1; while(!(*m&Astart)) { ++len; --m; } --m; ib -= len; track.PushFront(CCigar::SElement(len,'D')); } else if(*m&Bgap) { int len = 1; while(!(*m&Bstart)) { ++len; m -= band+1; } m -= band+1; ia -= len; track.PushFront(CCigar::SElement(len,'I')); } else { track.PushFront(CCigar::SElement(1,'M')); --ia; --ib; m -= band+2; } } return track; } class CScore { public: // we keep score and tiebreaker in int64 integer //!!!!!!!! tiebreaker must be >= 0 or it will spill into the score part !!!!!!!!!! 
CScore() : m_score(0) {} CScore(int32_t score, int32_t breaker) : m_score((int64_t(score) << 32) + breaker) {} bool operator>(const CScore& other) const { return m_score > other.m_score; } CScore operator+(const CScore& other) const { return CScore(m_score+other.m_score); } CScore& operator+=(const CScore& other) { m_score += other.m_score; return *this; } int32_t Score() const { return (m_score >> 32); } private: CScore(int64_t score) : m_score(score) {} int64_t m_score; }; struct SRawMemory { SRawMemory(size_t na, size_t nb) { s = new CScore[nb+1]; sm = new CScore[nb+1]; gapb = new CScore[nb+1]; mtrx = new char[(na+1)*(nb+1)]; } SRawMemory(size_t na, size_t nb, size_t band) { s = new CScore[nb+1]; sm = new CScore[nb+1]; gapb = new CScore[nb+1]; mtrx = new char[(na+1)*(band+2)]; // one extra element on each side } ~SRawMemory() { delete[] s; delete[] sm; delete[] gapb; delete[] mtrx; } CScore* s; // best scores in current a-raw CScore* sm; // best scores in previous a-raw CScore* gapb; // best score with b-gap char* mtrx; // backtracking info (Astart/Bstart gap start, Agap/Bgap best score has gap and should be backtracked to Asrt/Bsart; Zero stop bactracking) }; CCigar GlbAlign(const char* a, int na, const char* b, int nb, int rho, int sigma, const char delta[256][256]) { // rho - new gap penalty (one base gap rho+sigma) // sigma - extension penalty SRawMemory memory(na, nb); CScore* s = memory.s; // best scores in current a-raw CScore* sm = memory.sm; // best scores in previous a-raw CScore* gapb = memory.gapb; // best score with b-gap char* mtrx = memory.mtrx; // backtracking info (Astart/Bstart gap start, Agap/Bgap best score has gap and should be backtracked to Asrt/Bsart; Zero stop bactracking) CScore rsa(-rho-sigma, 0); // new gapa CScore rsb(-rho-sigma, 1); // new gapb CScore bignegative(numeric_limits::min()/2, 0); sm[0] = CScore(); sm[1] = rsa; // scores for -------------- (the best scores for i == -1) for(int i = 2; i <= nb; ++i) // BBBBBBBBBBBBBB sm[i] 
= sm[i-1]+CScore(-sigma, 0); s[0] = rsb; // score for A (the best score for j == -1 and i == 0) // - for(int i = 0; i <= nb; ++i) gapb[i] = bignegative; mtrx[0] = 0; for(int i = 1; i <= nb; ++i) { // --------------- mtrx[i] = Agap; // BBBBBBBBBBBBBBB } mtrx[1] |= Astart; char* m = mtrx+nb; for(int i = 0; i < na; ++i) { *(++m) = Bstart|Bgap; //AAAAAAAAAAAAAAA //--------------- CScore gapa = bignegative; int ai = a[i]; const char* matrix = delta[ai]; CScore* sp = s; for(int j = 0; j < nb; ) { *(++m) = 0; CScore ss = sm[j]+CScore(matrix[(int)b[j]], 1); // diagonal extension gapa += CScore(-sigma, 0); // gapa extension if(*sp+rsa > gapa) { // for j == 0 this will open AAAAAAAAAAA- which could be used if mismatch is very expensive gapa = *sp+rsa; // -----------B *m |= Astart; } CScore& gapbj = gapb[++j]; gapbj += CScore(-sigma, 1); // gapb extension if(sm[j]+rsb > gapbj) { // for i == 0 this will open BBBBBBBBBBB- which could be used if mismatch is very expensive gapbj = sm[j]+rsb; // -----------A *m |= Bstart; } if(gapa > gapbj) { if(ss > gapa) { *(++sp) = ss; } else { *(++sp) = gapa; *m |= Agap; } } else { if(ss > gapbj) { *(++sp) = ss; } else { *(++sp) = gapbj; *m |= Bgap; } } } swap(sm,s); *s = *sm+CScore(-sigma, 1); } int ia = na-1; int ib = nb-1; m = mtrx+(na+1)*(nb+1)-1; return BackTrack(ia, ib, m, nb); } CCigar LclAlign(const char* a, int na, const char* b, int nb, int rho, int sigma, const char delta[256][256]) { // rho - new gap penalty (one base gap rho+sigma) // sigma - extension penalty SRawMemory memory(na, nb); CScore* s = memory.s; // best scores in current a-raw CScore* sm = memory.sm; // best scores in previous a-raw CScore* gapb = memory.gapb; // best score with b-gap char* mtrx = memory.mtrx; // backtracking info (Astart/Bstart gap start, Agap/Bgap best score has gap and should be backtracked to Asrt/Bsart; Zero stop bactracking) CScore rsa(-rho-sigma, 0); // new gapa CScore rsb(-rho-sigma, 1); // new gapb for(int i = 0; i <= nb; ++i) { sm[i] = 
CScore(); mtrx[i] = Zero; gapb[i] = CScore(); } s[0] = CScore(); CScore max_score; char* max_ptr = mtrx; char* m = mtrx+nb; for(int i = 0; i < na; ++i) { *(++m) = Zero; CScore gapa; int ai = a[i]; const char* matrix = delta[ai]; CScore* sp = s; for(int j = 0; j < nb; ) { *(++m) = 0; CScore ss = sm[j]+CScore(matrix[(int)b[j]], 1); // diagonal extension gapa += CScore(-sigma, 0); // gapa extension if(*sp+rsa > gapa) { gapa = *sp+rsa; // new gapa *m |= Astart; } CScore& gapbj = gapb[++j]; gapbj += CScore(-sigma, 1); // gapb extension if(sm[j]+rsb > gapbj) { gapbj = sm[j]+rsb; // new gapb *m |= Bstart; } if(gapa > gapbj) { if(ss > gapa) { *(++sp) = ss; if(ss > max_score) { max_score = ss; max_ptr = m; } } else { *(++sp) = gapa; *m |= Agap; } } else { if(ss > gapbj) { *(++sp) = ss; if(ss > max_score) { max_score = ss; max_ptr = m; } } else { *(++sp) = gapbj; *m |= Bgap; } } if(sp->Score() <= 0) { *sp = CScore(); *m |= Zero; } } swap(sm,s); } int ia = (max_ptr-mtrx)/(nb+1)-1; int ib = (max_ptr-mtrx)%(nb+1)-1; m = max_ptr; return BackTrack(ia, ib, m, nb); } CCigar LclAlign(const char* a, int na, const char* b, int nb, int rho, int sigma, bool pinleft, bool pinright, const char delta[256][256]) { // rho - new gap penalty (one base gap rho+sigma) // sigma - extension penalty SRawMemory memory(na, nb); CScore* s = memory.s; // best scores in current a-raw CScore* sm = memory.sm; // best scores in previous a-raw CScore* gapb = memory.gapb; // best score with b-gap char* mtrx = memory.mtrx; // backtracking info (Astart/Bstart gap start, Agap/Bgap best score has gap and should be backtracked to Asrt/Bsart; Zero stop bactracking) CScore rsa(-rho-sigma, 0); // new gapa CScore rsb(-rho-sigma, 1); // new gapb CScore bignegative(numeric_limits::min()/2, 0); sm[0] = CScore(); mtrx[0] = 0; gapb[0] = bignegative; // not used if(pinleft) { if(nb > 0) { sm[1] = rsa; mtrx[1] = Astart|Agap; gapb[1] = bignegative; for(int i = 2; i <= nb; ++i) { sm[i] = sm[i-1]+CScore(-sigma, 0); mtrx[i] = 
Agap; gapb[i] = bignegative; } } s[0] = rsb; } else { for(int i = 1; i <= nb; ++i) { sm[i] = CScore(); mtrx[i] = Zero; gapb[i] = bignegative; } s[0] = CScore(); } CScore max_score; char* max_ptr = mtrx; char* m = mtrx+nb; for(int i = 0; i < na; ++i) { *(++m) = pinleft ? Bstart|Bgap : Zero; CScore gapa = bignegative; int ai = a[i]; const char* matrix = delta[ai]; CScore* sp = s; for(int j = 0; j < nb; ) { *(++m) = 0; CScore ss = sm[j]+CScore(matrix[(int)b[j]], 1); // diagonal extension gapa += CScore(-sigma, 0); // gapa extension if(*sp+rsa > gapa) { gapa = *sp+rsa; // new gapa *m |= Astart; } CScore& gapbj = gapb[++j]; gapbj += CScore(-sigma, 1); // gapb extension if(sm[j]+rsb > gapbj) { gapbj = sm[j]+rsb; // new gapb *m |= Bstart; } if(gapa > gapbj) { if(ss > gapa) { *(++sp) = ss; if(ss > max_score) { max_score = ss; max_ptr = m; } } else { *(++sp) = gapa; *m |= Agap; } } else { if(ss > gapbj) { *(++sp) = ss; if(ss > max_score) { max_score = ss; max_ptr = m; } } else { *(++sp) = gapbj; *m |= Bgap; } } if(sp->Score() <= 0 && !pinleft) { *sp = CScore(); *m |= Zero; } } swap(sm,s); if(pinleft) *s = *sm+CScore(-sigma, 1); } int maxa, maxb; if(pinright) { maxa = na-1; maxb = nb-1; max_score = sm[nb]; } else { maxa = (max_ptr-mtrx)/(nb+1)-1; maxb = (max_ptr-mtrx)%(nb+1)-1; m = max_ptr; } int ia = maxa; int ib = maxb; return BackTrack(ia, ib, m, nb); } CCigar VariBandAlign(const char* a, int na, const char* b, int nb, int rho, int sigma, const char delta[256][256], const TRange* blimits) { // rho - new gap penalty (one base gap rho+sigma) // sigma - extension penalty SRawMemory memory(na, nb); CScore* s = memory.s; // best scores in current a-raw CScore* sm = memory.sm; // best scores in previous a-raw CScore* gapb = memory.gapb; // best score with b-gap char* mtrx = memory.mtrx; // backtracking info (Astart/Bstart gap start, Agap/Bgap best score has gap and should be backtracked to Asrt/Bsart; Zero stop bactracking) CScore rsa(-rho-sigma, 0); // new gapa CScore 
rsb(-rho-sigma, 1); // new gapb for(int i = 0; i <= nb; ++i) { s[i] = CScore(); sm[i] = CScore(); gapb[i] = CScore(); mtrx[i] = Zero; } CScore max_score; char* max_ptr = mtrx; char* m = mtrx+nb; const TRange* last = blimits+na; while(true) { int ai = *a++; const char* matrix = delta[ai]; int bleft = blimits->first; int bright = blimits->second; m += bleft; *(++m) = Zero; CScore gapa; CScore* sp = s+bleft; *sp = CScore(); for(int j = bleft; j <= bright; ) { *(++m) = 0; CScore ss = sm[j]+CScore(matrix[(int)b[j]], 1); // diagonal extension gapa += CScore(-sigma, 0); // gapa extension if(*sp+rsa > gapa) { gapa = *sp+rsa; *m |= Astart; } CScore& gapbj = gapb[++j]; gapbj += CScore(-sigma, 1); // gapb extension if(sm[j]+rsb > gapbj) { gapbj = sm[j]+rsb; *m |= Bstart; } if(gapa > gapbj) { if(ss > gapa) { *(++sp) = ss; if(ss > max_score) { max_score = ss; max_ptr = m; } } else { *(++sp) = gapa; *m |= Agap; } } else { if(ss > gapbj) { *(++sp) = ss; if(ss > max_score) { max_score = ss; max_ptr = m; } } else { *(++sp) = gapbj; *m |= Bgap; } } if(sp->Score() <= 0) { *sp = CScore(); *m |= Zero; } } if(++blimits == last) break; swap(sm,s); m -= bright+1; // beginning of the current raw //clean up (s - self sustained) int nextr = blimits->second; //right increased for(int l = bright+1; l <= nextr; ++l) m[l+1] = Zero; //right decreased for(int l = nextr+1; l <= bright; ++l) { gapb[l+1] = CScore(); sm[l+1] = CScore(); } int nextl = blimits->first; //left decreased for(int l = nextl-1; l <= bleft-1; ++l) { gapb[l+1] = CScore(); sm[l+1] = CScore(); m[l+1] = Zero; } m += nb; // end of the current raw } int ia = (max_ptr-mtrx)/(nb+1)-1; int ib = (max_ptr-mtrx)%(nb+1)-1; m = max_ptr; return BackTrack(ia, ib, m, nb); } CCigar BandAlign(const char* a, int na, const char* b, int nb, int rho, int sigma, const char delta[256][256], int band) { // rho - new gap penalty (one base gap rho+sigma) // sigma - extension penalty band = 2*(band/2)+1; // odd SRawMemory memory(na, nb, band); CScore* s = 
memory.s; // best scores in current a-raw CScore* sm = memory.sm; // best scores in previous a-raw CScore* gapb = memory.gapb; // best score with b-gap char* mtrx = memory.mtrx; // backtracking info (Astart/Bstart gap start, Agap/Bgap best score has gap and should be backtracked to Asrt/Bsart; Zero stop bactracking) CScore rsa(-rho-sigma, 0); // new gapa CScore rsb(-rho-sigma, 1); // new gapb for(int i = 0; i <= nb; ++i) { s[i] = CScore(); sm[i] = CScore(); gapb[i] = CScore(); } for(int i = 0; i < band+2; ++i) mtrx[i] = Zero; CScore max_score; char* max_ptr = mtrx; for(int i = 0; i < min(na, nb+band/2); ++i) { int ai = a[i]; const char* matrix = delta[ai]; int bleft = max(0, i-band/2); int bright = i+band/2; char* m = mtrx+size_t(i+1)*(band+2)+band-(bright-bleft+1); *m = Zero; bright = min(bright, nb-1); CScore gapa; CScore* sp = s+bleft; *sp = CScore(); for(int j = bleft; j <= bright; ) { *(++m) = 0; CScore ss = sm[j]+CScore(matrix[(int)b[j]], 1); // diagonal extension gapa += CScore(-sigma, 0); // gapa extension if(*sp+rsa > gapa) { gapa = *sp+rsa; *m |= Astart; } CScore& gapbj = gapb[++j]; gapbj += CScore(-sigma, 1); // gapb extension if(sm[j]+rsb > gapbj) { gapbj = sm[j]+rsb; *m |= Bstart; } if(gapa > gapbj) { if(ss > gapa) { *(++sp) = ss; if(ss > max_score) { max_score = ss; max_ptr = m; } } else { *(++sp) = gapa; *m |= Agap; } } else { if(ss > gapbj) { *(++sp) = ss; if(ss > max_score) { max_score = ss; max_ptr = m; } } else { *(++sp) = gapbj; *m |= Bgap; } } if(sp->Score() <= 0) { *sp = CScore(); *m |= Zero; } } *(++m) = Zero; swap(sm,s); } int ia = (max_ptr-mtrx)/(band+2)-1; int ib = (max_ptr-mtrx)%(band+2)-1+ia-band/2; return BackTrackBand(ia, ib, max_ptr, band); } SMatrix::SMatrix(int match, int mismatch) { // matrix for DNA for(int i = 0; i < 256; ++i) { int c = toupper(i); for(int j = 0; j < 256; ++j) { if(c != 'N' && c == toupper(j)) matrix[i][j] = match; else matrix[i][j] = -mismatch; } } } SMatrix::SMatrix() { // matrix for proteins string 
aa("ARNDCQEGHILKMFPSTWYVBZX*"); int scores[] = { 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-2,-1, 0,-4, -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-1, 0,-1,-4, -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3, 3, 0,-1,-4, -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3, 4, 1,-1,-4, 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-3,-3,-2,-4, -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2, 0, 3,-1,-4, -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2, 1, 4,-1,-4, 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-1,-2,-1,-4, -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3, 0, 0,-1,-4, -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-3,-3,-1,-4, -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-3,-1,-4, -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2, 0, 1,-1,-4, -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-3,-1,-1,-4, -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-3,-3,-1,-4, -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-2,-1,-2,-4, 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2, 0, 0, 0,-4, 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-1,-1, 0,-4, -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-3,-2,-4, -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-3,-2,-1,-4, 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-3,-2,-1,-4, -2,-1, 3, 4,-3, 0, 1,-1, 0,-3,-4, 0,-3,-3,-2, 0,-1,-4,-3,-3, 4, 1,-1,-4, -1, 0, 0, 1,-3, 3, 4,-2, 0,-3,-3, 1,-1,-3,-1, 0,-1,-3,-2,-2, 1, 4,-1,-4, 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-1,-1,-1,-4, -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1 }; for(int i = 0; i < 256; ++i) { for(int j = 0; j < 256; ++j) { matrix[i][j] = 0; } } int num = aa.size(); for(int i = 0; i < num; ++i) { char c = aa[i]; for(int j = 0; j < num; ++j) { int score = 
scores[num*j+i]; char d = aa[j]; matrix[(int)c][(int)d] = score; matrix[(int)tolower(c)][(int)tolower(d)] = score; matrix[(int)c][(int)tolower(d)] = score; matrix[(int)tolower(c)][(int)d] = score; } } } }; // namespace SKESA-2.3.0/glb_align.hpp000066400000000000000000000114141335720214300150120ustar00rootroot00000000000000/*=========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. 
* * =========================================================================== * */ #ifndef GLBALIGN__HPP #define GLBALIGN__HPP #include #include #include #include #include using namespace std; namespace DeBruijn { typedef pair TCharAlign; typedef pair TRange; class CCigar { public: CCigar(int qto = -1, int sto = -1) : m_qfrom(qto+1), m_qto(qto), m_sfrom(sto+1), m_sto(sto) {} struct SElement { SElement(int l, char t) : m_len(l), m_type(t) {} int m_len; char m_type; // 'M' 'D' 'I' }; void PushFront(const SElement& el); void PushBack(const SElement& el); void PushFront(const CCigar other_cigar); string CigarString(int qstart, int qlen) const; // qstart, qlen identify notaligned 5'/3' parts string DetailedCigarString(int qstart, int qlen, const char* query, const char* subject) const; TRange QueryRange() const { return TRange(m_qfrom, m_qto); } TRange SubjectRange() const { return TRange(m_sfrom, m_sto); } TCharAlign ToAlign(const char* query, const char* subject) const; int Matches(const char* query, const char* subject) const; int Distance(const char* query, const char* subject) const; int Score(const char* query, const char* subject, int gopen, int gapextend, const char delta[256][256]) const; private: list m_elements; int m_qfrom, m_qto, m_sfrom, m_sto; }; //Needleman-Wunsch CCigar GlbAlign(const char* query, int querylen, const char* subject, int subjectlen, int gopen, int gapextend, const char delta[256][256]); //Smith-Waterman CCigar LclAlign(const char* query, int querylen, const char* subject, int subjectlen, int gopen, int gapextend, const char delta[256][256]); //Smith-Waterman with optional NW ends CCigar LclAlign(const char* query, int querylen, const char* subject, int subjectlen, int gopen, int gapextend, bool pinleft, bool pinright, const char delta[256][256]); //variable band Smith-Waterman (traceback matrix full) CCigar VariBandAlign(const char* query, int querylen, const char* subject, int subjectlen, int gopen, int gapextend, const char 
delta[256][256], const TRange* subject_limits); //band Smith-Waterman (traceback matrix banded) CCigar BandAlign(const char* query, int querylen, const char* subject, int subjectlen, int gopen, int gapextend, const char delta[256][256], int band); struct SMatrix { SMatrix(int match, int mismatch); // matrix for DNA SMatrix(); // matrix for proteins blosum62 char matrix[256][256]; }; template int EditDistance(const T &s1, const T & s2) { const int len1 = s1.size(), len2 = s2.size(); vector col(len2+1), prevCol(len2+1); for (int i = 0; i < (int)prevCol.size(); i++) prevCol[i] = i; for (int i = 0; i < len1; i++) { col[0] = i+1; for (int j = 0; j < len2; j++) col[j+1] = min( min( 1 + col[j], 1 + prevCol[1 + j]), prevCol[j] + (s1[i]==s2[j] ? 0 : 1) ); col.swap(prevCol); } return prevCol[len2]; } template double Entropy(RandomIterator start, size_t length) { if(length == 0) return 0; double tA = 1.e-8; double tC = 1.e-8; double tG = 1.e-8; double tT = 1.e-8; for(auto it = start; it != start+length; ++it) { switch(*it) { case 'A': tA += 1; break; case 'C': tC += 1; break; case 'G': tG += 1; break; case 'T': tT += 1; break; default: break; } } double entropy = -(tA*log(tA/length)+tC*log(tC/length)+tG*log(tG/length)+tT*log(tT/length))/(length*log(4.)); return entropy; } }; // namespace #endif // GLBALIGN__HPP SKESA-2.3.0/graphdigger.hpp000066400000000000000000004731201335720214300153650ustar00rootroot00000000000000/*=========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. 
* Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== * */ #ifndef _GraphDigger_ #define _GraphDigger_ #include #include #include #include #include "glb_align.hpp" #include "DBGraph.hpp" namespace DeBruijn { /************************ General description Class CDBGraphDigger, defined in this file, performs most of the actual assembling work. struct SContig is used to hold assembled sequences. Members are deque m_seq; // sequence representing contig deque m_kmers; // all kmers of this contig sequence int m_kmer_len; // size of kmers used for building the contig CDBGraph::Node m_next_left; // denied left kmer (connection possible but it is already owned) CDBGraph::Node m_next_right; // denied right kmer (connection possible but it is already owned) SContig* m_left_link; // if set points to 'left' contig SContig* m_right_link; // if set points to 'right' contig int m_left_shift; // shift+1 for m_next_left in this contig (positive for the right end) int m_right_shift; // shift+1 for m_next_right in this contig (positive for the right end) int m_left_extend; // number of newly assembled bases which could be clipped int m_right_extend; // number of newly assembled bases which could be clipped SAtomic m_is_taken; There are three main scenarios how this structure may be created 1) From a previously assembled contig represented by a c++ string. 
In this case, no members other than m_seq, m_kmers, and m_kmer_len change their default zero value. 2) Assembled starting from one of the kmers which has not been used so far. Because assembly is done in multiple threads, two or more threads could start assembling the same contig from different starting kmers. At some point they will collide with each other and will try to obtain a kmer which has already been used by the other contig in a different thread. When this happens, the thread stops extending the contig and assigns the denied kmer to m_next_left or m_next_right. These partially assembled contigs (which internally are called fragments) could be connected to each other using m_next_left/m_next_right. It is done in ConnectFragments(). 3) When we increase the kmer size, some previously assembled contigs could be extended or connected because the longer kmer could resolve some repeats. To achieve this, we assemble new contigs starting from each of the flank kmers. When these contigs are started, m_left_link/m_right_link are assigned to point to the parent contig and m_left_shift/m_right_shift are assigned to indicate the start position. Because the work is done in mutiple threads, the contigs could come in two fragments if they started from different contigs and come together. The connected fragments will have links on both sides. These contigs are 'connectors'. The rest of contigs are 'extenders'. They are used by ConnectAndExtendContigs() to form a new contig set. There is an important corner case for connectors. A thread could start from contig A and finish assembling a connector all the way to contig B before some other threads starts dealing with B. In this case the sequence starting from B will not contain any real bases but will only have m_next_left/m_next_right and a link. Those are mentioned in the code as 'empty linkers' and should be treated as special cases. 
A note about multiprocessing in 2) and 3): In both cases, the threads should be able to decide which kmers or contigs are still available for work. For communicating this information between the threads, the code uses lock-free c++ atomic variables. For the kmers, this is stored in m_visited vector in CDBGraph. For the contigs, this is stored in m_is_taken. The length of newly assembled sequence is stored in m_left_extend/m_right_extend. ************************/ mutex out_mutex; typedef deque TVariation; struct SeqInterval { SeqInterval(TVariation::iterator b, TVariation::iterator e) : begin(b), end(e) {} bool operator<(const SeqInterval& other) const { return lexicographical_compare(begin, end, other.begin, other.end); } bool operator==(const SeqInterval& other) const { return equal(begin, end, other.begin); } TVariation::iterator begin; TVariation::iterator end; }; typedef forward_list TLocalVariants; class CContigSequence : public deque { public: int m_left_repeat = 0; // number of bases which COULD be in repeat int m_right_repeat = 0; // number of bases which COULD be in repeat bool m_circular = false; int VariantsNumber(int chunk) { return distance((*this)[chunk].begin(), (*this)[chunk].end()); } bool UniqueChunk(int chunk) const { auto it = (*this)[chunk].begin(); return (it != (*this)[chunk].end() && ++it == (*this)[chunk].end()); } bool VariableChunk(int chunk) const { auto it = (*this)[chunk].begin(); return (it != (*this)[chunk].end() && ++it != (*this)[chunk].end()); } size_t ChunkLenMax(int chunk) const { size_t mx = 0; for(auto& seq : (*this)[chunk]) mx = max(mx, seq.size()); return mx; } size_t ChunkLenMin(int chunk) const { size_t mn = numeric_limits::max(); for(auto& seq : (*this)[chunk]) mn = min(mn, seq.size()); return mn; } size_t LenMax() const { size_t len = 0; for(unsigned chunk = 0; chunk < size(); ++chunk) len += ChunkLenMax(chunk); return len; } size_t LenMin() const { size_t len = 0; for(unsigned chunk = 0; chunk < size(); ++chunk) len += 
ChunkLenMin(chunk); return len; } void InsertNewVariant() { back().emplace_front(); } void InsertNewVariant(char c) { back().emplace_front(1, c); } template void InsertNewVariant(ForwardIterator b, ForwardIterator e) { back().emplace_front(b, e); } void ExtendTopVariant(char c) { back().front().push_back(c); } template void ExtendTopVariant(ForwardIterator b, ForwardIterator e) { back().front().insert(back().front().end(), b, e); } void InsertNewChunk() { emplace_back(); } template void InsertNewChunk(ForwardIterator b, ForwardIterator e) { InsertNewChunk(); InsertNewVariant(b, e); } void StabilizeVariantsOrder() { for(auto& chunk : *this) chunk.sort(); } void ReverseComplement() { std::swap(m_left_repeat, m_right_repeat); reverse(begin(), end()); for(auto& chunk : *this) { for(auto& seq : chunk) ReverseComplementSeq(seq.begin(), seq.end()); } StabilizeVariantsOrder(); } bool RemoveShortUniqIntervals(int min_uniq_len) { if(size() >= 5) { for(unsigned i = 2; i < size()-2; ) { if((int)ChunkLenMax(i) < min_uniq_len) { auto& new_chunk = *insert(begin()+i+2, TLocalVariants()); // empty chunk for(auto& var1 : (*this)[i-1]) { for(auto& var2 : (*this)[i+1]) { new_chunk.push_front(var1); auto& seq = new_chunk.front(); seq.insert(seq.end(), (*this)[i].front().begin(), (*this)[i].front().end()); seq.insert(seq.end(), var2.begin(), var2.end()); } } erase(begin()+i-1, begin()+i+2); } else { i += 2; } } } if(m_circular && size() >= 5 && (int)(ChunkLenMax(0)+ChunkLenMax(size()-1)) < min_uniq_len) { // rotate contig so that short interval is in the middle ExtendTopVariant(front().front().begin(), front().front().end()); // add first chunk to the last pop_front(); push_back(front()); // move first variable chunk to end pop_front(); InsertNewChunk(); ExtendTopVariant(front().front()[0]); // move first base to the end front().front().pop_front(); RemoveShortUniqIntervals(min_uniq_len); return true; } return false; } void ContractVariableIntervals() { if(size() > 2) { for(unsigned i = 
1; i < size()-1; ++i) { if(VariableChunk(i)) { bool all_same = true; while(all_same) { for(auto& seq : (*this)[i]) { if(seq.empty() || seq.front() != (*this)[i].front().front()) { all_same = false; break; } } if(all_same) { (*this)[i-1].front().push_back((*this)[i].front().front()); for(auto& seq : (*this)[i]) seq.pop_front(); } } all_same = true; while(all_same) { for(auto& seq : (*this)[i]) { if(seq.empty() || seq.back() != (*this)[i].front().back()) { all_same = false; break; } } if(all_same) { (*this)[i+1].front().push_front((*this)[i].front().back()); for(auto& seq : (*this)[i]) seq.pop_back(); } } } } } } bool AllSameL(int chunk, int shift) const { if(!VariableChunk(chunk)) return false; auto Symbol_i = [](const TVariation& seq, const TVariation& next, unsigned i) { if(i < seq.size()) return seq[i]; else if(i < seq.size()+next.size()-1 ) // -1 - we don't want to absorb all next return next[i-seq.size()]; else return char(0); }; char symb = Symbol_i((*this)[chunk].front(), (*this)[chunk+1].front(), shift); if(!symb) return false; auto it = (*this)[chunk].begin(); for(++it; it != (*this)[chunk].end(); ++it) { if(Symbol_i(*it, (*this)[chunk+1].front(), shift) != symb) return false; } return true; } bool AllSameR(int chunk, int shift) const { if(!VariableChunk(chunk)) return false; auto Symbol_i = [](const TVariation& seq, const TVariation& prev, unsigned i) { if(i < seq.size()) return seq[seq.size()-1-i]; else if(i < seq.size()+prev.size()-1) // -1 - we don't want to absorb all prev return prev[prev.size()+seq.size()-1-i]; else return char(0); }; char symb = Symbol_i((*this)[chunk].front(), (*this)[chunk-1].front(), shift); if(!symb) return false; auto it = (*this)[chunk].begin(); for(++it; it != (*this)[chunk].end(); ++it) { if(Symbol_i(*it, (*this)[chunk-1].front(), shift) != symb) return false; } return true; } void IncludeRepeatsInVariableIntervals() { for(unsigned chunk = 1; chunk < size()-1; chunk += 2) { int min_len = ChunkLenMin(chunk); for(int shift = 
0; AllSameL(chunk, shift); ++shift) { if(shift >= min_len) { for(auto& seq : (*this)[chunk]) { seq.push_back((*this)[chunk+1].front().front()); } (*this)[chunk+1].front().pop_front(); ++min_len; } } for(int shift = 0; AllSameR(chunk, shift); ++shift) { if(shift >= min_len) { for(auto& seq : (*this)[chunk]) { seq.push_front((*this)[chunk-1].front().back()); } (*this)[chunk-1].front().pop_back(); } } } } }; typedef list TContigSequenceList; void CombineSimilarContigs(TContigSequenceList& contigs) { int match = 1; int mismatch = 2; int gap_open = 5; int gap_extend = 2; SMatrix delta(match, mismatch); list all_variants; for(auto& contig : contigs) { list variants; for(auto& seq : contig[0]) variants.emplace_back(seq.begin(), seq.end()); for(unsigned l = 1; l < contig.size(); ++l) { if(contig.UniqueChunk(l)) { for(auto& seq : variants) seq.insert(seq.end(), contig[l].front().begin(), contig[l].front().end()); } else { list new_variants; for(auto& seq : variants) { for(auto& var : contig[l]) { new_variants.push_back(seq); new_variants.back().insert(new_variants.back().end(), var.begin(), var.end()); } } swap(variants, new_variants); } } all_variants.splice(all_variants.end(), variants); } all_variants.sort([](const string& a, const string& b) { return a.size() > b.size(); }); list> all_groups; while(!all_variants.empty()) { string& query = all_variants.front(); list group; auto it_loop = all_variants.begin(); for(++it_loop; it_loop != all_variants.end(); ) { auto it = it_loop++; string& subject = *it; if(query.size()-subject.size() > 0.1*query.size()) continue; // CCigar cigar = LclAlign(query.c_str(), query.size(), subject.c_str(), subject.size(), gap_open, gap_extend, delta.matrix); CCigar cigar = BandAlign(query.c_str(), query.size(), subject.c_str(), subject.size(), gap_open, gap_extend, delta.matrix, 0.1*query.size()); if(cigar.QueryRange().first != 0 || cigar.QueryRange().second != (int)query.size()-1) continue; if(cigar.SubjectRange().first != 0 || 
cigar.SubjectRange().second != (int)subject.size()-1) continue; if(cigar.Matches(query.c_str(), subject.c_str()) < 0.9*query.size()) continue; TCharAlign align = cigar.ToAlign(query.c_str(), subject.c_str()); if(group.empty()) { group.emplace_back(align.first.begin(), align.first.end()); group.emplace_back(align.second.begin(), align.second.end()); } else { TVariation& master = group.front(); int mpos = 0; TVariation new_member; for(unsigned i = 0; i < align.first.size(); ++i) { if(align.first[i] == master[mpos]) { new_member.push_back(align.second[i]); ++mpos; } else if(master[mpos] == '-') { while(master[mpos] == '-') { new_member.push_back('-'); ++mpos; } new_member.push_back(align.second[i]); ++mpos; } else { // align.first[i] == '-' for(TVariation& seq : group) seq.insert(seq.begin()+mpos, '-'); new_member.push_back(align.second[i]); ++mpos; } } group.push_back(new_member); } all_variants.erase(it); } if(group.empty()) group.emplace_back(query.begin(), query.end()); all_groups.push_back(move(group)); all_variants.pop_front(); } TContigSequenceList new_contigs; for(auto& group : all_groups) { if(group.size() == 1) { new_contigs.push_back(CContigSequence()); new_contigs.back().InsertNewChunk(group.front().begin(), group.front().end()); continue; } auto NextMismatch = [&](unsigned pos) { for( ; pos < group.front().size(); ++pos) { for(auto& seq : group) { if(seq[pos] != group.front()[pos]) return pos; } } return pos; }; CContigSequence combined_seq; int min_uniq_len = 21; for(unsigned mism = NextMismatch(0); mism < group.front().size(); mism = NextMismatch(0)) { if(mism > 0) { combined_seq.InsertNewChunk(group.front().begin(), group.front().begin()+mism); for(auto& seq : group) seq.erase(seq.begin(), seq.begin()+mism); } for(unsigned len = 1; len <= group.front().size(); ) { unsigned next_mism = NextMismatch(len); if(next_mism >= len+min_uniq_len || next_mism == group.front().size()) { map> varmap; for(auto& seq : group) varmap[SeqInterval(seq.begin(), 
seq.begin()+len)].emplace(seq.begin()+len, seq.end()); bool all_same = true; for(auto it = varmap.begin(); all_same && ++it != varmap.end(); ) { if(varmap.begin()->second != it->second) all_same = false; } if(all_same) { combined_seq.InsertNewChunk(varmap.begin()->first.begin, varmap.begin()->first.end); for(auto it = varmap.begin(); ++it != varmap.end(); ) combined_seq.InsertNewVariant(it->first.begin, it->first.end); for(auto& seq : group) seq.erase(seq.begin(), seq.begin()+len); group.sort(); group.erase(unique(group.begin(),group.end()), group.end()); break; } } len = next_mism+1; } } if(!group.front().empty()) combined_seq.InsertNewChunk(group.front().begin(), group.front().end()); for(auto& chunk : combined_seq) { for(auto& seq : chunk) seq.erase(remove(seq.begin(),seq.end(),'-'), seq.end()); } new_contigs.push_back(move(combined_seq)); } swap(contigs, new_contigs); } typedef list TStrList; template using TBases = deque; template class SContig; template using TContigList = list>; template struct SContig { typedef typename DBGraph::Node Node; typedef forward_list TNodeList; SContig(DBGraph& graph) :m_graph(graph), m_kmer_len(graph.KmerLen()) {} SContig(const CContigSequence& contig, DBGraph& graph) : m_seq(contig), m_graph(graph), m_kmer_len(graph.KmerLen()) { GenerateKmersAndCleanSNPs(); } void GenerateKmersAndCleanSNPs() { if(m_seq.RemoveShortUniqIntervals(m_kmer_len)) RotateCircularToMinKmer(); int rotation = 0; bool extended = false; auto& first_chunk = m_seq.front().front(); auto& last_chunk = m_seq.back().front(); if(m_seq.m_circular && (int)(last_chunk.size()+first_chunk.size()) >= m_kmer_len-1) { extended = true; if((int)first_chunk.size() < m_kmer_len-1) { rotation = m_kmer_len-1-first_chunk.size(); first_chunk.insert(first_chunk.begin(), last_chunk.end()-rotation, last_chunk.end()); last_chunk.erase(last_chunk.end()-rotation, last_chunk.end()); } last_chunk.insert(last_chunk.end(), first_chunk.begin(), first_chunk.begin()+m_kmer_len-1); } for(int i = 
m_seq.size()-1; i >= 0; ) { if(i == (int)m_seq.size()-1) { if((int)m_seq.ChunkLenMax(i) >= m_kmer_len) { // last chunk size >= kmer_len CReadHolder rh(false); rh.PushBack(m_seq.back().front()); for(CReadHolder::kmer_iterator ik = rh.kbegin(m_kmer_len) ; ik != rh.kend(); ++ik) { Node node = m_graph.GetNode(*ik); if(node.isValid() && !m_graph.SetVisited(node)) m_graph.SetMultContig(node); } } --i; } else { // all uniq chunks >= kmer_len-1 if((int)m_seq.ChunkLenMax(i-1) >= m_kmer_len) { CReadHolder rh(false); rh.PushBack(m_seq[i-1].front()); for(CReadHolder::kmer_iterator ik = rh.kbegin(m_kmer_len); ik != rh.kend(); ++ik) { Node node = m_graph.GetNode(*ik); if(node.isValid() && !m_graph.SetVisited(node)) m_graph.SetMultContig(node); } } unordered_set kmers; list> failed_nodes; list failed_variants; for(auto prev = m_seq[i].before_begin(); ;++prev) { auto current = prev; if(++current == m_seq[i].end()) break; auto& variant = *current; int left = min(m_kmer_len-1, (int)m_seq.ChunkLenMax(i-1)); TVariation var_seq(m_seq[i-1].front().end()-left, m_seq[i-1].front().end()); var_seq.insert(var_seq.end(), variant.begin(), variant.end()); int right = min(m_kmer_len-1, (int)m_seq.ChunkLenMax(i+1)); var_seq.insert(var_seq.end(), m_seq[i+1].front().begin(), m_seq[i+1].front().begin()+right); if(i == 1 && m_seq.m_circular && !extended) { // can happen only if there is one long varianle chunk and short ends if(m_seq.size() != 3) throw runtime_error("Error in circular extension"); var_seq.insert(var_seq.end(), var_seq.begin(), var_seq.begin()+m_kmer_len-1); } CReadHolder rh(false); rh.PushBack(var_seq); deque var_nodes; bool failed = false; for(CReadHolder::kmer_iterator ik = rh.kbegin(m_kmer_len) ; ik != rh.kend(); ++ik) { var_nodes.emplace_back(m_graph.GetNode(*ik)); if(!var_nodes.back().isValid()) failed = true; } if(failed) { failed_nodes.push_back(move(var_nodes)); failed_variants.push_front(prev); // reverse order for deleting } else { kmers.insert(var_nodes.begin(), 
var_nodes.end()); } } if((int)failed_variants.size() == m_seq.VariantsNumber(i)) { // all failed for(auto& nodes : failed_nodes) kmers.insert(nodes.begin(), nodes.end()); } else { // some are good for(auto prev : failed_variants) m_seq[i].erase_after(prev); if(m_seq.UniqueChunk(i)) { // only one left m_seq[i-1].front().insert(m_seq[i-1].front().end(), m_seq[i].front().begin(), m_seq[i].front().end()); m_seq[i-1].front().insert(m_seq[i-1].front().end(), m_seq[i+1].front().begin(), m_seq[i+1].front().end()); m_seq.erase(m_seq.begin()+i, m_seq.begin()+i+2); } } for(auto& node : kmers) { if(node.isValid() && !m_graph.SetVisited(node)) m_graph.SetMultContig(node); } i -= 2; } } if(m_seq.m_circular && extended) { last_chunk.erase(last_chunk.end()-m_kmer_len+1, last_chunk.end()); if(rotation > 0) { last_chunk.insert(last_chunk.end(), first_chunk.begin(), first_chunk.begin()+rotation); first_chunk.erase(first_chunk.begin(), first_chunk.begin()+rotation); } } m_seq.ContractVariableIntervals(); m_seq.IncludeRepeatsInVariableIntervals(); if(m_seq.RemoveShortUniqIntervals(m_kmer_len)) RotateCircularToMinKmer(); m_seq.StabilizeVariantsOrder(); } SContig(const SContig& to_left, const SContig& to_right, const Node& initial_node, const Node& lnode, const Node& rnode, DBGraph& graph) : m_next_left(lnode), m_next_right(rnode), m_graph(graph), m_kmer_len(graph.KmerLen()) { // initial_node - the starting kmer // to_left - left extension of the starting kmer // to_right - right extension of the starting kmer // lnode - left denied node // rnode - right denied node // graph - de Bruijn graph // take parts of the assembled sequence and put them together in SContig if(!to_left.m_seq.empty()) { m_seq = to_left.m_seq; ReverseComplement(); } // could be changed by ReverseComplement m_next_left = lnode; m_next_right = rnode; string ikmer = graph.GetNodeSeq(initial_node); if(m_seq.empty() || m_seq.VariableChunk(m_seq.size()-1)) // empty or variant m_seq.InsertNewChunk(ikmer.begin(), 
ikmer.end());
else
    m_seq.ExtendTopVariant(ikmer.begin(), ikmer.end());
// NOTE(review): template argument lists appear to have been stripped from this
// copy of the file (e.g. 'vector successors', 'tuple MinKmerPosition');
// comments below document intent only — verify code against the upstream source.
if(!to_right.m_seq.empty()) {
    if(to_right.m_seq.UniqueChunk(0)) { // right extension starts with a unique chunk - merge it into the top chunk
        m_seq.ExtendTopVariant(to_right.m_seq.front().front().begin(), to_right.m_seq.front().front().end());
        m_seq.insert(m_seq.end(), to_right.m_seq.begin()+1, to_right.m_seq.end());
    } else {
        m_seq.insert(m_seq.end(), to_right.m_seq.begin(), to_right.m_seq.end());
    }
}
m_seq.StabilizeVariantsOrder();
m_left_extend = m_right_extend = LenMax();
}

// Constructs a contig started from the end of another contig
// link         - contig from whose end this extension started
// shift        - shift+1 for the takeoff node in the linked contig
// takeoff_node - kmer from which the extension started (its first base is not included)
// extension    - sequence assembled to the right of the takeoff node
// rnode        - right denied node
// graph        - de Bruijn graph
SContig(SContig* link, int shift, const Node& takeoff_node, const SContig& extension, const Node& rnode, DBGraph& graph) :
    m_next_left(takeoff_node), m_next_right(rnode), m_left_link(link), m_left_shift(shift), m_graph(graph), m_kmer_len(graph.KmerLen())
{
    string kmer = graph.GetNodeSeq(takeoff_node);
    m_seq.InsertNewChunk(kmer.begin()+1, kmer.end()); // don't include first base
    if(!extension.m_seq.empty()) {
        if(extension.m_seq.UniqueChunk(0)) { // merge leading unique chunk of the extension into the top chunk
            m_seq.ExtendTopVariant(extension.m_seq.front().front().begin(), extension.m_seq.front().front().end());
            m_seq.insert(m_seq.end(), extension.m_seq.begin()+1, extension.m_seq.end());
        } else {
            m_seq.insert(m_seq.end(), extension.m_seq.begin(), extension.m_seq.end());
        }
    }
    m_seq.StabilizeVariantsOrder();
    m_left_extend = m_right_extend = LenMax();
}

// Graph node for the first kmer of the contig; invalid Node if the first chunk
// is variable or shorter than the kmer length
Node FrontKmer() const {
    if(m_seq.VariableChunk(0) || (int)m_seq.ChunkLenMax(0) < m_kmer_len)
        return Node();
    TKmer kmer(m_seq.front().front().begin(), m_seq.front().front().begin()+m_kmer_len); // front must be unambiguous
    return m_graph.GetNode(kmer);
}
// Graph node for the last kmer of the contig; invalid Node if the last chunk
// is variable or shorter than the kmer length
Node BackKmer() const {
    int last = m_seq.size()-1;
    if(m_seq.VariableChunk(last) || (int)m_seq.ChunkLenMax(last) < m_kmer_len)
        return Node();
    TKmer kmer(m_seq.back().front().end()-m_kmer_len, m_seq.back().front().end());
    return m_graph.GetNode(kmer);
}

// don't 'own' any kmers
bool EmptyLinker() const { return ((int)max(m_seq.ChunkLenMax(0), m_seq.ChunkLenMax(m_seq.size()-1)) < m_kmer_len && m_seq.size() <= 3); }
// true when the right (left) end is a short unique chunk following (preceding) a variable chunk
bool RightSNP() const { return (m_seq.size() >= 3 && m_seq.UniqueChunk(m_seq.size()-1) && (int)m_seq.ChunkLenMax(m_seq.size()-1) < m_kmer_len); }
bool LeftSNP() const { return (m_seq.size() >= 3 && m_seq.UniqueChunk(0) && (int)m_seq.ChunkLenMax(0) < m_kmer_len); }

// Node used to connect this contig on the right: the back kmer for a normal end,
// the last full kmer before the right SNP, or the denied node for an empty linker
Node RightConnectingNode() const {
    int last_index = m_seq.size()-1;
    if((int)m_seq.ChunkLenMax(last_index) >= m_kmer_len) { // normal end
        return BackKmer();
    } else if(m_seq.size() >= 3) { // snp
        if((int)m_seq.ChunkLenMax(last_index-2) >= m_kmer_len) {
            TKmer kmer(m_seq[last_index-2].front().end()-m_kmer_len, m_seq[last_index-2].front().end());
            return m_graph.GetNode(kmer);
        }
    }
    return m_next_left; // empty linker
}
// Mirror of RightConnectingNode() for the left end
Node LeftConnectingNode() const {
    if((int)m_seq.ChunkLenMax(0) >= m_kmer_len) { // normal end
        return FrontKmer();
    } else if(m_seq.size() >= 3) { // snp
        if((int)m_seq.ChunkLenMax(2) >= m_kmer_len) {
            TKmer kmer(m_seq[2].front().begin(), m_seq[2].front().begin()+m_kmer_len); // front must be unambiguous
            return m_graph.GetNode(kmer);
        }
    }
    return m_next_right; // empty linker
}

// Reverse-complements the sequence and swaps all left/right bookkeeping
void ReverseComplement() {
    m_seq.ReverseComplement();
    swap(m_next_left, m_next_right);
    m_next_left = DBGraph::ReverseComplement(m_next_left);
    m_next_right = DBGraph::ReverseComplement(m_next_right);
    swap(m_left_link, m_right_link);
    swap(m_left_shift, m_right_shift);
    swap(m_left_extend, m_right_extend);
}

// Appends 'other' to the right end, merging the kmer_len-1 overlap
void AddToRight(const SContig& other) {
    m_seq.m_circular = false;
    m_next_right = other.m_next_right;
    m_right_link = other.m_right_link;
    m_right_shift = other.m_right_shift;
    if(EmptyLinker() && other.EmptyLinker())
        return;

    auto& last_chunk = m_seq.back().front();
    int last_chunk_len = last_chunk.size();
    int overlap = m_kmer_len-1;
    auto first_other_chunk_it = other.m_seq.begin();
    if(RightSNP() && other.LeftSNP()) { // skip snp chunk
        overlap = last_chunk_len+other.m_seq.ChunkLenMax(1)+first_other_chunk_it->front().size();
        first_other_chunk_it += 2;
    }
    if(other.m_right_extend < (int)other.LenMax()) {
        m_right_extend = other.m_right_extend;
    } else {
        m_right_extend += other.m_right_extend-overlap;
        if(m_left_extend == (int)LenMax())
            m_left_extend = m_right_extend;
    }
    auto& first_other_chunk = first_other_chunk_it->front();
    last_chunk.insert(last_chunk.end(), first_other_chunk.begin()+min(m_kmer_len-1,last_chunk_len), first_other_chunk.end()); // combine overlapping chunks
    m_seq.insert(m_seq.end(), first_other_chunk_it+1, other.m_seq.end()); // insert remaining chunks
}
// Prepends 'other' to the left end, merging the kmer_len-1 overlap
void AddToLeft(const SContig& other) {
    m_seq.m_circular = false;
    m_next_left = other.m_next_left;
    m_left_link = other.m_left_link;
    m_left_shift = other.m_left_shift;
    if(EmptyLinker() && other.EmptyLinker())
        return;

    auto& first_chunk = m_seq.front().front();
    int first_chunk_len = first_chunk.size();
    int overlap = m_kmer_len-1;
    auto last_other_chunk_it = other.m_seq.end()-1;
    if(LeftSNP() && other.RightSNP()) { // skip snp chunk
        overlap = first_chunk_len+other.m_seq.ChunkLenMax(other.m_seq.size()-2)+last_other_chunk_it->front().size();
        last_other_chunk_it -= 2;
    }
    if(other.m_left_extend < (int)other.LenMax()) {
        m_left_extend = other.m_left_extend;
    } else {
        m_left_extend += other.m_left_extend-overlap;
        if(m_right_extend == (int)LenMax())
            m_right_extend = m_left_extend;
    }
    auto& last_other_chunk = last_other_chunk_it->front();
    first_chunk.insert(first_chunk.begin(),last_other_chunk.begin(), last_other_chunk.end()-min(m_kmer_len-1,first_chunk_len)); // combine overlapping chunks
    m_seq.insert(m_seq.begin(), other.m_seq.begin(), last_other_chunk_it); // insert remaining chunks
}

// Removes 'clip' bases from the right end (whole variable/short chunks first)
// and clears the contig if it became shorter than kmer_len-1
void ClipRight(int clip) {
    if(clip <= 0)
        return;

    m_seq.m_circular = false;
    m_next_right = Node();
    m_right_link = nullptr;
    m_right_shift = 0;
    while(!m_seq.empty() && (m_seq.VariableChunk(m_seq.size()-1) || (int)m_seq.ChunkLenMax(m_seq.size()-1) <= clip)) {
        int chunk_len = m_seq.ChunkLenMax(m_seq.size()-1);
        clip -= chunk_len;
        m_right_extend = max(0, m_right_extend-chunk_len);
        m_seq.pop_back();
    }
    if(clip > 0 && !m_seq.empty()) {
        m_right_extend = max(0, m_right_extend-clip);
        m_seq.back().front().erase(m_seq.back().front().end()-clip, m_seq.back().front().end());
    }
    if((int)LenMin() < m_kmer_len-1)
        m_seq.clear();
}
// Mirror of ClipRight() for the left end
void ClipLeft(int clip) {
    if(clip <= 0)
        return;

    m_seq.m_circular = false;
    m_next_left = Node();
    m_left_link = nullptr;
    m_left_shift = 0;
    while(!m_seq.empty() && (m_seq.VariableChunk(0) || (int)m_seq.ChunkLenMax(0) <= clip)) {
        int chunk_len = m_seq.ChunkLenMax(0);
        clip -= chunk_len;
        m_left_extend = max(0, m_left_extend-chunk_len);
        m_seq.pop_front();
    }
    if(clip > 0 && !m_seq.empty()) {
        m_left_extend = max(0, m_left_extend-clip);
        m_seq.front().front().erase(m_seq.front().front().begin(), m_seq.front().front().begin()+clip);
    }
    if((int)LenMin() < m_kmer_len-1)
        m_seq.clear();
}

size_t LenMax() const { return m_seq.LenMax(); }
size_t LenMin() const { return m_seq.LenMin(); }

// Finds the position of the minimal canonical short kmer (length <= 21) in the
// contig; strand 0 in the result marks a repeated (ambiguous) minimal kmer
tuple MinKmerPosition() const { //chunk, position in chunk, strand
    int kmer_len = min(21, m_kmer_len); // duplicated in RotateCircularToMinKmer()
    typedef LargeInt<1> large_t;
    unordered_map, SKmerHash> kmers; // [kmer], chunk, position in chunk, strand/notvalid

    for(int i = m_seq.size()-1; i >= 0; i -= 2) { // unique chunks only
        deque>> chunk_kmers;
        if(i == (int)m_seq.size()-1) {
            if((int)m_seq.ChunkLenMax(i) >= kmer_len) { // last chunk could be short
                chunk_kmers.resize(m_seq.ChunkLenMax(i)-kmer_len+1);
                CReadHolder rh(false);
                rh.PushBack(m_seq.back().front());
                int pos = chunk_kmers.size();
                for(CReadHolder::kmer_iterator ik = rh.kbegin(kmer_len) ; ik != rh.kend(); ++ik) // iteration from last kmer to first
                    chunk_kmers[--pos].push_front(get(TKmer::Type(*ik)));
            }
        } else { // all uniq chunks in the middle >= kmer_len-1; first/last could be short
            chunk_kmers.resize(m_seq.ChunkLenMax(i)+m_seq.ChunkLenMax(i+1));
            if((int)m_seq.ChunkLenMax(i) >= kmer_len) {
                TVariation seq(m_seq[i].front().begin(), m_seq[i].front().end());
                CReadHolder rh(false);
                rh.PushBack(seq);
                int pos = seq.size()-kmer_len+1;
                for(CReadHolder::kmer_iterator ik = rh.kbegin(kmer_len) ; ik != rh.kend(); ++ik) // iteration from last kmer to first
                    chunk_kmers[--pos].push_front(get(TKmer::Type(*ik)));
            }
            for(auto& variant : m_seq[i+1]) { // kmers spanning each variant of the next variable chunk
                TVariation seq;
                if((int)m_seq.ChunkLenMax(i) >= kmer_len-1)
                    seq.insert(seq.end(), m_seq[i].front().end()-kmer_len+1, m_seq[i].front().end());
                else
                    seq.insert(seq.end(), m_seq[i].front().begin(), m_seq[i].front().end());
                seq.insert(seq.end(), variant.begin(), variant.end());
                if((int)m_seq.ChunkLenMax(i+2) >= kmer_len-1)
                    seq.insert(seq.end(), m_seq[i+2].front().begin(), m_seq[i+2].front().begin()+kmer_len-1);
                else
                    seq.insert(seq.end(), m_seq[i+2].front().begin(), m_seq[i+2].front().end());
                CReadHolder rh(false);
                rh.PushBack(seq);
                int pos = seq.size()-kmer_len+1;
                for(CReadHolder::kmer_iterator ik = rh.kbegin(kmer_len) ; ik != rh.kend(); ++ik) // iteration from last kmer to first
                    chunk_kmers[--pos].push_front(get(TKmer::Type(*ik)));
            }
        }

        for(unsigned pos = 0; pos < chunk_kmers.size(); ++pos) {
            int k = pos;
            int chunk = i;
            if(pos >= m_seq.ChunkLenMax(i)) { // position falls into the following variable chunk
                k = pos-m_seq.ChunkLenMax(i);
                chunk = i+1;
            }
            for(auto& kmer : chunk_kmers[pos]) {
                int strand = 1;
                large_t* min_kmerp = &kmer;
                large_t rkmer = revcomp(kmer, kmer_len);
                if(rkmer < kmer) { // canonical form is the reverse complement
                    strand = -1;
                    min_kmerp = &rkmer;
                }
                auto rslt = kmers.insert(make_pair(*min_kmerp, make_tuple(chunk, k, strand)));
                if(!rslt.second) // seen before - mark as repeat
                    get<2>(rslt.first->second) = 0;
            }
        }
    }

    tuple rslt(0, 0, 0);
    large_t min_kmer;
    for(auto& elem : kmers) {
        if(get<2>(elem.second)) { // not a repeat
            if(!get<2>(rslt) || elem.first < min_kmer) {
                min_kmer = elem.first;
                rslt = elem.second;
            }
        }
    }
    return rslt;
}

// stabilize contig orientation using minimal kmer in the contig
void SelectMinDirection() {
    int strand = get<2>(MinKmerPosition());
    if(strand < 0)
        ReverseComplement();
}

// finds stable origin for circular contigs by placing minimal kmer at the beginning of the sequence
void RotateCircularToMinKmer() { // assumes that the next extension of sequence would give the first kmer (m_next_right == m_kmers.front())
    int kmer_len = min(21, m_kmer_len);
    m_seq.back().front().erase(m_seq.back().front().end()-(m_kmer_len-kmer_len), m_seq.back().front().end()); // clip extra portion of overlap
    auto rslt = MinKmerPosition();
    if(get<2>(rslt) == 0) // minimal kmer is a repeat - no stable origin
        return;

    m_seq.back().front().erase(m_seq.back().front().end()-kmer_len+1, m_seq.back().front().end()); // clip remaining overlap
    size_t first_chunk = get<0>(rslt);
    size_t first_base = get<1>(rslt);
    if(get<2>(rslt) < 0) { // minimal kmer found on the minus strand - move origin past it
        first_base += min(21, m_kmer_len);
        while(first_base >= m_seq.ChunkLenMax(first_chunk)) {
            first_base -= m_seq.ChunkLenMax(first_chunk);
            first_chunk = (first_chunk+1)%m_seq.size();
        }
        if(m_seq.VariableChunk(first_chunk)) { // ambiguous interval - we don't want to cut it
            ++first_chunk; // variable chunk cant be last
            first_base = 1;
        } else if(first_chunk > 0 && first_base == 0) { // we want some uniq intervals on both ends
            first_base = 1;
        }
    } else {
        if(m_seq.VariableChunk(first_chunk)) { // ambiguous interval - we don't want to cut it
            --first_chunk; // variable chunk cant be first
            first_base = m_seq.ChunkLenMax(first_chunk)-1; // leave one base
        } else if(first_chunk > 0 && first_base == 0) { // we want some uniq intervals on both ends
            first_chunk -= 2;
            first_base = m_seq.ChunkLenMax(first_chunk)-1; // leave one base
        }
    }
    if(m_seq.size() == 1) { // single chunk - simple rotation
        rotate(m_seq.front().front().begin(), m_seq.front().front().begin()+first_base, m_seq.front().front().end());
    } else {
        if(first_chunk > 0) { // rotate whole chunks first
            auto& last_seq = m_seq.back().front();
            last_seq.insert(last_seq.end(), m_seq.front().front().begin(), m_seq.front().front().end());
            m_seq.pop_front();
            rotate(m_seq.begin(), m_seq.begin()+first_chunk-1, m_seq.end());
        }
        if(first_base > 0) { // move the leading bases of the first chunk to the tail
            if(m_seq.VariableChunk(m_seq.size()-1)) {
                m_seq.InsertNewChunk();
                m_seq.InsertNewVariant();
            }
            auto& last_seq = m_seq.back().front();
            last_seq.insert(last_seq.end(), m_seq.front().front().begin(), m_seq.front().front().begin()+first_base);
            m_seq.front().front().erase(m_seq.front().front().begin(), m_seq.front().front().begin()+first_base);
        }
    }
    //clean edges
    m_next_left = Node();
    m_next_right = Node();
    m_left_link = nullptr;
    m_left_shift = 0;
    m_right_link = nullptr;
    m_right_shift = 0;
    m_left_extend = 0;  // prevents any further clipping
    m_right_extend = 0; // prevents any further clipping
    m_seq.m_circular = true;
}

bool operator<(const SContig& other) const { return m_seq < other.m_seq; }

// connects fragments created in different threads and combines doubled 'empty' linkers
static TContigList ConnectFragments(vector>& fragments, const DBGraph& graph) {
    int total = 0;
    size_t len = 0;
    for(auto& ns : fragments) {
        for(auto& seq : ns) {
            /*
            cerr << "Fragment: " << seq.m_left_link << " " << seq.m_right_link << endl;
            cerr << "Lnode: ";
            if(seq.m_next_left.isValid()) cerr << graph.GetNodeSeq(seq.m_next_left);
            cerr << endl;
            cerr << "Rnode: ";
            if(seq.m_next_right.isValid()) cerr << graph.GetNodeSeq(seq.m_next_right);
            cerr << endl;
            for(auto& chunk : seq.m_seq) {
                cerr << "Chunk:" << endl;
                for(auto& var : chunk) {
                    cerr << "Variant : ";
                    for(char c : var) cerr << c;
                    cerr << endl;
                }
            }
            */
            ++total;
            len += seq.LenMax()-graph.KmerLen()+1;
        }
    }
    cerr << "Fragments before: " << total << " " << len << endl;

    TContigList connected;
    // maps from denied end nodes to the (unique) fragment owning that end
    unordered_map::iterator, typename Node::Hash> denied_left_nodes;
    unordered_map::iterator, typename Node::Hash> denied_right_nodes;
    for(auto& ns : fragments) {
        for(auto iloop = ns.begin(); iloop != ns.end(); ) {
            auto ic = iloop++;
            connected.splice(connected.begin(), ns, ic); // move fragment into 'connected' without copying
            SContig& contig = *connected.begin();
            if(contig.m_next_left > contig.m_next_right) // need this to pair two identical empty links
                contig.ReverseComplement();

            if(contig.m_next_left.isValid()) {
                auto rslt = denied_left_nodes.insert(make_pair(contig.m_next_left, connected.begin()));
                if(!rslt.second) { // another fragment already claims this left node
                    typename TContigList::iterator other = rslt.first->second;
                    if(contig.m_left_link && other->m_right_link) { // other started from end of contig and went all the way to another contig
                        other->m_left_link = contig.m_left_link; // add left link to other
                        connected.pop_front();
                        continue;
                    } else if(other->m_left_link && contig.m_right_link) { // contig started from end of contig and went all the way to another contig
                        contig.m_left_link = other->m_left_link; // add left link to contig
                        rslt.first->second = connected.begin();
                        if(other->m_next_right.isValid())
                            denied_right_nodes.erase(other->m_next_right);
                        connected.erase(other);
                        /*
                        if(contig.EmptyLinker() && contig.m_left_link && !other->m_left_link && contig.m_next_right == other->LeftConnectingNode()) {
                            other->AddToLeft(contig); // add left link to other
                            connected.pop_front();
                            continue;
                        }else if(other->EmptyLinker() && other->m_left_link && !contig.m_left_link && other->m_next_right == contig.LeftConnectingNode()) {
                            contig.AddToLeft(*other); // add left link to contig
                            rslt.first->second = connected.begin();
                            denied_right_nodes.erase(other->m_next_right);
                            connected.erase(other);
                        */
                    } else { // unresolvable conflict - diagnostic dump only
                        cerr << "Unexpected left fork: " << graph.GetNodeSeq(contig.m_next_left) << endl;
                        cerr << "Contig: " << contig.m_left_link << " " << contig.m_right_link << " ";
                        if(contig.m_next_left.isValid())
                            cerr << contig.m_graph.GetNodeSeq(contig.m_next_left) << " ";
                        else
                            cerr << "LC notvalid ";
                        if(contig.m_next_right.isValid())
                            cerr << contig.m_graph.GetNodeSeq(contig.m_next_right) << " ";
                        else
                            cerr << "RC notvalid ";
                        cerr << endl;
                        for(auto& chunk : contig.m_seq) {
                            cerr << "Chunk:" << endl;
                            for(auto& var : chunk) {
                                cerr << "Variant: ";
                                for(char c : var) cerr << c;
                                cerr << endl;
                            }
                        }
                        auto& other = *rslt.first->second;
                        cerr << "Other: " << other.m_left_link << " " << other.m_right_link << " ";
                        if(other.m_next_left.isValid())
                            cerr << other.m_graph.GetNodeSeq(other.m_next_left) << " ";
                        else
                            cerr << "LC notvalid ";
                        if(other.m_next_right.isValid())
                            cerr << other.m_graph.GetNodeSeq(other.m_next_right) << " ";
                        else
                            cerr << "RC notvalid ";
                        cerr << endl;
                        for(auto& chunk : other.m_seq) {
                            cerr << "Chunk:" << endl;
                            for(auto& var : chunk) {
                                cerr << "Variant: ";
                                for(char c : var) cerr << c;
                                cerr << endl;
                            }
                        }
                    }
                }
            }

            if(contig.m_next_right.isValid()) { // mirror of the left-node pairing above
                auto rslt = denied_right_nodes.insert(make_pair(contig.m_next_right, connected.begin()));
                if(!rslt.second) {
                    typename TContigList::iterator other = rslt.first->second;
                    if(contig.m_right_link && other->m_left_link) { // other started from end of contig and went all the way to another contig
                        other->m_right_link = contig.m_right_link; // add right link to other
                        denied_left_nodes.erase(contig.m_next_left);
                        connected.pop_front();
                    } else if(other->m_right_link && contig.m_left_link) { // contig started from end of contig and went all the way to another contig
                        contig.m_right_link = other->m_right_link; // add right link to contig
                        rslt.first->second = connected.begin();
                        if(other->m_next_left.isValid())
                            denied_left_nodes.erase(other->m_next_left);
                        connected.erase(other);
                        /*
                        if(contig.EmptyLinker() && contig.m_right_link && !other->m_right_link && contig.m_next_left == other->RightConnectingNode()) {
                            other->AddToRight(contig); // add right link to other
                            denied_left_nodes.erase(contig.m_next_left);
                            connected.pop_front();
                        } else if (other->EmptyLinker() && other->m_right_link && !contig.m_right_link && other->m_next_left == contig.RightConnectingNode()) {
                            contig.AddToRight(*other); // add right link to contig
                            rslt.first->second = connected.begin();
                            denied_left_nodes.erase(other->m_next_left);
                            connected.erase(other);
                        */
                    } else { // unresolvable conflict - diagnostic dump only
                        cerr << "Unexpected right fork: " << graph.GetNodeSeq(contig.m_next_right) << endl;
                        cerr << "Contig: " << contig.m_left_link << " " << contig.m_right_link << " ";
                        if(contig.m_next_left.isValid())
                            cerr << contig.m_graph.GetNodeSeq(contig.m_next_left) << " ";
                        else
                            cerr << "LC notvalid ";
                        if(contig.m_next_right.isValid())
                            cerr << contig.m_graph.GetNodeSeq(contig.m_next_right) << " ";
                        else
                            cerr << "RC notvalid ";
                        cerr << endl;
                        for(auto& chunk : contig.m_seq) {
                            cerr << "Chunk:" << endl;
                            for(auto& var : chunk) {
                                cerr << "Variant: ";
                                for(char c : var) cerr << c;
                                cerr << endl;
                            }
                        }
                        auto& other = *rslt.first->second;
                        cerr << "Other: " << other.m_left_link << " " << other.m_right_link << " ";
                        if(other.m_next_left.isValid())
                            cerr << other.m_graph.GetNodeSeq(other.m_next_left) << " ";
                        else
                            cerr << "LC notvalid ";
                        if(other.m_next_right.isValid())
                            cerr << other.m_graph.GetNodeSeq(other.m_next_right) << " ";
                        else
                            cerr << "RC notvalid ";
                        cerr << endl;
                        for(auto& chunk : other.m_seq) {
                            cerr << "Chunk:" << endl;
                            for(auto& var : chunk) {
                                cerr << "Variant: ";
                                for(char c : var) cerr << c;
                                cerr << endl;
                            }
                        }
                    }
                }
            }
        }
    }
    // NOTE(review): brace count in this copy of the file appears one short around
    // here (the loop over 'fragments' seems left unclosed) - verify against the
    // upstream source; tokens are reproduced as found.

    // greedily chain fragments whose connecting nodes match denied ends
    for(SContig& contig : connected) {
        if(contig.EmptyLinker())
            continue;
        if(contig.m_next_right.isValid())
            denied_right_nodes.erase(contig.m_next_right);
        if(contig.m_next_left.isValid())
            denied_left_nodes.erase(contig.m_next_left);
        bool keep_doing = true;
        while(keep_doing) {
            keep_doing = false;
            if(contig.m_next_right.isValid()) {
                Node rnode = contig.RightConnectingNode();
                auto rslt = denied_left_nodes.find(rnode);
                if(rslt != denied_left_nodes.end()) {
                    keep_doing = true;
                    SContig& rcontig = *rslt->second;
                    if(rcontig.m_next_right.isValid())
                        denied_right_nodes.erase(rcontig.m_next_right);
                    contig.AddToRight(rcontig);
                    connected.erase(rslt->second);
                    denied_left_nodes.erase(rslt);
                } else if((rslt = denied_right_nodes.find(DBGraph::ReverseComplement(rnode))) != denied_right_nodes.end()) { // match on the other strand
                    keep_doing = true;
                    SContig& rcontig = *rslt->second;
                    if(rcontig.m_next_left.isValid())
                        denied_left_nodes.erase(rcontig.m_next_left);
                    rcontig.ReverseComplement();
                    contig.AddToRight(rcontig);
                    connected.erase(rslt->second);
                    denied_right_nodes.erase(rslt);
                }
            }
            if(contig.m_next_left.isValid()) {
                Node lnode = contig.LeftConnectingNode();
                auto rslt = denied_right_nodes.find(lnode);
                if(rslt != denied_right_nodes.end()) {
                    keep_doing = true;
                    SContig& lcontig = *rslt->second;
                    if(lcontig.m_next_left.isValid())
                        denied_left_nodes.erase(lcontig.m_next_left);
                    contig.AddToLeft(lcontig);
                    connected.erase(rslt->second);
                    denied_right_nodes.erase(rslt);
                } else if((rslt = denied_left_nodes.find(DBGraph::ReverseComplement(lnode))) != denied_left_nodes.end()) { // match on the other strand
                    keep_doing = true;
                    SContig& lcontig = *rslt->second;
                    if(lcontig.m_next_right.isValid())
                        denied_right_nodes.erase(lcontig.m_next_right);
                    lcontig.ReverseComplement();
                    contig.AddToLeft(lcontig);
                    connected.erase(rslt->second);
                    denied_left_nodes.erase(rslt);
                }
            }
        }
        if(contig.m_next_right == contig.LeftConnectingNode() && (int)contig.LenMax() >= 2*graph.KmerLen()-1) // circular and not very short
            contig.RotateCircularToMinKmer();
    }

    total = 0;
    len = 0;
    for(auto& seq : connected) {
        ++total;
        len += seq.LenMax()-graph.KmerLen()+1;
    }
    cerr << "Fragments after: " << total << " " << len << endl;
    return connected;
}

// connects and extends contigs from previous iteration using a longer kmer
// scontigs - previous contigs
// extensions - connectors and extenders produced by longer kmer
static void ConnectAndExtendContigs(TContigList& scontigs, TContigList& extensions, int ncores) {
    if(scontigs.empty())
        return;
    int kmer_len = scontigs.front().m_kmer_len;
    int connectors = 0;
    int extenders = 0;

    //assign links to main contigs
    for(auto& ex : extensions) {
        if(ex.m_left_link && ex.m_right_link)
            ++connectors;
        else
            ++extenders;

        if(ex.m_left_link) {
            auto& contig = *ex.m_left_link;
            if(contig.RightConnectingNode() == ex.m_next_left) {
                if(contig.m_right_link)
                    throw runtime_error("Multiple connection of contigs");
                contig.m_right_link = &ex;
            } else if(contig.LeftConnectingNode() == DBGraph::ReverseComplement(ex.m_next_left)) {
                if(contig.m_left_link)
                    throw runtime_error("Multiple connection of contigs");
                contig.m_left_link = &ex;
            } else { // inconsistent link - diagnostic dump only
                cerr << "Corrupted connection of contigs L" << endl;
                cerr << "Contig: ";
                if(contig.LeftConnectingNode().isValid())
                    cerr << contig.m_graph.GetNodeSeq(contig.LeftConnectingNode()) << " ";
                else
                    cerr << "LC notvalid ";
                if(contig.RightConnectingNode().isValid())
                    cerr << contig.m_graph.GetNodeSeq(contig.RightConnectingNode()) << " ";
                else
                    cerr << "RC notvalid ";
                cerr << endl;
                for(auto& chunk : contig.m_seq) {
                    cerr << "Chunk:" << endl;
                    for(auto& var : chunk) {
                        cerr << "Variant: ";
                        for(char c : var) cerr << c;
                        cerr << endl;
                    }
                }
                cerr << "Extension: ";
                if(ex.m_next_left.isValid())
                    cerr << contig.m_graph.GetNodeSeq(ex.m_next_left);
                else
                    cerr << "Notvalid";
                cerr << endl;
                for(auto& chunk : ex.m_seq) {
                    cerr << "Chunk:" << endl;
                    for(auto& var : chunk) {
                        cerr << "Variant: ";
                        for(char c : var) cerr << c;
                        cerr << endl;
                    }
                }
                if(ex.m_next_right.isValid())
                    cerr << "NR: " << contig.m_graph.GetNodeSeq(ex.m_next_right) << endl;
                if(ex.m_right_link) {
                    auto& contig = *ex.m_right_link;
                    cerr << "RContig: ";
                    if(contig.LeftConnectingNode().isValid())
                        cerr << contig.m_graph.GetNodeSeq(contig.LeftConnectingNode()) << " ";
                    else
                        cerr << "LC notvalid ";
                    if(contig.RightConnectingNode().isValid())
                        cerr << contig.m_graph.GetNodeSeq(contig.RightConnectingNode()) << " ";
                    else
                        cerr << "RC notvalid ";
                    cerr << endl;
                    for(auto& chunk : contig.m_seq) {
                        cerr << "Chunk:" << endl;
                        for(auto& var : chunk) {
                            cerr << "Variant: ";
                            for(char c : var) cerr << c;
                            cerr << endl;
                        }
                    }
                }
                // throw runtime_error("Corrupted connection of contigs L");
            }
        }

        if(ex.m_right_link) { // mirror of the left-link assignment above
            auto& contig = *ex.m_right_link;
            if(contig.LeftConnectingNode() == ex.m_next_right) {
                if(contig.m_left_link)
                    throw runtime_error("Multiple connection of contigs");
                contig.m_left_link = &ex;
            } else if(contig.RightConnectingNode() == DBGraph::ReverseComplement(ex.m_next_right)) {
                if(contig.m_right_link)
                    throw runtime_error("Multiple connection of contigs");
                contig.m_right_link = &ex;
            } else { // inconsistent link - diagnostic dump only
                cerr << "Corrupted connection of contigs R" << endl;
                cerr << "Contig: ";
                if(contig.LeftConnectingNode().isValid())
                    cerr << contig.m_graph.GetNodeSeq(contig.LeftConnectingNode()) << " ";
                else
                    cerr << "LC notvalid ";
                if(contig.RightConnectingNode().isValid())
                    cerr << contig.m_graph.GetNodeSeq(contig.RightConnectingNode()) << " ";
                else
                    cerr << "RC notvalid ";
                cerr << endl;
                for(auto& chunk : contig.m_seq) {
                    cerr << "Chunk:" << endl;
                    for(auto& var : chunk) {
                        cerr << "Variant: ";
                        for(char c : var) cerr << c;
                        cerr << endl;
                    }
                }
                cerr << "Extension: ";
                if(ex.m_next_right.isValid())
                    cerr << contig.m_graph.GetNodeSeq(ex.m_next_right);
                else
                    cerr << "Notvalid";
                cerr << endl;
                for(auto& chunk : ex.m_seq) {
                    cerr << "Chunk:" << endl;
                    for(auto& var : chunk) {
                        cerr << "Variant: ";
                        for(char c : var) cerr << c;
                        cerr << endl;
                    }
                }
                if(ex.m_next_left.isValid())
                    cerr << "NL: " << contig.m_graph.GetNodeSeq(ex.m_next_left) << endl;
                if(ex.m_left_link) {
                    auto& contig = *ex.m_left_link;
                    cerr << "LContig: ";
                    if(contig.LeftConnectingNode().isValid())
                        cerr << contig.m_graph.GetNodeSeq(contig.LeftConnectingNode()) << " ";
                    else
                        cerr << "LC notvalid ";
                    if(contig.RightConnectingNode().isValid())
                        cerr << contig.m_graph.GetNodeSeq(contig.RightConnectingNode()) << " ";
                    else
                        cerr << "RC notvalid ";
                    cerr << endl;
                    for(auto& chunk : contig.m_seq) {
                        cerr << "Chunk:" << endl;
                        for(auto& var : chunk) {
                            cerr << "Variant: ";
                            for(char c : var) cerr << c;
                            cerr << endl;
                        }
                    }
                }
                // throw runtime_error("Corrupted connection of contigs R");
            }
        }
    }
    cerr << "Connectors: " << connectors << " Extenders: " << extenders << endl;

    for(auto& contig : scontigs)
        contig.m_is_taken = 0;

    //select starting points for chains
    for(auto& contig : scontigs) {
        if(contig.m_is_taken)
            continue;
        if(contig.m_left_link == nullptr && contig.m_right_link == nullptr) { // isolated contig - nothing to chain
            contig.m_is_taken = 1;
            continue;
        }
        //mark as taken all chain members except the starting point
        auto parent = &contig;
        bool circular = false;
        for(auto child = parent->m_right_link; child != nullptr && !circular; ) { // walk the chain to the right
            child->m_is_taken = 1;
            if(child->m_left_link == parent) {
                parent = child;
                child = child->m_right_link;
            } else {
                parent = child;
                child = child->m_left_link;
            }
            circular = (child == &contig);
        }
        if(circular)
            continue;
        parent = &contig;
        for(auto child = parent->m_left_link; child != nullptr; ) { // walk the chain to the left
            child->m_is_taken = 1;
            if(child->m_left_link == parent) {
                parent = child;
                child = child->m_right_link;
            } else {
                parent = child;
                child = child->m_left_link;
            }
        }
    }

    // assemble chains in parallel
    list> jobs;
    for(int thr = 0; thr < ncores; ++thr) {
        jobs.push_back(bind(ConnectContigsJob, ref(scontigs)));
    }
    RunThreads(ncores, jobs);

    //remove fragments
    for(auto iloop = scontigs.begin(); iloop != scontigs.end(); ) {
        auto ic = iloop++;
        if(ic->m_is_taken == 2 || (int)ic->LenMin() < kmer_len) // absorbed into a chain or too short
            scontigs.erase(ic);
        else
            ic->m_is_taken = 0;
    }
}

// Thread worker: follows link chains from each grabbed starting contig,
// absorbs chain members, stabilizes circular contigs, and clips unconfirmed flanks
static void ConnectContigsJob(TContigList& scontigs) {
    int kmer_len = scontigs.front().m_kmer_len;
    for(auto& contig : scontigs) {
        if(!contig.m_is_taken.Set(1)) // grab contig
            continue;

        int num = 0;
        bool circular = false;
        for(auto parent = &contig; parent->m_right_link != nullptr && !circular; ++num) {
            auto child = parent->m_right_link;
            if(child->m_left_link != parent) // orient child to match the chain direction
                child->ReverseComplement();
            if(child->m_right_link == &contig) { // circular
                circular = true;
                if(child->m_left_link == &contig) { // special case of a single connector needs additional check of orientation
                    if(contig.RightConnectingNode() != child->m_next_left)
                        child->ReverseComplement();
                }
            }
            contig.AddToRight(*child);
            if(num%2) // child is contig, not connector/extender
                contig.m_seq.m_right_repeat = child->m_seq.m_right_repeat;
            child->m_is_taken = 2; // will be removed
            if(circular && (int)contig.LenMax() >= 2*kmer_len-1) //stabilize circular contig
                contig.RotateCircularToMinKmer();
            parent = child;
        }
        if(circular)
            continue;

        num = 0;
        for(auto parent = &contig; parent->m_left_link != nullptr; ++num) {
            auto child = parent->m_left_link;
            if(child->m_right_link != parent) // orient child to match the chain direction
                child->ReverseComplement();
            contig.AddToLeft(*child);
            if(num%2) // child is contig, not connector/extender
                contig.m_seq.m_left_repeat = child->m_seq.m_left_repeat;
            child->m_is_taken = 2; // will be removed
            parent = child;
        }

        //clip flanks which are not 'double' checked
        auto& graph = contig.m_graph;
        for(int low_abundance_clip = 10; low_abundance_clip > 0 && contig.m_left_extend > 0; --low_abundance_clip) {
            auto kmer = contig.FrontKmer();
            if(!kmer.isValid() || graph.Abundance(kmer) > 5)
                break;
            contig.ClipLeft(1);
        }
        int left_clip = min(kmer_len,contig.m_left_extend);
        contig.ClipLeft(left_clip);
        if(contig.m_left_extend > 0)
            contig.m_seq.m_left_repeat = min(kmer_len-1, contig.m_left_extend+contig.m_seq.m_left_repeat);
        for(int low_abundance_clip = 10; low_abundance_clip > 0 && contig.m_right_extend > 0; --low_abundance_clip) {
            auto kmer = contig.BackKmer();
            if(!kmer.isValid() || graph.Abundance(kmer) > 5)
                break;
            contig.ClipRight(1);
        }
        int right_clip = min(kmer_len,contig.m_right_extend);
        contig.ClipRight(right_clip);
        if(contig.m_right_extend > 0)
            contig.m_seq.m_right_repeat = min(kmer_len-1, contig.m_right_extend+contig.m_seq.m_right_repeat);
    }
}

CContigSequence m_seq;               // sequence
Node m_next_left;                    // denied left kmer (connection possible but it is already owned)
Node m_next_right;                   // denied right kmer (connection possible but it is already owned)
SContig* m_left_link = nullptr;      // if set points to 'left' contig
int m_left_shift = 0;                // shift+1 for m_next_left in this contig (positive for the right end)
SContig* m_right_link = nullptr;     // if set points to 'right' contig
int m_right_shift = 0;               // shift+1 for m_next_right in this contig (positive for the right end)
int m_left_extend = 0;               // number of newly assembled bases which could be clipped
int m_right_extend = 0;              // number of newly assembled bases which could be clipped
DBGraph& m_graph;
int m_kmer_len;
SAtomic m_is_taken = 0;              // 0 - free, 1 - grabbed by a worker thread, 2 - absorbed/to be removed
};

// This is a very lightweight class holding a reference to de Bruijn graph and main assembling parameters
// It provides function used in assembling
// NOTE(review): template argument lists appear stripped in this copy (e.g.
// 'template class CDBGraphDigger', 'vector successors') - verify against upstream.
template class CDBGraphDigger {
public:
    typedef typename DBGraph::Node Node;
    typedef typename DBGraph::Successor Successor;

    // graph     - de Bruijn graph
    // fraction  - abundance ratio used for filtering noise forks
    // jump      - stored assembling parameter
    // low_count - minimal abundance for GoodNode()
    // allow_snps - enables SNP handling
    CDBGraphDigger(DBGraph& graph, double fraction, int jump, int low_count, bool allow_snps = false) :
        m_graph(graph), m_fraction(fraction), m_jump(jump), m_hist_min(graph.HistogramMinimum()), m_low_count(low_count), m_allow_snps(allow_snps)
    {
        m_max_branch = 200; // maximum number of paths explored before quitting
    }

private:
    typedef tuple TContigEnd;

public:
    // starting from a node, find an extension of len l with maximal abundance
    string MostLikelyExtension(Node node, unsigned len) const { //don't do FilterNeighbors because it is called in it
        string s;
        while(s.size() < len) {
            vector successors = m_graph.GetNodeSuccessors(node);
            if(successors.empty())
                return s;
            sort(successors.begin(), successors.end(), [&](const Successor& a, const Successor& b) {return m_graph.Abundance(a.m_node) > m_graph.Abundance(b.m_node);});
            node = successors[0].m_node;
            s.push_back(successors[0].m_nt);
        }
        return s;
    }

    // sequence of 'base' followed by its most abundant extension, total length len
    string MostLikelySeq(Successor base, unsigned len) const {
        string s(1, base.m_nt);
        return s+MostLikelyExtension(base.m_node, len-1);
    }

    // starting from a node, find an extension of len l without forks (simple path); returns true if hit dead end
    pair StringentExtension(Node node, unsigned len) const {
        string s;
        while(s.size() < len) {
            vector successors = m_graph.GetNodeSuccessors(node);
            FilterNeighbors(successors, false);
            if(successors.empty())
                return make_pair(s, true);
            if(successors.size() != 1)
                return make_pair(s, false);
            node = successors[0].m_node;
            s.push_back(successors[0].m_nt);
        }
        return make_pair(s, false);
    }

    // true if 'initial_suc' can be extended to total_len bases via a back-checked DFS
    bool ExtendableSuccessor(const Successor& initial_suc) const {
        int kmer_len = m_graph.KmerLen();
        int total_len = max(100, kmer_len);

        unordered_map node_len;                    // longest length at which each node was reached
        node_len.emplace(initial_suc.m_node,0);
        stack> active_nodes;                       // DFS stack of (node, length)
        active_nodes.emplace(initial_suc.m_node,0);

        while(!active_nodes.empty()) {
            Node node = active_nodes.top().first;
            int len = active_nodes.top().second;
            active_nodes.pop();

            if(len == kmer_len) { // check that the path is reachable backwards from here
                vector step_back = m_graph.GetNodeSuccessors(m_graph.ReverseComplement(node));
                FilterLowAbundanceNeighbors(step_back);
                bool found = false;
                for(auto& back : step_back) {
                    if(back.m_nt == Complement(initial_suc.m_nt)) {
                        found = true;
                        break;
                    }
                }
                if(!found)
                    continue;
            }

            if(len == total_len)
                return true;

            if(len > kmer_len) { // prune paths that don't improve the recorded length
                int& l = node_len[node];
                if(len > l)
                    l = len;
                else
                    continue;
            }

            vector successors = m_graph.GetNodeSuccessors(node);
            FilterLowAbundanceNeighbors(successors);
            if(!successors.empty()) {
                for(int i = successors.size()-1; i >= 0; --i)
                    active_nodes.emplace(successors[i].m_node, len+1);
            }
        }
        return false;
    }

    // successors of 'node' each of which also leads back to 'node' from its
    // reverse complement; returns empty vector if any successor fails the check
    vector GetReversibleNodeSuccessors(const Node& node, int* numbackp = nullptr) const {
        vector neighbors = m_graph.GetNodeSuccessors(node);
        FilterNeighbors(neighbors, true);
        if(numbackp != nullptr)
            *numbackp = 0;
        for(auto& neighbor : neighbors) {
            vector step_back = m_graph.GetNodeSuccessors(m_graph.ReverseComplement(neighbor.m_node));
            FilterNeighbors(step_back, true);
            bool found = false;
            for(auto& back : step_back) {
                if(back.m_node == m_graph.ReverseComplement(node)) {
                    found = true;
                    break;
                }
            }
            if(!found) { // not reversible - reject all
                neighbors.clear();
                return neighbors;
            }
            if(numbackp != nullptr)
                *numbackp = max(*numbackp, (int)step_back.size());
        }
        return neighbors;
    }

    bool GoodNode(const Node& node) const { return m_graph.Abundance(node) >= m_low_count; }
    int HistMin() const { return m_hist_min; }

    // removes noise forks
    void FilterLowAbundanceNeighbors(vector& successors) const {
        // low abundance forks
        if(successors.size() > 1) {
            int abundance = 0;
            for(auto& suc : successors) {
                abundance += m_graph.Abundance(suc.m_node);
            }
            sort(successors.begin(), successors.end(), [&](const Successor& a, const Successor& b) {
                auto abundancea = m_graph.Abundance(a.m_node);
                auto abundanceb = m_graph.Abundance(b.m_node);
                if(abundancea == abundanceb) // tie-break on base for deterministic order
                    return a.m_nt < b.m_nt;
                else
                    return abundancea > abundanceb;
            });
            for(int j = successors.size()-1; j > 0 && m_graph.Abundance(successors.back().m_node) <= m_fraction*abundance; --j)
                successors.pop_back();
        }

        // strand specific noise reduction for Illumina issue of GGT->GG[ACG]
        if(m_graph.GraphIsStranded() && successors.size() > 1) {
            double fraction = 0.1*m_fraction;
            int target = -1;
            for(int j = 0; target < 0 && j < (int)successors.size(); ++j) {
                if(m_graph.GetNodeSeq(successors[j].m_node).substr(m_graph.KmerLen()-3) == "GGT")
                    target = j;
            }
            if(target >= 0) {
int abundance = m_graph.Abundance(successors[target].m_node); if(abundance > 5) { double am = abundance*(1-m_graph.PlusFraction(successors[target].m_node)); for(int j = 0; j < (int)successors.size(); ) { if(m_graph.Abundance(successors[j].m_node)*(1-m_graph.PlusFraction(successors[j].m_node)) < fraction*am) successors.erase(successors.begin()+j); else ++j; } } } } } void FilterNeighbors(vector& successors, bool check_extension) const { // low abundance forks FilterLowAbundanceNeighbors(successors); //not extendable forks if(check_extension && successors.size() > 1 && m_graph.Abundance(successors.front().m_node) > 5) { for(int i = 0; i < (int)successors.size(); ) { if(ExtendableSuccessor(successors[i])) ++i; else successors.erase(successors.begin()+i); } } // strand specific noise reduction for Illumina issue of GGT->GG[ACG] for negative strand and low coverage (the prev loop didn't work) if(m_graph.GraphIsStranded() && successors.size() > 1 && (!check_extension || m_graph.Abundance(successors.front().m_node) <= 5)) { double fraction = 0.1*m_fraction; int target = -1; for(int j = 0; target < 0 && j < (int)successors.size(); ++j) { if(MostLikelySeq(successors[j], 3) == "ACC") target = j; } if(target >= 0) { int abundance = m_graph.Abundance(successors[target].m_node); if(abundance > 5) { double ap = abundance*m_graph.PlusFraction(successors[target].m_node); for(int j = 0; j < (int)successors.size(); ) { if(m_graph.Abundance(successors[j].m_node)*m_graph.PlusFraction(successors[j].m_node) < fraction*ap) successors.erase(successors.begin()+j); else ++j; } } } } // strand balance issue if(m_graph.GraphIsStranded() && successors.size() > 1) { double fraction = 0.1*m_fraction; bool has_both = false; for(int j = 0; !has_both && j < (int)successors.size(); ++j) { double plusf = m_graph.PlusFraction(successors[j].m_node); double minusf = 1.- plusf; has_both = GoodNode(successors[j].m_node) && (min(plusf,minusf) > 0.25); } if(has_both) { for(int j = 0; j < 
(int)successors.size(); ) { double plusf = m_graph.PlusFraction(successors[j].m_node); double minusf = 1.- plusf; if(min(plusf,minusf) < fraction*max(plusf,minusf)) successors.erase(successors.begin()+j); else ++j; } } } } DBGraph& Graph() { return m_graph; } enum EConnectionStatus {eSuccess, eNoConnection, eAmbiguousConnection}; struct SElement { SElement (Successor suc, SElement* link) : m_link(link), m_suc(suc) {} struct SElement* m_link; // previous element Successor m_suc; }; // connects two nodes in a finite number of steps pair, EConnectionStatus> ConnectTwoNodes(const Node& first_node, const Node& last_node, int steps) const { pair, EConnectionStatus> bases(TBases(), eNoConnection); deque storage; // will contain ALL extensions (nothing is deleted) typedef unordered_map TElementMap; //pointer to its own element OR zero if ambiguous path TElementMap current_elements; vector successors = m_graph.GetNodeSuccessors(first_node); FilterNeighbors(successors, false); for(auto& suc : successors) { storage.push_back(SElement(suc, 0)); current_elements[suc.m_node] = &storage.back(); } list connections; for(int step = 1; step < steps && !current_elements.empty(); ++step) { TElementMap new_elements; for(auto& el : current_elements) { vector successors = m_graph.GetNodeSuccessors(el.first); FilterNeighbors(successors, false); if(el.second == 0) { // ambiguous path for(auto& suc : successors) { new_elements[suc.m_node] = 0; if(suc.m_node == last_node) { bases.second = eAmbiguousConnection; return bases; } } } else { for(auto& suc : successors) { storage.push_back(SElement(suc, el.second)); if(suc.m_node == last_node) { if(!connections.empty()) { bases.second = eAmbiguousConnection; return bases; } else { connections.push_back(storage.back()); } } pair rslt = new_elements.insert(make_pair(suc.m_node, &storage.back())); if(!rslt.second || !GoodNode(suc.m_node)) rslt.first->second = 0; } } } swap(current_elements, new_elements); if(current_elements.size() > m_max_branch) 
return bases;
        }

        if(connections.empty())
            return bases;

        // walk the back-pointer chain to rebuild the path front-to-back
        SElement el = connections.front();
        while(el.m_link != 0) {
            bases.first.push_front(el.m_suc);
            el = *el.m_link;
        }
        bases.first.push_front(el.m_suc);
        bases.second = eSuccess;
        return bases;
    }

    // NOTE(review): template arguments in the three typedefs below look
    // stripped by extraction - verify against the upstream file.
    typedef list> TBasesList;
    typedef unordered_map, typename Node::Hash> TBranch;  // all 'leaves' will have the same length
    typedef unordered_map>, typename Node::Hash> TLinks;

    // Advances every leaf of 'branch' by one graph step, forking sequences at
    // multi-successor nodes and merging leaves that re-join via 'links'.
    // Clears branch and sequences entirely if a circular extension is found.
    void OneStepBranchExtend(TBranch& branch, TBasesList& sequences, TLinks& links) {
        TBranch new_branch;
        for(auto& leaf : branch) {
            vector successors = m_graph.GetNodeSuccessors(leaf.first);
            FilterNeighbors(successors, true);
            if(successors.empty()) {
                // dead end - mark all sequences ending here as dead (empty)
                for(auto is : leaf.second)
                    is->clear();
            } else {
                for(int i = successors.size()-1; i >= 0; --i) {
                    auto& lst = new_branch[successors[i].m_node];
                    for(auto is : leaf.second) {
                        if(i > 0) { // copy sequence if it is a fork
                            sequences.push_front(*is);
                            is = sequences.begin();
                            for(int p = 0; p < (int)is->size()-1; ++p)
                                links[(*is)[p].m_node].emplace_front(is, p);
                        }
                        links[leaf.first].emplace_front(is, is->size()-1);
                        is->push_back(successors[i]);
                        lst.emplace_front(is);
                    }
                }
            }
        }

        // merge leaves that landed on a node some other sequence already passed through
        for(auto it_loop = new_branch.begin(); it_loop != new_branch.end(); ) {
            auto it = it_loop++;
            auto rslt = links.find(it->first);
            if(rslt != links.end()) {
                auto& lst = rslt->second;
                set> seqs; // TODO set of intervals
                for(auto& link : lst) {
                    if(!link.first->empty()) {
                        if(link.first->back().m_node != it->first) {
                            // suffix of an older sequence past this node
                            seqs.emplace(link.first->begin()+link.second+1, link.first->end());
                        } else { // circular extension
                            branch.clear();
                            sequences.clear();
                            return;
                        }
                    }
                }
                if(!seqs.empty()) {
                    for(auto is : it->second) {
                        // fork current sequence once per alternative suffix beyond the first
                        for(auto ex = next(seqs.begin()); ex != seqs.end(); ++ex) {
                            sequences.push_front(*is);
                            sequences.front().insert(sequences.front().end(), ex->begin(), ex->end());
                            for(int p = 0; p < (int)sequences.front().size()-1; ++p)
                                links[sequences.front()[p].m_node].emplace_front(sequences.begin(), p);
                            new_branch[sequences.front().back().m_node].push_front(sequences.begin());
                        }
                        // extend the original with the first suffix
                        int l = is->size();
                        is->insert(is->end(), seqs.begin()->begin(), seqs.begin()->end());
                        for(int p = l-1; p < (int)is->size()-1; ++p)
                            links[(*is)[p].m_node].emplace_front(is, p);
                        new_branch[is->back().m_node].push_front(is);
                    }
                    new_branch.erase(it);
                }
            }
        }
        swap(branch, new_branch);
    }

    // sequences, last node, intrusion, node corresponding to intrusion shift, max_le - min_le
    // Tries to resolve a fork as a SNP/small-indel bubble: extends all branches
    // until they re-join on a single node with >= kmer_len of agreement.
    // Returns an empty result when no clean bubble is found.
    tuple DiscoverOneSNP(const vector& successors, const TVariation& last_chunk, int max_extent) {
        tuple rslt;
        TBranch extensions;
        TBasesList sequences; // assembled seqs
        TLinks links;
        int kmer_len = m_graph.KmerLen();

        if(max_extent == 0 || successors.empty())
            return rslt;

        // one single-base sequence per fork branch
        for(auto& suc : successors) {
            sequences.emplace_front(1,suc);
            extensions[suc.m_node].emplace_front(sequences.begin());
        }

        int max_len = 1;
        int min_len = 1;
        size_t seq_num = sequences.size();
        while(seq_num < m_max_branch && max_len < max_extent) {
            OneStepBranchExtend(extensions, sequences, links);
            max_len = 0;
            min_len = numeric_limits::max();
            seq_num = 0;
            for(auto& seq : sequences) {
                if(!seq.empty()) {
                    max_len = max(max_len, (int)seq.size());
                    min_len = min(min_len, (int)seq.size());
                    ++seq_num;
                }
            }
            if(extensions.empty()) // can't extend
                return rslt;
            // converged to one leaf with enough matched sequence - bubble closed
            if(extensions.size() == 1 && min_len >= kmer_len)
                break;
        }

        if(extensions.size() == 1 && min_len >= kmer_len && max_len <= max_extent) {
            // drop dead sequences; collect the distinct first bases
            set first_bases;
            for(auto it = sequences.begin(); it != sequences.end(); ) {
                if(it->empty()) {
                    it = sequences.erase(it);
                } else {
                    first_bases.insert(it->front().m_nt);
                    ++it;
                }
            }
            if(first_bases.size() > 1) { // found snp
                // clip extra matches from the end
                int matches = 0;
                bool all_same = true;
                while(all_same) {
                    for(auto& seq : sequences) {
                        if(matches == (int)seq.size() || (seq.end()-matches-1)->m_nt != (sequences.front().end()-matches-1)->m_nt) {
                            all_same = false;
                            break;
                        }
                    }
                    if(all_same)
                        ++matches;
                }
                if(matches > kmer_len) {
                    int extra = min(matches-kmer_len, max_len);
                    max_len -= extra;
                    min_len -= extra;
                    for(auto& seq : sequences)
                        seq.erase(seq.end()-extra, seq.end());
                }

                // check all nodes
                for(auto& seq : sequences) {
                    for(auto& base : seq) {
                        if(!GoodNode(base.m_node))
                            return rslt;
                    }
                }

                // a variant of exactly kmer_len means one allele is empty (pure indel)
                bool has_empty_variant = false;
                for(auto& seq : sequences) {
                    if((int)seq.size() == kmer_len) {
                        has_empty_variant = true;
                        break;
                    }
                }

                // copy seqs to result
                TLocalVariants& seqs = get<0>(rslt);
                for(auto& seq : sequences) {
                    seqs.push_front(TVariation());
                    for(auto& base : seq)
                        seqs.front().push_back(base.m_nt);
                }
                // last node
                get<1>(rslt) = sequences.front().back().m_node;
                // diff
                get<4>(rslt) = max_len-min_len;

                // check for repeat and report if found
                if(has_empty_variant) {
                    for(auto& seq : seqs) // add kmer_len bases
                        seq.insert(seq.begin(), last_chunk.end()-kmer_len, last_chunk.end());
                    bool all_same = true;
                    int shift = 0;
                    // how far does the repeat extend back into the previous chunk?
                    while(all_same) {
                        for(auto& seq : seqs) {
                            if(shift == (int)seq.size()-kmer_len || *(seq.end()-shift-1-kmer_len) != *(seqs.front().end()-shift-1-kmer_len)) {
                                all_same = false;
                                break;
                            }
                        }
                        if(all_same)
                            ++shift;
                    }
                    if(shift >= kmer_len) {
                        // repeat swallows the whole kmer - give up
                        return tuple();
                    } else {
                        get<2>(rslt) = shift;
                        get<3>(rslt) = (sequences.front().end()-1-shift)->m_node;
                        max_len += shift;
                        min_len += shift;
                        for(auto& seq : seqs) // erase added extra sequence (shift bases remain)
                            seq.erase(seq.begin(), seq.begin()+kmer_len-shift);
                    }
                }
            }
        }

        return rslt;
    }

    //successors by value intentionally
    // Chains consecutive SNP bubbles that sit closer than kmer_len to each
    // other into one combined variant cluster; stops when the extension after
    // a bubble runs clean for a full kmer.
    tuple DiscoverSNPCluster(vector successors, const TVariation& last_chunk, int max_extent) {
        tuple rslt;
        int kmer_len = m_graph.KmerLen();
        const TVariation* last_chunkp = &last_chunk;
        int dist_to_snp = 0;

        while(dist_to_snp < 2*kmer_len && !successors.empty()) {
            tuple snp_data = DiscoverOneSNP(successors, *last_chunkp, max_extent);
            if(get<0>(snp_data).empty())
                break;
            TLocalVariants& seqs = get<0>(snp_data);
            Node node = get<1>(snp_data);
            int shift = get<2>(snp_data);
            int diff_len = get<4>(snp_data);

            /* debug output (kept from original)
            cerr << "Last chunk: "; for(char c : *last_chunkp) cerr << c; cerr << endl;
            cerr << "SNP: " << shift << " " << diff_len << " " << m_graph.GetNodeSeq(node) << endl;
            for(auto& seq: seqs) {
                cerr << "Chunk: "; for(char c : seq) cerr << c; cerr << endl;
                auto rseq = seq;
                ReverseComplementSeq(rseq.begin(), rseq.end());
                cerr << "RChunk: "; for(char c : rseq) cerr << c; cerr << endl;
            }
            */

            if(dist_to_snp == 0) { // first snp in cluster
                get<0>(rslt) = seqs;
                get<1>(rslt) = node;
                get<2>(rslt) = shift;
            } else {
                int& existing_shift = get<2>(rslt);
                if(dist_to_snp >= kmer_len+shift &&                          // take into account repeat (if any)
                   dist_to_snp+existing_shift >= kmer_len+diff_len-shift)    // long indels need additional steps before they are connected
                    break;  // no interference

                // combine snps: cross-product of existing variants with the new bubble's variants
                int len = 0;
                for(auto& seq: get<0>(rslt)) {
                    seq.erase(seq.end()-shift, seq.end());                                       // erase seq for new shift
                    if(existing_shift > 0)
                        seq.erase(seq.begin(), seq.begin()+existing_shift);                      // remove existing shift (if any)
                    for(auto it = next(seqs.begin()); it != seqs.end(); ++it) {
                        get<0>(rslt).push_front(seq);
                        get<0>(rslt).front().insert(get<0>(rslt).front().end(), it->begin(), it->end()-shift);
                    }
                    seq.insert(seq.end(), seqs.front().begin(), seqs.front().end()-shift);
                    len = max(len, (int)seq.size());
                }
                if(len > max_extent)  // longer than threshold
                    return tuple();
                existing_shift = 0;
                if(shift > 0)
                    node = get<3>(snp_data);
                get<1>(rslt) = node;
            }

            // extend past the bubble; another fork within kmer_len restarts the loop
            dist_to_snp = kmer_len;
            // cerr << "Ext: ";
            bool fork = false;
            while(dist_to_snp < 2*kmer_len) {
                successors = m_graph.GetNodeSuccessors(node);
                FilterNeighbors(successors, true);
                fork = successors.size() > 1;
                if(fork || successors.empty() || !GoodNode(successors.front().m_node))
                    break;
                ++dist_to_snp;
                node = successors.front().m_node;
                for(auto& seq: get<0>(rslt))
                    seq.push_back(successors.front().m_nt);
                // cerr << successors.front().m_nt;
            }
            // cerr << endl;
            if(!fork)
                break;
            last_chunkp = &get<0>(rslt).front();
        }

        if(dist_to_snp > kmer_len) { // remove extra extension
            for(auto& seq: get<0>(rslt))
                seq.erase(seq.end()-(dist_to_snp-kmer_len), seq.end());
        }

        return rslt;
    }

    // starting from
// initial_node, assembles the right extension.
    // Walks unambiguous single-successor edges (claiming each node via
    // SetVisited so threads never share nodes); at forks, optionally resolves
    // SNP bubbles (validated in both directions).  Returns the assembled
    // extension, the node where a visited region was hit (or a default Node),
    // and how far a repeat intruded into the caller's starting sequence.
    // NOTE(review): return type reads "tuple, Node, int>" - template argument
    // (presumably SContig<DBGraph>) stripped by extraction; verify upstream.
    tuple, Node, int> ExtendToRight(const Node& initial_node, int allowed_intrusion) { // initial_node may be not owned
        Node node = initial_node;
        SContig extension(m_graph);
        int max_extent = m_jump;
        int kmer_len = m_graph.KmerLen();
        int initial_node_intrusion = 0;

        while(true) {
            vector successors = m_graph.GetNodeSuccessors(node);
            FilterNeighbors(successors, true);
            if(successors.empty()) {               // no extensions
                break;
            } else if(successors.size() == 1) {    // simple extension
                Node new_node = successors.front().m_node;
                if(!GoodNode(new_node))
                    break;
                // require a unique way back - otherwise we are entering a repeat
                vector predecessors = m_graph.GetNodeSuccessors(DBGraph::ReverseComplement(new_node));
                FilterNeighbors(predecessors, true);
                if(predecessors.size() != 1)       // no extensions or end of unique seq before repeat
                    break;
                if(DBGraph::ReverseComplement(predecessors[0].m_node) != node) // no return
                    break;
                node = new_node;
                if(m_graph.SetVisited(node)) {     // node is available
                    if(extension.m_seq.empty()) {
                        extension.m_seq.InsertNewChunk();
                        extension.m_seq.InsertNewVariant();
                    }
                    extension.m_seq.ExtendTopVariant(successors.front().m_nt);
                } else {
                    // another thread owns this node - hand back what we have
                    return make_tuple(extension, node, initial_node_intrusion);
                }
            } else if(!m_allow_snps) {             // snps not allowed
                break;
            } else {                               // try snps
                // build the reference chunk (>= kmer_len bases) preceding the fork
                int last_chunk_len = extension.m_seq.empty() ? 0 : extension.m_seq.back().front().size();
                TVariation* last_chunkp = nullptr;
                TVariation last_chunk;
                if(last_chunk_len >= kmer_len) {
                    last_chunkp = &extension.m_seq.back().front();
                } else {
                    string initial_node_seq = m_graph.GetNodeSeq(initial_node);
                    last_chunk.insert(last_chunk.end(), initial_node_seq.begin(), initial_node_seq.end());
                    if(last_chunk_len > 0)
                        last_chunk.insert(last_chunk.end(), extension.m_seq.back().front().begin(), extension.m_seq.back().front().end());
                    last_chunkp = &last_chunk;
                }

                // cerr << "Direct" << endl;
                auto forward = DiscoverSNPCluster(successors, *last_chunkp, max_extent);
                TLocalVariants& step = get<0>(forward);
                int shift = get<2>(forward);
                if(step.empty())   // no snp
                    break;
                int step_size = 0; // not all step seqs have same length
                for(auto& var : step)
                    step_size = max(step_size, (int)var.size());

                // check return: rediscover the same bubble walking backwards
                vector predecessors = m_graph.GetNodeSuccessors(DBGraph::ReverseComplement(get<1>(forward)));
                // cerr << "Lastkmer: " << get<1>(forward).isValid() << " " << m_graph.GetNodeSeq(DBGraph::ReverseComplement(get<1>(forward))) << endl;
                FilterNeighbors(predecessors, true);
                if(predecessors.empty())
                    break;
                TVariation back_chunk;
                back_chunk.insert(back_chunk.end(), step.front().end()-kmer_len, step.front().end());
                ReverseComplementSeq(back_chunk.begin(), back_chunk.end());
                // cerr << "Backward" << endl;
                auto backward = DiscoverSNPCluster(predecessors, back_chunk, max_extent);
                TLocalVariants& step_back = get<0>(backward);
                if(step_back.empty())   // no snp
                    break;
                int step_back_size = 0; // not all step seqs have same length
                for(auto& var : step_back)
                    step_back_size = max(step_back_size, (int)var.size());
                if(step_size != step_back_size)
                    break;
                for(auto& seq : step_back)
                    ReverseComplementSeq(seq.begin(), seq.end());
                if(!equal(last_chunkp->end()-kmer_len, last_chunkp->end()-shift, step_back.front().begin()+shift))
                    break;
                // rearrange backward variants to the same coordinates as forward ones
                for(auto& seq : step_back) {
                    seq.erase(seq.begin(), seq.begin()+kmer_len);
                    seq.insert(seq.end(), step.front().end()-kmer_len, step.front().end());
                }
                step.sort();
                step_back.sort();
                // both directions must agree exactly on the variant set
                if(step != step_back)
                    break;

                // snp is accepted
                node = get<1>(forward);
                if(shift > 0) {
                    if(shift >= last_chunk_len) { // extension has no snp and short - pass intrusion (or part of it) to the caller
                        initial_node_intrusion = shift-last_chunk_len;
                        if(initial_node_intrusion > allowed_intrusion) {
                            initial_node_intrusion = 0;
                            break;
                        }
                        if(last_chunk_len > 0)
                            extension.m_seq.pop_back();
                    } else { // shorten previous chunk
                        extension.m_seq.back().front().erase(extension.m_seq.back().front().end()-shift, extension.m_seq.back().front().end());
                    }
                }
                extension.m_seq.InsertNewChunk();          // empty chunk for variable part
                for(auto& seq : step) {
                    extension.m_seq.InsertNewVariant();    // empty seq for new variant
                    extension.m_seq.ExtendTopVariant(seq.begin(), seq.end()-kmer_len);
                }
                extension.m_seq.InsertNewChunk();          // empty chunk for matching kmer-1 or kmer bases
                extension.m_seq.InsertNewVariant(step.front().end()-kmer_len, step.front().end()-1);

                // enumerate every kmer of every variant so we can claim them all
                CReadHolder rh(false);
                for(auto& seq : step) {
                    if(shift == 0)                     // last chunk not clipped, nothing added to snp
                        seq.insert(seq.begin(), last_chunkp->end()-(kmer_len-1), last_chunkp->end());
                    else if(shift > last_chunk_len)    // last chunk not clipped, shift bases added to snp
                        seq.insert(seq.begin(), last_chunkp->end()-(kmer_len-1), last_chunkp->end()-shift);
                    else                               // last chunk clipped, shift bases added to snp
                        seq.insert(seq.begin(), last_chunkp->end()-(kmer_len-1-shift), last_chunkp->end());
                    rh.PushBack(seq);
                }
                bool my_snp = true;
                list snp_nodes;
                for(CReadHolder::kmer_iterator ik = rh.kbegin(kmer_len) ; ik != rh.kend() && my_snp; ++ik) {
                    Node n = m_graph.GetNode(*ik);
                    if(n.isValid()) {
                        if(m_graph.IsVisited(n))
                            my_snp = false;
                        else
                            snp_nodes.push_back(n);
                    }
                }
                if(my_snp && m_graph.SetVisited(node)) {
                    // snp belongs to this thread - extend one more base and set all visited
                    extension.m_seq.ExtendTopVariant(step.front().back());
                    for(auto& n : snp_nodes)
                        m_graph.SetVisited(n);
                    continue;
                } else {
                    return make_tuple(extension, node, initial_node_intrusion);
                }
            }
        }
        // normal termination - no connecting node
        return make_tuple(extension, Node(), initial_node_intrusion);
    }

    // assembles a contig starting from initial_node
    // min_len - minimal length for accepted contigs
    // changes the state of all used nodes to 'visited' or 'temporary holding'
    SContig GetContigForKmer(const Node& initial_node, int min_len) {
        if(m_graph.Abundance(initial_node) < m_hist_min || !GoodNode(initial_node) || !m_graph.SetVisited(initial_node))
            return SContig(m_graph);

        //node is good and this thread owns it
        // don't allow intrusion of snps in the initial kmer
        tuple, Node, int> to_right = ExtendToRight(initial_node, 0);
        tuple, Node, int> to_left = ExtendToRight(DBGraph::ReverseComplement(initial_node), 0);

        SContig scontig(get<0>(to_left), get<0>(to_right), initial_node, DBGraph::ReverseComplement(get<1>(to_left)), get<1>(to_right), m_graph);

        // short dead-end contig: downgrade its nodes to 'temporary holding'
        // instead of keeping them visited, and discard the contig
        if(!scontig.m_next_left.isValid() && !scontig.m_next_right.isValid() && (int)scontig.LenMin() < min_len) {
            int kmer_len = m_graph.KmerLen();
            // even indices are unique chunks, odd indices are variant chunks
            for(int i = scontig.m_seq.size()-1; i >= 0; i -= 2) {
                if(i == (int)scontig.m_seq.size()-1) { // last chunk size >= kmer_len
                    CReadHolder rh(false);
                    rh.PushBack(scontig.m_seq.back().front());
                    for(CReadHolder::kmer_iterator ik = rh.kbegin(kmer_len) ; ik != rh.kend(); ++ik)
                        m_graph.SetTempHolding(m_graph.GetNode(*ik));
                } else {
                    if((int)scontig.m_seq.ChunkLenMax(i) >= kmer_len) {
                        TVariation seq(scontig.m_seq[i].front().begin(), scontig.m_seq[i].front().end());
                        CReadHolder rh(false);
                        rh.PushBack(seq);
                        for(CReadHolder::kmer_iterator ik = rh.kbegin(kmer_len) ; ik != rh.kend(); ++ik)
                            m_graph.SetTempHolding(m_graph.GetNode(*ik));
                    }
                    for(auto& variant : scontig.m_seq[i+1]) {
                        // variant flanked by kmer_len-1 bases of the surrounding unique chunks
                        TVariation seq(scontig.m_seq[i].front().end()-kmer_len+1, scontig.m_seq[i].front().end()); // all uniq chunks >= kmer_len-1
                        seq.insert(seq.end(), variant.begin(), variant.end());
                        seq.insert(seq.end(), scontig.m_seq[i+2].front().begin(), scontig.m_seq[i+2].front().begin()+kmer_len-1);
                        CReadHolder rh(false);
rh.PushBack(seq);
                        for(CReadHolder::kmer_iterator ik = rh.kbegin(kmer_len) ; ik != rh.kend(); ++ik)
                            m_graph.SetTempHolding(m_graph.GetNode(*ik));
                    }
                }
            }
            return SContig(m_graph);
        } else {
            return scontig;
        }
    }

    // For each contig, walks the marked left/right repeat regions and trims
    // m_left_repeat/m_right_repeat down to the longest prefix/suffix whose
    // kmers are all good and whose graph steps stay inside the contig.
    // NOTE(review): several container declarations below ("vector>", "stack>",
    // "deque", "unordered_map::iterator...") have extraction-stripped template
    // arguments; verify against the upstream file.
    void CheckRepeats(TContigList& scontigs) {
        int kmer_len = m_graph.KmerLen();

        for(auto it = scontigs.begin(); it != scontigs.end(); ++it) {
            auto& contig = it->m_seq;

            // ---- left repeat ----
            if(contig.m_left_repeat >= kmer_len && contig.m_left_repeat < (int)contig.LenMin()) {
                // find the chunk that contains position m_left_repeat
                int last_chunk = 0;
                for(int len = contig.ChunkLenMin(last_chunk); len < contig.m_left_repeat+1; len += contig.ChunkLenMin(++last_chunk));

                // kmers[pos] collects the distinct graph nodes seen at each
                // offset over every combination of variant chunks
                vector> kmers(contig.m_left_repeat+1-kmer_len+1);

                stack> active_chunks;
                active_chunks.emplace(&contig[0].front(), 0);
                deque current_seqs;
                while(!active_chunks.empty()) {
                    TVariation* seqp = active_chunks.top().first;
                    int chunk_num = active_chunks.top().second;
                    active_chunks.pop();
                    current_seqs.resize(chunk_num);
                    current_seqs.push_back(seqp);
                    for(int chunk = chunk_num+1; chunk <= last_chunk; ++chunk) {
                        auto it = contig[chunk].begin();
                        current_seqs.push_back(&(*it));
                        for(++it; it != contig[chunk].end(); ++it)
                            active_chunks.emplace(&(*it), chunk);
                    }
                    // concatenate selected variants and clip to the repeat length
                    TVariation seq;
                    for(unsigned i = 0; i < current_seqs.size()-1; ++i)
                        seq.insert(seq.end(), current_seqs[i]->begin(), current_seqs[i]->end());
                    seq.insert(seq.end(), current_seqs.back()->begin(), current_seqs.back()->begin()+contig.m_left_repeat+1-seq.size());
                    CReadHolder rh(false);
                    rh.PushBack(seq);
                    int pos = kmers.size()-1;
                    for(CReadHolder::kmer_iterator ik = rh.kbegin(kmer_len) ; ik != rh.kend(); ++ik, --pos) {
                        Node node = m_graph.GetNode(*ik);
                        if(find(kmers[pos].begin(), kmers[pos].end(), node) == kmers[pos].end())
                            kmers[pos].push_front(node);
                    }
                }

                // shrink the repeat while all kmers are good and every step stays in the contig
                for( ; contig.m_left_repeat >= kmer_len; --contig.m_left_repeat) {
                    int p = contig.m_left_repeat-kmer_len;

                    bool bad_node = false;
                    for(auto& kmer : kmers[p]) {
                        if(!kmer.isValid() || !GoodNode(kmer)) {
                            bad_node = true;
                            break;
                        }
                    }
                    if(bad_node)
                        break;

                    bool no_step = false;
                    for(auto& kmer : kmers[p]) {
                        if(kmer.isValid()) {
                            vector successors = m_graph.GetNodeSuccessors(kmer);
                            FilterNeighbors(successors, true);
                            if(successors.empty()) {
                                no_step = true;
                                break;
                            }
                            auto& next_lst = kmers[p+1];
                            for(auto& suc : successors) {
                                if(find_if(next_lst.begin(), next_lst.end(), [suc](const Node& node) {return node == suc.m_node; }) == next_lst.end()) {
                                    no_step = true;
                                    break;
                                }
                            }
                        }
                    }
                    if(no_step)
                        break;
                }
            }

            // ---- right repeat (mirror image of the left-repeat logic) ----
            if(contig.m_right_repeat >= kmer_len && contig.m_right_repeat < (int)contig.LenMin()) {
                int first_chunk = contig.size()-1;
                for(int len = contig.ChunkLenMin(first_chunk); len < contig.m_right_repeat+1; len += contig.ChunkLenMin(--first_chunk));

                vector> kmers(contig.m_right_repeat+1-kmer_len+1);

                stack> active_chunks;
                active_chunks.emplace(&contig[contig.size()-1].front(), contig.size()-1);
                deque current_seqs;
                while(!active_chunks.empty()) {
                    TVariation* seqp = active_chunks.top().first;
                    int chunk_num = active_chunks.top().second;
                    active_chunks.pop();
                    if(!current_seqs.empty())
                        current_seqs.erase(current_seqs.begin(), current_seqs.begin()+chunk_num-first_chunk+1);
                    current_seqs.push_front(seqp);
                    for(int chunk = chunk_num-1; chunk >= first_chunk; --chunk) {
                        auto it = contig[chunk].begin();
                        current_seqs.push_front(&(*it));
                        for(++it; it != contig[chunk].end(); ++it)
                            active_chunks.emplace(&(*it), chunk);
                    }
                    TVariation seq;
                    for(unsigned i = current_seqs.size()-1; i > 0; --i)
                        seq.insert(seq.begin(), current_seqs[i]->begin(), current_seqs[i]->end());
                    seq.insert(seq.begin(), current_seqs.front()->end()-(contig.m_right_repeat+1-seq.size()), current_seqs.front()->end());
                    CReadHolder rh(false);
                    rh.PushBack(seq);
                    int pos = kmers.size()-1;
                    for(CReadHolder::kmer_iterator ik = rh.kbegin(kmer_len) ; ik != rh.kend(); ++ik, --pos) {
                        Node node = m_graph.GetNode(*ik);
                        if(find(kmers[pos].begin(), kmers[pos].end(), node) == kmers[pos].end())
                            kmers[pos].push_front(node);
                    }
                }

                for( ; contig.m_right_repeat >= kmer_len; --contig.m_right_repeat) {
                    int p = kmers.size()-(contig.m_right_repeat-kmer_len+1);

                    bool bad_node = false;
                    for(auto& kmer : kmers[p]) {
                        if(!kmer.isValid() || !GoodNode(kmer)) {
                            bad_node = true;
                            break;
                        }
                    }
                    if(bad_node)
                        break;

                    bool no_step = false;
                    for(auto& kmer : kmers[p]) {
                        if(kmer.isValid()) {
                            // step backwards via the reverse complement
                            vector successors = m_graph.GetNodeSuccessors(DBGraph::ReverseComplement(kmer));
                            FilterNeighbors(successors, true);
                            if(successors.empty()) {
                                no_step = true;
                                break;
                            }
                            auto& prev_lst = kmers[p-1];
                            for(auto& suc : successors) {
                                if(find_if(prev_lst.begin(), prev_lst.end(), [suc](const Node& node) {return node == DBGraph::ReverseComplement(suc.m_node); }) == prev_lst.end()) {
                                    no_step = true;
                                    break;
                                }
                            }
                        }
                    }
                    if(no_step)
                        break;
                }
            }
        }
    }

    // Finds contigs whose ends overlap by more than one kmer, keeps only
    // unambiguous (single-candidate, graph-confirmed) overlaps, and merges
    // the contigs in place; merged-away contigs are erased from scontigs.
    void ConnectOverlappingContigs(TContigList& scontigs) {
        int kmer_len = m_graph.KmerLen();
        // interior kmers of terminal chunks -> (contig iterator, position)
        unordered_map::iterator, int>>, typename Node::Hash> kmers;

        // index kmers of first/last chunks (excluding the very end kmers)
        for(auto it = scontigs.begin(); it != scontigs.end(); ++it) {
            SContig& contig = *it;
            if((int)contig.m_seq.ChunkLenMax(0) > kmer_len) {
                CReadHolder rh(false);
                rh.PushBack(contig.m_seq[0].front());
                int pos = contig.m_seq.ChunkLenMax(0)-kmer_len;
                for(CReadHolder::kmer_iterator ik = rh.kbegin(kmer_len) ; ik != rh.kend(); ++ik, --pos) {
                    if(pos < (int)contig.m_seq.ChunkLenMax(0)-kmer_len) {
                        Node node = m_graph.GetNode(*ik);
                        if(node.isValid())
                            kmers[node].emplace_front(it, pos);
                    }
                }
            }
            if(contig.m_seq.size() > 1 && (int)contig.m_seq.ChunkLenMax(contig.m_seq.size()-1) > kmer_len) {
                CReadHolder rh(false);
                rh.PushBack(contig.m_seq[contig.m_seq.size()-1].front());
                int pos = contig.m_seq.LenMax()-kmer_len;
                for(CReadHolder::kmer_iterator ik = rh.kbegin(kmer_len) ; ik != rh.kend(); ++ik, --pos) {
                    if(pos > (int)contig.m_seq.LenMax()-(int)contig.m_seq.ChunkLenMax(contig.m_seq.size()-1)) {
                        Node node = m_graph.GetNode(*ik);
                        if(node.isValid())
                            kmers[node].emplace_front(it, pos);
                    }
                }
            }
        }

        list::iterator, typename TContigList::iterator, int, int, int>> overlaps; // first contig, second contig, start/end, start/end, len

        for(auto it = scontigs.begin(); it != scontigs.end(); ++it) {
            SContig& icontig = *it;

            // right overlap
            {
                list::iterator, typename TContigList::iterator, int, int, int>> contig_overlaps;

                auto& irchunk = icontig.m_seq.back().front();
                auto rslt = kmers.find(icontig.BackKmer()); // rightend to left end
                if(rslt != kmers.end()) {
                    for(auto& hit : rslt->second) {
                        auto jt = hit.first;
                        if(jt == it)
                            continue;
                        int overlap_len = hit.second+kmer_len;
                        auto& jlchunk = jt->m_seq.front().front();
                        if(overlap_len > (int)irchunk.size() || overlap_len > (int)jlchunk.size())
                            continue;
                        if(!equal(jlchunk.begin(), jlchunk.begin()+hit.second, irchunk.end()-overlap_len))
                            continue;
                        contig_overlaps.emplace_back(it, jt, 1, -1, overlap_len);
                    }
                }
                rslt = kmers.find(DBGraph::ReverseComplement(icontig.BackKmer())); // right end to right end
                if(rslt != kmers.end()) {
                    for(auto& hit : rslt->second) {
                        auto jt = hit.first;
                        if(jt == it)
                            continue;
                        int overlap_len = jt->m_seq.LenMax()-hit.second;
                        auto& jrchunk = jt->m_seq.back().front();
                        if(overlap_len > (int)irchunk.size() || overlap_len > (int)jrchunk.size())
                            continue;
                        TVariation seq(irchunk.end()-overlap_len, irchunk.end()-kmer_len);
                        ReverseComplementSeq(seq.begin(), seq.end());
                        if(!equal(seq.begin(), seq.end(), jrchunk.end()-overlap_len+kmer_len))
                            continue;
                        contig_overlaps.emplace_back(it, jt, 1, 1, overlap_len);
                    }
                }
                // keep only unambiguous (exactly one candidate) overlaps
                if(contig_overlaps.size() == 1)
                    overlaps.splice(overlaps.end(), contig_overlaps);
            }

            //left overlap
            {
                list::iterator, typename TContigList::iterator, int, int, int>> contig_overlaps;

                auto& ilchunk = it->m_seq.front().front();
                auto rslt = kmers.find(icontig.FrontKmer()); // left end to right end
                if(rslt != kmers.end()) {
                    for(auto& hit : rslt->second) {
                        auto jt = hit.first;
                        if(jt == it)
                            continue;
                        int overlap_len = jt->m_seq.LenMax()-hit.second;
                        auto& jrchunk = jt->m_seq.back().front();
                        if(overlap_len > (int)ilchunk.size() || overlap_len > (int)jrchunk.size())
                            continue;
                        if(!equal(jrchunk.end()-overlap_len+kmer_len, jrchunk.end(), ilchunk.begin()+kmer_len))
                            continue;
                        contig_overlaps.emplace_back(it, jt, -1, 1, overlap_len);
                    }
                }
                rslt = kmers.find(DBGraph::ReverseComplement(icontig.FrontKmer())); // left end to left end
                if(rslt != kmers.end()) {
                    for(auto& hit : rslt->second) {
                        auto jt = hit.first;
                        if(jt == it)
                            continue;
                        int overlap_len = hit.second+kmer_len;
                        auto& jlchunk = jt->m_seq.front().front();
                        if(overlap_len > (int)ilchunk.size() || overlap_len > (int)jlchunk.size())
                            continue;
                        TVariation seq(ilchunk.begin()+kmer_len, ilchunk.begin()+overlap_len);
                        ReverseComplementSeq(seq.begin(), seq.end());
                        if(!equal(jlchunk.begin(), jlchunk.begin()+hit.second, seq.begin()))
                            continue;
                        contig_overlaps.emplace_back(it, jt, -1, -1, overlap_len);
                    }
                }
                if(contig_overlaps.size() == 1)
                    overlaps.splice(overlaps.end(), contig_overlaps);
            }
        }

        // keep only overlaps confirmed from both participating contigs
        // (each surviving pair is kept once)
        for(auto it = overlaps.begin(); it != overlaps.end(); ) {
            auto overlap = *it;
            swap(get<0>(overlap), get<1>(overlap));
            swap(get<2>(overlap), get<3>(overlap));
            auto jt = find(it, overlaps.end(), overlap);
            if(jt == overlaps.end()) {
                auto tmp = it++;
                overlaps.erase(tmp);
            } else {
                overlaps.erase(jt);
                ++it;
            }
        }

        // verify each overlap against the graph: the single continuation base
        // from one contig's end must match the next base of the other contig
        for(auto it_loop = overlaps.begin(); it_loop != overlaps.end(); ) {
            auto it = it_loop++;
            auto& overlap = *it;
            int overlap_len = get<4>(overlap);
            auto icontigp = get<0>(overlap);
            auto jcontigp = get<1>(overlap);
            int diri = get<2>(overlap);
            int dirj = get<3>(overlap);

            // unique, reversible next base after icontig's end, or 'N'
            auto NextIBase = [&]() {
                Node node = diri > 0 ? icontigp->BackKmer(): DBGraph::ReverseComplement(icontigp->FrontKmer());
                auto forward = m_graph.GetNodeSuccessors(node);
                FilterNeighbors(forward, true);
                if(forward.size() == 1) {
                    auto backward = m_graph.GetNodeSuccessors(DBGraph::ReverseComplement(forward.front().m_node));
                    FilterNeighbors(backward, true);
                    if(backward.size() == 1 && DBGraph::ReverseComplement(backward.front().m_node) == node)
                        return forward.front().m_nt;
                }
                return 'N';
            };
            // base of jcontig immediately past the overlap
            auto NextJBase = [&]() {
                return dirj < 0 ? *(jcontigp->m_seq.front().front().begin()+overlap_len) : Complement(*(jcontigp->m_seq.back().front().end()-overlap_len-1));
            };

            bool connected;
            if(diri > 0)
                connected = (icontigp->m_seq.m_right_repeat < kmer_len);
            else
                connected = (icontigp->m_seq.m_left_repeat < kmer_len);
            if(dirj > 0)
                connected = connected && (jcontigp->m_seq.m_right_repeat < kmer_len);
            else
                connected = connected && (jcontigp->m_seq.m_left_repeat < kmer_len);
            connected = connected && (NextIBase() == NextJBase());
            if(connected) {
                // also check from the other contig's point of view
                swap(icontigp, jcontigp);
                swap(diri, dirj);
                connected = (NextIBase() == NextJBase());
            }
            if(!connected)
                overlaps.erase(it);
        }

        cerr << "Overlap connections: " << overlaps.size() << " " << kmer_len << endl;

        // perform the merges; retarget remaining overlaps from the absorbed contig
        while(!overlaps.empty()) {
            auto& overlap = overlaps.front();
            auto icontigp = get<0>(overlap);
            auto jcontigp = get<1>(overlap);
            int diri = get<2>(overlap);
            int dirj = get<3>(overlap);
            int overlap_len = get<4>(overlap);
            if(diri > 0) {
                if(dirj > 0)
                    jcontigp->ReverseComplement();
                jcontigp->ClipLeft(overlap_len-kmer_len+1); // AddToRight assumes kmer-1 overlap
                icontigp->AddToRight(*jcontigp);
            } else {
                if(dirj < 0)
                    jcontigp->ReverseComplement();
                jcontigp->ClipRight(overlap_len-kmer_len+1); // AddToLeft assumes kmer-1 overlap
                icontigp->AddToLeft(*jcontigp);
            }
            overlaps.pop_front();
            for(auto& overlap : overlaps) {
                if(get<0>(overlap) == jcontigp) {
                    get<0>(overlap) = icontigp;
                    get<2>(overlap) = diri;
                } else if(get<1>(overlap) == jcontigp) {
                    get<1>(overlap) = icontigp;
                    get<3>(overlap) = diri;
                }
            }
            scontigs.erase(jcontigp);
        }
    }

    // Starting from available graph nodes, generates all contigs >= min_len_for_new_seeds. Uses ncores threads.
// NOTE(review): this file was recovered from a damaged extraction - template
// arguments were stripped (e.g. "vector>", "list>", "deque nodes") and newlines
// were collapsed.  Code tokens are reproduced exactly as found; only line
// structure (which un-swallows code trapped behind // comments) and
// documentation have been restored.  The stripped generics must be recovered
// from the upstream SKESA sources before this compiles - TODO confirm.

    // Starting from available graph nodes, generates all contigs >= min_len_for_new_seeds. Uses ncores threads.
    // min_len_for_new_seeds  - minimal acceptable contig length
    // ncores                 - number of worker threads
    // test_graphdiggerp      - optional secondary digger; seeds whose average abundance in its
    //                          graph is below knum*m_hist_min are rejected (nullptr disables the check)
    TContigList GenerateNewSeeds(int min_len_for_new_seeds, int ncores, CDBGraphDigger* test_graphdiggerp) {
        //assemble new seeds
        vector> new_seeds_for_threads(ncores);
        list> jobs;
        for(auto& ns : new_seeds_for_threads) {
            jobs.push_back(bind(&CDBGraphDigger::NewSeedsJob, this, ref(ns), min_len_for_new_seeds));
        }
        RunThreads(ncores, jobs);

        //connect fragments
        Graph().ClearHoldings();
        TContigList new_seeds = SContig::ConnectFragments(new_seeds_for_threads, Graph());

        int kmer_len = Graph().KmerLen();
        CReadHolder removed_seq(false);
        // filter out too-short / low-abundance seeds and clip ends of linear seeds;
        // every removed or clipped sequence is collected in removed_seq so its kmers
        // can be released (ClearVisited) for future use
        for(auto iloop = new_seeds.begin(); iloop != new_seeds.end(); ) {
            auto ic = iloop++;
            if((int)ic->LenMin() < min_len_for_new_seeds+2*kmer_len) {
                removed_seq.PushBack(ic->m_seq[0].front());
                new_seeds.erase(ic);
                continue;
            }
            if(test_graphdiggerp != nullptr) {
                CReadHolder rh(false);
                rh.PushBack(ic->m_seq[0].front());
                double abundance = 0;
                int knum = 0;
                for(CReadHolder::kmer_iterator ik = rh.kbegin(test_graphdiggerp->Graph().KmerLen()) ; ik != rh.kend(); ++ik, ++knum) {
                    auto node = test_graphdiggerp->Graph().GetNode(*ik);
                    abundance += test_graphdiggerp->Graph().Abundance(node);
                }
                if(abundance < knum*test_graphdiggerp->m_hist_min) {
                    removed_seq.PushBack(ic->m_seq[0].front());
                    new_seeds.erase(ic);
                    continue;
                }
            }
            if(!ic->m_seq.m_circular) {
                // clip kmer_len from both ends of a linear seed; the clipped 2*kmer_len-1
                // flanks are released back to the graph
                string left(ic->m_seq[0].front().begin(), ic->m_seq[0].front().begin()+2*kmer_len-1);
                removed_seq.PushBack(left);
                string right(ic->m_seq[0].front().end()-2*kmer_len+1, ic->m_seq[0].front().end());
                removed_seq.PushBack(right);
                ic->ClipLeft(kmer_len);
                ic->ClipRight(kmer_len);
                ic->m_seq.m_left_repeat = kmer_len-1;
                ic->m_seq.m_right_repeat = kmer_len-1;
            }
        }
        for(CReadHolder::kmer_iterator ik = removed_seq.kbegin(kmer_len) ; ik != removed_seq.kend(); ++ik)
            Graph().ClearVisited(Graph().GetNode(*ik));

        return new_seeds;
    }

    // Using a longer kmer generates connectors and extenders and improves previously assembled contigs
    // scontigs - contigs (input/output)
    // ncores - number of threads
    void ConnectAndExtendContigs(TContigList& scontigs, int ncores) {
        vector> extensions_for_jobs(ncores);
        {
            for(auto& contig : scontigs) contig.m_is_taken = 0;
            list> jobs;
            for(auto& ex : extensions_for_jobs) {
                jobs.push_back(bind(&CDBGraphDigger::ExtendContigsJob, this, ref(scontigs), ref(ex)));
            }
            RunThreads(ncores, jobs);
        }

        TContigList extensions = SContig::ConnectFragments(extensions_for_jobs, Graph());
        SContig::ConnectAndExtendContigs(scontigs, extensions, ncores);

        //stabilize orientation which is random in multithreading
        {
            for(auto& contig : scontigs) contig.m_is_taken = 0;
            list> jobs;
            for(int thr = 0; thr < ncores; ++thr) {
                jobs.push_back(bind(&CDBGraphDigger::StabilizeContigJob, this, ref(scontigs)));
            }
            RunThreads(ncores, jobs);
        }
    }

    // Connects mate pairs through the graph using up to ncores threads.
    // mate_pairs       - input pairs, one job element per thread chunk (mates interleaved in [0])
    // insert_size      - maximal allowed insert length for a connection
    // extend_connected - if true, successfully connected reads are additionally extended at both ends
    // Returns per-chunk results: [0] unambiguously connected reads, [1] reads kept for future connection.
    list> ConnectPairs(const list>& mate_pairs, int insert_size, int ncores, bool extend_connected) {
        CStopWatch timer;
        timer.Restart();

        list> paired_reads;
        list> jobs;
        for(auto& reads : mate_pairs) {
            auto& job_input = reads[0];
            paired_reads.push_back(array({CReadHolder(false), CReadHolder(true)}));
            if(job_input.ReadNum() > 0) // not empty
                jobs.push_back(bind(&CDBGraphDigger::ConnectPairsJob, this, insert_size, ref(job_input), ref(paired_reads.back()), extend_connected));
        }
        RunThreads(ncores, jobs);

        size_t connected = 0;
        size_t not_connected = 0;
        for(auto& rh : paired_reads) {
            connected += rh[0].ReadNum();
            not_connected += rh[1].ReadNum();
        }
        size_t mates = 0;
        for(auto& rh : mate_pairs) mates += rh[0].ReadNum();
        cerr << "Connected: " << connected << " ambiguously connected: " << not_connected/2 << " from " << mates/2 << " mate pairs" << endl;
        cerr << "Connect pairs in " << timer.Elapsed();

        return paired_reads;
    }

    // remove read parts not supported by graph
    // Keeps the longest run of positions covered by valid "good" kmers; clears the read
    // if no run of at least kmer_len survives.  Returns the OR of graph colors of the
    // valid kmers in the kept region (0 if the read was cleared).
    uint8_t CheckAndClipReadLite(string& read) {
        int kmer_len = m_graph.KmerLen();
        int rlen = read.size();
        if(rlen < kmer_len) {
            read.clear();
            return 0;
        }

        deque nodes;
        CReadHolder rh(false);
        rh.PushBack(read);
        for(CReadHolder::kmer_iterator ik = rh.kbegin(kmer_len) ; ik != rh.kend(); ++ik) // iteration from last kmer to first
            nodes.push_front(m_graph.GetNode(*ik));

        // mark every base covered by a valid good kmer
        vector bases(read.size(), 0);
        for(int ek = 0; ek < (int)nodes.size(); ++ek) {
            Node node = nodes[ek];
            if(node.isValid() && GoodNode(node)) {
                int left_kmer_end = ek;                          // left kmer position on read
                int right_kmer_end = left_kmer_end+kmer_len-1;   // right kmer position on read
                for(int p = left_kmer_end; p <= right_kmer_end; ++p) bases[p] = 1;
            }
        }

        // find the longest run of good positions
        int left = 0; // first good position
        int len = 0;  // number of consecutive good positions
        for(int k = 0; k < rlen; ++k) {
            for( ; k < rlen && !bases[k]; ++k); // skip bad bases
            int current_left = k;
            int current_len = 0;
            for( ; k < rlen && bases[k]; ++k, ++current_len); // count adjacent good bases
            if(current_len > len) {
                left = current_left;
                len = current_len;
            }
        }

        uint8_t color = 0;
        if(len < kmer_len) {
            read.clear();
        } else {
            read = read.substr(left, len);
            for(int ek = left; ek <= left+len-kmer_len; ++ek) {
                auto& node = nodes[ek];
                if(node.isValid())
                    color |= m_graph.GetColor(node);
            }
        }
        return color;
    }

private:

    // Prepares one of the mates of a read pair for connection
    // Finds the longest stretch of the read which could be assembled from both ends and clips the rest
    // read - read input/output
    // nodes - kmers for the remaining part
    void CheckAndClipRead(string& read, deque& nodes) {
        int kmer_len = m_graph.KmerLen();

        // extend the read on both sides with the most likely graph extension so that
        // every original position has a predecessor/successor kmer to test against
        string lextend = MostLikelyExtension(DBGraph::ReverseComplement(m_graph.GetNode(read.substr(0, kmer_len))), kmer_len);
        ReverseComplementSeq(lextend.begin(), lextend.end());
        string rextend = MostLikelyExtension(m_graph.GetNode(read.substr(read.size()-kmer_len)), kmer_len);

        deque extended_nodes;
        CReadHolder rh(false);
        rh.PushBack(lextend+read+rextend);
        for(CReadHolder::kmer_iterator ik = rh.kbegin(kmer_len) ; ik != rh.kend(); ++ik) // iteration from last kmer to first
            extended_nodes.push_front(m_graph.GetNode(*ik));

        vector bases(read.size(), 0);
        unsigned read_pos = kmer_len-lextend.size();
        // a position is good only if it is reachable through filtered successors
        // in BOTH directions (forward and reverse-complement)
        for(int kk = 0; lextend.size()+read_pos+1 < extended_nodes.size() && read_pos < read.size(); ++kk, ++read_pos) {
            Node left = extended_nodes[kk];
            Node node = extended_nodes[kk+1];
            if(!left.isValid() || !GoodNode(left) || !node.isValid() || !GoodNode(node)) continue;
            vector successors = m_graph.GetNodeSuccessors(left);
            FilterNeighbors(successors, false);
            if(find_if(successors.begin(),successors.end(),[node](const Successor& s){return s.m_node == node;}) == successors.end()) continue;

            Node right = m_graph.ReverseComplement(extended_nodes[lextend.size()+read_pos+1]);
            node = m_graph.ReverseComplement(extended_nodes[read_pos+lextend.size()]);
            if(!right.isValid() || !GoodNode(right) || !node.isValid() || !GoodNode(node)) continue;
            successors = m_graph.GetNodeSuccessors(right);
            FilterNeighbors(successors, false);
            if(find_if(successors.begin(),successors.end(),[node](const Successor& s){return s.m_node == node;}) == successors.end()) continue;

            bases[read_pos] = 1;
        }

        int left = 0; // first kmer position
        int len = 0;  // number of consecutive good kmers (the sequence is longer by kmer_len-1)
        for(unsigned k = 0; k < read.size(); ++k) {
            for( ; k < read.size() && !bases[k]; ++k); // skip bad bases
            int current_left = k;
            int current_len = 0;
            for( ; k < read.size() && bases[k]; ++k, ++current_len); // count adjacent good bases
            if(current_len > len) {
                left = current_left;
                len = current_len;
            }
        }

        if(len < kmer_len) {
            read.clear();
            nodes.clear();
        } else {
            read = read.substr(left, len);
            nodes.resize(len-kmer_len+1);
            copy(extended_nodes.begin()+lextend.size()+left, extended_nodes.begin()+lextend.size()+left+len-kmer_len+1, nodes.begin());
        }
    }

    // one-thread worker for paired reads connection
    // saves reads which were unambiguously connected; extends the ends of ambiguously connected reads and
    // keeps them for future; discards reads which don't have connection
    // insert_size - the maximal limit of the insert length
    // mate_pairs - pairs for connection (one mate after another)
    // paired_reads - [0] connected reads, [1] reads for future connection
    void ConnectPairsJob(int insert_size, const CReadHolder& mate_pairs, array& paired_reads, bool extend_connected) {
        if(mate_pairs.ReadNum() < 2)
            return;

        int kmer_len = m_graph.KmerLen();

        for(CReadHolder::string_iterator is = mate_pairs.sbegin(); is != mate_pairs.send(); ++is) {
            string read1 = *is;
            string read2 = *(++is);
            if((int)min(read1.size(),read2.size()) < kmer_len)
                continue;

            deque nodes1;
            CheckAndClipRead(read1, nodes1);
            if(read1.empty())
                continue;
            Node last_node1 = nodes1.back();

            // second mate is reverse-complemented so both reads point in the same direction
            ReverseComplementSeq(read2.begin(), read2.end());
            deque nodes2;
            CheckAndClipRead(read2, nodes2);
            if(read2.empty())
                continue;
            Node first_node2 = nodes2.front();

            int steps = insert_size;

            bool ambiguous = false;
            string read;

            //check for long overlap with extension
            int hit = find(nodes2.begin(), nodes2.end(), last_node1) - nodes2.begin(); // first kmer position of the hit
            if(hit < (int)min(nodes1.size(),nodes2.size()) && equal(nodes2.begin(), nodes2.begin()+hit, nodes1.end()-hit-1)) { // overlap
                // check for circularity
                pair, EConnectionStatus> rslt = ConnectTwoNodes(last_node1, last_node1, steps);
                if(rslt.second == CDBGraphDigger::eNoConnection)
                    read = read1+read2.substr(hit+kmer_len);
                else
                    ambiguous = true;
            } else {
                pair, EConnectionStatus> rslt = ConnectTwoNodes(last_node1, first_node2, steps);
                if(rslt.second == CDBGraphDigger::eAmbiguousConnection) {
                    ambiguous = true;
                } else {
                    if(rslt.second == eSuccess) {
                        // build the connection in both directions; accept only if they agree
                        string r1 = read1;
                        for(auto& suc : rslt.first) {
                            r1.push_back(suc.m_nt);
                        }
                        r1 += read2.substr(kmer_len);
                        rslt = ConnectTwoNodes(DBGraph::ReverseComplement(first_node2), DBGraph::ReverseComplement(last_node1), steps);
                        if(rslt.second == eSuccess) {
                            string seq;
                            for(auto& suc : rslt.first) seq.push_back(suc.m_nt);
                            ReverseComplementSeq(seq.begin(), seq.end());
                            string r2 = read1.substr(0, read1.size()-kmer_len)+seq+read2;
                            if(r1 == r2)
                                read = r1;
                        }
                        if(read.empty())
                            ambiguous = true;
                    }
                }
            }

            if(!read.empty()) {
                if(extend_connected) {
                    string lextend = StringentExtension(DBGraph::ReverseComplement(nodes1.front()), kmer_len).first;
                    ReverseComplementSeq(lextend.begin(), lextend.end());
                    read = lextend+read;
                    read += StringentExtension(nodes2.back(), kmer_len).first;
                }
                paired_reads[0].PushBack(read);
            } else if(ambiguous) {
                // keep both (extended) mates for a later attempt with a different kmer
                string lextend = StringentExtension(DBGraph::ReverseComplement(nodes1.front()), kmer_len).first;
                ReverseComplementSeq(lextend.begin(), lextend.end());
                paired_reads[1].PushBack(lextend+read1);
                read2 += StringentExtension(nodes2.back(), kmer_len).first;
                ReverseComplementSeq(read2.begin(), read2.end());
                paired_reads[1].PushBack(read2);
            }
        }
    }

    // one-thread worker for generating new seeds
    // returns contigs sequences which are either >= min_len or are known fragments
    // contigs - generated contigs
    // min_len - minimal length for acceptable contigs
    void NewSeedsJob(TContigList& contigs, int min_len) {
        for(auto it = Graph().Begin(); it != Graph().End(); ++it) {
            SContig contig = GetContigForKmer(it, min_len);
            if(!contig.m_seq.empty())
                contigs.push_back(contig);
        }
    }

    // one-thread worker making contig orientation deterministic after multithreaded assembly
    void StabilizeContigJob(TContigList& scontigs) {
        for(auto& contig : scontigs) {
            if(contig.m_is_taken.Set(1)) // grab contig
                contig.SelectMinDirection();
        }
    }

    // one-thread worker for generating connectors and extenders for previously assembled contigs
    // scontigs - contigs (input/output)
    // extensions - generated sequences
    void ExtendContigsJob(TContigList& scontigs, TContigList& extensions) {
        for(auto& contig : scontigs) {
            if(contig.m_seq.m_circular || !contig.m_is_taken.Set(1)) // grab contig
                continue;

            int kmer_len = contig.m_kmer_len;
            int chunks = contig.m_seq.size();

            // right extension
            if(contig.m_seq.m_right_repeat < kmer_len) {
                Node takeoff_node = contig.BackKmer();
                if(takeoff_node.isValid() && GoodNode(takeoff_node) && !Graph().IsMultContig(takeoff_node)) { // valid uniq kmer
                    int allowed_intrusion = max(0, (int)contig.m_seq.ChunkLenMax(chunks-1)-kmer_len);
                    tuple, Node, int> extension = ExtendToRight(takeoff_node, allowed_intrusion);
                    if(!get<0>(extension).m_seq.empty() || get<1>(extension).isValid()) { // extension could be empty - starting kmer + landing kmer
                        bool skip = false;
                        int intrusion = get<2>(extension);
                        if(intrusion > 0 && !get<1>(extension).isValid()) { // there is intrusion and no further extension
                            CContigSequence& ext_seq = get<0>(extension).m_seq;
                            int ext_chunks = ext_seq.size();
                            int last_chunk = ext_seq.ChunkLenMax(ext_chunks-1);
                            if(last_chunk < kmer_len && (int)ext_seq.LenMax()-last_chunk-(int)ext_seq.ChunkLenMax(ext_chunks-2) < intrusion) // last chunk and snp will be clipped resulting in shorter sequence
                                skip = true;
                            TKmer back_kmer(contig.m_seq.back().front().end()-intrusion-kmer_len, contig.m_seq.back().front().end()-intrusion);
                            if(!Graph().GetNode(back_kmer).isValid()) // new back kmer is not in the graph
                                skip = true;
                        }
                        if(!skip) {
                            contig.ClipRight(intrusion);
                            SContig sc(&contig, 1, contig.BackKmer(), get<0>(extension), get<1>(extension), Graph());
                            extensions.push_back(sc);
                        }
                    }
                }
            }

            // left extension (done as a right extension of the reverse complement)
            if(contig.m_seq.m_left_repeat < kmer_len) {
                Node takeoff_node = DBGraph::ReverseComplement(contig.FrontKmer());
                if(takeoff_node.isValid() && GoodNode(takeoff_node) && !Graph().IsMultContig(takeoff_node)) { // valid uniq kmer
                    int allowed_intrusion = max(0, (int)contig.m_seq.ChunkLenMax(0)-kmer_len);
                    tuple, Node, int> extension = ExtendToRight(takeoff_node, allowed_intrusion);
                    if(!get<0>(extension).m_seq.empty() || get<1>(extension).isValid()) { // extension could be empty - starting kmer + landing kmer
                        bool skip = false;
                        int intrusion = get<2>(extension);
                        if(intrusion > 0 && !get<1>(extension).isValid()) { // there is intrusion and no further extension
                            CContigSequence& ext_seq = get<0>(extension).m_seq;
                            int ext_chunks = ext_seq.size();
                            int last_chunk = ext_seq.ChunkLenMax(ext_chunks-1);
                            if(last_chunk < kmer_len && (int)ext_seq.LenMax()-last_chunk-(int)ext_seq.ChunkLenMax(ext_chunks-2) < intrusion) // last chunk and snp will be clipped resulting in shorter sequence
                                skip = true;
                            TKmer front_kmer(contig.m_seq.front().front().begin()+intrusion, contig.m_seq.front().front().begin()+intrusion+kmer_len);
                            if(!Graph().GetNode(front_kmer).isValid()) // new front kmer is not in the graph
                                skip = true;
                        }
                        if(!skip) {
                            contig.ClipLeft(intrusion);
                            SContig sc(&contig, -1, DBGraph::ReverseComplement(contig.FrontKmer()), get<0>(extension), get<1>(extension), Graph());
                            sc.ReverseComplement();
                            extensions.push_back(sc);
                        }
                    }
                }
            }
        }
    }

    DBGraph& m_graph;      // de Bruijn graph being traversed
    double m_fraction;     // presumably branch-filtering fraction threshold - TODO confirm
    int m_jump;            // presumably max jump over weak nodes - TODO confirm
    int m_hist_min;        // abundance threshold used for seed filtering (see GenerateNewSeeds)
    int m_low_count;       // presumably minimal kmer count considered reliable - TODO confirm
    size_t m_max_branch;   // presumably limit on branches explored during extension - TODO confirm
    bool m_allow_snps;     // presumably enables SNP-aware assembly - TODO confirm
};

}; // namespace
#endif /* _GraphDigger_ */
SKESA-2.3.0/ngs_includes.hpp000066400000000000000000000025511335720214300155530ustar00rootroot00000000000000/*===========================================================================
 *
 * PUBLIC DOMAIN NOTICE
 * National Center for Biotechnology Information
 *
 * This software/database is a "United States Government Work" under the
 * terms of the United States Copyright Act. It was written as part of
 * the author's official duties as a United States Government employee and
 * thus cannot be copyrighted. This software/database is freely available
 * to the public for use. The National Library of Medicine and the U.S.
 * Government have not placed any restriction on its use or reproduction.
 *
 * Although all reasonable efforts have been taken to ensure the accuracy
 * and reliability of the software and data, the NLM and the U.S.
 * Government do not and cannot warrant the performance or results that
 * may be obtained by using this software or data. The NLM and the U.S.
 * Government disclaim all warranties, express or implied, including
 * warranties of performance, merchantability or fitness for any particular
 * purpose.
 *
 * Please cite the author in any work or product based on this material.
* * =========================================================================== * */ #include #include #include #include #include SKESA-2.3.0/readsgetter.hpp000066400000000000000000000652101335720214300154100ustar00rootroot00000000000000/*=========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's official duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== * */ #ifndef _ReadsGetter_ #define _ReadsGetter_ #include #include #include #include #include #include "DBGraph.hpp" #include "counter.hpp" #include "concurrenthash.hpp" #ifndef NO_NGS #include "ngs_includes.hpp" #endif namespace DeBruijn { // CReadsGetter gets reads from SRA, fasta, or fastq files. It finds the leftmost longest // subsequence of unambiguous characters from reads and stores them in list>. // Paired and unpaired reads are kept in different elements of the array (0 and 1). // // The input data is validated and an exception is thrown in case of error. 
// Validation for SRA: done by NGS library
// Format validation for fasta: '>' starts defline
// Format validation for fastq: '@' and '+' start first and third lines in every block of four lines
// Validation for paired reads with input from fasta or fastq: mates have same prefix; suffix is
// '[./][12]' with 1 and 2 for first and second mate, respectively
// Exception messages produced are for error opening file, invalid file format, no valid reads in a
// specific input source or in all sources available for assembly, read sequence is invalid, and paired
// input contains different number of mates.
//
// NOTE(review): this file was recovered from a damaged extraction - template
// arguments were stripped (e.g. "vector&", "list>", "map clipping_points") and
// newlines were collapsed.  Code tokens are reproduced exactly as found; only
// line structure and documentation have been restored.  The stripped generics
// must be recovered from the upstream SKESA sources before this compiles - TODO confirm.
class CReadsGetter {
public:
    // sra_list, fasta_list, fastq_list - input reads from SRA accessions or files in fasta or fastq format.
    // Files for paired reads should have mates interleaved (first followed by second) or should be in two separate
    // files specified as a list separated by comma with file for first mate followed by the file for second mate.
    // ncores - number of cores
    // usepairedends - flag to indicate that input reads are paired
    // Throws runtime_error if no valid reads are found in any source.
    CReadsGetter(const vector& sra_list, const vector& fasta_list, const vector& fastq_list, int ncores, bool usepairedends) :
        m_ncores(ncores), m_usepairedends(usepairedends) {

        CStopWatch timer;
        timer.Restart();

        if(!fasta_list.empty())
            ReadFastaOrFastq(fasta_list, true);
        if(!fastq_list.empty())
            ReadFastaOrFastq(fastq_list, false);
#ifndef NO_NGS
        if(!sra_list.empty())
            GetFromSRA(sra_list);
#endif

        size_t total = 0;
        size_t paired = 0;
        for(auto& reads : m_reads) {
            total += reads[0].ReadNum()+reads[1].ReadNum();
            paired += reads[0].ReadNum();
        }
        if(total == 0)
            throw runtime_error("No valid reads available for assembly");

        if(paired > 0)
            cerr << "Total mates: " << total << " Paired reads: " << paired/2 << endl;
        else
            cerr << "Total reads: " << total << endl;
        cerr << "Reads acquired in " << timer.Elapsed();

        // fixed kmer length used for adapter detection
        m_kmer_for_adapters = 19;
        m_adapters = CKmerMap(m_kmer_for_adapters);
    }
    virtual ~CReadsGetter() {}

    // acquired reads; element [0] of each array holds paired reads, [1] unpaired
    list>& Reads() { return m_reads; }

    // Detects adapters via a hash-based kmer counter and clips them from all reads.
    // vector_percent     - fraction of total reads above which a kmer is treated as an adapter
    // estimated_kmer_num - estimated kmer number (in millions) for hash table sizing
    // skip_bloom_filter  - passed through to the counter
    void ClipAdaptersFromReads_HashCounter(double vector_percent, int estimated_kmer_num, bool skip_bloom_filter) {
        CStopWatch timer;
        timer.Restart();

        int max_count = MaxCount(vector_percent);
        int min_count_for_adapters = 15;
        int64_t MB = 1000000;
        CKmerHashCounter kmer_counter(m_reads, m_kmer_for_adapters, min_count_for_adapters, estimated_kmer_num*MB, true, m_ncores, skip_bloom_filter);
        auto& kmers = kmer_counter.Kmers();
        for(auto it = kmers.Begin(); it != kmers.End(); ++it) {
            auto kcount = it.GetElement();
            int count = kcount.second->Count();
            // kmers occurring more often than max_count are considered adapters;
            // both strands are stored
            if(count > max_count) {
                m_adapters[kcount.first] = count;
                m_adapters[revcomp(kcount.first, m_kmer_for_adapters)] = count;
            }
        }
        ClipAdapters();
        cerr << "Adapters clipped in " << timer.Elapsed();
    }

    // Same as above but uses the memory-bound sorted kmer counter.
    // vector_percent - fraction of total reads above which a kmer is treated as an adapter
    // memory         - memory limit in GB for the counter
    void ClipAdaptersFromReads_SortedCounter(double vector_percent, int memory) {
        CStopWatch timer;
        timer.Restart();

        int max_count = MaxCount(vector_percent);
        int min_count_for_adapters = 100;
        int64_t GB = 1000000000;
        CKmerCounter kmer_counter(m_reads, m_kmer_for_adapters, min_count_for_adapters, true, GB*memory, m_ncores);
        TKmerCount& kmers = kmer_counter.Kmers();
        for(size_t index = 0; index < kmers.Size(); ++index) {
            pair kcount = kmers.GetKmerCount(index);
            int count = kcount.second;
            // clips out upper portion
            if(count > max_count) {
                m_adapters[kcount.first] = count;
                m_adapters[revcomp(kcount.first, m_kmer_for_adapters)] = count;
            }
        }
        ClipAdapters();
        cerr << "Adapters clipped in " << timer.Elapsed();
    }

    // Prints detected adapters (canonical strand only) with their counts to cerr,
    // most frequent first.
    void PrintAdapters() {
        struct Printer {
            Printer(int kmer_len) : vec_kmer_len(kmer_len) {}
            int vec_kmer_len;
            set> adapters;
            void operator()(const TKmer& kmer, int count) {
                TKmer rkmer = revcomp(kmer, vec_kmer_len);
                if(kmer < rkmer)
                    adapters.emplace(count, kmer.toString(vec_kmer_len));
                else
                    adapters.emplace(count, rkmer.toString(vec_kmer_len));
            }
        };
        if(m_adapters.Size() > 0) {
            Printer prob(m_adapters.KmerLen());
            m_adapters.GetInfo(prob);
            for(auto it = prob.adapters.rbegin(); it != prob.adapters.rend(); ++it)
                cerr << "Adapter: " << it->second << " " << it->first << endl;
        }
    }

    CKmerMap& Adapters() { return m_adapters; }

private:
    // adapter count threshold: vector_percent of the total number of reads
    int MaxCount(double vector_percent) {
        int64_t total_reads = 0;
        for(const auto& reads : m_reads)
            total_reads += reads[0].ReadNum()+reads[1].ReadNum();
        return vector_percent*total_reads;
    }

    // Clips all stored adapters from m_reads (multithreaded) and reports
    // clipping-point statistics and before/after totals to cerr.
    void ClipAdapters() {
        int64_t total_reads = 0;
        int64_t total_seq = 0;
        for(const auto& reads : m_reads) {
            total_reads += reads[0].ReadNum()+reads[1].ReadNum();
            total_seq += reads[0].TotalSeq()+reads[1].TotalSeq();
        }
        map clipping_points;
        if(m_adapters.Size() > 0) {
            list> jobs;
            list> clipping_points_for_threads;
            for(auto& reads : m_reads) {
                clipping_points_for_threads.emplace_back();
                jobs.push_back(bind(&CReadsGetter::ClipAdaptersFromReadsJob, this, ref(reads), ref(clipping_points_for_threads.back())));
            }
            RunThreads(m_ncores, jobs);
            // merge per-thread clipping statistics
            for(auto& cp : clipping_points_for_threads) {
                for(auto& count : cp)
                    clipping_points[count.first] += count.second;
            }
        }
        for(auto& count : clipping_points)
            cerr << "Clipping point: " << count.first << " " << count.second << endl;

        int64_t total_reads_after = 0;
        int64_t total_seq_after = 0;
        for(const auto& reads : m_reads) {
            total_reads_after += reads[0].ReadNum()+reads[1].ReadNum();
            total_seq_after+= reads[0].TotalSeq()+reads[1].TotalSeq();
        }
        cerr << "Adapters: " << m_adapters.Size()/2 << " Reads before: " << total_reads << " Sequence before: " << total_seq << " Reads after: " << total_reads_after << " Sequence after: " << total_seq_after << endl;
    }

    // adapter position in read; -1 if not found
    int FindAdapterInRead(const CReadHolder::string_iterator& is) {
        int kmer_len = m_adapters.KmerLen();
        int rlen = is.ReadLen();
        if(rlen < kmer_len)
            return -1;

        int knum = rlen-kmer_len+1;
        CReadHolder::kmer_iterator ik = is.KmersForRead(kmer_len);
        ik += knum-1; // points to first kmer
        int pos = 0;
        // scan kmers left-to-right; return position of first adapter hit
        for( ; knum > 0; --knum, ik += -1, ++pos) {
            if(m_adapters.Find(*ik) != nullptr)
                return pos;
        }
        return -1;
    }

    // one-thread worker clipping adapters from one read chunk
    // reads           - [0] paired reads, [1] unpaired (input/output)
    // clipping_points - histogram of adapter positions found (output)
    // A pair loses its mate-pairing (moves to [1]) if only one mate survives clipping.
    void ClipAdaptersFromReadsJob(array& reads, map& clipping_points) {
        array cleaned_reads{true, false};
        {
            // paired reads: iterate mates two-by-two
            CReadHolder::string_iterator is1 = reads[0].sbegin();
            CReadHolder::string_iterator is2 = reads[0].sbegin();
            ++is2;
            for( ; is2 != reads[0].send(); ++is1, ++is1, ++is2, ++is2) {
                int p1 = FindAdapterInRead(is1);
                int p2 = FindAdapterInRead(is2);
                if(p1 >= 0) ++clipping_points[p1];
                if(p2 >= 0) ++clipping_points[p2];

                if(p1 < 0 && p2 < 0) { // no adapters
                    cleaned_reads[0].PushBack(is1);
                    cleaned_reads[0].PushBack(is2);
                } else if(p1 == 0 && p2 == 0) { // both start from adapter
                    continue;
                } else if(p1 == 0) { // first starts from adapter
                    if(p2 < 0)
                        cleaned_reads[1].PushBack(is2);
                    else
                        cleaned_reads[1].PushBack((*is2).substr(0, p2));
                } else if(p2 == 0) { // second starts from adapter
                    if(p1 < 0)
                        cleaned_reads[1].PushBack(is1);
                    else
                        cleaned_reads[1].PushBack((*is1).substr(0, p1));
                } else { // some clipping but keep both
                    if(p1 < 0)
                        cleaned_reads[0].PushBack(is1);
                    else
                        cleaned_reads[0].PushBack((*is1).substr(0, p1));
                    if(p2 < 0)
                        cleaned_reads[0].PushBack(is2);
                    else
                        cleaned_reads[0].PushBack((*is2).substr(0, p2));
                }
            }
        }
        {
            // unpaired reads
            for(CReadHolder::string_iterator is = reads[1].sbegin() ;is != reads[1].send(); ++is) {
                int p = FindAdapterInRead(is);
                if(p >= 0) ++clipping_points[p];
                if(p < 0)
                    cleaned_reads[1].PushBack(is);
                else if(p == 0)
                    continue;
                else
                    cleaned_reads[1].PushBack((*is).substr(0, p));
            }
        }
        reads[0].Swap(cleaned_reads[0]);
        reads[1].Swap(cleaned_reads[1]);
    }

    // insert read from source_name to rholder
    // Uppercases the read, validates its alphabet (throws runtime_error otherwise),
    // and stores the leftmost longest unambiguous (ACGT-only) stretch; an empty
    // placeholder is stored when no such stretch exists so pairing stays aligned.
    static void InsertRead(string& read, CReadHolder& rholder, const string& source_name) {
        //convert to upper case
        for(char& c : read) c = toupper(c);
        //check if read is valid
        if(read.find_first_not_of("ACGTYRWSKMDVHBXN-") != string::npos)
            throw runtime_error("Invalid sequence in "+source_name);

        size_t best_start = 0;
        int best_len = 0;
        size_t start = 0;
        // find and store the leftmost longest unambiguous stretch of read
        while(start < read.size()) {
            size_t stop = min(read.size(),read.find_first_not_of("ACGT", start));
            int len = stop-start;
            if(len > best_len) {
                best_len = len;
                best_start = start;
            }
            start = read.find_first_of("ACGT", stop);
        }
        if(best_len > 0)
            rholder.PushBack(read.substr(best_start, best_len));
        else
            rholder.PushBack(string()); // keep a bogus read for paired
    }

    typedef tuple TSlice;
    typedef list TReadJob;

    // total_length - total number of reads in all runs (sum of file_length)
    // job_length - desired number of reads in one job
    // file_list - run names
    // file_length - run sizes
    // job_inputs - created jobs (lists of name,from,to)
    static void ReadJobInputs(size_t total_length, size_t job_length, const vector& file_list, const vector& file_length, list& job_inputs) {
        size_t assigned_length = 0;
        int file_num = 0;
        size_t assigned_length_from_file = 0;
        while(assigned_length < total_length) {
            job_inputs.push_back(TReadJob());
            size_t current_job_length = 0;
            while(current_job_length < job_length && assigned_length < total_length) {
                size_t max_chunk = file_length[file_num] - assigned_length_from_file;
                if(current_job_length + max_chunk <= job_length) { // the rest of the file could be assigned
                    job_inputs.back().push_back(TSlice(file_list[file_num], assigned_length_from_file, file_length[file_num]-1));
                    assigned_length += max_chunk;
                    current_job_length += max_chunk;
                    ++file_num;
                    assigned_length_from_file = 0;
                } else { // something left for another job
                    size_t chunk = job_length - current_job_length;
                    job_inputs.back().push_back(TSlice(file_list[file_num], assigned_length_from_file, assigned_length_from_file+chunk-1));
                    assigned_length_from_file += chunk;
                    assigned_length += chunk;
                    current_job_length = job_length;
                }
            }
        }
    }

#ifndef NO_NGS
    // A one-thread worker to get reads from SRA
    // job - accession(s) and interval(s) to get
    // rslt - destination
    static void GetFromSRAJob(const TReadJob job, array& rslt) { // job must be by value - it is deleted in the caller
        using namespace ngs;
        for(auto& slice : job) {
            const string& acc = get<0>(slice);
            ReadCollection run = ncbi::NGS::openReadCollection (acc);
            size_t from = get<1>(slice);
            size_t to = get<2>(slice);
            // NGS read ranges are 1-based, hence from+1
            ReadIterator it = run.getReadRange (from+1, to-from+1, Read::all);
            while (it.nextRead()) {
                int fragments = it.getNumFragments ();
                if(fragments == 2) { // paired read
                    it.nextFragment();
                    StringRef s1 = it.getFragmentBases();
                    int read_length1 = s1.size();
                    string read1 = string(s1.data(),read_length1);
                    it.nextFragment();
                    StringRef s2 = it.getFragmentBases();
                    int read_length2 = s2.size();
                    string read2 = string(s2.data(),read_length2);
                    InsertRead(read1, rslt[0], acc);
                    InsertRead(read2, rslt[0], acc);
                } else { // unpaired read
                    while(it.nextFragment()) {
                        StringRef s = it.getFragmentBases();
                        int read_length = s.size();
                        string read = string(s.data(),read_length);
                        InsertRead(read, rslt[1], acc);
                    }
                }
            }
        }
    }

    // Acquires reads from SRA
    // sra_list - run accessions
    void GetFromSRA(const vector& sra_list) {
        using namespace ngs;
        vector file_length;
        size_t total_length = 0;
        for(const string& file : sra_list) {
            ReadCollection run = ncbi::NGS::openReadCollection (file);
            file_length.push_back(run.getReadCount());
            total_length += file_length.back();
        }
        list job_inputs;
        size_t job_length = total_length/m_ncores+1;
        ReadJobInputs(total_length, job_length, sra_list, file_length, job_inputs);
        list> jobs;
        for(auto& job_input : job_inputs) {
            m_reads.push_back({CReadHolder(true), CReadHolder(false)});
            jobs.push_back(bind(GetFromSRAJob, job_input, ref(m_reads.back())));
        }
        RunThreads(m_ncores, jobs);
    }
#endif

    // Acquires reads from fasta or fastq
    // file_list - file names (could be separated by comma for paired reads)
    // isfasta - true for fasta file(s)
    void ReadFastaOrFastq(const vector& file_list, bool isfasta) {
        // reads one record; returns false on clean EOF, throws on format/IO error
        auto NextRead = [] (string& acc, string& read, bool isfasta, boost::iostreams::filtering_istream& is, const string& source_name) {
            acc.clear();
            read.clear();
            if(isfasta) {// fasta
                string record;
                if(!getline(is, record, '>')) {
                    if(is.eof() && !is.bad())
                        return false;
                    else
                        throw runtime_error("Error reading "+source_name);
                }
                size_t first_ret = min(record.size(),record.find('\n'));
                if(first_ret == string::npos)
                    throw runtime_error("Invalid fasta file format in "+source_name);
                acc = record.substr(0, first_ret);
                read = record.substr(first_ret+1);
                read.erase(remove(read.begin(),read.end(),'\n'),read.end());
            } else { // fastq
                if(!getline(is, acc)) {
                    if(is.eof() && !is.bad())
                        return false;
                    else
                        throw runtime_error("Error reading "+source_name);
                }
                if(acc[0] != '@')
                    throw runtime_error("Invalid fastq file format in "+source_name);
                if(!getline(is, read))
                    throw runtime_error("Error reading "+source_name);
                string line;
                if(!getline(is, line) || line[0] != '+')
                    throw runtime_error("Error reading "+source_name);
                if(!getline(is, line))
                    throw runtime_error("Error reading "+source_name);
            }
            acc = acc.substr(0, acc.find_first_of(" \t"));
            return true;
        };

        // opens file (transparently handling gzip) and validates the first character
        auto OpenStream = [] (const string& file, bool isfasta, boost::iostreams::filtering_istream& is) {
            ifstream gztest(file, ios_base::in|ios_base::binary);
            if(!gztest.is_open())
                throw runtime_error("Error opening "+file);
            array gzstart;
            if(!gztest.read(reinterpret_cast(gzstart.data()), 2))
                throw runtime_error("Invalid file "+file);
            bool gzipped = (gzstart[0] == 0x1f && gzstart[1] == 0x8b); // gzip magic bytes
            gztest.close();

            ios_base::openmode mode = ios_base::in;
            if(gzipped)
                mode |= ios_base::binary;
            boost::iostreams::file_source f{file, mode};
            if(gzipped)
                is.push(boost::iostreams::gzip_decompressor());
            is.push(f);

            // do a quick check of validity on first character of the file
            char c;
            if(isfasta) {
                if(!(is >> c) || c != '>')
                    throw runtime_error("Invalid fasta file format in "+file);
            } else {
                if(!(is >> c) || c != '@')
                    throw runtime_error("Invalid fastq file format in "+file);
                is.putback(c);
            }
        };

        // checks if ids for paired reads are name[./]1 and name[./]2
        auto MatchIds = [] (const string& acc1, const string& acc2) {
            boost::regex re1("(.+)[./]1");
            boost::cmatch matches1;
            boost::regex re2("(.+)[./]2");
            boost::cmatch matches2;
            return (acc1 == acc2 || (boost::regex_match(acc1.c_str(), matches1, re1) && boost::regex_match(acc2.c_str(), matches2, re2) && matches1[1] == matches2[1]));
        };

        array all_reads = {CReadHolder(true), CReadHolder(false)};
        string acc1;
        string read1;
        string acc2;
        string read2;
        for(const string& file : file_list) {
            size_t total = all_reads[0].ReadNum()+all_reads[1].ReadNum();
            size_t comma = file.find(',');
            if(comma == string::npos) {
                // single file: either unpaired or interleaved mates
                boost::iostreams::filtering_istream is;
                OpenStream(file, isfasta, is);
                if(!m_usepairedends) {
                    while(NextRead(acc1, read1, isfasta, is, file))
                        InsertRead(read1, all_reads[1], file);
                } else {
                    boost::regex re1("(.+)[./]1");
                    boost::cmatch matches1;
                    boost::regex re2("(.+)[./]2");
                    boost::cmatch matches2;
                    if(NextRead(acc1, read1, isfasta, is, file)) {
                        while(NextRead(acc2, read2, isfasta, is, file)) {
                            if(MatchIds(acc1, acc2)) {
                                InsertRead(read1, all_reads[0], file);
                                InsertRead(read2, all_reads[0], file);
                                NextRead(acc1, read1, isfasta, is, file);
                            } else {
                                // read1 has no mate - keep as unpaired, shift window
                                InsertRead(read1, all_reads[1], file);
                                acc1 = acc2;
                                read1 = read2;
                            }
                        }
                        if(!read1.empty())
                            InsertRead(read1, all_reads[1], file);
                    }
                }
            } else {
                // comma-separated pair of files: mates read in lockstep
                boost::iostreams::filtering_istream is1;
                string file1 = file.substr(0,comma);
                OpenStream(file1, isfasta, is1);
                boost::iostreams::filtering_istream is2;
                string file2 = file.substr(comma+1);
                OpenStream(file2, isfasta, is2);
                while(NextRead(acc1, read1, isfasta, is1, file1)) {
                    if(NextRead(acc2, read2, isfasta, is2, file2)) {
                        InsertRead(read1, all_reads[0], file1);
                        InsertRead(read2, all_reads[0], file2);
                    } else {
                        throw runtime_error("Files "+file+" contain different number of mates");
                    }
                }
            }
            if(total == all_reads[0].ReadNum()+all_reads[1].ReadNum())
                throw runtime_error("File(s) "+file+" doesn't contain valid reads");
        }

        // divide reads into ncores chunks for multithreading
        size_t job_length = (all_reads[0].ReadNum()+all_reads[1].ReadNum())/m_ncores+1;
        job_length += job_length%2; // even length so mates are not split across chunks
        size_t num = 0;
        for(CReadHolder::string_iterator is = all_reads[0].sbegin(); is != all_reads[0].send(); ++is, ++num) {
            if(num%job_length == 0 || m_reads.empty())
                m_reads.push_back(array({CReadHolder(true), CReadHolder(false)}));
            m_reads.back()[0].PushBack(is);
        }
        for(CReadHolder::string_iterator is = all_reads[1].sbegin(); is != all_reads[1].send(); ++is, ++num) {
            if(num%job_length == 0 || m_reads.empty())
                m_reads.push_back(array({CReadHolder(true), CReadHolder(false)}));
            m_reads.back()[1].PushBack(is);
        }
    }

    int m_ncores;             // number of worker threads
    bool m_usepairedends;     // treat input as paired reads
    list> m_reads;            // per-chunk read storage: [0] paired, [1] unpaired
    int m_kmer_for_adapters;  // kmer length for adapter detection (19)
    CKmerMap m_adapters;      // detected adapter kmers (both strands) -> counts
};

}; // namespace
#endif /* _ReadsGetter_ */
SKESA-2.3.0/skesa.cpp000066400000000000000000000606551335720214300142060ustar00rootroot00000000000000/*===========================================================================
 *
 * PUBLIC DOMAIN NOTICE
 * National Center for Biotechnology Information
 *
 * This software/database is a "United States Government Work" under the
 * terms of the United States Copyright Act. It was written as part of
 * the author's official duties as a United States Government employee and
 * thus cannot be copyrighted. This software/database is freely available
 * to the public for use. The National Library of Medicine and the U.S.
 * Government have not placed any restriction on its use or reproduction.
 *
 * Although all reasonable efforts have been taken to ensure the accuracy
 * and reliability of the software and data, the NLM and the U.S.
 * Government do not and cannot warrant the performance or results that
 * may be obtained by using this software or data. The NLM and the U.S.
 * Government disclaim all warranties, express or implied, including
 * warranties of performance, merchantability or fitness for any particular
 * purpose.
 *
 * Please cite the author in any work or product based on this material.
* * =========================================================================== * */ #include #include "readsgetter.hpp" #include "assembler.hpp" using namespace boost::program_options; using namespace DeBruijn; template //void PrintRslt(CDBGAssembler& assembler, ofstream& contigs_out, ofstream& all_out, ofstream& hist_out, ofstream& connected_reads_out, ofstream& dbg_out, int mincontig) { void PrintRslt(CDBGAssembler& assembler, variables_map& argm) { ofstream contigs_out; ofstream all_out; ofstream hist_out; ofstream connected_reads_out; ofstream dbg_out; if(argm.count("contigs_out")) { contigs_out.open(argm["contigs_out"].as()); if(!contigs_out.is_open()) { cerr << "Can't open file " << argm["contigs_out"].as() << endl; exit(1); } } if(argm.count("all")) { all_out.open(argm["all"].as()); if(!all_out.is_open()) { cerr << "Can't open file " << argm["all"].as() << endl; exit(1); } } if(argm.count("hist")) { hist_out.open(argm["hist"].as()); if(!hist_out.is_open()) { cerr << "Can't open file " << argm["hist"].as() << endl; exit(1); } } if(argm.count("connected_reads")) { connected_reads_out.open(argm["connected_reads"].as()); if(!connected_reads_out.is_open()) { cerr << "Can't open file " << argm["connected_reads"].as() << endl; exit(1); } } if(argm.count("dbg_out")) { dbg_out.open(argm["dbg_out"].as(), ios::binary | ios::out); if(!dbg_out.is_open()) { cerr << "Can't open file " << argm["dbg_out"].as() << endl; exit(1); } } int mincontig = argm["min_contig"].as(); if(mincontig <= 0) { cerr << "Value of --min_contig must be > 0" << endl; exit(1); } DBGraph& first_graph = *assembler.Graphs().begin()->second; int first_kmer_len = first_graph.KmerLen(); int num = 0; ostream& out = contigs_out.is_open() ? 
contigs_out : cout; auto contigs = assembler.Contigs(); contigs.sort(); for(auto& contig : contigs) { if((int)contig.LenMin() >= mincontig) { deque>> scored_contig; for(unsigned chunk = 0; chunk < contig.size(); ++chunk) { scored_contig.emplace_back(); if(contig.VariableChunk(chunk)) { double total_abundance = 0.; for(auto& variant : contig[chunk]) { TVariation seq = variant; if(chunk < contig.size()-1) { auto a = contig[chunk+1].front().begin(); auto b = contig[chunk+1].front().end(); if((int)contig.ChunkLenMax(chunk+1) > first_kmer_len-1) b = a+first_kmer_len-1; seq.insert(seq.end(), a, b); } if(chunk > 0) { auto b = contig[chunk-1].front().end(); auto a = contig[chunk-1].front().begin(); if((int)contig.ChunkLenMax(chunk-1) > first_kmer_len-1) a = b-first_kmer_len+1; seq.insert(seq.begin(), a, b); } CReadHolder rh(false); rh.PushBack(seq); double abundance = 0; for(CReadHolder::kmer_iterator itk = rh.kbegin(first_graph.KmerLen()); itk != rh.kend(); ++itk) { typename DBGraph::Node node = first_graph.GetNode(*itk); abundance += first_graph.Abundance(node); } total_abundance += abundance; double score = abundance; string var_seq(variant.begin(), variant.end()); scored_contig.back().emplace_back(score, var_seq); } for(auto& score_seq : scored_contig.back()) score_seq.first /= total_abundance; scored_contig.back().sort(); scored_contig.back().reverse(); } else { double score = 1.; string var_seq(contig[chunk].front().begin(), contig[chunk].front().end()); scored_contig.back().emplace_back(score, var_seq); } } string first_variant; for(auto& lst : scored_contig) first_variant += lst.front().second; CReadHolder rh(false); if(contig.m_circular) first_variant += first_variant.substr(0, first_graph.KmerLen()-1); rh.PushBack(first_variant); double abundance = 0; // average count of kmers in contig for(CReadHolder::kmer_iterator itk = rh.kbegin(first_graph.KmerLen()); itk != rh.kend(); ++itk) { typename DBGraph::Node node = first_graph.GetNode(*itk); abundance += 
first_graph.Abundance(node); } abundance /= first_variant.size()-first_graph.KmerLen()+1; out << ">Contig_" << ++num << "_" << abundance; if(contig.m_circular) { out << "_Circ"; first_variant.erase(first_variant.size()-first_graph.KmerLen()+1, first_graph.KmerLen()-1); } out << "\n" << first_variant << "\n"; int pos = 0; for(unsigned chunk = 0; chunk < scored_contig.size(); ++chunk) { //output variants int chunk_len = scored_contig[chunk].front().second.size(); if(contig.VariableChunk(chunk)) { int left = 0; if(chunk > 0) left = min(100,(int)scored_contig[chunk-1].front().second.size()); int right = 0; if(chunk < scored_contig.size()-1) right = min(100,(int)scored_contig[chunk+1].front().second.size()); int var = 0; auto it = scored_contig[chunk].begin(); for(++it; it != scored_contig[chunk].end(); ++it) { double score = it->first; string& variant = it->second; out << ">Variant_" << ++var << "_for_Contig_" << num << ":" << pos-left+1 << "_" << pos+chunk_len+right << ":" << score << "\n"; if(chunk > 0) { for(int l = left ; l > 0; --l) out << *(scored_contig[chunk-1].front().second.end()-l); } out << variant; if(chunk < scored_contig.size()-1) { for(int r = 0; r < right; ++r) out << scored_contig[chunk+1].front().second[r]; } out << "\n"; } } pos += chunk_len; } } } if(contigs_out.is_open()) { contigs_out.close(); if(!contigs_out) { cerr << "Can't write to file " << argm["contigs_out"].as() << endl; exit(1); } } else { cout.flush(); if(!cout) { cerr << "Write failed " << endl; exit(1); } } if(all_out.is_open()) { auto graphp = assembler.Graphs().begin(); auto it = assembler.AllIterations().begin(); if(argm.count("seeds")) { auto& contigs = *it; int nn = 0; for(auto& contig : contigs) { string first_variant; for(auto& lst : contig) first_variant.insert(first_variant.end(), lst.front().begin(), lst.front().end()); all_out << ">Seed_" << ++nn << " " << contig.m_left_repeat << " " << contig.m_right_repeat << "\n" << first_variant << "\n"; } ++it; } for( ; graphp != 
assembler.Graphs().end(); ++it, ++graphp) { auto& contigs = *it; int nn = 0; for(auto& contig : contigs) { string first_variant; for(auto& lst : contig) first_variant.insert(first_variant.end(), lst.front().begin(), lst.front().end()); all_out << ">kmer" << graphp->first << "_" << ++nn << " " << contig.m_left_repeat << " " << contig.m_right_repeat << "\n" << first_variant << "\n"; } } if(argm.count("allow_snps")) { auto graphpr = assembler.Graphs().rbegin(); for( ; graphpr != assembler.Graphs().rend(); ++it, ++graphpr) { auto& contigs = *it; int nn = 0; for(auto& contig : contigs) { string first_variant; for(auto& lst : contig) first_variant.insert(first_variant.end(), lst.front().begin(), lst.front().end()); all_out << ">SNP_recovery_kmer" << graphpr->first << "_" << ++nn << " " << contig.m_left_repeat << " " << contig.m_right_repeat << "\n" << first_variant << "\n"; } } } all_out.close(); if(!all_out) { cerr << "Can't write to file " << argm["all"].as() << endl; exit(1); } } if(hist_out.is_open()) { for(auto& gr : assembler.Graphs()) { const TBins& bins = gr.second->GetBins(); for(auto& bin : bins) hist_out << gr.first << '\t' << bin.first << '\t' << bin.second << "\n"; } hist_out.close(); if(!hist_out) { cerr << "Can't write to file " << argm["hist"].as() << endl; exit(1); } } if(connected_reads_out.is_open()) { CReadHolder connected_reads = assembler.ConnectedReads(); int num = 0; for(CReadHolder::string_iterator is = connected_reads.sbegin(); is != connected_reads.send(); ++is) { string s = *is; connected_reads_out << ">ConnectedRead_" << ++num << "\n" << s << "\n"; } connected_reads_out.close(); if(!connected_reads_out) { cerr << "Can't write to file " << argm["connected_reads"].as() << endl; exit(1); } } if(dbg_out.is_open()) { for(auto& gr : assembler.Graphs()) gr.second->Save(dbg_out); dbg_out.close(); if(!dbg_out) { cerr << "Can't write to file " << argm["dbg_out"].as() << endl; exit(1); } } } int main(int argc, const char* argv[]) { for(int n = 0; n < 
argc; ++n) cerr << argv[n] << " "; cerr << endl << endl; int ncores; int steps; double fraction; double vector_percent; int jump; int min_count; int min_kmer; bool usepairedends; bool forcesinglereads; int maxkmercount; int max_kmer_paired = 0; vector sra_list; vector fasta_list; vector fastq_list; bool allow_snps; bool estimate_min_count = true; options_description general("General options"); general.add_options() ("help,h", "Produce help message") ("version,v", "Print version") ("cores", value()->default_value(0), "Number of cores to use (default all) [integer]") ("memory", value()->default_value(32), "Memory available (GB, only for sorted counter) [integer]") ("hash_count", "Use hash counter [flag]") ("estimated_kmers", value()->default_value(100), "Estimated number of unique kmers for bloom filter (M, only for hash counter) [integer]") ("skip_bloom_filter", "Don't do bloom filter; use --estimated_kmers as the hash table size (only for hash counter) [flag]"); options_description input("Input/output options : at least one input providing reads for assembly must be specified"); input.add_options() ("fasta", value>(), "Input fasta file(s) (could be used multiple times for different runs) [string]") ("fastq", value>(), "Input fastq file(s) (could be used multiple times for different runs) [string]") ("use_paired_ends", "Indicates that a single (not comma separated) fasta/fastq file contains paired reads [flag]") #ifndef NO_NGS ("sra_run", value>(), "Input sra run accession (could be used multiple times for different runs) [string]") #endif ("contigs_out", value(), "Output file for contigs (stdout if not specified) [string]"); options_description assembly("Assembly options"); assembly.add_options() ("kmer", value()->default_value(21), "Minimal kmer length for assembly [integer]") ("min_count", value(), "Minimal count for kmers retained for comparing alternate choices [integer]") ("max_kmer_count", value(), "Minimum acceptable average count for estimating the maximal 
kmer length in reads [integer]") ("vector_percent", value()->default_value(0.05, "0.05"), "Count for vectors as a fraction of the read number (1. disables) [float (0,1]]") ("insert_size", value(), "Expected insert size for paired reads (if not provided, it will be estimated) [integer]") ("steps", value()->default_value(11), "Number of assembly iterations from minimal to maximal kmer length in reads [integer]") ("fraction", value()->default_value(0.1, "0.1"), "Maximum noise to signal ratio acceptable for extension [float [0,1)]") ("max_snp_len", value()->default_value(150), "Maximal snp length [integer]") ("min_contig", value()->default_value(200), "Minimal contig length reported in output [integer]") ("allow_snps", "Allow additional step for snp discovery [flag]"); options_description debug("Debugging options"); debug.add_options() ("force_single_ends", "Don't use paired-end information [flag]") ("seeds", value(), "Input file with seeds [string]") ("all", value(), "Output fasta for each iteration [string]") ("dbg_out", value(), "Output kmer file [string]") ("hist", value(), "File for histogram [string]") ("connected_reads", value(), "File for connected paired reads [string]"); options_description deprecated(""); deprecated.add_options() ("gz", "Input fasta/fastq files are gzipped [flag]"); options_description all(""); all.add(general).add(input).add(assembly).add(debug).add(deprecated); options_description visible(""); visible.add(general).add(input).add(assembly).add(debug); try { variables_map argm; // boost arguments store(parse_command_line(argc, argv, all), argm); notify(argm); if(argm.count("gz")) cerr << "WARNING: option --gz is deprecated - gzipped files are now recognized automatically" << endl; if(argm.count("help")) { #ifdef SVN_REV cout << "SVN revision:" << SVN_REV << endl << endl; #endif cout << visible << "\n"; return 0; } if(argm.count("version")) { cout << "SKESA v.2.3.0"; #ifdef SVN_REV cout << "-SVN_" << SVN_REV; #endif cout << endl; return 0; } 
if(!argm.count("fasta") && !argm.count("fastq") #ifndef NO_NGS && !argm.count("sra_run") #endif ) { cerr << "Provide some input reads" << endl; cerr << visible << "\n"; return 1; } #ifndef NO_NGS if(argm.count("sra_run")) { sra_list = argm["sra_run"].as>(); unsigned num = sra_list.size(); sort(sra_list.begin(), sra_list.end()); sra_list.erase(unique(sra_list.begin(),sra_list.end()), sra_list.end()); if(sra_list.size() != num) cerr << "WARNING: duplicate input entries were removed from SRA run list" << endl; } #endif if(argm.count("fasta")) { fasta_list = argm["fasta"].as>(); unsigned num = fasta_list.size(); sort(fasta_list.begin(), fasta_list.end()); fasta_list.erase(unique(fasta_list.begin(),fasta_list.end()), fasta_list.end()); if(fasta_list.size() != num) cerr << "WARNING: duplicate input entries were removed from fasta file list" << endl; } if(argm.count("fastq")) { fastq_list = argm["fastq"].as>(); unsigned num = fastq_list.size(); sort(fastq_list.begin(), fastq_list.end()); fastq_list.erase(unique(fastq_list.begin(),fastq_list.end()), fastq_list.end()); if(fastq_list.size() != num) cerr << "WARNING: duplicate input entries were removed from fastq file list" << endl; } allow_snps = argm.count("allow_snps"); ncores = thread::hardware_concurrency(); if(argm["cores"].as()) { int nc = argm["cores"].as(); if(nc < 0) { cerr << "Value of --cores must be >= 0" << endl; exit(1); } else if(nc > ncores) { cerr << "WARNING: number of cores was reduced to the hardware limit of " << ncores << " cores" << endl; } else if(nc > 0) { ncores = nc; } } steps = argm["steps"].as(); if(steps <= 0) { cerr << "Value of --steps must be > 0" << endl; exit(1); } fraction = argm["fraction"].as(); if(fraction >= 1.) { cerr << "Value of --fraction must be < 1 (more than 0.25 is not recommended)" << endl; exit(1); } if(fraction < 0.) 
{ cerr << "Value of --fraction must be >= 0" << endl; exit(1); } jump = argm["max_snp_len"].as(); if(jump < 0) { cerr << "Value of --max_snp_len must be >= 0" << endl; exit(1); } if(argm.count("insert_size")) max_kmer_paired = argm["insert_size"].as(); min_count = 2; if(argm.count("min_count")) { min_count = argm["min_count"].as(); estimate_min_count = false; } if(min_count <= 0) { cerr << "Value of --min_count must be > 0" << endl; exit(1); } maxkmercount = 10; if(argm.count("max_kmer_count")) { maxkmercount = argm["max_kmer_count"].as(); estimate_min_count = false; } if(maxkmercount <= 0) { cerr << "Value of --max_kmer_count must be > 0" << endl; exit(1); } if(max_kmer_paired < 0) { cerr << "Value of --insert_size must be >= 0" << endl; exit(1); } min_kmer = argm["kmer"].as(); if(min_kmer < 21 || min_kmer%2 ==0) { cerr << "Kmer must be an odd number >= 21" << endl; return 1; } vector_percent = argm["vector_percent"].as(); if(vector_percent > 1.) { cerr << "Value of --vector_percent must be <= 1" << endl; exit(1); } if(vector_percent <= 0.) 
{ cerr << "Value of --vector_percent must be > 0" << endl; exit(1); } usepairedends = argm.count("use_paired_ends"); forcesinglereads = argm.count("force_single_ends"); TStrList seeds; if(argm.count("seeds")) { ifstream seeds_in; seeds_in.open(argm["seeds"].as()); if(!seeds_in.is_open()) { cerr << "Can't open file " << argm["seeds"].as() << endl; exit(1); } char c; if(!(seeds_in >> c)) { cerr << "Empty fasta file for seeds" << endl; } else if(c != '>') { cerr << "Invalid fasta file format in " << argm["seeds"].as() << endl; exit(1); } string record; while(getline(seeds_in, record, '>')) { size_t first_ret = min(record.size(),record.find('\n')); if(first_ret == string::npos) { cerr << "Invalid fasta file format in " << argm["seeds"].as() << endl; exit(1); } string sequence = record.substr(first_ret+1); sequence.erase(remove(sequence.begin(),sequence.end(),'\n'), sequence.end()); if(sequence.find_first_not_of("ACGTYRWSKMDVHBN") != string::npos) { cerr << "Invalid fasta file format in " << argm["seeds"].as() << endl; exit(1); } seeds.push_back(sequence); } } int low_count = max(min_count, 2); CReadsGetter readsgetter(sra_list, fasta_list, fastq_list, ncores, usepairedends); if(argm.count("hash_count")) { int estimated_kmer_num = argm["estimated_kmers"].as(); if(estimated_kmer_num <= 0) { cerr << "Value of --estimated_kmers must be > 0" << endl; exit(1); } bool skip_bloom_filter = argm.count("skip_bloom_filter"); if(vector_percent < 1.) 
{ readsgetter.ClipAdaptersFromReads_HashCounter(vector_percent, estimated_kmer_num, skip_bloom_filter); readsgetter.PrintAdapters(); } else { cerr << "Adapters clip is disabled" << endl; } CDBGAssembler assembler(fraction, jump, low_count, steps, min_count, min_kmer, forcesinglereads, max_kmer_paired, maxkmercount, ncores, readsgetter.Reads(), seeds, allow_snps, estimate_min_count, estimated_kmer_num, skip_bloom_filter); PrintRslt(assembler, argm); } else { int memory = argm["memory"].as(); if(memory <= 0) { cerr << "Value of --memory must be > 0" << endl; exit(1); } if(vector_percent < 1.) { readsgetter.ClipAdaptersFromReads_SortedCounter(vector_percent, memory); readsgetter.PrintAdapters(); } else { cerr << "Adapters clip is disabled" << endl; } CDBGAssembler assembler(fraction, jump, low_count, steps, min_count, min_kmer, forcesinglereads, max_kmer_paired, maxkmercount, ncores, readsgetter.Reads(), seeds, allow_snps, estimate_min_count, memory); PrintRslt(assembler, argm); } cerr << "DONE" << endl; exit(0); } catch (exception &e) { cerr << endl << e.what() << endl; exit(1); } return 0; }