idba-1.1.3/0000775000175000017500000000000012740767035007431 500000000000000idba-1.1.3/src/0000775000175000017500000000000012740767035010220 500000000000000idba-1.1.3/src/graph/0000775000175000017500000000000012740767035011321 500000000000000idba-1.1.3/src/graph/hash_graph_path.h0000664000175000017500000000460512677406270014537 00000000000000/** * @file hash_graph_path.h * @brief HashGraphPath Class. * @author Yu Peng (ypeng@cs.hku.hk) * @version 1.0.4 * @date 2011-09-21 */ #ifndef __GRAPH_HASH_GRAPH_PATH_H_ #define __GRAPH_HASH_GRAPH_PATH_H_ #include "basic/kmer.h" #include "graph/hash_graph.h" #include /** * @brief It is a path of k-mers in de Bruijn graph (HashGraph). */ class HashGraphPath { public: HashGraphPath() {} HashGraphPath(const HashGraphPath &path) : vertices_(path.vertices_) {} const HashGraphPath &operator =(const HashGraphPath &path) { vertices_ = path.vertices_; return *this; } HashGraphVertexAdaptor &operator [](uint32_t index) { return vertices_[index]; } const HashGraphVertexAdaptor &operator [](uint32_t index) const { return vertices_[index]; } void Append(const HashGraphVertexAdaptor &vertex) { vertices_.push_back(vertex); } void Pop() { vertices_.pop_back(); } const HashGraphPath &ReverseComplement() { std::reverse(vertices_.begin(), vertices_.end()); for (unsigned i = 0; i < vertices_.size(); ++i) vertices_[i].ReverseComplement(); return *this; } bool IsSimplePath() const { for (unsigned i = 1; i+1 < vertices_.size(); ++i) { if (vertices_[i].out_edges().size() != 1) return false; if (vertices_[i].in_edges().size() != 1) return false; } return true; } void swap(HashGraphPath &path) { if (this != &path) vertices_.swap(path.vertices_); } uint64_t kmer_count() { uint64_t sum = 0; for (unsigned i = 0; i < vertices_.size(); ++i) sum += vertices_[i].count(); return sum; } HashGraphVertexAdaptor &front() { return vertices_.front(); } const HashGraphVertexAdaptor &front() const { return vertices_.front(); } HashGraphVertexAdaptor &back() { return 
vertices_.back(); } const HashGraphVertexAdaptor &back() const { return vertices_.back(); } uint32_t size() const { if (vertices_.empty()) return 0; else return vertices_[0].kmer().size() + vertices_.size() - 1; } uint32_t num_nodes() const { return vertices_.size(); } void clear() { vertices_.clear(); } private: std::deque vertices_; }; #endif idba-1.1.3/src/graph/hash_graph_branch_group.cpp0000664000175000017500000000627012677406270016607 00000000000000/** * @file hash_graph_branch_group.cpp * @brief * @author Yu Peng (ypeng@cs.hku.hk) * @version 1.0.4 * @date 2011-09-21 */ #include "hash_graph_branch_group.h" #include #include #include using namespace std; bool HashGraphBranchGroup::Search() { branches_.reserve(max_branches_); HashGraphPath path; path.Append(begin_); branches_.push_back(path); if (begin_.in_edges().size() != 1 || begin_.out_edges().size() <= 1 || begin_.out_edges().size() > max_branches_) return false; bool is_converge = false; for (int k = 1; k < max_length_; ++k) { int num_branches = branches_.size(); for (int i = 0; i < num_branches; ++i) { HashGraphVertexAdaptor current = branches_[i].back(); if (current.out_edges().size() == 0) return false; bool is_first = true; HashGraphPath path = branches_[i]; for (int x = 0; x < 4; ++x) { if (current.out_edges()[x]) { Kmer kmer = current.kmer(); kmer.ShiftAppend(x); HashGraphVertexAdaptor next = hash_graph_->FindVertexAdaptor(kmer); if (next.status().IsDead()) return false; if (is_first) { branches_[i].Append(next); is_first = false; } else { if ((int)branches_.size() == max_branches_) return false; path.Append(next); branches_.push_back(path); path.Pop(); } } } } end_ = branches_[0].back(); if (end_.out_edges().size() == 1) { is_converge = true; for (unsigned i = 1; i < branches_.size(); ++i) { if (branches_[i].back() != end_) { is_converge = false; break; } } if (is_converge) break; } } return is_converge && begin_ != end_; } void HashGraphBranchGroup::Merge() { unsigned best = 0; for (unsigned i = 
1; i < branches_.size(); ++i) { if (branches_[i].kmer_count() > branches_[best].kmer_count()) best = i; } int kmer_size = begin_.kmer_size(); for (unsigned i = 0; i < branches_.size(); ++i) { HashGraphPath &path = branches_[i]; path.front().out_edges() = 0; path.back().in_edges() = 0; for (unsigned j = 1; j+1 < path.num_nodes(); ++j) { path[j].in_edges() = 0; path[j].out_edges() = 0; path[j].status().SetDeadFlag(); } } HashGraphPath &path = branches_[best]; for (unsigned j = 1; j+1 < path.num_nodes(); ++j) path[j].status().ResetDeadFlag(); for (unsigned j = 0; j+1 < path.num_nodes(); ++j) { hash_graph_->AddEdge(path[j], path[j+1].kmer()[kmer_size-1]); } } idba-1.1.3/src/graph/contig_graph_vertex.h0000664000175000017500000001733212677406270015461 00000000000000/** * @file contig_graph_vertex.h * @brief ContigGraphVertex Class and ContigGraphVertexAdaptor Class. * @author Yu Peng (ypeng@cs.hku.hk) * @version 1.0.0 * @date 2011-08-16 */ #ifndef __GRAPH_CONTIG_GRAPH_VERTEX_H_ #define __GRAPH_CONTIG_GRAPH_VERTEX_H_ #include #include #include #include "basic/kmer.h" #include "graph/bit_edges.h" #include "graph/contig_info.h" #include "graph/vertex_status.h" #include "sequence/sequence.h" /** * @brief It is the vertex class used in ContigGraph class. 
*/ class ContigGraphVertex { public: explicit ContigGraphVertex(const Sequence &contig = Sequence(), const ContigInfo &contig_info = ContigInfo()) : contig_(contig), contig_info_(contig_info) {} ContigGraphVertex(const ContigGraphVertex &x) : contig_(x.contig_), id_(x.id_), status_(x.status_), contig_info_(x.contig_info_) {} const ContigGraphVertex &operator =(const ContigGraphVertex &x) { if (this != &x) { contig_ = x.contig_; id_ = x.id_; contig_info_ = x.contig_info_; } return *this; } const Sequence &contig() const { return contig_; } void set_contig(const Sequence &contig) { contig_ = contig; } uint32_t contig_size() const { return contig_.size(); } uint32_t num_kmer() const { return contig_.size() - kmer_size() + 1; } const ContigInfo &contig_info() const { return contig_info_; } void set_contig_info(const ContigInfo &contig_info) { contig_info_ = contig_info; } uint64_t kmer_count() const { return contig_info_.kmer_count(); } void set_kmer_count(uint64_t kmer_count) { contig_info_.set_kmer_count(kmer_count); } uint32_t id() const { return id_; } void set_id(uint32_t id) { id_ = id; } uint32_t kmer_size() const { return contig_info_.kmer_size(); } void set_kmer_size(uint32_t kmer_size) { contig_info_.set_kmer_size(kmer_size); } VertexStatus &status() { return status_; } const VertexStatus &status() const { return status_; } BitEdges &in_edges() { return contig_info_.in_edges(); } const BitEdges &in_edges() const { return contig_info_.in_edges(); } BitEdges &out_edges() { return contig_info_.out_edges(); } const BitEdges &out_edges() const { return contig_info_.out_edges(); } Kmer begin_kmer(int kmer_size) const { return contig_.GetKmer(0, kmer_size); } Kmer end_kmer(int kmer_size) const { return contig_.GetKmer(contig_.size() - kmer_size, kmer_size); } double coverage() const { return 1.0 * contig_info_.kmer_count() / (contig_size() - kmer_size() + 1); } const SequenceCount &counts() const { return contig_info_.counts(); } void set_counts(const SequenceCount 
&counts) { contig_info_.set_counts(counts); } char get_base(uint32_t index) const { return contig_[index]; } SequenceCountUnitType get_count(uint32_t index) const { return contig_info_.counts()[index]; } void swap(ContigGraphVertex &x) { if (this != &x) { contig_.swap(x.contig_); std::swap(id_, x.id_); status_.swap(x.status_); contig_info_.swap(x.contig_info_); } } void clear() { contig_.clear(); id_ = 0; status_.clear(); contig_info_.clear(); } private: Sequence contig_; uint32_t id_; VertexStatus status_; ContigInfo contig_info_; }; /** * @brief It is a adaptor class used to access ContigGraphVertex. Becase a contig and its * reverse complement share the same vertex, using adaptor makes sure that modification to * the vertex consistant. */ class ContigGraphVertexAdaptor { public: explicit ContigGraphVertexAdaptor(ContigGraphVertex *vertex = NULL, bool is_reverse = false) { vertex_ = vertex; is_reverse_ = is_reverse; } ContigGraphVertexAdaptor(const ContigGraphVertexAdaptor &x) { vertex_ = x.vertex_, is_reverse_ = x.is_reverse_; } const ContigGraphVertexAdaptor &operator =(const ContigGraphVertexAdaptor &x) { vertex_ = x.vertex_; is_reverse_ = x.is_reverse_; return *this; } bool operator <(const ContigGraphVertexAdaptor &x) const { return (vertex_ != x.vertex_) ? (vertex_ < x.vertex_) : (is_reverse_ < x.is_reverse_); } bool operator >(const ContigGraphVertexAdaptor &x) const { return (vertex_ != x.vertex_) ? (vertex_ > x.vertex_) : (is_reverse_ > x.is_reverse_); } bool operator ==(const ContigGraphVertexAdaptor &x) const { return vertex_ == x.vertex_ && is_reverse_ == x.is_reverse_; } bool operator !=(const ContigGraphVertexAdaptor &x) const { return vertex_ != x.vertex_ || is_reverse_ != x.is_reverse_; } const ContigGraphVertexAdaptor &ReverseComplement() { is_reverse_ = !is_reverse_; return *this; } Sequence contig() const { Sequence contig = vertex_->contig(); return !is_reverse_ ? 
contig : contig.ReverseComplement(); } uint32_t contig_size() const { return vertex_->contig().size(); } uint32_t num_kmer() const { return vertex_->num_kmer(); } void set_vertex(ContigGraphVertex *vertex, bool is_reverse) { vertex_ = vertex; is_reverse_ = is_reverse; } ContigInfo contig_info() const { ContigInfo contig_info = vertex_->contig_info(); return (!is_reverse_ ? contig_info : contig_info.ReverseComplement()); } uint64_t kmer_size() const { return vertex_->kmer_size(); } void set_kmer_size(uint64_t kmer_size) { vertex_->set_kmer_size(kmer_size); } uint64_t kmer_count() const { return vertex_->kmer_count(); } void set_kmer_count(uint64_t kmer_count) { vertex_->set_kmer_count(kmer_count); } uint32_t id() const { return vertex_->id(); } void set_id(uint32_t id) { vertex_->set_id(id); } VertexStatus &status() { return vertex_->status(); } const VertexStatus &status() const { return vertex_->status(); } BitEdges &in_edges() { return !is_reverse_ ? vertex_->in_edges() : vertex_->out_edges(); } const BitEdges &in_edges() const { return !is_reverse_ ? vertex_->in_edges() : vertex_->out_edges(); } BitEdges &out_edges() { return !is_reverse_ ? vertex_->out_edges() : vertex_->in_edges(); } const BitEdges &out_edges() const { return !is_reverse_ ? vertex_->out_edges() : vertex_->in_edges(); } SequenceCount counts() { if (!is_reverse_) return vertex_->counts(); else { SequenceCount counts = vertex_->counts(); std::reverse(counts.begin(), counts.end()); return counts; } } char get_base(uint32_t index) const { return (!is_reverse_) ? vertex_->get_base(index) : 3 - vertex_->get_base(contig_size() - 1 - index); } SequenceCountUnitType get_count(uint32_t index) const { return (!is_reverse_) ? vertex_->get_count(index) : vertex_->get_count(vertex_->counts().size() - 1 - index); } Kmer begin_kmer(int kmer_size) const { return !is_reverse_ ? 
vertex_->begin_kmer(kmer_size) : vertex_->end_kmer(kmer_size).ReverseComplement(); } Kmer end_kmer(int kmer_size) const { return !is_reverse_ ? vertex_->end_kmer(kmer_size) : vertex_->begin_kmer(kmer_size).ReverseComplement(); } double coverage() const { return vertex_->coverage(); } bool is_reverse() const { return is_reverse_; } void swap(ContigGraphVertexAdaptor &x) { if (this != &x) { std::swap(vertex_, x.vertex_); std::swap(is_reverse_, x.is_reverse_); } } bool is_null() const { return vertex_ == NULL; } void clear() { vertex_->clear(); } private: ContigGraphVertex *vertex_; bool is_reverse_; }; namespace std { template <> inline void swap(ContigGraphVertex &x, ContigGraphVertex &y) { x.swap(y); } template <> inline void swap(ContigGraphVertexAdaptor &x, ContigGraphVertexAdaptor &y) { x.swap(y); } } #endif idba-1.1.3/src/graph/scaffold_graph.h0000664000175000017500000002241512677406270014360 00000000000000/** * @file scaffold_graph.h * @brief ScaffoldGraph Class. * @author Yu Peng (ypeng@cs.hku.hk) * @version 1.0.1 * @date 2011-09-02 */ #ifndef __GRAPH_SCAFFOLD_GRAPH_H_ #define __GRAPH_SCAFFOLD_GRAPH_H_ #include #include #include #include #include #include "graph/contig_graph.h" #include "graph/scaffold_graph_path.h" #include "graph/scaffold_graph_vertex.h" /** * @brief It is a class for storing position of a pair of reads. */ class ScaffoldGraphPair { public: ScaffoldGraphPair() {} ScaffoldGraphPair(int level, int from, int to, int distance) : level_(level), from_(from), to_(to), distance_(distance) {} int level() { return level_; } int from() { return from_; } int to() { return to_; } int distance() { return distance_; } private: int level_; int from_; int to_; int distance_; }; /** * @brief It is the edge in ScaffoldGraph. 
*/ class ScaffoldGraphEdge { public: ScaffoldGraphEdge() {} ScaffoldGraphEdge(int level, ScaffoldGraphVertexAdaptor from, ScaffoldGraphVertexAdaptor to, int d) { from_ = from; to_ = to; values_.push_back(d); level_ = level; } ScaffoldGraphVertexAdaptor from() const { return from_; } ScaffoldGraphVertexAdaptor to() const { return to_; } std::vector &values() { return values_; } const std::vector &values() const { return values_; } int distance() const { return distance_; } int level() const { return level_; } VertexStatus &status() { return status_; } const VertexStatus &status() const { return status_; } void Parse() { std::sort(values_.begin(), values_.end()); if (values_.empty()) distance_ = (1 << 30); else distance_ = values_[values_.size()/2]; } private: ScaffoldGraphVertexAdaptor from_; ScaffoldGraphVertexAdaptor to_; std::vector values_; int distance_; int level_; VertexStatus status_; }; /** * @brief It is an adaptor class for access ScaffoldGraphEdge. Because the edge * an its reverse complement share the same edge instance, using adaptor makes * sure the modification of edge consistant. */ class ScaffoldGraphEdgeAdaptor { public: explicit ScaffoldGraphEdgeAdaptor(ScaffoldGraphEdge *edge = NULL, bool is_reverse = false) { edge_ = edge; is_reverse_ = is_reverse; } ScaffoldGraphEdgeAdaptor(const ScaffoldGraphEdgeAdaptor &x) { edge_ = x.edge_, is_reverse_ = x.is_reverse_; } ScaffoldGraphVertexAdaptor from() const { return !is_reverse_ ? edge_->from() : edge_->to().ReverseComplement(); } ScaffoldGraphVertexAdaptor to() const { return !is_reverse_ ? 
edge_->to() : edge_->from().ReverseComplement(); } std::vector &values() { return edge_->values(); } const std::vector &values() const { return edge_->values(); } int distance() const { return edge_->distance(); } int level() const { return edge_->level(); } VertexStatus &status() { return edge_->status(); } const VertexStatus &status() const { return edge_->status(); } const ScaffoldGraphEdgeAdaptor &ReverseComplement() { is_reverse_ = !is_reverse_; return *this; } void Parse() { edge_->Parse(); } private: ScaffoldGraphEdge *edge_; bool is_reverse_; }; /** * @brief It is a contig graph built upon paired-end reads information. Each * vertex is a contg, maybe a path of contigs in ContigGraph, and each edge * between vertex u and vertex v means there are at least min_pairs paired-end * reads connecting u and v. */ class ScaffoldGraph { public: explicit ScaffoldGraph(uint32_t kmer_size = 0) : contig_graph_(kmer_size), min_pairs_(5) { Initialize(); } explicit ScaffoldGraph(uint32_t kmer_size, const std::deque &contigs) : contig_graph_(kmer_size, contigs), min_pairs_(5) { Initialize(); } explicit ScaffoldGraph(uint32_t kmer_size, const std::deque &contigs, const std::deque &contig_infos) : contig_graph_(kmer_size, contigs, contig_infos), min_pairs_(5) { Initialize(); } ~ScaffoldGraph() { clear(); } void Initialize(); void Initialize(std::deque &paths); void BuildContigToScaffoldMap(); void BuildEdges(); void RefreshEdges(); void ParseEdges(bool is_uneven = false); void FilterEdges(int min_pairs, int min_length); void ClearStatus(); bool IsConnected(int level, ScaffoldGraphVertexAdaptor from, ScaffoldGraphVertexAdaptor to); int64_t RemoveTransitiveConnections(int level); bool IsConsistent(int level, ScaffoldGraphVertexAdaptor current); bool IsConsistentMulti(int level, ScaffoldGraphVertexAdaptor current); bool ExtendPath(int level, ScaffoldGraphPath &scaffold_path); bool ExtendPathMulti(int level, ScaffoldGraphPath &scaffold_path); int64_t Assemble(int level, std::deque 
&paths); int64_t Assemble(int level, std::deque &contigs); int64_t AssembleMulti(int level, std::deque &paths); int64_t AssembleMulti(int level, std::deque &contigs); void AddPair(int level, int from, int to, int d) { pairs_.push_back(ScaffoldGraphPair(level, from, to, d)); } void AddEdge(int level, ScaffoldGraphVertexAdaptor from, ScaffoldGraphVertexAdaptor to, int d) { std::deque &all_edges = GetEdges(from); for (unsigned i = 0; i < all_edges.size(); ++i) { if (all_edges[i].level() == level && all_edges[i].to() == to) { all_edges[i].values().push_back(d); return; } } AddNewEdge(level, from, to, d); } void AddNewEdge(int level, ScaffoldGraphVertexAdaptor from, ScaffoldGraphVertexAdaptor to, int d) { ScaffoldGraphEdge edge(level, from, to, d); edge_data_.push_back(edge); ScaffoldGraphEdgeAdaptor adp(&edge_data_.back()); GetEdges(adp.from()).push_back(adp); adp.ReverseComplement(); GetEdges(adp.from()).push_back(adp); } std::deque GetEdges(int level, ScaffoldGraphVertexAdaptor current) { std::deque edges; std::deque &all_edges = GetEdges(current); for (unsigned i = 0; i < all_edges.size(); ++i) { if (all_edges[i].level() == level && !all_edges[i].status().IsDead()) edges.push_back(all_edges[i]); } return edges; } std::deque &GetEdges(ScaffoldGraphVertexAdaptor current) { return !current.is_reverse() ? 
out_edges_[current.id()] : in_edges_[current.id()]; } std::deque &vertices() { return vertices_; } const std::deque &vertices() const { return vertices_; } ContigGraph &contig_graph() { return contig_graph_; } const ContigGraph &contig_graph() const { return contig_graph_; } int num_edges(int level) const { int count = 0; for (unsigned i = 0; i < edge_data_.size(); ++i) { if (edge_data_[i].status().IsDead()) continue; if (edge_data_[i].level() != level) continue; ++count; } return count; } int min_pairs() const { return min_pairs_; } void set_min_pairs(int min_pairs) { min_pairs_ = min_pairs; } int kmer_size() const { return contig_graph_.kmer_size(); } int read_length(int level) const { return read_length_[level]; } double expected_coverage(int level) const { return expected_coverage_[level]; } double mean(int level) const { return mean_[level]; } double sd(int level) const { return sd_[level]; } int num_level() const { return read_length_.size(); } void set_library_info(int level, int read_length, double coverage, double mean, double sd) { if ((int)read_length_.size() < level + 1) { read_length_.resize(level+1); expected_coverage_.resize(level+1); mean_.resize(level+1); sd_.resize(level+1); } read_length_[level] = read_length; expected_coverage_[level] = coverage; mean_[level] = mean; sd_[level] = sd; } void clear() { contig_graph_.clear(); } private: ScaffoldGraph(const ScaffoldGraph &); const ScaffoldGraph &operator =(const ScaffoldGraph &); double ExpectedEdges(int level, int len1, int len2, int distance); double ExpectedEdges(int level, int len1, int len2, int distance, double expected_coverage); static const int kTimeLimit = 500; ContigGraph contig_graph_; std::deque vertices_; std::map contig_to_scaffold_; std::map contig_to_scaffold_position_; std::deque > in_edges_; std::deque > out_edges_; std::deque edge_data_; std::deque pairs_; int min_pairs_; std::vector read_length_; std::vector expected_coverage_; std::vector mean_; std::vector sd_; }; #endif 
idba-1.1.3/src/graph/contig_info.h0000664000175000017500000000630412677406270013713 00000000000000/** * @file contig_info.h * @brief ContigInfo Class. * @author Yu Peng (ypeng@cs.hku.hk) * @version 1.0.0 * @date 2011-08-26 */ #ifndef __GRAPH_CONTIG_INFO_H_ #define __GRAPH_CONTIG_INFO_H_ #include "graph/bit_edges.h" #include #include #include #include #include typedef uint32_t SequenceCountUnitType; typedef std::basic_string SequenceCount; class ContigBuilder; /** * @brief It is used to store information of contigs, like k-mer counts, in-edges, * out-edges, etc. */ class ContigInfo { friend class ContigBuilder; friend std::istream &operator >>(std::istream &is, ContigInfo &contig_info); friend std::ostream &operator <<(std::ostream &os, const ContigInfo &contig_info); public: ContigInfo() { kmer_count_ = 0; kmer_size_ = 0; } ContigInfo(const ContigInfo &contig_info) { in_edges_ = contig_info.in_edges_; out_edges_ = contig_info.out_edges_; kmer_count_ = contig_info.kmer_count_; kmer_size_ = contig_info.kmer_size_; counts_ = contig_info.counts_; } const ContigInfo &operator =(const ContigInfo &contig_info) { in_edges_ = contig_info.in_edges_; out_edges_ = contig_info.out_edges_; kmer_count_ = contig_info.kmer_count_; kmer_size_ = contig_info.kmer_size_; counts_ = contig_info.counts_; return *this; } const ContigInfo &ReverseComplement() { std::swap(in_edges_, out_edges_); std::reverse(counts_.begin(), counts_.end()); return *this; } BitEdges &in_edges() { return in_edges_; } const BitEdges &in_edges() const { return in_edges_; } BitEdges &out_edges() { return out_edges_; } const BitEdges &out_edges() const { return out_edges_; } uint32_t kmer_size() const { return kmer_size_; } void set_kmer_size(uint32_t kmer_size) { kmer_size_ = kmer_size; } uint32_t kmer_count() const { return kmer_count_; } void set_kmer_count(uint32_t kmer_count) { kmer_count_ = kmer_count; } const SequenceCount &counts() const { return counts_; } void set_counts(const SequenceCount &counts) { 
counts_ = counts; } void swap(ContigInfo &contig_info) { if (this != &contig_info) { std::swap(in_edges_, contig_info.in_edges_); std::swap(out_edges_, contig_info.out_edges_); std::swap(kmer_size_, contig_info.kmer_size_); std::swap(kmer_count_, contig_info.kmer_count_); counts_.swap(contig_info.counts_); } } void clear() { in_edges_ = 0; out_edges_ = 0; kmer_size_ = 0; kmer_count_ = 0; counts_.clear(); } private: BitEdges in_edges_; BitEdges out_edges_; uint16_t kmer_size_; uint32_t kmer_count_; SequenceCount counts_; }; namespace std { template <> inline void swap(ContigInfo &x, ContigInfo &y) { x.swap(y); } } std::istream &operator >>(std::istream &is, ContigInfo &contig_info); std::ostream &operator <<(std::ostream &os, const ContigInfo &contig_info); void ReadContigInfo(const std::string &filename, std::deque &contig_infos); void WriteContigInfo(const std::string &filename, const std::deque &contig_infos); #endif idba-1.1.3/src/graph/hash_graph.h0000664000175000017500000003100112740766272013513 00000000000000/** * @file hash_graph.h * @brief HashGraph Class. * @author Yu Peng (ypeng@cs.hku.hk) * @version 1.0.0 * @date 2011-08-05 */ #ifndef __GRAPH_HASH_GRAPH_H_ #define __GRAPH_HASH_GRAPH_H_ #include #include #include #include #include "basic/bit_operation.h" #include "basic/histgram.h" #include "basic/kmer.h" #include "container/hash_table.h" #include "graph/contig_info.h" #include "graph/hash_graph_vertex.h" #include "graph/hash_graph_path.h" #include "sequence/sequence.h" class Kmer; class Sequence; class ShortSequence; class CompactSequence; /** * @brief It is a hash table based de Bruijn graph implementation. 
*/ class HashGraph { class RefreshVerticesFunc; class RefreshEdgesFunc; public: friend std::istream &operator >>(std::istream &is, HashGraph &hash_graph); friend std::ostream &operator <<(std::ostream &os, HashGraph &hash_graph); typedef HashTable vertex_table_type; typedef vertex_table_type::iterator iterator; explicit HashGraph(uint32_t kmer_size = 0) { set_kmer_size(kmer_size); num_edges_ = 0; } ~HashGraph() {} iterator begin() { return vertex_table_.begin(); } iterator end() { return vertex_table_.end(); } HashGraphVertex *InsertVertex(const Kmer &kmer, int count = 1) { Kmer key = kmer.unique_format(); HashGraphVertex &vertex = vertex_table_.find_or_insert(HashGraphVertex(key)); vertex.count() += count; return &vertex; } HashGraphVertex *InsertVertex(const HashGraphVertex &vertex) { return &vertex_table_.find_or_insert(vertex); } HashGraphVertex *FindVertex(const Kmer &kmer) { Kmer key = kmer.unique_format(); vertex_table_type::iterator p = vertex_table_.find(key); return (p != vertex_table_.end()) ? &*p : NULL; } const HashGraphVertex *FindVertex(const Kmer &kmer) const { Kmer key = kmer.unique_format(); vertex_table_type::const_iterator p = vertex_table_.find(key); return (p != vertex_table_.end()) ? &*p : NULL; } HashGraphVertexAdaptor FindVertexAdaptor(const Kmer &kmer) { Kmer key = kmer.unique_format(); vertex_table_type::iterator p = vertex_table_.find(key); return ((p != vertex_table_.end()) ? 
HashGraphVertexAdaptor(&*p, kmer != key) : HashGraphVertexAdaptor(NULL)); } HashGraphVertexAdaptor GetNeighbor(const HashGraphVertexAdaptor ¤t, int x) { Kmer kmer = current.kmer(); kmer.ShiftAppend(x); return FindVertexAdaptor(kmer); } int64_t InsertKmers(const Sequence &seq) { return InsertKmersWithPrefix(seq, 0, 0); } int64_t InsertKmersWithPrefix(const Sequence &seq, uint64_t prefix, uint64_t umask); int64_t InsertUncountKmers(const Sequence &seq); int64_t InsertInternalKmers(const Sequence &seq, int min_count = 0); int64_t InsertEdges(const Sequence &seq); int64_t InsertExistKmers(const Sequence &seq); int64_t RemoveKmers(const Sequence &seq); void RemoveEdge(HashGraphVertexAdaptor &node, int x) { node.out_edges().Remove(x); Kmer kmer = node.kmer(); kmer.ShiftAppend(x); HashGraphVertexAdaptor next = FindVertexAdaptor(kmer); if (!next.is_null()) next.in_edges().Remove(3 - node.kmer()[0]); } void AddEdge(HashGraphVertexAdaptor &node, int x) { node.out_edges().Add(x); Kmer kmer = node.kmer(); kmer.ShiftAppend(x); HashGraphVertexAdaptor next = FindVertexAdaptor(kmer); if (!next.is_null()) next.in_edges().Add(3 - node.kmer()[0]); } void BackupEdges() { BackupEdgesFunc func; vertex_table_.for_each(func); } void RestoreAndMergeEdges() { RestoreAndMergeEdgesFunc func; vertex_table_.for_each(func); } void AddAllEdges() { AddAllEdgesFunc func; vertex_table_.for_each(func); RefreshEdges(); } void ClearEdges() { ClearEdgesFunc func; vertex_table_.for_each(func); } void ClearStatus() { ClearStatusFunc func; vertex_table_.for_each(func); } void ClearCount() { ClearCountFunc func; vertex_table_.for_each(func); } void SetCountCap(int cap) { SetCountCapFunc func(cap); vertex_table_.for_each(func); } void Refresh(int min_count = 0) { RefreshVertices(min_count); RefreshEdges(); } int64_t RefreshVertices(int min_count = 0) { RefreshVerticesFunc func(min_count); return vertex_table_.remove_if(func); } void RefreshEdges() { RefreshEdgesFunc func(this); vertex_table_.for_each(func); 
num_edges_ = func.num_edges(); } int64_t ErodeEnd(int min_cover); int64_t Trim(int min_length); int64_t RemoveDeadEnd(int min_length); int64_t RemoveLowCoverage(double min_cover, int min_contig = (1 << 20)); int64_t RemoveBubble(); int64_t Assemble(std::deque &contigs); int64_t Assemble(std::deque &contigs, std::deque &contig_infos); // int64_t TrimSequentially(int min_length); // int64_t RemoveDeadEndSequentially(int min_length); // int64_t RemoveLowCoverageSequentially(double min_cover); // int64_t AssembleSequentially(std::deque &contigs); // int64_t AssembleSequentially(std::deque &contigs, std::deque &contig_infos); void reserve(uint64_t capacity) { vertex_table_.reserve(capacity); } uint32_t kmer_size() const { return kmer_size_; } void set_kmer_size(uint32_t kmer_size) { kmer_size_ = kmer_size; } Histgram coverage_histgram() const { CoverageHistgramFunc func; vertex_table_.for_each(func); return func.histgram(); } void swap(HashGraph &hash_graph) { if (this != &hash_graph) { vertex_table_.swap(hash_graph.vertex_table_); std::swap(kmer_size_, hash_graph.kmer_size_); std::swap(num_edges_, hash_graph.num_edges_); } } uint64_t num_bucket() const { return vertex_table_.bucket_count(); } uint64_t num_vertices() const { return vertex_table_.size(); } uint64_t num_edges() const { return num_edges_; } void clear() { vertex_table_.clear(); num_edges_ = 0; } private: #if __cplusplus >= 201103L HashGraph(const HashGraph &) = delete; const HashGraph &operator =(const HashGraph &) = delete; #else HashGraph(const HashGraph &); const HashGraph &operator =(const HashGraph &); #endif bool GetNextVertexAdaptor(const HashGraphVertexAdaptor ¤t, HashGraphVertexAdaptor &next) { if (current.out_edges().size() != 1) return false; Kmer kmer = current.kmer(); kmer.ShiftAppend(bit_operation::BitToIndex(current.out_edges())); next = FindVertexAdaptor(kmer); return !kmer.IsPalindrome() && next.in_edges().size() == 1; } bool IsLoop(const Sequence &contig, HashGraphVertexAdaptor &next) { 
Kmer kmer = next.kmer(); Kmer rev_comp = kmer; rev_comp.ReverseComplement(); return contig.GetKmer(0, kmer_size_) == kmer || contig.GetKmer(contig.size() - kmer_size_, kmer_size_) == rev_comp; } class BackupEdgesFunc { public: BackupEdgesFunc() {} void operator() (HashGraphVertex &vertex) { vertex.in_edges() = (vertex.in_edges() << 4) | (vertex.in_edges() & 15); vertex.out_edges() = (vertex.out_edges() << 4) | (vertex.out_edges() & 15); } }; class RestoreAndMergeEdgesFunc { public: RestoreAndMergeEdgesFunc() {} void operator() (HashGraphVertex &vertex) { vertex.in_edges() = ((unsigned)vertex.in_edges() >> 4) | (vertex.in_edges() & 15); vertex.out_edges() = ((unsigned)vertex.out_edges() >> 4) | (vertex.out_edges() & 15); } }; class AddAllEdgesFunc { public: AddAllEdgesFunc() {} void operator ()(HashGraphVertex &vertex) { vertex.in_edges() = 15; vertex.out_edges() = 15; } }; class ClearEdgesFunc { public: ClearEdgesFunc() {} void operator ()(HashGraphVertex &vertex) { vertex.in_edges() = 0; vertex.out_edges() = 0; } }; class ClearStatusFunc { public: ClearStatusFunc() {} void operator ()(HashGraphVertex &vertex) { vertex.status().clear(); } }; class ClearCountFunc { public: ClearCountFunc() {} void operator ()(HashGraphVertex &vertex) { vertex.count() = 0; } }; class SetCountCapFunc { public: SetCountCapFunc(int cap): cap_(cap) { } void operator ()(HashGraphVertex &vertex) { if (vertex.count() > cap_) vertex.count() = cap_; } private: int cap_; }; class RefreshVerticesFunc { public: explicit RefreshVerticesFunc(int min_count) : min_count_(min_count) {} bool operator ()(HashGraphVertex &vertex) const { if (vertex.count() < min_count_ || vertex.status().IsDead()) return true; return false; } private: int min_count_; }; class RefreshEdgesFunc { public: explicit RefreshEdgesFunc(HashGraph *hash_graph) { hash_graph_ = hash_graph; } void operator ()(HashGraphVertex &vertex) { HashGraphVertexAdaptor adaptor(&vertex); for (int strand = 0; strand < 2; ++strand) { Kmer kmer = 
adaptor.kmer(); for (int i = 0; i < 4; ++i) { if (adaptor.out_edges()[i]) { Kmer next = kmer; next.ShiftAppend(i); if (hash_graph_->FindVertex(next) == NULL) adaptor.out_edges().Remove(i); else total_degree_ += 1; } } adaptor.ReverseComplement(); } if ((vertex.kmer().size() & 1) == 0) vertex.FixPalindromeEdges(); } uint64_t num_edges() { return total_degree_ / 2; } private: HashGraph *hash_graph_; AtomicInteger total_degree_; }; class ErodeFunc { public: ErodeFunc(HashGraph *hash_graph, int min_cover) { hash_graph_ = hash_graph; min_cover_ = min_cover; } void operator ()(HashGraphVertex &vertex); private: HashGraph *hash_graph_; int min_cover_; }; class TrimFunc { public: TrimFunc(HashGraph *hash_graph, int min_length) { hash_graph_ = hash_graph; min_length_ = min_length; } void operator ()(HashGraphVertex &vertex); private: HashGraph *hash_graph_; int min_length_; }; class BubbleFunc { public: BubbleFunc(HashGraph *hash_graph) { hash_graph_ = hash_graph; omp_init_lock(&bubble_lock_); } ~BubbleFunc() { omp_destroy_lock(&bubble_lock_); } void operator ()(HashGraphVertex &vertex); std::deque &candidates() { return candidates_; } private: HashGraph *hash_graph_; std::deque candidates_; omp_lock_t bubble_lock_; }; class AssembleFunc { public: AssembleFunc(HashGraph *hash_graph) : hash_graph_(hash_graph) { omp_init_lock(&contig_lock_); } ~AssembleFunc() { omp_destroy_lock(&contig_lock_); } void operator ()(HashGraphVertex &vertex); std::deque &contigs() { return contigs_; } std::deque &contig_infos() { return contig_infos_; } private: HashGraph *hash_graph_; std::deque contigs_; std::deque contig_infos_; omp_lock_t contig_lock_; }; class CoverageHistgramFunc { public: void operator ()(HashGraphVertex &vertex) { histgram_.insert(vertex.count()); } const Histgram &histgram() { return histgram_; } private: Histgram histgram_; }; HashTable vertex_table_; uint32_t kmer_size_; uint64_t num_edges_; }; inline std::istream &operator >>(std::istream &is, HashGraph &hash_graph) { 
return is >> hash_graph.vertex_table_; } inline std::ostream &operator <<(std::ostream &os, HashGraph &hash_graph) { os << hash_graph.vertex_table_; hash_graph.RefreshEdges(); return os; } namespace std { inline void swap(HashGraph &x, HashGraph &y) { x.swap(y); } } #endif idba-1.1.3/src/graph/contig_builder.h0000664000175000017500000000562212677406270014410 00000000000000/** * @file contig_builder.h * @brief Contig Build Class which builds contig and related contig info. * @author Yu Peng (ypeng@cs.hku.hk) * @version 1.0.9 * @date 2011-12-27 */ #ifndef __GRAPH_CONTIG_BUILDER_H_ #define __GRAPH_CONTIG_BUILDER_H_ #include "graph/contig_graph_vertex.h" #include "graph/contig_info.h" #include "graph/hash_graph_vertex.h" #include "sequence/sequence.h" /** * @brief It is a builder class for building contigs. */ class ContigBuilder { public: ContigBuilder() {} explicit ContigBuilder(HashGraphVertexAdaptor x) { Append(x); } explicit ContigBuilder(ContigGraphVertexAdaptor x) { Append(x, 0); } void Append(HashGraphVertexAdaptor x) { if (contig_.size() == 0) { contig_.Assign(x.kmer()); contig_info_.in_edges_ = x.in_edges(); contig_info_.out_edges_ = x.out_edges(); contig_info_.kmer_size_ = x.kmer().size(); contig_info_.kmer_count_ = x.count(); contig_info_.counts_.resize(1); contig_info_.counts_[0] = x.count(); } else { contig_ += x.kmer()[x.kmer().size() - 1]; contig_info_.out_edges_ = x.out_edges(); contig_info_.kmer_count_ += x.count(); contig_info_.counts_ += x.count(); } } void Append(ContigGraphVertexAdaptor x, int d) { if (contig_.size() == 0) { contig_ = x.contig(); contig_info_.in_edges_ = x.in_edges(); contig_info_.out_edges_ = x.out_edges(); contig_info_.kmer_size_ = x.kmer_size(); contig_info_.kmer_count_ = x.kmer_count(); contig_info_.counts_ = x.counts(); } else { if (d <= 0) { contig_.Append(x.contig(), std::min(-d, (int)x.contig_size())); contig_info_.out_edges_ = x.out_edges(); contig_info_.kmer_count_ += x.kmer_count(); SequenceCount counts = x.counts(); 
contig_info_.counts_ += counts.substr(std::min(-d - contig_info_.kmer_size_ + 1, (int)counts.size())); } else { contig_.Append(d, 4); contig_.Append(x.contig()); contig_info_.out_edges_ = x.out_edges(); contig_info_.kmer_count_ += x.kmer_count(); contig_info_.counts_.append(d, 0); contig_info_.counts_ += x.counts(); } } } const ContigBuilder &ReverseComplement() { contig_.ReverseComplement(); contig_info_.ReverseComplement(); return *this; } const Sequence &contig() const { return contig_; } const ContigInfo &contig_info() const { return contig_info_; } void clear() { contig_.clear(); contig_info_.clear(); } private: Sequence contig_; ContigInfo contig_info_; }; #endif idba-1.1.3/src/graph/hash_graph_vertex.h0000664000175000017500000001236212677406270015117 00000000000000/** * @file hash_graph_vertex.h * @brief HashGraphVertex Class and HashGraphVertexAdaptor Class. * @author Yu Peng (ypeng@cs.hku.hk) * @version 1.0.0 * @date 2011-08-05 */ #ifndef __GRAPH_HASH_GRAPH_VERTEX_H_ #define __GRAPH_HASH_GRAPH_VERTEX_H_ #include #include "basic/atomic_integer.h" #include "basic/bit_operation.h" #include "basic/kmer.h" #include "graph/bit_edges.h" #include "graph/vertex_status.h" /** * @brief It is the vertex class used in HashGraph. 
*/
// NOTE(review): `AtomicInteger` is written without template arguments in this
// text; they may have been lost during extraction -- confirm against the
// original header before compiling.
class HashGraphVertex
{
public:
    explicit HashGraphVertex(const Kmer &kmer = Kmer())
        : kmer_(kmer)
    {}

    HashGraphVertex(const HashGraphVertex &x)
        : kmer_(x.kmer_),
          count_(x.count_),
          status_(x.status_),
          in_edges_(x.in_edges_),
          out_edges_(x.out_edges_)
    {}

    // Member-wise copy assignment.
    const HashGraphVertex &operator =(const HashGraphVertex &x)
    {
        kmer_ = x.kmer_;
        count_ = x.count_;
        status_ = x.status_;
        in_edges_ = x.in_edges_;
        out_edges_ = x.out_edges_;
        return *this;
    }

    // For a palindromic k-mer, both edge sets become the union of the two.
    void FixPalindromeEdges()
    {
        if (!kmer_.IsPalindrome())
            return;
        out_edges_ = in_edges_ = (in_edges_ | out_edges_);
    }

    // key()/set_key() and kmer()/set_kmer() access the same stored k-mer.
    const Kmer &key() const
    { return kmer_; }
    void set_key(const Kmer &key)
    { kmer_ = key; }

    const Kmer &kmer() const
    { return kmer_; }
    void set_kmer(const Kmer &kmer)
    { kmer_ = kmer; }

    AtomicInteger &count()
    { return count_; }
    const AtomicInteger &count() const
    { return count_; }

    VertexStatus &status()
    { return status_; }
    const VertexStatus &status() const
    { return status_; }

    BitEdges &in_edges()
    { return in_edges_; }
    const BitEdges &in_edges() const
    { return in_edges_; }

    BitEdges &out_edges()
    { return out_edges_; }
    const BitEdges &out_edges() const
    { return out_edges_; }

    // Exchanges every member with x; swapping with itself is a no-op.
    void swap(HashGraphVertex &x)
    {
        if (this == &x)
            return;

        kmer_.swap(x.kmer_);
        count_.swap(x.count_);
        status_.swap(x.status_);
        in_edges_.swap(x.in_edges_);
        out_edges_.swap(x.out_edges_);
    }

    uint32_t kmer_size() const
    { return kmer_.size(); }

    // Resets edges, status and count; the stored k-mer is left untouched.
    void clear()
    {
        in_edges_.clear();
        out_edges_.clear();
        status_.clear();
        count_ = 0;
    }

private:
    Kmer kmer_;
    AtomicInteger count_;
    VertexStatus status_;
    BitEdges in_edges_;
    BitEdges out_edges_;
};

/**
 * @brief It is an adaptor class used for accessing HashGraphVertex. Because
 * a k-mer and its reverse complement share the same vertex, using the adaptor
 * keeps access to the vertex consistent.
*/ class HashGraphVertexAdaptor { public: explicit HashGraphVertexAdaptor(HashGraphVertex *vertex = NULL, bool is_reverse = false) { vertex_ = vertex; is_reverse_ = is_reverse; } HashGraphVertexAdaptor(const HashGraphVertexAdaptor &x) { vertex_ = x.vertex_; is_reverse_ = x.is_reverse_; } const HashGraphVertexAdaptor &operator =(const HashGraphVertexAdaptor &x) { vertex_ = x.vertex_; is_reverse_ = x.is_reverse_; return *this; } bool operator <(const HashGraphVertexAdaptor &x) const { return (vertex_ != x.vertex_) ? (vertex_ < x.vertex_) : (is_reverse_ < x.is_reverse_); } bool operator >(const HashGraphVertexAdaptor &x) const { return (vertex_ != x.vertex_) ? (vertex_ > x.vertex_) : (is_reverse_ > x.is_reverse_); } bool operator ==(const HashGraphVertexAdaptor &x) const { return vertex_ == x.vertex_ && is_reverse_ == x.is_reverse_; } bool operator !=(const HashGraphVertexAdaptor &x) const { return vertex_ != x.vertex_ || is_reverse_ != x.is_reverse_; } const HashGraphVertexAdaptor &ReverseComplement() { is_reverse_ = !is_reverse_; return *this; } Kmer kmer() const { Kmer kmer = vertex_->kmer(); return !is_reverse_ ? kmer : kmer.ReverseComplement(); } HashGraphVertex &vertex() { return *vertex_; } const HashGraphVertex &vertex() const { return *vertex_; } void set_vertex(HashGraphVertex *vertex, bool is_reverse = false) { vertex_ = vertex; is_reverse_ = is_reverse; } AtomicInteger &count() { return vertex_->count(); } const AtomicInteger &count() const { return vertex_->count(); } VertexStatus &status() { return vertex_->status(); } const VertexStatus &status() const { return vertex_->status(); } BitEdges &in_edges() { return !is_reverse_ ? vertex_->in_edges() : vertex_->out_edges(); } const BitEdges &in_edges() const { return !is_reverse_ ? vertex_->in_edges() : vertex_->out_edges(); } BitEdges &out_edges() { return !is_reverse_ ? vertex_->out_edges() : vertex_->in_edges(); } const BitEdges &out_edges() const { return !is_reverse_ ? 
vertex_->out_edges() : vertex_->in_edges(); } void swap(HashGraphVertexAdaptor &x) { if (this != &x) { std::swap(vertex_, x.vertex_); std::swap(is_reverse_, x.is_reverse_); } } bool is_null() const { return vertex_ == NULL; } uint32_t kmer_size() const { return vertex_->kmer_size(); } void clear() { vertex_->clear(); } private: HashGraphVertex *vertex_; bool is_reverse_; }; namespace std { template <> inline void swap(HashGraphVertex &x, HashGraphVertex &y) { x.swap(y); } template <> inline void swap(HashGraphVertexAdaptor &x, HashGraphVertexAdaptor &y) { x.swap(y); } } #endif idba-1.1.3/src/graph/contig_graph.h0000664000175000017500000002342712677406270014066 00000000000000/** * @file contig_graph.h * @brief ContigGraph Class. * @author Yu Peng (ypeng@cs.hku.hk) * @version 1.0.0 * @date 2011-08-16 */ #ifndef __GRAPH_CONTIG_GRAPH_H_ #define __GRAPH_CONTIG_GRAPH_H_ #include #include #include #include "basic/bit_operation.h" #include "basic/kmer.h" #include "container/hash_map.h" #include "graph/contig_graph_path.h" #include "graph/contig_graph_vertex.h" #include "graph/contig_info.h" #include "graph/hash_graph.h" #include "sequence/sequence.h" /** * @brief It is compact version de Bruijn graph in which each vertex is a contig * and each edge between contigs means they are connected in de Bruijn graph. 
*/ class ContigGraph { public: explicit ContigGraph(uint32_t kmer_size = 0) : num_edges_(0), kmer_size_(kmer_size) {} explicit ContigGraph(uint32_t kmer_size, const std::deque &contigs) : num_edges_(0), kmer_size_(kmer_size) { Initialize(contigs); } explicit ContigGraph(uint32_t kmer_size, const std::deque &contigs, const std::deque &contig_infos) : num_edges_(0), kmer_size_(kmer_size) { Initialize(contigs, contig_infos); } ~ContigGraph() { clear(); } double Binormial(int n, int m); void InitializeTable(); double Threshold(double k, double mean, double sd, double p_false); void Initialize(const std::deque &contigs) { std::deque contig_infos(contigs.size()); Initialize(contigs, contig_infos); } void Initialize(const std::deque &contigs, const std::deque &contig_infos); void BuildEdgeCountTable(); HashGraph &edge_count_table() { return edge_count_table_; } const HashGraph &edge_count_table() const { return edge_count_table_; } void Refresh(); void RefreshVertices(); void RefreshEdges(); void AddEdge(ContigGraphVertexAdaptor from, ContigGraphVertexAdaptor to) { from.out_edges().Add(to.contig()[kmer_size_-1]); from.ReverseComplement(); to.ReverseComplement(); std::swap(from, to); from.out_edges().Add(to.contig()[kmer_size_-1]); } void RemoveEdge(ContigGraphVertexAdaptor current, int x) { current.out_edges().Remove(x); ContigGraphVertexAdaptor next = GetNeighbor(current, x); next.ReverseComplement(); next.out_edges().Remove(3 - current.contig()[0]); } void AddAllEdges(); void RemoveAllEdges(); void ClearStatus(); void MergeSimplePaths(); void MergeSimilarPath(); int64_t Prune(int min_length); int64_t Trim(int min_length); int64_t Trim(int min_length, double min_cover); int64_t RemoveStandAlone(int min_length); int64_t RemoveDeadEnd(int min_length); int64_t RemoveDeadEnd(int min_length, double min_cover); int64_t RemoveBubble(); double IterateCoverage(int min_length, double min_cover, double max_cover, double factor = 1.1); double IterateLocalCoverage(int min_length, 
double ratio, double min_cover, double max_cover, double factor = 1.1); double IterateComponentCoverage(int min_length, double ratio, double min_cover, double max_cover, double factor = 1.1, int max_component_size = 30); double IterateComponentCoverage2(int min_length, double ratio, double min_cover, double max_cover, double factor = 1.1, int max_component_size = 30); bool RemoveLowCoverage(double min_cover, int min_length); bool RemoveLocalLowCoverage(double min_cover, int min_length, double ratio); bool RemoveComponentLowCoverage(double min_cover, int min_length, double ratio, int max_component_size); bool RemoveComponentLowCoverage2(double min_cover, int min_length, double ratio, int max_component_size); double LocalCoverage(ContigGraphVertexAdaptor current, int region_length); double LocalCoverageSingle(ContigGraphVertexAdaptor current, int region_length, double &num_count, int &num_kmer); int64_t Assemble(std::deque &contigs, std::deque &contig_infos); ContigGraphVertexAdaptor GetNeighbor(const ContigGraphVertexAdaptor ¤t, int x) { Kmer kmer = current.end_kmer(kmer_size_); kmer.ShiftAppend(x); return FindVertexAdaptorByBeginKmer(kmer); } void GetNeighbors(const ContigGraphVertexAdaptor ¤t, std::deque &neighbors) { neighbors.clear(); for (int x = 0; x < 4; ++x) { if (current.out_edges()[x]) neighbors.push_back(GetNeighbor(current, x)); } } bool IsConverged(ContigGraphVertexAdaptor current); int64_t SplitBranches(); void Decomposite(); void GetComponents(std::deque > &components, std::deque &component_strings); void GetConsensus(std::deque &consensus); bool FindPath(ContigGraphVertexAdaptor from, ContigGraphVertexAdaptor to, ContigGraphPath &path); void SortVertices() { std::sort(vertices_.begin(), vertices_.end(), CompareContigLength); Refresh(); } void GetContigs(std::deque &contigs, std::deque &contig_infos); std::deque &vertices() { return vertices_; } const std::deque &vertices() const { return vertices_; } void swap(ContigGraph &contig_graph) { 
begin_kmer_map_.swap(contig_graph.begin_kmer_map_); vertices_.swap(contig_graph.vertices_); std::swap(num_edges_, contig_graph.num_edges_); std::swap(kmer_size_, contig_graph.kmer_size_); } uint32_t kmer_size() const { return kmer_size_; } void set_kmer_size(uint32_t kmer_size) { kmer_size_ = kmer_size; } uint64_t num_vertices() const { return vertices_.size(); } uint64_t num_edges() const { return num_edges_; } void clear() { num_edges_ = 0; vertices_.clear(); begin_kmer_map_.clear(); in_kmer_count_table_.clear(); } private: ContigGraph(const ContigGraph &); const ContigGraph &operator =(const ContigGraph &); static bool CompareContigLength(const ContigGraphVertex &x, const ContigGraphVertex &y) { return x.contig_size() > y.contig_size(); } static bool CompareContigCoverage(const ContigGraphVertexAdaptor &x, const ContigGraphVertexAdaptor &y) { return x.coverage() > y.coverage(); } static double GetSimilarity(ContigGraphVertexAdaptor &x, ContigGraphVertexAdaptor &y) { Sequence a = x.contig(); Sequence b = y.contig(); return GetSimilarity(a, b); } static double GetSimilarity(const Sequence &x, const Sequence &y); void BuildBeginKmerMap(); bool GetNextVertexAdaptor(ContigGraphVertexAdaptor ¤t, ContigGraphVertexAdaptor &next) { if (current.out_edges().size() != 1) return false; next = GetNeighbor(current, bit_operation::BitToIndex(current.out_edges())); return next.in_edges().size() == 1 && !(next.contig_size() == kmer_size_ && next.contig().IsPalindrome()); } bool IsLoop(const ContigGraphPath &path, const ContigGraphVertexAdaptor &next) { return path.front().id() == next.id() || path.back().id() == next.id(); } ContigGraphVertexAdaptor FindVertexAdaptorByBeginKmer(const Kmer &begin_kmer) { Kmer key = begin_kmer.unique_format(); HashMap::iterator iter = begin_kmer_map_.find(key); if (iter != begin_kmer_map_.end()) { ContigGraphVertexAdaptor current(&vertices_[iter->second]); if (current.begin_kmer(kmer_size_) == begin_kmer) return current; 
current.ReverseComplement(); if (current.begin_kmer(kmer_size_) == begin_kmer) return current; } return ContigGraphVertexAdaptor(); } ContigGraphVertexAdaptor GetBeginVertexAdaptor(std::deque &component) { ContigGraphVertexAdaptor begin; for (unsigned i = 0; i < component.size(); ++i) { if (component[i].in_edges() == 0) { if (begin.is_null()) begin = component[i]; else return ContigGraphVertexAdaptor(NULL); } } return begin; } ContigGraphVertexAdaptor GetEndVertexAdaptor(std::deque &component) { ContigGraphVertexAdaptor end; for (unsigned i = 0; i < component.size(); ++i) { if (component[i].out_edges() == 0) { if (end.is_null()) end = component[i]; else return ContigGraphVertexAdaptor(NULL); } } return end; } bool IsValid(std::deque &component); bool CycleDetect(ContigGraphVertexAdaptor current, std::map &status); void FindLongestPath(std::deque &component, ContigGraphPath &path); void TopSort(std::deque &component, std::deque &order); void TopSortDFS(std::deque &order, ContigGraphVertexAdaptor current, std::map &status); int GetDepth(ContigGraphVertexAdaptor current, int length, int &maximum, int min_length); double FindSimilarPath(ContigGraphVertexAdaptor target, ContigGraphVertexAdaptor start); double FindSimilarPath(ContigGraphVertexAdaptor target, ContigGraphPath &path, int &time); static const uint32_t kMaxCheckSimilarity = 1024; HashMap begin_kmer_map_; std::deque vertices_; uint64_t num_edges_; uint32_t kmer_size_; HashMap in_kmer_count_table_; HashGraph edge_count_table_; std::vector > p_table; }; #endif idba-1.1.3/src/graph/vertex_status.h0000664000175000017500000000540712677406270014340 00000000000000/** * @file vertex_status.h * @brief VertexStatus Class. * @author Yu Peng (ypeng@cs.hku.hk) * @version 1.0.0 * @date 2011-08-05 */ #ifndef __GRAPH_VERTEX_STATUS_H_ #define __GRAPH_VERTEX_STATUS_H_ #include #include "basic/atomic_integer.h" /** * @brief It is a class for storing the status of a vertex. 
It provides many
 * useful functions to access the status of a vertex.
 */
// NOTE(review): `AtomicInteger` is written without template arguments in this
// text; they may have been lost during extraction -- confirm against the
// original header.
class VertexStatus
{
public:
    // Bit layout of the status word: three flag bits above a 13-bit lock id.
    static const uint16_t kVertexStatusFlagDead = 0x8000U;
    static const uint16_t kVertexStatusFlagUsed = 0x4000U;
    static const uint16_t kVertexStatusFlagLock = 0x2000U;
    static const uint16_t kVertexStatusMaskLock = 0x1FFFU;

    VertexStatus()
        : status_(0)
    {}

    VertexStatus(const VertexStatus &vertex_status)
        : status_(vertex_status.status_)
    {}

    const VertexStatus &operator =(const VertexStatus &vertex_status)
    {
        status_ = vertex_status.status_;
        return *this;
    }

    void SetUsedFlag()
    { SetFlag(kVertexStatusFlagUsed); }
    void ResetUsedFlag()
    { ResetFlag(kVertexStatusFlagUsed); }
    bool IsUsed() const
    { return GetFlag(kVertexStatusFlagUsed); }

    void SetDeadFlag()
    { SetFlag(kVertexStatusFlagDead); }
    void ResetDeadFlag()
    { ResetFlag(kVertexStatusFlagDead); }
    bool IsDead() const
    { return GetFlag(kVertexStatusFlagDead); }

    // Returns the id stored in the lock bits, or -1 when the lock flag is
    // not set.
    int GetLockID()
    {
        if (!(status_ & kVertexStatusFlagLock))
            return -1;
        return status_ & kVertexStatusMaskLock;
    }

    // Single attempt to take the lock for id. Fails when the lock flag is
    // already set or when the compare-and-set does not succeed.
    bool Lock(int id)
    {
        uint16_t observed = status_;
        if (observed & kVertexStatusFlagLock)
            return false;

        uint16_t desired = (observed & ~kVertexStatusMaskLock) | kVertexStatusFlagLock | id;
        if (status_.CompareAndSet(observed, desired))
            return true;
        return false;
    }

    // Takes the lock for id unless the current holder's id is greater than or
    // equal to id. Retries for as long as the compare-and-set fails.
    bool LockPreempt(int id)
    {
        for (;;)
        {
            uint16_t observed = status_;

            int holder = -1;
            if (observed & kVertexStatusFlagLock)
                holder = observed & kVertexStatusMaskLock;

            if (holder >= id)
                return false;

            uint16_t desired = (observed & ~kVertexStatusMaskLock) | kVertexStatusFlagLock | id;
            if (status_.CompareAndSet(observed, desired))
                return true;
        }
    }

    void swap(VertexStatus &vertex_status)
    {
        if (this == &vertex_status)
            return;
        status_.swap(vertex_status.status_);
    }

    // Drops every flag and the lock id.
    void clear()
    { status_ = 0; }

private:
    bool GetFlag(uint16_t flag) const
    { return status_ & flag; }
    void SetFlag(uint16_t flag)
    { status_ |= flag; }
    void ResetFlag(uint16_t flag)
    { status_ &= ~flag; }

    AtomicInteger status_;
};
namespace std { template <> inline void swap(VertexStatus &x, VertexStatus &y) { x.swap(y); } } #endif idba-1.1.3/src/graph/contig_graph_branch_group.cpp0000664000175000017500000000675312677406270017155 00000000000000/** * @file contig_graph_branch_group.cpp * @brief * @author Yu Peng (ypeng@cs.hku.hk) * @version 1.0.9 * @date 2011-12-27 */ #include "graph/contig_graph_branch_group.h" #include #include #include using namespace std; bool ContigGraphBranchGroup::Search() { int kmer_size = contig_graph_->kmer_size(); branches_.reserve(max_branches_); ContigGraphPath path; path.Append(begin_, 0); branches_.push_back(path); if ((int)begin_.out_edges().size() <= 1 || (int)begin_.out_edges().size() > max_branches_ || (int)begin_.contig_size() == kmer_size) return false; bool is_converge = false; for (int k = 1; k < max_length_; ++k) { int num_branches = branches_.size(); bool is_extend = false; for (int i = 0; i < num_branches; ++i) { if ((int)branches_[i].internal_size(kmer_size) >= max_length_) continue; ContigGraphVertexAdaptor current = branches_[i].back(); if (current.out_edges().size() == 0) return false; bool is_first = true; ContigGraphPath path = branches_[i]; for (int x = 0; x < 4; ++x) { if (current.out_edges()[x]) { ContigGraphVertexAdaptor next = contig_graph_->GetNeighbor(current, x); if (next.status().IsDead()) return false; if (is_first) { branches_[i].Append(next, -kmer_size + 1); is_first = false; } else { if ((int)branches_.size() == max_branches_) return false; path.Append(next, -kmer_size + 1); branches_.push_back(path); path.Pop(); } is_extend = true; } } } end_ = branches_[0].back(); if ((int)end_.contig_size() > kmer_size) { is_converge = true; for (unsigned i = 0; i < branches_.size(); ++i) { if (branches_[i].back() != end_ || (int)branches_[i].internal_size(kmer_size) != max_length_) { is_converge = false; break; } } if (is_converge) break; } if (!is_extend) break; } return is_converge && begin_ != end_; } void ContigGraphBranchGroup::Merge() { 
unsigned best = 0; for (unsigned i = 1; i < branches_.size(); ++i) { if (branches_[i].kmer_count() > branches_[best].kmer_count()) best = i; } for (unsigned i = 0; i < branches_.size(); ++i) { ContigGraphPath &path = branches_[i]; path.front().out_edges() = 0; path.back().in_edges() = 0; for (unsigned j = 1; j+1 < path.num_nodes(); ++j) { path[j].in_edges() = 0; path[j].out_edges() = 0; path[j].status().SetDeadFlag(); } } ContigGraphPath &path = branches_[best]; for (unsigned j = 1; j+1 < path.num_nodes(); ++j) path[j].status().ResetDeadFlag(); for (unsigned j = 0; j+1 < path.num_nodes(); ++j) contig_graph_->AddEdge(path[j], path[j+1]); } idba-1.1.3/src/graph/contig_graph_path.h0000664000175000017500000001035012677406270015071 00000000000000/** * @file contig_graph_path.h * @brief ContigGraph Class. * @author Yu Peng (ypeng@cs.hku.hk) * @version 1.0.0 * @date 2011-08-16 */ #ifndef __GRAPH_CONTIG_GRAPH_PATH_H_ #define __GRAPH_CONTIG_GRAPH_PATH_H_ #include #include #include "graph/contig_builder.h" #include "graph/contig_graph_vertex.h" #include "graph/contig_info.h" /** * @brief It is a path of contigs in ContigGraph. 
*/ class ContigGraphPath { public: ContigGraphPath() {} ContigGraphPath(const ContigGraphPath &path) : vertices_(path.vertices_), distances_(path.distances_) {} const ContigGraphPath &operator =(const ContigGraphPath &path) { vertices_ = path.vertices_; distances_ = path.distances_; return *this; } bool operator <(const ContigGraphPath &path) const { for (unsigned i = 0; i < num_nodes() && i < path.num_nodes(); ++i) { if ((*this)[i] != path[i]) return (*this)[i] < path[i]; } return num_nodes() < path.num_nodes(); } ContigGraphVertexAdaptor &operator [](uint32_t index) { return vertices_[index]; } const ContigGraphVertexAdaptor &operator [](uint32_t index) const { return vertices_[index]; } void Append(const ContigGraphVertexAdaptor &vertex, int d) { vertices_.push_back(vertex); if (vertices_.size() > 1) distances_.push_back(d); } void Append(const ContigGraphPath &path, int d) { for (unsigned i = 0; i < path.num_nodes(); ++i) { if (i == 0) Append(path[i], d); else Append(path[i], path.distances()[i-1]); } } void Pop() { vertices_.pop_back(); if (!distances_.empty()) distances_.pop_back(); } const ContigGraphPath &ReverseComplement() { std::reverse(vertices_.begin(), vertices_.end()); for (unsigned i = 0; i < vertices_.size(); ++i) vertices_[i].ReverseComplement(); std::reverse(distances_.begin(), distances_.end()); return *this; } void Assemble(Sequence &contig, ContigInfo &contig_info) { ContigBuilder contig_builder; if (vertices_.size() > 0) { contig_builder.Append(vertices_[0], 0); for (unsigned i = 1; i < vertices_.size(); ++i) contig_builder.Append(vertices_[i], distances_[i-1]); } contig = contig_builder.contig(); contig_info = contig_builder.contig_info(); } void swap(ContigGraphPath &path) { if (this != &path) { vertices_.swap(path.vertices_); distances_.swap(path.distances_); } } ContigGraphVertexAdaptor &front() { return vertices_.front(); } const ContigGraphVertexAdaptor &front() const { return vertices_.front(); } ContigGraphVertexAdaptor &back() { 
return vertices_.back(); } const ContigGraphVertexAdaptor &back() const { return vertices_.back(); } uint64_t kmer_count() const { uint64_t sum = 0; for (unsigned i = 0; i < vertices_.size(); ++i) sum += vertices_[i].kmer_count(); return sum; } uint32_t size() const { uint32_t size = 0; for (unsigned i = 0; i < vertices_.size(); ++i) size += vertices_[i].contig_size(); for (unsigned i = 0; i < distances_.size(); ++i) size += distances_[i]; return size; } uint32_t internal_size(int kmer_size) const { if (vertices_.size() <= 1) return vertices_.size(); uint32_t size = kmer_size + 1; for (unsigned i = 1; i+1 < vertices_.size(); ++i) size += vertices_[i].contig_size(); for (unsigned i = 0; i < distances_.size(); ++i) size += distances_[i]; return size; } uint32_t num_nodes() const { return vertices_.size(); } void clear() { vertices_.clear(); distances_.clear(); } std::deque &distances() { return distances_; } const std::deque &distances() const { return distances_; } private: std::deque vertices_; std::deque distances_; }; namespace std { template <> inline void swap(ContigGraphPath &x, ContigGraphPath &y) { x.swap(y); } } #endif idba-1.1.3/src/graph/contig_graph_branch_group.h0000664000175000017500000000223312677406270016607 00000000000000/** * @file contig_graph_branch_group.h * @brief ContigGraphBranchGroup Class. * @author Yu Peng (ypeng@cs.hku.hk) * @version 1.0.9 * @date 2011-12-27 */ #ifndef __GRAPH_CONTIG_GRAPH_BRANCH_GROUP_H_ #define __GRAPH_CONTIG_GRAPH_BRANCH_GROUP_H_ #include #include "graph/contig_graph.h" #include "graph/contig_graph_path.h" /** * @brief It is used to contain a branch group in ContigGraph. 
*/ class ContigGraphBranchGroup { public: ContigGraphBranchGroup(ContigGraph *graph, ContigGraphVertexAdaptor begin, int max_branches = 2, int max_length = 0) { contig_graph_ = graph; begin_ = begin; max_branches_ = max_branches; max_length_ = max_length; if (max_length_ == 0) max_length_ = 2*contig_graph_->kmer_size() + 2; } bool Search(); void Merge(); ContigGraphVertexAdaptor begin() { return begin_; } ContigGraphVertexAdaptor end() { return end_; } private: ContigGraph *contig_graph_; ContigGraphVertexAdaptor begin_; ContigGraphVertexAdaptor end_; std::vector branches_; int max_branches_; int max_length_; }; #endif idba-1.1.3/src/graph/hash_graph.cpp0000664000175000017500000004006312677406270014054 00000000000000/** * @file hash_graph.cpp * @brief * @author Yu Peng (ypeng@cs.hku.hk) * @version 1.0.0 * @date 2011-08-05 */ #include "graph/hash_graph.h" #include #include #include #include "basic/bit_operation.h" #include "basic/histgram.h" #include "basic/kmer.h" #include "container/hash_table.h" #include "graph/contig_builder.h" #include "graph/contig_info.h" #include "graph/hash_graph_branch_group.h" #include "graph/hash_graph_vertex.h" #include "sequence/sequence.h" using namespace std; #include int64_t HashGraph::InsertKmersWithPrefix(const Sequence &seq, uint64_t prefix, uint64_t mask) { if (seq.size() < kmer_size_) return 0; Kmer kmer(kmer_size_); int length = 0; int64_t num_kmers = 0; for (uint64_t i = 0; i < seq.size(); ++i) { kmer.ShiftAppend(seq[i]); length = (seq[i] < 4) ? 
length + 1 : 0; if (length < (int)kmer_size_) continue; Kmer key = kmer.unique_format(); if ((((key.hash() * 10619863ULL + 17977) % 790738119649411319ULL) & mask) == prefix) { HashGraphVertex &vertex = vertex_table_.find_or_insert(HashGraphVertex(key)); vertex.count() += 1; HashGraphVertexAdaptor adaptor(&vertex, kmer != key); if (length > (int)kmer_size_ && seq[i-kmer_size_] < 4) adaptor.in_edges().Add(3 - seq[i-kmer_size_]); if (i+1 < seq.size() && seq[i+1] < 4) adaptor.out_edges().Add(seq[i+1]); ++num_kmers; } } return num_kmers; } int64_t HashGraph::InsertUncountKmers(const Sequence &seq) { if (seq.size() < kmer_size_) return 0; Kmer kmer(kmer_size_); int length = 0; int64_t num_kmers = 0; for (uint64_t i = 0; i < seq.size(); ++i) { kmer.ShiftAppend(seq[i]); length = (seq[i] < 4) ? length + 1 : 0; if (length < (int)kmer_size_) continue; Kmer key = kmer.unique_format(); HashGraphVertex &vertex = vertex_table_.find_or_insert(HashGraphVertex(key)); HashGraphVertexAdaptor adaptor(&vertex, kmer != key); if (length > (int)kmer_size_ && seq[i-kmer_size_] < 4) adaptor.in_edges().Add(3 - seq[i-kmer_size_]); if (i+1 < seq.size() && seq[i+1] < 4) adaptor.out_edges().Add(seq[i+1]); ++num_kmers; } return num_kmers; } int64_t HashGraph::InsertInternalKmers(const Sequence &seq, int min_count) { if (seq.size() < kmer_size_) return 0; Kmer kmer(kmer_size_); int length = 0; int64_t num_kmers = 0; deque found_index; deque found_kmer; for (uint64_t i = 0; i < seq.size(); ++i) { kmer.ShiftAppend(seq[i]); length = (seq[i] < 4) ? 
length + 1 : 0; if (length < (int)kmer_size_) continue; HashGraphVertexAdaptor adaptor = FindVertexAdaptor(kmer); if (adaptor.is_null()) continue; if (length > (int)kmer_size_ && seq[i-kmer_size_] < 4) adaptor.in_edges().Add(3 - seq[i-kmer_size_] + 4); if (i+1 < seq.size() && seq[i+1] < 4) adaptor.out_edges().Add(seq[i+1] + 4); if (adaptor.count() >= min_count) { found_index.push_back(i); found_kmer.push_back(adaptor); } } deque flags(seq.size(), 0); for (uint64_t i = 0; i+1 < found_index.size(); ++i) { HashGraphVertexAdaptor from = found_kmer[i]; //FindVertexAdaptor(found_kmer[i]); HashGraphVertexAdaptor to = found_kmer[i+1]; //FindVertexAdaptor(found_kmer[i+1]); if (from.is_null() || to.is_null()) { cout << "error" << endl; continue; } if ((from.out_edges() & 15) == 0 && (to.in_edges() & 15) == 0) { for (int j = found_index[i] + 1; j < found_index[i+1]; ++j) flags[j] = 1; } } if (found_index.size() > 0) { if (found_kmer.front().in_edges() == 0) { for (int j = kmer_size_ - 1; j < found_index.front(); ++j) flags[j] = 1; } if (found_kmer.back().out_edges() == 0) { for (int j = found_index.back() + 1; j < (int)seq.size(); ++j) flags[j] = 1; } } length = 0; for (uint64_t i = 0; i < seq.size(); ++i) { kmer.ShiftAppend(seq[i]); length = (seq[i] < 4) ? length + 1 : 0; if (length < (int)kmer_size_) continue; if (flags[i]) { Kmer key = kmer.unique_format(); HashGraphVertex &vertex = vertex_table_.find_or_insert(HashGraphVertex(key)); vertex.count() += 1; HashGraphVertexAdaptor adaptor(&vertex, kmer != key); if (length > (int)kmer_size_ && seq[i-kmer_size_] < 4) adaptor.in_edges().Add(3 - seq[i-kmer_size_] + 4); if (i+1 < seq.size() && seq[i+1] < 4) adaptor.out_edges().Add(seq[i+1] + 4); ++num_kmers; } } return num_kmers; } int64_t HashGraph::InsertEdges(const Sequence &seq) { if (seq.size() < kmer_size_) return 0; Kmer kmer(kmer_size_); int length = 0; int64_t num_kmers = 0; for (uint64_t i = 0; i < seq.size(); ++i) { kmer.ShiftAppend(seq[i]); length = (seq[i] < 4) ? 
length + 1 : 0; if (length < (int)kmer_size_) continue; HashGraphVertexAdaptor adaptor = FindVertexAdaptor(kmer); if (adaptor.is_null()) continue; if (length > (int)kmer_size_ && seq[i-kmer_size_] < 4) adaptor.in_edges().Add(3 - seq[i-kmer_size_]); if (i+1 < seq.size() && seq[i+1] < 4) adaptor.out_edges().Add(seq[i+1]); } return num_kmers; } int64_t HashGraph::InsertExistKmers(const Sequence &seq) { if (seq.size() < kmer_size_) return 0; Kmer kmer(kmer_size_); int length = 0; int64_t num_kmers = 0; for (uint64_t i = 0; i < seq.size(); ++i) { kmer.ShiftAppend(seq[i]); length = (seq[i] < 4) ? length + 1 : 0; if (length < (int)kmer_size_) continue; HashGraphVertexAdaptor adaptor = FindVertexAdaptor(kmer); if (adaptor.is_null()) continue; adaptor.count() += 1; if (length > (int)kmer_size_ && seq[i-kmer_size_] < 4) adaptor.in_edges().Add(3 - seq[i-kmer_size_]); if (i+1 < seq.size() && seq[i+1] < 4) adaptor.out_edges().Add(seq[i+1]); } return num_kmers; } int64_t HashGraph::RemoveKmers(const Sequence &seq) { if (seq.size() < kmer_size_) return 0; Kmer kmer(kmer_size_); int length = 0; int64_t num_kmers = 0; for (uint64_t i = 0; i < seq.size(); ++i) { kmer.ShiftAppend(seq[i]); length = (seq[i] < 4) ? 
length + 1 : 0; if (length < (int)kmer_size_) continue; Kmer key = kmer.unique_format(); HashGraphVertex &vertex = *vertex_table_.find(key); vertex.status().SetDeadFlag(); ++num_kmers; } return num_kmers; } int64_t HashGraph::ErodeEnd(int min_cover) { ErodeFunc func(this, min_cover); vertex_table_.for_each(func); uint64_t num_eroded_vertice = RefreshVertices(); RefreshEdges(); ClearStatus(); return num_eroded_vertice; } int64_t HashGraph::Trim(int min_length) { deque contigs; deque contig_infos; Assemble(contigs, contig_infos); #pragma omp parallel for for (int64_t i = 0; i < (int64_t)contigs.size(); ++i) { if ((contig_infos[i].out_edges() == 0 || contig_infos[i].in_edges() == 0) && (int)contigs[i].size() < min_length + (int)kmer_size_ - 1) RemoveKmers(contigs[i]); } uint64_t old_num_vertices = vertex_table_.size(); Refresh(); return old_num_vertices - vertex_table_.size(); } int64_t HashGraph::RemoveDeadEnd(int min_length) { uint64_t num_deadend = 0; int l = 1; while (true) { l = min(2*l, min_length); num_deadend += Trim(l); if (l == min_length) break; } num_deadend += Trim(min_length); return num_deadend; } int64_t HashGraph::RemoveLowCoverage(double min_cover, int min_length) { uint64_t old_num_vertices = vertex_table_.size(); int l = 1; while (true) { l = min(2*l, min_length); deque contigs; deque contig_infos; Assemble(contigs, contig_infos); #pragma omp parallel for for (int64_t i = 0; i < (int64_t)contigs.size(); ++i) { if (contig_infos[i].kmer_count() * 1.0 / (contigs[i].size() - kmer_size_ + 1) < min_cover && (int)contigs[i].size() < l + (int)kmer_size_ - 1) RemoveKmers(contigs[i]); } Refresh(); Trim(l); if (l == min_length) break; } return old_num_vertices - vertex_table_.size(); } int64_t HashGraph::RemoveBubble() { BubbleFunc func(this); vertex_table_.for_each(func); deque &candidates = func.candidates(); int64_t bubble = 0; for (unsigned i = 0; i < candidates.size(); ++i) { HashGraphVertexAdaptor current = candidates[i]; if (current.out_edges().size() 
> 1 && current.in_edges().size() == 1) { HashGraphBranchGroup branch_group(this, current, 4, kmer_size_*2 + 2); if (branch_group.Search()) { HashGraphVertexAdaptor begin = branch_group.begin(); HashGraphVertexAdaptor end = branch_group.end(); begin.ReverseComplement(); end.ReverseComplement(); std::swap(begin, end); HashGraphBranchGroup rev_branch_group(this, begin, 4, kmer_size_*2 + 2); if (rev_branch_group.Search() && rev_branch_group.end() == end) { branch_group.Merge(); ++bubble; } } } } // for (HashGraph::iterator p = begin(); p != end(); ++p) // { // for (int strand = 0; strand < 2; ++strand) // { // HashGraphVertexAdaptor current(&*p, strand); for (unsigned i = 0; i < candidates.size(); ++i) { HashGraphVertexAdaptor current = candidates[i]; if (current.out_edges().size() > 1 && current.in_edges().size() == 1) { HashGraphBranchGroup branch_group(this, current, 4, kmer_size_ + 2); if (branch_group.Search()) { HashGraphVertexAdaptor begin = branch_group.begin(); HashGraphVertexAdaptor end = branch_group.end(); begin.ReverseComplement(); end.ReverseComplement(); std::swap(begin, end); HashGraphBranchGroup rev_branch_group(this, begin, 4, kmer_size_ + 2); if (rev_branch_group.Search() && rev_branch_group.end() == end) { branch_group.Merge(); ++bubble; } } } //} } Refresh(); return bubble; } int64_t HashGraph::Assemble(std::deque &contigs) { deque contig_infos; return Assemble(contigs, contig_infos); } int64_t HashGraph::Assemble(std::deque &contigs, std::deque &contig_infos) { contigs.clear(); contig_infos.clear(); AssembleFunc func(this); vertex_table_.for_each(func); contigs.swap(func.contigs()); contig_infos.swap(func.contig_infos()); ClearStatus(); return contigs.size(); } void HashGraph::ErodeFunc::operator ()(HashGraphVertex &vertex) { if (vertex.in_edges().size() > 0 && vertex.out_edges().size() > 0) return; if (vertex.status().IsDead()) return; if (vertex.count() < min_cover_) { vertex.status().SetDeadFlag(); for (int strand = 0; strand < 2; ++strand) { 
HashGraphVertexAdaptor current(&vertex, strand); for (int x = 0; x < 4; ++x) { if (current.out_edges()[x]) { current.out_edges().Remove(x); Kmer kmer = current.kmer(); kmer.ShiftAppend(x); HashGraphVertexAdaptor next = hash_graph_->FindVertexAdaptor(kmer); if (!next.is_null()) { next.in_edges().Remove(3 - current.kmer()[0]); (*this)(next.vertex()); } } } } } } void HashGraph::TrimFunc::operator ()(HashGraphVertex &vertex) { if (vertex.in_edges().size() > 0 && vertex.out_edges().size() > 0) return; if (vertex.kmer().IsPalindrome()) return; if (!vertex.status().Lock(omp_get_thread_num())) return; for (int strand = 0; strand < 2; ++strand) { HashGraphVertexAdaptor current(&vertex, strand); if (current.in_edges().size() > 0) continue; deque path; path.push_back(current); for (int i = 0; i < min_length_; ++i) { if (current.out_edges().size() != 1) return; Kmer next_kmer = current.kmer(); next_kmer.ShiftAppend(bit_operation::BitToIndex(current.out_edges())); HashGraphVertexAdaptor next = hash_graph_->FindVertexAdaptor(next_kmer); if (next.in_edges().size() != 1) break; if (!next.status().LockPreempt(omp_get_thread_num())) return; current = next; path.push_back(current); } if ((int)path.size() < min_length_) { for (unsigned i = 0; i < path.size(); ++i) path[i].status().SetDeadFlag(); } } } void HashGraph::BubbleFunc::operator ()(HashGraphVertex &vertex) { for (int strand = 0; strand < 2; ++strand) { HashGraphVertexAdaptor current(&vertex, strand); if (current.out_edges().size() > 1 && current.in_edges().size() == 1) { HashGraphBranchGroup branch_group(hash_graph_, current, 4, hash_graph_->kmer_size()*2 + 2); if (branch_group.Search()) { HashGraphVertexAdaptor begin = branch_group.begin(); HashGraphVertexAdaptor end = branch_group.end(); begin.ReverseComplement(); end.ReverseComplement(); std::swap(begin, end); HashGraphBranchGroup rev_branch_group(hash_graph_, begin, 4, hash_graph_->kmer_size()*2 + 2); if (rev_branch_group.Search() && rev_branch_group.end() == end) { 
omp_set_lock(&bubble_lock_); candidates_.push_back(current); omp_unset_lock(&bubble_lock_); } } } } } void HashGraph::AssembleFunc::operator ()(HashGraphVertex &vertex) { if (!vertex.status().Lock(omp_get_thread_num())) return; ContigBuilder contig_builder; contig_builder.Append(HashGraphVertexAdaptor(&vertex)); if (!vertex.kmer().IsPalindrome()) { for (int strand = 0; strand < 2; ++strand) { HashGraphVertexAdaptor current(&vertex, strand); while (true) { HashGraphVertexAdaptor next; if (!hash_graph_->GetNextVertexAdaptor(current, next)) break; if (hash_graph_->IsLoop(contig_builder.contig(), next)) break; if (!next.status().LockPreempt(omp_get_thread_num())) return; contig_builder.Append(next); current = next; } contig_builder.ReverseComplement(); } } omp_set_lock(&contig_lock_); contigs_.push_back(contig_builder.contig()); contig_infos_.push_back(contig_builder.contig_info()); omp_unset_lock(&contig_lock_); } idba-1.1.3/src/graph/scaffold_graph_vertex.h0000664000175000017500000002016112677406270015751 00000000000000/** * @file scaffold_graph_vertex.h * @brief ScaffoldGraphVertex Class and ScaffoldGraphVertexAdaptor Class. * @author Yu Peng (ypeng@cs.hku.hk) * @version 1.0.10 * @date 2012-04-16 */ #ifndef __GRAPH_SCAFFOLD_GRAPH_VERTEX_H_ #define __GRAPH_SCAFFOLD_GRAPH_VERTEX_H_ #include #include #include #include "basic/kmer.h" #include "graph/bit_edges.h" #include "graph/vertex_status.h" #include "sequence/sequence.h" #include "graph/contig_info.h" #include "graph/contig_graph_path.h" /** * @brief It is the vertex class used in ScaffoldGraph class. 
*/ class ScaffoldGraphVertex { public: explicit ScaffoldGraphVertex(const ContigGraphPath &path = ContigGraphPath()) : path_(path) {} ScaffoldGraphVertex(const ScaffoldGraphVertex &x) : path_(x.path_), id_(x.id_), status_(x.status_) {} const ScaffoldGraphVertex &operator =(const ScaffoldGraphVertex &x) { if (this != &x) { path_ = x.path_; id_ = x.id_; } return *this; } const ContigGraphPath &path() const { return path_; } void set_path(const ContigGraphPath &path) { path_ = path; } uint32_t path_size() const { return path_.size(); } // const Sequence &contig() const { return contig_; } // void set_contig(const Sequence &contig) { contig_ = contig; } // uint32_t contig_size() const { return contig_.size(); } // uint32_t num_kmer() const { return contig_.size() - kmer_size() + 1; } uint32_t id() const { return id_; } void set_id(uint32_t id) { id_ = id; } VertexStatus &status() { return status_; } const VertexStatus &status() const { return status_; } // Kmer begin_kmer(int kmer_size) const { return contig_.GetKmer(0, kmer_size); } // Kmer end_kmer(int kmer_size) const { return contig_.GetKmer(contig_.size() - kmer_size, kmer_size); } // // double coverage() const { return 1.0 * contig_info_.kmer_count() / (contig_size() - kmer_size() + 1); } double coverage() const { double sum = 0; int count = 0; for (unsigned i = 0; i < path_.num_nodes(); ++i) { sum += path_[i].kmer_count(); count += path_[i].contig_size() - path_[i].kmer_size() + 1; } return sum / count; } // const SequenceCount &counts() const { return contig_info_.counts(); } // void set_counts(const SequenceCount &counts) { contig_info_.set_counts(counts); } // char get_base(uint32_t index) const { return contig_[index]; } // SequenceCountUnitType get_count(uint32_t index) const { return contig_info_.counts()[index]; } void swap(ScaffoldGraphVertex &x) { if (this != &x) { path_.swap(x.path_); std::swap(id_, x.id_); status_.swap(x.status_); } } void clear() { path_.clear(); id_ = 0; status_.clear(); } private: 
ContigGraphPath path_; uint32_t id_; VertexStatus status_; }; /** * @brief It is a adaptor class used to access ScaffoldGraphVertex. Becase * a scaffold vertex and its reverse complement share the same vertex, * using adaptor makes sure that modification to the vertex consistant. */ class ScaffoldGraphVertexAdaptor { public: explicit ScaffoldGraphVertexAdaptor(ScaffoldGraphVertex *vertex = NULL, bool is_reverse = false) { vertex_ = vertex; is_reverse_ = is_reverse; } ScaffoldGraphVertexAdaptor(const ScaffoldGraphVertexAdaptor &x) { vertex_ = x.vertex_, is_reverse_ = x.is_reverse_; } const ScaffoldGraphVertexAdaptor &operator =(const ScaffoldGraphVertexAdaptor &x) { vertex_ = x.vertex_; is_reverse_ = x.is_reverse_; return *this; } bool operator <(const ScaffoldGraphVertexAdaptor &x) const { return (vertex_ != x.vertex_) ? (vertex_ < x.vertex_) : (is_reverse_ < x.is_reverse_); } bool operator >(const ScaffoldGraphVertexAdaptor &x) const { return (vertex_ != x.vertex_) ? (vertex_ > x.vertex_) : (is_reverse_ > x.is_reverse_); } bool operator ==(const ScaffoldGraphVertexAdaptor &x) const { return vertex_ == x.vertex_ && is_reverse_ == x.is_reverse_; } bool operator !=(const ScaffoldGraphVertexAdaptor &x) const { return vertex_ != x.vertex_ || is_reverse_ != x.is_reverse_; } const ScaffoldGraphVertexAdaptor &ReverseComplement() { is_reverse_ = !is_reverse_; return *this; } ContigGraphPath path() const { ContigGraphPath path = vertex_->path(); return !is_reverse_ ? path : path.ReverseComplement(); } uint32_t path_size() const { return vertex_->path_size(); } // Sequence contig() const // { // Sequence contig = vertex_->contig(); // return !is_reverse_ ? 
contig : contig.ReverseComplement(); // } // uint32_t contig_size() const { return vertex_->contig().size(); } // uint32_t num_kmer() const { return vertex_->num_kmer(); } void set_vertex(ScaffoldGraphVertex *vertex, bool is_reverse) { vertex_ = vertex; is_reverse_ = is_reverse; } // ContigInfo contig_info() const // { // ContigInfo contig_info = vertex_->contig_info(); // return (!is_reverse_ ? contig_info : contig_info.ReverseComplement()); // } // // uint64_t kmer_size() const { return vertex_->kmer_size(); } // void set_kmer_size(uint64_t kmer_size) { vertex_->set_kmer_size(kmer_size); } // // uint64_t kmer_count() const { return vertex_->kmer_count(); } // void set_kmer_count(uint64_t kmer_count) { vertex_->set_kmer_count(kmer_count); } uint32_t id() const { return vertex_->id(); } void set_id(uint32_t id) { vertex_->set_id(id); } VertexStatus &status() { return vertex_->status(); } const VertexStatus &status() const { return vertex_->status(); } // BitEdges &in_edges() { return !is_reverse_ ? vertex_->in_edges() : vertex_->out_edges(); } // const BitEdges &in_edges() const { return !is_reverse_ ? vertex_->in_edges() : vertex_->out_edges(); } // // BitEdges &out_edges() { return !is_reverse_ ? vertex_->out_edges() : vertex_->in_edges(); } // const BitEdges &out_edges() const { return !is_reverse_ ? vertex_->out_edges() : vertex_->in_edges(); } // uint32_t &in_kmer_count() { return !is_reverse_ ? vertex_->in_kmer_count() : vertex_->out_kmer_count(); } // const uint32_t &in_kmer_count() const { return !is_reverse_ ? vertex_->in_kmer_count() : vertex_->out_kmer_count(); } // // uint32_t &out_kmer_count() { return !is_reverse_ ? vertex_->out_kmer_count() : vertex_->out_kmer_count(); } // const uint32_t &out_kmer_count() const { return !is_reverse_ ? 
vertex_->out_kmer_count() : vertex_->out_kmer_count(); } // SequenceCount counts() // { // if (!is_reverse_) return vertex_->counts(); // else { SequenceCount counts = vertex_->counts(); std::reverse(counts.begin(), counts.end()); return counts; } // } // // char get_base(uint32_t index) const // { return (!is_reverse_) ? vertex_->get_base(index) : 3 - vertex_->get_base(contig_size() - 1 - index); } // // SequenceCountUnitType get_count(uint32_t index) const // { return (!is_reverse_) ? vertex_->get_count(index) : vertex_->get_count(vertex_->counts().size() - 1 - index); } // // Kmer begin_kmer(int kmer_size) const // { return !is_reverse_ ? vertex_->begin_kmer(kmer_size) : vertex_->end_kmer(kmer_size).ReverseComplement(); } // // Kmer end_kmer(int kmer_size) const // { return !is_reverse_ ? vertex_->end_kmer(kmer_size) : vertex_->begin_kmer(kmer_size).ReverseComplement(); } // double coverage() const { return vertex_->coverage(); } bool is_reverse() const { return is_reverse_; } void swap(ScaffoldGraphVertexAdaptor &x) { if (this != &x) { std::swap(vertex_, x.vertex_); std::swap(is_reverse_, x.is_reverse_); } } bool is_null() const { return vertex_ == NULL; } void clear() { vertex_->clear(); } private: ScaffoldGraphVertex *vertex_; bool is_reverse_; }; namespace std { template <> inline void swap(ScaffoldGraphVertex &x, ScaffoldGraphVertex &y) { x.swap(y); } template <> inline void swap(ScaffoldGraphVertexAdaptor &x, ScaffoldGraphVertexAdaptor &y) { x.swap(y); } } #endif idba-1.1.3/src/graph/hash_graph_branch_group.h0000664000175000017500000000222612677406270016251 00000000000000/** * @file hash_graph_branch_group.h * @brief HashGraphBranchGroup Class. 
* @author Yu Peng (ypeng@cs.hku.hk) * @version 1.0.4 * @date 2011-09-21 */ #ifndef __GRAPH_HASH_GRAPH_BRANCH_GROUP_H_ #define __GRAPH_HASH_GRAPH_BRANCH_GROUP_H_ #include "basic/kmer.h" #include "graph/hash_graph.h" #include "graph/hash_graph_path.h" #include /** * @brief It is used to contain a branch group in de Bruijn graph (HashGraph). */ class HashGraphBranchGroup { public: HashGraphBranchGroup(HashGraph *graph, HashGraphVertexAdaptor begin, int max_branches = 2, int max_length = 0) { hash_graph_ = graph; begin_ = begin; max_branches_ = max_branches; max_length_ = max_length; if (max_length_ == 0) max_length_ = begin_.kmer().size() + 2; } bool Search(); void Merge(); HashGraphVertexAdaptor begin() { return begin_; } HashGraphVertexAdaptor end() { return end_; } private: HashGraph *hash_graph_; HashGraphVertexAdaptor begin_; HashGraphVertexAdaptor end_; std::vector branches_; int max_branches_; int max_length_; }; #endif idba-1.1.3/src/graph/bit_edges.h0000664000175000017500000000254412677406270013344 00000000000000/** * @file bit_edges.h * @brief BitEdges Class. * @author Yu Peng (ypeng@cs.hku.hk) * @version 1.0.0 * @date 2011-08-26 */ #ifndef __GRAPH_BIT_EDGES_H_ #define __GRAPH_BIT_EDGES_H_ #include "basic/bit_operation.h" #include "basic/atomic_integer.h" /** * @brief It is compact bit vector used to represent edges in de Bruijn graph (HashGraph). 
*/ class BitEdges { public: BitEdges() {} BitEdges(const BitEdges &bit_edges) : edges_(bit_edges.edges_) {} explicit BitEdges(uint8_t edges) : edges_(edges) {} ~BitEdges() {} const BitEdges &operator =(const BitEdges &bit_edges) { edges_ = bit_edges.edges_; return *this; } const BitEdges &operator =(uint8_t edges) { edges_ = edges; return *this; } operator uint8_t () const { return edges_; } void Add(int x) { edges_ |= uint8_t(1 << x); } void Remove(int x) { edges_ &= ~uint8_t(1 << x); } void swap(BitEdges &bit_edges) { if (this != &bit_edges) edges_.swap(bit_edges.edges_); } bool operator [] (int index) const { return edges_ & (1 << index); } int size() const { return bit_operation::BitCount(edges_); } bool empty() const { return edges_ == 0; } void clear() { edges_ = 0; } private: AtomicInteger edges_; }; namespace std { template <> inline void swap(BitEdges &x, BitEdges &y) { x.swap(y); } } #endif idba-1.1.3/src/graph/scaffold_graph_path.h0000664000175000017500000000717512677406270015402 00000000000000/** * @file scaffold_graph_path.h * @brief ScaffoldGraphPath Class. * @author Yu Peng (ypeng@cs.hku.hk) * @version 1.0.10 * @date 2011-08-16 */ #ifndef __GRAPH_SCAFFOLD_GRAPH_PATH_H_ #define __GRAPH_SCAFFOLD_GRAPH_PATH_H_ #include #include #include "graph/scaffold_graph_vertex.h" /** * @brief It is a path of scaffolds vertices in ScaffoldGraph. 
*/ class ScaffoldGraphPath { public: ScaffoldGraphPath() {} ScaffoldGraphPath(const ScaffoldGraphPath &path) : vertices_(path.vertices_), distances_(path.distances_) {} const ScaffoldGraphPath &operator =(const ScaffoldGraphPath &path) { vertices_ = path.vertices_; distances_ = path.distances_; return *this; } ScaffoldGraphVertexAdaptor &operator [](uint32_t index) { return vertices_[index]; } const ScaffoldGraphVertexAdaptor &operator [](uint32_t index) const { return vertices_[index]; } void Append(const ScaffoldGraphVertexAdaptor &vertex, int d) { vertices_.push_back(vertex); if (vertices_.size() > 1) distances_.push_back(d); } void Pop() { vertices_.pop_back(); if (!distances_.empty()) distances_.pop_back(); } const ScaffoldGraphPath &ReverseComplement() { std::reverse(vertices_.begin(), vertices_.end()); for (unsigned i = 0; i < vertices_.size(); ++i) vertices_[i].ReverseComplement(); std::reverse(distances_.begin(), distances_.end()); return *this; } void Assemble(ContigGraphPath &path) { path.clear(); for (unsigned i = 0; i < vertices_.size(); ++i) { if (i == 0) path.Append(vertices_[i].path(), 0); else path.Append(vertices_[i].path(), distances_[i-1]); } } void swap(ScaffoldGraphPath &path) { if (this != &path) { vertices_.swap(path.vertices_); distances_.swap(path.distances_); } } ScaffoldGraphVertexAdaptor &front() { return vertices_.front(); } const ScaffoldGraphVertexAdaptor &front() const { return vertices_.front(); } ScaffoldGraphVertexAdaptor &back() { return vertices_.back(); } const ScaffoldGraphVertexAdaptor &back() const { return vertices_.back(); } // uint64_t kmer_count() const // { // uint64_t sum = 0; // for (unsigned i = 0; i < vertices_.size(); ++i) // sum += vertices_[i].kmer_count(); // return sum; // } uint32_t size() const { uint32_t size = 0; for (unsigned i = 0; i < vertices_.size(); ++i) size += vertices_[i].path_size(); for (unsigned i = 0; i < distances_.size(); ++i) size += distances_[i]; return size; } // uint32_t 
internal_size(int kmer_size) const // { // if (vertices_.size() <= 1) // return vertices_.size(); // // uint32_t size = kmer_size + 1; // for (unsigned i = 1; i+1 < vertices_.size(); ++i) // size += vertices_[i].contig_size(); // for (unsigned i = 0; i < distances_.size(); ++i) // size += distances_[i]; // return size; // } uint32_t num_nodes() const { return vertices_.size(); } void clear() { vertices_.clear(); distances_.clear(); } std::deque &distances() { return distances_; } const std::deque &distances() const { return distances_; } private: std::deque vertices_; std::deque distances_; }; namespace std { template <> inline void swap(ScaffoldGraphPath &x, ScaffoldGraphPath &y) { x.swap(y); } } #endif idba-1.1.3/src/graph/contig_graph.cpp0000664000175000017500000012566712677406270014432 00000000000000/** * @file contig_graph.cpp * @brief * @author Yu Peng (ypeng@cs.hku.hk) * @version 1.0.0 * @date 2011-08-26 */ #include "graph/contig_graph.h" #include #include #include #include #include #include #include "graph/contig_graph_branch_group.h" #include "sequence/sequence.h" using namespace std; double ContigGraph::Binormial(int n, int m) { double product = 1; for (int i = 1; i <= n; ++i) product *= i; for (int i = 1; i <= m; ++i) product /= i; return product; } void ContigGraph::InitializeTable() { const double err = 0.01; double p_err = err/3 * pow(1-err, double(kmer_size_-1)); p_table.resize(10); for (unsigned i = 0; i < 10; ++i) { p_table.resize(1000); } for (int m = 1; m < 10; ++m) { for (int x = 1; x < 1000; ++x) { double sum = 0; for (int i = 0; i <= x - m; ++i) sum += 3 * Binormial(x - m - i + 2, 2) * pow(p_err, x-i) * pow(1 - p_err, i); for (int i = 0; i <= x - 2*m; ++i) sum -= 3 * Binormial(x - 2*m - i + 2, 2) * pow(p_err, x-i) * pow(1 - p_err, i); for (int i = 0; i <= x - 3*m; ++i) sum += Binormial(x - 3*m - i + 2, 2) * pow(p_err, x-i) * pow(1 - p_err, i); p_table[m][x] = sum; } } } double ContigGraph::Threshold(double k, double mean, double sd, double 
p_false) { if (mean > 150) return mean; const double Pi = 3.1415926535; double x = 0; double sum = 0; double step = 0.1; while (sum < p_false && x < mean) { double y = 1 / sqrt(2*Pi*sd*sd) * exp(-(x-mean)*(x-mean)/(2*sd*sd)) * 4 * k * p_table[1][int(x)] * step; sum += y; x += step; } cout << x << " " << mean << endl; return x; } void ContigGraph::Initialize(const deque &contigs, const deque &contig_infos) { vertices_.clear(); vertices_.resize(contigs.size()); #pragma omp parallel for schedule(static, 1) for (int64_t i = 0; i < (int64_t)contigs.size(); ++i) { vertices_[i].clear(); vertices_[i].set_contig(contigs[i]); vertices_[i].set_contig_info(contig_infos[i]); vertices_[i].set_id(i); } RefreshEdges(); } void ContigGraph::BuildEdgeCountTable() { edge_count_table_.clear(); edge_count_table_.set_kmer_size(kmer_size_+1); #pragma omp parallel for schedule(static, 1) for (int64_t i = 0; i < (int64_t)vertices_.size(); ++i) { for (int strand = 0; strand < 2; ++strand) { ContigGraphVertexAdaptor current(&vertices_[i], strand); Kmer kmer = current.end_kmer(kmer_size_); kmer.resize(kmer_size_+1); for (int x = 0; x < 4; ++x) { if (current.out_edges()[x]) { kmer.set_base(kmer_size_, x); edge_count_table_.InsertVertex(kmer); } } } } edge_count_table_.ClearCount(); } void ContigGraph::Refresh() { RefreshVertices(); RefreshEdges(); } void ContigGraph::RefreshVertices() { uint64_t index = 0; for (unsigned i = 0; i < vertices_.size(); ++i) { if (!vertices_[i].status().IsDead()) { vertices_[index].swap(vertices_[i]); vertices_[index].set_id(index); ++index; } } vertices_.resize(index); } void ContigGraph::RefreshEdges() { BuildBeginKmerMap(); uint64_t total_degree = 0; #pragma omp parallel for reduction(+: total_degree) for (int64_t i = 0; i < (int64_t)vertices_.size(); ++i) { for (int strand = 0; strand < 2; ++strand) { ContigGraphVertexAdaptor current(&vertices_[i], strand); for (int x = 0; x < 4; ++x) { if (current.out_edges()[x]) { Kmer kmer = current.end_kmer(kmer_size_); 
kmer.ShiftAppend(x); if (FindVertexAdaptorByBeginKmer(kmer).is_null()) current.out_edges().Remove(x); } } //#pragma omp atomic total_degree += current.out_edges().size(); } if (vertices_[i].contig().size() == kmer_size_ && vertices_[i].contig().IsPalindrome()) { vertices_[i].in_edges() = vertices_[i].out_edges() | vertices_[i].out_edges(); vertices_[i].out_edges() = vertices_[i].in_edges(); } } num_edges_ = total_degree / 2; } void ContigGraph::AddAllEdges() { #pragma omp parallel for for (int64_t i = 0; i < (int64_t)vertices_.size(); ++i) { vertices_[i].in_edges() = 15; vertices_[i].out_edges() = 15; } RefreshEdges(); } void ContigGraph::RemoveAllEdges() { #pragma omp parallel for for (int64_t i = 0; i < (int64_t)vertices_.size(); ++i) { vertices_[i].in_edges() = 0; vertices_[i].out_edges() = 0; } RefreshEdges(); } void ContigGraph::ClearStatus() { #pragma omp parallel for for (int64_t i = 0; i < (int64_t)vertices_.size(); ++i) vertices_[i].status().clear(); } void ContigGraph::MergeSimilarPath() { #pragma omp parallel for for (int64_t i = 0; i < (int64_t)vertices_.size(); ++i) { for (int strand = 0; strand < 2; ++strand) { ContigGraphVertexAdaptor current(&vertices_[i], strand); if (current.status().IsDead()) continue; if (current.out_edges().size() > 1) { deque neighbors; GetNeighbors(current, neighbors); sort(neighbors.begin(), neighbors.end(), CompareContigCoverage); for (unsigned j = 0; j < neighbors.size(); ++j) { if (neighbors[j].status().IsDead()) continue; for (unsigned k = j+1; k < neighbors.size(); ++k) { if (!neighbors[k].status().IsDead() && neighbors[j].in_edges() == neighbors[k].in_edges() && neighbors[j].out_edges() == neighbors[k].out_edges() && neighbors[j].begin_kmer(kmer_size_-1) == neighbors[k].begin_kmer(kmer_size_-1) && neighbors[j].end_kmer(kmer_size_-1) == neighbors[k].end_kmer(kmer_size_-1) && GetSimilarity(neighbors[j], neighbors[k]) > 0.98) { neighbors[k].status().SetDeadFlag(); } } } } } } Refresh(); MergeSimplePaths(); } int64_t 
ContigGraph::Prune(int min_length) { uint64_t old_num_vertices = vertices_.size(); #pragma omp parallel for for (int64_t i = 0; i < (int64_t)vertices_.size(); ++i) { for (int strand = 0; strand < 2; ++strand) { ContigGraphVertexAdaptor current(&vertices_[i], strand); if (current.status().IsDead()) continue; if (current.out_edges().size() <= 1) continue; int maximum = 0; int depth = GetDepth(current, kmer_size_ - 1, maximum, min_length + kmer_size_ - 1); if (depth > min_length + (int)kmer_size_ - 1) depth = min_length + (int)kmer_size_ - 1; deque neighbors; GetNeighbors(current, neighbors); for (unsigned j = 0; j < neighbors.size(); ++j) { if (neighbors[j].in_edges().size() == 1 && neighbors[j].out_edges().size() == 0 && (int)neighbors[j].contig_size() < depth) neighbors[j].status().SetDeadFlag(); } } } Refresh(); MergeSimplePaths(); return old_num_vertices - vertices_.size(); } int64_t ContigGraph::Trim(int min_length) { uint64_t old_num_vertices = vertices_.size(); #pragma omp parallel for for (int64_t i = 0; i < (int64_t)vertices_.size(); ++i) { if (vertices_[i].contig().size() == kmer_size_ && vertices_[i].contig().IsPalindrome()) continue; if ((vertices_[i].in_edges().empty() || vertices_[i].out_edges().empty()) && vertices_[i].contig().size() < min_length + kmer_size_ - 1 && (vertices_[i].in_edges().size() + vertices_[i].out_edges().size() <= 1) ) { vertices_[i].status().SetDeadFlag(); } } Refresh(); MergeSimplePaths(); return old_num_vertices - vertices_.size(); } int64_t ContigGraph::Trim(int min_length, double min_cover) { uint64_t old_num_vertices = vertices_.size(); #pragma omp parallel for for (int64_t i = 0; i < (int64_t)vertices_.size(); ++i) { if (vertices_[i].contig().size() == kmer_size_ && vertices_[i].contig().IsPalindrome()) continue; if ((vertices_[i].in_edges().empty() || vertices_[i].out_edges().empty()) && vertices_[i].contig().size() < min_length + kmer_size_ - 1 && (vertices_[i].in_edges().size() + vertices_[i].out_edges().size() <= 1 && 
vertices_[i].coverage() < min_cover) ) { vertices_[i].status().SetDeadFlag(); } } Refresh(); MergeSimplePaths(); return old_num_vertices - vertices_.size(); } int64_t ContigGraph::RemoveStandAlone(int min_length) { uint64_t old_num_vertices = vertices_.size(); #pragma omp parallel for for (int64_t i = 0; i < (int64_t)vertices_.size(); ++i) { if (vertices_[i].contig().size() == kmer_size_ && vertices_[i].contig().IsPalindrome()) continue; if ((vertices_[i].in_edges().empty() && vertices_[i].out_edges().empty()) && vertices_[i].contig().size() < min_length + kmer_size_ - 1 ) { vertices_[i].status().SetDeadFlag(); } } Refresh(); MergeSimplePaths(); return old_num_vertices - vertices_.size(); } int64_t ContigGraph::RemoveDeadEnd(int min_length) { uint64_t num_deadend = 0; int l = 1; while (true) { l = min(2*l, min_length); num_deadend += Trim(l); if (l == min_length) break; } num_deadend += Trim(min_length); return num_deadend; } int64_t ContigGraph::RemoveDeadEnd(int min_length, double min_cover) { uint64_t num_deadend = 0; int l = 1; while (true) { l = min(2*l, min_length); num_deadend += Trim(l, min_cover); if (l == min_length) break; } num_deadend += Trim(min_length); return num_deadend; } int64_t ContigGraph::RemoveBubble() { deque candidates; omp_lock_t bubble_lock; omp_init_lock(&bubble_lock); #pragma omp parallel for schedule(static, 1) for (int64_t i = 0; i < (int64_t)vertices_.size(); ++i) { for (int strand = 0; strand < 2; ++strand) { ContigGraphVertexAdaptor current(&vertices_[i], strand); if (current.out_edges().size() > 1 && current.contig_size() > kmer_size_) { ContigGraphBranchGroup branch_group(this, current, 4, kmer_size_ + 2); if (branch_group.Search()) { ContigGraphVertexAdaptor begin = branch_group.begin(); ContigGraphVertexAdaptor end = branch_group.end(); begin.ReverseComplement(); end.ReverseComplement(); std::swap(begin, end); ContigGraphBranchGroup rev_branch_group(this, begin, 4, kmer_size_ + 2); if (rev_branch_group.Search() && 
rev_branch_group.end() == end) { omp_set_lock(&bubble_lock); candidates.push_back(current); omp_unset_lock(&bubble_lock); } } } } } int64_t bubble = 0; for (unsigned i = 0; i < candidates.size(); ++i) { ContigGraphVertexAdaptor current = candidates[i]; if (current.out_edges().size() > 1) { ContigGraphBranchGroup branch_group(this, current, 4, kmer_size_ + 2); if (branch_group.Search()) { ContigGraphVertexAdaptor begin = branch_group.begin(); ContigGraphVertexAdaptor end = branch_group.end(); begin.ReverseComplement(); end.ReverseComplement(); std::swap(begin, end); ContigGraphBranchGroup rev_branch_group(this, begin, 4, kmer_size_ + 2); if (rev_branch_group.Search() && rev_branch_group.end() == end) { branch_group.Merge(); ++bubble; } } } } Refresh(); MergeSimplePaths(); return bubble; } double ContigGraph::IterateCoverage(int min_length, double min_cover, double max_cover, double factor) { min_cover = min(min_cover, max_cover); while (true) { RemoveLowCoverage(min_cover, min_length); min_cover *= factor; if (min_cover >= max_cover) break; } return min_cover; } double ContigGraph::IterateLocalCoverage(int min_length, double ratio, double min_cover, double max_cover, double factor) { in_kmer_count_table_.reserve(vertices_.size()); min_cover = min(min_cover, max_cover); while (true) { bool is_changed = RemoveLocalLowCoverage(min_cover, min_length, ratio); if (!is_changed) break; if (min_cover >= max_cover) break; min_cover *= factor; } return min_cover; } double ContigGraph::IterateComponentCoverage(int min_length, double ratio, double min_cover, double max_cover, double factor, int max_component_size) { in_kmer_count_table_.reserve(vertices_.size()); min_cover = min(min_cover, max_cover); while (true) { bool is_changed = RemoveComponentLowCoverage(min_cover, min_length, ratio, max_component_size); if (!is_changed) break; if (min_cover >= max_cover) break; min_cover *= factor; } return min_cover; } double ContigGraph::IterateComponentCoverage2(int min_length, double 
ratio, double min_cover, double max_cover, double factor, int max_component_size) { in_kmer_count_table_.reserve(vertices_.size()); min_cover = min(min_cover, max_cover); while (true) { bool is_changed = RemoveComponentLowCoverage2(min_cover, min_length, ratio, max_component_size); if (!is_changed) break; if (min_cover >= max_cover) break; min_cover *= factor; } return min_cover; } bool ContigGraph::RemoveLowCoverage(double min_cover, int min_length) { bool is_changed = false; #pragma omp parallel for schedule(static, 1) for (int64_t i = 0; i < (int64_t)vertices_.size(); ++i) { ContigGraphVertexAdaptor current(&vertices_[i]); if (current.contig_size() < min_length + kmer_size_ - 1 && ((current.in_edges().size() <= 1 && current.out_edges().size() <= 1) || current.in_edges().size() == 0 || current.out_edges().size() == 0) ) { if (current.coverage() < min_cover) { is_changed = true; current.status().SetDeadFlag(); } } } Refresh(); //Trim(min_length); MergeSimplePaths(); return is_changed; } bool ContigGraph::RemoveLocalLowCoverage(double min_cover, int min_length, double ratio) { int region_length = 1000; //int region_length = 100; bool is_changed = false; #pragma omp parallel for schedule(static, 1) for (int64_t i = 0; i < (int64_t)vertices_.size(); ++i) { ContigGraphVertexAdaptor current(&vertices_[i]); if (current.contig_size() < min_length + kmer_size_ - 1 && ((current.in_edges().size() <= 1 && current.out_edges().size() <= 1) || current.in_edges().size() == 0 || current.out_edges().size() == 0) ) { if (is_changed && current.coverage() > min_cover) continue; double mean = LocalCoverage(current, region_length); double threshold = min_cover; if (min_cover < mean * ratio) is_changed = true; else threshold = mean * ratio; if (current.coverage() < threshold) { is_changed = true; current.status().SetDeadFlag(); } } } Refresh(); //Trim(min_length); MergeSimplePaths(); return is_changed; } bool ContigGraph::RemoveComponentLowCoverage(double min_cover, int min_length, 
double ratio, int max_component_size)
{
    // Flags short, non-branching vertices as dead using a threshold derived
    // from local coverage and, for components with more than
    // max_component_size vertices, from the component's average coverage.
    // Returns true if anything was flagged or if a larger min_cover could
    // still have an effect.
    int region_length = 300;
    // NOTE(review): template arguments on the containers below look stripped
    // by extraction — TODO confirm against upstream.
    deque > components;
    deque component_strings;
    GetComponents(components, component_strings);
    deque average_coverage(components.size());
    deque component_id_table(vertices_.size());

    // Per component: average coverage = total k-mer count / total number of
    // k-mers, and record which component every vertex id belongs to.
#pragma omp parallel for schedule(dynamic)
    for (int64_t i = 0; i < (int64_t)components.size(); ++i)
    {
        double total_kmer_count = 0;
        double total = 0;
        for (unsigned j = 0; j < components[i].size(); ++j)
        {
            total_kmer_count += components[i][j].kmer_count();
            total += components[i][j].contig_size() - kmer_size_ + 1;
            component_id_table[components[i][j].id()] = i;
        }

        average_coverage[i] = total_kmer_count / total;
    }

    bool is_changed = false;
    //int max_component_size = 30;

    // NOTE(review): is_changed is read and written inside this parallel loop
    // without synchronisation — confirm this is intended.
#pragma omp parallel for schedule(static, 1)
    for (int64_t i = 0; i < (int64_t)vertices_.size(); ++i)
    {
        ContigGraphVertexAdaptor current(&vertices_[i]);
        int id = component_id_table[current.id()];

        // Components with 10 or fewer vertices are left untouched.
        if (components[id].size() <= 10)
            continue;

        if (current.contig_size() < min_length + kmer_size_ - 1
                && (current.in_edges().size() <= 1 && current.out_edges().size() <= 1)
                    //|| current.in_edges().size() == 0 || current.out_edges().size() == 0)
           )
        {
            if (is_changed && current.coverage() > min_cover)
                continue;

            double threshold = min_cover;
            double mean = LocalCoverage(current, region_length);
            //double mean = average_coverage[id];
            if (min_cover < ratio * mean || ((int)components[id].size() > max_component_size && min_cover < average_coverage[id]))
                is_changed = true;
            else
                threshold = ratio * mean;

            if (current.coverage() < threshold
                    || ((int)components[id].size() > max_component_size && current.coverage() < average_coverage[id]))
            {
                is_changed = true;
                current.status().SetDeadFlag();
            }
        }
    }

    Refresh();
    MergeSimplePaths();

    return is_changed;
}

bool ContigGraph::RemoveComponentLowCoverage2(double min_cover, int min_length, double ratio, int max_component_size)
{
    // Variant of RemoveComponentLowCoverage(); the setup below is the same.
    int region_length = 300;
    deque > components;
    deque component_strings;
    GetComponents(components, component_strings);
    deque 
average_coverage(components.size());
    deque component_id_table(vertices_.size());

    // Per component: average coverage and vertex-id -> component-index table.
#pragma omp parallel for schedule(dynamic)
    for (int64_t i = 0; i < (int64_t)components.size(); ++i)
    {
        double total_kmer_count = 0;
        double total = 0;
        // NOTE(review): histgram is only referenced by the commented-out
        // lines below.
        Histgram histgram;
        for (unsigned j = 0; j < components[i].size(); ++j)
        {
            total_kmer_count += components[i][j].kmer_count();
            total += components[i][j].contig_size() - kmer_size_ + 1;
            component_id_table[components[i][j].id()] = i;
//            SequenceCount counts = components[i][j].counts();
//            for (unsigned k = 0; k < counts.size(); ++k)
//                histgram.insert(counts[k]);
        }

        average_coverage[i] = total_kmer_count / total;
    }

    bool is_changed = false;
    //int max_component_size = 30;

    // NOTE(review): is_changed is read and written inside this parallel loop
    // without synchronisation — confirm this is intended.
#pragma omp parallel for schedule(static, 1)
    for (int64_t i = 0; i < (int64_t)vertices_.size(); ++i)
    {
        ContigGraphVertexAdaptor current(&vertices_[i]);
        int id = component_id_table[current.id()];

        // Components with 10 or fewer vertices are left untouched.
        if (components[id].size() <= 10)
            continue;

        if (current.contig_size() < min_length + kmer_size_ - 1
                && (current.in_edges().size() <= 1 && current.out_edges().size() <= 1)
                    //|| current.in_edges().size() == 0 || current.out_edges().size() == 0)
           )
        {
            if (is_changed && current.coverage() > min_cover)
                continue;

            double threshold = min_cover;
            double mean = LocalCoverage(current, region_length);
            //double mean = average_coverage[id];
            // Second threshold, used only for components larger than
            // max_component_size; computed by Threshold() from the
            // component's average coverage.
            double threshold2 = Threshold(kmer_size_, average_coverage[id], average_coverage[id]/10, 0.01);
            if (min_cover < ratio * mean || ((int)components[id].size() > max_component_size && min_cover < threshold2))
                is_changed = true;
            else
                threshold = ratio * mean;

            if (current.coverage() < threshold
                    || ((int)components[id].size() > max_component_size && current.coverage() < threshold2))
            {
                is_changed = true;
                current.status().SetDeadFlag();
            }
        }
    }

    Refresh();
    MergeSimplePaths();

    return is_changed;
}

/**
 * @brief Mean k-mer count in the neighbourhood of a vertex, accumulated by
 * LocalCoverageSingle() in both strand directions.
 */
double ContigGraph::LocalCoverage(ContigGraphVertexAdaptor current, int region_length)
{
    double num_count = 0;
    int num_kmer = 0;
    LocalCoverageSingle(current, region_length, num_count, 
num_kmer); LocalCoverageSingle(current.ReverseComplement(), region_length, num_count, num_kmer); if (num_kmer == 0) //return 1e100; return 0; else return num_count / num_kmer; } double ContigGraph::LocalCoverageSingle(ContigGraphVertexAdaptor current, int region_length, double &total_count, int &total_kmer) { map visited; deque qu; qu.push_back(current); visited[current.id()] = 0; int index = 0; int num_added = 0; int num_count = 0; int num_kmer = 0; while (index < (int)qu.size()) { current = qu[index++]; if (num_added >= 4 * region_length) break; if (visited.size() > 32) break; if (visited[current.id()] >= region_length) continue; int dist = visited[current.id()]; for (int x = 0; x < 4; ++x) { if (current.out_edges()[x]) { ContigGraphVertexAdaptor next = GetNeighbor(current, x); if (visited.find(next.id()) == visited.end()) { visited[next.id()] = dist + next.num_kmer(); qu.push_back(next); if ((int)next.num_kmer() + dist > region_length) { if ((int)next.num_kmer() < region_length) { num_count += (int64_t)next.kmer_count() * (region_length - dist) / next.num_kmer(); num_kmer += region_length - dist; num_added += region_length - dist; } else { Kmer begin = next.begin_kmer(kmer_size_); if (in_kmer_count_table_.find(begin) == in_kmer_count_table_.end()) { int in_kmer_count = 0; for (int i = 0; i < region_length; ++i) in_kmer_count += next.get_count(i); in_kmer_count_table_[begin] = in_kmer_count; } num_count += (int64_t)in_kmer_count_table_[begin] * (region_length - dist) / region_length; num_kmer += region_length - dist; num_added += region_length - dist; } } else { num_count += next.kmer_count(); num_kmer += next.num_kmer(); num_added += next.num_kmer(); } } } } } total_count += num_count; total_kmer += num_kmer; if (num_kmer == 0) return 0; else return num_count * 1.0 / num_kmer; } void ContigGraph::MergeSimplePaths() { deque contigs; deque contig_infos; Assemble(contigs, contig_infos); Initialize(contigs, contig_infos); } int64_t ContigGraph::Assemble(deque 
&contigs, deque &contig_infos)
{
    // Fills contigs / contig_infos with one entry per assembled path and
    // returns the number of contigs produced.
    contigs.clear();
    contig_infos.clear();

    // Guards the two output containers, which are appended to from inside
    // the parallel loops below.
    omp_lock_t contig_lock;
    omp_init_lock(&contig_lock);

    // Pass 1: a vertex whose contig has exactly kmer_size_ bases and is a
    // palindrome is locked (with id omp_get_max_threads()) and emitted
    // unchanged as its own contig.
#pragma omp parallel for
    for (int64_t i = 0; i < (int64_t)vertices_.size(); ++i)
    {
        if (vertices_[i].contig().size() == kmer_size_ && vertices_[i].contig().IsPalindrome())
        {
            vertices_[i].status().Lock(omp_get_max_threads());

            Sequence contig = vertices_[i].contig();
            //ContigInfo contig_info(vertices_[i].kmer_count(), vertices_[i].in_edges(), vertices_[i].out_edges());
            ContigInfo contig_info;
            contig_info.set_kmer_count(vertices_[i].kmer_count());
            contig_info.in_edges() = vertices_[i].in_edges();
            contig_info.out_edges() = vertices_[i].out_edges();

            omp_set_lock(&contig_lock);
            contigs.push_back(contig);
            contig_infos.push_back(contig_info);
            omp_unset_lock(&contig_lock);
        }
    }

    //cout << "palindrome " << contigs.size() << endl;

    // Pass 2: each thread tries to lock a start vertex and extends a path
    // from it in both strand directions.
#pragma omp parallel for schedule(static, 1)
    for (int64_t i = 0; i < (int64_t)vertices_.size(); ++i)
    {
        if (!vertices_[i].status().Lock(omp_get_thread_num()))
            continue;

        ContigGraphPath path;
        path.Append(ContigGraphVertexAdaptor(&vertices_[i]), 0);

        Sequence contig;
        ContigInfo contig_info;
        for (int strand = 0; strand < 2; ++strand)
        {
            while (true)
            {
                ContigGraphVertexAdaptor current = path.back();
                ContigGraphVertexAdaptor next;

                if (!GetNextVertexAdaptor(current, next))
                    break;

                if (IsLoop(path, next))
                    break;

                // If the next vertex cannot be taken over, this path is
                // abandoned without emitting a contig (jump to FAIL).
                if (!next.status().LockPreempt(omp_get_thread_num()))
                    goto FAIL;

                path.Append(next, -kmer_size_ + 1);
            }

            // Continue the extension from the other end.
            path.ReverseComplement();
        }

        path.Assemble(contig, contig_info);

        omp_set_lock(&contig_lock);
        contigs.push_back(contig);
        contig_infos.push_back(contig_info);
        omp_unset_lock(&contig_lock);
FAIL:
        ;
    }

    omp_destroy_lock(&contig_lock);
    ClearStatus();

    return contigs.size();
}

/** @brief Queue entry used by ContigGraph::IsConverged(). */
struct SearchNode
{
    ContigGraphVertexAdaptor node;  // vertex reached by the search
    int distance;                   // accumulated distance when node was queued
    int label;                      // index x of the out-edge the search started from
};

bool ContigGraph::IsConverged(ContigGraphVertexAdaptor current)
{
    int TimeLimit = 1000;
    int DistanceLimit = 300;
    // NOTE(review): template arguments on the two containers below look
    // stripped by extraction — TODO confirm against upstream.
    map reachable;
    queue qu;

    for (int x = 0; x < 4; ++x)
    {
        if 
(current.out_edges()[x])
        {
            // Seed the search with every out-neighbour, labelled by the
            // edge index it was reached through.
            SearchNode search_node;
            search_node.node = GetNeighbor(current, x);
            search_node.distance = -(int)kmer_size_ + 1;
            search_node.label = x;

            //if (!search_node.node.status().IsDead())
            qu.push(search_node);
        }
    }

    int time = 0;
    while (!qu.empty())
    {
        // Bound the number of queue pops.
        if (time++ == TimeLimit)
            break;

        SearchNode search_node = qu.front();
        qu.pop();

        // reachable[v] is a bit mask of the start-edge labels that reach v.
        reachable[search_node.node] |= (1 << search_node.label);

        //cout << (reachable[search_node.node] == current.out_edges()) << " " << reachable[search_node.node] << " " << (int)current.out_edges() << endl;

        // Converged: this vertex is reached from every out-edge of current.
        if (reachable[search_node.node] == (int)current.out_edges())
        {
            return true;
        }

        // Do not expand past the distance limit.
        if (search_node.distance + (int)search_node.node.contig_size() - (int)kmer_size_ + 1 > DistanceLimit)
            continue;

        for (int x = 0; x < 4; ++x)
        {
            if (search_node.node.out_edges()[x])
            {
                ContigGraphVertexAdaptor next = GetNeighbor(search_node.node, x);

                SearchNode new_search_node;
                new_search_node.node = next;
                new_search_node.distance = search_node.distance + (int)search_node.node.contig_size() - (int)kmer_size_ + 1;
                new_search_node.label = search_node.label;

//                if (new_search_node.node == current)
//                    continue;

                // Already reached under this label: nothing new to learn.
                if (reachable[new_search_node.node] & (1 << new_search_node.label))
                    continue;

                //if (!new_search_node.node.status().IsDead())
                qu.push(new_search_node);
            }
        }
    }

    return false;
}

/**
 * @brief Remove the out-edges of every vertex orientation for which
 * IsConverged() is false, and of the orientations added while processing
 * them.
 *
 * @return the number of vertex orientations for which IsConverged() was false.
 */
int64_t ContigGraph::SplitBranches()
{
    //cout << num_vertices() << " " << num_edges() << endl;

    deque branches;

    omp_lock_t lock;
    omp_init_lock(&lock);

    int64_t count = 0;
#pragma omp parallel for
    for (int64_t i = 0; i < (int64_t)vertices_.size(); ++i)
    {
        ContigGraphVertexAdaptor current(&vertices_[i]);
        for (int strand = 0; strand < 2; ++strand)
        {
            if (!IsConverged(current))
            {
#pragma omp atomic
                ++count;

                omp_set_lock(&lock);
                branches.push_back(current);
                omp_unset_lock(&lock);
            }
            current.ReverseComplement();
        }
    }
    omp_destroy_lock(&lock);

    set sources;
    for (unsigned i = 0; i < branches.size(); ++i)
        sources.insert(branches[i]);

    // branches grows while it is being iterated (indices stay valid).
    for (unsigned i = 0; i < branches.size(); ++i)
    {
ContigGraphVertexAdaptor u = branches[i];
        for (int x = 0; x < 4; ++x)
        {
            if (u.out_edges()[x])
            {
                // The reverse complement of each neighbour is also added to
                // the work list (once), so its out-edges are removed too.
                ContigGraphVertexAdaptor v = GetNeighbor(u, x);
                v.ReverseComplement();
                if (sources.find(v) == sources.end())
                {
                    sources.insert(v);
                    branches.push_back(v);
                }
                //RemoveEdge(u, x);
            }
        }
    }

    // Remove every out-edge of every collected vertex orientation.
    for (unsigned i = 0; i < branches.size(); ++i)
    {
        ContigGraphVertexAdaptor u = branches[i];
        for (int x = 0; x < 4; ++x)
        {
            if (u.out_edges()[x])
                RemoveEdge(u, x);
        }
    }

    RefreshEdges();

    return count;
}

/**
 * @brief Call SplitBranches() up to 100 times, stopping as soon as two
 * consecutive calls return the same count.
 */
void ContigGraph::Decomposite()
{
    int64_t last = 0;
    for (int i = 0; i < 100; ++i)
    {
        int64_t split = SplitBranches();
        //cout << split << " " << 2*vertices_.size() << endl;

        if (last == split)
            break;
        last = split;
    }
}

/**
 * @brief Collect the connected components of the graph.
 *
 * For every component, components receives the vertices in the order they
 * were queued and component_strings receives a text listing with one edge
 * per line ("id_isreverse_size_kmercount id_isreverse_size_kmercount").
 * Vertex status flags are used as visit marks and cleared at the end.
 */
void ContigGraph::GetComponents(deque > &components, deque &component_strings)
{
    components.clear();
    component_strings.clear();

    for (unsigned i = 0; i < vertices().size(); ++i)
    {
        if (vertices()[i].status().IsUsed())
            continue;

        deque qu;
        qu.push_back(ContigGraphVertexAdaptor(&vertices()[i], 0));
        vertices()[i].status().SetUsedFlag();

        stringstream ss;
        // qu doubles as the BFS queue and as the resulting component.
        for (int index = 0; index < (int)qu.size(); ++index)
        {
            ContigGraphVertexAdaptor current = qu[index];

            // strand 0 follows out-edges of current; strand 1 follows the
            // out-edges of its reverse complement.
            for (int strand = 0; strand < 2; ++strand)
            {
                //for (connection_list_iterator p = connections()[current].begin(); p != connections()[current].end(); ++p)
                for (int x = 0; x < 4; ++x)
                {
                    if (current.out_edges()[x])
                    {
                        ContigGraphVertexAdaptor next = GetNeighbor(current, x);

                        if (strand == 0)
                        {
                            ss << current.id() << "_" << current.is_reverse() << "_" << current.contig_size() << "_" << current.kmer_count() << " " << next.id() << "_" << next.is_reverse() << "_" << next.contig_size() << "_" << next.kmer_count() << endl;

                            if (!next.status().IsUsed())
                                qu.push_back(next);
                        }
                        else
                        {
                            ss << next.id() << "_" << next.is_reverse() << "_" << next.contig_size() << "_" << next.kmer_count() << " " << current.id() << "_" << current.is_reverse() << "_" << current.contig_size() << "_" << current.kmer_count() << endl;

                            if (!next.status().IsUsed())
qu.push_back(next.ReverseComplement());
                        }

                        next.status().SetUsedFlag();
                    }
                }

                current.ReverseComplement();
            }
        }

        components.push_back(qu);
        component_strings.push_back(ss.str());
    }

    ClearStatus();
}

/**
 * @brief Append one consensus contig per component to contigs.
 *
 * If a component has no begin vertex, no end vertex, or fails IsValid(),
 * all of its contigs are appended unchanged; otherwise the contig assembled
 * from FindLongestPath() is appended.  contigs is not cleared first.
 */
void ContigGraph::GetConsensus(deque &contigs)
{
    deque > components;
    deque component_strings;
    GetComponents(components, component_strings);
    for (unsigned i = 0; i < components.size(); ++i)
    {
        ContigGraphVertexAdaptor begin = GetBeginVertexAdaptor(components[i]);
        ContigGraphVertexAdaptor end = GetEndVertexAdaptor(components[i]);

        if (begin.is_null() || end.is_null() || !IsValid(components[i]))
        {
            for (unsigned j = 0; j < components[i].size(); ++j)
                contigs.push_back(components[i][j].contig());
        }
        else
        {
            ContigGraphPath path;
            FindLongestPath(components[i], path);
            Sequence contig;
            ContigInfo contig_info;
            path.Assemble(contig, contig_info);
            contigs.push_back(contig);
        }
    }
}

/**
 * @brief Breadth-first search for a path from `from` to `to`.
 *
 * The search gives up after fewer than 100 queue pops.  On success, path
 * holds the vertices from `from` to `to`, each appended with distance
 * -kmer_size_ + 1.
 *
 * @return true if `to` was reached.
 */
bool ContigGraph::FindPath(ContigGraphVertexAdaptor from, ContigGraphVertexAdaptor to, ContigGraphPath &path)
{
    path.clear();
    // NOTE(review): template arguments on the containers below look stripped
    // by extraction — TODO confirm against upstream.
    map is_used;
    map prev;
    deque qu;
    qu.push_back(from);
    // The start vertex has a null predecessor, which ends the back-trace.
    prev[from] = ContigGraphVertexAdaptor(NULL);
    is_used[from.id()] = true;

    int time = 0;
    while (!qu.empty())
    {
        if (++time >= 100)
            break;

        if (prev.find(to) != prev.end())
            break;

        ContigGraphVertexAdaptor current = qu.front();
        qu.pop_front();

        deque neighbors;
        GetNeighbors(current, neighbors);
        for (unsigned i = 0; i < neighbors.size(); ++i)
        {
            ContigGraphVertexAdaptor next = neighbors[i];
            // Visits are tracked per vertex id, so a vertex is expanded in
            // only one orientation.
            //if (prev.find(next) == prev.end())
            if (!is_used[next.id()])
            {
                is_used[next.id()] = true;
                prev[next] = current;
                qu.push_back(next);
            }
        }
    }

    if (prev.find(to) != prev.end())
    {
        // Walk the predecessor chain back from `to`, then reverse it.
        deque tmp;
        tmp.push_back(to);
        while (!prev[tmp.back()].is_null())
            tmp.push_back(prev[tmp.back()]);
        reverse(tmp.begin(), tmp.end());

        for (unsigned i = 0; i < tmp.size(); ++i)
            path.Append(tmp[i], -kmer_size_ + 1);
        return true;
    }
    else
        return false;
}

/**
 * @brief Copy the contig and contig info of every vertex into the output
 * containers, indexed like vertices_.
 */
void ContigGraph::GetContigs(deque &contigs, deque &contig_infos)
{
    contigs.resize(vertices_.size());
    contig_infos.resize(vertices_.size());

#pragma 
omp parallel for for (int64_t i = 0; i < (int64_t)vertices_.size(); ++i) { contigs[i] = vertices_[i].contig(); contig_infos[i] = vertices_[i].contig_info(); } } double ContigGraph::GetSimilarity(const Sequence &a, const Sequence &b) { if (a.size() >= kMaxCheckSimilarity || b.size() > kMaxCheckSimilarity) return 0; vector > table; table.resize(a.size() + 1); for (unsigned i = 0; i < table.size(); ++i) table[i].resize(b.size() + 1); for (int i = 0; i <= (int)a.size(); ++i) table[i][0] = i; for (int j = 0; j <= (int)b.size(); ++j) table[0][j] = j; for (int i = 1; i <= (int)a.size(); ++i) { for (int j = 1; j <= (int)b.size(); ++j) { table[i][j] = 1000000000; if (table[i-1][j] + 1 < table[i][j]) table[i][j] = table[i-1][j] + 1; if (table[i][j-1] + 1 < table[i][j]) table[i][j] = table[i][j-1] + 1; if (table[i-1][j-1] + (a[i-1] != b[j-1]) < table[i][j]) table[i][j] = table[i-1][j-1] + (a[i-1] != b[j-1]); } } return 1.0 - 1.0 * table[a.size()][b.size()] / max(a.size(), b.size()); } void ContigGraph::BuildBeginKmerMap() { begin_kmer_map_.clear(); begin_kmer_map_.reserve(vertices_.size()*2); #pragma omp parallel for for (int64_t i = 0; i < (int64_t)vertices_.size(); ++i) { for (int strand = 0; strand < 2; ++strand) { ContigGraphVertexAdaptor current(&vertices_[i], strand); Kmer kmer = current.begin_kmer(kmer_size_); Kmer key = kmer.unique_format(); begin_kmer_map_[key] = i; } } } bool ContigGraph::CycleDetect(ContigGraphVertexAdaptor current, map &status) { if (status[current.id()] == 0) { bool flag = false; status[current.id()] = 1; for (int x = 0; x < 4; ++x) { if (current.out_edges()[x]) { if (CycleDetect(GetNeighbor(current, x), status)) flag = true; } } status[current.id()] = 2; return flag; } else if (status[current.id()] == 1) return true; else return false; } bool ContigGraph::IsValid(deque &component) { ContigGraphVertexAdaptor begin = GetBeginVertexAdaptor(component); ContigGraphVertexAdaptor end = GetEndVertexAdaptor(component); map status; if (CycleDetect(begin, 
status))
        return false;

    // Every vertex of the component must have been reached from begin.
    if (status.size() != component.size())
        return false;

    // Repeat the check from the reverse complement of end.
    status.clear();
    end.ReverseComplement();
    if (CycleDetect(end, status))
        return false;

    if (status.size() != component.size())
        return false;

    return true;
}

/**
 * @brief Build in path the longest path from the begin vertex to the end
 * vertex of a component.
 *
 * Vertices are relaxed in the order produced by TopSort(); the length of a
 * step is contig_size - kmer_size_ + 1 of the vertex being left.
 */
void ContigGraph::FindLongestPath(deque &component, ContigGraphPath &path)
{
    ContigGraphVertexAdaptor begin = GetBeginVertexAdaptor(component);
    ContigGraphVertexAdaptor end = GetEndVertexAdaptor(component);

    deque order;
    TopSort(component, order);

    // NOTE(review): template arguments on the containers here look stripped
    // by extraction — TODO confirm against upstream.
    map dist;
    map prev;
    dist[begin] = 0;
    // Null predecessor terminates the back-trace below.
    prev[begin] = ContigGraphVertexAdaptor(NULL);

    for (unsigned i = 0; i < order.size(); ++i)
    {
        ContigGraphVertexAdaptor current = order[i];
        for (int x = 0; x < 4; ++x)
        {
            if (current.out_edges()[x])
            {
                ContigGraphVertexAdaptor next = GetNeighbor(current, x);
                int tmp = dist[current] + (int)current.contig_size() - (int)kmer_size_ + 1;
                // Edges between the two orientations of one vertex are ignored.
                if (current.id() != next.id() && tmp > dist[next])
                {
                    dist[next] = tmp;
                    prev[next] = current;
                }
            }
        }
    }

    // Trace predecessors back from end, then reverse into forward order.
    deque v;
    v.push_back(end);
    while (!prev[v.back()].is_null())
        v.push_back(prev[v.back()]);
    reverse(v.begin(), v.end());

    path.clear();
    for (unsigned i = 0; i < v.size(); ++i)
        path.Append(v[i], -(int)kmer_size_ + 1);
}

/**
 * @brief Produce in order the reversed DFS post-order of the vertices
 * reachable from the component's begin vertex.
 */
void ContigGraph::TopSort(deque &component, deque &order)
{
    ContigGraphVertexAdaptor begin = GetBeginVertexAdaptor(component);
    // NOTE(review): end is not used below.
    ContigGraphVertexAdaptor end = GetEndVertexAdaptor(component);

    map status;
    TopSortDFS(order, begin, status);
    reverse(order.begin(), order.end());
}

/**
 * @brief Depth-first helper of TopSort(): appends current to order after
 * all of its out-neighbours have been processed.
 */
void ContigGraph::TopSortDFS(deque &order, ContigGraphVertexAdaptor current, map &status)
{
    if (status[current.id()] == 0)
    {
        status[current.id()] = 1;
        for (int x = 0; x < 4; ++x)
        {
            if (current.out_edges()[x])
                TopSortDFS(order, GetNeighbor(current, x), status);
        }
        order.push_back(current);
    }
}

/**
 * @brief Recursively follow neighbours, tracking in maximum the largest
 * depth seen; the search stops descending once maximum >= min_length.
 */
int ContigGraph::GetDepth(ContigGraphVertexAdaptor current, int depth, int &maximum, int min_length)
{
    if (depth > maximum)
        maximum = depth;

    if (maximum >= min_length)
        return min_length;

    deque neighbors;
    GetNeighbors(current, neighbors);
    for (unsigned i = 0; i < 
neighbors.size(); ++i)
    {
        if (neighbors[i].status().IsDead())
            continue;

        // The recursion reports its result through `maximum`.
        GetDepth(neighbors[i], depth - kmer_size_ + 1 + neighbors[i].contig_size(), maximum, min_length);
    }

    return min(maximum, min_length);
}

/**
 * @brief Best similarity between target's contig and any path that starts at
 * start and ends like target.
 *
 * @return 0 if start is dead or if start and target do not share the same
 * leading (kmer_size_-1)-mer and the same in-edges; otherwise the result of
 * the recursive overload.
 */
double ContigGraph::FindSimilarPath(ContigGraphVertexAdaptor target, ContigGraphVertexAdaptor start)
{
    if (start.status().IsDead()
            || target.begin_kmer(kmer_size_-1) != start.begin_kmer(kmer_size_-1)
            || target.in_edges() != start.in_edges())
        return 0;

    ContigGraphPath path;
    path.Append(start, 0);

    int time = 0;
    return FindSimilarPath(target, path, time);
}

/**
 * @brief Recursive step: extend path through every neighbour and return the
 * highest similarity found.
 *
 * @param time call counter shared by the whole search; the search returns 0
 * once it exceeds 100.
 */
double ContigGraph::FindSimilarPath(ContigGraphVertexAdaptor target, ContigGraphPath &path, int &time)
{
    if (++time > 100)
        return 0;

    ContigGraphVertexAdaptor current = path.back();
    // Give up on paths more than 10% longer than the target contig.
    if (path.size() > 1.1 * target.contig_size())
        return 0;
    else if (current.end_kmer(kmer_size_-1) == target.end_kmer(kmer_size_-1)
            && current.out_edges() == target.out_edges())
    {
        // The path ends the same way as the target: score it.
        Sequence contig;
        ContigInfo contig_info;
        path.Assemble(contig, contig_info);
        return GetSimilarity(target.contig(), contig);
    }
    else
    {
        double maximum = 0;
        deque neighbors;
        GetNeighbors(current, neighbors);
        for (unsigned i = 0; i < neighbors.size(); ++i)
        {
            path.Append(neighbors[i], -kmer_size_+1);
            double tmp = FindSimilarPath(target, path, time);
            path.Pop();
            if (tmp > maximum)
                maximum = tmp;
        }
        return maximum;
    }
}

idba-1.1.3/src/graph/scaffold_graph.cpp0000664000175000017500000004265112677406270014717 00000000000000
/**
 * @file scaffold_graph.cpp
 * @brief 
 * @author Yu Peng (ypeng@cs.hku.hk)
 * @version 1.0.10
 * @date 2012-04-16
 */

#include "graph/scaffold_graph.h"

#include
#include
#include
#include
#include
#include

#include "basic/math.h"

using namespace std;

/** @brief Orders edges by descending distance. */
static bool Compare(const ScaffoldGraphEdgeAdaptor &x, const ScaffoldGraphEdgeAdaptor &y)
{
    return x.distance() > y.distance();
}

/**
 * @brief Create one scaffold vertex per contig-graph vertex, each holding a
 * single-node path.
 */
void ScaffoldGraph::Initialize()
{
    deque &contig_graph_vertices = contig_graph_.vertices();
    vertices_.resize(contig_graph_vertices.size());
    for (unsigned i = 0; i < 
contig_graph_vertices.size(); ++i)
    {
        ContigGraphPath path;
        path.Append(ContigGraphVertexAdaptor(&contig_graph_vertices[i]), 0);
        vertices_[i].set_path(path);
        vertices_[i].set_id(i);
        vertices_[i].status().clear();
    }
}

/**
 * @brief Create one scaffold vertex per given path.
 */
void ScaffoldGraph::Initialize(std::deque &paths)
{
    vertices_.resize(paths.size());
    for (unsigned i = 0; i < paths.size(); ++i)
    {
        vertices_[i].set_path(paths[i]);
        vertices_[i].set_id(i);
        vertices_[i].status().clear();
    }
}

/**
 * @brief Rebuild contig_to_scaffold_ and contig_to_scaffold_position_.
 *
 * For both strands of every live scaffold vertex, each contig on its path is
 * mapped to that scaffold and to its offset from the start of the path.  A
 * contig that is encountered more than once is mapped to a
 * default-constructed scaffold adaptor and position -1.
 */
void ScaffoldGraph::BuildContigToScaffoldMap()
{
    contig_to_scaffold_.clear();
    contig_to_scaffold_position_.clear();
    for (unsigned i = 0; i < vertices_.size(); ++i)
    {
        if (vertices_[i].status().IsDead())
            continue;

        for (int strand = 0; strand < 2; ++strand)
        {
            ScaffoldGraphVertexAdaptor current(&vertices_[i], strand);
            ContigGraphPath path = current.path();

            // d: offset of path[j] from the start of the path.
            int d = 0;
            for (unsigned j = 0; j < path.num_nodes(); ++j)
            {
                if (contig_to_scaffold_.find(path[j]) != contig_to_scaffold_.end())
                {
                    // Seen before: mark the contig as ambiguous.
                    contig_to_scaffold_[path[j]] = ScaffoldGraphVertexAdaptor();
                    contig_to_scaffold_position_[path[j]] = -1;
                }
                else
                {
                    contig_to_scaffold_[path[j]] = current;
                    contig_to_scaffold_position_[path[j]] = d;
                }

                if (j+1 < path.num_nodes())
                {
                    d += path[j].contig_size();
                    d += path.distances()[j];
                }
            }
        }
    }
}

/**
 * @brief Rebuild all scaffold edges from pairs_.
 *
 * Each pair links two contigs; it is translated to the scaffolds containing
 * them and the distance is adjusted by the contigs' positions inside their
 * scaffolds.  Pairs whose contigs map to no scaffold or to the same scaffold
 * are skipped.
 */
void ScaffoldGraph::BuildEdges()
{
    BuildContigToScaffoldMap();

    in_edges_.clear();
    out_edges_.clear();
    in_edges_.resize(vertices_.size());
    out_edges_.resize(vertices_.size());
    edge_data_.clear();

    for (unsigned i = 0; i < pairs_.size(); ++i)
    {
        // A pair endpoint packs the contig index in the upper bits and the
        // strand in bit 0.
        ContigGraphVertexAdaptor contig_from(&contig_graph_.vertices()[pairs_[i].from()>>1], pairs_[i].from()&1);
        ContigGraphVertexAdaptor contig_to(&contig_graph_.vertices()[pairs_[i].to()>>1], pairs_[i].to()&1);

        ScaffoldGraphVertexAdaptor from = contig_to_scaffold_[contig_from];
        ScaffoldGraphVertexAdaptor to = contig_to_scaffold_[contig_to];

        if (from.is_null() || to.is_null() || from.id() == to.id())
            continue;

        int level = pairs_[i].level();
        int distance = pairs_[i].distance() - (from.path().size() - 
(contig_to_scaffold_position_[contig_from] + contig_from.contig_size()))
            - contig_to_scaffold_position_[contig_to];

        AddEdge(level, from, to, distance);
    }
}

/**
 * @brief Compact every edge list in place, dropping edges that are dead or
 * that touch a dead vertex (such edges are also flagged dead).
 */
void ScaffoldGraph::RefreshEdges()
{
    for (unsigned i = 0; i < vertices_.size(); ++i)
    {
        for (int strand = 0; strand < 2; ++strand)
        {
            ScaffoldGraphVertexAdaptor current(&vertices_[i], strand);
            deque &edges = GetEdges(current);

            // index: write position for the edges that are kept.
            int index = 0;
            for (unsigned j = 0; j < edges.size(); ++j)
            {
                if (edges[j].status().IsDead() || edges[j].from().status().IsDead() || edges[j].to().status().IsDead())
                {
                    edges[j].status().SetDeadFlag();
                }
                else
                {
                    edges[index++] = edges[j];
                }
            }
            edges.resize(index);
        }
    }
}

/**
 * @brief Parse every live edge and flag it dead when it has too few
 * supporting values compared with the expected number, or when its distance
 * is outside the accepted range; then refresh the edge lists.
 *
 * @param is_uneven when true, the expected number of edges between two long
 * scaffolds is computed from the smaller of the two scaffold coverages.
 */
void ScaffoldGraph::ParseEdges(bool is_uneven)
{
    for (unsigned i = 0; i < vertices_.size(); ++i)
    {
        for (int strand = 0; strand < 2; ++strand)
        {
            ScaffoldGraphVertexAdaptor current(&vertices_[i], strand);
            deque &edges = GetEdges(current);

            for (unsigned j = 0; j < edges.size(); ++j)
            {
                if (edges[j].status().IsDead())
                    continue;

                edges[j].Parse();

                double expected = ExpectedEdges(edges[j].level(),
                        edges[j].from().path().size(), edges[j].to().path().size(), edges[j].distance()
                        );

                int level = edges[j].level();
                if (is_uneven
                        && edges[j].from().path().size() > mean_[level]*4
                        && edges[j].to().path().size() > mean_[level]*4
                   )
                {
                    double tmp = ExpectedEdges(edges[j].level(),
                            edges[j].from().path().size(), edges[j].to().path().size(), edges[j].distance(),
                            //(edges[j].from().coverage() + edges[j].to().coverage()) / 2
                            //(edges[j].from().coverage() + edges[j].to().coverage()) / 2
                            //max(edges[j].from().coverage(), edges[j].to().coverage())
                            min(edges[j].from().coverage(), edges[j].to().coverage())
                            //min(edges[j].from().path().back().coverage(), edges[j].to().path().front().coverage())
                            );

                    // NOTE(review): in the flattened source the commented-out
                    // `if` and the assignment share one line; the assignment
                    // is assumed to be live code — confirm against upstream.
                    //if (tmp > expected)
                    expected = tmp;
                }

                double rate = 0.3;
//                if (mean_[edges[j].level()] > 1000)
//                    rate = 0.1;

                if ((int)edges[j].values().size() < expected * rate
                        || edges[j].distance() > 0.75 * mean(edges[j].level())
                        || edges[j].distance() < -2*sd(edges[j].level()) 
- kmer_size()
                   )
                {
                    edges[j].status().SetDeadFlag();
                }
            }
        }
    }

    RefreshEdges();
}

/**
 * @brief Flag as dead every live edge with fewer than min_pairs values or
 * with an endpoint whose path_size() is below min_length, then refresh the
 * edge lists.
 */
void ScaffoldGraph::FilterEdges(int min_pairs, int min_length)
{
    for (unsigned i = 0; i < vertices_.size(); ++i)
    {
        for (int strand = 0; strand < 2; ++strand)
        {
            ScaffoldGraphVertexAdaptor current(&vertices_[i], strand);
            deque &edges = GetEdges(current);

            for (unsigned j = 0; j < edges.size(); ++j)
            {
                if (edges[j].status().IsDead())
                    continue;

                if ((int)edges[j].values().size() < min_pairs
                        || (int)edges[j].from().path_size() < min_length
                        || (int)edges[j].to().path_size() < min_length
                   )
                {
                    edges[j].status().SetDeadFlag();
                }
            }
        }
    }

    RefreshEdges();
}

/** @brief Clear the status of every vertex. */
void ScaffoldGraph::ClearStatus()
{
    for (unsigned i = 0; i < vertices().size(); ++i)
        vertices()[i].status().clear();
}

/**
 * @brief Breadth-first search along level edges from `from`, looking for an
 * edge that ends at `to`.
 *
 * The number of expanded vertices is bounded by kTimeLimit.  Used flags set
 * during the search are cleared again on every queued vertex before
 * returning.
 */
bool ScaffoldGraph::IsConnected(int level, ScaffoldGraphVertexAdaptor from, ScaffoldGraphVertexAdaptor to)
{
    deque qu;
    qu.push_back(from);
    from.status().SetUsedFlag();

    int time = 0;
    int index = 0;
    bool is_found = false;
    while (++time < kTimeLimit && index < (int)qu.size() && !is_found)
    {
        ScaffoldGraphVertexAdaptor current = qu[index++];

        deque edges = GetEdges(level, current);
        for (unsigned i = 0; i < edges.size(); ++i)
        {
            if (edges[i].to() == to)
            {
                is_found = true;
                break;
            }
            else if (!edges[i].to().status().IsUsed())
            {
                qu.push_back(edges[i].to());
                edges[i].to().status().SetUsedFlag();
            }
        }
    }

    // status().clear() is called on every visited vertex.
    for (unsigned i = 0; i < qu.size(); ++i)
        qu[i].status().clear();

    return is_found;
}

/**
 * @brief For every vertex orientation with at least two level edges, flag an
 * edge dead when its target is still reachable with that edge disabled.
 *
 * @return the number of edges left flagged dead.
 */
int64_t ScaffoldGraph::RemoveTransitiveConnections(int level)
{
    int removed = 0;
    for (unsigned i = 0; i < vertices().size(); ++i)
    {
        for (int strand = 0; strand < 2; ++strand)
        {
            ScaffoldGraphVertexAdaptor current(&vertices()[i], strand);
            // NOTE(review): edges is a copy of the list returned by
            // GetEdges(); presumably the adaptors share status with the
            // stored edges — TODO confirm.
            deque edges = GetEdges(level, current);

            if (edges.size() < 2U)
                continue;

            for (unsigned j = 0; j < edges.size(); ++j)
            {
                // Temporarily disable the edge, then test for another route.
                edges[j].status().SetDeadFlag();
                if (!IsConnected(level, current, edges[j].to()))
                    edges[j].status().ResetDeadFlag();
                else
                    ++removed;
            }
        }
    }

    return removed;
}

bool ScaffoldGraph::IsConsistent(int level, 
ScaffoldGraphVertexAdaptor current)
{
    // Consistent means: exactly one edge at this level.
    deque edges = GetEdges(level, current);
    return edges.size() == 1;
}

/** @brief Same test as IsConsistent(): exactly one edge at this level. */
bool ScaffoldGraph::IsConsistentMulti(int level, ScaffoldGraphVertexAdaptor current)
{
    deque edges = GetEdges(level, current);
    return edges.size() == 1;
}

/**
 * @brief Try to extend scaffold_path by one vertex.
 *
 * The extension succeeds only if the last vertex has exactly one level edge,
 * the reverse complement of its target also has exactly one, and the target
 * is neither dead nor already used.  On success the target is flagged used.
 */
bool ScaffoldGraph::ExtendPath(int level, ScaffoldGraphPath &scaffold_path)
{
    ScaffoldGraphVertexAdaptor current = scaffold_path.back();

    if (!IsConsistent(level, current))
        return false;

    deque edges = GetEdges(level, current);
    ScaffoldGraphVertexAdaptor next = edges[0].to();
    ScaffoldGraphVertexAdaptor rev_next = next;
    rev_next.ReverseComplement();

    if (!IsConsistent(level, rev_next))
        return false;

    if (next.status().IsDead())
        return false;

    if (next.status().IsUsed())
        return false;

    next.status().SetUsedFlag();
    scaffold_path.Append(next, edges[0].distance());

    return true;
}

/**
 * @brief Like ExtendPath(), but when the last vertex has several level
 * edges, edges at higher levels from vertices already on the path are used
 * to choose among them.
 */
bool ScaffoldGraph::ExtendPathMulti(int level, ScaffoldGraphPath &scaffold_path)
{
    ScaffoldGraphVertexAdaptor current = scaffold_path.back();

    deque edges = GetEdges(level, current);

    if (edges.size() == 0)
        return false;
    else
    {
        ScaffoldGraphVertexAdaptor next = edges[0].to();
        ScaffoldGraphVertexAdaptor rev_next = next;
        rev_next.ReverseComplement();

        if (IsConsistentMulti(level, current))
        {
            // Single edge: same rules as ExtendPath().
            if (!IsConsistentMulti(level, rev_next))
                return false;

            if (next.status().IsDead())
                return false;

            if (next.status().IsUsed())
                return false;

            next.status().SetUsedFlag();
            scaffold_path.Append(next, edges[0].distance());

            return true;
        }
        else
        {
            // Collect the targets of all higher-level edges leaving any
            // vertex that is already on the path.
            set table;
            for (unsigned i = 0; i < scaffold_path.num_nodes(); ++i)
            {
                ScaffoldGraphVertexAdaptor x = scaffold_path[i];
                for (int j = level+1; j < num_level(); ++j)
                {
                    deque long_edges = GetEdges(j, x);
                    for (unsigned k = 0; k < long_edges.size(); ++k)
                        table.insert(long_edges[k].to());
                }
            }

            // Keep only the edges whose target is in that set.
            int index = 0;
            for (unsigned i = 0; i < edges.size(); ++i)
            {
                if (table.find(edges[i].to()) != table.end())
                    edges[index++] = edges[i];
            }
            edges.resize(index);

            if (edges.size() == 1)
            {
                cout << "succeed" << endl;
                ScaffoldGraphVertexAdaptor next = 
edges[0].to(); if (next.status().IsDead()) return false; if (next.status().IsUsed()) return false; next.status().SetUsedFlag(); scaffold_path.Append(next, edges[0].distance()); return true; } else { sort(edges.begin(), edges.end(), Compare); ScaffoldGraphVertexAdaptor next = edges[0].to(); bool is_consistent = true; for (unsigned i = 1; i < edges.size(); ++i) { deque middle_edges = GetEdges(level, edges[i].to()); is_consistent = false; for (unsigned j = 0; j < middle_edges.size(); ++j) { if (middle_edges[j].to() == next) is_consistent = true; } if (!is_consistent) break; } if (!is_consistent) { cout << "failed" << endl; return false; } cout << "succeed2" << endl; if (next.status().IsDead()) return false; if (next.status().IsUsed()) return false; next.status().SetUsedFlag(); scaffold_path.Append(next, edges[0].distance()); return true; } return false; } } } int64_t ScaffoldGraph::Assemble(int level, deque &paths) { paths.clear(); for (unsigned i = 0; i < vertices().size(); ++i) { if (vertices()[i].status().IsDead()) continue; if (vertices()[i].status().IsUsed()) continue; ScaffoldGraphVertexAdaptor current(&vertices()[i]); ScaffoldGraphPath scaffold_path; scaffold_path.Append(current, 0); current.status().SetUsedFlag(); for (int strand = 0; strand < 2; ++strand) { while (ExtendPath(level, scaffold_path)) ; scaffold_path.ReverseComplement(); } ContigGraphPath path; scaffold_path.Assemble(path); paths.push_back(path); } ClearStatus(); return paths.size(); } int64_t ScaffoldGraph::AssembleMulti(int level, deque &paths) { paths.clear(); for (unsigned i = 0; i < vertices().size(); ++i) { if (vertices()[i].status().IsDead()) continue; if (vertices()[i].status().IsUsed()) continue; ScaffoldGraphVertexAdaptor current(&vertices()[i]); ScaffoldGraphPath scaffold_path; scaffold_path.Append(current, 0); current.status().SetUsedFlag(); for (int strand = 0; strand < 2; ++strand) { while (ExtendPathMulti(level, scaffold_path)) ; scaffold_path.ReverseComplement(); } ContigGraphPath 
path; scaffold_path.Assemble(path); paths.push_back(path); } ClearStatus(); return paths.size(); } int64_t ScaffoldGraph::Assemble(int level, deque &contigs) { deque paths; Assemble(level, paths); contigs.resize(paths.size()); for (unsigned i = 0; i < contigs.size(); ++i) { ContigInfo contig_info; paths[i].Assemble(contigs[i], contig_info); } return contigs.size(); } int64_t ScaffoldGraph::AssembleMulti(int level, deque &contigs) { deque paths; AssembleMulti(level, paths); contigs.resize(paths.size()); for (unsigned i = 0; i < contigs.size(); ++i) { ContigInfo contig_info; paths[i].Assemble(contigs[i], contig_info); } return contigs.size(); } double ScaffoldGraph::ExpectedEdges(int level, int len1, int len2, int distance) { //cout << len1 << " " << len2 << " " << distance << endl; int read_length = read_length_[level]; double mean = mean_[level]; double sd = sd_[level]; double expected_coverage = expected_coverage_[level]; int from = max(0, int(len1 - mean - 2*sd)); int to = max(0, len1 - read_length); from = from - len1 - distance; to = to - len1 - distance; double sum = 0; for (int i = from; i < to; ++i) { double expected = i + mean; double left = read_length - expected; double right = min(len2, int(mean + 2*sd)) - expected; sum += NormalCDF(right/sd) - NormalCDF(left/sd); } double p = sum / (to - from); return p * (to - from) * expected_coverage/2; } double ScaffoldGraph::ExpectedEdges(int level, int len1, int len2, int distance, double expected_coverage) { //cout << len1 << " " << len2 << " " << distance << endl; int read_length = read_length_[level]; double mean = mean_[level]; double sd = sd_[level]; //expected_coverage = min(expected_coverage_[level], expected_coverage); //double expected_coverage = expected_coverage_[level]; int from = max(0, int(len1 - mean - 2*sd)); int to = max(0, len1 - read_length); from = from - len1 - distance; to = to - len1 - distance; double sum = 0; for (int i = from; i < to; ++i) { double expected = i + mean; double left = 
read_length - expected; double right = min(len2, int(mean + 2*sd)) - expected; sum += NormalCDF(right/sd) - NormalCDF(left/sd); } double p = sum / (to - from); return p * (to - from) * expected_coverage/2; } idba-1.1.3/src/graph/contig_info.cpp0000664000175000017500000000343312677406270014246 00000000000000#include "graph/contig_info.h" #include #include #include #include #include "graph/bit_edges.h" using namespace std; istream &operator >>(istream &is, ContigInfo &contig_info) { is.read((char *)&contig_info.in_edges_, sizeof(BitEdges)); is.read((char *)&contig_info.out_edges_, sizeof(BitEdges)); is.read((char *)&contig_info.kmer_size_, sizeof(uint16_t)); is.read((char *)&contig_info.kmer_count_, sizeof(uint32_t)); int size = 0; if (!is.read((char *)&size, sizeof(int))) return is; contig_info.counts_.resize(size); for (int i = 0; i < size; ++i) is.read((char *)&contig_info.counts_[i], sizeof(SequenceCountUnitType)); return is; } ostream &operator <<(ostream &os, const ContigInfo &contig_info) { os.write((char *)&contig_info.in_edges_, sizeof(BitEdges)); os.write((char *)&contig_info.out_edges_, sizeof(BitEdges)); os.write((char *)&contig_info.kmer_size_, sizeof(uint16_t)); os.write((char *)&contig_info.kmer_count_, sizeof(uint32_t)); int size = contig_info.counts_.size(); os.write((char *)&size, sizeof(int)); for (int i = 0; i < size; ++i) os.write((char *)&contig_info.counts_[i], sizeof(SequenceCountUnitType)); return os; } void ReadContigInfo(const string &filename, deque &contig_infos) { contig_infos.clear(); ifstream fin(filename.c_str(), ios_base::in | ios_base::binary); ContigInfo contig_info; while (fin >> contig_info) contig_infos.push_back(contig_info); } void WriteContigInfo(const string &filename, const deque &contig_infos) { ofstream fout(filename.c_str(), ios_base::out | ios_base::binary); for (unsigned i = 0; i < contig_infos.size(); ++i) fout << contig_infos[i]; } idba-1.1.3/src/misc/0000775000175000017500000000000012740767035011153 
500000000000000idba-1.1.3/src/misc/hash_aligner.cpp0000664000175000017500000004524712677406270014237 00000000000000/** * @file hash_aligner.cpp * @brief * @author Yu Peng (ypeng@cs.hku.hk) * @version 1.0.0 * @date 2011-08-12 */ #include "misc/hash_aligner.h" #include #include #include #include #include "basic/bit_operation.h" #include "sequence/sequence.h" using namespace std; bool Compare(const HashAlignerRecord &x, const HashAlignerRecord &y) { return x.match_length > y.match_length; } HashAligner::HashAligner(uint32_t kmer_size, uint32_t min_length, uint32_t step) { kmer_size_ = kmer_size; min_length_ = min_length; step_ = step; //if (num_threads == 0) int num_threads = omp_get_max_threads(); buffer_records.resize(num_threads); buffer_tables.resize(num_threads); for (int i = 0; i < num_threads; ++i) { buffer_records[i].reserve(10000); //buffer_tables[i].reserve(10000); } } void HashAligner::Initialize(const deque &sequences) { uint64_t sum = 0; for (unsigned i = 0; i < sequences.size(); ++i) { if (sequences[i].size() >= min_length_) sum += sequences[i].size(); } hash_map_.reserve(sum/step_); sequences_.resize(sequences.size()); reverse_sequences_.resize(sequences.size()); sequence_words.resize(sequences.size()); reverse_sequence_words.resize(sequences.size()); //#pragma omp parallel for for (int64_t i = 0; i < (int64_t)sequences.size(); ++i) { if (sequences[i].size() < min_length_) continue; InsertSequence(sequences[i], i); sequences_[i] = sequences[i]; reverse_sequences_[i] = sequences[i]; reverse_sequences_[i].ReverseComplement(); Convert(sequences_[i], sequence_words[i]); sequences_[i].ReverseComplement(); Convert(sequences_[i], reverse_sequence_words[i]); sequences_[i].ReverseComplement(); } } int HashAligner::AlignRead(const Sequence &seq, deque &records, int min_match, int max_records) { int max_mismatch = seq.size() - min_match; vector words; Convert(seq, words); vector &tmp_records = buffer_records[omp_get_thread_num()]; tmp_records.resize(0); //HashSet 
&table = buffer_tables[omp_get_thread_num()]; set &table = buffer_tables[omp_get_thread_num()]; table.clear(); records.clear(); Kmer kmer(kmer_size_); int length = 0; for (uint64_t i = 0; i < seq.size(); ++i) { kmer.ShiftAppend(seq[i]); length = (seq[i] < 4) ? length + 1 : 0; if (length < (int)kmer_size_) continue; Kmer key = kmer.unique_format(); HashMap::iterator p = hash_map_.find(key); if (p == hash_map_.end()) continue; position_list_type &position_list = p->second; for (position_iterator iter = position_list.begin(); iter != position_list.end(); ++iter) { uint64_t position = *iter; uint64_t id = position >> 33; bool is_reverse = (position >> 32) & 1; uint64_t offset = position & ((1ULL << 32) - 1); if (key != kmer) { is_reverse = !is_reverse; offset = sequences_[id].size() - (offset + kmer_size_); } int64_t from = int64_t(offset) - int64_t(i+1 - kmer_size_); int64_t to = from + seq.size(); if (from >= 0 && to <= sequences_[id].size()) { uint64_t value = (id << 33) | (uint64_t(is_reverse) << 32) | from; if (table.insert(value).second) { HashAlignerRecord record; record.query_from = 0; record.query_to = seq.size(); record.query_length = seq.size(); record.ref_id = id; record.ref_from = from; record.ref_to = to; record.ref_length = sequences_[id].size(); record.is_reverse = is_reverse; record.match_length = 0; Match(words, record, max_mismatch); if (record.match_length >= min_match) { if ((int)records.size() >= max_records) { //return records.size(); records.resize(0); return 0; } else records.push_back(record); } } } } } return records.size(); } int HashAligner::AlignReadLocal(const Sequence &seq, deque &records, int min_match, int max_mismatch, int max_records) { vector words; Convert(seq, words); vector &tmp_records = buffer_records[omp_get_thread_num()]; tmp_records.resize(0); //HashSet &table = buffer_tables[omp_get_thread_num()]; set &table = buffer_tables[omp_get_thread_num()]; table.clear(); records.clear(); Kmer kmer(kmer_size_); int length = 0; for 
(uint64_t i = 0; i < seq.size(); ++i) { kmer.ShiftAppend(seq[i]); length = (seq[i] < 4) ? length + 1 : 0; if (length < (int)kmer_size_) continue; Kmer key = kmer.unique_format(); HashMap::iterator p = hash_map_.find(key); if (p == hash_map_.end()) continue; position_list_type &position_list = p->second; for (position_iterator iter = position_list.begin(); iter != position_list.end(); ++iter) { uint64_t position = *iter; uint64_t id = position >> 33; bool is_reverse = (position >> 32) & 1; uint64_t offset = position & ((1ULL << 32) - 1); if (key != kmer) { is_reverse = !is_reverse; offset = sequences_[id].size() - (offset + kmer_size_); } int64_t from = int64_t(offset) - int64_t(i+1 - kmer_size_); int64_t to = from + seq.size(); //if (from >= 0 && to <= sequences_[id].size()) { uint64_t value = (id << 33) | (uint64_t(is_reverse) << 32) | (from + seq.size()); if (table.insert(value).second) { HashAlignerRecord record; record.query_from = (from >= 0 ? 0 : -from); record.query_to = (to <= sequences_[id].size() ? seq.size() : seq.size() - (to - sequences_[id].size())); record.query_length = seq.size(); record.ref_id = id; record.ref_from = (from >= 0 ? 
from : 0); record.ref_to = record.ref_from + (record.query_to - record.query_from); record.ref_length = sequences_[id].size(); record.is_reverse = is_reverse; record.match_length = 0; Match(words, record, max_mismatch); if (record.match_length >= min_match) { if ((int)records.size() >= max_records) { records.resize(0); return 0; } else records.push_back(record); } } } } } return records.size(); // vector words; // Convert(seq, words); // // vector &tmp_records = buffer_records[omp_get_thread_num()]; // tmp_records.resize(0); // // //HashSet &table = buffer_tables[omp_get_thread_num()]; // set &table = buffer_tables[omp_get_thread_num()]; // table.clear(); // // records.clear(); // // Kmer kmer(kmer_size_); // int length = 0; // for (uint64_t i = 0; i < seq.size(); ++i) // { // kmer.ShiftAppend(seq[i]); // length = (seq[i] < 4) ? length + 1 : 0; // // if (length < (int)kmer_size_) // continue; // // Kmer key = kmer.unique_format(); // HashMap::iterator p = hash_map_.find(key); // // if (p == hash_map_.end()) // continue; // // position_list_type &position_list = p->second; // for (position_iterator iter = position_list.begin(); iter != position_list.end(); ++iter) // { // uint64_t position = *iter; // uint64_t id = position >> 33; // bool is_reverse = (position >> 32) & 1; // uint64_t offset = position & ((1ULL << 32) - 1); // // if (key != kmer) // { // is_reverse = !is_reverse; // offset = sequences_[id].size() - (offset + kmer_size_); // } // //// int64_t from = int64_t(offset) - int64_t(i+1 - kmer_size_); //// int64_t to = from + seq.size(); // int64_t from = int64_t(offset); // int64_t to = from + kmer_size_; // // if (from >= 0 && to <= sequences_[id].size()) // { // //uint64_t value = (id << 33) | (uint64_t(is_reverse) << 32) | from; // { // HashAlignerRecord record; // record.query_from = i+1 - kmer_size_; // record.query_to = i+1; // record.query_length = seq.size(); // record.ref_id = id; // record.ref_from = from; // record.ref_to = to; // 
record.ref_length = sequences_[id].size(); // record.is_reverse = is_reverse; // record.match_length = 0; // // records.push_back(record); // } // } // } // // sort(records.begin(), records.end()); // // int index = 1; // for (unsigned i = 1; i < records.size(); ++i) // { // if (records[index-1].Merge(records[i]) == false) // records[index++] = records[i]; // } // records.resize(index); // // index = 0; // for (unsigned i = 0; i < records.size(); ++i) // { // Match(words, records[i], max_mismatch); // // HashAlignerRecord x = records[i]; // if (records[i].match_length > 0) // ExtendRecord(seq, records[i], max_mismatch); // HashAlignerRecord y = records[i]; // //// if (x.query_from != y.query_from || x.query_to != y.query_to) //// cout << x.query_from << " " << x.query_to << " " << y.query_from << " " << y.query_to << endl; // // if (records[i].match_length >= min_match // && records[i].query_to - records[i].query_from - records[i].match_length <= max_mismatch) // { // records[index++] = records[i]; // } // } // records.resize(index); // } // // return records.size(); } int HashAligner::AlignSequence(const Sequence &seq, deque &records, int min_match, double similar, int max_records) { vector words; Convert(seq, words); vector &tmp_records = buffer_records[omp_get_thread_num()]; tmp_records.resize(0); //HashSet &table = buffer_tables[omp_get_thread_num()]; set &table = buffer_tables[omp_get_thread_num()]; table.clear(); records.clear(); Kmer kmer(kmer_size_); int length = 0; for (uint64_t i = 0; i < seq.size(); ++i) { kmer.ShiftAppend(seq[i]); length = (seq[i] < 4) ? 
length + 1 : 0; if (length < (int)kmer_size_) continue; Kmer key = kmer.unique_format(); HashMap::iterator p = hash_map_.find(key); if (p == hash_map_.end()) continue; position_list_type &position_list = p->second; for (position_iterator iter = position_list.begin(); iter != position_list.end(); ++iter) { uint64_t position = *iter; uint64_t id = position >> 33; bool is_reverse = (position >> 32) & 1; uint64_t offset = position & ((1ULL << 32) - 1); if (key != kmer) { is_reverse = !is_reverse; offset = sequences_[id].size() - (offset + kmer_size_); } int64_t ref_from = (int64_t)offset - int64_t(i+1 - kmer_size_); int64_t ref_to = ref_from + (int64_t)seq.size(); int64_t query_from = 0; int64_t query_to = seq.size(); if (ref_from < 0) { query_from = -ref_from; ref_from = 0; } if (ref_to > (int64_t)sequences_[id].size()) { query_to -= ref_to - sequences_[id].size(); ref_to = sequences_[id].size(); } if (ref_to - ref_from >= min_match) { int64_t from = ref_from - query_from; if (from < 0) from = -from + (1LL << 31); uint64_t value = (id << 33) | (uint64_t(is_reverse) << 32) | from; if (table.insert(value).second) { HashAlignerRecord record; record.query_from = query_from; record.query_to = query_to; record.query_length = seq.size(); record.ref_id = id; record.ref_from = ref_from; record.ref_to = ref_to; record.ref_length = sequences_[id].size(); record.is_reverse = is_reverse; record.match_length = 0; Match(words, record, (ref_to - ref_from) * (1 - similar)); if (record.match_length >= min_match) { if ((int)records.size() >= max_records) { records.resize(0); return 0; } else records.push_back(record); } } } } } return records.size(); } void HashAligner::Match(const vector &words, HashAlignerRecord &record, int max_mismatch) { vector &ref_words = (!record.is_reverse ? 
sequence_words[record.ref_id] : reverse_sequence_words[record.ref_id]); int mismatch = 0; int length = record.query_to - record.query_from; for (int offset = 0; offset < length; offset += 32) { int len = min(32, length - offset); uint64_t x = GetWord(words, record.query_from + offset, record.query_from + offset + len); uint64_t y = GetWord(ref_words, record.ref_from + offset, record.ref_from + offset + len); mismatch += bit_operation::BaseCount(x ^ y); if (mismatch > max_mismatch) { mismatch = length; break; } } record.match_length = length - mismatch; } void HashAligner::InsertSequence(const Sequence &seq, uint64_t id) { Kmer kmer(kmer_size_); int length = 0; int last = -1; for (int64_t i = 0; i < (int64_t)seq.size(); ++i) { kmer.ShiftAppend(seq[i]); length = (seq[i] < 4) ? length + 1 : 0; if (length < (int)kmer_size_) continue; if (i - last < (int)step_) continue; last = i; Kmer key = kmer.unique_format(); uint64_t position = ((id << 1) << 32) | (i+1 - kmer_size_); if (key != kmer) position = (((id << 1) + 1) << 32) | (seq.size() - (i+1)); hash_map_[key].set_pool(pool_); hash_map_[key].push_front(position); } } void HashAligner::ExtendRecord(const Sequence &seq, HashAlignerRecord &record) { Sequence a = seq; const Sequence &b = sequences_[record.ref_id]; for (int strand = 0; strand < 2; ++strand) { int mismatch = 0; while (record.query_to < record.query_length && record.ref_to < record.ref_length) { int x = record.query_to; int y = record.ref_to; if (!record.is_reverse) { if (a[x] != b[y]) ++mismatch; else ++record.match_length; } else { if (a[x] != 3 - b[b.size() - 1 - y]) ++mismatch; else ++record.match_length; } if (mismatch > 3) break; ++record.query_to; ++record.ref_to; } a.ReverseComplement(); record.ReverseComplement(); } } void HashAligner::ExtendRecord(const Sequence &seq, HashAlignerRecord &record, int max_mismatch) { int mismatch = record.query_to - record.query_from - record.match_length; if (mismatch >= max_mismatch) return; Sequence a = seq; for 
(int strand = 0; strand < 2; ++strand) { const Sequence &b = (!record.is_reverse ? sequences_[record.ref_id] : reverse_sequences_[record.ref_id]); while (record.query_to < record.query_length && record.ref_to < record.ref_length) { int &x = record.query_to; int &y = record.ref_to; if (a[x] == b[y]) ++record.match_length; else ++mismatch; ++x; ++y; if (mismatch > max_mismatch) { while (a[x-1] != b[y-1]) { --x; --y; --mismatch; } break; } } a.ReverseComplement(); record.ReverseComplement(); } } void HashAligner::Match(const Sequence &seq, HashAlignerRecord &record) { const Sequence &a = seq; const Sequence &b = sequences_[record.ref_id]; int match = 0; for (int i = 0; record.query_from + i < record.query_to; ++i) { int x = record.query_from + i; int y = record.ref_from + i; if (!record.is_reverse) { if (a[x] == b[y]) ++match; } else { if (a[x] == 3 - b[b.size() - 1 - y]) ++match; } } record.match_length = match; } void HashAligner::Convert(const Sequence &seq, vector &words) { words.resize((seq.size() + 31) >> 5); fill(words.begin(), words.end(), 0); for (unsigned i = 0; i < seq.size(); ++i) words[i>>5] |= uint64_t(seq[i]) << ((i&31) << 1); } idba-1.1.3/src/misc/options_description.h0000664000175000017500000000405312677406270015344 00000000000000/** * @file options_description.h * @brief OptionsDescription Class. * @author Yu Peng (ypeng@cs.hku.hk) * @version 1.0.0 * @date 2011-08-07 */ #ifndef __MISC_OPTIONS_DESCRIPTION_H_ #define __MISC_OPTIONS_DESCRIPTION_H_ #include #include #include /** * @brief It is a class used to store and manipulate command line options. 
*/ class OptionsDescription { struct Option; public: enum OptionType { kOptionBool, kOptionInt, kOptionDouble, kOptionString }; friend std::ostream &operator <<(std::ostream &os, const OptionsDescription &desc); friend std::ostream &operator <<(std::ostream &os, const Option &option); void Parse(int &argc, char *argv[]); operator std::string() const; void AddOption(const std::string &long_name, const std::string &short_name, bool &bool_option, const std::string &description); void AddOption(const std::string &long_name, const std::string &short_name, int &int_option, const std::string &description); void AddOption(const std::string &long_name, const std::string &short_name, double &double_option, const std::string &description); void AddOption(const std::string &long_name, const std::string &short_name, std::string &string_option, const std::string &description); private: void AddOption(const std::string &long_name, const std::string &short_name, void *pointer, OptionType type, const std::string &description, const std::string &default_value); struct Option { Option(const std::string &long_name, const std::string &short_name, void *pointer, OptionType type, const std::string &description, const std::string &default_value); void Parse(); operator std::string() const; std::string long_name; std::string short_name; void *pointer; OptionType type; std::string description; std::string default_value; std::string value; }; std::vector