Sniffles-1.0.7/000077500000000000000000000000001320237057600132755ustar00rootroot00000000000000Sniffles-1.0.7/CMakeLists.txt000066400000000000000000000021611320237057600160350ustar00rootroot00000000000000cmake_minimum_required(VERSION 2.8) project(Sniffles) set( SNIF_VERSION_MAJOR 1 ) set( SNIF_VERSION_MINOR 0 ) IF(CMAKE_BUILD_TYPE STREQUAL "Debug") message(STATUS "Building in debug mode!") set( SNIF_VERSION_BUILD 7-debug ) else() set( SNIF_VERSION_BUILD 7 ) ENDIF() set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/bin/sniffles-core-${SNIF_VERSION_MAJOR}.${SNIF_VERSION_MINOR}.${SNIF_VERSION_BUILD}/) file(MAKE_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) # Set a default build type for single-configuration # CMake generators if no build type is set. IF(NOT CMAKE_CONFIGURATION_TYPES AND NOT CMAKE_BUILD_TYPE) message(STATUS "No build type specified. Using 'release'") SET(CMAKE_BUILD_TYPE Release) ENDIF(NOT CMAKE_CONFIGURATION_TYPES AND NOT CMAKE_BUILD_TYPE) FIND_PACKAGE(OpenMP REQUIRED) if(OPENMP_FOUND) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") endif() add_subdirectory(lib/zlib-1.2.7) add_subdirectory(lib/bamtools-2.3.0) add_subdirectory(src) Sniffles-1.0.7/LICENSE000066400000000000000000000020731320237057600143040ustar00rootroot00000000000000The MIT License (MIT) Copyright (c) 2015 Fritz Sedlazeck Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial 
portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. Sniffles-1.0.7/README.md000066400000000000000000000052731320237057600145630ustar00rootroot00000000000000# Sniffles Sniffles is a structural variation caller using third generation sequencing (PacBio or Oxford Nanopore). It detects all types of SVs (10bp+) using evidence from split-read alignments, high-mismatch regions, and coverage analysis. Please note the current version of Sniffles requires sorted output from BWA-MEM (use -M and -x parameter) or NGM-LR with the optional SAM attributes enabled! If you experience problems or have suggestions please contact: fritz.sedlazeck@gmail.com Please see our github wiki for more information (https://github.com/fritzsedlazeck/Sniffles/wiki) ************************************** ## NextGenMap-LR: (NGM-LR) Sniffles performs best with the mappings of NGM-LR our novel long read mapping method. 
Please see: https://github.com/philres/nextgenmap-lr **************************************** ## Citation: Please see and cite our preprint: http://www.biorxiv.org/content/early/2017/07/28/169557 ************************************** ## Poster & Talks: [Accurate and fast detection of complex and nested structural variations using long read technologies](http://schatzlab.cshl.edu/presentations/2016/2016.10.28.BIODATA.PacBioSV.pdf) Biological Data Science, Cold Spring Harbor Laboratory, Cold Spring Harbor, NY, 26 - 29.10.2016 [NextGenMap-LR: Highly accurate read mapping of third generation sequencing reads for improved structural variation analysis](http://www.cibiv.at/~philipp_/files/gi2016_poster_phr.pdf) Genome Informatics 2016, Wellcome Genome Campus Conference Centre, Hinxton, Cambridge, UK, 19.09.-2.09.2016 ************************************** ## Datasets used in the manuscript: We provide the NGMLR aligned reads and the Sniffles calls for the data sets used: Arabidopsis trio: + [http://labshare.cshl.edu/shares/schatzlab/www-data/fsedlaze/Sniffles/Arabidopsis_trio](http://labshare.cshl.edu/shares/schatzlab/www-data/fsedlaze/Sniffles/Arabidopsis_trio) . Genome in the Bottle trio: + Mappings: [ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/AshkenazimTrio/HG002_NA24385_son/PacBio_MtSinai_NIST/Baylor_NGMLR_bam_GRCh37/](ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/AshkenazimTrio/HG002_NA24385_son/PacBio_MtSinai_NIST/Baylor_NGMLR_bam_GRCh37/) . + SV calls: [http://labshare.cshl.edu/shares/schatzlab/www-data/fsedlaze/Sniffles/GiaB/](http://labshare.cshl.edu/shares/schatzlab/www-data/fsedlaze/Sniffles/GiaB/) NA12878: + [http://labshare.cshl.edu/shares/schatzlab/www-data/fsedlaze/Sniffles/NA12878/](http://labshare.cshl.edu/shares/schatzlab/www-data/fsedlaze/Sniffles/NA12878/) . SKBR3: + [http://labshare.cshl.edu/shares/schatzlab/www-data/fsedlaze/Sniffles/Skbr3/](http://labshare.cshl.edu/shares/schatzlab/www-data/fsedlaze/Sniffles/Skbr3/) . 
Sniffles-1.0.7/src/000077500000000000000000000000001320237057600140645ustar00rootroot00000000000000Sniffles-1.0.7/src/Alignment.cpp000066400000000000000000001034731320237057600165160ustar00rootroot00000000000000/* * Alignments.cpp * * Created on: May 25, 2012 * Author: fritz */ #include "Alignment.h" void Alignment::setRef(string sequence) { alignment.second = sequence; } void Alignment::initAlignment() { al = new BamAlignment(); } void Alignment::setAlignment(BamAlignment * align) { al = align; } void update_aln(std::string & alignment, int & i, int pos_to_modify) { int ref_pos = 0; while (i < alignment.size() && ref_pos != pos_to_modify) { if (alignment[i] != '-') { ref_pos++; } i++; } alignment[i] = 'Y'; } void add_event(int pos, list::iterator & i, list & events) { //insert sorted into vector: while (i != events.end() && pos > (*i).position) { ++i; } differences_str ev; ev.position = pos; ev.type = 0; //mismatch events.insert(i, ev); } void add_event(int pos, size_t & i, vector & events) { //insert sorted into vector: while (i < events.size() && pos > events[i].position) { i++; } differences_str ev; ev.position = pos; ev.type = 0; //mismatch ev.readposition = -1; events.insert(events.begin() + i, ev); } vector Alignment::summarizeAlignment(std::vector &dels) { // clock_t comp_aln = clock(); vector events; int pos = this->getPosition(); differences_str ev; bool flag = (strcmp(this->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0); int read_pos = 0; if (al->CigarData[0].Type == 'S') { read_pos += al->CigarData[0].Length; } for (size_t i = 0; i < al->CigarData.size(); i++) { if (al->CigarData[i].Type == 'M') { pos += al->CigarData[i].Length; read_pos += al->CigarData[i].Length; } else if (al->CigarData[i].Type == 'D') { ev.position = pos; ev.type = al->CigarData[i].Length; //deletion ev.readposition = read_pos; events.push_back(ev); pos += al->CigarData[i].Length; } else if (al->CigarData[i].Type == 'I') { ev.position = pos; ev.readposition = 
read_pos; ev.type = al->CigarData[i].Length * -1; //insertion events.push_back(ev); read_pos += al->CigarData[i].Length; } else if (al->CigarData[i].Type == 'N') { pos += al->CigarData[i].Length; read_pos += al->CigarData[i].Length; } else if (al->CigarData[i].Type == 'S' && al->CigarData[i].Length > Parameter::Instance()->huge_ins) { /// Used for reads ranging into an inser string sa; al->GetTag("SA", sa); uint32_t sv; if ((al->GetTag("SV", sv) && sa.empty()) && (!(sv & Ns_CLIPPED) && !(sv & FULLY_EXPLAINED))) { // TODO remove last ) ev.position = pos; // - Parameter::Instance()->huge_ins; if (i == 0) { ev.readposition = 0; } else { ev.readposition = read_pos; } ev.type = Parameter::Instance()->huge_ins * -1; //insertion: WE have to fix the length since we cannot estimate it!] events.push_back(ev); } } } /*if (flag) { std::cout << "FIRST:" << std::endl; for (size_t i = 0; i < events.size(); i++) { if (abs(events[i].type) > 200) { cout << events[i].position << " " << events[i].type << endl; } } cout << endl; }*/ //set ref length requ. later on: this->ref_len = pos - getPosition(); //TODO compare to get_length! //Parameter::Instance()->meassure_time(comp_aln, "\t\tCigar: "); string md = this->get_md(); pos = this->getPosition(); int corr = 0; bool match = false; bool gap = false; int ref_pos = 0; size_t pos_events = 0; int max_size = (this->getRefLength() * 0.9) + getPosition(); // comp_aln = clock(); indel_str del; del.sequence = ""; del.pos = -1; for (size_t i = 0; i < md.size() && pos < max_size; i++) { if (md[i] == '^') { gap = true; } if ((atoi(&md[i]) == 0 && md[i] != '0')) { //is not a number if (!gap) { // only mismatches are stored. We should have the rest from CIGAR //correct for shift in position with respect to the ref: while (ref_pos < events.size() && pos > events[ref_pos].position) { if (events[ref_pos].type > 0) { pos += events[ref_pos].type; } ref_pos++; } //store in sorted order: add_event(pos, pos_events, events); pos++; //just the pos on ref! 
} else if (Parameter::Instance()->print_seq) { //can only be a deletion: if (del.pos == -1) { del.pos = pos; } else { //avoid first string position; del.sequence += md[i]; } } match = false; } else if (!match) { match = true; pos += atoi(&md[i]); gap = false; if (Parameter::Instance()->print_seq && del.sequence.size() > Parameter::Instance()->min_length) { dels.push_back(del); } del.sequence = ""; del.pos = -1; } } // Parameter::Instance()->meassure_time(comp_aln, "\t\tMD string: "); return events; } void Alignment::computeAlignment() { cout << "COMP ALN!" << endl; clock_t comp_aln = clock(); int to_del = 0; int pos = 0; for (size_t i = 0; i < al->CigarData.size(); i++) { if (al->CigarData[i].Type == 'I') { to_del += al->CigarData[i].Length; alignment.second.insert(pos, al->CigarData[i].Length, '-'); pos += al->CigarData[i].Length; } else if (al->CigarData[i].Type == 'D') { alignment.first.insert(pos, al->CigarData[i].Length, '-'); alignment.second.insert(pos, al->CigarData[i].Length, 'X'); pos += al->CigarData[i].Length; /*for (uint32_t t = 0; t < al->CigarData[i].Length; t++) { alignment.first.insert(pos, "-"); alignment.second.insert(pos, "X"); pos++; }*/ } else if (al->CigarData[i].Type == 'S') { if (pos == 0) { //front side alignment.second.erase(((int) alignment.second.size()) - al->CigarData[i].Length, al->CigarData[i].Length); } else { //backside alignment.second.erase(pos, al->CigarData[i].Length); } alignment.first.erase(pos, al->CigarData[i].Length); } else if (al->CigarData[i].Type == 'M') { pos += al->CigarData[i].Length; } else if (al->CigarData[i].Type == 'H') { //nothing todo } else if (al->CigarData[i].Type == 'N') { alignment.second.erase(pos, al->CigarData[i].Length); } } if (to_del > 0) { alignment.second = alignment.second.substr(0, alignment.second.size() - to_del); //alignment.second.erase(alignment.second.size() - to_del, to_del); } Parameter::Instance()->meassure_time(comp_aln, "\t\tCIGAR opterations "); comp_aln = clock(); //Apply MD 
string: string md = this->get_md(); pos = 0; int corr = 0; bool match = false; int last_pos_string = 0; int last_pos_ref = 0; for (size_t i = 0; i < md.size(); i++) { if (atoi(&md[i]) == 0 && md[i] != '0') { //is not a number! if (md[i] != '^') { update_aln(alignment.second, last_pos_string, pos - last_pos_ref); last_pos_ref = pos; pos++; } match = false; } else if (!match) { match = true; pos += atoi(&md[i]); } } Parameter::Instance()->meassure_time(comp_aln, "\t\tMD opterations "); if (alignment.first.size() != alignment.second.size()) { // || strcmp(this->getName().c_str(),"IIIIII_10892000")==0) { //if(al->CigarData[0].Length!=100){ cout << "Error alignment has different length" << endl; cout << " ignoring alignment " << al->Name << endl; cout << al->Position << endl; cout << endl; cout << "read: " << alignment.first << endl; cout << " ref: " << alignment.second << endl; cout << endl; cout << orig_length << endl; vector cig = getCigar(); for (size_t i = 0; i < cig.size(); i++) { cout << cig[i].Length << cig[i].Type << " "; } cout << endl; cout << this->get_md() << endl; // exit(0); // return; } } int32_t Alignment::getPosition() { return al->Position; } int32_t Alignment::getRefID() { return al->RefID; } bool Alignment::getStrand() { return !al->IsReverseStrand(); } vector Alignment::getCigar() { return al->CigarData; } string Alignment::getQualitValues() { return al->Qualities; } size_t Alignment::get_length(std::vector CigarData) { size_t len = 0; //orig_length; for (size_t i = 0; i < CigarData.size(); i++) { if (CigarData[i].Type == 'D' || CigarData[i].Type == 'M' || CigarData[i].Type == 'N') { len += CigarData[i].Length; } } return len; } size_t Alignment::getRefLength() { return this->ref_len; // return get_length(this->al->CigarData); } size_t Alignment::getOrigLen() { return orig_length; } pair Alignment::getSequence() { return alignment; } BamAlignment * Alignment::getAlignment() { return al; } string Alignment::getName() { return al->Name; } uint16_t 
Alignment::getMappingQual() { return al->MapQuality; } /*float Alignment::getIdentity() { if (is_computed) { float match = 0; for (size_t i = 0; i < alignment.first.size(); i++) { if (alignment.first[i] == alignment.second[i]) { match++; } } return match / (float) alignment.first.size(); } return -1; }*/ int Alignment::getAlignmentFlag() { return al->AlignmentFlag; } string Alignment::getQueryBases() { if (al != NULL) { return al->QueryBases; } else { return ""; } } void Alignment::clear_QueryBases() { al->QueryBases.clear(); al->QueryBases = ""; } string Alignment::getQualities() { return al->Qualities; } string convertInt(int number) { stringstream ss; //create a stringstream ss << number; //add number to the stream return ss.str(); //return a string with the contents of the stream } string Alignment::getTagData() { vector tags; uint32_t i = 0; if (al->GetTag("AS", i)) { string tmp = "AS:i:"; tmp += convertInt(i); tags.push_back(tmp); } i = 0; if (al->GetTag("NM", i)) { string tmp = "NM:i:"; tmp += convertInt(i); tags.push_back(tmp); } string md; if (al->GetTag("MD", md)) { string tmp = "MD:Z:"; tmp += md; tags.push_back(tmp); } i = 0; if (al->GetTag("UQ", i)) { string tmp = "UQ:i:"; tmp += convertInt(i); tags.push_back(tmp); } string sa; if (al->GetTag("SA", sa)) { string tmp = "SA:Z:"; tmp += sa; tags.push_back(tmp); } string res; for (size_t i = 0; i < tags.size(); i++) { res += tags[i]; if (i + 1 < tags.size()) { res += '\t'; } } return res; } void Alignment::initSequence() { this->alignment.first.clear(); this->alignment.second.clear(); } int Alignment::get_id(RefVector ref, std::string chr) { for (size_t i = 0; i < ref.size(); i++) { if (strcmp(ref[i].RefName.c_str(), chr.c_str()) == 0) { return i; } } return -1; //should not happen! 
} int get_readlen(std::vector cigar) { int pos = 0; for (size_t i = 0; i < cigar.size(); i++) { if (cigar[i].Type == 'I') { pos += cigar[i].Length; } else if (cigar[i].Type == 'D') { //pos += cigar[i].Length; } else if (cigar[i].Type == 'M') { pos += cigar[i].Length; } } return pos; } void Alignment::get_coords(aln_str tmp, int & start, int &stop) { size_t index = 0; if (!tmp.strand) { index = tmp.cigar.size() - 1; } // cout<<"Cigar: "<getName()<<" "< &entries) { bool flag = (strcmp(this->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0); if (flag) { std::cout << "Nested? " << std::endl; for (size_t i = 0; i < entries.size(); i++) { std::cout << entries[i].pos << "-" << entries[i].pos + entries[i].length << "(" << entries[i].read_pos_start << "-" << entries[i].read_pos_stop << ")"; if (entries[i].strand) { std::cout << "+ "; } else { std::cout << "- "; } //sort_insert_ref(entries[i], new_entries); } std::cout << std::endl; } int chr = entries[0].RefID; bool strand = entries[0].strand; int strands = 1; int valid = 1; double read_gaps = 0; double ref_gaps = 0; int ref_size = 0; int read_size = 0; for (size_t i = 1; i < entries.size(); i++) { if (entries[i].read_pos_stop - entries[i].read_pos_start > 200) { //only consider segments > 200bp. ref_size = min((int) abs((entries[i - 1].pos + entries[i - 1].length) - entries[i].pos), (int) abs(entries[i - 1].pos - (entries[i].pos + entries[i].length))); read_size = abs(entries[i - 1].read_pos_stop - entries[i].read_pos_start); if (abs(ref_size - read_size) > Parameter::Instance()->min_length) { valid++; } if (flag) { cout << "Read: " << read_size << " Ref: " << ref_size << " " << this->getName() << std::endl; } if (chr != entries[i].RefID) { return; } if (strand != entries[i].strand) { strands++; strand = entries[i].strand; } } } if (flag) { std::cout << "summary: " << strands << " " << valid << " " << std::endl; } if (strands < 3 || valid < 2) { //check! 
if (flag) { std::cout << "Return" << std::endl; } return; } for (size_t i = 1; i < entries.size(); i++) { int ref_dist = 0; int read_dist = 0; if (entries[i - 1].strand) { ref_dist = abs((entries[i - 1].pos + entries[i - 1].length) - entries[i].pos); read_dist = abs(entries[i - 1].read_pos_stop - entries[i].read_pos_start); } else { ref_dist = abs((entries[i - 1].pos) - (entries[i].pos + entries[i].length)); read_dist = abs(entries[i - 1].read_pos_stop - entries[i].read_pos_start); } if (flag) { std::cout << "REF DIST: " << ref_dist << " READ DIST: " << read_dist << std::endl; } if (abs(entries[i - 1].pos - entries[i].pos) < 100) { //inv dup: aln_str tmp; tmp.RefID = entries[i].RefID; tmp.strand = !entries[i].strand; tmp.mq = 60; tmp.length = 1; tmp.pos = entries[i].pos + entries[i].length; tmp.read_pos_start = entries[i].read_pos_stop; //fake... if (entries[0].strand) { tmp.pos = entries[i - 1].pos + entries[i - 1].length; tmp.read_pos_start = entries[i - 1].read_pos_stop; //fake... tmp.strand = !tmp.strand; } else { tmp.pos = entries[i].pos + entries[i].length; tmp.read_pos_start = entries[i].read_pos_stop; //fake... } tmp.read_pos_stop = tmp.read_pos_start + 1; entries.insert(entries.begin() + (i), tmp); break; } if (abs(ref_dist - read_dist) > Parameter::Instance()->min_length) { //distances between the inversion and the other split reads! aln_str tmp; tmp.RefID = entries[i].RefID; tmp.strand = !entries[i].strand; tmp.length = 1; tmp.mq = 60; //before the current element: tmp.pos = entries[i].pos - 1; tmp.read_pos_start = entries[i].read_pos_start - 1; tmp.read_pos_stop = tmp.read_pos_start + 1; //sort_insert(tmp, new_entries); //read_pos_start aln_str tmp2; tmp2 = tmp; //after the current element: tmp2.pos = entries[i].pos + entries[i].length; tmp2.read_pos_start = entries[i].read_pos_stop; //fake... 
tmp2.read_pos_stop = tmp2.read_pos_start + 1; //sort_insert(tmp, new_entries); if (entries[i - 1].strand) { entries.insert(entries.begin() + (i + 1), tmp2); entries.insert(entries.begin() + (i), tmp); } else { int start = tmp.read_pos_start; tmp.read_pos_start = tmp2.read_pos_start; tmp2.read_pos_start = start; tmp2.read_pos_stop = tmp2.read_pos_start + 1; tmp.read_pos_stop = tmp.read_pos_start + 1; entries.insert(entries.begin() + (i + 1), tmp); entries.insert(entries.begin() + (i), tmp2); } break; } } if (flag) { for (size_t i = 0; i < entries.size(); i++) { std::cout << entries[i].pos << "-" << entries[i].pos + entries[i].length << "(" << entries[i].read_pos_start << "-" << entries[i].read_pos_stop << ")"; if (entries[i].strand) { std::cout << "+ "; } else { std::cout << "- "; } } std::cout << std::endl; } } void Alignment::sort_insert_ref(aln_str tmp, vector &entries) { for (vector::iterator i = entries.begin(); i != entries.end(); i++) { if ((tmp.pos < (*i).pos)) { //insert before entries.insert(i, tmp); return; } } entries.push_back(tmp); } void Alignment::sort_insert(aln_str tmp, vector &entries) { for (vector::iterator i = entries.begin(); i != entries.end(); i++) { if ((tmp.read_pos_start < (*i).read_pos_start)) { //insert before entries.insert(i, tmp); return; } } entries.push_back(tmp); } bool Alignment::overlapping_segments(vector entries) { bool flag = (strcmp(this->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0); if (flag) { std::cout << "HO: " << entries.size() << std::endl; for (size_t i = 0; i < entries.size(); i++) { std::cout << "Seg: " << i << " " << entries[i].pos << " " << entries[i].length << std::endl; } } return (entries.size() == 2 && abs(entries[0].pos - entries[1].pos) < 100); } vector Alignment::getSA(RefVector ref) { string sa; vector entries; if (al->GetTag("SA", sa) && !sa.empty()) { //store the main aln: aln_str tmp; tmp.RefID = this->getRefID(); tmp.cigar = this->getCigar(); tmp.length = (long) 
get_length(tmp.cigar); tmp.mq = this->getMappingQual(); tmp.pos = (long) this->getPosition(); //+get_ref_lengths(tmp.RefID, ref); tmp.strand = getStrand(); uint32_t sv; al->GetTag("SV", sv); tmp.cross_N = ((sv & Ns_CLIPPED)); bool flag = strcmp(getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0; get_coords(tmp, tmp.read_pos_start, tmp.read_pos_stop); if (flag) { cout << "\t read " << tmp.read_pos_start << " stop " << tmp.read_pos_stop << endl; } entries.push_back(tmp); if (flag) { std::cout << "Main Read: read start:" << tmp.read_pos_start << " REF: " << tmp.pos << " RefID: " << tmp.RefID << std::endl; } size_t i = 0; int count = 0; std::string cigar; std::string chr; bool nested = true; while (i < sa.size()) { if (count == 0 && sa[i] != ',') { chr += sa[i]; } if (count == 1 && sa[i - 1] == ',') { tmp.pos = (long) atoi(&sa[i]); } if (count == 2 && sa[i - 1] == ',') { tmp.strand = (bool) (sa[i] == '+'); } if (count == 3 && sa[i] != ',') { cigar += sa[i]; } if (count == 4 && sa[i - 1] == ',') { tmp.mq = atoi(&sa[i]); } if (count == 5 && sa[i] != ';') { tmp.nm = atoi(&sa[i]); } if (sa[i] == ',') { count++; } if (sa[i] == ';' && !cigar.empty()) { //TODO: maybe check how often this happens per read! if ((tmp.mq > Parameter::Instance()->min_mq || sv & FULLY_EXPLAINED) && entries.size() <= Parameter::Instance()->max_splits) { //TODO: check this! tmp.cigar = translate_cigar(cigar); //translates the cigar (string) to a type vector get_coords(tmp, tmp.read_pos_start, tmp.read_pos_stop); //get the coords on the read. if (flag) { cout << "\t read " << tmp.read_pos_start << " stop " << tmp.read_pos_stop << endl; } tmp.length = (long) get_length(tmp.cigar); //gives the length on the reference. tmp.RefID = get_id(ref, chr); //translates back the chr to the id of the chr; //TODO: should we do something about the MD string? 
if (flag) { std::cout << "Read: " << tmp.read_pos_start << " " << tmp.read_pos_stop << " REF: " << tmp.pos << " " << tmp.RefID; if (tmp.strand) { std::cout << "+" << std::endl; } else { std::cout << "-" << std::endl; } } //tmp.pos+=get_ref_lengths(tmp.RefID, ref); //insert sorted: includes_SV = true; sort_insert(tmp, entries); //al->GetTag("SV", sv); <-get that involved } else if (tmp.mq < Parameter::Instance()->min_mq) { nested = false; } else { //Ignore read due to too many splits entries.clear(); return entries; } chr.clear(); cigar.clear(); tmp.cigar.clear(); count = 0; tmp.mq = 0; } i++; } if (nested && (entries.size() > 2 || overlapping_segments(entries))) { check_entries(entries); } if (flag) { for (size_t i = 0; i < entries.size(); i++) { cout << "ENT: " << entries[i].pos << " " << entries[i].pos + entries[i].length << " Read: " << entries[i].read_pos_start << " " << entries[i].read_pos_stop << " "; if (entries[i].strand) { cout << "+" << endl; } else { cout << "-" << endl; } } } } return entries; } //returns -1 if flags are not set! double Alignment::get_scrore_ratio() { uint score = -1; uint subscore = -1; if (al->GetTag("AS", score)) { al->GetTag("XS", subscore); if (subscore == 0) { subscore = 1; } return (double) score / (double) subscore; } return -1; } bool Alignment::get_is_save() { string sa; double score = get_scrore_ratio(); //TODO should I use this again for bwa? 
return !((al->GetTag("XA", sa) && !sa.empty()) || (al->GetTag("XT", sa) && !sa.empty())) && (score == -1 || score > Parameter::Instance()->score_treshold); //|| //TODO: 7.5 } std::vector Alignment::translate_cigar(std::string cigar) { std::vector new_cigar; size_t i = 0; bool first = true; CigarOp tmp; tmp.Length = -1; while (i < cigar.size()) { if (tmp.Length == -1) { tmp.Length = atoi(&cigar[i]); } else if (tmp.Length != -1 && atoi(&cigar[i]) == 0 && cigar[i] != '0') { tmp.Type = cigar[i]; new_cigar.push_back(tmp); tmp.Length = -1; first = false; } i++; } return new_cigar; } double Alignment::get_avg_indel_length_Cigar() { double len = 0; double num = 0; for (size_t i = 0; i < al->CigarData.size(); i++) { if ((al->CigarData[i].Type == 'I' || al->CigarData[i].Type == 'D') && al->CigarData[i].Length > 1) { len += al->CigarData[i].Length; num++; } } return len / num; } vector Alignment::get_events_CIGAR() { size_t read_pos = 0; size_t pos = this->getPosition(); //orig_length; vector events; for (size_t i = 0; i < al->CigarData.size(); i++) { if (al->CigarData[i].Type == 'H' || (al->CigarData[i].Type == 'S' || al->CigarData[i].Type == 'M')) { read_pos += al->CigarData[i].Length; } if (al->CigarData[i].Type == 'D' && al->CigarData[i].Length > Parameter::Instance()->min_length) { str_event ev; ev.read_pos = read_pos; ev.length = al->CigarData[i].Length; //deletion ev.pos = pos; includes_SV = true; events.push_back(ev); } if (al->CigarData[i].Type == 'I' && al->CigarData[i].Length > Parameter::Instance()->min_length) { // std::cout<<"CIGAR: "<CigarData[i].Length<<" "<getName()<CigarData[i].Length * -1; //insertion; ev.pos = pos; ev.read_pos = read_pos; includes_SV = true; events.push_back(ev); read_pos += al->CigarData[i].Length; } if (al->CigarData[i].Type == 'D' || al->CigarData[i].Type == 'M' || al->CigarData[i].Type == 'N') { pos += al->CigarData[i].Length; } } return events; } double Alignment::get_num_mismatches(std::string md) { bool deletion = false; bool match 
= false; vector helper; double mis = 0; double len = 0; double maxim = 0; for (size_t i = 0; i < md.size(); i += 20) { mis = 0; len = 0; for (size_t j = 0; len < 100 && j + i < md.size(); j++) { if (match && atoi(&md[i + j]) == 0 && md[i + j] != '0') { //is not a number: if (md[i] == '^') { deletion = true; } else { len++; } if (!deletion) { //mistmatch!! mis++; match = false; } } else { len += atoi(&md[i + j]); match = true; deletion = false; } } if (strcmp(getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0) { std::cout << (mis / len) << std::endl; } if ((mis / len) > maxim) { maxim = (mis / len); } } return maxim; // 0.03); } std::string Alignment::get_md() { std::string md; if (al->GetTag("MD", md)) { return md; } return md; } vector Alignment::get_events_MD(int min_mis) { vector events; /*std::string md; if (al->GetTag("MD", md)) { //TODO: remove: bool flag = strcmp(getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0; if (flag) { std::cout << "found!" << std::endl; } //TODO think of a good threshold! if (get_num_mismatches(md) > Parameter::Instance()->min_num_mismatches) { if (flag) { std::cout << "is_strange!" << std::endl; } //generate a vector that holds the positions of the read std::vector aln; int pos = getPosition(); for (size_t i = 0; i < al->CigarData.size(); i++) { if (al->CigarData[i].Type == 'I') { //TODO check } if (al->CigarData[i].Type == 'D') { pos += al->CigarData[i].Length; } if (al->CigarData[i].Type == 'M') { for (size_t j = 0; j < al->CigarData[i].Length; j++) { aln.push_back(pos); pos++; //aln += "="; } } } //fill in the mismatches: bool deletion = false; bool match = false; double mis = 0; double len = 0; for (size_t i = 0; i < md.size(); i++) { if ((atoi(&md[i]) == 0 && md[i] != '0')) { //is not a number: if (md[i] == '^') { deletion = true; } if (!deletion) { //mistmatch!! 
mis++; aln[len] = aln[len] * -1; len++; } match = false; } else if (!match) { len += atoi(&md[i]); match = true; deletion = false; } } int runlength = 100; str_event ev; ev.pos = -1; ev.length = -1; ev.read_pos = 0; int start = 0; int last = 0; for (size_t i = 0; i < aln.size(); i += 50) { //+=runlength/2 ?? //std::cout< min_mis) { //TOOD ratio? if (ev.pos == -1) { start = i; ev.pos = first; ev.read_pos = ev.pos - getPosition(); } } else { if ((start > 20 && abs((int) (i + runlength) - (int) aln.size()) > 20) && ev.pos != -1) { if (flag) { std::cout << i << " " << (i + runlength) << " " << aln.size() << std::endl; std::cout << ev.pos << " " << last << " " << std::endl; } includes_SV = true; ev.length = last - ev.pos; if (flag) { std::cout << ev.pos << " " << ev.length << std::endl; } if (ev.length > runlength) { events.push_back(ev); } last = 0; ev.pos = -1; } else { ev.pos = -1; } } } } }*/ return events; } vector Alignment::get_avg_diff(double & dist, double & avg_del, double & avg_ins) { //computeAlignment(); //cout< mis_per_window; std::vector dels; vector event_aln = summarizeAlignment(dels); if (event_aln.empty()) { dist = 0; return mis_per_window; } PlaneSweep_slim * plane = new PlaneSweep_slim(); int min_tresh = 5; //reflects a 10% error rate. //compute the profile of differences: double del = 0; double ins = 0; double mis = 0; double length = event_aln[event_aln.size() - 1].position - event_aln[0].position; for (size_t i = 0; i < event_aln.size(); i++) { if (i != 0) { dist += event_aln[i].position - event_aln[i - 1].position; } pair_str tmp; tmp.position = -1; if (event_aln[i].type == 0) { tmp = plane->add_mut(event_aln[i].position, 1, min_tresh); } else { tmp = plane->add_mut(event_aln[i].position, abs(event_aln[i].type), min_tresh); } if (tmp.position != -1) { //check that its not the prev event! mis_per_window.push_back(tmp.coverage); //store #mismatch per window each time it exceeds. (which might be every event position!) 
} if (event_aln[i].type > 0) { avg_del += event_aln[i].type; } else if (event_aln[i].type < 0) { avg_ins += event_aln[i].type * -1; } } avg_ins = avg_ins / length; avg_del = avg_del / length; dist = dist / (double) event_aln.size(); plane->finalyze(); return mis_per_window; //total_num /num; } vector Alignment::get_events_Aln() { bool flag = (strcmp(this->getName().c_str(),Parameter::Instance()->read_name.c_str()) == 0); //clock_t comp_aln = clock(); std::vector dels; vector event_aln = summarizeAlignment(dels); //double time2 = Parameter::Instance()->meassure_time(comp_aln, "\tcompAln Events: "); vector events; PlaneSweep_slim * plane = new PlaneSweep_slim(); vector profile; // comp_aln = clock(); bool is_N_region = false; uint32_t sv; if (al->GetTag("SV", sv) && (!(sv & Ns_CLIPPED) && !(sv & FULLY_EXPLAINED))) { is_N_region = true; } int noise_events = 0; //compute the profile of differences: for (size_t i = 0; i < event_aln.size(); i++) { pair_str tmp; tmp.position = -1; if (event_aln[i].type == 0) { tmp = plane->add_mut(event_aln[i].position, 1, Parameter::Instance()->window_thresh); } else { tmp = plane->add_mut(event_aln[i].position, 1, Parameter::Instance()->window_thresh); // abs(event_aln[i].type) } if (tmp.position != -1 && (profile.empty() || (tmp.position - profile[profile.size() - 1].position) > 100)) { //for noisy events; profile.push_back(tmp); } else if (abs(event_aln[i].type) > Parameter::Instance()->min_length) { //for single events like NGM-LR would produce them. 
tmp.position = event_aln[i].position; profile.push_back(tmp); } } //comp_aln = clock(); int stop = 0; size_t start = 0; for (size_t i = 0; i < profile.size() && stop < event_aln.size(); i++) { if (profile[i].position >= event_aln[stop].position) { //find the postion: size_t pos = 0; while (pos < event_aln.size() && event_aln[pos].position != profile[i].position) { pos++; } //run back to find the start: start = pos; int prev = event_aln[pos].position; start = pos; int prev_type = 1; //todo it is actually pos + type and not *type while (start > 0 && (prev - event_aln[start].position) < (Parameter::Instance()->max_dist_alns)) { //13 //} * abs(event_aln[start].type) + 1)) { //TODO I dont like 13!?? prev = event_aln[start].position; prev_type = abs(event_aln[start].type); start--; if (prev_type == 0) { prev_type = 1; } prev += prev_type; } start++; //we are running one too far! //run forward to identify the stop: prev = event_aln[pos].position; stop = pos; prev_type = 1; while (stop < event_aln.size() && (event_aln[stop].position - prev) < (Parameter::Instance()->max_dist_alns)) { // * abs(event_aln[stop].type) + 1)) { prev = event_aln[stop].position; prev_type = abs(event_aln[stop].type); stop++; if (prev_type == 0) { prev_type = 1; } prev += prev_type; } if (stop > 0) { stop--; } int insert_max_pos = 0; int insert_max = 0; if (event_aln[start].type < 0) { insert_max_pos = event_aln[start].position; insert_max = abs(event_aln[start].type); } int del_max = 0; int del_max_pos = 0; double insert = 0; double del = 0; double mismatch = 0; for (size_t k = start; k <= stop; k++) { if (event_aln[k].type == 0) { mismatch++; } else if (event_aln[k].type > 0) { del += abs(event_aln[k].type); if (del_max < abs(event_aln[k].type)) { del_max = abs(event_aln[k].type); del_max_pos = event_aln[k].position; } } else if (event_aln[k].type < 0) { insert += abs(event_aln[k].type); if (insert_max < abs(event_aln[k].type)) { insert_max = abs(event_aln[k].type); insert_max_pos = 
event_aln[k].position; } } } str_event tmp; tmp.pos = event_aln[start].position; tmp.length = event_aln[stop].position; if (event_aln[stop].type > 1) { //because of the way we summarize mutations to one location tmp.length += event_aln[stop].type; } tmp.length = (tmp.length - event_aln[start].position); tmp.type = 0; if (insert_max > Parameter::Instance()->min_length && insert > (del + del)) { //we have an insertion! //todo check || vs. && if (is_N_region && insert_max * Parameter::Instance()->avg_ins < Parameter::Instance()->min_length) { tmp.type = 0; } else { tmp.length = insert_max; //TODO not sure! while (start < stop && event_aln[start].readposition == -1) { if (flag) { cout << event_aln[start].readposition << " " << event_aln[start].type << endl; } start++; } if (flag) { cout << event_aln[start].readposition << " " << event_aln[start].type << endl; } tmp.read_pos = event_aln[start].readposition; if (Parameter::Instance()->print_seq) { //if (tmp.read_pos + tmp.length > this->getAlignment()->QueryBases.size() || tmp.read_pos<0) { // cerr << "BUG! 
ALN event INS: " << this->getName() << " " << tmp.read_pos << " " << tmp.length << " " << this->getAlignment()->QueryBases.size() << endl; // } if(flag){ std::cout<<"Seq+:"<getAlignment()->QueryBases.substr(tmp.read_pos, tmp.length)<getAlignment()->QueryBases.substr(tmp.read_pos, tmp.length); } else { tmp.sequence = "NA"; } tmp.pos = insert_max_pos; tmp.type |= INS; tmp.is_noise = false; } } else if (del_max > Parameter::Instance()->min_length && (insert + insert) < del) { //deletion if (is_N_region && del_max * Parameter::Instance()->avg_del < Parameter::Instance()->min_length) { tmp.type = 0; } else { if (Parameter::Instance()->print_seq) { for (size_t del_pos = 0; del_pos < dels.size(); del_pos++) { if (abs(dels[del_pos].pos - tmp.pos) < 10) { tmp.sequence = dels[del_pos].sequence; } } } else { tmp.sequence = "NA"; } tmp.length = del_max; tmp.type |= DEL; tmp.is_noise = false; } } else if ((mismatch + del + insert) / 2 > Parameter::Instance()->min_length) { //TODO if (is_N_region || ((del_max > Parameter::Instance()->min_length && insert_max > Parameter::Instance()->min_length) && (del_max / insert_max) < Parameter::Instance()->min_length)) { tmp.type = 0; } else { noise_events++; tmp.type |= DEL; tmp.type |= INV; tmp.sequence = "NA"; tmp.is_noise = true; } } if (flag) { cout << "Read: " << " " << (double) this->getRefLength() << " events: " << event_aln.size() << " " << this->al->Name << std::endl; cout << "INS max " << insert_max << " del_max " << del_max << std::endl; cout << "INS:" << insert << " DEL: " << del << " MIS: " << mismatch << endl; cout << event_aln[start].position << " " << event_aln[stop].position << endl; cout << "store: " << tmp.pos << " " << tmp.pos + abs(tmp.length) << " " << tmp.length << endl; cout << tmp.sequence<meassure_time(comp_aln, "\tcompPosition: "); if (noise_events > 4) { events.clear(); } return events; } Sniffles-1.0.7/src/Alignment.h000066400000000000000000000065751320237057600161700ustar00rootroot00000000000000/* * 
Alignments.h * * Created on: May 25, 2012 * Author: fritz */ #ifndef ALIGNMENTS_H_ #define ALIGNMENTS_H_ #include #include #include #include #include "api/BamAux.h" #include "api/BamMultiReader.h" #include "api/BamWriter.h" #include "Paramer.h" #include "plane-sweep/PlaneSweep_slim.h" const unsigned char DEL = 0x01; // hex for 0000 0001 const unsigned char DUP = 0x02; // hex for 0000 0010 const unsigned char INS = 0x04; // hex for 0000 0100 const unsigned char INV = 0x08; // hex for 0000 1000 const unsigned char TRA = 0x10; // hex for 0001 0000 const unsigned char NEST =0x20; // hex for 0010 0000 const unsigned char NA = 0x80; // hex for 1000 0000 //NGM: choped alns: const unsigned int NOT_CLIPPED = 0x0; const unsigned int Ns_CLIPPED = 0x1; const unsigned int FULLY_EXPLAINED = 0x2; using namespace BamTools; using namespace std; typedef unsigned short ushort; typedef unsigned int uint; struct differences_str{ int position; int readposition; short type; }; struct indel_str{ int pos; std::string sequence; }; struct str_event{ short length; int pos; int read_pos; char type; bool is_noise; std::string sequence; //just for indels; }; struct aln_str{ int RefID; long pos; bool strand; std::vector cigar; ushort mq; ushort nm; long length; int read_pos_start; int read_pos_stop; bool cross_N; }; class Alignment { private: int ref_len; BamAlignment * al; bool includes_SV; pair alignment; bool is_computed; int32_t orig_length; int stop; std::vector translate_cigar(std::string cigar); size_t get_length(std::vector CigarData); int get_id(RefVector ref, std::string chr); vector summarizeAlignment(std::vector &dels); void sort_insert(aln_str tmp, vector &entries); void sort_insert_ref(aln_str tmp, vector &entries); void check_entries(vector &entries); bool overlapping_segments(vector entries); public: Alignment(){ al=NULL; ref_len=0; stop=0; orig_length=0; al=NULL; is_computed=false; includes_SV=false; } ~Alignment(){ alignment.first.clear(); alignment.second.clear(); delete al; } 
void setAlignment(BamAlignment * al); void setRef(string sequence); void computeAlignment(); void clear_QueryBases(); pair getSequence(); int32_t getPosition(); int32_t getRefID(); bool getStrand(); uint16_t getMappingQual(); string getName(); vector getCigar(); string getQualitValues(); size_t getRefLength(); size_t getOrigLen(); BamAlignment * getAlignment(); //float getIdentity(); void initAlignment(); int getAlignmentFlag(); string getQueryBases(); string getQualities(); string getTagData(); vector getSA(RefVector ref); void initSequence(); vector get_events_Aln(); int get_stop(){ return stop; } vector get_events_CIGAR(); vector get_events_MD(int min_length); void get_coords(aln_str tmp,int & start,int &stop); bool supports_SV(){ return this->includes_SV; } void set_supports_SV(bool flag){ this->includes_SV=flag; } bool get_is_save(); double get_num_mismatches(std::string md); double get_scrore_ratio(); std::string get_md(); double get_avg_indel_length_Cigar(); vector get_avg_diff(double & dist,double & avg_del, double & avg_len); }; #endif /* ALIGNMENTS_H_ */ Sniffles-1.0.7/src/BamParser.cpp000066400000000000000000000022301320237057600164410ustar00rootroot00000000000000/* * parser.cpp * * Created on: Apr 17, 2012 * Author: fritz */ #include "BamParser.h" BamParser::BamParser(string file){ vector tmps; tmps.push_back(file); if(!reader.Open(tmps)){ cerr<<"BAM Parser: could not open file: "<IsMapped() && al->MapQuality > mappingQv){ al->BuildCharData(); align->setAlignment(al); return align; } } return align; } void BamParser::parseReadFast(uint16_t mappingQv,Alignment*& align){ // Alignment *align = new Alignment(); BamAlignment* al = align->getAlignment(); // getSequence().first // align->initSequence(); align->getQueryBases().clear(); align->clear_QueryBases(); while(reader.GetNextAlignmentCore(al[0])){ if( al->IsMapped() && al->MapQuality > mappingQv){ al->BuildCharData(); align->setAlignment(al); return; } } } RefVector BamParser::get_refInfo(){ return 
reader.GetReferenceData(); } string BamParser::get_header(){ return reader.GetHeaderText(); } Sniffles-1.0.7/src/BamParser.h000066400000000000000000000012721320237057600161130ustar00rootroot00000000000000/* * parser.h * * Created on: Apr 17, 2012 * Author: fritz */ #ifndef BAMPARSER_H_ #define BAMPARSER_H_ #include "api/BamMultiReader.h" #include "api/BamWriter.h" #include "Alignment.h" #include "Parser.h" #include #include #include #include #include #include using namespace BamTools; using namespace std; class BamParser: public Parser { private: BamMultiReader reader; public: BamParser(string file); ~BamParser(){ reader.Close(); } Alignment * parseRead(uint16_t mappingQv); void parseReadFast(uint16_t mappingQv,Alignment*& aln); string get_header(); RefVector get_refInfo(); }; #endif /* PARSER_H_ */ Sniffles-1.0.7/src/CMakeLists.txt000066400000000000000000000035641320237057600166340ustar00rootroot00000000000000cmake_minimum_required(VERSION 2.8) project(Sniffles) include_directories (../lib/bamtools-2.3.0/src) include_directories(../lib/tclap-1.2.1/include) configure_file( Version.h.in ${CMAKE_SOURCE_DIR}/src/Version.h ) add_executable(sniffles tree/Breakpoint_Tree.cpp Genotyper/Genotyper.cpp Alignment.cpp BamParser.cpp Sniffles.cpp Ignore_Regions.cpp tree/Intervall_bed.cpp sub/Detect_Breakpoints.cpp sub/Breakpoint.cpp tree/IntervallTree.cpp tree/IntervallList.cpp realign/SWCPU.cpp realign/Realign.cpp print/VCFPrinter.cpp print/BedpePrinter.cpp print/IPrinter.cpp tree/BinTree.cpp print/NGMPrinter.cpp plane-sweep/PlaneSweep_slim.cpp cluster/Cluster_SVs.cpp force_calling/Force_calling.cpp force_calling/VCF_parser.cpp ) #target_link_libraries(ngm-core pthread) TARGET_LINK_LIBRARIES(sniffles BamTools-static) TARGET_LINK_LIBRARIES(sniffles zlibstatic) add_executable(sniffles-debug tree/Breakpoint_Tree.cpp Genotyper/Genotyper.cpp Alignment.cpp BamParser.cpp Sniffles.cpp Ignore_Regions.cpp tree/Intervall_bed.cpp tree/IntervallList.cpp sub/Detect_Breakpoints.cpp 
sub/Breakpoint.cpp tree/IntervallTree.cpp realign/SWCPU.cpp realign/Realign.cpp print/VCFPrinter.cpp print/BedpePrinter.cpp print/IPrinter.cpp tree/BinTree.cpp print/NGMPrinter.cpp plane-sweep/PlaneSweep_slim.cpp cluster/Cluster_SVs.cpp force_calling/Force_calling.cpp force_calling/VCF_parser.cpp ) SET_TARGET_PROPERTIES(sniffles-debug PROPERTIES COMPILE_FLAGS "-g3 -O0") #target_link_libraries(sniffles-debug pthread) TARGET_LINK_LIBRARIES(sniffles-debug BamTools-static) TARGET_LINK_LIBRARIES(sniffles-debug zlibstatic) Sniffles-1.0.7/src/Genotyper/000077500000000000000000000000001320237057600160405ustar00rootroot00000000000000Sniffles-1.0.7/src/Genotyper/Genotyper.cpp000066400000000000000000000231251320237057600205230ustar00rootroot00000000000000/* / * Genotyper.cpp * * Created on: Mar 28, 2016 * Author: fsedlaze */ #include "Genotyper.h" std::string Genotyper::assess_genotype(int ref, int support) { double allele = (double) support / (double) (support + ref); if (allele < Parameter::Instance()->min_allelel_frequency) { return ""; } std::stringstream ss; ss << ";AF="; ss << allele; ss << "\tGT:DR:DV\t"; if (allele > Parameter::Instance()->homfreq) { ss << "1/1:"; } else if (allele > Parameter::Instance()->hetfreq) { ss << "0/1:"; } else { ss << "0/0:"; } ss << ref; ss << ":"; ss << support; return ss.str(); } std::string Genotyper::mod_breakpoint_vcf(string buffer, int ref) { //find last of\t //parse #reads supporting //print #ref string entry; int pos = 0; pos = buffer.find_last_of("GT"); //tab entry = buffer.substr(0, pos - 2); buffer = buffer.substr(pos + 1); // the right part is only needed: pos = buffer.find_last_of(':'); int support = atoi(buffer.substr(pos + 1).c_str()); entry += assess_genotype(ref, support); return entry; } std::string Genotyper::mod_breakpoint_bedpe( string buffer, int ref) { std::string tmp = buffer; std::string entry = tmp; entry += '\t'; //int ref = max(tree.get_ref(node,var.chr,var.pos),tree.get_ref(node,var.chr2,var.pos2)); int pos = 
tmp.find_last_of('\t'); //TODO!! int support = atoi(tmp.substr(pos + 1).c_str()); double allele = (double) support / (double) (support + ref); if (allele < Parameter::Instance()->min_allelel_frequency) { return ""; } std::stringstream ss; ss << ref; ss << "\t"; ss << support; entry += ss.str(); return entry; } void Genotyper::parse_pos(char * buffer, int & pos, std::string & chr) { chr = ""; pos = -1; size_t i = 0; int count = 0; while (buffer[i] != '\t') { if (count == 1 && ((buffer[i] != '[' || buffer[i] != ']') && buffer[i] != ':')) { chr += buffer[i]; } if (count == 2 && buffer[i - 1] == ':') { pos = atoi(&buffer[i]); } if ((buffer[i] == ']' || buffer[i] == '[') || buffer[i] == ':') { count++; } i++; } } variant_str Genotyper::get_breakpoint_vcf(string buffer) { //TODO extend for TRA! size_t i = 0; int count = 0; variant_str tmp; while (buffer[i] != '\0' && buffer[i] != '\n') { if (count == 0 && buffer[i] != '\t') { tmp.chr += buffer[i]; } if (count == 1 && buffer[i - 1] == '\t') { tmp.pos = atoi(&buffer[i]); } if (tmp.pos2 == -1 && (count == 4 && (buffer[i - 1] == '[' || buffer[i - 1] == ']'))) { parse_pos(&buffer[i - 1], tmp.pos2, tmp.chr2); } if (count > 6 && strncmp(";CHR2=", &buffer[i], 6) == 0) { i += 6; while (buffer[i] != ';') { tmp.chr2 += buffer[i]; i++; } } if (count > 6 && strncmp(";END=", &buffer[i], 5) == 0) { tmp.pos2 = atoi(&buffer[i + 5]); //stores right most breakpoint break; } if (buffer[i] == '\t') { count++; } i++; } return tmp; } variant_str Genotyper::get_breakpoint_bedpe(string buffer) { size_t i = 0; int count = 0; std::string chr; variant_str tmp; while (buffer[i] != '\0' && buffer[i] != '\n') { if (count == 12 && buffer[i] != '\t') { tmp.chr += buffer[i]; } if (count == 13 && buffer[i - 1] == '\t') { tmp.pos = atoi(&buffer[i]); } if (count == 14 && buffer[i] != '\t') { tmp.chr2 += buffer[i]; } if (count == 15 && buffer[i - 1] == '\t') { tmp.pos2 = atoi(&buffer[i]); break; } if (buffer[i] == '\t') { count++; } i++; } return tmp; } void 
Genotyper::update_file(Breakpoint_Tree & tree, breakpoint_node *& node) { std::ifstream myfile; bool is_vcf = !Parameter::Instance()->output_vcf.empty(); string file_name; if (!Parameter::Instance()->output_vcf.empty()) { file_name = Parameter::Instance()->output_vcf; myfile.open(Parameter::Instance()->output_vcf.c_str(), std::ifstream::in); } else if (!Parameter::Instance()->output_bedpe.empty()) { file_name = Parameter::Instance()->output_bedpe; myfile.open(Parameter::Instance()->output_bedpe.c_str(), std::ifstream::in); } FILE*file = fopen(Parameter::Instance()->tmp_file.c_str(), "w"); // if (!myfile.good()) { std::cout << "SVParse: could not open file: " << std::endl; exit(0); } string buffer; getline(myfile,buffer); //parse SVs breakpoints in file while (!myfile.eof()) { // TODO:if first -> we need to define AF! if (buffer[0] != '#') { std::string to_print; // create binary tree to hold breakpoints! variant_str tmp; if (is_vcf) { tmp = get_breakpoint_vcf(buffer); } else { tmp = get_breakpoint_bedpe(buffer); } int ref = max(tree.get_ref(node, tmp.chr, tmp.pos), tree.get_ref(node, tmp.chr2, tmp.pos2)); if (is_vcf) { to_print = mod_breakpoint_vcf(buffer, ref); } else { to_print = mod_breakpoint_bedpe(buffer, ref); } if (!to_print.empty()) { fprintf(file, "%s", to_print.c_str()); fprintf(file, "%c", '\n'); } } else { fprintf(file, "%s", buffer.c_str()); fprintf(file, "%c", '\n'); } getline(myfile,buffer); } myfile.close(); fclose(file); string move = "mv "; move += Parameter::Instance()->tmp_file; move += " "; move += file_name; system(move.c_str()); } std::vector Genotyper::read_SVs(Breakpoint_Tree & tree, breakpoint_node * &node) { std::vector ref_dict; std::ifstream myfile; bool is_vcf = !Parameter::Instance()->output_vcf.empty(); if (!Parameter::Instance()->output_vcf.empty()) { myfile.open(Parameter::Instance()->output_vcf.c_str(), std::ifstream::in); } else if (!Parameter::Instance()->output_bedpe.empty()) { 
myfile.open(Parameter::Instance()->output_bedpe.c_str(), std::ifstream::in); } if (!myfile.good()) { std::cout << "SVParse: could not open file: " << std::endl; exit(0); } //size_t buffer_size = 250000000; string buffer; getline(myfile,buffer); //char* buffer = new char[buffer_size]; //myfile.getline(buffer, buffer_size); //parse SVs breakpoints in file int num_sv=0; int prev_pos1 = 0; int prev_pos2 = 0; while (!myfile.eof()) { //cout << buffer << endl; if (buffer[0] != '#') { // create binary tree to hold breakpoints! variant_str tmp; if (is_vcf) { tmp = get_breakpoint_vcf(buffer); } else { tmp = get_breakpoint_bedpe(buffer); } // std::cout << "SV: " << tmp.pos << " " << tmp.pos2 << std::endl; tree.insert(node, tmp.chr, tmp.pos, true); //true: start; tree.insert(node, tmp.chr2, tmp.pos2, false); //false: stop;// num_sv++; if(num_sv%1000==0){ cout<<"\t\tRead in SV: "< //fill the refdict. std::string id = ""; for (size_t i = 13; i < buffer.size() && buffer[i] != ','; i++) { id += buffer[i]; } ref_dict.push_back(id); } getline(myfile,buffer); //myfile.getline(buffer, buffer_size); } myfile.close(); return ref_dict; //tree.inorder(node); } void Genotyper::compute_cov(Breakpoint_Tree & tree, breakpoint_node *& node, std::vector ref_dict) { FILE * ref_allel_reads = fopen(Parameter::Instance()->tmp_genotyp.c_str(), "r"); if (ref_allel_reads == NULL) { std::cerr << "CovParse: could not open file: " << Parameter::Instance()->tmp_genotyp << std::endl; } //check if we want to compute the full coverage! 
str_read tmp; size_t nbytes = fread(&tmp, sizeof(struct str_read), 1, ref_allel_reads); int prev_id = -1; while (nbytes != 0) { // std::cout<<"Read: "<<" " < ref_dict = read_SVs(this->tree, this->node); cout << "\tUpdate reference alleles" << endl; compute_cov(this->tree, this->node, ref_dict); cout << "\tWriting SV calls" << endl; update_file(this->tree, this->node); cout << "\tCleaning tmp files" << endl; string del = "rm "; del += Parameter::Instance()->tmp_genotyp; //system(del.c_str()); } void Genotyper::update_SVs(std::vector & svs, long ref_space) { //refspace for the ref reads!! FILE * ref_allel_reads = fopen(Parameter::Instance()->tmp_genotyp.c_str(), "r"); if (ref_allel_reads == NULL) { std::cerr << "Genotype Parser: could not open file: " << Parameter::Instance()->tmp_genotyp << std::endl; } str_read tmp; size_t nbytes = fread(&tmp, sizeof(struct str_read), 1, ref_allel_reads); int num_reads = 0; while (nbytes != 0) { for (size_t i = 0; i < svs.size(); i++) { if (svs[i]->get_valid()) { long start = tmp.start + ref_space; long stop = start + (long) tmp.length; //start - 100 orig! if ((svs[i]->get_coordinates().start.min_pos - 100 > start && svs[i]->get_coordinates().start.min_pos + 100 < stop)) { //found svs[i]->set_refcount(svs[i]->get_refcount() + 1); } //stop coordinate if ((svs[i]->get_coordinates().stop.max_pos - 100 > start + 100 && svs[i]->get_coordinates().stop.max_pos + 100 < stop - 100)) { //found svs[i]->set_refcount(svs[i]->get_refcount() + 1); } } } //if reads should be included-> Planesweep for +- breakpoint (Maybe hit -> extra function for that region around the breakpoint! 
num_reads++; if (num_reads % 1000 == 0) { cout << "\tProcessed " << num_reads << endl; } nbytes = fread(&tmp, sizeof(struct str_read), 1, ref_allel_reads); } fclose(ref_allel_reads); } Sniffles-1.0.7/src/Genotyper/Genotyper.h000066400000000000000000000021511320237057600201640ustar00rootroot00000000000000/* * Genotyper.h * * Created on: Mar 28, 2016 * Author: fsedlaze */ #ifndef GENOTYPER_H_ #define GENOTYPER_H_ #include "../Paramer.h" #include "../print/IPrinter.h" #include "../tree/Breakpoint_Tree.h" struct variant_str{ std::string chr; std::string chr2; int pos; int pos2; }; class Genotyper{ private: Breakpoint_Tree tree; breakpoint_node * node; std::vector read_SVs(Breakpoint_Tree & tree,breakpoint_node *& node ); void compute_cov(Breakpoint_Tree & tree,breakpoint_node *& node,std::vector ref_dict); void update_file(Breakpoint_Tree & tree,breakpoint_node *& node); variant_str get_breakpoint_vcf(string buffer); variant_str get_breakpoint_bedpe(string buffer); std::string mod_breakpoint_vcf(string buffer, int ref); std::string mod_breakpoint_bedpe(string buffer, int ref); void parse_pos(char * buffer, int & pos, std::string & chr); public: Genotyper(){ node=NULL; } ~Genotyper(){ } void update_SVs(); void update_SVs(std::vector & points,long ref_space); std::string assess_genotype(int ref, int support); }; #endif /* GENOTYPER_H_ */ Sniffles-1.0.7/src/Ignore_Regions.cpp000066400000000000000000000053621320237057600175070ustar00rootroot00000000000000/* * Ignore_Regions.cpp * * Created on: Feb 4, 2016 * Author: fsedlaze */ #include "Ignore_Regions.h" #include "sub/Detect_Breakpoints.h" long get_ref_coords(std::string chr, RefVector ref) { long length = 0; for (size_t i = 0; i < ref.size(); i++) { if (strcmp(ref[i].RefName.c_str(), chr.c_str()) == 0) { return length; } length += ref[i].RefLength + Parameter::Instance()->max_dist; } return -1; //should not happen } long get_ref_lengths2(int id, RefVector ref) { long length = 0; for (size_t i = 0; i < (size_t) id && i < 
ref.size(); i++) { length += ref[i].RefLength + Parameter::Instance()->max_dist; } return length; } int get_id2(RefVector ref, std::string chr) { for (size_t i = 0; i < ref.size(); i++) { if (strcmp(ref[i].RefName.c_str(), chr.c_str()) == 0) { return i; } } return -1; //should not happen! } void initialize_bed(IntervallTree_bed &bed_tree, Leaf *&root,RefVector ref) { //bst.insert(point, root); size_t buffer_size = 2000000; char*buffer = new char[buffer_size]; std::ifstream myfile; myfile.open(Parameter::Instance()->ignore_regions_bed.c_str(), std::ifstream::in); if (!myfile.good()) { std::cout << "SAM Parser: could not open file: " << Parameter::Instance()->ignore_regions_bed.c_str() << std::endl; exit(0); } myfile.getline(buffer, buffer_size); while (!myfile.eof()) { int count = 0; string chr; int p1; int p2; for (size_t i = 0; i < buffer_size && buffer[i] != '\0' && buffer[i] != '\n'; i++) { if (count == 0 && buffer[i] != '\t') { chr += buffer[i]; } if (count == 1 && buffer[i - 1] == '\t') { p1 = atoi(&buffer[i]); } if (count == 2 && buffer[i - 1] == '\t') { p2 = atoi(&buffer[i]); break; } if (buffer[i] == '\t') { count++; } } //transfer coordinates: long ref_dist = get_ref_coords(chr, ref); std::cout << (long) p1 + ref_dist << " " << (long) p2 + ref_dist << std::endl; bed_tree.insert((long) p1 + ref_dist, (long) p2 + ref_dist, root); myfile.getline(buffer, buffer_size); } } void ignore_regions(std::vector & final_SV,RefVector ref) { IntervallTree_bed bed_tree; Leaf *root = NULL; initialize_bed(bed_tree, root,ref); bed_tree.postorder(root); size_t i = 0; while (i < final_SV.size()) { if (final_SV[i]->get_SVtype() & DUP) { std::cout << final_SV[i]->get_coordinates().start.most_support << " "; } if (bed_tree.is_in(final_SV[i]->get_coordinates().start.most_support, root) || bed_tree.is_in(final_SV[i]->get_coordinates().stop.most_support, root)) { if (final_SV[i]->get_SVtype() & DUP) { std::cout << "erase" << endl; } final_SV.erase(final_SV.begin() + i); } else { if 
(final_SV[i]->get_SVtype() & DUP) { std::cout << "keep" << endl; } i++; } } } Sniffles-1.0.7/src/Ignore_Regions.h000066400000000000000000000005541320237057600171520ustar00rootroot00000000000000/* * Ignore_Regions.h * * Created on: Feb 4, 2016 * Author: fsedlaze */ #ifndef IGNORE_REGIONS_H_ #define IGNORE_REGIONS_H_ #include "sub/Breakpoint.h" #include "tree/Intervall_bed.h" void ignore_regions(std::vector & final_SV); void initialize_bed(IntervallTree_bed &bed_tree, Leaf *&root,RefVector ref); #endif /* IGNORE_REGIONS_H_ */ Sniffles-1.0.7/src/Paramer.h000066400000000000000000000050471320237057600156320ustar00rootroot00000000000000/* * Paramer.h * * Created on: Aug 20, 2015 * Author: fsedlaze */ #ifndef PARAMER_H_ #define PARAMER_H_ #include #include #include #include #include #include #include struct region_str { std::string chr; int start; int stop; }; class Parameter { private: Parameter() { window_thresh=10;//TODO check! version="1.0.7"; huge_ins = 2000;//TODO check?? } ~Parameter() { } static Parameter* m_pInstance; std::vector regions; public: std::string output_vcf; std::string output_bedpe; std::string ref_seq; std::string read_name; std::string ignore_regions_bed; std::string tmp_file; std::string tmp_genotyp; std::string tmp_phasing; std::string version; std::string input_vcf; std::vector bam_files; std::map chr_names; short min_mq; short report_n_reads; short corridor; double error_rate; double score_treshold; double min_allelel_frequency; double avg_del; double avg_ins; double homfreq; double hetfreq; //double min_num_mismatches; int window_thresh; int min_support; int max_splits; int max_dist; int min_length; int min_reads_phase; int num_threads; int max_readlength; int min_grouping_support; //min num reads supporting the overlap of two SVs int huge_ins; int max_dist_alns; int min_segment_size; int min_zmw; // bool splitthreader_output; bool debug; bool genotype; bool phase; bool ignore_std; bool reportBND; bool print_seq; void set_regions(std::string 
reg) { size_t i = 0; region_str tmp; short sep; while (i < reg.size()) { tmp.chr = reg.substr(i, reg.find_first_of(':')); i += tmp.chr.size() + 1; tmp.start = atoi(®[i]); i += reg.find_first_of('-') + 1; tmp.stop = atoi(®[i]); i += reg.find_first_of(';') + 1; regions.push_back(tmp); } std::cout << "found regions: " << regions.size() << std::endl; } bool overlaps(std::string chr, int start, int stop) { for (size_t i = 0; i < regions.size(); i++) { if (strcmp(chr.c_str(), regions[i].chr.c_str()) == 0 && (abs(start - regions[i].start) < max_dist && abs(stop - regions[i].stop) < max_dist)) { return true; } } return false; } static Parameter* Instance() { if (!m_pInstance) { m_pInstance = new Parameter; } return m_pInstance; } double meassure_time(clock_t begin ,std::string msg){ clock_t end = clock(); double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC; std::cout << msg<<" " << elapsed_secs< #include "Paramer.h" #include #include #include #include "Genotyper/Genotyper.h" #include "realign/Realign.h" #include "sub/Detect_Breakpoints.h" #include "print/IPrinter.h" #include "print/VCFPrinter.h" #include "print/BedpePrinter.h" #include "print/NGMPrinter.h" #include "Ignore_Regions.h" #include "plane-sweep/PlaneSweep_slim.h" #include "print/BedpePrinter.h" #include "force_calling/Force_calling.h" //cmake -D CMAKE_C_COMPILER=/opt/local/bin/gcc-mp-4.7 -D CMAKE_CXX_COMPILER=/opt/local/bin/g++-mp-4.7 .. //TODO: // strand bias?? // I think you could make your performance on PacBio reads even better with a few modifications: //b. In pbsv, I use a simply mononucleotide consistency check to determine whether to cluster insertions from different reads as supporting the "same" events. In addition to looking at the similarity of length and breakpoints, //you could measure [min(Act)+min(Cct)+min(Gct)+min(Tct) / max(Act)+max(Cct)+max(Gct)+max(Tct)] Even a lax criterion (>0.25) //can avoid clustering phantom insertions (where one is say all A and the another is G+T). 
//[min(A1,A2)+min(C1,C2)+min(G1,G2)+min(T1,T2)[/[max...]/ Parameter* Parameter::m_pInstance = NULL; void read_parameters(int argc, char *argv[]) { TCLAP::CmdLine cmd("Sniffles version ", ' ', Parameter::Instance()->version); TCLAP::ValueArg arg_bamfile("m", "mapped_reads", "Sorted bam File", true, "", "string"); TCLAP::ValueArg arg_vcf("v", "vcf", "VCF output file name", false, "", "string"); TCLAP::ValueArg arg_input_vcf("", "Ivcf", "Input VCF file name. Enable force calling", false, "", "string"); TCLAP::ValueArg arg_bedpe("b", "bedpe", " bedpe output file name", false, "", "string"); //TCLAP::ValueArg arg_chrs("c", "chrs", " comma seperated list of chrs to scan", false, "", "string"); TCLAP::ValueArg arg_support("s", "min_support", "Minimum number of reads that support a SV. Default: 10", false, 10, "int"); TCLAP::ValueArg arg_splits("", "max_num_splits", "Maximum number of splits per read to be still taken into account. Default: 7", false, 7, "int"); TCLAP::ValueArg arg_dist("d", "max_distance", "Maximum distance to group SV together. Default: 1kb", false, 1000, "int"); TCLAP::ValueArg arg_threads("t", "threads", "Number of threads to use. Default: 3", false, 3, "int"); TCLAP::ValueArg arg_minlength("l", "min_length", "Minimum length of SV to be reported. Default: 30", false, 30, "int"); TCLAP::ValueArg arg_mq("q", "minmapping_qual", "Minimum Mapping Quality. Default: 20", false, 20, "int"); TCLAP::ValueArg arg_numreads("n", "num_reads_report", "Report up to N reads that support the SV in the vcf file. -1: report all. Default: 0", false, 0, "int"); TCLAP::ValueArg arg_segsize("r", "min_seq_size", "Discard read if non of its segment is larger then this. Default: 2kb", false, 2000, "int"); TCLAP::ValueArg arg_zmw("z", "min_zmw", "Discard SV that are not supported by at least x zmws. This applies only for PacBio recognizable reads. 
Default: 0", false, 0, "int"); TCLAP::ValueArg arg_tmp_file("", "tmp_file", "path to temporary file otherwise Sniffles will use the current directory.", false, "", "string"); TCLAP::SwitchArg arg_genotype("", "genotype", "Enables Sniffles to compute the genotypes.", cmd, false); TCLAP::SwitchArg arg_cluster("", "cluster", "Enables Sniffles to phase SVs that occur on the same reads", cmd, false); TCLAP::SwitchArg arg_std("", "ignore_sd", "Ignores the sd based filtering. Default: false", cmd, false); TCLAP::SwitchArg arg_bnd("", "report_BND", "Report BND instead of Tra in vcf output. Default: false", cmd, false); TCLAP::SwitchArg arg_seq("", "report_seq", "Report sequences for indels in vcf output. (Beta version!) Default: false", cmd, false); TCLAP::ValueArg arg_cluster_supp("", "cluster_support", "Minimum number of reads supporting clustering of SV. Default: 1", false, 1, "int"); TCLAP::ValueArg arg_allelefreq("f", "allelefreq", "Threshold on allele frequency (0-1). Default=0.0", false, 0.0, "float"); TCLAP::ValueArg arg_hetfreq("", "min_het_af", "Threshold on allele frequency (0-1). Default=0.0", false, 0.3, "float"); TCLAP::ValueArg arg_homofreq("", "min_homo_af", "Threshold on allele frequency (0-1). Default=0.0", false, 0.8, "float"); cmd.add(arg_homofreq); cmd.add(arg_hetfreq); cmd.add(arg_input_vcf); cmd.add(arg_cluster_supp); cmd.add(arg_numreads); cmd.add(arg_zmw); cmd.add(arg_segsize); cmd.add(arg_tmp_file); cmd.add(arg_dist); cmd.add(arg_threads); cmd.add(arg_minlength); cmd.add(arg_mq); cmd.add(arg_splits); cmd.add(arg_bedpe); cmd.add(arg_vcf); cmd.add(arg_allelefreq); cmd.add(arg_support); cmd.add(arg_bamfile); // cmd.add(arg_chrs); //parse cmd: cmd.parse(argc, argv); Parameter::Instance()->debug = true; Parameter::Instance()->score_treshold = 10; Parameter::Instance()->read_name = " "; //21_16296949_+";//21_40181680_-";//m151102_123142_42286_c100922632550000001823194205121665_s1_p0/80643/0_20394"; //"22_36746138"; //just for debuging reasons! 
Parameter::Instance()->bam_files.push_back(arg_bamfile.getValue()); Parameter::Instance()->min_mq = arg_mq.getValue(); Parameter::Instance()->output_vcf = arg_vcf.getValue(); Parameter::Instance()->report_n_reads = arg_numreads.getValue(); Parameter::Instance()->min_support = arg_support.getValue(); Parameter::Instance()->max_splits = arg_splits.getValue(); Parameter::Instance()->max_dist = arg_dist.getValue(); Parameter::Instance()->min_length = arg_minlength.getValue(); Parameter::Instance()->genotype = arg_genotype.getValue(); Parameter::Instance()->phase = arg_cluster.getValue(); Parameter::Instance()->num_threads = arg_threads.getValue(); Parameter::Instance()->output_bedpe = arg_bedpe.getValue(); Parameter::Instance()->tmp_file = arg_tmp_file.getValue(); Parameter::Instance()->min_grouping_support = arg_cluster_supp.getValue(); Parameter::Instance()->min_allelel_frequency = arg_allelefreq.getValue(); Parameter::Instance()->min_segment_size = arg_segsize.getValue(); Parameter::Instance()->reportBND = arg_bnd.getValue(); Parameter::Instance()->input_vcf = arg_input_vcf.getValue(); Parameter::Instance()->print_seq = arg_seq.getValue(); Parameter::Instance()->ignore_std = arg_std.getValue(); Parameter::Instance()->min_zmw = arg_zmw.getValue(); Parameter::Instance()->homfreq = arg_homofreq.getValue(); Parameter::Instance()->hetfreq = arg_hetfreq.getValue(); //Parse IDS: /*std::string buffer = arg_chrs.getValue(); int count = 0; std::string name = ""; for (size_t i = 0; i < buffer.size(); i++) { if (buffer[i] == ',') { Parameter::Instance()->chr_names[name] = true; name.clear(); } else { name += buffer[i]; } } if (!name.empty()) { Parameter::Instance()->chr_names[name] = true; } */ if (Parameter::Instance()->min_allelel_frequency > 0 || !Parameter::Instance()->input_vcf.empty()) { std::cerr << "Automatically enabling genotype mode" << std::endl; Parameter::Instance()->genotype = true; } if (Parameter::Instance()->tmp_file.empty()) { //TODO change to genotyper file 
/*
 * Root of the mean squared distance between `start` and the entries of `pos`.
 *
 * Entries equal to -1 add nothing to the sum of squares but are still counted
 * in the divisor (unchanged from the previous implementation).
 * NOTE(review): looks like -1 marks an unset position - confirm whether such
 * entries should be excluded from the divisor as well.
 *
 * Returns 0 for an empty vector; the former 0/0 produced NaN.
 */
double comp_std(const std::vector<int> & pos, int start) {
	if (pos.empty()) {
		return 0.0;
	}
	double sum_sq = 0;
	for (size_t i = 0; i < pos.size(); i++) {
		if (pos[i] == -1) {
			continue;
		}
		// Subtract in double: the int subtraction could overflow, and a plain
		// multiplication replaces the general-purpose pow(x, 2.0).
		const double diff = (double) start - (double) pos[i];
		sum_sq += diff * diff;
	}
	return std::sqrt(sum_sq / (double) pos.size());
}
double count = 0; std::vector std_start_dists; double std_start = 0; for (std::vector::iterator i = positions.begin(); i != positions.end(); i++) { long diff = (position - (*i)); // std::cout << "DIFF Start: " << diff << std::endl; test_sort_insert(std::pow((double) diff, 2.0), std_start_dists); //std_start += std::pow((double) diff, 2.0); } count = 0; for (size_t i = 0; i < std_start_dists.size() / 2; i++) { std_start += std_start_dists[i]; count++; } return std::sqrt(std_start / count); } void test_std() { srand(time(NULL)); int start = rand() % 100000; /// sqrt(1/12) for ins. Plot TRA std vs. cov/support. std::vector positions; double avg = 0; double num = 0; for (int border = 100; border < 9001; border = border * 10) { for (int t = 0; t < 10; t++) { for (int cov = 2; cov < 5; cov += 1) { for (size_t i = 0; i < cov; i++) { int pos = (rand() % border) + (start - (border / 2)); positions.push_back(pos); } avg += comp_std(positions, start) / test_comp_std_quantile(positions, start); std::cout << "Cov: " << cov + 1 << " border: " << border << " STD: " << comp_std(positions, start) << std::endl; // / test_comp_std_quantile(positions, start) << std::endl; positions.clear(); num++; } } } std::cout << "AVG: " << avg / num << std::endl; } void get_rand(int mean, int num, vector & positions, int interval) { //std::cout << "sim " << num << std::endl; for (size_t i = 0; i < num; i++) { int pos = (rand() % interval) + (mean - (interval / 2)); positions.push_back(pos); } } #include std::vector sort_distance(std::vector positions, int mean) { std::vector distances; for (size_t i = 0; i < positions.size(); i++) { int dist = std::abs(mean - positions[i]); size_t j = 0; while (j < distances.size()) { if (std::abs(mean - distances[j]) < dist) { distances.insert(distances.begin() + j, positions[i]); break; } j++; } if (j == distances.size()) { distances.push_back(positions[i]); } } return distances; } void test_slimming() { double fract = 0.2; srand(time(NULL)); int mean = rand() % 
100000; /// sqrt(1/12) for ins. Plot TRA std vs. cov/support. int intervall = 1000; std::vector > stds; int key = 0; int cov = 100; for (double fract = 0.1; fract < 1; fract += 0.1) { //std::cout< positions; get_rand(mean, round(cov * fract), positions, intervall); //random process get_rand(mean, round(cov * (1 - fract)), positions, 10); //focused calls // std::cout << "Cov: " << cov << " border: " << intervall << " STD: " << comp_std(positions, mean) << std::endl; std::vector dists; dists = sort_distance(positions, mean); /* for (size_t i = 0; i < dists.size(); i++) { std::cout << abs(mean - dists[i]) << std::endl; } */ std::vector std_tmp; for (size_t i = 0; i < dists.size(); i++) { std::vector tmp; tmp.assign(dists.rbegin(), dists.rend() - i); double std = comp_std(tmp, mean); //std::cout << "Points: " << tmp.size() << " STD: " << std << std::endl; std_tmp.push_back(std); } stds.push_back(std_tmp); } for (size_t i = 0; i < stds.size(); i++) { for (size_t j = 0; j < stds[i].size(); j++) { std::cout << stds[i][j] << "\t"; } std::cout << std::endl; } } int main(int argc, char *argv[]) { try { //init parameter and reads user defined parameter from command line. read_parameters(argc, argv); //init openmp: omp_set_dynamic(0); omp_set_num_threads(Parameter::Instance()->num_threads); if ((!Parameter::Instance()->output_vcf.empty()) && (!Parameter::Instance()->output_bedpe.empty())) { std::cerr << "Please select only vcf OR bedpe output format!" 
<< std::endl; exit(1); } //init printer: IPrinter * printer; if (!Parameter::Instance()->output_vcf.empty()) { printer = new VCFPrinter(); } else if (!Parameter::Instance()->output_bedpe.empty()) { printer = new BedpePrinter(); } else { std::cerr << "Please specify an output file using -v or -b" << std::endl; return -1; } printer->init(); if (Parameter::Instance()->input_vcf.empty()) { //regular calling detect_breakpoints(Parameter::Instance()->bam_files[0], printer); //we could write out all read names for each sVs } else { //force calling was selected: force_calling(Parameter::Instance()->bam_files[0], printer); } printer->close_file(); //cluster the SVs together: if (Parameter::Instance()->phase) { std::cout << "Start phasing: " << std::endl; Cluster_SVS *cluster = new Cluster_SVS(); cluster->update_SVs(); } //determine genotypes: if (Parameter::Instance()->genotype) { std::cout << "Start genotype calling:" << std::endl; Genotyper * go = new Genotyper(); go->update_SVs(); } } catch (TCLAP::ArgException &e) // catch any exceptions { std::cerr << "Sniffles error: " << e.error() << " for arg " << e.argId() << std::endl; } return 0; } Sniffles-1.0.7/src/Version.h000066400000000000000000000002071320237057600156610ustar00rootroot00000000000000#ifndef VERSION_H #define VERSION_H #define VERSION_MAJOR "" #define VERSION_MINOR "" #define VERSION_BUILD "" #endif // VERSION_H Sniffles-1.0.7/src/Version.h.in000066400000000000000000000003001320237057600162600ustar00rootroot00000000000000#ifndef VERSION_H #define VERSION_H #define VERSION_MAJOR "@NGM_VERSION_MAJOR@" #define VERSION_MINOR "@NGM_VERSION_MINOR@" #define VERSION_BUILD "@NGM_VERSION_BUILD@" #endif // VERSION_H Sniffles-1.0.7/src/cluster/000077500000000000000000000000001320237057600155455ustar00rootroot00000000000000Sniffles-1.0.7/src/cluster/Cluster_SVs.cpp000066400000000000000000000104131320237057600204640ustar00rootroot00000000000000/* * Cluster_SVs.cpp * * Created on: Apr 28, 2016 * Author: fsedlaze */ #include 
"Cluster_SVs.h" std::map > Cluster_SVS::parse_names_ids(int & max_ID) { FILE * alt_allel_reads = fopen(Parameter::Instance()->tmp_phasing.c_str(), "r"); if (alt_allel_reads == NULL) { std::cerr << "ClusterParse: could not open tmp file: " << Parameter::Instance()->tmp_phasing << std::endl; } std::map > names; name_str tmp; size_t nbytes = fread(&tmp, sizeof(struct name_str), 1, alt_allel_reads); while (nbytes != 0) { max_ID = std::max(max_ID, tmp.svs_id); //needs to be a long as we need to know the size prior to storing! names[tmp.read_name].push_back(tmp.svs_id); nbytes = fread(&tmp, sizeof(struct name_str), 1, alt_allel_reads); } fclose(alt_allel_reads); return names; } void Cluster_SVS::update_SVs(std::vector & ids) { std::ifstream myfile; bool is_vcf = !Parameter::Instance()->output_vcf.empty(); std::string filename; int col; if (is_vcf) { col = 2; filename = Parameter::Instance()->output_vcf; } else { col = 6; filename = Parameter::Instance()->output_bedpe; } myfile.open(filename.c_str(), std::ifstream::in); if (!myfile.good()) { std::cout << "Cluster Parse: could not open file: " << std::endl; exit(0); } std::string tmp_name_file = filename; tmp_name_file += ".tmp"; FILE*file = fopen(tmp_name_file.c_str(), "w"); size_t buffer_size = 2500000; char* buffer = new char[buffer_size]; myfile.getline(buffer, buffer_size); //parse SVs breakpoints in file while (!myfile.eof()) { if (buffer[0] != '#') { int count = 0; for (size_t i = 0; i < buffer_size && (buffer[i] != '\0' && buffer[i] != '\n'); i++) { if (count == col) { //if colum of id: if (buffer[i - 1] == '\t') { int id = atoi(&buffer[i]); fprintf(file, "%s", find_id(id, ids).c_str()); fprintf(file, "%c", '\t'); } } else { fprintf(file, "%c", buffer[i]); } if (buffer[i] == '\t') { count++; } } } else { fprintf(file, "%s", buffer); } fprintf(file, "%c", '\n'); myfile.getline(buffer, buffer_size); } myfile.close(); fclose(file); std::string move = "mv "; move += tmp_name_file; move += " "; move += filename; 
system(move.c_str()); } void Cluster_SVS::add_id(int curr_id, int new_id, std::vector & ids, int subkey) { for (size_t i = 0; i < ids.size(); i++) { //check if already in the new array if (ids[i].curr_id == curr_id && ids[i].alt_id == new_id) { ids[i].support++; ids[i].hit = subkey; return; } } //make new entry: combine_str tmp; tmp.curr_id = curr_id; tmp.alt_id = new_id; //smallest ID of SVs tmp.support = 1; tmp.hit = subkey; ids.push_back(tmp); } std::string Cluster_SVS::find_id(int curr_id, std::vector & ids) { std::stringstream ss; for (size_t i = 0; i < ids.size(); i++) { if (ids[i].support > Parameter::Instance()->min_grouping_support) { if (ids[i].curr_id == curr_id) { ss << ids[i].alt_id; ss << '_'; ss << ids[i].hit; return ss.str(); } } } ss << curr_id; return ss.str(); } void Cluster_SVS::update_SVs() { //1: read in names + IDs -> store in map! int max_ID = 0; //TODO: restructure! //id=svs_id; std::map > names = parse_names_ids(max_ID); //key = read_id values: SVs id's //2: make array with ID as entry and value is the smalles ID in the colum of all storred readnames. std::vector ids; for (std::map >::iterator i = names.begin(); i != names.end(); i++) { if ((*i).second.size() > 1) { int min_id = max_ID + 1; for (size_t j = 0; j < (*i).second.size(); j++) { // get the smallest ID of SVs associated with the read id. min_id = std::min(min_id, (*i).second[j]); } //min_id is now the smallest SVs id that this read is associated with. int subkey = 0; for (size_t j = 0; j < (*i).second.size(); j++) { // update the other SVs IDs //if ((*i).second[j] != min_id) { add_id((*i).second[j], min_id, ids, subkey); subkey++; //} } } } names.clear(); //3: Update the IDS in the VCF/Bedpe files. 
update_SVs(ids); std::cout << "\tCleaning tmp files" << std::endl; std::string del = "rm "; del += Parameter::Instance()->tmp_phasing; // system(del.c_str()); } Sniffles-1.0.7/src/cluster/Cluster_SVs.h000066400000000000000000000017171320237057600201400ustar00rootroot00000000000000/* * Cluster_SVs.h * * Created on: Apr 28, 2016 * Author: fsedlaze */ #ifndef CLUSTER_CLUSTER_SVS_H_ #define CLUSTER_CLUSTER_SVS_H_ #include #include #include #include #include #include #include "../Paramer.h" #include #include struct __attribute__((packed)) name_str{ long read_name; //needs to be a number to store in binary! (bit reservation!) int svs_id; }; struct combine_str{ int curr_id; int alt_id; int support; short hit; }; class Cluster_SVS{ private: std::map > parse_names_ids(int & max_ID) ; void update_SVs( std::vector & ids); //just because the pass is more efficient void add_id(int curr_id,int new_id, std::vector & ids,int subkey); std::string find_id(int curr_id, std::vector & ids); public: Cluster_SVS(){ } ~Cluster_SVS(){ } void update_SVs(); }; #endif /* CLUSTER_CLUSTER_SVS_H_ */ Sniffles-1.0.7/src/force_calling/000077500000000000000000000000001320237057600166535ustar00rootroot00000000000000Sniffles-1.0.7/src/force_calling/Force_calling.cpp000066400000000000000000000137421320237057600221150ustar00rootroot00000000000000/* * Force_calling.cpp * * Created on: Aug 24, 2017 * Author: sedlazec */ #include "Force_calling.h" char assign_type(short type) { switch (type) { case 0: //DEL return DEL; case 1: //DUP return INV; case 2: //INV return INV; case 3: //TRA return TRA; case 4: //INS return INS; case 6: return NEST; } return ' '; //TODO check default. Should not happen! 
} void fill_tree(IntervallTree & final, TNode *& root_final, RefVector ref, std::map& ref_lens) { //prepare lookup: long length = 0; for (size_t i = 0; i < ref.size(); i++) { ref_lens[ref[i].RefName.c_str()] = length; length += ref[i].RefLength + Parameter::Instance()->max_dist; } //parse VCF file std::vector entries = parse_vcf(Parameter::Instance()->input_vcf, 0); std::cout << "\t\t" << entries.size() << " SVs found in input." << std::endl; for (size_t i = 0; i < entries.size(); i++) { if (entries[i].type != -1) { position_str svs; svs.start.min_pos = (long) entries[i].start.pos + ref_lens[entries[i].start.chr]; svs.stop.max_pos = (long) entries[i].stop.pos + ref_lens[entries[i].stop.chr]; read_str read; read.SV = assign_type(entries[i].type); read.strand=entries[i].strands; read.type = 2; //called svs.support["input"] = read; Breakpoint * br = new Breakpoint(svs, (long) entries[i].sv_len); final.insert(br, root_final); //std::cout << "Print:" << std::endl; //final.print(root_final); } else { cerr << "Invalid type found skipping" << endl; } } entries.clear(); } void force_calling(std::string bam_file, IPrinter *& printer) { cout<<"Force calling SVs"<bam_files[0]); BamParser * mapped_file = 0; RefVector ref; std::string read_filename = Parameter::Instance()->bam_files[0]; if (read_filename.find("bam") != string::npos) { mapped_file = new BamParser(read_filename); ref = mapped_file->get_refInfo(); } else { cerr << "File Format not recognized. File must be a sorted .bam file!" << endl; exit(0); } std::cout << "Construct Tree..." << std::endl; //construct the tree: IntervallTree final; TNode * root_final = NULL; std::map ref_lens; fill_tree(final, root_final, ref, ref_lens); std::cout << "Start parsing..." 
<< std::endl; int current_RefID = 0; //FILE * alt_allel_reads; FILE * ref_allel_reads; if (Parameter::Instance()->genotype) { ref_allel_reads = fopen(Parameter::Instance()->tmp_genotyp.c_str(), "wb"); } Alignment * tmp_aln = mapped_file->parseRead(Parameter::Instance()->min_mq); long ref_space = ref_lens[ref[tmp_aln->getRefID()].RefName]; long num_reads = 0; while (!tmp_aln->getQueryBases().empty()) { if ((tmp_aln->getAlignment()->IsPrimaryAlignment()) && (!(tmp_aln->getAlignment()->AlignmentFlag & 0x800) && tmp_aln->get_is_save())) { //change CHR: if (current_RefID != tmp_aln->getRefID()) { current_RefID = tmp_aln->getRefID(); ref_space = ref_lens[ref[tmp_aln->getRefID()].RefName]; std::cout << "\tSwitch Chr " << ref[tmp_aln->getRefID()].RefName << std::endl; //" " << ref[tmp_aln->getRefID()].RefLength } //check if overlap with any breakpoint!! long read_start_pos = (long) tmp_aln->getPosition() - (long)Parameter::Instance()->max_dist; read_start_pos += ref_space; long read_stop_pos = read_start_pos + (long) tmp_aln->getAlignment()->Length + (long)Parameter::Instance()->max_dist; //getRefLength();//(long) tmp_aln->getPosition(); if (final.overlaps(read_start_pos, read_stop_pos, root_final)) { //SCAN read: std::vector aln_event; std::vector split_events; if (tmp_aln->getMappingQual() > Parameter::Instance()->min_mq) { double score = tmp_aln->get_scrore_ratio(); #pragma omp parallel // starts a new team { #pragma omp sections { { // clock_t begin = clock(); if ((score == -1 || score > Parameter::Instance()->score_treshold)) { aln_event = tmp_aln->get_events_Aln(); } // Parameter::Instance()->meassure_time(begin, " Alignment "); } #pragma omp section { // clock_t begin_split = clock(); split_events = tmp_aln->getSA(ref); // Parameter::Instance()->meassure_time(begin_split," Split reads "); } } } //tmp_aln->set_supports_SV(aln_event.empty() && split_events.empty()); //Store reference supporting reads for genotype estimation: str_read tmp; bool SV_support = 
!(aln_event.empty() && split_events.empty()); if ((Parameter::Instance()->genotype && !SV_support) && (score == -1 || score > Parameter::Instance()->score_treshold)) { //write read: //cout<<"REf: "<getName()<<" "<getPosition()<<" "<getRefLength()<getRefID(); tmp.start = tmp_aln->getPosition(); tmp.length = tmp_aln->getRefLength(); fwrite(&tmp, sizeof(struct str_read), 1, ref_allel_reads); } //store the potential SVs: if (!aln_event.empty()) { add_events(tmp_aln, aln_event, 0, ref_space, final, root_final, num_reads, true); } if (!split_events.empty()) { add_splits(tmp_aln, split_events, 1, ref, final, root_final, num_reads, true); } } } } //get next read: mapped_file->parseReadFast(Parameter::Instance()->min_mq, tmp_aln); num_reads++; if (num_reads % 10000 == 0) { cout << "\t\t# Processed reads: " << num_reads << endl; } } std::cout << "Print:" << std::endl; //final.print(root_final); //filter and copy results: std::cout << "Finalizing .." << std::endl; if (Parameter::Instance()->genotype) { fclose(ref_allel_reads); } // sweep->finalyze(); std::vector points; final.get_breakpoints(root_final, points); //std::cout<<"fin up"<calc_support(); points[i]->predict_SV(); printer->printSV(points[i]); //redo! Ignore min support + STD etc. 
} } Sniffles-1.0.7/src/force_calling/Force_calling.h000066400000000000000000000011061320237057600215510ustar00rootroot00000000000000/* * Force_calling.h * * Created on: Aug 24, 2017 * Author: sedlazec */ #ifndef FORCE_CALLING_FORCE_CALLING_H_ #define FORCE_CALLING_FORCE_CALLING_H_ #include #include #include #include #include #include #include #include "../Paramer.h" #include "../BamParser.h" #include "../tree/BinTree.h" #include "../sub/Breakpoint.h" #include "../sub/Detect_Breakpoints.h" #include "VCF_parser.h" void force_calling(std::string bam_file, IPrinter *& printer); #endif /* FORCE_CALLING_FORCE_CALLING_H_ */ Sniffles-1.0.7/src/force_calling/VCF_parser.cpp000066400000000000000000000273631320237057600213640ustar00rootroot00000000000000/* * VCF_parser.cpp * * Created on: Aug 24, 2017 * Author: sedlazec */ #include "VCF_parser.h" strcoordinate parse_stop(const char * buffer) { size_t i = 0; bool chr_flag = false; strcoordinate pos; pos.chr = ""; pos.pos = -1; while (buffer[i] != '\t' && (buffer[i] != '\n' && buffer[i] != '\0')) { if (strncmp(&buffer[i], ";END=", 5) == 0) { pos.pos = atoi(&buffer[i + 5]); } if ((strncmp(&buffer[i], "END=", 4) == 0 && i == 0)) { pos.pos = atoi(&buffer[i + 4]); //std::cout<<"pos"< parse_strands(const char * buffer) { std::pair strands; size_t i = 0; while (buffer[i] != '\t' && (buffer[i] != '\n' && buffer[i] != '\0')) { if (strncmp(&buffer[i], "3to5", 4) == 0) { strands.first = false; strands.second = true; return strands; } if (strncmp(&buffer[i], "3to3", 4) == 0) { strands.first = false; strands.second = false; return strands; } if (strncmp(&buffer[i], "5to3", 4) == 0) { strands.first = true; strands.second = false; return strands; } if (strncmp(&buffer[i], "5to5", 4) == 0) { strands.first = true; strands.second = true; return strands; } i++; } return strands; } std::vector parse_callers(char* buffer) { size_t i = 0; std::vector entries; //std::cout< parse_manta(char * buffer) { std::pair res; res.first = 0; res.second = 0; 
size_t i = 0; while (buffer[i] != '\t') { i++; } //std::cout< 3) { if (buffer[i - 1] == ':') { res.first += atoi(&buffer[i]); // std::cout<<"first: "< parse_delly(char * buffer) { // GT:GL:GQ:FT:RCL:RC:RCR:CN:DR:DV:RR:RV 0/1:-9.02876,0,-32.6842:90:PASS:0:22219093:0:-1:14:3:0:0 std::pair res; res.first = 0; res.second = 0; size_t i = 0; while (buffer[i] != '\t') { i++; } //std::cout< abs(ref - size)) { most_alt = tmp; } tmp.clear(); } else { tmp += alt[i]; } i++; } return most_alt; } std::vector parse_vcf(std::string filename, int min_svs) { size_t buffer_size = 200000000; char*buffer = new char[buffer_size]; std::ifstream myfile; myfile.open(filename.c_str(), std::ifstream::in); if (!myfile.good()) { std::cout << "VCF Parser: could not open file: " << filename.c_str() << std::endl; exit(0); } std::vector calls; myfile.getline(buffer, buffer_size); int num = 0; while (!myfile.eof()) { if (buffer[0] != '#') { // std::cout< 0) { tmp.type = 0; //deletion } else if (tmp.sv_len < 0) { tmp.type = 4; //insertions tmp.sv_len = abs(tmp.sv_len); // std::cout<<"INS: "<= min_svs))) { // || tmp.type==4 std::size_t found = tmp.stop.chr.find("chr"); if (found != std::string::npos) { tmp.stop.chr.erase(tmp.stop.chr.begin() + found, tmp.stop.chr.begin() + found + 3); } found = tmp.start.chr.find("chr"); if (found != std::string::npos) { tmp.start.chr.erase(tmp.start.chr.begin() + found, tmp.start.chr.begin() + found + 3); } if (tmp.type == 5) { //BND if (strcmp(tmp.stop.chr.c_str(), tmp.start.chr.c_str()) == 0) { tmp.type = 2; } else { tmp.type = 3; } } calls.push_back(tmp); } tmp.calls.clear(); } else { } myfile.getline(buffer, buffer_size); } myfile.close(); //std::cout << calls.size() << std::endl; return calls; } Sniffles-1.0.7/src/force_calling/VCF_parser.h000066400000000000000000000015471320237057600210250ustar00rootroot00000000000000/* * VCF_parser.h * * Created on: Aug 24, 2017 * Author: sedlazec */ #ifndef FORCE_CALLING_VCF_PARSER_H_ #define FORCE_CALLING_VCF_PARSER_H_ 
#include #include #include #include #include #include #include #include #include #include struct strcoordinate{ int pos; std::string chr; }; struct strvcfentry{ std::string header; strcoordinate start; strcoordinate stop; short type; //0=DEL,1=DUP,2=INV,3=TRA std::map calls; int sup_lumpy; int caller_id; std::vector caller_supports; std::pair strands; std::pair num_reads; //ref alt std::string genotype; int sv_len; //int num_reads; }; std::vector parse_vcf(std::string filename, int min_svs); #endif /* FORCE_CALLING_VCF_PARSER_H_ */ Sniffles-1.0.7/src/plane-sweep/000077500000000000000000000000001320237057600163045ustar00rootroot00000000000000Sniffles-1.0.7/src/plane-sweep/IContainer.h000066400000000000000000000005551320237057600205150ustar00rootroot00000000000000/* * IContainer.h * * Created on: Apr 14, 2015 * Author: fsedlaze */ #ifndef ICONTAINER_H_ #define ICONTAINER_H_ #include "Node.h" class IContainer{ protected: public: IContainer(){} virtual ~IContainer(){}; virtual void push(Alignment * read)=0; virtual int size()=0; virtual Node * get_entries(int new_start)=0; }; #endif /* ICONTAINER_H_ */ Sniffles-1.0.7/src/plane-sweep/Main.cpp000066400000000000000000000050351320237057600176770ustar00rootroot00000000000000//============================================================================ // Name : plane_sweep.cpp // Author : Fritz Sedlazeck // Version : // Copyright : Your copyright notice // Description : Hello World in C++, Ansi-style //============================================================================ /* #include #include #include #include "SamParser.h" #include "Plane-sweep.h" using namespace std::chrono; int main(int argc, char *argv[]) { if (argc > 1) { IParser *parser; std::string filename = std::string(argv[2]); std::cout << filename << std::endl; std::string samfile = "sam"; std::size_t found = filename.find(samfile); if (found != std::string::npos) { parser = new SamParser(); } else { parser = new TableParser(); } std::vector reads = 
parser->parse_reads(filename); std::cout << "Read in reads: " << reads.size() << std::endl; switch (atoi(argv[1])) { case 1: if (argc == 3) { //exchangeable by implementing IParser: PlaneSweep * sweep = new PlaneSweep(); std::cout << "start storing:" << std::endl; high_resolution_clock::time_point t1 = high_resolution_clock::now(); for (size_t i = 0; i < reads.size(); i++) { sweep->add_read(reads[i]); } sweep->finalyze(); high_resolution_clock::time_point t2 = high_resolution_clock::now(); auto duration = std::chrono::duration_cast( t2 - t1 ).count(); std::cerr<<"time elapsed: "< genome; genome.resize(maxim + 1, 0); for (size_t i = 0; i < reads.size(); i++) { size_t j = reads[i].start; while (j < reads[i].stop) { genome[j]++; j++; } } for (size_t i = 0; i < genome.size(); i++) { if (i > 0 && genome[i - 1] != genome[i]) { std::cout << i << '\t' << genome[i] << std::endl; } } genome.clear(); high_resolution_clock::time_point t2 = high_resolution_clock::now(); auto duration = std::chrono::duration_cast( t2 - t1 ).count(); std::cerr<<"time elapsed: "<get_stop() <= new_start){ Node * min = heap[0]; heap[0] = heap.at(heap.size() - 1); heap.pop_back(); heapifydown(0); std::cout<<"get entries end "<size()<::iterator pos = heap.begin(); std::cout << "Heap = "; while ( pos != heap.end() ) { std::cout << (*pos)->get_stop() << " "; ++pos; } std::cout << std::endl; } void MyHeap::heapifyup(int index) { while ( ( index > 0 ) && ( parent(index) >= 0 ) && ( heap[parent(index)]->get_stop() > heap[index]->get_stop() ) ) { Node * tmp = heap[parent(index)]; heap[parent(index)] = heap[index]; heap[index] = tmp; index = parent(index); } } void MyHeap::heapifydown(int index) { int child = left(index); if ( ( child > 0 ) && ( right(index) > 0 ) && ( heap[child]->get_stop() > heap[right(index)]->get_stop() ) ) { child = right(index); } if ( child > 0 ) { Node* tmp = heap[index]; heap[index] = heap[child]; heap[child] = tmp; heapifydown(child); } } int MyHeap::left(int parent) { int i = ( 
parent << 1 ) + 1; // 2 * parent + 1 return ( i < heap.size() ) ? i : -1; } int MyHeap::right(int parent) { int i = ( parent << 1 ) + 2; // 2 * parent + 2 return ( i < heap.size() ) ? i : -1; } int MyHeap::parent(int child) { if (child != 0) { int i = (child - 1) >> 1; return i; } return -1; } Sniffles-1.0.7/src/plane-sweep/MyHeap.h000066400000000000000000000011411320237057600176350ustar00rootroot00000000000000/* * MyHeap.h * * Created on: Apr 14, 2015 * Author: fsedlaze */ #ifndef MYHEAP_H_ #define MYHEAP_H_ #include #include #include "IContainer.h" #include "Node.h" class MyHeap: public IContainer{ private: std::vector heap; int left(int parent); int right(int parent); int parent(int child); void heapifyup(int index); void heapifydown(int index); public: MyHeap(){}; ~MyHeap(){}; void push(Alignment * read); Node * get_entries(int new_start); int size() { return this->heap.size(); } void print(); }; #endif /* MYHEAP_H_ */ Sniffles-1.0.7/src/plane-sweep/MyList.cpp000066400000000000000000000030211320237057600202250ustar00rootroot00000000000000/* * MyList.cpp * * Created on: Apr 14, 2015 * Author: fsedlaze */ #include "MyList.h" void MyList::push(Alignment * read) { Node * new_node = new Node(read); //std::cout << new_node->get_read()->getPosition() << " " // << new_node->get_read()->getPosition() + new_node->get_stop() // << std::endl; this->num_nodes++; if (this->start == NULL) { //set first node: start = new_node; last = new_node; } else { //we have already something: if (this->last != NULL && new_node->get_stop() > this->last->get_stop()) { //should be inserted after the last node last->set_next(new_node); last = new_node; } else { //insert within the list: if (start->get_stop() > new_node->get_stop()) { //insert before start; new_node->set_next(start); start = new_node; } else { Node * prev = start; Node * curr = start->get_next(); while (curr != NULL && curr->get_stop() < new_node->get_stop()) { prev = curr; curr = curr->get_next(); } new_node->set_next(curr); 
prev->set_next(new_node); } } } } Node* MyList::pop() { Node * tmp = start; if (tmp != NULL) { start = start->get_next(); if (tmp == this->last) { last = NULL; } this->num_nodes--; tmp->set_next(NULL); } return tmp; //TODO: care about deleting the object! } int MyList::size() { return num_nodes;// + num_nodes_lowMQ; } Node * MyList::get_entries(int new_start) { if (new_start == -1 || (this->start != NULL && this->start->get_stop() <= new_start)) { return pop(); } return NULL; } Sniffles-1.0.7/src/plane-sweep/MyList.h000066400000000000000000000014511320237057600176770ustar00rootroot00000000000000/* * MyList.h * * Created on: Apr 14, 2015 * Author: fsedlaze */ #ifndef MYLIST_H_ #define MYLIST_H_ #include #include "Node.h" #include "IContainer.h" class MyList: public IContainer{ private: Node * start; Node * last; int num_nodes; int split_read; int num_nodes_lowMQ; public: MyList(){ split_read=0; num_nodes=0; num_nodes_lowMQ=0; start=NULL; last=NULL; } ~MyList(){ delete [] start; num_nodes=0; split_read=0; } Node * pop(); void push(Alignment * read); int size(); Node * get_entries(int new_start); Node * get_start(){ return start; } int get_normal_reads(){ return this->num_nodes; //strange value! 
} int get_split_reads(){ return this->split_read; } int get_lowMqcov(){ return this->num_nodes_lowMQ; } }; #endif /* MYLIST_H_ */ Sniffles-1.0.7/src/plane-sweep/Node.h000066400000000000000000000023011320237057600173360ustar00rootroot00000000000000/* * Node.h * * Created on: Apr 14, 2015 * Author: fsedlaze */ #ifndef NODE_H_ #define NODE_H_ #include "../Alignment.h" class Node{ private: //Alignment * read; int start; int stop; short mq; Node * next; short times; //indicate if two reads stop at the same location bool processed; bool support; //to del public: Node(){ processed=false; next=NULL; times=0; start=0; stop=0; // read=NULL; mq=0; } Node(Alignment * read){ processed=false; next=NULL; times=1; mq=read->getMappingQual(); //this->read=read; this->start=read->getPosition(); this->stop=this->start+read->getRefLength(); support=read->supports_SV(); } ~Node(){ } bool supports_SV(){ return support; } Node * get_next(){ return this->next; } int get_stop(){ return this->stop; } void set_next(Node *next){ this->next=next; } /*Alignment * get_read(){ return this->read; } void set_read(Alignment * read){ this->start=read->getPosition(); this->stop=read->getRefLength(); this->read=read; }*/ void set_processed (bool value){ this->processed=value; } bool was_processed(){ return this->processed; } uint16_t getmq(){ return mq; } }; #endif /* NODE_H_ */ Sniffles-1.0.7/src/plane-sweep/Plane-sweep.h000066400000000000000000000020161320237057600206340ustar00rootroot00000000000000/* * Plane-sweep.h * * Created on: Apr 14, 2015 * Author: fsedlaze */ #ifndef PLANESWEEP_H_ #define PLANESWEEP_H_ #include #include #include "IContainer.h" #include "../Alignment.h" #include "MyList.h" #include "MyHeap.h" //#include "MyHeap.h" class PlaneSweep { private: MyList* current_reads; // will be used as heap. 
// void release_pos(int new_start); int RefID; //the current chr/contig; int curr_pos; public: PlaneSweep(){ curr_pos=0; RefID=-1; current_reads=new MyList(); //current_reads=new MyHeap(); } ~PlaneSweep(){ delete current_reads; } void release_pos(int new_start); void add_read(Alignment* read); void finalyze(); Node * get_current_reads(){ return current_reads->get_start(); } int get_num_reads(){ return current_reads->get_normal_reads(); } int get_num_SVreads(){ return current_reads->get_split_reads(); } int get_num_lowMQ_reads(){ return current_reads->get_lowMqcov(); } int get_RefID(){ return RefID; } }; #endif /* PLANESWEEP_H_ */ Sniffles-1.0.7/src/plane-sweep/PlaneSweep.cpp000066400000000000000000000020761320237057600210600ustar00rootroot00000000000000/* * PlaneSweep.cpp * * Created on: Apr 14, 2015 * Author: fsedlaze */ #include "Plane-sweep.h" void PlaneSweep::release_pos(int new_start) { Node * curr = this->current_reads->get_entries(new_start); while (curr != NULL) { delete curr; curr = NULL; curr = this->current_reads->get_entries(new_start); } } void PlaneSweep::add_read(Alignment* read) { //std::cout<<"\tnew: "<getPosition()<<" "<getPosition()+read->getRefLength()<current_reads->size() > 0 && read->getRefID() != this->RefID) { finalyze(); this->RefID = read->getRefID(); } else if (this->current_reads->size() == 0) { this->RefID = read->getRefID(); } //first check if we can release already some positions: release_pos(read->getPosition()); //insert read to our list if it does not support an SV: if(!read->supports_SV() && read->getMappingQual()>20){ current_reads->push(read); } } void PlaneSweep::finalyze() { //report the remaining positions/reads; std::cout << "Finalize" << std::endl; release_pos(-1); //flag for releasing all; } Sniffles-1.0.7/src/plane-sweep/PlaneSweep_slim.cpp000066400000000000000000000017001320237057600220750ustar00rootroot00000000000000/* * PlaneSweep_slim.cpp * * Created on: Mar 9, 2016 * Author: fsedlaze */ #include "PlaneSweep_slim.h" 
pair_str PlaneSweep_slim::add_mut(int pos,int new_cov, int min_cov) { //check if we need to release reads: std::vector::iterator j = entries.begin(); for (size_t i = 0; i < this->entries.size() && pos > entries[i].position; i++) { //no need to record ending events. We just search for starting events! this->cov-=entries[i].coverage; j++; } //erase old events: entries.erase(entries.begin(), j); //add current mut: //insert the stop coordinate! pair_str tmp; tmp.position=pos+boundary; tmp.coverage=new_cov; this->entries.push_back(tmp); this->cov+=new_cov; //record if we met the threshold: tmp.position=-1; //flag for not meeting the threshold if(this->cov>min_cov){ tmp.coverage = this->cov; tmp.position = pos; } return tmp; } void PlaneSweep_slim::finalyze() { entries.clear(); this->cov=0; } Sniffles-1.0.7/src/plane-sweep/PlaneSweep_slim.h000066400000000000000000000012411320237057600215420ustar00rootroot00000000000000/* * PlaneSweep_slim.h * * Created on: Mar 9, 2016 * Author: fsedlaze */ #ifndef PLANE_SWEEP_PLANESWEEP_SLIM_H_ #define PLANE_SWEEP_PLANESWEEP_SLIM_H_ #include #include #include #include #include "../Paramer.h" //for testing/debug struct pair_str{ int position; int coverage; }; class PlaneSweep_slim { private: int boundary; int cov; std::vector entries; public: PlaneSweep_slim() { boundary=100; cov=0; } ~PlaneSweep_slim(){ entries.clear(); } void release_pos(int new_start); pair_str add_mut(int pos,int cov, int min_cov); void finalyze(); }; #endif /* PLANE_SWEEP_PLANESWEEP_SLIM_H_ */ Sniffles-1.0.7/src/print/000077500000000000000000000000001320237057600152205ustar00rootroot00000000000000Sniffles-1.0.7/src/print/BedpePrinter.cpp000066400000000000000000000114271320237057600203140ustar00rootroot00000000000000/* * BedePrinter.cpp * * Created on: Aug 24, 2015 * Author: fsedlaze */ #include "BedpePrinter.h" void BedpePrinter::print_header() { fprintf(file, "%s", "#Chrom\tstart\tstop\tchrom2\tstart2\tstop2\tvariant_name/ID\tscore (smaller is 
better)\tstrand1\tstrand2\ttype\tnumber_of_split_reads\tbest_chr1\tbest_start\tbest_chr2\tbest_stop\tpredicted_length\n"); } void BedpePrinter::print_body(Breakpoint * &SV, RefVector ref) { if (!this->bed_tree.is_in(SV->get_coordinates().start.most_support, this->root) && !this->bed_tree.is_in(SV->get_coordinates().stop.most_support, this->root)) { //temp. store read names supporting this SVs to later group the SVs together. if (Parameter::Instance()->phase) { store_readnames(SV->get_read_ids(), id); } double std_quant_start = 0; double std_quant_stop = 0; pair kurtosis; pair std_quant; double std_length = 0; int zmws = 0; bool ok_to_print = (to_print(SV, std_quant, kurtosis, std_length, zmws) || Parameter::Instance()->ignore_std); if (ok_to_print && (zmws == 0 || zmws >= Parameter::Instance()->min_zmw)) { std::string chr; std::string strands = SV->get_strand(2); int pos = IPrinter::calc_pos(SV->get_coordinates().start.min_pos, ref, chr); fprintf(file, "%s", chr.c_str()); fprintf(file, "%c", '\t'); fprintf(file, "%i", pos); fprintf(file, "%c", '\t'); fprintf(file, "%i", IPrinter::calc_pos(SV->get_coordinates().start.max_pos, ref, chr)); fprintf(file, "%c", '\t'); pos = IPrinter::calc_pos(SV->get_coordinates().stop.min_pos, ref, chr); fprintf(file, "%s", chr.c_str()); fprintf(file, "%c", '\t'); fprintf(file, "%i", pos); fprintf(file, "%c", '\t'); fprintf(file, "%i", IPrinter::calc_pos(SV->get_coordinates().stop.max_pos, ref, chr)); fprintf(file, "%c", '\t'); fprintf(file, "%i", id); id++; fprintf(file, "%c", '\t'); fprintf(file, "%i", -1); //TODO: score fprintf(file, "%c", '\t'); fprintf(file, "%c", strands[0]); fprintf(file, "%c", '\t'); fprintf(file, "%c", strands[1]); fprintf(file, "%c", '\t'); fprintf(file, "%s", IPrinter::get_type(SV->get_SVtype()).c_str()); fprintf(file, "%c", '\t'); fprintf(file, "%i", SV->get_support()); fprintf(file, "%c", '\t'); pos = IPrinter::calc_pos(SV->get_coordinates().start.most_support, ref, chr); fprintf(file, "%s", chr.c_str()); 
fprintf(file, "%c", '\t'); fprintf(file, "%i", pos); fprintf(file, "%c", '\t'); pos = IPrinter::calc_pos(SV->get_coordinates().stop.most_support, ref, chr); fprintf(file, "%s", chr.c_str()); fprintf(file, "%c", '\t'); fprintf(file, "%i", pos); fprintf(file, "%c", '\t'); if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && !SV->get_types().is_SR) { fprintf(file, "%s", "NA"); } else { fprintf(file, "%i", SV->get_length()); } //fprintf(file, "%c", '\t'); //fprintf(file, "%i", SV->get_support()); fprintf(file, "%c", '\n'); } } } void BedpePrinter::print_body_recall(Breakpoint * &SV, RefVector ref) { std::string chr; std::string strands = SV->get_strand(2); int pos = IPrinter::calc_pos(SV->get_coordinates().start.min_pos, ref, chr); fprintf(file, "%s", chr.c_str()); fprintf(file, "%c", '\t'); fprintf(file, "%i", pos); fprintf(file, "%c", '\t'); fprintf(file, "%i", IPrinter::calc_pos(SV->get_coordinates().start.max_pos, ref, chr)); fprintf(file, "%c", '\t'); pos = IPrinter::calc_pos(SV->get_coordinates().stop.min_pos, ref, chr); fprintf(file, "%s", chr.c_str()); fprintf(file, "%c", '\t'); fprintf(file, "%i", pos); fprintf(file, "%c", '\t'); fprintf(file, "%i", IPrinter::calc_pos(SV->get_coordinates().stop.max_pos, ref, chr)); fprintf(file, "%c", '\t'); fprintf(file, "%i", id); id++; fprintf(file, "%c", '\t'); fprintf(file, "%i", -1); //TODO: score fprintf(file, "%c", '\t'); fprintf(file, "%c", strands[0]); fprintf(file, "%c", '\t'); fprintf(file, "%c", strands[1]); fprintf(file, "%c", '\t'); fprintf(file, "%s", IPrinter::get_type(SV->get_SVtype()).c_str()); fprintf(file, "%c", '\t'); fprintf(file, "%i", SV->get_support()); fprintf(file, "%c", '\t'); pos = IPrinter::calc_pos(SV->get_coordinates().start.most_support, ref, chr); fprintf(file, "%s", chr.c_str()); fprintf(file, "%c", '\t'); fprintf(file, "%i", pos); fprintf(file, "%c", '\t'); pos = IPrinter::calc_pos(SV->get_coordinates().stop.most_support, ref, chr); fprintf(file, "%s", 
chr.c_str()); fprintf(file, "%c", '\t'); fprintf(file, "%i", pos); fprintf(file, "%c", '\t'); if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && !SV->get_types().is_SR) { fprintf(file, "%s", "NA"); } else { fprintf(file, "%i", SV->get_length()); } //fprintf(file, "%c", '\t'); //fprintf(file, "%i", SV->get_support()); fprintf(file, "%c", '\n'); } Sniffles-1.0.7/src/print/BedpePrinter.h000066400000000000000000000006601320237057600177560ustar00rootroot00000000000000/* * BedePrinter.h * * Created on: Aug 24, 2015 * Author: fsedlaze */ #ifndef PRINT_BEDPEPRINTER_H_ #define PRINT_BEDPEPRINTER_H_ #include "IPrinter.h" class BedpePrinter:public IPrinter{ private: void print_header(); void print_body(Breakpoint *& SV, RefVector ref); void print_body_recall(Breakpoint * &SV, RefVector ref); public: BedpePrinter(){ } ~BedpePrinter(){ } }; #endif /* PRINT_BEDPEPRINTER_H_ */ Sniffles-1.0.7/src/print/IPrinter.cpp000066400000000000000000000214421320237057600174630ustar00rootroot00000000000000/* * IPrinter.cpp * * Created on: Aug 24, 2015 * Author: fsedlaze */ #include "IPrinter.h" std::string IPrinter::assess_genotype(int ref, int support) { double allele = (double) support / (double) (support + ref); if (allele < Parameter::Instance()->min_allelel_frequency) { return ""; } std::stringstream ss; ss << ";AF="; ss << allele; ss << "\tGT:DR:DV\t"; if (allele > Parameter::Instance()->homfreq) { ss <<"1/1:"; } else if (allele > Parameter::Instance()->hetfreq) { ss << "0/1:"; }else{ ss << "0/0:"; } ss << ref; ss << ":"; ss << support; return ss.str(); } bool IPrinter::is_huge_ins(Breakpoint * &SV) { int counts = 0; std::map support = SV->get_coordinates().support; for (std::map::iterator i = support.begin(); i != support.end(); i++) { if (((*i).second.coordinates.second - (*i).second.coordinates.first) == Parameter::Instance()->huge_ins) { counts++; } } //std::cout<<"Ratio: "<<((double)counts/(double)support.size())< 0.3); } bool 
IPrinter::to_print(Breakpoint * &SV, pair& std, pair & kurtosis, double & std_length, int & zmw_num) { std.first = 0; std.second = 0; //comp_std(SV, std_start, std_stop); std_length = 0; kurtosis = comp_std_quantile(SV, std, std_length, zmw_num); bool to_print = true; if ((SV->get_SVtype() & INS) && is_huge_ins(SV)) { return (std.first < 5 || std.second < 5); } if ((SV->get_SVtype() & INS) || (SV->get_SVtype() & DEL)) { //for insertions + deletions: double dist = (double) (SV->get_coordinates().stop.most_support - SV->get_coordinates().start.most_support); dist = dist * 4.0 * (uniform_variance / 2); //because we test against corrected value! return ((std.first < dist && std.second < dist)); //0.2886751 } if (SV->get_SVtype() & NEST) { return true; } double max_allowed = 4 * Parameter::Instance()->max_dist * (uniform_variance / 2); return (std.first < max_allowed && std.second < max_allowed); } std::string IPrinter::get_chr(long pos, RefVector ref) { // std::cout << "pos: " << pos << std::endl; size_t id = 0; while (id < ref.size() && pos >= 0) { pos -= ((long) ref[id].RefLength + (long) Parameter::Instance()->max_dist); // std::cout << id << std::endl; id++; } return ref[id - 1].RefName; } void IPrinter::store_readnames(std::vector names, int id) { name_str tmp; tmp.svs_id = id; //stays the same for (size_t i = 0; i < names.size(); i++) { tmp.read_name = names[i]; fwrite(&tmp, sizeof(struct name_str), 1, this->tmp_file); } } long IPrinter::calc_pos(long pos, RefVector ref, std::string &chr) { size_t i = 0; pos -= (ref[i].RefLength + Parameter::Instance()->max_dist); while (i + 1 < ref.size() && pos >= 0) { i++; // std::cout<max_dist<max_dist); } chr = ref[i].RefName; return pos + ref[i].RefLength + (long) Parameter::Instance()->max_dist; } std::string IPrinter::get_type(char type) { string tmp; if (type & DEL) { tmp += "DEL"; } if (type & INV) { if (!tmp.empty()) { tmp += '/'; } tmp += "INV"; } if (type & DUP) { if (!tmp.empty()) { tmp += '/'; } tmp += "DUP"; } if 
(type & INS) { if (!tmp.empty()) { tmp += '/'; } tmp += "INS"; } if (type & TRA) { if (!tmp.empty()) { tmp += '/'; } //tmp += "BND"; tmp += "TRA"; } if (type & NEST) { if (!tmp.empty()) { tmp += '/'; } tmp += "INVDUP"; } return tmp; // should not occur! } // Get current date/time, format is YYYY-MM-DD.HH:mm:ss const std::string IPrinter::currentDateTime() { time_t now = time(0); struct tm tstruct; char buf[80]; tstruct = *localtime(&now); // Visit http://en.cppreference.com/w/cpp/chrono/c/strftime // for more information about date/time format strftime(buf, sizeof(buf), "%Y%m%d", &tstruct); return buf; } void IPrinter::sort_insert(int pos, std::vector & positions) { size_t i = 0; while (i < positions.size() && positions[i] < pos) { i++; } positions.insert(positions.begin() + i, pos); } void IPrinter::comp_std_med(Breakpoint * &SV, double & std_start, double & std_stop) { std::vector std_start_dists; std::vector std_stop_dists; std::map support = SV->get_coordinates().support; for (std::map::iterator i = support.begin(); i != support.end(); i++) { if ((*i).second.SV & SV->get_SVtype()) { if ((*i).second.coordinates.first != -1) { long diff = (SV->get_coordinates().start.most_support - (*i).second.coordinates.first); // std::cout << "DIFF Start: " << diff << std::endl; sort_insert(std::pow((double) diff, 2.0), std_start_dists); //std_start += std::pow((double) diff, 2.0); } if ((*i).second.coordinates.second != -1) { long diff = (SV->get_coordinates().stop.most_support - (*i).second.coordinates.second); // std::cout << "DIFF Stop: " << diff << std::endl; sort_insert(std::pow((double) diff, 2.0), std_stop_dists); //std_stop += std::pow((double) diff, 2.0); } } } int median = std_stop_dists.size() / 2; std_start = std::sqrt(std_start_dists[median]); std_stop = std::sqrt(std_stop_dists[median]); } bool contains_zmw(std::string read_name, std::string & zmw) { //{movieName}/{zmwNumber}/{start}_{end}/ size_t i = 0; bool read = false; while (i < read_name.size()) { if (read 
&& read_name[i] != '/') { zmw += read_name[i]; } if (read_name[i] == '/') { read = !read; if (!zmw.empty()) { return true; } } i++; } return false; } pair IPrinter::comp_std_quantile(Breakpoint * &SV, pair & std, double & std_length, int & zmw_num) { double count = 0; std::vector std_start_dists; std::vector std_stop_dists; std::vector std_length_dists; //std::stringstream ss; double s4_start = 0; double s4_stop = 0; double s2_start = 0; double s2_stop = 0; std_length = 0; std::map support = SV->get_coordinates().support; std::map zmws; for (std::map::iterator i = support.begin(); i != support.end(); i++) { if (((*i).second.SV & SV->get_SVtype()) && strncmp((*i).first.c_str(), "input", 5) != 0) { std::string zmw = ""; if (contains_zmw((*i).first, zmw)) { zmws[zmw] = true; } long diff = SV->get_length() - ((*i).second.coordinates.second - (*i).second.coordinates.first); // sort_insert(std::pow((double) diff, 2.0), std_length_dists); //TODO think about that!! std_length += std::pow((double) diff, 2.0); if ((*i).second.coordinates.first != -1) { diff = (SV->get_coordinates().start.most_support - (*i).second.coordinates.first); //ss << '\t'; //ss << diff; sort_insert(std::pow((double) diff, 2.0), std_start_dists); s4_start += std::pow((double) diff, 4.0); s2_start += std::pow((double) diff, 2.0); } if ((*i).second.coordinates.second != -1) { diff = (SV->get_coordinates().stop.most_support - (*i).second.coordinates.second); sort_insert(std::pow((double) diff, 2.0), std_stop_dists); s4_stop += std::pow((double) diff, 4.0); s2_stop += std::pow((double) diff, 2.0); } } count++; } zmw_num = zmws.size(); std_length = std::sqrt(std_length / count); count = 0; for (int i = 0; i < std::max((int) std_stop_dists.size() / 2, 10) && i < std_start_dists.size(); i++) { std.first += std_start_dists[i]; std.second += std_stop_dists[i]; count++; } std.first = std::sqrt(std.first / count); std.second = std::sqrt(std.second / count); s4_start = s4_start / count; s4_stop = s4_stop / count; 
s2_start = s2_start / count; s2_stop = s2_stop / count; pair kurtosis; kurtosis.first = (s4_start / std::pow(s2_start, 2.0)) - 3; kurtosis.second = (s4_stop / std::pow(s2_stop, 2.0)) - 3; return kurtosis; } void IPrinter::comp_std(Breakpoint * &SV, double & std_start, double & std_stop) { double count = 0; std_start = 0; std_stop = 0; std::map support = SV->get_coordinates().support; for (std::map::iterator i = support.begin(); i != support.end(); i++) { if ((*i).second.SV & SV->get_SVtype()) { count++; if ((*i).second.coordinates.first != -1) { long diff = (SV->get_coordinates().start.most_support - (*i).second.coordinates.first); // std::cout << "DIFF Start: " << diff << std::endl; std_start += std::pow((double) diff, 2.0); } if ((*i).second.coordinates.second != -1) { long diff = (SV->get_coordinates().stop.most_support - (*i).second.coordinates.second); // std::cout << "DIFF Stop: " << diff << std::endl; std_stop += std::pow((double) diff, 2.0); } } } std_start = std::sqrt(std_start / count); std_stop = std::sqrt(std_stop / count); } Sniffles-1.0.7/src/print/IPrinter.h000066400000000000000000000057121320237057600171320ustar00rootroot00000000000000/* * IPrinter.h * * Created on: Aug 24, 2015 * Author: fsedlaze */ #ifndef PRINT_IPRINTER_H_ #define PRINT_IPRINTER_H_ #include #include #include #include "../tree/Intervall_bed.h" #include "api/BamReader.h" #include "../Ignore_Regions.h" #include "../sub/Breakpoint.h" #include "../cluster/Cluster_SVs.h" #include "../Genotyper/Genotyper.h" #include double const uniform_variance = 0.2886751; //sqrt(1/12) see variance of uniform distribution -> std class IPrinter { protected: FILE *file; FILE *distances; FILE *tmp_file; uint id; RefVector ref; BamParser *mapped_file; IntervallTree_bed bed_tree; Leaf *root; virtual void print_header()=0; virtual void print_body(Breakpoint * &SV, RefVector ref)=0; virtual void print_body_recall(Breakpoint * &SV, RefVector ref)=0; long calc_pos(long pos, RefVector ref, std::string &chr); 
std::string get_chr(long pos, RefVector ref); std::string get_type(char type); void sort_insert(int pos, std::vector & positons); bool is_huge_ins(Breakpoint * &SV); std::string assess_genotype(int ref, int support); public: IPrinter() { id = 0; root = NULL; //we just need the ref information: } virtual ~IPrinter() { delete mapped_file; } void printSV(Breakpoint * SV) { if(Parameter::Instance()->input_vcf.empty()){ print_body(SV, ref); }else{ print_body_recall(SV,ref); } } void init() { try { if (!Parameter::Instance()->output_vcf.empty()) { file = fopen(Parameter::Instance()->output_vcf.c_str(), "w"); } else if (!Parameter::Instance()->output_bedpe.empty()) { file = fopen(Parameter::Instance()->output_bedpe.c_str(), "w"); } } catch (...) { std::cerr << "Output file could not be created. Please check if path exists and if you have write permissions." << std::endl; exit(0); } if (file == NULL) { std::cerr << "Output file could not be created. Please check if path exists and if you have write permissions." << std::endl; exit(EXIT_FAILURE); } BamParser *mapped_file = new BamParser(Parameter::Instance()->bam_files[0]); this->ref = mapped_file->get_refInfo(); print_header(); if (!Parameter::Instance()->ignore_regions_bed.empty()) { std::cout << "Cross checking..." 
<< std::endl; initialize_bed(bed_tree, root, ref); } if (Parameter::Instance()->phase) { tmp_file = fopen(Parameter::Instance()->tmp_phasing.c_str(), "wb"); } } bool to_print(Breakpoint * &SV, pair &std, pair & kurtosis, double & std_length, int & zmw_num); void store_readnames(std::vector names, int id); void close_file() { fclose(this->file); } void comp_std(Breakpoint * &SV, double & std_start, double & std_stop); void comp_std_med(Breakpoint * &SV, double & std_start, double & std_stop); pair comp_std_quantile(Breakpoint * &SV, pair& std, double & std_lenght, int & zmw_num); const std::string currentDateTime(); }; #endif /* PRINT_IPRINTER_H_ */ Sniffles-1.0.7/src/print/NGMPrinter.cpp000066400000000000000000000042031320237057600177100ustar00rootroot00000000000000/* * NGMPrinter.cpp * * Created on: Sep 23, 2015 * Author: fsedlaze */ /* * MariaPrinter.cpp * * Created on: Sep 4, 2015 * Author: fsedlaze */ #include "NGMPrinter.h" void NGMPrinter::print_header() { } void NGMPrinter::print_body(Breakpoint *& SV, RefVector ref) { //"Chrom\tstart\tstop\tchrom2\tstart2\tstop2\tvariant_name/ID\tscore (smaller is better)\tstrand1\tstrand2\ttype\tnumber_of_split_reads\n" if (!this->bed_tree.is_in(SV->get_coordinates().start.most_support, this->root) && !this->bed_tree.is_in(SV->get_coordinates().stop.most_support, this->root)) { std::string chr; std::string strands = SV->get_strand(2); if ((SV->get_SVtype() & TRA) || SV->get_length() > 1000000) { //1MB?? 
int pos = IPrinter::calc_pos(SV->get_coordinates().start.min_pos, ref, chr) - Parameter::Instance()->max_dist; fprintf(file, "%s", chr.c_str()); fprintf(file, "%c", '\t'); if (pos > 0) { fprintf(file, "%i", pos); } else { fprintf(file, "%i", 0); } fprintf(file, "%c", '\t'); pos = IPrinter::calc_pos(SV->get_coordinates().start.max_pos, ref, chr) + Parameter::Instance()->max_dist; fprintf(file, "%i", pos); fprintf(file, "%c", '\n'); pos = IPrinter::calc_pos(SV->get_coordinates().stop.min_pos, ref, chr) - Parameter::Instance()->max_dist; fprintf(file, "%s", chr.c_str()); fprintf(file, "%c", '\t'); if (pos > 0) { fprintf(file, "%i", pos); } else { fprintf(file, "%i", 0); } fprintf(file, "%c", '\t'); pos = IPrinter::calc_pos(SV->get_coordinates().stop.max_pos, ref, chr) - Parameter::Instance()->max_dist; fprintf(file, "%i", pos); fprintf(file, "%c", '\n'); } else { //smaller SV: int pos = IPrinter::calc_pos(SV->get_coordinates().start.min_pos, ref, chr) - Parameter::Instance()->max_dist; fprintf(file, "%s", chr.c_str()); fprintf(file, "%c", '\t'); if (pos > 0) { fprintf(file, "%i", pos); } else { fprintf(file, "%i", 0); } fprintf(file, "%c", '\t'); pos = IPrinter::calc_pos(SV->get_coordinates().stop.max_pos, ref, chr) + Parameter::Instance()->max_dist; fprintf(file, "%i", pos); fprintf(file, "%c", '\n'); } } } Sniffles-1.0.7/src/print/NGMPrinter.h000066400000000000000000000005571320237057600173650ustar00rootroot00000000000000/* * NGMPrinter.h * * Created on: Sep 23, 2015 * Author: fsedlaze */ #ifndef PRINT_NGMPRINTER_H_ #define PRINT_NGMPRINTER_H_ #include "IPrinter.h" class NGMPrinter:public IPrinter{ private: void print_header(); void print_body(Breakpoint * &SV, RefVector ref); public: NGMPrinter(){ } ~NGMPrinter(){ } }; #endif /* PRINT_NGMPRINTER_H_ */ Sniffles-1.0.7/src/print/VCFPrinter.cpp000066400000000000000000000255411320237057600177150ustar00rootroot00000000000000/* * VCFPrinter.cpp * * Created on: Aug 24, 2015 * Author: fsedlaze */ #include "VCFPrinter.h" 
void VCFPrinter::print_header() { fprintf(file, "%s", "##fileformat=VCFv4.2\n"); fprintf(file, "%s", "##source=Sniffles\n"); string time = currentDateTime(); fprintf(file, "%s", "##fileDate="); fprintf(file, "%s", time.c_str()); //REport over all chrs: for (size_t i = 0; i < this->ref.size(); i++) { fprintf(file, "%s", "\n"); fprintf(file, "%s", "##contig='); } fprintf(file, "%s", "\n"); fprintf(file, "%s", "##ALT=\n"); fprintf(file, "%s", "##ALT=\n"); fprintf(file, "%s", "##ALT=\n"); fprintf(file, "%s", "##ALT=\n"); fprintf(file, "%s", "##ALT=\n"); fprintf(file, "%s", "##ALT=\n"); fprintf(file, "%s", "##INFO=\n"); fprintf(file, "%s", "##INFO=\n"); fprintf(file, "%s", "##INFO=\n"); fprintf(file, "%s", "##INFO=\n"); fprintf(file, "%s", "##INFO=\n"); fprintf(file, "%s", "##INFO=\n"); fprintf(file, "%s", "##INFO=\n"); fprintf(file, "%s", "##INFO=\n"); fprintf(file, "%s", "##INFO=\n"); fprintf(file, "%s", "##INFO=\n"); fprintf(file, "%s", "##INFO=\n"); fprintf(file, "%s", "##INFO=\n"); fprintf(file, "%s", "##INFO=\n"); fprintf(file, "%s", "##INFO=\n"); fprintf(file, "%s", "##INFO=\n"); fprintf(file, "%s", "##INFO=\n"); fprintf(file, "%s", "##INFO=\n"); fprintf(file, "%s", "##INFO=\n"); fprintf(file, "%s", "##FORMAT=\n"); fprintf(file, "%s", "##FORMAT=\n"); fprintf(file, "%s", "##FORMAT=\n"); fprintf(file, "%s", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"); for (size_t i = 0; i < Parameter::Instance()->bam_files.size(); i++) { fprintf(file, "%c", '\t'); fprintf(file, "%s", Parameter::Instance()->bam_files[i].c_str()); } fprintf(file, "%c", '\n'); } void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) { if (!this->bed_tree.is_in(SV->get_coordinates().start.most_support, this->root) && !this->bed_tree.is_in(SV->get_coordinates().stop.most_support, this->root)) { //temp. store read names supporting this SVs to later group the SVs together. 
double std_quant_start = 0; double std_quant_stop = 0; pair kurtosis; pair std_quant; double std_length = 0; int zmws = 0; bool ok_to_print = (to_print(SV, std_quant, kurtosis, std_length, zmws) || Parameter::Instance()->ignore_std); //std::cout << "Print check: " << std_quant.first << " " << std_quant.second << endl; if (ok_to_print && (zmws == 0 || zmws >= Parameter::Instance()->min_zmw)) { if (Parameter::Instance()->phase) { store_readnames(SV->get_read_ids(), id); } std::string chr; int start = IPrinter::calc_pos(SV->get_coordinates().start.most_support, ref, chr); fprintf(file, "%s", chr.c_str()); fprintf(file, "%c", '\t'); fprintf(file, "%i", start); fprintf(file, "%c", '\t'); fprintf(file, "%i", id); id++; int end = IPrinter::calc_pos(SV->get_coordinates().stop.most_support, ref, chr); std::string strands = SV->get_strand(1); fprintf(file, "%s", "\tN\t"); if (Parameter::Instance()->reportBND && (SV->get_SVtype() & TRA)) { //N[22:36765684[ +- //]21:10540232]N -+ if (strands[0] == '-') { //&& fprintf(file, "%s", "]"); fprintf(file, "%s", chr.c_str()); fprintf(file, "%c", ':'); fprintf(file, "%i", end); fprintf(file, "%s", "]N"); } else { fprintf(file, "%s", "N["); fprintf(file, "%s", chr.c_str()); fprintf(file, "%c", ':'); fprintf(file, "%i", end); fprintf(file, "%c", '['); } } else { fprintf(file, "%c", '<'); fprintf(file, "%s", IPrinter::get_type(SV->get_SVtype()).c_str()); fprintf(file, "%c", '>'); } fprintf(file, "%s", "\t.\tPASS\t"); if (std_quant.first < 10 && std_quant.second < 10) { fprintf(file, "%s", "PRECISE"); } else { fprintf(file, "%s", "IMPRECISE"); } fprintf(file, "%s", ";SVMETHOD=Snifflesv"); fprintf(file, "%s", Parameter::Instance()->version.c_str()); if (!(Parameter::Instance()->reportBND && (SV->get_SVtype() & TRA))) { fprintf(file, "%s", ";CHR2="); fprintf(file, "%s", chr.c_str()); fprintf(file, "%s", ";END="); if (SV->get_SVtype() & INS) { fprintf(file, "%i", std::max((int) (end - SV->get_length()), start)); } else { fprintf(file, "%i", 
end); } } if (zmws != 0) { fprintf(file, "%s", ";ZMW="); fprintf(file, "%i", zmws); } fprintf(file, "%s", ";STD_quant_start="); fprintf(file, "%f", std_quant.first); fprintf(file, "%s", ";STD_quant_stop="); fprintf(file, "%f", std_quant.second); fprintf(file, "%s", ";Kurtosis_quant_start="); fprintf(file, "%f", kurtosis.first); fprintf(file, "%s", ";Kurtosis_quant_stop="); fprintf(file, "%f", kurtosis.second); fprintf(file, "%s", ";SVTYPE="); if (Parameter::Instance()->reportBND && (SV->get_SVtype() & TRA)) { fprintf(file, "%s", "BND"); } else { fprintf(file, "%s", IPrinter::get_type(SV->get_SVtype()).c_str()); } if (Parameter::Instance()->report_n_reads > 0 || Parameter::Instance()->report_n_reads == -1) { fprintf(file, "%s", ";RNAMES="); fprintf(file, "%s", SV->get_read_names().c_str()); } fprintf(file, "%s", ";SUPTYPE="); fprintf(file, "%s", SV->get_supporting_types().c_str()); fprintf(file, "%s", ";SVLEN="); if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && !SV->get_types().is_SR) { fprintf(file, "%s", "NA"); } else { fprintf(file, "%i", SV->get_length()); } // } fprintf(file, "%s", ";STRANDS="); fprintf(file, "%s", strands.c_str()); if (!SV->get_sequence().empty()) { fprintf(file, "%s", ";SEQ="); fprintf(file, "%s", SV->get_sequence().c_str()); } fprintf(file, "%s", ";RE="); fprintf(file, "%i", SV->get_support()); //if(Parameter::Instance()->genotype){ fprintf(file, "%s", "\tGT:DR:DV\t./.:.:"); fprintf(file, "%i", SV->get_support()); //}else{ // fprintf(file, "%s",this->assess_genotype(SV->get_refcount(),SV->get_support()).c_str()); //} fprintf(file, "%c", '\n'); } } } void VCFPrinter::print_body_recall(Breakpoint * &SV, RefVector ref) { if (Parameter::Instance()->phase) { store_readnames(SV->get_read_ids(), id); } std::string chr; int start = IPrinter::calc_pos(SV->get_coordinates().start.most_support, ref, chr); fprintf(file, "%s", chr.c_str()); fprintf(file, "%c", '\t'); fprintf(file, "%i", start); fprintf(file, "%c", 
'\t'); fprintf(file, "%i", id); id++; int end = IPrinter::calc_pos(SV->get_coordinates().stop.most_support, ref, chr); std::string strands = SV->get_strand(1); fprintf(file, "%s", "\tN\t"); if (Parameter::Instance()->reportBND && (SV->get_SVtype() & TRA)) { //N[22:36765684[ +- //]21:10540232]N -+ if (strands[0] == '-' && strands[0] == '+') { fprintf(file, "%s", "]"); fprintf(file, "%s", chr.c_str()); fprintf(file, "%c", ':'); fprintf(file, "%i", end); fprintf(file, "%s", "]N"); } else { fprintf(file, "%s", "N["); fprintf(file, "%s", chr.c_str()); fprintf(file, "%c", ':'); fprintf(file, "%i", end); fprintf(file, "%c", '['); } } else { fprintf(file, "%c", '<'); fprintf(file, "%s", IPrinter::get_type(SV->get_SVtype()).c_str()); fprintf(file, "%c", '>'); } fprintf(file, "%s", "\t.\tPASS\t"); fprintf(file, "%s", "IMPRECISE"); fprintf(file, "%s", ";SVMETHOD=Snifflesv"); fprintf(file, "%s", Parameter::Instance()->version.c_str()); if (!(Parameter::Instance()->reportBND && (SV->get_SVtype() & TRA))) { fprintf(file, "%s", ";CHR2="); fprintf(file, "%s", chr.c_str()); fprintf(file, "%s", ";END="); if (SV->get_SVtype() & INS) { fprintf(file, "%i", std::max((int) (end - SV->get_length()), start)); } else { fprintf(file, "%i", end); } } fprintf(file, "%s", ";SVTYPE="); if (Parameter::Instance()->reportBND && (SV->get_SVtype() & TRA)) { fprintf(file, "%s", "BND"); } else { fprintf(file, "%s", IPrinter::get_type(SV->get_SVtype()).c_str()); } if (Parameter::Instance()->report_n_reads > 0 || Parameter::Instance()->report_n_reads == -1) { fprintf(file, "%s", ";RNAMES="); fprintf(file, "%s", SV->get_read_names().c_str()); } fprintf(file, "%s", ";SUPTYPE="); fprintf(file, "%s", SV->get_supporting_types().c_str()); fprintf(file, "%s", ";SVLEN="); if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && !SV->get_types().is_SR) { fprintf(file, "%s", "NA"); } else { fprintf(file, "%i", SV->get_length()); } // } fprintf(file, "%s", ";STRANDS="); fprintf(file, 
"%s", strands.c_str()); fprintf(file, "%s", ";SEQ="); fprintf(file, "%s", SV->get_sequence().c_str()); fprintf(file, "%s", ";RE="); fprintf(file, "%i", SV->get_support()); fprintf(file, "%s", "\tGT:DR:DV\t./.:.:"); fprintf(file, "%i", SV->get_support()); fprintf(file, "%c", '\n'); } Sniffles-1.0.7/src/print/VCFPrinter.h000066400000000000000000000006501320237057600173540ustar00rootroot00000000000000/* * VCFPrinter.h * * Created on: Aug 24, 2015 * Author: fsedlaze */ #ifndef PRINT_VCFPRINTER_H_ #define PRINT_VCFPRINTER_H_ #include "IPrinter.h" class VCFPrinter:public IPrinter{ private: void print_header(); void print_body(Breakpoint * &SV, RefVector ref); void print_body_recall(Breakpoint * &SV, RefVector ref); public: VCFPrinter(){ } ~VCFPrinter(){ } }; #endif /* PRINT_VCFPRINTER_H_ */ Sniffles-1.0.7/src/realign/000077500000000000000000000000001320237057600155055ustar00rootroot00000000000000Sniffles-1.0.7/src/realign/IAlignment.h000066400000000000000000000042201320237057600177030ustar00rootroot00000000000000#ifndef __IALIGNMENT_H__ #define __IALIGNMENT_H__ struct Align { Align() : pBuffer1(0), pBuffer2(0), ExtendedData(0), PositionOffset(0), QStart( 0), QEnd(0), Score(0.0f), Identity(0.0f), NM(0) { } char * pBuffer1; // = pCigar = pRef char * pBuffer2; // = pMD = pQry void * ExtendedData; int PositionOffset; // Position in Ref, an der das Alignment beginnt int QStart; // Anzahl Basen, die beim Qry am Anfang abgeschnitten wurden int QEnd; // Anzahl Basen, die beim Qry am Ende abgeschnitten wurden float Score; float Identity; int NM; }; static int const cCookie = 0x10201130; /* Anmerkung zum Parameter mode: int AlignmentType = mode & 0xFF; // 0..Smith-Waterman, 1..Needleman-Wunsch int ReportType = (mode >> 8) & 0xFF; // 0..Plain alignment (Ref+Qry), 1..SAM (Cigar+MD) bool BSMappingActive = mode & 0x10000; Anmerkung BS-Mapping: extData zeigt bei BSMappingActive == true auf ein Flag-Array (char*) der L�nge batchSize, wobei bei 0 die TC-Match-Funktion, bei 1 die 
AG-Match-Funktion verwendet werden soll: if (extData[i] == 0) -> TC-Matching f�r ref/qry-Paar i if (extData[i] == 1) -> AG-Matching - "" - */ class IAlignment { public: virtual int GetScoreBatchSize() const = 0; virtual int GetAlignBatchSize() const = 0; virtual int BatchScore(int const mode, int const batchSize, char const * const * const refSeqList, char const * const * const qrySeqList, float * const results, void * extData) = 0; virtual int SingleAlign(int const mode, int const corridor, char const * const refSeq, char const * const qrySeq, Align & result, void * extData) { return 0; } virtual int SingleScore(int const mode, int const corridor, char const * const refSeq, char const * const qrySeq, float & result, void * extData) { return 0; } virtual int BatchAlign(int const mode, int const batchSize, char const * const * const refSeqList, char const * const * const qrySeqList, Align * const results, void * extData) = 0; virtual ~IAlignment() {} }; typedef IAlignment * (*pfCreateAlignment)(int const gpu_id); typedef void (*pfDeleteAlignment)(IAlignment*); #endif Sniffles-1.0.7/src/realign/Realign.cpp000066400000000000000000000107721320237057600176010ustar00rootroot00000000000000/* * Realign.cpp * * Created on: Aug 24, 2015 * Author: fsedlaze */ #include "Realign.h" void Realigner::init() { //run through ref sequence and store a file * at the begining of each chr; myfile.open(Parameter::Instance()->ref_seq.c_str(), ifstream::in); if (!myfile.good()) { cout << "Fastq Parser: could not open file: " << Parameter::Instance()->ref_seq.c_str() << endl; exit(0); } buffer_size = 20000; buffer = new char[buffer_size]; myfile.getline(buffer, buffer_size); long len = 0; while (!myfile.eof()) { if (buffer[0] == '>') { ref_str tmp; tmp.length = len; tmp.file_pos = myfile.tellg(); meta_info.push_back(tmp); } else { for (size_t i = 0; i < buffer_size && buffer[i] != '\0'; i++) { len++; } } myfile.getline(buffer, buffer_size); } myfile.close(); } std::string 
Realigner::read_new_part(long start, long stop) { long pos = start; int i = 0; for (; i < (int) meta_info.size() && start - meta_info[i].length > 0; i++) { } i--; //one step back start -= meta_info[i].length; stop -= meta_info[i].length; myfile.open(Parameter::Instance()->ref_seq.c_str(), ifstream::in); myfile.seekg(meta_info[i].file_pos); myfile.getline(buffer, buffer_size); string seq; pos = 0; while (!myfile.eof() && buffer[0] != '>') { for (size_t i = 0; i < buffer_size && buffer[i] != '\0' && buffer[i] != '\n' && buffer[0] != '>'; i++) { if (pos >= start && pos <= stop) { seq += toupper(buffer[i]); } pos++; } myfile.getline(buffer, buffer_size); } if (buffer[0] != '>') { for (size_t i = 0; i < buffer_size && buffer[i] != '\0' && buffer[i] != '\n' && buffer[0] != '>'; i++) { if (pos >= start && pos <= stop) { seq += toupper(buffer[i]); } pos++; } } myfile.close(); return seq; } std::string Realigner::read_chr(short id) { myfile.open(Parameter::Instance()->ref_seq.c_str(), ifstream::in); myfile.seekg(meta_info[id].file_pos); myfile.getline(buffer, buffer_size); string seq; while (!myfile.eof() && buffer[0] != '>') { for (size_t i = 0; i < buffer_size && buffer[i] != '\0' && buffer[i] != '\n'; i++) { seq += toupper(buffer[i]); } myfile.getline(buffer, buffer_size); } myfile.close(); return seq; } void get_coords_DEL(Breakpoint *& sv) { //not much todo unless size is large -> tra??? region_ref_str tmp; if(sv->get_length()<5000){ //take normal ref region; //tmp.start=sv->get_coordinates().start; }else{ //chop: } } void get_coords_DUP(Breakpoint *& sv) { //duplicate the dup region next to each other?? -> define 2 overlapping regions } void get_coords_TRA(Breakpoint *& sv) { //define 2 regions on the chr } void get_coords_INS(Breakpoint *& sv) { //nothing todo } void get_coords_INV(Breakpoint *& sv) { // Estimate 2 breakpoints and set directions! 
} void Realigner::align(std::vector sv) { long len = 0; std::string seq; //1 collect regions from ref for (size_t i = 0; i < sv.size(); i++) {//parallel //check if split read-> concatenate sequences; //else just a standard alignment. //split to ease job: if (sv[i]->get_SVtype() & DEL) { get_coords_DEL(sv[i]); } else if (sv[i]->get_SVtype() & DUP) { get_coords_DUP(sv[i]); } else if (sv[i]->get_SVtype() & TRA) { get_coords_TRA(sv[i]); } else if (sv[i]->get_SVtype() & INV) { get_coords_INV(sv[i]); } else if (sv[i]->get_SVtype() & INS) { get_coords_INS(sv[i]); } } //2: collect regions from ref: //for each chr run through all SV?? for (size_t i = 0; i < this->meta_info.size(); i++) { long curr_length=meta_info[i].length; std::string ref = this->read_chr(i); long next_length=curr_length+ref.size(); for(size_t j = 0; j < sv.size(); j++) { //parallel /*for(size_t k=0;kget_ref_coord().size();k++){ if((sv[j]->get_ref_coord()[k].start-curr_length) >0 && (sv[j]->get_ref_coord()[k].stop-next_length)<0){ //extract region: int start=sv[j]->get_ref_coord()[k].start-curr_length; int stop=sv[j]->get_ref_coord()[k].stop-curr_length; sv[j]->set_ref_seq(k,ref.substr(start,stop-start)); } }*/ } } for (size_t i = 0; i < sv.size(); i++) {//parallel //2 send SV+ regions to alignment using OpenMP IAlignment * aligner = new SWCPUCor(0); Align align; align.pBuffer1 = new char[400]; align.pBuffer2 = new char[400]; char * refSeq = new char[400]; char * readSeq = new char[400]; int mode = 0; int cigarLength = aligner->SingleAlign(mode, Parameter::Instance()->corridor, refSeq, readSeq, align, 0); //3 Backtrack information //4 filter out SV if necessary } } Sniffles-1.0.7/src/realign/Realign.h000066400000000000000000000012461320237057600172420ustar00rootroot00000000000000/* * Realign.h * * Created on: Aug 24, 2015 * Author: fsedlaze */ #ifndef REALIGN_REALIGN_H_ #define REALIGN_REALIGN_H_ #include "../sub/Breakpoint.h" #include #include #include #include "IAlignment.h" #include "SWCPU.h" struct 
ref_str { long length; streampos file_pos; }; class Realigner { private: std::string read_new_part(long start, long stop); vector meta_info; size_t buffer_size; char*buffer; ifstream myfile; void init(); std::string read_chr(short id); public: Realigner() { init(); } ~Realigner() { delete[] buffer; } void align(std::vector SV); }; #endif /* REALIGN_REALIGN_H_ */ Sniffles-1.0.7/src/realign/SWCPU.cpp000066400000000000000000000347011320237057600171170ustar00rootroot00000000000000/* * SWCPU.cpp * * Created on: Jun 15, 2011 * Author: fritz */ #include "SWCPU.h" //TODO: hack to pass data for debug output Align cur_align; SWCPUCor::SWCPUCor(int gpu_id) { // cigar = bool(((gpu_id >> 8) & 0xFF) == 1); batch_size = 1; mat = 2.0f; mis = -5.0f; gap_open_read = -5.0f; gap_open_ref = -5.0f; gap_ext = -5.0f; gap_decay = 0.05f; gap_ext_min = -1.0f; long maxLen = (long) 100000 * (long) 20000; alignMatrix = new MatrixElement[maxLen]; fprintf(stderr, "Allocationg: %llu\n", maxLen * sizeof(MatrixElement)); binaryCigar = new int[200000]; // short temp[6][6] = { mat, mis, mis, mis, 0, mis, mis, mat, mis, mis, 0, mis, // mis, mis, mat, mis, 0, mis, mis, mis, mis, mat, 0, mis, 0, 0, 0, 0, // 0, 0, 0, 0, 0, 0, 0, mat }; // memcpy(scores, temp, 6 * 6 * sizeof(short)); fprintf(stderr, "SWCPU initialized\n"); } SWCPUCor::~SWCPUCor() { delete[] alignMatrix; alignMatrix = 0; delete[] binaryCigar; binaryCigar = 0; } Score SWCPUCor::SW_Score(char const * const refSeqList, char const * const qrySeqList, int * fwResults, int corr_length, MatrixElement * mat_pointer) { // memset(local_mat_line, 0, corr_length * sizeof(short)); char const * scaff = refSeqList; char const * read = qrySeqList; //Init matrix lines MatrixElement * matrix = mat_pointer; //Init matrix lines for (int i = 0; i < corr_length; ++i) { //local_mat_line[i] = 0; matrix[i].direction = CIGAR_STOP; matrix[i].indelRun = 0; matrix[i].score = 0; } matrix[corr_length].direction = CIGAR_STOP; matrix[corr_length].indelRun = 0; 
matrix[corr_length].score = 0; Score curr_max = -1.0f; int read_index = 0; int x = 0; for (; *read != line_end; ++read) { char read_char_cache = *read; matrix += (corr_length + 1); // short left_cell = 0; matrix[0].direction = CIGAR_STOP; matrix[0].indelRun = 0; matrix[0].score = 0; for (int ref_index = 0; ref_index < corr_length - 1; ++ref_index) { MatrixElement & diag = matrix[-(corr_length + 1) + ref_index + 1]; MatrixElement & up = matrix[-(corr_length + 1) + ref_index + 2]; MatrixElement & left = matrix[ref_index]; bool eq = read_char_cache == scaff[ref_index]; Score diag_cell = diag.score + ((eq) ? mat : mis); Score up_cell = 0; Score left_cell = 0; int ins_run = 0; int del_run = 0; if (up.direction == CIGAR_I) { ins_run = up.indelRun; if (up.score == 0) { up_cell = 0; } else { up_cell = up.score + std::min(gap_ext_min, gap_ext + ins_run * gap_decay); } } else { up_cell = up.score + gap_open_read; } if (left.direction == CIGAR_D) { del_run = left.indelRun; if (left.score == 0) { left_cell = 0; } else { left_cell = left.score + std::min(gap_ext_min, gap_ext + del_run * gap_decay); } } else { left_cell = left.score + gap_open_ref; } //find max Score max_cell = 0; max_cell = max(left_cell, max_cell); max_cell = max(diag_cell, max_cell); max_cell = max(up_cell, max_cell); MatrixElement & current = matrix[(ref_index + 1)]; if (del_run > 0 && max_cell == left_cell) { current.score = max_cell; current.direction = CIGAR_D; current.indelRun = del_run + 1; } else if (ins_run > 0 && max_cell == up_cell) { current.score = max_cell; current.direction = CIGAR_I; current.indelRun = ins_run + 1; } else if (max_cell == diag_cell) { current.score = max_cell; if (eq) { current.direction = CIGAR_EQ; } else { current.direction = CIGAR_X; } current.indelRun = 0; } else if (max_cell == left_cell) { current.score = max_cell; current.direction = CIGAR_D; current.indelRun = 1; } else if (max_cell == up_cell) { current.score = max_cell; current.direction = CIGAR_I; current.indelRun = 
1; } else { current.score = 0; current.direction = CIGAR_STOP; current.indelRun = 0; } if (max_cell > curr_max) { curr_max = max_cell; fwResults[param_best_ref_index] = ref_index; fwResults[param_best_read_index] = read_index; fwResults[3] = curr_max; } } matrix[corr_length].direction = CIGAR_STOP; matrix[corr_length].score = 0; matrix[corr_length].indelRun = 0; scaff++; read_index += 1; } fwResults[2] = (read_index - fwResults[0]) - 1; if (read_index == 0) { fwResults[0] = fwResults[1] = 2; } return curr_max; } int SWCPUCor::printCigarElement(char const op, int const length, char * cigar) { int offset = 0; offset = sprintf(cigar, "%d%c", length, op); return offset; } int SWCPUCor::computeCigarMD(Align & result, int const gpuCigarOffset, int const * const gpuCigar, char const * const refSeq, int corr_length, int read_length, int const QStart, int const QEnd) { int alignment_length = corr_length + read_length + 1; int finalCigarLength = 0; int cigar_offset = 0; // int md_offset = 0; if (((gpuCigar[gpuCigarOffset] >> 4) + QStart) > 0) { fprintf(stderr, "Adding %d to QSTart\n", QStart); result.QStart = (gpuCigar[gpuCigarOffset] >> 4) + QStart; cigar_offset += printCigarElement('S', result.QStart, result.pRef + cigar_offset); finalCigarLength += result.QStart; } int cigar_m_length = 0; // int md_eq_length = 0; int ref_index = 0; for (int j = gpuCigarOffset + 1; j < (alignment_length - 1); ++j) { int op = gpuCigar[j] & 15; int length = gpuCigar[j] >> 4; //debugCigar(op, length); switch (op) { case CIGAR_X: cigar_m_length += length; //Produces: [0-9]+(([A-Z]+|\^[A-Z]+)[0-9]+)* //instead of: [0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)* // md_offset += sprintf(result.pQry + md_offset, "%d", md_eq_length); // for (int k = 0; k < length; ++k) { // md_offset += sprintf(result.pQry + md_offset, "%c", // refSeq[ref_index++]); // } // md_eq_length = 0; break; case CIGAR_EQ: cigar_m_length += length; // md_eq_length += length; ref_index += length; break; case CIGAR_D: if (cigar_m_length > 0) 
{ cigar_offset += printCigarElement('M', cigar_m_length, result.pRef + cigar_offset); finalCigarLength += cigar_m_length; cigar_m_length = 0; } cigar_offset += printCigarElement('D', length, result.pRef + cigar_offset); // md_offset += sprintf(result.pQry + md_offset, "%d", md_eq_length); // md_eq_length = 0; // result.pQry[md_offset++] = '^'; // for (int k = 0; k < length; ++k) { // result.pQry[md_offset++] = refSeq[ref_index++]; // } break; case CIGAR_I: if (cigar_m_length > 0) { cigar_offset += printCigarElement('M', cigar_m_length, result.pRef + cigar_offset); finalCigarLength += cigar_m_length; cigar_m_length = 0; } cigar_offset += printCigarElement('I', length, result.pRef + cigar_offset); finalCigarLength += length; break; default: fprintf(stderr, "Invalid cigar string: %d\n", op); std::cout << "Offset: " << gpuCigarOffset << std::endl; for (int x = 0; x < alignment_length * 2; ++x) { std::cout << gpuCigar[x] << " "; } std::cout << std::endl; exit(1); } } // md_offset += sprintf(result.pQry + md_offset, "%d", md_eq_length); if (cigar_m_length > 0) { cigar_offset += printCigarElement('M', cigar_m_length, result.pRef + cigar_offset); finalCigarLength += cigar_m_length; cigar_m_length = 0; } if (((gpuCigar[alignment_length - 1] >> 4) + QEnd) > 0) { fprintf(stderr, "Adding %d to QEnd\n", QEnd); result.QEnd = (gpuCigar[alignment_length - 1] >> 4) + QEnd; cigar_offset += printCigarElement('S', result.QEnd, result.pRef + cigar_offset); finalCigarLength += result.QEnd; } //TODO: fix result.Identity = 1.0f; result.pRef[cigar_offset] = '\0'; // result.pQry[md_offset] = '\0'; return finalCigarLength; } bool SWCPUCor::Backtracking_CIGAR(char const * const scaff, char const * const read, int *& fwdResults, int *& alignments, int corr_length, int read_length, int alignment_length, MatrixElement * mat_pointer) { bool valid = true; MatrixElement * matrix = mat_pointer; int best_read_index = fwdResults[param_best_read_index]; int best_ref_index = 
fwdResults[param_best_ref_index]; int cigarLenth = 0; int cigarLengthCheck = 0; int totalDelLength = 0; int totalINsLength = 0; int minCorridor = corr_length * 0.01f; int maxCorridor = corr_length - minCorridor; if (best_read_index > 0) { matrix += (((corr_length + 1) * (best_read_index + 1))); int abs_ref_index = best_ref_index + best_read_index; int alignment_index = alignment_length - 1; int pointer = CIGAR_STOP; int cigar_element = CIGAR_S; int cigar_length = fwdResults[qend]; cigarLenth += fwdResults[qend]; while ((pointer = matrix[(best_ref_index + 1)].direction) != CIGAR_STOP) { // Log.Message("Best ref index: %d (%d)", best_ref_index + 1, (corr_length + 1)); // printf("%s\t%d\t%d\t%d\t%d\n", (char *) cur_align.ExtendedData, // cur_align.NM, best_read_index, best_ref_index + 1, // corr_length + 1); if (best_ref_index <= minCorridor || best_ref_index >= maxCorridor) { fprintf(stderr, "Corridor probably too small\n"); valid = false; // getchar(); } if (pointer == CIGAR_X || pointer == CIGAR_EQ) { matrix -= ((corr_length + 1)); best_read_index -= 1; abs_ref_index -= 1; cigarLenth += 1; } else if (pointer == CIGAR_I) { matrix -= ((corr_length + 1)); best_read_index -= 1; best_ref_index += 1; cigarLenth += 1; } else if (pointer == CIGAR_D) { best_ref_index -= 1; abs_ref_index -= 1; } else { fprintf(stderr, "Error in backtracking. 
Invalid CIGAR operation found\n"); exit(1); } if (pointer == cigar_element) { cigar_length += 1; } else { alignments[alignment_index--] = (cigar_length << 4 | cigar_element); if (cigar_element != CIGAR_D) { cigarLengthCheck += cigar_length; } cigar_element = pointer; cigar_length = 1; } } alignments[alignment_index--] = (cigar_length << 4 | cigar_element); if (cigar_element != CIGAR_D) { cigarLengthCheck += cigar_length; } alignments[alignment_index] = ((best_read_index + 1) << 4 | CIGAR_S); cigarLengthCheck += (best_read_index + 1); cigarLenth += (best_read_index + 1); fwdResults[ref_position] = abs_ref_index + 1; fwdResults[qstart] = best_read_index + 1; //qend was set by "forward" kernel fwdResults[alignment_offset] = alignment_index; if (cigarLenth != cigarLengthCheck) { fprintf(stderr, "Error in CIGAR length: %d vs %d\n", cigarLenth, cigarLengthCheck); } else { if (read_length != cigarLenth) { fprintf(stderr, "Error read length != cigar length: %d vs %d\n", read_length, cigarLenth); exit(1); } } fprintf(stderr, "Read length: %d, CIGAR length: %d\n", read_length, cigarLenth); } return valid; } int SWCPUCor::GetScoreBatchSize() const { return 0; } int SWCPUCor::GetAlignBatchSize() const { return 0; } int SWCPUCor::BatchAlign(int const mode, int const batchSize, char const * const * const refSeqList, char const * const * const qrySeqList, Align * const results, void * extData) { throw "Not implemented"; fprintf(stderr, "Unsupported alignment mode %i\n", mode); return 0; } void SWCPUCor::print_matrix(int alignment_length, const char* const refSeq, int read_length, const char* const qrySeq, int corr_length, MatrixElement* mat_pointer) { printf(" - "); for (int x = 0; x < alignment_length - 1; ++x) { printf(" %c ", refSeq[x]); } printf("\n"); for (size_t row = 0; row < read_length + 1; ++row) { if (row == 0) { printf("-: "); } else { printf("%c: ", qrySeq[row - 1]); } for (int x = 0; x < row; ++x) { printf(" "); } for (size_t col = 0; col < corr_length + 1; ++col) { 
MatrixElement* cell = mat_pointer + (row * (corr_length + 1) + col); printf("%*d ", 2, cell->indelRun); } printf("\n"); } printf(" - "); for (int x = 0; x < alignment_length - 1; ++x) { printf(" %c ", refSeq[x]); } printf("\n"); for (size_t row = 0; row < read_length + 1; ++row) { if (row == 0) { printf("-: "); } else { printf("%c: ", qrySeq[row - 1]); } for (int x = 0; x < row; ++x) { printf(" "); } for (size_t col = 0; col < corr_length + 1; ++col) { MatrixElement* cell = mat_pointer + (row * (corr_length + 1) + col); printf("%*d ", 2, cell->direction); } printf("\n"); } printf(" - "); for (int x = 0; x < alignment_length - 1; ++x) { printf(" %c ", refSeq[x]); } printf("\n"); for (size_t row = 0; row < read_length + 1; ++row) { if (row == 0) { printf("-: "); } else { printf("%c: ", qrySeq[row - 1]); } for (int x = 0; x < row; ++x) { printf(" "); } for (size_t col = 0; col < corr_length + 1; ++col) { MatrixElement* cell = mat_pointer + (row * (corr_length + 1) + col); printf("%*.*f ", 2, 0, cell->score); } printf("\n"); } } int SWCPUCor::SingleAlign(int const mode, int const corridor, char const * const refSeq, char const * const qrySeq, Align & align, void * extData) { // Log.Message("Aligning: "); // Log.Message("%s", refSeq); // Log.Message("%s", qrySeq); cur_align = align; int * clipping = 0; if (extData == 0) { clipping = new int[2]; clipping[0] = 0; clipping[1] = 0; } else { clipping = (int *) extData; } int read_length = strlen(qrySeq); fprintf(stderr, "Read length (single align) is %d\n", read_length); align.pBuffer1 = new char[read_length * 4]; // align.pBuffer2 = new char[read_length * 4]; align.pBuffer2 = new char[1]; align.pBuffer2[0] = '\0'; int finalCigarLength = 0; int corr_length = corridor; int alignment_length = (corr_length + read_length + 1); int * fwdResults = new int[result_number]; Score score = SW_Score(refSeq, qrySeq, fwdResults, corr_length, alignMatrix); // print_matrix(alignment_length, refSeq, read_length, qrySeq, corr_length, // 
alignMatrix); // Log.Message("%d, %d, %d, %d", fwdResults[0], fwdResults[1], fwdResults[2], fwdResults[3]); bool valid = Backtracking_CIGAR(refSeq, qrySeq, fwdResults, binaryCigar, corr_length, read_length, alignment_length, alignMatrix); if (valid) { finalCigarLength = computeCigarMD(align, fwdResults[3], binaryCigar, refSeq + fwdResults[0], corr_length, read_length, clipping[0], clipping[1]); align.PositionOffset = fwdResults[0]; align.Score = score; } delete[] fwdResults; if (extData == 0) { delete[] clipping; clipping = 0; } if (!valid) { finalCigarLength = -1; } return finalCigarLength; } int SWCPUCor::BatchScore(int const mode, int const batchSize, char const * const * const refSeqList, char const * const * const qrySeqList, float * const results, void * extData) { throw "Not implemented"; return 0; } Sniffles-1.0.7/src/realign/SWCPU.h000066400000000000000000000065311320237057600165640ustar00rootroot00000000000000/* * SWCPU.h * * Created on: Jun 15, 2011 * Author: fritz */ #ifndef SWCPU_H_ #define SWCPU_H_ #define pRef pBuffer1 #define pQry pBuffer2 #include "IAlignment.h" #include #include #include #include using std::endl; using std::cout; using std::max; #define CIGAR_STOP 10 #define short_min -16000 #define result_number 4 #define line_end '\0' #define ref_position 0 #define qstart 1 #define qend 2 #define alignment_offset 3 #define param_best_read_index 0 #define param_best_ref_index 1 #define CIGAR_M 0 #define CIGAR_I 1 #define CIGAR_D 2 #define CIGAR_N 3 #define CIGAR_S 4 #define CIGAR_H 5 #define CIGAR_P 6 #define CIGAR_EQ 7 #define CIGAR_X 8 typedef float Score; struct MatrixElement { Score score; int indelRun; char direction; }; const char trans[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 
4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }; class SWCPUCor: public IAlignment { public: SWCPUCor(int gpu_id); virtual ~SWCPUCor(); virtual int GetScoreBatchSize() const; virtual int GetAlignBatchSize() const; virtual int BatchScore(int const mode, int const batchSize, char const * const * const refSeqList, char const * const * const qrySeqList, float * const results, void * extData); virtual int BatchAlign(int const mode, int const batchSize, char const * const * const refSeqList, char const * const * const qrySeqList, Align * const results, void * extData); virtual int SingleAlign(int const mode, int const corridor, char const * const refSeq, char const * const qrySeq, Align & result, void * extData); private: //bool cigar; //short scores[6][6]; Score mat; Score mis; Score gap_open_read; Score gap_open_ref; Score gap_ext; Score gap_ext_min; Score gap_decay; MatrixElement * alignMatrix; int * binaryCigar; //meta info unsigned int batch_size; //effictive thread number that is started per call int printCigarElement(char const op, int const length, char * cigar); int computeCigarMD(Align & result, int const gpuCigarOffset, int const * const gpuCigar, char const * const refSeq, int corr_length, int read_length, int const QStart, int const QEnd); Score SW_Score(char const * const scaff, char const * const read, int * result, int corr_length, MatrixElement * mat_pointer); bool Backtracking_CIGAR(char const * const scaff, char const * const read, int *& result, int *& alignments, int corr_length, int read_length, int alignment_length, MatrixElement * 
mat_pointer); void print_matrix(int alignment_length, const char* const refSeq, int read_length, const char* const qrySeq, int corr_length, MatrixElement* mat_pointer); }; #endif /* SWCPU_H_ */ Sniffles-1.0.7/src/sub/000077500000000000000000000000001320237057600146555ustar00rootroot00000000000000Sniffles-1.0.7/src/sub/Breakpoint.cpp000066400000000000000000000452401320237057600174640ustar00rootroot00000000000000/* * Breakpoint.cpp * * Created on: Sep 1, 2015 * Author: fsedlaze */ /* * Breakpoint.h * * Created on: Jun 23, 2015 * Author: fsedlaze */ #include "../print/IPrinter.h" #include "Breakpoint.h" ///////////////////////////////// MERGING//////////////////////////////////////////// std::string print_type(char SV1) { std::string type = ""; if ((SV1 & INS)) { type += "INS"; } if ((SV1 & TRA)) { type += "TRA"; } if ((SV1 & INV)) { type += "INV"; } if ((SV1 & DEL)) { type += "DEL"; } if ((SV1 & DUP)) { type += "DUP"; } if ((SV1 & NEST)) { type += "INVDUP"; } if ((SV1 & NEST)) { type += "INS_open"; } return type; } bool Breakpoint::check_SVtype(Breakpoint * break1, Breakpoint * break2) { //todo check that! char SV1 = (*break1->get_coordinates().support.begin()).second.SV; char SV2 = (*break2->get_coordinates().support.begin()).second.SV; //we have to check it that way,because we can have multiple types! if ((SV1 & INS) && (SV2 & INS)) { return true; } else if ((SV1 & INV) && (SV2 & INV)) { return true; } else if ((SV1 & TRA) && (SV2 & TRA)) { return true; } else if ((SV1 & DUP) && (SV2 & DUP)) { return true; } else if ((SV1 & DEL) && (SV2 & DEL)) { return true; } else if (((SV1 & NEST) && (SV2 & NEST)) || (((SV1 & NEST) && (SV2 & INV)) || ((SV1 & INV) && (SV2 & NEST)))) { return true; } else if (((SV1 & DUP) && (SV2 & INS)) || ((SV1 & INS) && (SV2 & DUP))) { //DUP and ins have often the same signal for alignments. 
return true; } //std::cout<<"S1: "<get_coordinates().support.begin()).second.type<get_coordinates().support.begin()).second.SV & TRA) { //only for tra since we get otherwise a problem with the cigar events // //std::string readname= (*tmp->get_coordinates().support.begin()).first; // return ((*tmp->get_coordinates().support.begin()).second.strand.first == (*this->get_coordinates().support.begin()).second.strand.first && (*tmp->get_coordinates().support.begin()).second.strand.second == (*this->get_coordinates().support.begin()).second.strand.second); //} return true; } return false; } int get_dist(Breakpoint * tmp) { position_str pos = tmp->get_coordinates(); //return Parameter::Instance()->max_dist; if ((*tmp->get_coordinates().support.begin()).second.SV & TRA) { return Parameter::Instance()->max_dist; //TODO: change! } long dist = (pos.stop.max_pos - pos.start.min_pos); if ((*pos.support.begin()).second.length != 1) { dist = (*pos.support.begin()).second.length; } /* if(dist<10){ std::cout<<"DIST <10 ! "<min_length * 2, dist); /*if(dist <10){//TODO dist=(long)Parameter::Instance()->max_dist; std::cout<<"LEN SMALLER! 
"<<(*pos.support.begin()).first<<" "<max_dist<<" "<<(pos.stop.max_pos - pos.start.min_pos)<< std::endl; }*/ return std::min((int) (dist * 4), Parameter::Instance()->max_dist); } long Breakpoint::overlap(Breakpoint * tmp) { bool flag = false; //flag = ((*tmp->get_coordinates().support.begin()).second.SV & DEL); int max_dist = std::min(get_dist(tmp), get_dist(this)); // Parameter::Instance()->max_dist if (flag) { std::cout << "\t Overlap: " << max_dist << " start: " << tmp->get_coordinates().start.min_pos << " " << positions.start.min_pos << " stop :" << tmp->get_coordinates().stop.max_pos << " " << positions.stop.max_pos; if ((*positions.support.begin()).second.SV & DEL) { std::cout << " Is DEL"; } else if ((*positions.support.begin()).second.SV & INS) { std::cout << " Is Ins "; } else if ((*positions.support.begin()).second.SV & DUP) { std::cout << " Is Dup "; } else if ((*positions.support.begin()).second.SV & INV) { std::cout << " Is Inv "; } std::cout << " Support: " << positions.support.size(); std::cout << std::endl; } //merging two robust calls: /*if (is_same_strand(tmp) && (abs(tmp->get_coordinates().start.min_pos - positions.start.min_pos) < max_dist || abs(tmp->get_coordinates().stop.max_pos - positions.stop.max_pos) < max_dist)) { if (tmp->get_coordinates().stop.max_pos - tmp->get_coordinates().start.min_pos == Parameter::Instance()->huge_ins || positions.stop.max_pos - positions.start.min_pos == Parameter::Instance()->huge_ins) { if (flag) { cout << "\tHIT" << endl; } return 0; } }*/ //Standard merging. 
if (is_same_strand(tmp) && (abs(tmp->get_coordinates().start.min_pos - positions.start.min_pos) < max_dist && abs(tmp->get_coordinates().stop.max_pos - positions.stop.max_pos) < max_dist)) { if (flag) { cout << "\tHIT" << endl; } return 0; } //merging "huge ins" and observed ins: if (is_same_strand(tmp) && (abs(tmp->get_coordinates().start.min_pos - tmp->get_coordinates().stop.max_pos) == Parameter::Instance()->huge_ins || abs(positions.start.min_pos - positions.stop.max_pos) == Parameter::Instance()->huge_ins) && (abs(tmp->get_coordinates().start.min_pos - positions.start.min_pos) < max_dist || abs(tmp->get_coordinates().stop.max_pos - positions.stop.max_pos) < max_dist)) { return 0; } //If there is an INVDUP: if ((is_NEST(tmp, this) && is_same_strand(tmp)) && (abs(tmp->get_coordinates().start.min_pos - positions.start.min_pos) < Parameter::Instance()->max_dist || abs(tmp->get_coordinates().start.min_pos - positions.stop.min_pos) < Parameter::Instance()->max_dist)) { return 0; } //extend Split read by noisy region: //not longer needed?? /* if (((tmp->get_types().is_Noise || this->get_types().is_Noise) && !(tmp->get_types().is_Noise && this->get_types().is_Noise)) && (abs(tmp->get_coordinates().start.min_pos - positions.stop.min_pos) < max_dist / 2 || abs(tmp->get_coordinates().stop.max_pos - positions.start.max_pos) < max_dist / 2)) { //TODO maybe add SV type check! if (flag) { cout << "\tHIT Noise" << endl; } return 0; }*/ //as abstraction lets try the start+stop coordinate! 
long diff = (tmp->get_coordinates().start.min_pos - positions.start.min_pos); //if (abs(diff) < max_dist) { //return (tmp->get_coordinates().stop.max_pos - positions.stop.max_pos); //} if (diff == 0) { return 1; } return diff; // + (tmp->get_coordinates().stop.max_pos - positions.stop.max_pos); } long Breakpoint::overlap_breakpoint(long start, long stop) { int max_dist = get_dist(this); // Parameter::Instance()->max_dist if ((start < positions.start.min_pos && positions.start.min_pos < stop) || (start < positions.stop.max_pos && positions.stop.max_pos < stop)) { return 0; } long diff = (start - positions.start.min_pos); //if (abs(diff) < max_dist) { //return (tmp->get_coordinates().stop.max_pos - positions.stop.max_pos); //} if (diff == 0) { return 1; } return diff; } void Breakpoint::add_read(Breakpoint * point) { //point = one read support! if (point != NULL) { //merge the support: std::map support = point->get_coordinates().support; this->set_refcount(max(point->get_refcount(), this->get_refcount())); //set ref count! for (std::map::iterator i = support.begin(); i != support.end(); i++) { this->positions.support[(*i).first] = (*i).second; } } } ///////////////////////////////// MERGING//////////////////////////////////////////// std::string Breakpoint::get_read_names() { std::string read_names; int num = Parameter::Instance()->report_n_reads; if (num == -1) { num = this->positions.support.size(); } for (std::map::iterator i = this->positions.support.begin(); num != 0 && i != this->positions.support.end(); i++) { read_names += ","; read_names += (*i).first; num--; } return read_names.substr(1); } std::vector Breakpoint::get_read_ids() { std: vector read_names; std::map support = this->positions.support; int num = 0; for (std::map::iterator i = support.begin(); i != support.end(); i++) { read_names.push_back((*i).second.id); num++; } return read_names; } //TODO define region object and inherit from that. Plus define avoid region objects for mappability problems. 
// NOTE(review): this span is an extraction-damaged copy of Breakpoint.cpp. The original line breaks were collapsed (so every "//" comment now swallows the rest of its physical line) and text between '<' and '>' appears to have been stripped: std::map / std::vector / pair have no template arguments, and parts of predict_SV() and get_strand() are missing. The code is left untouched below; restore it from the pristine file before building.
//
// translate_strand(strand_pair): maps the two booleans of the pair to "++", "+-", "-+" or "--" (true => '+'). The four branches cover every combination, so the final `return " "` is never reached.
// summarize_type(SV, array): increments array[0..5] once for each of the DEL, DUP, INS, INV, TRA, NEST bits that is set in SV (in that index order).
// get_SVtype(): if the NA bit is still set in sv_type, runs calc_support() and predict_SV() first, then returns sv_type.
// calc_support() (continues on the next physical line): tallies the SV bits of every supporting read with summarize_type() and stores eval_type()'s result in sv_type.
std::string Breakpoint::translate_strand(pair strand_pair) { if (strand_pair.first && strand_pair.second) { return "++"; } else if (strand_pair.first && !strand_pair.second) { return "+-"; } else if (!strand_pair.first && strand_pair.second) { return "-+"; } else if (!strand_pair.first && !strand_pair.second) { return "--"; } return " "; } void Breakpoint::summarize_type(char SV, ushort * array) { //std::string ss; if (SV & DEL) { // ss += "DEL; "; array[0]++; } if (SV & DUP) { // ss += "DUP; "; array[1]++; } if (SV & INS) { // ss += "INS; "; array[2]++; } if (SV & INV) { // ss += "INV; "; array[3]++; } if (SV & TRA) { // ss += "TRA; "; array[4]++; } if (SV & NEST) { // ss += "NEST; "; array[5]++; } //return ss; } char Breakpoint::get_SVtype() { if (sv_type & NA) { // std::cerr << "was not set" << std::endl; calc_support(); predict_SV(); } return this->sv_type; } void Breakpoint::calc_support() { ushort sv[6] = { 0, 0, 0, 0, 0, 0 }; //run over all supports and check the majority type: for (std::map::iterator i = positions.support.begin(); i != positions.support.end(); i++) { summarize_type((*i).second.SV, sv); } //given the majority type get the stats: this->sv_type = eval_type(sv); if (get_SVtype() & TRA) { // we cannot make assumptions abut support yet. 
// calc_support() tail: a TRA call is marked valid when get_support() > 1; any other type is only evaluated when get_support() >= min_support, in which case predict_SV() runs and validity is get_length() > min_length. If neither branch is taken, set_valid() is not called and the previous flag is kept.
// eval_type(SV): finds the largest of the six counters, stores it in type_support (minus one when Parameter::input_vcf is non-empty, to discount the pseudo element), and returns a bitmask with one bit for EVERY type whose counter equals that maximum, so ties yield several bits.
// NOTE(review): `int id` is unused. When all six counters are zero the maximum is 0 and all six type bits are returned - confirm this is intended.
// get_median(corrds): sorts its by-value copy and returns the middle element, or the integer mean of the two middle elements for an even count.
// NOTE(review): for an empty vector size() % 2 == 0, so corrds[size()/2 - 1] is evaluated with an underflowed index; predict_SV() calls this whenever the best histogram count is below its threshold, even if no coordinate was collected - guard needed.
// predict_SV() starts at the end of this physical line and runs through the next three.
set_valid((bool) (get_support() > 1)); // this is needed as we take each chr independently and just look at the primary alignment } else if (get_support() >= Parameter::Instance()->min_support) { predict_SV(); set_valid((bool) (get_length() > Parameter::Instance()->min_length)); } } char Breakpoint::eval_type(ushort* SV) { int maxim = 0; int id = 0; for (size_t i = 0; i < 6; i++) { if (maxim < SV[i]) { maxim = SV[i]; } } this->type_support = maxim; if (!Parameter::Instance()->input_vcf.empty()) { this->type_support--; // this is needed since we introduce a pseudo element } char max_SV = 0; if (maxim == SV[0]) { max_SV |= DEL; } if (maxim == SV[1]) { max_SV |= DUP; } if (maxim == SV[2]) { max_SV |= INS; } if (maxim == SV[3]) { max_SV |= INV; } if (maxim == SV[4]) { max_SV |= TRA; } if (maxim == SV[5]) { max_SV |= NEST; } return max_SV; } long get_median(std::vector corrds) { sort(corrds.begin(), corrds.end()); if (corrds.size() % 2 == 0) { return (corrds[corrds.size() / 2 - 1] + corrds[corrds.size() / 2]) / 2; } return corrds[corrds.size() / 2]; } void Breakpoint::predict_SV() { bool aln = false; bool split = false; bool noise = false; int num = 0; std::map starts; std::map stops; std::map lengths; //ins! 
// predict_SV(), per-read pass: only reads whose SV bits intersect sv_type and whose name does not begin with "input" are used. Start and stop coordinates other than -1 are appended to start2/stops2, and additionally counted in the starts/stops histograms unless the read's length equals Parameter::huge_ins.
std::map strands; std::vector start2; std::vector stops2; std::vector lengths2; for (std::map::iterator i = positions.support.begin(); i != positions.support.end(); i++) { if (((*i).second.SV & this->sv_type) && strncmp((*i).first.c_str(), "input", 5) != 0) { // && !((*i).second.SV & INS && (*i).second.length==Parameter::Instance()->huge_ins)) { ///check type if ((*i).second.coordinates.first != -1) { if ((*i).second.length != Parameter::Instance()->huge_ins) { if (starts.find((*i).second.coordinates.first) == starts.end()) { starts[(*i).second.coordinates.first] = 1; } else { starts[(*i).second.coordinates.first]++; } } start2.push_back((*i).second.coordinates.first); } if ((*i).second.coordinates.second != -1) { //TODO test if ((*i).second.length != Parameter::Instance()->huge_ins) { if (stops.find((*i).second.coordinates.second) == stops.end()) { stops[(*i).second.coordinates.second] = 1; } else { stops[(*i).second.coordinates.second]++; } } stops2.push_back((*i).second.coordinates.second); } if (((*i).second.SV & INS)) { //check lenght for ins only! 
// predict_SV(), continued: insertion lengths are collected the same way (lengths / lengths2). Strand strings are counted for every read except alignment-type (type == 0) reads flagged INV. The read type sets the aln (0), split (1) or noise (2) flag; any other value is reported on std::cerr.
// Consensus (only when num > 0): start.most_support / stop.most_support take the most frequent coordinate when it was seen at least 5 times, otherwise the median of all collected coordinates. length is stop - start for every type except INS; for INS it is the most frequent length when seen at least 3 times, otherwise the median length. indel_sequence is reset to "".
if ((*i).second.length != Parameter::Instance()->huge_ins) { if (lengths.find((*i).second.length) == lengths.end()) { lengths[(*i).second.length] = 1; } else { lengths[(*i).second.length]++; } } lengths2.push_back((*i).second.length); } if (!((*i).second.type == 0 && ((*i).second.SV & INV))) { std::string tmp = translate_strand((*i).second.strand); if (strands.find(tmp) == strands.end()) { strands[tmp] = 1; } else { strands[tmp]++; } } if ((*i).second.type == 0) { aln = true; } else if ((*i).second.type == 1) { split = true; } else if ((*i).second.type == 2) { noise = true; } else { std::cerr << "Type " << (*i).second.type << std::endl; } num++; } } long mean = 0; long counts = 0; int maxim = 0; long coord = 0; if (num > 0) { for (map::iterator i = starts.begin(); i != starts.end(); i++) { if ((*i).second > maxim) { coord = (*i).first; maxim = (*i).second; } } if (maxim < 5) { this->positions.start.most_support = get_median(start2); //check if "input"! } else { this->positions.start.most_support = coord; } this->indel_sequence = ""; maxim = 0; coord = 0; mean = 0; counts = 0; for (map::iterator i = stops.begin(); i != stops.end(); i++) { if ((*i).second > maxim) { coord = (*i).first; maxim = (*i).second; } } if (maxim < 5) { this->positions.stop.most_support = get_median(stops2); // mean / counts; } else { this->positions.stop.most_support = coord; } if (!(this->get_SVtype() & INS)) { //all types but Insertions: this->length = this->positions.stop.most_support - this->positions.start.most_support; } else { //compute most supported length for insertions: maxim = 0; coord = 0; mean = 0; counts = 0; for (map::iterator i = lengths.begin(); i != lengths.end(); i++) { if ((*i).second > maxim) { coord = (*i).first; maxim = (*i).second; } } if (maxim < 3) { this->length = get_median(lengths2); } else { this->length = coord; } // if(del) } starts.clear(); stops.clear(); for (size_t i = 0; i < strands.size(); i++) { maxim = 0; std::string id; for (std::map::iterator j = 
// predict_SV(), tail: strand strings are appended to this->strand in order of decreasing count (the chosen entry is zeroed after each pick).
// NOTE(review): the loop that selects indel_sequence is truncated below - the text between "(start_dist" and "indel_sequence" is missing, so the selection condition cannot be read from here.
// If no read qualified (num == 0) but an "input" entry exists, most_support and length fall back to start.min_pos / stop.max_pos. Finally supporting_types is rebuilt as a comma-separated list of "AL", "SR", "NR" and the matching type flags are set.
// get_chr(pos, ref): subtracts successive RefLength values while pos >= 0 and returns the name of the last reference consumed.
// NOTE(review): if pos is negative on entry or ref is empty, id stays 0 and ref[id - 1] indexes out of range.
// calc_pos(pos, ref): returns the offset of pos inside the reference it falls into.
// NOTE(review): ref[0] is read without an emptiness check, and inside the loop ref[i] is read after i++ without re-testing i < ref.size(), so a pos at or beyond the summed lengths reads ref[ref.size()].
// get_support(): returns type_support. complement(nuc): swaps A<->T and C<->G for upper-case letters; any other character is returned unchanged.
strands.begin(); j != strands.end(); j++) { if (maxim < (*j).second) { maxim = (*j).second; id = (*j).first; //std::cout << '\t' << id << std::endl; } } if (maxim > 0) { this->strand.push_back(id); strands[id] = 0; } } strands.clear(); std::map::iterator tmp = positions.support.begin(); int start_prev_dist=1000; int stop_prev_dist=1000; while(tmp != positions.support.end()) { int start_dist=abs((*tmp).second.coordinates.first- this->positions.start.most_support); int stop_dist=abs((*tmp).second.coordinates.second- this->positions.stop.most_support); if (((*tmp).second.SV & this->sv_type) && ( (start_distindel_sequence = (*tmp).second.sequence; } tmp++; } } if (num == 0 && positions.support.find("input") != positions.support.end()) { this->positions.stop.most_support = this->positions.stop.max_pos; this->positions.start.most_support = this->positions.start.min_pos; this->length = this->positions.stop.max_pos - this->positions.start.min_pos; } this->supporting_types = ""; if (aln) { this->type.is_ALN = true; this->supporting_types += "AL"; } if (split) { this->type.is_SR = true; if (!supporting_types.empty()) { this->supporting_types += ","; } this->supporting_types += "SR"; } if (noise) { this->type.is_Noise = true; if (!supporting_types.empty()) { this->supporting_types += ","; } this->supporting_types += "NR"; } } std::string Breakpoint::get_chr(long pos, RefVector ref) { // std::cout << "pos: " << pos << std::endl; size_t id = 0; while (id < ref.size() && pos >= 0) { pos -= (long) ref[id].RefLength; // std::cout << id << std::endl; id++; } return ref[id - 1].RefName; } long Breakpoint::calc_pos(long pos, RefVector ref) { size_t i = 0; pos -= ref[i].RefLength; while (i < ref.size() && pos >= 0) { i++; pos -= ref[i].RefLength; } return pos + ref[i].RefLength; } int Breakpoint::get_support() { return type_support; } char complement(char nuc) { switch (nuc) { case 'A': return 'T'; break; case 'C': return 'G'; break; case 'G': return 'C'; break; case 'T': return 'A'; 
// rev_complement(seq): walks seq back to front and appends complement() of each character.
// get_strand(num_best): NOTE(review): the start of the body is truncated (text lost after "was not set"). The visible remainder returns "UNDEF" when strand is empty, otherwise strand[0] followed by a tab and the next-ranked strand for each further slot up to num_best, with ' ' for slots that have no entry.
// The mid-file #include of Detect_Breakpoints.h precedes the first use of TRANS_type() in to_string() - presumably it supplies that declaration; confirm against the header.
// to_string(): yields text only when there is more than one supporting read: "\t\tTREE: <types> <start.min_pos>:<stop.max_pos> <length> PE=<number of reads> <best strand>".
// to_string(ref): prints "(chr:pos-chr:pos <number of reads> <sv_debug> <two best strands>" followed by one line per read (name, type, two strand signs).
// NOTE(review): the break test `num < report_n_reads && report_n_reads != -1` is already true on the first iteration for any positive report_n_reads (num starts at 0), so no read is ever listed; it looks inverted (num >= report_n_reads) - confirm intent. The opening "(" is never closed in the output.
break; default: return nuc; break; } } std::string Breakpoint::rev_complement(std::string seq) { std::string tmp; for (std::string::reverse_iterator i = seq.rbegin(); i != seq.rend(); i++) { tmp += complement((*i)); } return tmp; } std::string Breakpoint::get_strand(int num_best) { //if(this->strand.empty()){ // predict_SV(); //} if (sv_type & NA) { // std::cout<<"was not set"<strand.empty()) { return "UNDEF"; } std::string tmp = this->strand[0]; for (int i = 1; i < num_best; i++) { tmp += '\t'; if (i < (int) this->strand.size()) { tmp += this->strand[i]; } else { tmp += ' '; } } return tmp; } #include "Detect_Breakpoints.h" std::string Breakpoint::to_string() { std::stringstream ss; if (positions.support.size() > 1) { ss << "\t\tTREE: "; ss << TRANS_type(this->get_SVtype()); ss << " "; ss << get_coordinates().start.min_pos; ss << ":"; ss << get_coordinates().stop.max_pos; ss << " "; ss << this->length; ss << " PE="; ss << positions.support.size(); ss << " "; ss << get_strand(1); } return ss.str(); } std::string Breakpoint::to_string(RefVector ref) { std::stringstream ss; ss << "("; ss << get_chr(get_coordinates().start.min_pos, ref); ss << ":"; ss << calc_pos(get_coordinates().start.min_pos, ref); ss << "-"; ss << get_chr(get_coordinates().stop.max_pos, ref); ss << ":"; ss << calc_pos(get_coordinates().stop.max_pos, ref); ss << " "; ss << positions.support.size(); ss << " "; ss << this->sv_debug; ss << " "; ss << this->get_strand(2); ss << "\n"; int num = 0; for (std::map::iterator i = positions.support.begin(); i != positions.support.end(); i++) { if (num < Parameter::Instance()->report_n_reads && Parameter::Instance()->report_n_reads != -1) { break; } ss << "\t"; ss << (*i).first; ss << " "; ss << (*i).second.type; if ((*i).second.strand.first) { ss << "+"; } else { ss << "-"; } if ((*i).second.strand.second) { ss << "+"; } else { ss << "-"; } num++; ss << "\n"; } ss << " "; return ss.str(); } 
// NOTE(review): on the following physical line, everything before the first "/*" is the tar member header for src/sub/Breakpoint.h, not C++. The seven bare "#include" directives have lost their header names, and pair / std::map have lost their template arguments; restore them from the pristine header before building.
// region_ref_str: alignment details of a read (read sequence, aligned read position, direction, start/stop, reference text).
// svs_breakpoint_str: one breakpoint side as an interval [min_pos, max_pos] plus most_support, the consensus coordinate.
// read_str: one supporting read. `type` is the evidence kind (the Breakpoint constructor treats 0 as alignment, 1 as split read, 2 as noise); `SV` is a bit vector of SV-type flags; `coordinates` holds the read's start/stop of the event; `sequence` is only meaningful for indels.
// position_str: both breakpoint sides plus `support`, a map from a read identifier to its read_str (presumably the read name - confirm against the code that fills it), and coverage / read-offset bookkeeping.
// str_types: flags recording which evidence kinds (split read, alignment, noise) back a breakpoint.
Sniffles-1.0.7/src/sub/Breakpoint.h000066400000000000000000000105161320237057600171270ustar00rootroot00000000000000/* * Breakpoint.h * * Created on: Jun 23, 2015 * Author: fsedlaze */ #ifndef SUB_BREAKPOINT_H_ #define SUB_BREAKPOINT_H_ #include #include #include #include #include #include #include #include "../Paramer.h" #include "../BamParser.h" #include "../tree/BinTree.h" struct region_ref_str{ //not very nice! std::string read_seq; long read_aln_pos; bool direction; long start; long stop; std::string ref; }; struct svs_breakpoint_str{ long min_pos; long max_pos; long most_support; }; struct read_str { //to identify // std::string name; long id; region_ref_str aln; //maybe we can use this! short type; //split reads, cigar or md string //for later assessment: pair strand; pair read_strand; pair coordinates; // I could use the bin tree for that! char SV; // bit vector int length; std::string sequence; //just for indels! }; struct position_str { svs_breakpoint_str start; svs_breakpoint_str stop; //int pos; //the chromosomes are encoded over the positions. std::map support; //std::vector support; // map?? -> no duplicated reads, easy to catch up which read is included. int coverage; int lowmq_cov; int read_start; int read_stop; }; struct str_types{ bool is_SR; bool is_ALN; bool is_Noise; }; //TODO define region object and inherit from that. Plus define avoid region objects for mappability problems. 
class Breakpoint { private: str_types type; position_str positions; std::vector strand; std::string supporting_types; char sv_type; std::string sv_debug; std::string ref_seq; //std::vector support; short type_support; //for phasing: BinTree grouped; tree_node * grouped_node; long length; std::string indel_sequence; bool should_be_stored; int ref_allele; void summarize_support(short type); //void summarize_strand(pair strand, std::vector& array); void summarize_type(char SV, ushort * array); //std::string translate_strand(short id); char eval_type(ushort *SV); std::string rev_complement(std::string seq); bool is_in(short id); std::string translate_strand(pair strand); bool is_same_strand(Breakpoint * tmp); bool check_SVtype(Breakpoint * break1, Breakpoint * break2); bool is_NEST(Breakpoint * next, Breakpoint * curr){ return (( (*next->get_coordinates().support.begin()).second.SV& NEST) ||( (*curr->get_coordinates().support.begin()).second.SV& NEST) ); } public: Breakpoint(position_str sv,long len) { ref_allele=0; should_be_stored=false; sv_type |= NA; type.is_ALN=((*sv.support.begin()).second.type==0); type.is_SR=((*sv.support.begin()).second.type==1); type.is_Noise=((*sv.support.begin()).second.type==2); type_support=-1; this->positions = sv; this->grouped_node=NULL; this->length=len; } ~Breakpoint() { } int get_support(); long overlap(Breakpoint * tmp); long overlap_breakpoint(long start,long stop); void set_coordinates(int start, int stop){ this->positions.start.min_pos=start; this->positions.stop.max_pos=stop; } position_str get_coordinates() { return this->positions; } void predict_SV(); std::string to_string(RefVector ref); void add_read(Breakpoint * point); std::string get_chr(long pos, RefVector ref); long calc_pos(long pos, RefVector ref); char get_SVtype(); std::string get_strand(int num_best); std::string get_ref_seq() { return this->ref_seq; } void set_ref_seq(std::string seq) { this->ref_seq = seq; } long get_length(){ return length; } void 
set_length(long len){ this->length=len; } std::string get_supporting_types(){ return this->supporting_types; } void add_grouped(int id){ this->grouped.insert(this->grouped_node, id); } vector get_groupted(){ vector tmp; this->grouped.get_nodes(this->grouped_node,tmp); return tmp; } void calc_support(); str_types get_types(){ return this->type; } std::string get_read_names(); std::vector get_read_ids(); std::string to_string(); std::string get_sequence(){ return this->indel_sequence; } void set_valid(bool valid){ this-> should_be_stored=valid; } bool get_valid(){ return this->should_be_stored; } int get_refcount(){ return this->ref_allele; } void set_refcount(int value){ this->ref_allele+=value; } }; #endif /* SUB_BREAKPOINT_H_ */ Sniffles-1.0.7/src/sub/Container.h000066400000000000000000000002451320237057600167510ustar00rootroot00000000000000/* * Container.h * * Created on: Jun 30, 2015 * Author: fsedlaze */ #ifndef SUB_CONTAINER_H_ #define SUB_CONTAINER_H_ #endif /* SUB_CONTAINER_H_ */ Sniffles-1.0.7/src/sub/Detect_Breakpoints.cpp000066400000000000000000000654171320237057600211470ustar00rootroot00000000000000/* * Detect_Breapoints.cpp * * Created on: Jun 19, 2015 * Author: fsedlaze */ #include "Detect_Breakpoints.h" #include "../print/IPrinter.h" void store_pos(vector &positions, long pos, std::string read_name) { for (size_t i = 0; i < positions.size(); i++) { if (abs(positions[i].position - pos) < Parameter::Instance()->min_length) { positions[i].hits++; positions[i].names.push_back(read_name); return; } } hist_str tmp; tmp.position = pos; tmp.hits = 1; tmp.names.push_back(read_name); positions.push_back(tmp); } std::string reverse_complement(std::string sequence) { std::string tmp_seq; for (std::string::reverse_iterator i = sequence.rbegin(); i != sequence.rend(); i++) { switch ((*i)) { case 'A': tmp_seq += 'T'; break; case 'C': tmp_seq += 'G'; break; case 'G': tmp_seq += 'C'; break; case 'T': tmp_seq += 'A'; break; default: tmp_seq += (*i); break; } } return 
tmp_seq; } Breakpoint * split_points(vector names, std::map support) { std::map new_support; for (size_t i = 0; i < names.size(); i++) { new_support[names[i]] = support[names[i]]; } position_str svs; svs.start.min_pos = 0; //just to initialize. Should not be needed anymore at this stage of the prog. svs.stop.max_pos = 0; svs.support = new_support; Breakpoint * point = new Breakpoint(svs, (*new_support.begin()).second.coordinates.second - (*new_support.begin()).second.coordinates.first); return point; } void detect_merged_svs(position_str point, RefVector ref, vector & new_points) { new_points.clear(); //just in case! vector pos_start; vector pos_stop; for (std::map::iterator i = point.support.begin(); i != point.support.end(); ++i) { store_pos(pos_start, (*i).second.coordinates.first, (*i).first); store_pos(pos_stop, (*i).second.coordinates.second, (*i).first); } int start_count = 0; for (size_t i = 0; i < pos_start.size(); i++) { //std::cout< Parameter::Instance()->min_support) { start_count++; } } int stop_count = 0; for (size_t i = 0; i < pos_stop.size(); i++) { // std::cout << pos_stop[i].hits << ","; if (pos_stop[i].hits > Parameter::Instance()->min_support) { stop_count++; } } if (stop_count > 1 || start_count > 1) { std::cout << "\tprocessing merged TRA" << std::endl; if (start_count > 1) { new_points.push_back(split_points(pos_start[0].names, point.support)); new_points.push_back(split_points(pos_start[1].names, point.support)); } else { new_points.push_back(split_points(pos_stop[0].names, point.support)); new_points.push_back(split_points(pos_stop[1].names, point.support)); } } } std::string TRANS_type(char type) { string tmp; if (type & DEL) { tmp += "DEL"; } if (type & INV) { if (!tmp.empty()) { tmp += '/'; } tmp += "INV"; } if (type & DUP) { if (!tmp.empty()) { tmp += '/'; } tmp += "DUP"; } if (type & INS) { if (!tmp.empty()) { tmp += '/'; } tmp += "INS"; } if (type & TRA) { if (!tmp.empty()) { tmp += '/'; } tmp += "TRA"; } if (type & NEST) { if 
(!tmp.empty()) { tmp += '/'; } tmp += "NEST"; } return tmp; // should not occur! } long get_ref_lengths(int id, RefVector ref) { long length = 0; for (size_t i = 0; i < (size_t) id && i < ref.size(); i++) { length += (long) ref[i].RefLength + (long) Parameter::Instance()->max_dist; } return length; } bool should_be_stored(Breakpoint *& point) { point->calc_support(); // we need that before: //std::cout << "Stored: " << point->get_support() << " " << point->get_length() << std::endl; if (point->get_SVtype() & TRA) { // we cannot make assumptions abut support yet. point->set_valid((bool) (point->get_support() > 1)); // this is needed as we take each chr independently and just look at the primary alignment } else if (point->get_support() >= Parameter::Instance()->min_support) { point->predict_SV(); point->set_valid((bool) (point->get_length() > Parameter::Instance()->min_length)); } return point->get_valid(); } void polish_points(std::vector & points, RefVector ref) { //TODO might be usefull! but why does the tree not fully work?? 
return; for (size_t i = 0; i < points.size(); i++) { if (points[i]->get_SVtype() & INS && (points[i]->get_length() == Parameter::Instance()->huge_ins)) { for (size_t j = 0; j < points.size(); j++) { if (i != j) { if (abs(points[i]->get_coordinates().start.min_pos - points[j]->get_coordinates().start.min_pos) < Parameter::Instance()->max_dist || abs(points[i]->get_coordinates().stop.max_pos - points[j]->get_coordinates().stop.max_pos) < Parameter::Instance()->max_dist) { std::cout << "HIT!: " << points[j]->get_coordinates().start.min_pos << " " << points[i]->get_coordinates().start.min_pos << " " << points[j]->get_coordinates().stop.max_pos << " " << points[i]->get_coordinates().stop.max_pos << " len: " << points[j]->get_length() << " " << points[i]->get_length() << std::endl; break; } } } } } } void detect_breakpoints(std::string read_filename, IPrinter *& printer) { estimate_parameters(read_filename); BamParser * mapped_file = 0; RefVector ref; if (read_filename.find("bam") != string::npos) { mapped_file = new BamParser(read_filename); ref = mapped_file->get_refInfo(); } else { cerr << "File Format not recognized. File must be a sorted .bam file!" << endl; exit(0); } //Using PlaneSweep to comp coverage and iterate through reads: //PlaneSweep * sweep = new PlaneSweep(); std::cout << "Start parsing..." 
<< std::endl; //Using Interval tree to store and manage breakpoints: IntervallTree final; IntervallTree bst; TNode * root_final = NULL; int current_RefID = 0; TNode *root = NULL; //FILE * alt_allel_reads; FILE * ref_allel_reads; if (Parameter::Instance()->genotype) { ref_allel_reads = fopen(Parameter::Instance()->tmp_genotyp.c_str(), "wb"); } Alignment * tmp_aln = mapped_file->parseRead(Parameter::Instance()->min_mq); long ref_space = get_ref_lengths(tmp_aln->getRefID(), ref); long num_reads = 0; /*Genotyper * go; if (Parameter::Instance()->genotype) { go = new Genotyper(); }*/ while (!tmp_aln->getQueryBases().empty()) { if ((tmp_aln->getAlignment()->IsPrimaryAlignment()) && (!(tmp_aln->getAlignment()->AlignmentFlag & 0x800) && tmp_aln->get_is_save())){// && (Parameter::Instance()->chr_names.empty() || Parameter::Instance()->chr_names.find(ref[tmp_aln->getRefID()].RefName) != Parameter::Instance()->chr_names.end())) { //change CHR: if (current_RefID != tmp_aln->getRefID()) { std::cout << "\tSwitch Chr " << ref[tmp_aln->getRefID()].RefName << std::endl; //" " << ref[tmp_aln->getRefID()].RefLength std::vector points; bst.get_breakpoints(root, points); //polish_points(points, ref); /* if (Parameter::Instance()->genotype) { fclose(ref_allel_reads); cout<<"\t\tGenotyping"<update_SVs(points, ref_space); cout<<"\t\tGenotyping finished"<tmp_genotyp.c_str(), "wb"); }*/ for (int i = 0; i < points.size(); i++) { points[i]->calc_support(); if (points[i]->get_valid()) { //invoke update over ref support! 
if (points[i]->get_SVtype() & TRA) { final.insert(points[i], root_final); } else { printer->printSV(points[i]); } } } bst.clear(root); current_RefID = tmp_aln->getRefID(); ref_space = get_ref_lengths(tmp_aln->getRefID(), ref); } //SCAN read: std::vector aln_event; std::vector split_events; if (tmp_aln->getMappingQual() > Parameter::Instance()->min_mq) { double score = tmp_aln->get_scrore_ratio(); #pragma omp parallel // starts a new team { #pragma omp sections { #pragma omp section { // clock_t begin = clock(); if ((score == -1 || score > Parameter::Instance()->score_treshold)) { aln_event = tmp_aln->get_events_Aln(); } // Parameter::Instance()->meassure_time(begin, " Alignment "); } #pragma omp section { // clock_t begin_split = clock(); split_events = tmp_aln->getSA(ref); // Parameter::Instance()->meassure_time(begin_split, " Split reads "); } } } //tmp_aln->set_supports_SV(aln_event.empty() && split_events.empty()); //Store reference supporting reads for genotype estimation: bool SV_support = (!aln_event.empty() && !split_events.empty()); if (Parameter::Instance()->genotype && !SV_support) { //write read: str_read tmp; tmp.chr_id = tmp_aln->getRefID(); //check string in binary??? tmp.start = tmp_aln->getPosition(); tmp.length = tmp_aln->getRefLength(); fwrite(&tmp, sizeof(struct str_read), 1, ref_allel_reads); } //store the potential SVs: if (!aln_event.empty()) { add_events(tmp_aln, aln_event, 0, ref_space, bst, root, num_reads, false); } if (!split_events.empty()) { add_splits(tmp_aln, split_events, 1, ref, bst, root, num_reads, false); } } } //get next read: mapped_file->parseReadFast(Parameter::Instance()->min_mq, tmp_aln); num_reads++; if (num_reads % 10000 == 0) { cout << "\t\t# Processed reads: " << num_reads << endl; } } //filter and copy results: std::cout << "Finalizing .." 
<< std::endl; std::vector points; bst.get_breakpoints(root, points); /* if (Parameter::Instance()->genotype) { fclose(ref_allel_reads); go->update_SVs(points, ref_space); string del = "rm "; del += Parameter::Instance()->tmp_genotyp; del += "ref_allele"; system(del.c_str()); }*/ for (int i = 0; i < points.size(); i++) { points[i]->calc_support(); if (points[i]->get_valid()) { //invoke update over ref support! if (points[i]->get_SVtype() & TRA) { final.insert(points[i], root_final); } else { printer->printSV(points[i]); } } } bst.clear(root); points.clear(); final.get_breakpoints(root_final, points); //std::cout<<"Detect merged tra"<get_SVtype() & TRA) { vector new_points; detect_merged_svs(points[i]->get_coordinates(), ref, new_points); if (!new_points.empty()) { // I only allow for 1 split!! points[i] = new_points[0]; points.push_back(new_points[1]); } } } //std::cout<<"fin up"<get_SVtype() & TRA) { points[i]->calc_support(); points[i]->predict_SV(); } if (points[i]->get_support() >= Parameter::Instance()->min_support && points[i]->get_length() > Parameter::Instance()->min_length) { printer->printSV(points[i]); } } //std::cout<<"Done"< events, short type, long ref_space, IntervallTree & bst, TNode *&root, long read_id, bool add) { bool flag = (strcmp(tmp->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0); for (size_t i = 0; i < events.size(); i++) { position_str svs; read_str read; if (events[i].is_noise) { read.type = 2; } else { read.type = 0; } read.SV = events[i].type; read.sequence = events[i].sequence; if (flag) { std::cout << "ADD EVENT " << tmp->getName() << " " << tmp->getRefID() << " " << events[i].pos << " " << abs(events[i].length) << std::endl; } svs.start.min_pos = (long) events[i].pos + ref_space; svs.stop.max_pos = svs.start.min_pos + events[i].length; if (tmp->getStrand()) { read.strand.first = (tmp->getStrand()); read.strand.second = !(tmp->getStrand()); } else { read.strand.first = !(tmp->getStrand()); read.strand.second = 
(tmp->getStrand()); } // start.support[0].read_start.min = events[i].read_pos; read.read_strand.first = tmp->getStrand(); read.read_strand.second = tmp->getStrand(); if (flag) { std::cout << tmp->getName() << " " << tmp->getRefID() << " " << svs.start.min_pos << " " << svs.stop.max_pos << " " << svs.stop.max_pos - svs.start.min_pos << std::endl; } if (svs.start.min_pos > svs.stop.max_pos) { //can this actually happen? read.coordinates.first = svs.stop.max_pos; read.coordinates.second = svs.start.min_pos; } else { read.coordinates.first = svs.start.min_pos; read.coordinates.second = svs.stop.max_pos; } svs.start.max_pos = svs.start.min_pos; svs.stop.min_pos = svs.stop.max_pos; if (svs.start.min_pos > svs.stop.max_pos) { //incase they are inverted svs_breakpoint_str pos = svs.start; svs.start = svs.stop; svs.stop = pos; pair tmp = read.strand; read.strand.first = tmp.second; read.strand.second = tmp.first; } //TODO: we might not need this: if (svs.start.min_pos > svs.stop.max_pos) { read.coordinates.first = svs.stop.max_pos; read.coordinates.second = svs.start.min_pos; } else { read.coordinates.first = svs.start.min_pos; read.coordinates.second = svs.stop.max_pos; } read.id = read_id; svs.support[tmp->getName()] = read; svs.support[tmp->getName()].length = events[i].length; Breakpoint * point = new Breakpoint(svs, events[i].length); if (add) { bst.insert_existant(point, root); } else { bst.insert(point, root); } //std::cout<<"Print:"< events, short type, RefVector ref, IntervallTree& bst, TNode *&root, long read_id, bool add) { bool flag = (strcmp(tmp->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0); if (flag) { cout << "SPLIT: " << std::endl; for (size_t i = 0; i < events.size(); i++) { std::cout << events[i].pos << " stop: " << events[i].pos + events[i].length << " " << events[i].RefID << " READ: " << events[i].read_pos_start << " " << events[i].read_pos_stop; if (events[i].strand) { cout << " +" << endl; } else { cout << " -" << endl; } } } for 
(size_t i = 1; i < events.size(); i++) { position_str svs; //position_str stop; read_str read; read.sequence = "NA"; //read.name = tmp->getName(); read.type = type; read.SV = 0; read.read_strand.first = events[i - 1].strand; read.read_strand.second = events[i].strand; //stop.support.push_back(read); if (events[i].RefID == events[i - 1].RefID) { //IF different chr -> tra if (events[i - 1].strand == events[i].strand) { //IF same strand -> del/ins/dup if (events[i - 1].strand) { read.strand.first = events[i - 1].strand; read.strand.second = !events[i].strand; } else { read.strand.first = !events[i - 1].strand; read.strand.second = events[i].strand; } // int len1 = 0; //int len2 = 0; svs.read_start = events[i - 1].read_pos_stop; // (short) events[i - 1].read_pos_start + (short) events[i - 1].length; svs.read_stop = events[i].read_pos_start; if (events[i - 1].strand) { svs.start.min_pos = events[i - 1].pos + events[i - 1].length + get_ref_lengths(events[i - 1].RefID, ref); svs.stop.max_pos = events[i].pos + get_ref_lengths(events[i].RefID, ref); } else { svs.start.min_pos = events[i].pos + events[i].length + get_ref_lengths(events[i].RefID, ref); svs.stop.max_pos = events[i - 1].pos + get_ref_lengths(events[i - 1].RefID, ref); } if (flag) { cout << "Debug: SV_Size: " << (svs.start.min_pos - svs.stop.max_pos) << " tmp: " << (svs.stop.max_pos - svs.start.min_pos) << " Ref_start: " << svs.start.min_pos - get_ref_lengths(events[i].RefID, ref) << " Ref_stop: " << svs.stop.max_pos - get_ref_lengths(events[i].RefID, ref) << " readstart: " << svs.read_start << " readstop: " << svs.read_stop << std::endl; } if ((svs.stop.max_pos - svs.start.min_pos) > Parameter::Instance()->min_length * -1 && ((svs.stop.max_pos - svs.start.min_pos) + (Parameter::Instance()->min_length) < (svs.read_stop - svs.read_start) && (svs.read_stop - svs.read_start) > (Parameter::Instance()->min_length * 2))) { if (!events[i].cross_N || (double) ((svs.stop.max_pos - svs.start.min_pos) + 
Parameter::Instance()->min_length) < ((double) (svs.read_stop - svs.read_start) * Parameter::Instance()->avg_ins)) { svs.stop.max_pos += (svs.read_stop - svs.read_start); //TODO check! if (Parameter::Instance()->print_seq) { svs.read_stop = events[i].read_pos_start; svs.read_start = events[i - 1].read_pos_stop; if (svs.read_stop > tmp->getAlignment()->QueryBases.size()) { cerr << "BUG: split read ins! " << svs.read_stop << " " << tmp->getAlignment()->QueryBases.size() << " " << tmp->getName() << endl; } if (!events[i - 1].strand) { std::string tmp_seq = reverse_complement(tmp->getAlignment()->QueryBases); read.sequence = reverse_complement(tmp_seq.substr(svs.read_start, svs.read_stop - svs.read_start)); } else { read.sequence = tmp->getAlignment()->QueryBases.substr(svs.read_start, svs.read_stop - svs.read_start); } if (flag) { cout << "INS: " << endl; cout << "split read ins! " << events[i - 1].read_pos_stop << " " << events[i].read_pos_start << " " << " " << tmp->getAlignment()->QueryBases.size() << " " << tmp->getName() << endl; cout << "Seq+:" << read.sequence << endl; } } read.SV |= INS; } else { read.SV |= 'n'; } } else if ((svs.start.min_pos - svs.stop.max_pos) * -1 > (svs.read_stop - svs.read_start) + (Parameter::Instance()->min_length)) { if (!events[i].cross_N || (double) (svs.start.min_pos - svs.stop.max_pos) * Parameter::Instance()->avg_del * -1.0 > (double) ((svs.read_stop - svs.read_start) + (Parameter::Instance()->min_length))) { read.SV |= DEL; if (flag) { cout << "DEL2" << endl; } } else { read.SV |= 'n'; } } else if ((svs.start.min_pos - svs.stop.max_pos) > Parameter::Instance()->min_length && (svs.read_start - svs.read_stop) < Parameter::Instance()->min_length) { //check with respect to the coords of reads! 
if (flag) { cout << "DUP: " << endl; } read.SV |= DUP; } else { if (flag) { cout << "N" << endl; } read.SV = 'n'; } } else { // if first part of read is in a different direction as the second part-> INV read.strand.first = events[i - 1].strand; read.strand.second = !events[i].strand; bool is_overlapping = overlaps(events[i - 1], events[i]); if (is_overlapping && (events[i - 1].length > Parameter::Instance()->min_segment_size || events[i].length > Parameter::Instance()->min_segment_size)) { if (flag) { std::cout << "Overlap curr: " << events[i].pos << " " << events[i].pos + events[i].length << " prev: " << events[i - 1].pos << " " << events[i - 1].pos + events[i - 1].length << " " << tmp->getName() << std::endl; } read.SV |= NEST; if (events[i - 1].strand) { svs.start.min_pos = events[i - 1].pos + events[i - 1].length + get_ref_lengths(events[i - 1].RefID, ref); svs.stop.max_pos = (events[i].pos + events[i].length) + get_ref_lengths(events[i].RefID, ref); } else { svs.start.min_pos = events[i - 1].pos + get_ref_lengths(events[i - 1].RefID, ref); svs.stop.max_pos = events[i].pos + get_ref_lengths(events[i].RefID, ref); } if (svs.start.min_pos > svs.stop.max_pos) { long tmp = svs.start.min_pos; svs.start.min_pos = svs.stop.max_pos; svs.stop.max_pos = tmp; } } else if (!is_overlapping) { read.SV |= INV; if (events[i - 1].strand) { svs.start.min_pos = events[i - 1].pos + events[i - 1].length + get_ref_lengths(events[i - 1].RefID, ref); svs.stop.max_pos = (events[i].pos + events[i].length) + get_ref_lengths(events[i].RefID, ref); } else { svs.start.min_pos = events[i - 1].pos + get_ref_lengths(events[i - 1].RefID, ref); svs.stop.max_pos = events[i].pos + get_ref_lengths(events[i].RefID, ref); } } } } else { //if not on the same chr-> TRA read.strand.first = events[i - 1].strand; read.strand.second = !events[i].strand; if (events[i - 1].strand == events[i].strand) { //check this with + - strands!! 
if (events[i - 1].strand) { svs.start.min_pos = events[i - 1].pos + events[i - 1].length + get_ref_lengths(events[i - 1].RefID, ref); svs.stop.max_pos = events[i].pos + get_ref_lengths(events[i].RefID, ref); } else { svs.start.min_pos = events[i - 1].pos + get_ref_lengths(events[i - 1].RefID, ref); svs.stop.max_pos = events[i].pos + events[i].length + get_ref_lengths(events[i].RefID, ref); } } else { if (events[i - 1].strand) { svs.start.min_pos = events[i - 1].pos + events[i - 1].length + get_ref_lengths(events[i - 1].RefID, ref); svs.stop.max_pos = events[i].pos + events[i].length + get_ref_lengths(events[i].RefID, ref); } else { svs.start.min_pos = events[i - 1].pos + get_ref_lengths(events[i - 1].RefID, ref); svs.stop.max_pos = events[i].pos + get_ref_lengths(events[i].RefID, ref); } } read.SV |= TRA; } if (read.SV != 'n') { if (flag) { std::cout << "SPLIT: " << TRANS_type(read.SV) << " start: " << svs.start.min_pos - get_ref_lengths(events[i].RefID, ref) << " stop: " << svs.stop.max_pos - get_ref_lengths(events[i].RefID, ref); if (events[i - 1].strand) { std::cout << " +"; } else { std::cout << " -"; } if (events[i].strand) { std::cout << " +"; } else { std::cout << " -"; } std::cout << " " << tmp->getName() << std::endl; std::cout << "READ: " << svs.read_start << " " << svs.read_stop << " " << svs.read_start - svs.read_stop << std::endl; } //std::cout<<"split"< svs.stop.max_pos) { //maybe we have to invert the directions??? svs_breakpoint_str pos = svs.start; svs.start = svs.stop; svs.stop = pos; pair tmp = read.strand; read.strand.first = tmp.second; read.strand.second = tmp.first; } //TODO: we might not need this: if (svs.start.min_pos > svs.stop.max_pos) { read.coordinates.first = svs.stop.max_pos; read.coordinates.second = svs.start.min_pos; } else { read.coordinates.first = svs.start.min_pos; read.coordinates.second = svs.stop.max_pos; } //pool out? 
read.id = read_id; svs.support[tmp->getName()] = read; svs.support[tmp->getName()].length = abs(read.coordinates.second - read.coordinates.first); Breakpoint * point = new Breakpoint(svs, abs(read.coordinates.second - read.coordinates.first)); //std::cout<<"split ADD: " << <<" Name: "<getName()<<" "<< svs.start.min_pos- get_ref_lengths(events[i].RefID, ref)<<"-"<get_refInfo(); } else { cerr << "File Format not recognized. File must be a sorted .bam file!" << endl; exit(0); } Alignment * tmp_aln = mapped_file->parseRead(Parameter::Instance()->min_mq); double num = 0; double avg_score = 0; double avg_mis = 0; double avg_indel = 0; double avg_diffs_perwindow = 0; vector mis_per_window; //histogram over #differences vector scores; // std::string curr, prev = ""; double avg_dist = 0; double tot_avg_ins = 0; double tot_avg_del = 0; while (!tmp_aln->getQueryBases().empty() && num < 1000) { //1000 // std::cout<<"test "<getName()<getAlignment()->IsPrimaryAlignment()) && (!(tmp_aln->getAlignment()->AlignmentFlag & 0x800)))) { //}&& tmp_aln->get_is_save()))) { //1. check differences in window => min_treshold for scanning! //2. get score ration without checking before hand! (above if!) 
double dist = 0; double avg_del = 0; double avg_ins = 0; vector tmp = tmp_aln->get_avg_diff(dist, avg_del, avg_ins); tot_avg_ins += avg_ins; tot_avg_del += avg_del; //std::cout<<"Debug:\t"< mis_per_window.size()) { //adjust length mis_per_window.push_back(0); } avg_mis += tmp[i]; mis_per_window[tmp[i]]++; } // std::cout <get_scrore_ratio()); // std::cout< -1) { while (score + 1 > scores.size()) { scores.push_back(0); } scores[score]++; } num++; } mapped_file->parseReadFast(Parameter::Instance()->min_mq, tmp_aln); } if (num == 0) { std::cerr << "Too few reads detected in " << Parameter::Instance()->bam_files[0] << std::endl; exit(1); } vector nums; size_t pos = 0; Parameter::Instance()->max_dist_alns = floor(avg_dist / num) / 2; Parameter::Instance()->window_thresh = 50; //25; if (!mis_per_window.empty()) { for (size_t i = 0; i < mis_per_window.size(); i++) { for (size_t j = 0; j < mis_per_window[i]; j++) { nums.push_back(i); } } pos = nums.size() * 0.95; //the highest 5% cutoff if (pos > 0 && pos <= nums.size()) { Parameter::Instance()->window_thresh = std::max(Parameter::Instance()->window_thresh, nums[pos]); //just in case we have too clean data! 
:) } nums.clear(); } for (size_t i = 0; i < scores.size(); i++) { for (size_t j = 0; j < scores[i]; j++) { nums.push_back(i); } } pos = nums.size() * 0.05; //the lowest 5% cuttoff Parameter::Instance()->score_treshold = 2; //nums[pos]; //prev=2 Parameter::Instance()->avg_del = tot_avg_del / num; Parameter::Instance()->avg_ins = tot_avg_ins / num; std::cout << "\tMax dist between aln events: " << Parameter::Instance()->max_dist_alns << std::endl; std::cout << "\tMax diff in window: " << Parameter::Instance()->window_thresh << std::endl; std::cout << "\tMin score ratio: " << Parameter::Instance()->score_treshold << std::endl; std::cout << "\tAvg DEL ratio: " << Parameter::Instance()->avg_del << std::endl; std::cout << "\tAvg INS ratio: " << Parameter::Instance()->avg_ins << std::endl; } bool overlaps(aln_str prev, aln_str curr) { double ratio = 0; double overlap = 0; if (prev.pos + Parameter::Instance()->min_length < curr.pos + curr.length && prev.pos + prev.length - Parameter::Instance()->min_length > curr.pos) { overlap = min((curr.pos + curr.length), (prev.pos + prev.length)) - max(prev.pos, curr.pos); ratio = overlap / (double) min(curr.length, prev.length); } // std::cout< 0.4 && overlap > 200); } Sniffles-1.0.7/src/sub/Detect_Breakpoints.h000066400000000000000000000026241320237057600206030ustar00rootroot00000000000000/* * Detect_Breakpoints.h * * Created on: Jun 19, 2015 * Author: fsedlaze */ #ifndef SUB_DETECT_BREAKPOINTS_H_ #define SUB_DETECT_BREAKPOINTS_H_ #include "../BamParser.h" #include "../Parser.h" #include "../Alignment.h" #include "../plane-sweep/Plane-sweep.h" #include "../tree/IntervallTree.h" #include "../tree/TNode.h" #include "../tree/IntervallContainer.h" #include "../tree/IntervallList.h" #include "../Paramer.h" #include "../print/IPrinter.h" #include #include struct hist_str{ long position; int hits; std::vector names; }; void clarify(std::vector & points); void detect_breakpoints(std::string filename, IPrinter *& printer); //void 
screen_for_events(Node * list,IntervallTree & bst ,TNode *&root, int cov, int lowMQ_cov,RefVector ref); bool screen_for_events(Alignment * tmp, IntervallTree & bst, TNode *&root, RefVector ref, int cov); void add_events(Alignment *& tmp, std::vector events, short type, long ref_space, IntervallTree & bst, TNode *&root,long read_id,bool add); void add_splits(Alignment *& tmp, std::vector events, short type, RefVector ref, IntervallTree & bst, TNode *&root,long read_id,bool add); void estimate_parameters(std::string read_filename); bool overlaps(aln_str prev,aln_str curr); void detect_merged_svs(Breakpoint * point); std::string TRANS_type(char type); #endif /* SUB_DETECT_BREAKPOINTS_H_ */ Sniffles-1.0.7/src/sub/IRegion.h000066400000000000000000000020351320237057600163620ustar00rootroot00000000000000/* * IRegion.h * * Created on: Aug 27, 2015 * Author: fsedlaze */ #ifndef SUB_IREGION_H_ #define SUB_IREGION_H_ #include "../Paramer.h" #include "../BamParser.h" struct read_str { //to identify std::string name; short type; //for later assessment: pair strand; char SV; // bits vector }; struct position_str { long start; long stop; //int pos; //the chromosomes are encoded over the positions. 
/*
 * BinTree.h
 *
 *  Created on: Sep 3, 2015
 *      Author: fsedlaze
 */
#ifndef TREE_BINTREE_H_
#define TREE_BINTREE_H_

// NOTE(review): the three original include names were lost in extraction;
// these are the headers the code below needs.
#include <cstddef>
#include <iostream>
#include <vector>

/** Node of a counting binary search tree keyed by an int. */
struct tree_node {
	int key; // value to store!
	int num; // times hit; 1 -> unique
	struct tree_node *left;
	struct tree_node *right;
};

/**
 * Unbalanced binary search tree that counts how often each key was inserted.
 *
 * NOTE(review): the destructor does not free the nodes. Adding that would
 * also require making the class non-copyable, so it is left unchanged here.
 */
class BinTree {
private:
	tree_node *root;
public:
	BinTree() {
		root = NULL;
	}
	~BinTree() {
	}
	void find(int item, tree_node **par, tree_node **loc);
	void insert(tree_node *tree, int value);
	void del(int key);
	void case_a(tree_node *par, tree_node *loc);
	void case_b(tree_node *par, tree_node *loc);
	void case_c(tree_node *par, tree_node *loc);
	void preorder(tree_node *ptr);
	void inorder(tree_node *ptr);
	void postorder(tree_node *ptr);
	void display(tree_node *ptr, int);
	void get_nodes(tree_node *ptr, std::vector<int> & nodes);
};

#endif /* TREE_BINTREE_H_ */

/*
 * BinTree.cpp
 *
 *  Created on: Sep 3, 2015
 *      Author: fsedlaze
 */

/** Allocate a leaf holding @p value with a hit count of one. */
static tree_node *new_tree_node(int value) {
	tree_node *node = new tree_node;
	node->key = value;
	node->num = 1;
	node->left = NULL;
	node->right = NULL;
	return node;
}

/**
 * Locate @p item.
 *
 * @param item key to search for
 * @param par  out: parent of the hit (NULL for the root or an empty tree);
 *             on a miss, the node under which the key would be attached
 * @param loc  out: node holding the key, or NULL when absent
 */
void BinTree::find(int item, tree_node **par, tree_node **loc) {
	*loc = NULL;
	*par = NULL;
	tree_node *cur = root;
	while (cur != NULL) {
		if (item == cur->key) {
			*loc = cur;
			return;
		}
		*par = cur;
		cur = (item < cur->key) ? cur->left : cur->right;
	}
}

/*
 * Inserting Element into the Tree
 *
 * @param tree  subtree to insert into. NULL means "start at the root"; the
 *              root is private, so external callers have no other handle.
 *              (Previously a NULL tree with a non-empty root was dereferenced.)
 * @param value key to add; an existing key only has its counter incremented.
 */
void BinTree::insert(tree_node *tree, int value) {
	if (root == NULL) {
		root = new_tree_node(value);
		std::cout << "Root tree_node is Added" << std::endl;
		return;
	}
	if (tree == NULL) {
		tree = root;
	}
	if (tree->key > value) {
		if (tree->left != NULL) {
			insert(tree->left, value);
		} else {
			tree->left = new_tree_node(value);
			std::cout << "tree_node Added To Left" << std::endl;
		}
	} else if (tree->key < value) {
		if (tree->right != NULL) {
			insert(tree->right, value);
		} else {
			tree->right = new_tree_node(value);
			std::cout << "tree_node Added To Right" << std::endl;
		}
	} else { // found element -> already exist!
		tree->num++;
	}
}

/*
 * Delete Element from the tree (the whole node, regardless of its counter)
 */
void BinTree::del(int key) {
	tree_node *parent, *location;
	if (root == NULL) {
		std::cout << "Tree empty" << std::endl;
		return;
	}
	find(key, &parent, &location);
	if (location == NULL) {
		std::cout << "Item not present in tree" << std::endl;
		return;
	}
	const bool has_left = location->left != NULL;
	const bool has_right = location->right != NULL;
	if (has_left && has_right) {
		case_c(parent, location);
	} else if (has_left || has_right) {
		case_b(parent, location);
	} else {
		case_a(parent, location);
	}
	delete location;
}

/*
 * Case A: unlink a leaf
 */
void BinTree::case_a(tree_node *par, tree_node *loc) {
	if (par == NULL) {
		root = NULL;
	} else if (loc == par->left) {
		par->left = NULL;
	} else {
		par->right = NULL;
	}
}

/*
 * Case B: unlink a node with exactly one child
 */
void BinTree::case_b(tree_node *par, tree_node *loc) {
	tree_node *child = (loc->left != NULL) ? loc->left : loc->right;
	if (par == NULL) {
		root = child;
	} else if (loc == par->left) {
		par->left = child;
	} else {
		par->right = child;
	}
}

/*
 * Case C: replace a node with two children by its in-order successor
 */
void BinTree::case_c(tree_node *par, tree_node *loc) {
	// The successor is the left-most node of the right subtree.
	tree_node *parsuc = loc;
	tree_node *suc = loc->right;
	while (suc->left != NULL) {
		parsuc = suc;
		suc = suc->left;
	}
	// Detach the successor first; it has at most a right child.
	if (suc->left == NULL && suc->right == NULL) {
		case_a(parsuc, suc);
	} else {
		case_b(parsuc, suc);
	}
	if (par == NULL) {
		root = suc;
	} else if (loc == par->left) {
		par->left = suc;
	} else {
		par->right = suc;
	}
	suc->left = loc->left;
	suc->right = loc->right;
}

/**
 * Append the keys below @p ptr to @p nodes in pre-order.
 *
 * NOTE(review): the body was damaged by markup stripping; the NULL guard and
 * the push_back of the key are restored from the surviving "key);" fragment
 * and the brace structure.
 */
void BinTree::get_nodes(tree_node *ptr, std::vector<int> & nodes) {
	std::cout << "get_nodes" << std::endl;
	if (ptr != NULL) {
		nodes.push_back(ptr->key);
		get_nodes(ptr->left, nodes);
		get_nodes(ptr->right, nodes);
	}
}

/*
 * Pre Order Traversal
 */
void BinTree::preorder(tree_node *ptr) {
	if (root == NULL) {
		std::cout << "Tree is empty" << std::endl;
		return;
	}
	if (ptr != NULL) {
		std::cout << ptr->key << " ";
		preorder(ptr->left);
		preorder(ptr->right);
	}
}

/*
 * In Order Traversal
 */
void BinTree::inorder(tree_node *ptr) {
	if (root == NULL) {
		std::cout << "Tree is empty" << std::endl;
		return;
	}
	if (ptr != NULL) {
		inorder(ptr->left);
		std::cout << ptr->key << " ";
		inorder(ptr->right);
	}
}

/*
 * Postorder Traversal
 */
void BinTree::postorder(tree_node *ptr) {
	if (root == NULL) {
		std::cout << "Tree is empty" << std::endl;
		return;
	}
	if (ptr != NULL) {
		postorder(ptr->left);
		postorder(ptr->right);
		std::cout << ptr->key << " ";
	}
}

/*
 * Display Tree Structure (rotated: right subtree first, indented per level)
 */
void BinTree::display(tree_node *ptr, int level) {
	if (ptr == NULL) {
		return;
	}
	display(ptr->right, level + 1);
	std::cout << std::endl;
	if (ptr == root) {
		std::cout << "Root->: ";
	} else {
		for (int i = 0; i < level; i++) {
			std::cout << " ";
		}
	}
	std::cout << ptr->key;
	display(ptr->left, level + 1);
}

/*
 * Breakpoint_Tree.h
 *
 *  Created on: Mar 28, 2016
 *      Author: fsedlaze
 */
#ifndef TREE_BREAKPOINT_TREE_H_
#define TREE_BREAKPOINT_TREE_H_

// NOTE(review): the five original include names were lost in extraction.
#include <cstddef>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

/** One breakpoint coordinate together with its reference read support. */
struct breakpoint_node {
	std::string chr;
	int position; // value to store!
	bool direction;
	int ref_support;
	breakpoint_node *left;
	breakpoint_node *right;
};

/**
 * Search tree over breakpoint positions. The tree keeps no root of its own:
 * callers own the root pointer and pass it to every operation.
 * Nodes are ordered by position only; a node with an equal position but a
 * different chromosome is stored in the LEFT subtree.
 */
class Breakpoint_Tree {
private:

public:
	Breakpoint_Tree() {
	}
	~Breakpoint_Tree() {
	}
	void find(int position, std::string chr, breakpoint_node *par, breakpoint_node *&loc);
	void insert(breakpoint_node *&tree, std::string chr, int position, bool direction);
	void del(int position, std::string chr);
	void case_a(breakpoint_node *par, breakpoint_node *loc);
	void case_b(breakpoint_node *par, breakpoint_node *loc);
	void case_c(breakpoint_node *par, breakpoint_node *loc);
	void preorder(breakpoint_node *ptr);
	void inorder(breakpoint_node *ptr);
	void postorder(breakpoint_node *ptr);
	void display(breakpoint_node *ptr, int);
	// NOTE(review): element type was stripped and no definition is visible;
	// int is assumed by analogy with BinTree::get_nodes -- confirm.
	void get_nodes(breakpoint_node *ptr, std::vector<int> & nodes);
	void overalps(int start, int stop, std::string chr, breakpoint_node *par);
	int get_ref(breakpoint_node *&tree, std::string chr, int position);
};

#endif /* TREE_BREAKPOINT_TREE_H_ */

/*
 * Breakpoint_Tree.cpp
 *
 *  Created on: Mar 28, 2016
 *      Author: fsedlaze
 */

/** Pre-order dump of a non-empty subtree (no "empty" message at leaves). */
static void print_preorder(const breakpoint_node *node) {
	if (node == NULL) {
		return;
	}
	std::cout << node->position << " ";
	print_preorder(node->left);
	print_preorder(node->right);
}

/** In-order dump of a non-empty subtree (no "empty" message at leaves). */
static void print_inorder(const breakpoint_node *node) {
	if (node == NULL) {
		return;
	}
	print_inorder(node->left);
	std::cout << node->chr << " " << node->position << " " << node->ref_support << std::endl;
	print_inorder(node->right);
}

/**
 * Look up the node for (chr, position) below @p par.
 *
 * @param loc out: the matching node, or NULL when there is none.
 *
 * An equal position with a different chromosome continues to the LEFT, which
 * is where insert() places such nodes (the search used to go right and could
 * never find them).
 */
void Breakpoint_Tree::find(int position, std::string chr, breakpoint_node *par, breakpoint_node *&loc) {
	loc = NULL;
	breakpoint_node *cur = par;
	while (cur != NULL) {
		if (position == cur->position && chr == cur->chr) { //found
			loc = cur;
			return;
		}
		cur = (position <= cur->position) ? cur->left : cur->right;
	}
}

/**
 * Credit every breakpoint on the search path that the read [start, stop]
 * spans with at least 100 bp on both sides.
 *
 * NOTE(review): large parts of this function were destroyed by markup
 * stripping. The two surviving conditions are kept verbatim; the descent test
 * "start < par->position" is a reconstruction -- confirm against upstream.
 * Only one root-to-leaf path is visited.
 */
void Breakpoint_Tree::overalps(int start, int stop, std::string chr, breakpoint_node *par) {
	//start + stop: read coordinates.
	for (breakpoint_node *cur = par; cur != NULL; cur = (start < cur->position) ? cur->left : cur->right) {
		bool spanned;
		if (cur->direction) { //start
			spanned = (cur->position - 100 > start && cur->position + 100 < stop);
		} else { //stop
			spanned = (cur->position > start + 100 && cur->position < stop - 100);
		}
		if (spanned && chr == cur->chr) { //found
			cur->ref_support++;
		}
	}
}

/*
 * Inserting Element into the Tree
 *
 * A (chr, position) pair that already exists is left untouched.
 */
void Breakpoint_Tree::insert(breakpoint_node *&tree, std::string chr, int position, bool direction) {
	if (tree == NULL) {
		tree = new breakpoint_node;
		tree->position = position;
		tree->ref_support = 0;
		tree->chr = chr;
		tree->direction = direction;
		tree->left = NULL;
		tree->right = NULL;
	} else if (tree->position > position) {
		insert(tree->left, chr, position, direction);
	} else if (tree->position < position) {
		insert(tree->right, chr, position, direction);
	} else if (chr == tree->chr) { // found element -> already exist!
		//TODO we should use this information to assess the reliability of this call!
	} else {
		// Same coordinate on another chromosome: keep it on the left.
		// (This call used to pass `position` as the direction flag.)
		insert(tree->left, chr, position, direction);
	}
}

/**
 * @return the reference support recorded for (chr, position), or -1 when the
 *         breakpoint is not in the tree.
 */
int Breakpoint_Tree::get_ref(breakpoint_node *&tree, std::string chr, int position) {
	const breakpoint_node *cur = tree;
	while (cur != NULL) {
		if (cur->position < position) {
			cur = cur->right;
		} else if (cur->position == position && chr == cur->chr) { // found element
			return cur->ref_support;
		} else {
			cur = cur->left; // smaller key, or same position on another chr
		}
	}
	return -1;
}

/*
 * Delete Element from the tree
 *
 * The class stores no root and this signature receives none, so there is no
 * tree to search. The previous body tested an uninitialised pointer
 * (undefined behaviour); it now always reports the empty tree, which is what
 * that code did whenever the stale pointer happened to be NULL.
 */
void Breakpoint_Tree::del(int position, std::string chr) {
	(void) position;
	(void) chr;
	std::cout << "Tree empty" << std::endl;
}

/*
 * Case A: unlink a leaf. Without a parent nothing can be updated, because the
 * caller's root pointer is not reachable from here.
 */
void Breakpoint_Tree::case_a(breakpoint_node *par, breakpoint_node *loc) {
	if (par == NULL) {
		return;
	}
	if (loc == par->left) {
		par->left = NULL;
	} else {
		par->right = NULL;
	}
}

/*
 * Case B: unlink a node with a single child (no-op without a parent).
 */
void Breakpoint_Tree::case_b(breakpoint_node *par, breakpoint_node *loc) {
	breakpoint_node *child = (loc->left != NULL) ? loc->left : loc->right;
	if (par == NULL) {
		return;
	}
	if (loc == par->left) {
		par->left = child;
	} else {
		par->right = child;
	}
}

/*
 * Case C: replace a node with two children by its in-order successor.
 */
void Breakpoint_Tree::case_c(breakpoint_node *par, breakpoint_node *loc) {
	breakpoint_node *parsuc = loc;
	breakpoint_node *suc = loc->right;
	while (suc->left != NULL) {
		parsuc = suc;
		suc = suc->left;
	}
	if (suc->left == NULL && suc->right == NULL) {
		case_a(parsuc, suc);
	} else {
		case_b(parsuc, suc);
	}
	if (par != NULL) {
		if (loc == par->left) {
			par->left = suc;
		} else {
			par->right = suc;
		}
	}
	suc->left = loc->left;
	suc->right = loc->right;
}

/*
 * Pre Order Traversal ("Tree is empty" is printed only for an empty tree,
 * no longer once per missing child)
 */
void Breakpoint_Tree::preorder(breakpoint_node *ptr) {
	if (ptr == NULL) {
		std::cout << "Tree is empty" << std::endl;
		return;
	}
	print_preorder(ptr);
}

/*
 * In Order Traversal ("Tree is empty" is printed only for an empty tree)
 */
void Breakpoint_Tree::inorder(breakpoint_node *ptr) {
	if (ptr == NULL) {
		std::cout << "Tree is empty" << std::endl;
		return;
	}
	print_inorder(ptr);
}

/*
 * Postorder Traversal
 */
void Breakpoint_Tree::postorder(breakpoint_node *ptr) {
	if (ptr == NULL) {
		return;
	}
	postorder(ptr->left);
	postorder(ptr->right);
	std::cout << ptr->position << " ";
}
2016 * Author: fsedlaze */ #ifndef TREE_INTERVALLCONTAINER_H_ #define TREE_INTERVALLCONTAINER_H_ class IntervallContainer { protected: public: IntervallContainer() { } virtual ~IntervallContainer() { } virtual void insert(Breakpoint * point, TNode *&)=0; virtual void get_breakpoints(TNode *p, std::vector & points)=0; virtual void clear(TNode*&)=0; virtual void print(TNode *p)=0; }; #endif /* TREE_INTERVALLCONTAINER_H_ */ Sniffles-1.0.7/src/tree/IntervallList.cpp000066400000000000000000000024211320237057600203220ustar00rootroot00000000000000/* * List.cpp * * Created on: Nov 2, 2016 * Author: fsedlaze */ #include "IntervallList.h" void IntervallList::insert(Breakpoint * point, TNode *& note) { for (size_t i = 0; i < this->breakpoints.size(); i++) { long score = this->breakpoints[i]->get_data()->overlap(point); if (score == 0) { // std::cout<<"overlap: "<breakpoints[i]->get_data()->get_coordinates().support.size()<breakpoints[i]->get_data()->add_read(point); delete point; return; }/* else if (score < 0) { TNode * p = new TNode(point); this->breakpoints.insert((this->breakpoints.begin()+i),p); return; }*/ } TNode * p = new TNode(point); this->breakpoints.push_back(p); } void IntervallList::get_breakpoints(TNode *p, std::vector & points) { for (size_t i = 0; i < this->breakpoints.size(); i++) { points.push_back(this->breakpoints[i]->get_data()); } } void IntervallList::clear(TNode*&){ this->breakpoints.clear(); } void IntervallList::print(TNode *p){ std::cout<<"Print:"<breakpoints.size(); i++) { std::cout<<"( "<breakpoints[i]->get_data()->get_coordinates().start.min_pos<<"-"<breakpoints[i]->get_data()->get_coordinates().stop.max_pos<<" )\t"; } std::cout< #include "TNode.h" #include "IntervallContainer.h" class IntervallList:public IntervallContainer { private: std::vector breakpoints; public: IntervallList(){ } ~IntervallList(){ } void insert(Breakpoint * point, TNode *&); void get_breakpoints(TNode *p,std::vector & points); void clear(TNode*&); void print(TNode *p); }; 
#endif /* TREE_INTERVALLLIST_H_ */ Sniffles-1.0.7/src/tree/IntervallTree.cpp000066400000000000000000000217141320237057600203140ustar00rootroot00000000000000/* * IntervallTree.cpp * * Created on: Jun 23, 2015 * Author: fsedlaze */ #include "IntervallTree.h" void IntervallTree::careful_screening(Breakpoint *& new_break, TNode *p) { //maybe I just need the pointer not a ref. if (p != NULL && !(new_break->get_coordinates().start.min_pos == -1 && new_break->get_coordinates().stop.max_pos == -1)) { careful_screening(new_break, p->left); if (p->get_data()->overlap(new_break) == 0) { //SV type p->get_data()->add_read(new_break); new_break->set_coordinates(-1, -1); return; } careful_screening(new_break, p->right); } } // Inserting a node void IntervallTree::insert(Breakpoint * new_break, TNode *&p) { if (new_break->get_coordinates().start.min_pos == -1 && new_break->get_coordinates().stop.max_pos == -1) { return; } if (p == NULL) { // add to tree: p = new TNode(new_break); if (p == NULL) { std::cout << "Out of Space\n" << std::endl; } } else { // find on tree: long score = p->get_data()->overlap(new_break); //comparison function if (score == 0) { //add SV types? 
p->get_data()->add_read(new_break); new_break->set_coordinates(-1, -1); //delete new_break; return; } else if (abs(score) < Parameter::Instance()->max_dist) { // if two or more events are too close: //std::cout<<"Screen"<get_coordinates().start.min_pos == -1 && new_break->get_coordinates().stop.max_pos == -1) { return; } } if (score > 0) { // go left insert(new_break, p->left); if ((bsheight(p->left) - bsheight(p->right)) == 2) { score = p->left->get_data()->overlap(new_break); if (score > 0) { p = srl(p); } else { p = drl(p); } } } else if (score < 0) { // go right insert(new_break, p->right); if ((bsheight(p->right) - bsheight(p->left)) == 2) { score = p->right->get_data()->overlap(new_break); if (score < 0) { p = srr(p); } else { p = drr(p); } } } } int m, n, d; m = bsheight(p->left); n = bsheight(p->right); d = max(m, n); p->set_height(d + 1); } void IntervallTree::insert_existant(Breakpoint * new_break, TNode *&p) { if (new_break->get_coordinates().start.min_pos == -1 && new_break->get_coordinates().stop.max_pos == -1) { return; } if (p == NULL) { // add to tree: return; } else { // find on tree: long score = p->get_data()->overlap(new_break); //comparison function if (score == 0) { //add SV types? 
p->get_data()->add_read(new_break); new_break->set_coordinates(-1, -1); //delete new_break; return; } else if (abs(score) < Parameter::Instance()->max_dist) { // if two or more events are too close: //std::cout<<"Screen"<get_coordinates().start.min_pos == -1 && new_break->get_coordinates().stop.max_pos == -1) { return; } } if (score > 0) { // go left insert_existant(new_break, p->left); if ((bsheight(p->left) - bsheight(p->right)) == 2) { score = p->left->get_data()->overlap(new_break); if (score > 0) { p = srl(p); } else { p = drl(p); } } } else if (score < 0) { // go right insert_existant(new_break, p->right); if ((bsheight(p->right) - bsheight(p->left)) == 2) { score = p->right->get_data()->overlap(new_break); if (score < 0) { p = srr(p); } else { p = drr(p); } } } } int m, n, d; m = bsheight(p->left); n = bsheight(p->right); d = max(m, n); p->set_height(d + 1); } bool IntervallTree::overlaps(long start, long stop, TNode *p) { if (p == NULL) { return false; } else { long score = p->get_data()->overlap_breakpoint(start,stop); if (score > 0) { overlaps(start,stop, p->left); } else if (score < 0) { overlaps(start,stop, p->right); } else { return true; } } } // Finding the Smallest TNode * IntervallTree::findmin(TNode * p) { if (p == NULL) { std::cout << "The tree is empty\n" << std::endl; return p; } else { while (p->left != NULL) { p = p->left; //return p; } return p; } } // Finding the Largest node TNode * IntervallTree::findmax(TNode * p) { if (p == NULL) { std::cout << "The tree is empty\n" << std::endl; return p; } else { while (p->right != NULL) { p = p->right; //return p; } return p; } } // Finding an get_value() void IntervallTree::find(Breakpoint * point, TNode * &p) { if (p == NULL) { std::cout << "Sorry! 
get_value() not found\n" << std::endl; } else { long score = p->get_data()->overlap(point); if (score > 0) { find(point, p->left); } else if (score < 0) { find(point, p->right); } else { std::cout << "get_value() found!\n" << std::endl; } } } // Copy a tree void IntervallTree::copy(TNode * &p, TNode * &p1) { clear(p1); p1 = nodecopy(p); } // Make a tree empty void IntervallTree::clear(TNode * &p) { TNode * d; if (p != NULL) { clear(p->left); clear(p->right); d = p; free(d); p = NULL; } } // Copy the nodes TNode * IntervallTree::nodecopy(TNode * &p) { TNode * temp; if (p == NULL) { return p; } else { temp = new TNode(p->get_data()); //TODO! temp->left = nodecopy(p->left); temp->right = nodecopy(p->right); return temp; } } // Deleting a node void IntervallTree::del(Breakpoint * point, TNode * &p) { TNode * d; if (p == NULL) { std::cout << "Sorry! get_value() not found\n" << std::endl; } else { long score = p->get_data()->overlap(point); if (score > 0) { del(point, p->left); } else if (score < 0) { del(point, p->right); } else if ((p->left == NULL) && (p->right == NULL)) { d = p; free(d); p = NULL; std::cout << "get_value() deleted successfully\n" << std::endl; } else if (p->left == NULL) { d = p; free(d); p = p->right; std::cout << "get_value() deleted successfully\n" << std::endl; } else if (p->right == NULL) { d = p; p = p->left; free(d); std::cout << "get_value() deleted successfully\n" << std::endl; } else { //p->set_value(deletemin(p->right)); } } } int IntervallTree::deletemin(TNode * &p) { int c; std::cout << "inside deltemin\n" << std::endl; if (p->left == NULL) { //c = p->get_value(); p = p->right; return c; } else { c = deletemin(p->left); return c; } } void IntervallTree::preorder(TNode * p) { if (p != NULL) { //std::cout << p->get_data()->to_string() << "\t"; preorder(p->left); preorder(p->right); } } void IntervallTree::get_breakpoints(TNode *p, std::vector & points) { if (p != NULL) { get_breakpoints(p->right, points); //std::cout << "( " << 
p->get_data()->get_coordinates().start.min_pos << "-" << p->get_data()->get_coordinates().stop.max_pos << " "<< p->get_data()->get_coordinates().support.size()<<" )"<get_data()); get_breakpoints(p->left, points); } } // Inorder Printing void IntervallTree::inorder(TNode * p) { if (p != NULL) { inorder(p->left); std::cout << p->get_data()->to_string() << endl; inorder(p->right); } } void IntervallTree::print(TNode *p) { if (p != NULL) { print(p->left); std::string msg = p->get_data()->to_string(); if (msg.size() > 3) { std::cout << msg << endl; } //std::cout << "( " << p->get_data()->get_coordinates().start.min_pos << "-" << p->get_data()->get_coordinates().stop.max_pos << " "<< p->get_data()->get_coordinates().support.size()<<" )"<right); } } // PostOrder Printing void IntervallTree::postorder(TNode * p) { if (p != NULL) { postorder(p->left); postorder(p->right); //std::cout << p->get_data()->to_string() << "\t"; } } int IntervallTree::max(int value1, int value2) { return ((value1 > value2) ? 
value1 : value2); } int IntervallTree::bsheight(TNode * p) { int t; if (p == NULL) { return -1; } else { t = p->get_height(); return t; } } TNode * IntervallTree::srl(TNode * &p1) { TNode * p2; p2 = p1->left; p1->left = p2->right; p2->right = p1; p1->set_height(max(bsheight(p1->left), bsheight(p1->right)) + 1); p2->set_height(max(bsheight(p2->left), p1->get_height()) + 1); return p2; } TNode * IntervallTree::srr(TNode * &p1) { TNode * p2; p2 = p1->right; p1->right = p2->left; p2->left = p1; p1->set_height(max(bsheight(p1->left), bsheight(p1->right)) + 1); p2->set_height(max(p1->get_height(), bsheight(p2->right)) + 1); return p2; } TNode * IntervallTree::drl(TNode * &p1) { p1->left = srr(p1->left); return srl(p1); } TNode * IntervallTree::drr(TNode * &p1) { p1->right = srl(p1->right); return srr(p1); } int IntervallTree::nonodes(TNode * p) { int count = 0; if (p != NULL) { nonodes(p->left); nonodes(p->right); count++; } return count; } void IntervallTree::collapse_intervalls(TNode *&p) { std::cout << "\t Collapse" << std::endl; TNode * new_root = NULL; std::vector points; get_breakpoints(p, points); for (size_t i = 0; i < points.size(); i++) { if (points[i]->get_support() > Parameter::Instance()->min_support) { //std::cout << "\tpoints: " << points[i]->to_string(ref) << std::endl; this->insert(points[i], new_root); } } this->clear(p); p = new_root; } Sniffles-1.0.7/src/tree/IntervallTree.h000066400000000000000000000022501320237057600177530ustar00rootroot00000000000000/* * IntervallTree.h * * Created on: Jun 23, 2015 * Author: fsedlaze */ #ifndef TREE_INTERVALLTREE_H_ #define TREE_INTERVALLTREE_H_ #include #include "TNode.h" #include "IntervallContainer.h" class IntervallTree:public IntervallContainer { private: int max(int, int); TNode * srl(TNode *&); TNode * drl(TNode *&); TNode * srr(TNode *&); TNode * drr(TNode *&); void careful_screening(Breakpoint *& new_break, TNode *p); public: void insert(Breakpoint * point, TNode *&); void insert_ref(Breakpoint * point, 
TNode *&); void insert_existant(Breakpoint * new_break, TNode *&p); void del(Breakpoint * point, TNode *&); int deletemin(TNode *&); void find(Breakpoint * point, TNode *&); bool overlaps(long start, long stop,TNode *p); TNode * findmin(TNode*); TNode * findmax(TNode*); void clear(TNode *&); void copy(TNode * &, TNode *&); TNode * nodecopy(TNode *&); void preorder(TNode*); void inorder(TNode * p); void postorder(TNode*); int bsheight(TNode*); void get_breakpoints(TNode *p,std::vector & points); int nonodes(TNode*); void collapse_intervalls(TNode *&p); void print(TNode *p); }; #endif /* TREE_INTERVALLTREE_H_ */ Sniffles-1.0.7/src/tree/Intervall_bed.cpp000066400000000000000000000117441320237057600203100ustar00rootroot00000000000000/* * Intervall_bed.cpp * * Created on: Feb 4, 2016 * Author: fsedlaze */ #include "Intervall_bed.h" // Inserting a node void IntervallTree_bed::insert(long start, long stop, Leaf *&p) { if (p == NULL) { p = new Leaf(start, stop); if (p == NULL) { std::cout << "Out of Space\n" << std::endl; } } else { long score = p->overlap(start, stop); //comparison function if (score > 0) { insert(start, stop, p->left); if ((bsheight(p->left) - bsheight(p->right)) == 2) { score = p->left->overlap(start, stop); if (score > 0) { p = srl(p); } else { p = drl(p); } } } else if (score < 0) { insert(start, stop, p->right); if ((bsheight(p->right) - bsheight(p->left)) == 2) { score = p->right->overlap(start, stop); if (score < 0) { p = srr(p); } else { p = drr(p); } } } else { //overlaps! 
std::cerr << "Two regions overlap and are thus ignored:" << std::endl; } } int m, n, d; m = bsheight(p->left); n = bsheight(p->right); d = max(m, n); p->set_height(d + 1); } // Finding the Smallest Leaf * IntervallTree_bed::findmin(Leaf * p) { if (p == NULL) { return p; } else { while (p->left != NULL) { p = p->left; //return p; } return p; } } // Finding the Largest node Leaf * IntervallTree_bed::findmax(Leaf * p) { if (p == NULL) { return p; } else { while (p->right != NULL) { p = p->right; //return p; } return p; } } // Finding an get_value() bool IntervallTree_bed::is_in(long position, Leaf * &p) { if (p == NULL) { return false; } else { long score = p->overlap(position); if (score > 0) { is_in(position, p->left); } else if (score < 0) { is_in(position, p->right); } else { return true; } } } // Copy a tree void IntervallTree_bed::copy(Leaf * &p, Leaf * &p1) { makeempty(p1); p1 = nodecopy(p); } // Make a tree empty void IntervallTree_bed::makeempty(Leaf * &p) { Leaf * d; if (p != NULL) { makeempty(p->left); makeempty(p->right); d = p; free(d); p = NULL; } } // Copy the nodes Leaf * IntervallTree_bed::nodecopy(Leaf * &p) { Leaf * temp; if (p == NULL) { return p; } else { temp = new Leaf(p->get_start(), p->get_stop()); //TODO! temp->left = nodecopy(p->left); temp->right = nodecopy(p->right); return temp; } } // Deleting a node void IntervallTree_bed::del(long start, long stop, Leaf * &p) { Leaf * d; if (p == NULL) { std::cout << "Sorry! 
get_value() not found\n" << std::endl; } else { long score = p->overlap(start, stop); if (score > 0) { del(start, stop, p->left); } else if (score < 0) { del(start, stop, p->right); } else if ((p->left == NULL) && (p->right == NULL)) { d = p; free(d); p = NULL; std::cout << "get_value() deleted successfully\n" << std::endl; } else if (p->left == NULL) { d = p; free(d); p = p->right; std::cout << "get_value() deleted successfully\n" << std::endl; } else if (p->right == NULL) { d = p; p = p->left; free(d); std::cout << "get_value() deleted successfully\n" << std::endl; } else { //p->set_value(deletemin(p->right)); } } } int IntervallTree_bed::deletemin(Leaf * &p) { int c; std::cout << "inside deltemin\n" << std::endl; if (p->left == NULL) { //c = p->get_value(); p = p->right; return c; } else { c = deletemin(p->left); return c; } } void IntervallTree_bed::preorder(Leaf * p) { if (p != NULL) { //std::cout << p->get_data()->to_string() << "\t"; preorder(p->left); preorder(p->right); } } // Inorder Printing void IntervallTree_bed::inorder(Leaf * p, Leaf * root) { if (p != NULL) { inorder(p->left, root); //std::cout << p->get_data()->to_string(); if (p == root) { std::cout << "*\t"; } else { std::cout << "\t"; } inorder(p->right, root); } } // PostOrder Printing void IntervallTree_bed::postorder(Leaf * p) { if (p != NULL) { postorder(p->left); postorder(p->right); std::cout << p->get_start()<<" "<get_stop()<< "\t"; } } int IntervallTree_bed::max(int value1, int value2) { return ((value1 > value2) ? 
value1 : value2); } int IntervallTree_bed::bsheight(Leaf * p) { int t; if (p == NULL) { return -1; } else { t = p->get_height(); return t; } } Leaf * IntervallTree_bed::srl(Leaf * &p1) { Leaf * p2; p2 = p1->left; p1->left = p2->right; p2->right = p1; p1->set_height(max(bsheight(p1->left), bsheight(p1->right)) + 1); p2->set_height(max(bsheight(p2->left), p1->get_height()) + 1); return p2; } Leaf * IntervallTree_bed::srr(Leaf * &p1) { Leaf * p2; p2 = p1->right; p1->right = p2->left; p2->left = p1; p1->set_height(max(bsheight(p1->left), bsheight(p1->right)) + 1); p2->set_height(max(p1->get_height(), bsheight(p2->right)) + 1); return p2; } Leaf * IntervallTree_bed::drl(Leaf * &p1) { p1->left = srr(p1->left); return srl(p1); } Leaf * IntervallTree_bed::drr(Leaf * &p1) { p1->right = srl(p1->right); return srr(p1); } int IntervallTree_bed::nonodes(Leaf * p) { int count = 0; if (p != NULL) { nonodes(p->left); nonodes(p->right); count++; } return count; } Sniffles-1.0.7/src/tree/Intervall_bed.h000066400000000000000000000014601320237057600177470ustar00rootroot00000000000000/* * Intervall_bed.h * * Created on: Feb 4, 2016 * Author: fsedlaze */ #ifndef TREE_INTERVALL_BED_H_ #define TREE_INTERVALL_BED_H_ #include "Leaf.h" #include #include "../Paramer.h" class IntervallTree_bed { private: int max(int, int); Leaf * srl(Leaf *&); Leaf * drl(Leaf *&); Leaf * srr(Leaf *&); Leaf * drr(Leaf *&); public: void insert(long start, long stop, Leaf *&); int deletemin(Leaf *&); bool is_in(long pos, Leaf *&); //true if found Leaf * findmin(Leaf*); Leaf * findmax(Leaf*); void makeempty(Leaf *&); void copy(Leaf * &, Leaf *&); Leaf * nodecopy(Leaf *&); void preorder(Leaf*); void inorder(Leaf*, Leaf * root); void postorder(Leaf*); int bsheight(Leaf*); int nonodes(Leaf*); void del(long start, long stop, Leaf * &p); }; #endif /* TREE_INTERVALL_BED_H_ */ Sniffles-1.0.7/src/tree/Leaf.h000066400000000000000000000023451320237057600160470ustar00rootroot00000000000000/* * Leaf.h * * Created on: Feb 4, 
2016 * Author: fsedlaze */ #ifndef TREE_LEAF_H_ #define TREE_LEAF_H_ #include "../Paramer.h" #include #include #include class Leaf { private: long start; long stop; int height; void init() { height = 0; this->parent = NULL; this->left = NULL; this->right = NULL; } public: Leaf * parent; Leaf * left; Leaf * right; Leaf(long start, long stop) { this->start = start; this->stop = stop; init(); } int overlap(long position) { if (abs(position - get_start()) < Parameter::Instance()->max_dist && abs(position - get_stop()) < Parameter::Instance()->max_dist) { return 0; } return (position - start); //((start < position) && (stop > position)); } int overlap(long start, long stop) { if (abs(start - get_start()) < Parameter::Instance()->max_dist && abs(stop - get_stop()) < Parameter::Instance()->max_dist) { return 0; } //as abstraction lets try the start+stop coordinate! return (start - get_start()); // + (stop-get_stop()); } long get_start() { return start; } long get_stop() { return stop; } int get_height() { return height; } void set_height(int val) { this->height = val; } }; #endif /* TREE_LEAF_H_ */ Sniffles-1.0.7/src/tree/TNode.h000066400000000000000000000014231320237057600162050ustar00rootroot00000000000000/* * TNode.h * * Created on: Jun 23, 2015 * Author: fsedlaze */ #ifndef TREE_TNODE_H_ #define TREE_TNODE_H_ #include #include #include #include "../sub/Breakpoint.h" //#include "TNode.h" class TNode { private: Breakpoint * data; //int value; int height; int MAX_DIST; void init() { this->parent = NULL; this->left = NULL; this->right = NULL; MAX_DIST=500; } public: TNode * parent; TNode * left; TNode * right; TNode() { height=0; init(); this->data=NULL; } TNode(Breakpoint * point) { init(); this->data=point; height=0; } ~TNode() { } Breakpoint * get_data() { return data; } int get_height() { return height; } void set_height(int val) { this->height = val; } }; #endif /* TREE_TNODE_H_ */