./Source/0000755011075700120610000000000011720661041012371 5ustar yanghochmath-ar./Source/AlignmentsQ.cpp0000644011075700120610000001653611720654362015342 0ustar yanghochmath-ar#include "stdafx.h" #include "AlignmentsQ.h" const unsigned int CAlignmentsQ::NULL_RECORD = std::numeric_limits::max(); const unsigned short CAlignmentsQ::NULL_EDIT_DIS = std::numeric_limits::max(); CAlignmentsQ::CAlignmentsQ(unsigned int iMaxCapacity) { this->initialization(iMaxCapacity); } CAlignmentsQ::CAlignmentsQ(char cFlag_of_Queue_All_Best_One, unsigned int iMaxCapacity) { this->initialization(iMaxCapacity); this->cFlag_of_Queue_All_Best_One = cFlag_of_Queue_All_Best_One; } CAlignmentsQ::~CAlignmentsQ(void) { delete [] this->aiHitIndex; this->aiHitIndex = NULL; delete [] this->asdiff; this->asdiff = NULL; } void CAlignmentsQ::setQueue_All_Best_OneFlag(char cFlag_of_Queue_All_Best_One) { //Simply a public function used to set the flag option to queue the best set or all or one alignment this->cFlag_of_Queue_All_Best_One = cFlag_of_Queue_All_Best_One; } char CAlignmentsQ::returnQueue_All_Best_OneFlag() { return(this->cFlag_of_Queue_All_Best_One); } inline void CAlignmentsQ::pushHits(unsigned int startindex, unsigned short diff) { if (this->load < this->iMaxCapacity) { this->aiHitIndex[this->load] = startindex; this->asdiff[this->load] = diff; this->load++; } else { // if the buffer is overflow. 
// cout << "Alignment Queue overflow" << endl; } } // return true if there is a same hit bool CAlignmentsQ::checkHits(unsigned int startindex) { for (unsigned int i = 0; i < this->load && i < this->iMaxCapacity; i++) { // If the maping is in the record, return(true); if (this->aiHitIndex[i] == startindex) { return(true); } } return(false); } // replace the record with the largest diff int CAlignmentsQ::replaceHits(unsigned int startindex, unsigned short diff) { int candidateId = -1; unsigned short candidateDiff = this->MinDiff; for (unsigned int i = 0; i < this->load && i < this->iMaxCapacity; i++) { if (diff < this->asdiff[i]) { if (candidateDiff < asdiff[i]) { candidateId = i; } } } if (candidateId >= 0) { // replace the worse record this->aiHitIndex[candidateId] = startindex; this->asdiff[candidateId] = diff; return(candidateId); } return(-1); } unsigned int CAlignmentsQ::saveHits(unsigned int startindex, unsigned short diff) { bool recordIsAsGoodOrBetter = (diff <= this->MinDiff); bool saveAllMapping = (this->cFlag_of_Queue_All_Best_One == 'A'); if ( recordIsAsGoodOrBetter || saveAllMapping) { if (diff < this->MinDiff) { // If the new alignment is better this->MinDiff = diff; if (this->cFlag_of_Queue_All_Best_One == 'B') { this->load = 0; //NOT this->clearHits(); this->ForwardAlignmentLoad = 0; } // Definition of non-ambiguous is having a unique best mapping. this->AmbiguousFlag = false; } else { // linear check if the record has occured. (Buttlenect?) 
if (checkHits(startindex)) { return(this->load); } // This is put at the end to avoid duplicate record if (diff == this->MinDiff) { this->AmbiguousFlag = true; } } //(3) save the alignment in the queue bool qIsFull = (this->load >= this->iMaxCapacity); if (saveAllMapping && qIsFull) { this->replaceHits(startindex, diff); } else if (!qIsFull) { this->pushHits(startindex, diff); } } return(load); } int CAlignmentsQ::initialization(unsigned int MAX_Q_CAPACITY) { this->iMaxCapacity = MAX_Q_CAPACITY; this->aiHitIndex = new unsigned int [MAX_Q_CAPACITY + 1]; this->asdiff = new unsigned short [MAX_Q_CAPACITY + 1]; this->load = 0; this->clearHits(); // The default setting is to queue the best set (could be more than one) alignment this->cFlag_of_Queue_All_Best_One = 'B'; for (unsigned int i = 0; i < this->iMaxCapacity; i++) { this->aiHitIndex[i] = NULL_RECORD; this->asdiff[i] = NULL_EDIT_DIS; } this->qualityScores = NULL; this->readID = 0; this->tag[0] = '\0'; return (0); } int CAlignmentsQ::clearHits() { for (unsigned int i = 0; i < min(this->load, this->iMaxCapacity); i++) { this->aiHitIndex[i] = NULL_RECORD; this->asdiff[i] = NULL_EDIT_DIS; } /* Marked this to increase speed */ this->load = 0; this->ForwardAlignmentLoad = 0; this->MinDiff = MAX_READ_LENGTH; this->AmbiguousFlag = false; this->reverseIsBetter = false; return(0); } int CAlignmentsQ::sortHitsByLocation() { // Sort according to distance vector< pair > v; for (unsigned int i = 0; i < this->ForwardAlignmentLoad; i++) { v.push_back(pair(this->aiHitIndex[i], this->asdiff[i])); } std::sort(v.begin(), v.end()); for (unsigned int i = 0; i < this->ForwardAlignmentLoad; i++) { this->aiHitIndex[i] = v.at(i).first; this->asdiff[i] = (unsigned short)(v.at(i).second); } vector< pair > w; for (unsigned int i = this->ForwardAlignmentLoad; i < this->load; i++) { w.push_back(pair(this->aiHitIndex[i], this->asdiff[i])); } std::sort(w.begin(), w.end()); for (unsigned int i = this->ForwardAlignmentLoad; i < this->load; i++) { 
this->aiHitIndex[i] = w.at(i).first; this->asdiff[i] = (unsigned short)w.at(i).second; } return(0); } int CAlignmentsQ::filterAlignments(unsigned int mismatchThreshold, bool bKeepAllAlignmentsInThreshold) { int noOfMinMisMapping = 0; if (this->MinDiff > mismatchThreshold) { this->clearHits(); } else { unsigned int i, j; // move record from i to j for (i = 0, j = 0; i < this->load; i++) { if (i == this->ForwardAlignmentLoad) { this->ForwardAlignmentLoad = j; } bool isMinMisMapping = ((unsigned int)this->asdiff[i] == this->MinDiff); if (isMinMisMapping) { noOfMinMisMapping++; } if (isMinMisMapping || (bKeepAllAlignmentsInThreshold\ && (unsigned int) this->asdiff[i] <= mismatchThreshold)) { this->aiHitIndex[j] = this->aiHitIndex[i]; this->asdiff[j] = this->asdiff[i]; j++; } } this->load = j; this->AmbiguousFlag = (noOfMinMisMapping > 1); } return(this->load); } //Simply to a linear search in the queue to find the best alignment and return the genome InDex unsigned int CAlignmentsQ::topHitsinList(void) { unsigned short mindiff = NULL_EDIT_DIS;//Check it again unsigned int bestHitsIndex = 0; unsigned int i; for (i = 0; i < this->load; i++) { if (mindiff > this->asdiff[i]) { mindiff = this->asdiff[i]; bestHitsIndex = i; this->AmbiguousFlag = false; } else if (mindiff == this->asdiff[i]) this->AmbiguousFlag = true; } if (mindiff < MAXTOLERATSUBMIS) { this->reverseIsBetter = (bestHitsIndex >= this->ForwardAlignmentLoad); return(this->aiHitIndex[bestHitsIndex]); } else return(NULL_RECORD);//(Confuse between bad kmer and not found) } ./Source/ChrIndex2GeneName.cpp0000644011075700120610000000251311720654362016274 0ustar yanghochmath-ar#include "stdafx.h" #include "ChrIndex2GeneName.h" CGene::CGene(void) { ; } CGene::~CGene(void) { ; } CGene::CGene(string name, unsigned int startIndex) { this->name = name; this->startIndex = startIndex; this->isValid = true; } CGene::CGene(string name, unsigned int startIndex, bool isValid) { this->name = name; this->startIndex = startIndex; 
this->isValid = isValid; } bool CGene::operator<(const CGene &other) const { return (this->startIndex < other.startIndex); } bool CGene::operator==(const CGene &other) const { return(other.name == this->name); } ChrIndex2GeneName::ChrIndex2GeneName(void) { this->table.clear(); } ChrIndex2GeneName::~ChrIndex2GeneName(void) { } int ChrIndex2GeneName::insert(string name, unsigned int startIndex) { CGene g(name, startIndex); this->table.push_back(g); return((int)this->table.size()); } CGene ChrIndex2GeneName::query(unsigned int chrIndex) { CGene g("search", chrIndex); vector::iterator gIt = upper_bound(this->table.begin(), this->table.end(), g); if (gIt >= (this->table.begin() + 1) && (gIt - 1)->startIndex <= chrIndex) { // the gene before upper bound return(CGene((gIt - 1)->name, chrIndex - (gIt - 1)->startIndex, true)); } else { return(CGene(string("Null_Region:"), chrIndex, false)); } } ./Source/ChromosomeInBits.cpp0000644011075700120610000001011211720654362016324 0ustar yanghochmath-ar#include "stdafx.h" #include "chromosomeInBits.h" CChromosomeInBits::CChromosomeInBits(void) { } CChromosomeInBits::~CChromosomeInBits(void) { delete [] this->pLowerBits; delete [] this->pUpperBits; // don't delete this->caChromosome. } CChromosomeInBits::CChromosomeInBits(char* caChromosome, unsigned int uiChrLength) { this->caChromosome = caChromosome; this->uiChrLength = uiChrLength; this->uiChrLengthInWordSize = (uiChrLength - 1) / wordSize + 1; this->pLowerBits = new WORD_SIZE[uiChrLengthInWordSize]; this->pUpperBits = new WORD_SIZE[uiChrLengthInWordSize]; memset(pLowerBits, 0x00, uiChrLengthInWordSize); memset(pUpperBits, 0x00, uiChrLengthInWordSize); //TODO, find a better way to interpret the non ACGT character //If there are non-ACGT characters in caChromosome. 
The memory will still set to A unsigned int i; for (i = 0; i + 1 < this->uiChrLengthInWordSize; i++) { //The first this->uiChrLengthInWordSize - 1 word are fully encoded encodeRead(&caChromosome[i * wordSize], wordSize, &this->pUpperBits[i], &this->pLowerBits[i]); } encodeRead(&caChromosome[i * wordSize], uiChrLength - wordSize * i, &this->pUpperBits[i], &this->pLowerBits[i]); } int CChromosomeInBits::initialization() { this->pLowerBits = NULL; this->pUpperBits = NULL; this->caChromosome = NULL; caSubstring[0] = '\0'; return (0); } /* Note the chromosome encoding is NOT continuous. The more significant bits of each words encodes * nucleotides in front of those less significant bits. This encodeing make bits incontinuously mapped * to chromosome index. Note the first bit of each WORD represent the first base of each section */ CReadInBits CChromosomeInBits::getSubstringInBits(unsigned int uiGenomeIndex) { CReadInBits r; unsigned int indexInWords = uiGenomeIndex / wordSize; unsigned int bitsShits = uiGenomeIndex % wordSize; if (this->uiChrLengthInWordSize > indexInWords) { r.UpperBits = this->pUpperBits[indexInWords] >> bitsShits; r.LowerBits = this->pLowerBits[indexInWords] >> bitsShits; if (bitsShits != 0) { r.UpperBits |= (this->pUpperBits[indexInWords + 1] << (wordSize - bitsShits)); r.LowerBits |= (this->pLowerBits[indexInWords + 1] << (wordSize - bitsShits)); } } else cout << "Warning, wrong chromosome index " << endl; return(r); } // eliminate the tail bits out of read length range CReadInBits CChromosomeInBits::getSubstringInBits(unsigned int uiGenomeIndex, unsigned int uiSubstringLength) { CReadInBits r; unsigned int indexInWords = uiGenomeIndex / wordSize; unsigned int bitsShits = uiGenomeIndex % wordSize; r.UpperBits = this->pUpperBits[indexInWords] >> bitsShits; r.LowerBits = this->pLowerBits[indexInWords] >> bitsShits; if (bitsShits != 0) { r.UpperBits |= (this->pUpperBits[indexInWords + 1] << (wordSize - bitsShits)); r.LowerBits |= 
(this->pLowerBits[indexInWords + 1] << (wordSize - bitsShits)); } unsigned int elimatedBitsNo = wordSize - uiSubstringLength; r.UpperBits <<= elimatedBitsNo; r.LowerBits <<= elimatedBitsNo; r.UpperBits >>= elimatedBitsNo; r.LowerBits >>= elimatedBitsNo; return (r); } char* CChromosomeInBits::getSubstring(unsigned int uiGenomeIndex) { CReadInBits r = getSubstringInBits(uiGenomeIndex); decodeRead(caSubstring, wordSize, r.UpperBits, r.LowerBits); return (caSubstring); } char* CChromosomeInBits::getSubstring(unsigned int uiGenomeIndex, unsigned int uiSubstringLength) { CReadInBits r = getSubstringInBits(uiGenomeIndex); decodeRead(caSubstring, wordSize, r.UpperBits, r.LowerBits); if (uiSubstringLength <= wordSize) { caSubstring[uiSubstringLength] = '\0'; } if (uiGenomeIndex + uiSubstringLength > uiChrLength) { if (uiChrLength > uiGenomeIndex) caSubstring[uiChrLength - uiGenomeIndex] = '\0'; else { cout << "Warning, Wrong genome index " << endl; caSubstring[0] = '\0'; } } return (caSubstring); } ./Source/ColorSpaceRead.cpp0000644011075700120610000003744511720654362015750 0ustar yanghochmath-ar#include "stdafx.h" #include "ColorSpaceRead.h" // private function WORD_SIZE colors2Bases(WORD_SIZE readInColors) { WORD_SIZE readInBases = readInColors & 0x01; //Copy the first bit WORD_SIZE mask = 0x01; readInColors >>= 0x01; //shift the first encoded base bit for (int i = 0; i < CReadInBits::iReadLength; i++) { //get the ith digit and do the x WORD_SIZE nextBase = (readInBases ^ readInColors) & mask; mask <<= 0x01; readInBases += (nextBase << 0x01); } return(readInBases); } // Assum the first bits is encoded in readInColor CReadInBits colors2Bases(CReadInBits readInColors) { CReadInBits readInBases; readInBases.UpperBits = colors2Bases(readInColors.UpperBits); readInBases.LowerBits = colors2Bases(readInColors.LowerBits); return (readInBases); } int getSNPtype(CReadInBits readInColors, CReadInBits refInColors) { readInColors = readInColors.getPrefixStr((unsigned 
int)CReadInBits::iReadLength); refInColors = refInColors.getPrefixStr((unsigned int)CReadInBits::iReadLength); // compare only the first readlength bits WORD_SIZE upperBitsDiff = readInColors.UpperBits ^ refInColors.UpperBits; WORD_SIZE lowerBitsDiff = readInColors.LowerBits ^ refInColors.LowerBits; WORD_SIZE a = upperBitsDiff | lowerBitsDiff; int diff; // magic function to caculate how many ones are there #ifdef __GNUC__ // #ifdef AMD diff = __builtin_popcountll(a); #else for (diff = 0; a; diff++) { a &= a - 1; // clear the least significant bit set } #endif // check SNP type case if (diff >= 2 && diff <= 3) { // (1) Complement Type of SNP, (A <-> T, C<->G) // Indicated by two consecutive of color changed R <-> B or Y <-> G WORD_SIZE diffStr = upperBitsDiff & lowerBitsDiff; WORD_SIZE snpFlag = diffStr & (diffStr >> 1); // Note the first bit set is the position of SNP if (snpFlag) { return (-1); } // (2) Transversion Type of SNP, (A <-> C, G <-> T) // Indicated by two consecutive of color changed R <-> Y or B <-> G diffStr = ~upperBitsDiff & lowerBitsDiff; snpFlag = diffStr & (diffStr >> 1); if (snpFlag) { return (-2); } // (3) Transition Type of SNP, (A <-> G, C <-> T) // Indicated by two consecutive of color changed B <-> Y or R <-> G diffStr = upperBitsDiff & ~lowerBitsDiff; snpFlag = diffStr & (diffStr >> 1); if (snpFlag) { return (-3); } } return(diff); } bool encodeColorsNas3(const char* colorsStr, CReadInBits& readInColors) { const WORD_SIZE bit = 0x01; readInColors.LowerBits = 0; readInColors.UpperBits = 0; setFirstBase(base2Color(colorsStr[0], colorsStr[1]), readInColors); for (int i = 2; ; i++) { switch (colorsStr[i]) { case '0': break; case '1': readInColors.LowerBits += (bit << (i - 1)); break; case '2': readInColors.UpperBits += (bit << (i - 1)); break; case '3': readInColors.UpperBits += (bit << (i - 1)); readInColors.LowerBits += (bit << (i - 1)); break; case 'N': case '.': // encode unknown as 3 printf("Warning! Encode '.' 
in %s as color '3'\n", colorsStr); readInColors.UpperBits += (bit << (i - 1)); readInColors.LowerBits += (bit << (i - 1)); break; case '\0': return (true); default: return (false); } } } bool encodeColors(const char* colorsStr, CReadInBits& readInColors) { bool bNormalRead = true; const WORD_SIZE bit = 0x01; readInColors.LowerBits = 0; readInColors.UpperBits = 0; if (!setFirstBase(base2Color(colorsStr[0], colorsStr[1]), readInColors)) { bNormalRead = false; } if (colorsStr[1] == '.') { return(false); } for (int i = 2; ; i++) { switch (colorsStr[i]) { case '0': break; case '1': readInColors.LowerBits += (bit << (i - 1)); break; case '2': readInColors.UpperBits += (bit << (i - 1)); break; case '3': readInColors.UpperBits += (bit << (i - 1)); readInColors.LowerBits += (bit << (i - 1)); break; case '\0': return (bNormalRead); default: bNormalRead = false; break; } } return(bNormalRead); } /* * This funciton translate a color string in CReadInBits to a string start with ACGT followed by 0123 */ char* decodeColors(char* colorsStr, CReadInBits readInColors) { int i; for (i = 0; i < CReadInBits::iReadLength; i++) { WORD_SIZE c = (readInColors.UpperBits & 0x01) << 1 | (readInColors.LowerBits & 0x01); if (i == 0) { switch (c) { case 0x00: colorsStr[0] = 'A'; break; case 0x01: colorsStr[0] = 'C'; break; case 0x02: colorsStr[0] = 'G'; break; case 0x03: colorsStr[0] = 'T'; break; default: colorsStr[0] = 'N'; } } else { colorsStr[i] = '0' + (char)(c); } readInColors.LowerBits >>= 1; readInColors.UpperBits >>= 1; } colorsStr[i] = '\0'; return(colorsStr); } /* * This funciton translate a color string to pure string of 0123 */ char* decodePureColors(char* colorsStr, CReadInBits readInColors) { int i; for (i = 0; i < CReadInBits::iReadLength; i++) { WORD_SIZE upperBit = (readInColors.UpperBits & 0x01); WORD_SIZE lowerBit = (readInColors.LowerBits & 0x01); WORD_SIZE c = upperBit * 2 + lowerBit; colorsStr[i] = '0' + (char)(c); readInColors.LowerBits >>= 1; readInColors.UpperBits >>= 
1; } colorsStr[i] = '\0'; return(colorsStr); } char* decodeLongColors(char* colorsStr, CReadInBits readInColors1stHalf, CReadInBits readInColors2ndHalf, bool oddReadLength) { int secondHalfStart = 0; if (oddReadLength) { secondHalfStart = CReadInBits::iReadLength - 1; } else { secondHalfStart = CReadInBits::iReadLength; } decodeColors(colorsStr, readInColors1stHalf); //decodeColors(&colorsStr[secondHalfStart], readInColors2ndHalf); decodePureColors(&colorsStr[secondHalfStart], readInColors2ndHalf); // TODO !!! The middle bit return(colorsStr); } // Correct the single color mismatch and adopted the valid SNP color // return number of type of SNP it involved. int correctReadInColorSpace(CReadInBits readInColors, CReadInBits refInColors, CReadInBits& correctedRead) { // printBitsStr(readInColors, CReadInBits::iReadLength);// DEBUG // printBitsStr(refInColors, CReadInBits::iReadLength); // DEBUG // compare only the first read-length bits readInColors = readInColors.getPrefixStr((unsigned int)CReadInBits::iReadLength); refInColors = refInColors.getPrefixStr((unsigned int)CReadInBits::iReadLength); correctedRead = refInColors; // Default is the same WORD_SIZE upperBitsDiff = readInColors.UpperBits ^ refInColors.UpperBits; WORD_SIZE lowerBitsDiff = readInColors.LowerBits ^ refInColors.LowerBits; WORD_SIZE d = upperBitsDiff | lowerBitsDiff; //bits string indicating which bits are different int diff; #ifdef AMD diff = __builtin_popcountll(d); // magic function to caculate how many ones are there #else for (diff = 0; d; diff++) { d &= d - 1; // clear the least significant bit set } #endif WORD_SIZE lastBit = SHIFT_LEFT(0x01, (CReadInBits::iReadLength - 1)); WORD_SIZE diffStr = upperBitsDiff | lowerBitsDiff; WORD_SIZE snpBits = diffStr & (diffStr >> 1); if (snpBits == 0) { // no consecutive mismatches if (diffStr == lastBit) { correctedRead = correctReadInColorSpace(readInColors, refInColors, lastBit); } return(diff); } else if (snpBits & (snpBits >> 1)) { correctedRead = 
refInColors; return(-1); // Three consecutive mismatches } else { WORD_SIZE replacedBits = 0x00; { // (1) Complement Type of SNP, (A <-> T, C<->G) WORD_SIZE diffStr = upperBitsDiff & lowerBitsDiff; WORD_SIZE snpBits = diffStr & (diffStr >> 1); replacedBits |= ( snpBits | (snpBits << 1) ); } // Indicated by two consecutive of color changed R <-> B or Y <-> G { // (2) Transversion Type of SNP, (A <-> C, G <-> T) WORD_SIZE diffStr = ~upperBitsDiff & lowerBitsDiff; WORD_SIZE snpBits = diffStr & (diffStr >> 1); replacedBits |= ( snpBits | (snpBits << 1) ); } // Indicated by two consecutive of color changed R <-> Y or B <-> G { // (3) Transition Type of SNP, (A <-> G, C <-> T) WORD_SIZE diffStr = upperBitsDiff & ~lowerBitsDiff; WORD_SIZE snpBits = diffStr & (diffStr >> 1); replacedBits |= ( snpBits | (snpBits << 1) ); } // Indicated by two consecutive of color changed B <-> Y or R <-> G // if the bit before last bit is not mismatched bool lastBitDiff = ((diffStr & lastBit) != 0); if (lastBitDiff && !(diffStr & ( lastBit >> 0x01 ) ) ) { // no consecutive mismatches in the end replacedBits |= lastBit; // take the last bit from read } correctedRead = correctReadInColorSpace(readInColors, refInColors, replacedBits); // ASSERT CReadInBits read = colors2Bases(correctedRead); // printBitsStr(read, (unsigned int)CReadInBits::iReadLength); CReadInBits ref = colors2Bases(refInColors); // printBitsStr(ref, (unsigned int)CReadInBits::iReadLength); unsigned int NoSNP = bitsStrNCompare(read, ref, (unsigned int)CReadInBits::iReadLength); // assert(NoSNP <= 5); // assertSNP(type, refInColors, correctedRead); return(diff - (int)NoSNP * 2 + (int)lastBitDiff); } } char* correctAndDecodeRead \ (CReadInBits read, CReadInBits ref, bool correct, char* caRead, char* caQscore) { if (correct) { CReadInBits correctedRead; int colorMis = correctReadInColorSpace(read, ref, correctedRead); if ( colorMis >= 0 ) { colorQV2baseQV(read, correctedRead, caQscore); } else { // TODO: use DP to get the base 
inferred by color string. // The corrected reads will be the reference reads. colorQV2baseQV(read, correctedRead, caQscore); } colors2Bases(correctedRead).decode(caRead); } else { decodeColorReadWithPrimer(caRead, read); // decodeColors(caRead, reads); } return(caRead); } // Given strings in bases, return the corresponding color signal in A=0 C=1 G=2, T=3 representation string readInBases2ColorsInACGT_Format(string readInBases) { char colorsPresentInACGT[MAX_READ_LENGTH]; CReadInBits r(readInBases.c_str()); bases2Colors(r).decode(colorsPresentInACGT); return(string(colorsPresentInACGT)); } // Given the color reads in ACGT format, return the corresponding read in 0123 format string colorReadInACGTto0123Format(string colorReadInACGT) { CReadInBits r(colorReadInACGT.c_str()); char colorsIn0123Format[MAX_READ_LENGTH]; // translate into 0123 format decodeColors(&(colorsIn0123Format[1]), r); colorsIn0123Format[0] = colorsIn0123Format[1]; colorsIn0123Format[1] = '0'; return(string(colorsIn0123Format)); } // TEST void assertSNP(int SNPType, CReadInBits refInColors, CReadInBits crInColors) { if (SNPType == 0) { ASSERT_TRUE(refInColors == crInColors, "MISMATCHES after correction"); } else { CReadInBits ref = colors2Bases(refInColors); CReadInBits cr = colors2Bases(crInColors); ref = ref.getPrefixStr(CReadInBits::iReadLength); cr = cr.getPrefixStr(CReadInBits::iReadLength); WORD_SIZE u = (cr.UpperBits ^ ref.UpperBits); WORD_SIZE l = (cr.LowerBits ^ ref.LowerBits); if (SNPType == 1) { ASSERT_TRUE((u & l) > 0, "Not real complement SNP"); } else if (SNPType == 2) { ASSERT_TRUE((~u & l) > 0, "Not real transverstion SNP"); } else if (SNPType == 3) { ASSERT_TRUE((u & ~l) > 0, "Not real transition SNP"); } else { ASSERT_TRUE(2 == bitsStrCompare(ref, cr), "Not double SNPs"); } } } void colorQV2baseQV(CReadInBits readInColors, CReadInBits& correctedRead, char* Qscores) { if (Qscores[0] != '0') { // In case quality score are not available colorQV2baseQV(getDiffBits(readInColors, 
correctedRead), Qscores, \ (unsigned int)CReadInBits::iReadLength); // TODO the read length is fixed under 64 now. } } // The input Q-scores, output is in the Phed char representation. bool colorQV2baseQV(WORD_SIZE singleColorErrorflag, char* Qscores, unsigned int readLength) { bool negativeQ = false; const char PhedScoreShift = 33; for (unsigned int i = 0; i < readLength; i++) { bool errorColorFlag = ((singleColorErrorflag & 0x01) > 0); Qscores[i] -= PhedScoreShift; if (errorColorFlag) { if (Qscores[i] > 0) { Qscores[i] *= (char)-1; } else if (Qscores[i] < 0) { Qscores[i] = 0; negativeQ = true; } } singleColorErrorflag >>= 0x01; } for (unsigned int i = 0; i < readLength; i++) { Qscores[i] = Qscores[i] + Qscores[i + 1]; if (Qscores[i] < 0) { Qscores[i] = 0; } Qscores[i] += PhedScoreShift; } return(negativeQ); } void testLonglongShift(void) { unsigned long long constNum = 1 << 30; unsigned long long word = constNum * 8; word = longlongShiftRight(word, 32); if ( word != 2) { cout << "testShift64Bit got unexpected result " << word << endl; } else { cout << "testShift64Bit got expected result " << word << endl; } } void testLongBases2ColorsCases(void) { const char* read1 = "AAAACCCCGGGGTTTTAAAACCCCGGGGTTTTACGTACGTACGTACGTAAAACCCCGGGGTTTTA"; const char* color1 ="A0001000300010003000100030001000313131313131313130001000300010003"; testLongBases2Colors(read1, color1); const char* read2 = "AAAACCCCGGGGTTTTAAAACCCCGGGGTTTTTCGTACGTACGTACGTAAAACCCCGGGGTTTTAA"; const char* color2 ="A00010003000100030001000300010000231313131313131300010003000100030"; testLongBases2Colors(read2, color2); } void testLongBases2Colors(const char* longRead, const char* expLongColorSignals) { unsigned int readLength = (unsigned int) strlen(longRead); bool oddReadLength = false; if (readLength % 2 == 1) { CReadInBits::iReadLength = (readLength + 1) / 2; oddReadLength = true; } else { CReadInBits::iReadLength = readLength / 2; oddReadLength = false; } CReadInBits r1stHalf, r2ndHalf, r1stHalfInColors, 
r2ndHalfInColors; encodeLongRead(longRead, r1stHalf, r2ndHalf); longBases2Colors(r1stHalf, r2ndHalf, r1stHalfInColors, r2ndHalfInColors, oddReadLength); char caBuf[FILENAME_MAX]; decodeLongColors(caBuf, r1stHalfInColors, r2ndHalfInColors, oddReadLength); if ( strcmp(expLongColorSignals, caBuf) != 0 ) { cout << "The decoded read in color is:" << caBuf << endl; cout << "The expected read is color is:" << expLongColorSignals << endl; } } void testReverseColorSignals(const char* colorSignalStr) { char caBuf[MAX_LINE]; char caBuf2[MAX_LINE]; strcpy(caBuf, colorSignalStr); CReadInBits::iReadLength = (int)strlen(caBuf) - 1; CReadInBits r; encodeColors(caBuf, r); decodeColors(caBuf2, r); cout << caBuf2 << endl; r = reverseColorRead(r); decodeColors(caBuf,r); cout << caBuf << endl; } ./Source/FileInputBuffer.cpp0000644011075700120610000001425011720654362016140 0ustar yanghochmath-ar#include "stdafx.h" #include "FileInputBuffer.h" FileInputBuffer::FileInputBuffer(void) { this->uiCapacity = 0; this->uiPtrIndex = 0; this->caBuffer = NULL; this->caBufp = NULL;//Don't delete this, this is a pointer to caBuffer. this->pbuf = NULL; } FileInputBuffer::FileInputBuffer(unsigned int uiCapacity, ifstream* pifile) { this->initialize(uiCapacity, pifile); } FileInputBuffer::~FileInputBuffer(void) { delete [] this->caBuffer; //Don't delete this->caBufp.. it points to middle of caBuffer if (this->pbuf != NULL) { if (this->pbuf->is_open()) { // LOG_INFO("Info %d: Close file when FileInput is deleted\n", FINE_LOG); this->pbuf->close();//Reach the end of the file } } //this->pbuf is not new in class. This is return from ifstream class. Don't delete it. 
this->pbuf = NULL; // cout << "Delete File input Buffer" << endl; } void FileInputBuffer::initialize(unsigned int uiCapacity, ifstream* pifile) { this->uiCapacity = uiCapacity; this->caBuffer = new char [uiCapacity]; this->pbuf = pifile->rdbuf(); this->uiPtrIndex = 0; //point to start this->caBufp = this->caBuffer; this->fflush(); } void FileInputBuffer::fflush() { //Load file directly to the buffer memset(this->caBuffer, 0x00, sizeof(char)*this->uiCapacity); //load this->uiCapacity - 1 character, so the last one will be 0 pbuf->sgetn(this->caBuffer, this->uiCapacity - 1); this->caBufp = this->caBuffer;//Get new content from file this->uiPtrIndex = 0; if (this->caBuffer[this->uiCapacity - 2] == 0) { //this must be the end of the file if (this->pbuf->is_open()) // LOG_INFO("Info %d: Close file when fflush to the end\n", FINE_LOG); this->pbuf->close();//Reach the end of the file } } unsigned int FileInputBuffer::Getline(char* caArray, unsigned int uiMax_Char_Per_Line) { //If the file is not ended, return 1, if it is ended of the file return 0 unsigned int i = uiPtrIndex, j = 0; //j is how many char has been read this time while (this->caBuffer[i] <= 126) { //if the character is not strange (The last character of the caBuffer should be 0. 
//If There is a new line or larger than the maximum output line, return current get string if (this->caBuffer[i] == '\n' || j >= uiMax_Char_Per_Line) { if (this->caBuffer[i] == '\n') i++;//next position to avoid '\n'; this->uiPtrIndex = i; this->caBufp = &(this->caBuffer[i]); caArray[j] = '\0'; return(1);//Successfully get a line } else if (this->caBuffer[i] == EOF || ((this->caBuffer[i] == 0) && (!this->pbuf->is_open()))) { //The end of the file //this->caBufp=&(this->caBuffer[this->uiCapacity-1]); //Use less to point to the end this->pbuf->close(); //close the file caArray[j] = '\0'; //put the end of the array this->uiPtrIndex = 0; return(0); } else if (i >= (uiCapacity - 1) || this->caBuffer[i] == 0) { //The buffer content is run out, try to refresh if (this->pbuf->is_open()) { this->fflush(); i = 0;//Keep going } else { //The end of the file caArray[j] = '\0'; return(0);//It is end of the file, but it is end of the buffer and end of file } } else { //Copy the string to buffer caArray[j++] = this->caBuffer[i++]; } } cout << "sChracter " << caBuffer[i] << "Regard as EOF" << endl; // Put the file pointer at the end this->caBufp = &(this->caBuffer[this->uiCapacity-1]); this->pbuf->close(); caArray[j] = '\0';//Return null array this->uiPtrIndex = 0; return(0); } bool FileInputBuffer::ready2Read() { bool isReady2Read = false; //if the buffer is not empty or the file is still open to read isReady2Read = (this->caBufp[0] != 0 || this->pbuf->is_open()); //should test if not come to EOF return(isReady2Read); } // return the file size in bytes unsigned long long getFileSize(const char* fileName) { // TODO Fix the potential bug that has file size larger than long unsigned long long filesize = 0; // If compiled by Microsoft visual c++ #ifdef __MSVC__ // Copy from others and need to be test bool fOk; WIN32_FILE_ATTRIBUTE_DATA fileInfo; if (NULL == fileName) return -1; fOk = GetFileAttributesEx(fileName, GetFileExInfoStandard, (void*) & fileInfo); if (!fOk) return -1; 
assert(0 == fileInfo.nFileSizeHigh); return (unsigned long long)fileInfo.nFileSizeLow; #elif defined _MSC_VER FILE * stream = fopen(fileName, "r"); fseek(stream, 0L, SEEK_END); filesize = (unsigned long long)ftell(stream); fclose(stream); #endif #ifdef __GNUC__ FILE * stream = fopen(fileName, "r"); fseek(stream, 0L, SEEK_END); filesize = (unsigned long long)ftell(stream); fclose(stream); #endif return(filesize); } unsigned long long getNumberOfLineInAFile(const char* fileName) { unsigned long long estimatedNoOfLines = 1; // count how many lines are there ifstream ifile; ifile.open(fileName, ifstream::in); char BUFFER[MAX_INPUT_BUFFER_SIZE]; memset(BUFFER, 0x00, sizeof(char)*MAX_INPUT_BUFFER_SIZE); ifile.getline(BUFFER, MAX_INPUT_BUFFER_SIZE); if (ifile.bad()) { cout << " Can't open file " << fileName << endl; } (ifile.rdbuf())->sgetn(BUFFER, MAX_INPUT_BUFFER_SIZE - 1); for (unsigned int i = 0; i < MAX_INPUT_BUFFER_SIZE; i++) { if (BUFFER[i] == EOF || !ifile.good()) { cout << "Total No of lines:" << (int)estimatedNoOfLines << endl; break; } else if (BUFFER[i] == '\n') { estimatedNoOfLines++; } else if (BUFFER[i] == 0) { if (i == MAX_INPUT_BUFFER_SIZE - 1) { memset(BUFFER, 0x00, sizeof(char)*MAX_INPUT_BUFFER_SIZE); (ifile.rdbuf())->sgetn(BUFFER, MAX_INPUT_BUFFER_SIZE - 1); i = 0; } else { break; } } else { //DO nothing; } } ifile.close(); return(estimatedNoOfLines); } ./Source/FileOutputBuffer.cpp0000644011075700120610000000411011720654362016333 0ustar yanghochmath-ar#include "stdafx.h" #include "FileOutputBuffer.h" const int MAXPEROUTPUTLINE = 10000; const unsigned int MINBUFFERSIZE = 100000; FileOutputBuffer::FileOutputBuffer(void) { this->caBuffer = NULL; this->caBufp = NULL;//pointer point to a location of caBuffer, (don't delete) this->uiCapacity = 0; this->uiSize = 0; } FileOutputBuffer::FileOutputBuffer(unsigned int uiCapacity, ofstream* pofile) { this->uiCapacity = uiCapacity; if (uiCapacity < MINBUFFERSIZE) { cout << "The buffer is set to the minimum" << 
MINBUFFERSIZE << endl; this->uiCapacity = MINBUFFERSIZE; } this->caBuffer = new char [uiCapacity]; memset(this->caBuffer, 0x00, sizeof(char)*uiCapacity); this->pofile = pofile; this->uiSize = 0; this->caBufp = this->caBuffer;//point to start2 } FileOutputBuffer::~FileOutputBuffer(void) { if (this->uiSize > 0) { this->fflush(); // (*this->pofile) << "\n"; } if (this->pofile != NULL) { if ((*this->pofile).good()) { (*this->pofile).close(); } } delete [] this->caBuffer; } void FileOutputBuffer::UpdateSize() { for (; this->uiSize < this->uiCapacity; this->uiSize++) { if (this->caBuffer[this->uiSize] == 0) { //if it firstly meet to unwritten part this->caBufp = &(this->caBuffer[this->uiSize]); break; } } if (this->uiSize > this->uiCapacity - MAXPEROUTPUTLINE) { //write to file before it is overflow this->fflush(); } } void FileOutputBuffer::fflush(void) { (*pofile) << this->caBuffer; // output to file memset(this->caBuffer, 0x00, sizeof(char)*uiCapacity);//flush buffer this->uiSize = 0; this->caBufp = this->caBuffer;//point to start } void FileOutputBuffer::removeEndBlankLine(void) { while (this->uiSize > 0) { //if the last character is a new line if (this->caBuffer[this->uiSize - 1] == '\n') { this->uiSize--; this->caBufp = &this->caBuffer[this->uiSize]; this->caBufp[0] = 0; } else { break; } } } ./Source/Filename.cpp0000644011075700120610000002334211720654362014631 0ustar yanghochmath-ar#include "Filename.h" inline char* myStrCpy(char* caBuf, const char* str, int iBufSize) { int iBufSizeMinus1 = iBufSize - 1; char* returnV = strncpy(caBuf, str, iBufSizeMinus1); caBuf[iBufSizeMinus1] = '\0'; return(returnV); } /* * Get the filename from the next line of a file list. 
* If there are no more files, return "\0" */ bool GetNextFilenameFromListFile(ifstream &ifile, char* filenameBuffer) { // test file object to see if the filename we get is correct ifstream testfile; if (ifile.good()) { do { ifile.getline(filenameBuffer, MAX_FILE_PATH); if (filenameBuffer[0] != '\0' && filenameBuffer[0] != EOF) { if (filenameBuffer[0] == '#') // # follows by comments continue; testfile.open(filenameBuffer); if (!testfile.good()) { cout << filenameBuffer << " doesn't exist or permission deny!!" << endl; testfile.close(); continue; //get next file name } else { testfile.close(); break; } } else if (ifile.eof()) { ifile.close(); filenameBuffer[0] = '\0'; return(false); } } while (strcmp(filenameBuffer, "") == 0 || filenameBuffer[0] == '#'); } else { return(false); } return(true); } bool GetNextFilenamePairFromListFile(ifstream &ifile, char* filenameBuffer1, char* filenameBuffer2) { ifstream testfile1; ifstream testfile2; bool getValidFilePair = false; filenameBuffer1[0] = '\0'; filenameBuffer2[0] = '\0'; if (ifile.good()) { do { char filenameBuffer[FILENAME_MAX]; ifile.getline(filenameBuffer, MAX_FILE_PATH); if (filenameBuffer[0] != '\0' && filenameBuffer[0] != EOF) { if (filenameBuffer[0] != '#') { // Not a comment char* pch = strtok(filenameBuffer, "\t, "); if (pch != NULL) strcpy(filenameBuffer1, pch); pch = strtok(NULL, "\t, "); if (pch != NULL) strcpy(filenameBuffer2, pch); if (filenameBuffer1[0] != '\0' && filenameBuffer2[0] != '\0') { if (!fileExist(filenameBuffer1)) { cout << filenameBuffer1 << " doesn't exist !!" << endl; filenameBuffer1[0] = '\0'; } else if (!fileExist(filenameBuffer2)) { cout << filenameBuffer2 << " doesn't exist !!" 
<< endl; filenameBuffer2[0] = '\0'; } else { getValidFilePair = true; } } } } if (ifile.eof()) { ifile.close(); break; } } while (!getValidFilePair); } else { cout << "Can't get new file" << endl; } return(getValidFilePair); } bool GetNextFilenamePairFromListFile(const char* filename, char* filenameBuffer1, char* filenameBuffer2) { ifstream readsFileList(filename); if(readsFileList.bad()) { return(false); } bool bMatePairedReads = GetNextFilenamePairFromListFile(readsFileList, filenameBuffer1, filenameBuffer2); readsFileList.close(); return(bMatePairedReads); } int chExtName(char* filename, const char* Extname) { int i = 0; //Give Ext name starts with '.' if (filename != NULL && Extname != NULL) { for (i = (int)strlen(filename); i > 0; i--) { if (filename[i] == '.') break; } if (i > 0) strcpy(&(filename[i]), Extname); else { //no extended filename int l = (int)strlen(filename); strcpy(&(filename[l]), Extname); //concatenate the Extended file at the end } return((int)strlen(filename)); } return(-1); } string chExtName(string filename, string Extname) { char fileNameBuf[FILENAME_MAX]; myStrCpy(fileNameBuf, filename.c_str(), FILENAME_MAX); chExtName(fileNameBuf, Extname.c_str()); return(string(fileNameBuf)); } char* addPath(const char* filename, const char* directory, char* path) { char tmpbuffer[MAX_FILE_PATH]; myStrCpy(tmpbuffer, filename, MAX_FILE_PATH); //in case path and filename or the same buffer sprintf(path, "%s//%s", directory, tmpbuffer); return(path); } string getFullPath(string directory, string filename) { #if defined WIN32 || defined WIN64 string fullPath = directory.append("\\").append(filename); #else string fullPath = directory.append("/").append(filename); #endif return(fullPath); } // return the pointer to the start of a file name, given a path const char* getPtrBasename(const char* Path) { int i = 0; for (i = (int)strlen(Path) - 1; i >= 0; i--) { char c = Path[i]; if (isspace(c) || iscntrl(c) ||\ (c == '<') ||\ (c == '>') ||\ (c == ':') ||\ (c == 
'"') ||\ (c == '|') ||\ (c == '?') ||\ (c == '*') ||\ (c == '%') ||\ (c == '/') ||\ (c == '\\')) { break; // if not alpha, digit, '_', '.' or space } } return(&Path[i + 1]); } // get the file name from the path without extended file name like .txt int getBasename(const char* Path, char* fileNameWithoutExt) { // i and j are the start and end of the filename in the path. int j = 0; if (Path != NULL && fileNameWithoutExt != NULL) { const char* fileName = getPtrBasename(Path); for (j = (int)strlen(fileName); j >= 0; j--) { if (fileName[j] == '.') break; } // get the start position of ext file strcpy(fileNameWithoutExt, fileName); // if there is no ext name if (j > 0) { fileNameWithoutExt[j] = '\0'; } } return((int)strlen(fileNameWithoutExt)); } string getBasename(const char* Path) { char fileNameWithoutExt[FILENAME_MAX]; getBasename(Path, fileNameWithoutExt); return(string(fileNameWithoutExt)); } //Get the name from the path and store in Title //removed the number at the end of the Filename int getTitleFromPath(char* Path, char* Title) { int i = 0; getBasename(Path, Title); for (i = (int)strlen(Title); i > 0; i--) { if (Title[i] == '_') { if (atoi(&(Title[i+1])) > 0) { //remove the '_' and every number after it for (; Title[i] == '_'; i--) Title[i] = '\0'; } return(0); } } return(0); } const char* getExtName(const char* filename) { // Return the char pointer point to the postfix of the char array // Ex: given a.b.txt return const char* ".txt" // Ex: given abc return const char* "abc" int i = 0; for (i = (int)strlen(filename); i > 0; i--) { if (filename[i] == '.') break; } return(&(filename[i])); } bool hasTheExtName(const char* filename, const char* extName) { const char* realExtName = getExtName(filename); int i; for (i = 0; extName[i] != '\0' && realExtName[i] != '\0'; i++) { if (tolower(extName[i]) != tolower(realExtName[i])) return(false); } if (i == 0) { //No ExtName return (false); } else { return(extName[i] == realExtName[i]); // one may be the prefix of the 
other } } bool fileExist(const char* filename) { if (filename != NULL && filename[0] != 0) { ifstream iFile; iFile.open(filename, ifstream::in); // iFile.close(); if (iFile.fail()) { return(false); // not exist } else { iFile.close(); return (true); //exist; } } return(false); // wrong file name } bool checkFileExist(const char* filename) { if (!fileExist(filename)) { cout << filename << " dosent exist.\n" << endl; return(false); } else { return(true); } } void filenameLize(char* string) { for (int i = 0; string[i] != '\0'; i++) { char c = string[i]; if (!(isalnum(c) || c == '.')) { if (i == 0) { // If the first character is non_valid, replace to 'Z' strcpy(string, "NULL_Filename"); } else { // Truncated the string if a space, tab or comma if (c == ' ' || c == '\t' || c == ',' || c == '\n' || c == '\r') { string[i] = '\0'; break; } else { // else replace to '_' string[i] = '_'; } } } } } bool isPathWritable(const char* filename) { FILE *fp = NULL; fp = fopen(filename, "w"); if (fp == NULL) { return false; } else { fclose(fp); return true; } } bool checkPathCharsAreValid(const char* pathStr) { // null path will return true for (int i = 0; pathStr[i] != '\0'; i++) { char c = pathStr[i]; if (isspace(c) || iscntrl(c)) { return(false); } switch (c) { case '&': case '"': case '*': case '?': case ':': case '>': case '<': case '\'': return(false); default: break; } } return(true); } bool dirExist(const char* strFolderPath) { if ( fileExist(strFolderPath)) { struct stat status; stat( strFolderPath, &status ); if ( status.st_mode & S_IFDIR ) { return(true); } } return(false); } ./Source/Flags.cpp0000644011075700120610000000647411720654362014154 0ustar yanghochmath-ar#include "Flags.h" using namespace std; bool CFlags::checkArg(int argc, const char** argv, const char* arg) { this->flags.push_back(arg); for (int i = 1; i < argc; i++) { if (strcmp(argv[i], arg) == 0) { return(true); } } return(false); } bool CFlags::checkIntOpt(int argc, const char** argv, const char* arg, int& 
argValue) { this->flags.push_back(arg); for (int i = 1; i < argc - 1; i++) { if (strcmp(argv[i], arg) == 0) { if (argv[i+1][0] != '-') { argValue = (int) atof(argv[i+1]); } return(true); } } return(false); } // check the given parameters (argv) contain a specific unsigned integer option. bool CFlags::checkUnIntOpt(int argc, const char** argv, const char* arg, unsigned int& argValue) { this->flags.push_back(arg); for (int i = 1; i < argc - 1; i++) { if (strcmp(argv[i], arg) == 0) { if (argv[i+1][0] != '-') { argValue = (unsigned int) atof(argv[i+1]); } return(true); } } return(false); } bool CFlags::checkpCharOpt(int argc, const char** argv, const char* arg, char& argValue) { this->flags.push_back(arg); for (int i = 1; i < argc - 1; i++) { if (strcmp(argv[i], arg) == 0) { int argLength = (int)strlen(argv[i+1]); if (argLength == 1) { argValue = argv[i+1][0]; return(true); } else if (argLength == 3) { argValue = argv[i+1][1]; return(true); } else { cout << "Unrecognizable delimiter " << argv[i+1] << "for read id." << endl; cout << "Space, tab and comma are the default delimiters." << endl; cout << "Put single quote around the delimiter." << endl; } } } return(false); } // check if the parameters (argv) contain a specific string option. bool CFlags::checkpStrOpt(int argc, const char** argv, const char* arg, string& argStr) { this->flags.push_back(arg); for (int i = 1; i < argc - 1; i++) { if (strcmp(argv[i], arg) == 0) { if (argv[i+1][0] != '-') { argStr = string(argv[i+1]); } return(true); } } return(false); } // check if the parameters (argv) contain a specific string option. 
bool CFlags::checkpStrOpt(int argc, const char** argv, const char* arg, char* argStr) { this->flags.push_back(arg); for (int i = 1; i < argc - 1; i++) { if (strcmp(argv[i], arg) == 0) { if (argv[i+1][0] != '-') { strcpy(argStr, argv[i+1]); } return(true); } } return(false); } bool CFlags::checkUnrecognizedFlags(int argc, const char ** argv) { for (int i = 1; i < argc; i++) { if ( argv[i][0] == '-') { bool bRecognized = false; for (vector::iterator it = this->flags.begin(); \ it != this->flags.end(); it++) { if (strcmp(argv[i], it->c_str()) == 0) { bRecognized = true; } } if (bRecognized == false) { LOG_INFO("\nInfo %d: unrecognized option %s.\n", INFO_LOG, argv[i] ); } } } return(true); }./Source/GenomeInBits.cpp0000644011075700120610000001717211720654362015440 0ustar yanghochmath-ar#include "GenomeInBits.h" CGenomeInBits::CGenomeInBits(unsigned int uiGenomeSize) { this->initialization(uiGenomeSize); } CGenomeInBits::~CGenomeInBits(void) { delete [] this->pLowerBits; delete [] this->pUpperBits; delete this->pNBits; // don't delete this->pGenome. } CGenomeInBits::CGenomeInBits(CGenomeNTdata* pGenome) { this->initialization(pGenome->iGenomeSize); this->pGenome = pGenome; // If there are non-ACGT characters in a chromosome, the corresponding bits in memory will be set to A (00). for (unsigned int chrId = 0; chrId < this->pGenome->iNo_of_chromosome; chrId++) { CchromosomeNTdata* chr = this->pGenome->paChromosomes[chrId]; // (1) Encode the junction of each chromosome encodeJunction(chrId); // (2) Encode the middle part of the chromosome unsigned int uiGenomeLoucs = this->pGenome->chrIndex2genomelocusID(chrId, 0); unsigned int blockId = (uiGenomeLoucs / wordSize) + 1; // start at the second encoding block for the chromosome uiGenomeLoucs = blockId * wordSize ; // The start genome Locus Id of the block unsigned int chrLocusId = this->pGenome->genomeLocusID2chrIndex(uiGenomeLoucs); unsigned int i; // i is the block local Id in the chromosome. 
blockId is the global bloack Id for (i = 1; chrLocusId + i * wordSize < chr->iChromosome_size; blockId++, i++) { encodeReadNasA(&(chr->caChromosome[chrLocusId + (i - 1) * wordSize]), wordSize, &this->pUpperBits[blockId], &this->pLowerBits[blockId]); } // (3) Encode the tail of the chromosome (won't be empty) encodeReadNasA(&(chr->caChromosome[chrLocusId + (i - 1) * wordSize]), chr->iChromosome_size - (chrLocusId + (i - 1) * wordSize), &this->pUpperBits[blockId], &this->pLowerBits[blockId]); } // (4) Encode bits N in a boolFlagArray unsigned int noOfNInGenome = 0; for (unsigned int chrId = 0; chrId < this->pGenome->iNo_of_chromosome; chrId++) { CchromosomeNTdata* chr = this->pGenome->paChromosomes[chrId]; unsigned int uiGenomeLoucs = this->pGenome->chrIndex2genomelocusID(chrId, 0); for (unsigned int i = 0; i < chr->iChromosome_size; i++, uiGenomeLoucs++) { if (!isACGT(chr->caChromosome[i])) { this->pNBits->setflag(uiGenomeLoucs, true); noOfNInGenome ++; } } } if ( noOfNInGenome > 0) { LOG_INFO("Info %d: There are %u N in the genome.\r", INFO_LOG, noOfNInGenome); } } int CGenomeInBits::initialization(unsigned int uiGenomeSize) { this->pNBits = NULL; this->pLowerBits = NULL; this->pUpperBits = NULL; this->pGenome = NULL; caSubstring[0] = '\0'; return (allocBitStrSpace(uiGenomeSize)); } int CGenomeInBits::allocBitStrSpace(unsigned int uiGenomeSize) { if (uiGenomeSize > 0) { if (this->pLowerBits == NULL && this->pUpperBits == NULL && this->pNBits == NULL) { this->uiGenomeLength = uiGenomeSize; this->uiGenomeLengthInWordSize = (uiGenomeLength - 1) / wordSize + 2; // add one more this->pLowerBits = new WORD_SIZE[uiGenomeLengthInWordSize]; this->pUpperBits = new WORD_SIZE[uiGenomeLengthInWordSize]; this->pNBits = new CboolFlagArray(uiGenomeLength); memset(pLowerBits, 0x00, uiGenomeLengthInWordSize); memset(pUpperBits, 0x00, uiGenomeLengthInWordSize); } else { ERR; } } return(0); } int CGenomeInBits::encodeJunction(unsigned int chrId) { char region[wordSize + 1]; unsigned 
int uiGenomeLoucs = this->pGenome->chrIndex2genomelocusID(chrId, 0); unsigned int blockId = uiGenomeLoucs / wordSize; if (chrId == 0) { strncpy(region, this->pGenome->paChromosomes[chrId]->caChromosome, wordSize); } else { unsigned int tailLength = uiGenomeLoucs % wordSize; // tail of the previous chromosome in the junction block CchromosomeNTdata* pChr; pChr = this->pGenome->paChromosomes[chrId - 1]; strncpy(region, &(pChr->caChromosome[pChr->iChromosome_size - tailLength]), tailLength); pChr = this->pGenome->paChromosomes[chrId]; strncpy(®ion[tailLength], pChr->caChromosome, wordSize - tailLength); } region[wordSize] = '\0'; encodeReadNasA(region, wordSize, &this->pUpperBits[blockId], &this->pLowerBits[blockId]); return(0); } /* Note the chromosome encoding is NOT continuous. The more significant bits of each words encodes * nucleotides in front of those less significant bits. This encodeing make bits incontinuously mapped * to chromosome index. Note the first bit of each WORD represent the first base of each section */ CReadInBits CGenomeInBits::getSubstringInBits(unsigned int uiGenomeIndex) const { CReadInBits r; unsigned int indexInWords = uiGenomeIndex / wordSize; unsigned int bitsShift = uiGenomeIndex % wordSize; if (this->uiGenomeLengthInWordSize > indexInWords) { r.UpperBits = this->pUpperBits[indexInWords] >> bitsShift; r.LowerBits = this->pLowerBits[indexInWords] >> bitsShift; if (bitsShift != 0) { r.UpperBits |= (this->pUpperBits[indexInWords + 1] << (wordSize - bitsShift)); r.LowerBits |= (this->pLowerBits[indexInWords + 1] << (wordSize - bitsShift)); } } else LOG_INFO("\nInfo %d: wrong genome index.\n", WARNING_LOG); return(r); } // eliminate the tail bits out of read length range CReadInBits CGenomeInBits::getSubstringInBits\ (unsigned int uiGenomeIndex, unsigned int uiSubstringLength) const { CReadInBits r; if(uiGenomeIndex < this->uiGenomeLength) { unsigned int indexInWords = uiGenomeIndex / wordSize; unsigned int bitsShift = uiGenomeIndex % 
wordSize; r.UpperBits = this->pUpperBits[indexInWords] >> bitsShift; r.LowerBits = this->pLowerBits[indexInWords] >> bitsShift; if (bitsShift != 0) { r.UpperBits |= (this->pUpperBits[indexInWords + 1] << (wordSize - bitsShift)); r.LowerBits |= (this->pLowerBits[indexInWords + 1] << (wordSize - bitsShift)); } unsigned int elimatedBitsNo = wordSize - uiSubstringLength; r.UpperBits <<= elimatedBitsNo; r.LowerBits <<= elimatedBitsNo; r.UpperBits >>= elimatedBitsNo; r.LowerBits >>= elimatedBitsNo; } else { cout << "Access outside of the genomeInBits" << endl; } return (r); } char* CGenomeInBits::getSubstring(unsigned int uiGenomeIndex) { CReadInBits r = getSubstringInBits(uiGenomeIndex); decodeRead(caSubstring, wordSize, r.UpperBits, r.LowerBits); return (caSubstring); } char* CGenomeInBits::getSubstring(unsigned int uiGenomeIndex, unsigned int uiSubstringLength) { CReadInBits r = getSubstringInBits(uiGenomeIndex); decodeRead(caSubstring, wordSize, r.UpperBits, r.LowerBits); if (uiSubstringLength <= wordSize) { caSubstring[uiSubstringLength] = '\0'; } if (uiGenomeIndex + uiSubstringLength > uiGenomeLength) { if (uiGenomeLength > uiGenomeIndex) caSubstring[uiGenomeLength - uiGenomeIndex] = '\0'; else { LOG_INFO("\nInfo %d: wrong genome index.\n", WARNING_LOG); caSubstring[0] = '\0'; } } return (caSubstring); } bool CGenomeInBits::fragACGTKmerInBits(CReadInBits& kmerInBits, int startIndex, int kmerLength) { for (int i = 0; i < kmerLength; i++) { int baseGenomeIndex = i + startIndex; if (this->pNBits->b(baseGenomeIndex)) { // Meet N return(false); } } kmerInBits = getSubstringInBits(startIndex, kmerLength); return(true); } ./Source/GenomeNTdata.cpp0000644011075700120610000002367511720654362015430 0ustar yanghochmath-ar#include "GenomeNTdata.h" #include CGenomeNTdata::CGenomeNTdata(void) { this->initialization(); } CGenomeNTdata::CGenomeNTdata(const char* DataSetListFile) { this->initialization(); ifstream ListFile(DataSetListFile); char inputfastaFilename[MAX_LINE + 1]; 
while (GetNextFilenameFromListFile(ListFile, inputfastaFilename)) { //New a CchromosomeNTdata object and add it genome if (fileExist(inputfastaFilename)) { this->addChromosome(inputfastaFilename); } else { LOG_INFO("Info %d: Can't open %s in %s.\r", CONFIG_LOG,\ inputfastaFilename, get_working_directory().c_str()); } } myStrCpy(this->refName, getBasename(DataSetListFile).c_str(), MAX_LINE); if (this->iNo_of_chromosome > 0) { LOG_INFO("Info %d: %d seqs are in %s.\r", CONFIG_LOG,\ this->iNo_of_chromosome, DataSetListFile); } else { LOG_INFO("Info %d: No seqs are in %s.\r", WARNING_LOG, DataSetListFile); } this->checkRefsNames(); } CGenomeNTdata::~CGenomeNTdata(void) { unsigned int i; for (i = 0; i < GENOME_CAPACITY; i++) { delete this->paChromosomes[i]; this->paChromosomes[i] = NULL; } // this->paChromosomes is not new } int CGenomeNTdata::initialization(void) { unsigned int i; this->iGenomeSize = 0; this->refName[0] = '\0'; this->iNo_of_chromosome = 0; this->caKmer[0] = '\0'; for (i = 0; i < GENOME_CAPACITY; i++) { IndexCovertTable[i] = 0; this->paChromosomes[i] = NULL; } return(0); } // delete the spaced used character string of each chromosome int CGenomeNTdata::freeChromosomeSpace(void) { for (unsigned int i = 0; i < this->iNo_of_chromosome; i++) { delete [] this->paChromosomes[i]->caChromosome; this->paChromosomes[i]->caChromosome = NULL; } return(0); } /* This function add one more chromosome in the genome set, * return the size of newly add in chromosome */ unsigned int CGenomeNTdata::addChromosome(const char* chromosomeFileName, bool bFastaFormat) { // Concatenate the chromosome file name as the name of the reference genome (No ext file name) string newRefName = string(refName).append(getBasename(chromosomeFileName)); myStrCpy(refName, newRefName.c_str(), MAX_LINE ); if (fileExist(chromosomeFileName)) { CchromosomeNTdata* pChr = NULL; pChr = new CchromosomeNTdata(chromosomeFileName, bFastaFormat); if (pChr != NULL) { this->paChromosomes[iNo_of_chromosome] = 
pChr; if (iNo_of_chromosome < GENOME_CAPACITY - 1) { if (iNo_of_chromosome == 0) { this->IndexCovertTable[iNo_of_chromosome] = pChr->iChromosome_size; this->iGenomeSize = pChr->iChromosome_size; } else { /*iNo_of_chromosome > 0 */ this->IndexCovertTable[iNo_of_chromosome] = this->IndexCovertTable[iNo_of_chromosome-1] + pChr->iChromosome_size; this->iGenomeSize += pChr->iChromosome_size; } LOG_INFO("\nInfo %d: Reference %s has %u bases.\n",\ INFO_LOG, chromosomeFileName, pChr->iChromosome_size); } else { LOG_INFO("\nInfo %d: Add too many fasta ref file in the list.\n", WARNING_LOG); LOG_INFO("\nInfo %d: Concatenate ref files or increase constant GENOME_CAPACITY in the code.\n", INFO_LOG); } this->iNo_of_chromosome++; return(pChr->iChromosome_size); } else { LOG_INFO("\nInfo %d: %s is in a wrong format.\n", ERROR_LOG, chromosomeFileName); return(0); } } else { string dirPath = get_working_directory(); LOG_INFO("\nInfo %d: %s are not found in %s.\n", WARNING_LOG, chromosomeFileName, dirPath.c_str()); return (0); // can not open file } } char* CGenomeNTdata::genomeLocusID2Kmer(unsigned int uiKmer_Length, unsigned int genomeLocusID) { this->caKmer[0] = '\0'; //initialize if (this->paChromosomes[0] == NULL || this->paChromosomes[0]->caChromosome == NULL) { LOG_INFO("\nInfo %d: No chromosome or has been free.\n", WARNING_LOG); return(this->caKmer); } if (genomeLocusID == BAD_GENOME_INDEX) { LOG_INFO("\nInfo %d: Wrong Genome Locus.\n", WARNING_LOG); } else { unsigned int chrID = this->genomeIndex2chrID(genomeLocusID); if (chrID >= this->iNo_of_chromosome) { LOG_INFO("\nInfo %d: Error in the chrID.\n", ERROR_LOG); } else { unsigned int chrLocusID; if (chrID == 0) { chrLocusID = genomeLocusID; } else { chrLocusID = genomeLocusID - this->IndexCovertTable[chrID-1]; } if (chrLocusID < this->paChromosomes[chrID]->iChromosome_size) { strncpy(this->caKmer, &(this->paChromosomes[chrID]->caChromosome[chrLocusID]), uiKmer_Length); } else { LOG_INFO("\nInfo %d: Error in the chrLocusID 
%u, %u.\n", ERROR_LOG, chrID, chrLocusID); } this->caKmer[uiKmer_Length] = '\0'; } } return(this->caKmer); } /* This function will covert the position described by (chromosome ID and chromosome locus ID), * a number pair to genome locus index recorded in the table list. */ unsigned int CGenomeNTdata::chrIndex2genomelocusID(unsigned int iChrID, unsigned int iChrLocusID) { if (iChrID > 0) { unsigned int returnValue = this->IndexCovertTable[iChrID-1] + iChrLocusID; return(returnValue); } else return(iChrLocusID); } unsigned int CGenomeNTdata::genomeIndex2chrID(unsigned int igenomeLocusID) { unsigned int i; for (i = 0; i < this->iNo_of_chromosome; i++) { if (igenomeLocusID < this->IndexCovertTable[i]) { return(i); } } LOG_INFO("\nInfo %d: Unknown Chromosome %u.\n", WARNING_LOG, i); return(this->iNo_of_chromosome); } /* This function will covert to genome locus index recorded in the table list * to the locus index recorded in the table list */ unsigned int CGenomeNTdata::genomeLocusID2chrIndex(unsigned int igenomeLocusID) { unsigned int i; for (i = 0; i < this->iNo_of_chromosome; i++) { if (igenomeLocusID < this->IndexCovertTable[i]) { break; } } if (i == 0) { return(igenomeLocusID); } else if (i < this->iNo_of_chromosome) { return(igenomeLocusID - this->IndexCovertTable[i-1]); } else { LOG_INFO("\nInfo %d: wrong genome locus %u.\n", WARNING_LOG, igenomeLocusID); } return(igenomeLocusID); } void CGenomeNTdata::checkRefsNames(void) { set refNames; for (unsigned int i = 0; i < this->iNo_of_chromosome; i++) { vector* chrRefNs = &(this->paChromosomes[i]->geneVec.table); if (chrRefNs->size() > 0) { vector::iterator it = chrRefNs->begin(); for (; it != chrRefNs->end(); it++) { if (!refNames.insert(it->name).second) { LOG_INFO("Info %d: ref %s in %s are duplicated.\n",\ WARNING_LOG, it->name.c_str(), this->paChromosomes[i]->caInputFileName); } } } } } vector CGenomeNTdata::getRefNamesLengths(void) { vector refNamesLengthes; for (unsigned int i = 0; i < this->iNo_of_chromosome; 
i++) { CchromosomeNTdata* chr = this->paChromosomes[i]; vector* chrGenes = &(chr->geneVec.table); if ((int)(chrGenes->size()) > 0) { int originalSize = (int)refNamesLengthes.size(); copy(chrGenes->begin(), chrGenes->end(), std::back_inserter(refNamesLengthes)); // change the meaning CGene.startIndex to length of the gene vector::iterator it = refNamesLengthes.begin() + originalSize; for (; (it + 1) != refNamesLengthes.end(); it++) { it->startIndex = ((it + 1)->startIndex - it->startIndex); } it->startIndex = chr->iChromosome_size - it->startIndex; } else { refNamesLengthes.push_back(CGene(getBasename(chr->caInputFileName), chr->iChromosome_size)); } } return(refNamesLengthes); } unsigned int BruteForceSearch(CGenomeNTdata& genome, char* Kmer) { unsigned int kmer_length = (unsigned int) strlen(Kmer); unsigned int chrID = 0, chromsomeIndex = 0; // Best alignments chrID and index. unsigned int i, j, k; unsigned MinDiff = kmer_length; bool reverseIsBetter = false; for (i = 0; i < genome.iNo_of_chromosome; i++) { for (j = 0; j < genome.paChromosomes[i]->iChromosome_size - kmer_length; j++) { unsigned int diff = 0; for (k = 0; k < kmer_length; k++) { if (genome.paChromosomes[i]->caChromosome[j + k] != Kmer[k]) diff++; } if (diff < MinDiff) { MinDiff = diff; chrID = i; chromsomeIndex = j; reverseIsBetter = false; } //Also check the Reveres direction for (diff = 0, k = 0; k < kmer_length; k++) { char nt = genome.paChromosomes[i]->caChromosome[j+k]; nt = complimentBase(nt);//Get the complement base if (nt != Kmer[kmer_length - 1 - k]) diff++; } if (diff < MinDiff) { MinDiff = diff; chrID = i; chromsomeIndex = j; reverseIsBetter = true; } } } if (MinDiff <= 3) { cout << "Best alignment is located at "; unsigned int genomeIndex = genome.chrIndex2genomelocusID(chrID, chromsomeIndex); cout << genomeIndex << ':' << genome.genomeLocusID2Kmer(kmer_length, genomeIndex); if (reverseIsBetter) cout << "reverse is better" << endl; } return(MinDiff); } 
./Source/Genome_Index.cpp0000644011075700120610000001774711720654362015466 0ustar yanghochmath-ar#include "Genome_Index.h" CGenome_Index::CGenome_Index(void) { this->initialization(); } CGenome_Index::~CGenome_Index(void) { delete this->pgenomeNT; delete this->pgenomeNTInBits; } int CGenome_Index::initialization(void) { this->NO_OF_BUCKET = 0x4000000; // 64M buckets, 256 MB 4^13 // this->NO_OF_BUCKET = 0x1000000; // 16M buckets, 64 MB, 4^12 // this->NO_OF_BUCKET = 0x400000; // 4M buckets, 16 MB, 4^11 this->pgenomeNT = NULL; this->pgenomeNTInBits = NULL; this->fpHashValue = NULL; this->fpSeedKey = NULL; this->caRefName[0] = '\0'; this->bEXTEND_SEED = true; this->iHashDigits = 0; this->iKeyDigits = 0; return(0); } unsigned int CGenome_Index::getHashValue(char* slide_window) const { CReadInBits r(slide_window); return(this->getHashValue(r) & (NO_OF_BUCKET - 1)); } // generate Key for sorting unsigned int CGenome_Index::getSeedKey(char* slide_window) const { CReadInBits r(slide_window); return(getSeedKey(r)); } unsigned int CGenome_Index::getHashValue(CReadInBits r) const { if (this->fpHashValue != NULL) return(this->fpHashValue(r) & (NO_OF_BUCKET - 1)); // DEBUG else return(0); } unsigned int CGenome_Index::getSeedKey(CReadInBits r) const { if (this->fpSeedKey != NULL) return(this->fpSeedKey(r, this->iKeyDigits)); else return(0); } int CGenome_Index::chooseHashFunction(unsigned int uiReadLength, unsigned int chosenSeedId) { const char* F0SeedRepeat = "111"; const char* F1SeedRepeat = "1110"; const char* F2SeedRepeat = "1110100"; const char* S11SeedRepeat = "1111001000"; const char* F3SeedRepeat = "11101001000"; const char* S20SeedRepeat = "11110010000"; const char* S12SeedRepeat = "11110010000000"; //Only for read length 44 - 50 const char* F4SeedRepeat = "1100010000"; const char* SeedRepeatPattern; // (1) choose SeedRepeatPattern and Hash function to bucket ID switch (chosenSeedId) { case 0: SeedRepeatPattern = F0SeedRepeat; this->fpHashValue = 
selectF0(uiReadLength); break; case 1: SeedRepeatPattern = F1SeedRepeat; this->fpHashValue = selectF1(uiReadLength); break; case 2: SeedRepeatPattern = F2SeedRepeat; this->fpHashValue = selectF2(uiReadLength); break; case 11: // full sensitive to three mismatches but two of them must to be consecutive. SeedRepeatPattern = S11SeedRepeat; this->fpHashValue = selectS1_1(uiReadLength); break; case 20: // full sensitive to 2 pairs of consecutive mismatches SeedRepeatPattern = S20SeedRepeat; this->fpHashValue = selectS2_0(uiReadLength); break; case 3: SeedRepeatPattern = F3SeedRepeat; this->fpHashValue = selectF3(uiReadLength); break; case 12: // full sensitive to four mismatches but two of them must to be consecutive. if (uiReadLength >= 44) { SeedRepeatPattern = S12SeedRepeat; this->fpHashValue = getS1_2SeedHashValue; break; } // otherwise use F4 case 4: SeedRepeatPattern = F4SeedRepeat; this->fpHashValue = selectF4(uiReadLength); this->fpSeedKey = &returnDummyHashKey; // DEFAULT break; default: SeedRepeatPattern = F3SeedRepeat; // DEFAULT LOG_INFO("\nInfo %d: The sensitivity threshold haven't been implemented.\n",\ INFO_LOG); LOG_INFO("\nInfo %d: Use seed pattern which is full sensitivit to 3 mismatches instead.\n",\ INFO_LOG); } if (this->fpHashValue == NULL) { string seedStr = seedSymbol(chosenSeedId); LOG_INFO("Info %d: Read length is too short (or long) for the seed %s.\n" \ , ERROR_LOG, seedStr.c_str()); return(-1); } // (2) Get the hashkey function for binary search int DEFAULT_HASHING_BITS = 13; this->uiNoOfShift = (unsigned int)strlen(SeedRepeatPattern) - 1; if (bEXTEND_SEED) { this->iHashDigits = getNoOfCaredPositions4FullRead(SeedRepeatPattern, uiReadLength); // Speical setting to extend 34-bp and 32 reads because the min weight is twelve only if (uiReadLength == 34 && (chosenSeedId == FULL_SENSITIVE_OPT_TO_TWO_BASE_MIS || chosenSeedId == 3)) { DEFAULT_HASHING_BITS = 12; } else if (uiReadLength == 32 && chosenSeedId == 3) { DEFAULT_HASHING_BITS = 10; } int 
seedWeight = getNoOfCaredPositions(SeedRepeatPattern, uiReadLength); if (DEFAULT_HASHING_BITS > seedWeight) { string msg("The mapping could be slow.\n"); LOG_INFO("Info %d: Seed weight %d is low due to the short read length.%s" \ , INFO_LOG, seedWeight, msg.c_str()); this->iHashDigits = getNoOfCaredPositions(SeedRepeatPattern, uiReadLength); this->bEXTEND_SEED = false; } } else { this->iHashDigits = getNoOfCaredPositions(SeedRepeatPattern, uiReadLength); } if (this->iHashDigits > DEFAULT_HASHING_BITS) { this->iKeyDigits = this->iHashDigits - DEFAULT_HASHING_BITS; this->iHashDigits = DEFAULT_HASHING_BITS; } else { this->iKeyDigits = 0; } // When hashing index, use this->uiSeedLength to filter out sliding windows with N this->uiSeedLength = uiReadLength - this->uiNoOfShift; // If extended seed method is used, this->uiSeedLength will be changed to read length. // (3) chooose See key function return(this->chooseSeedKeyFunction(uiReadLength, chosenSeedId)); } int CGenome_Index::chooseSeedKeyFunction(unsigned int uiReadLength, unsigned int chosenSeedId) { switch (chosenSeedId) { case 0: if (iKeyDigits > 0) { this->fpSeedKey = &getF0SeedKey; } else { this->fpSeedKey = &returnDummyHashKey; } break; case 1: if (iKeyDigits > 0) { this->fpSeedKey = &getF1SeedKey; } else { this->fpSeedKey = &returnDummyHashKey; } break; case 2: // Full sensitivie to 2 mis if (iKeyDigits > 0) { this->fpSeedKey = &getF2SeedKey; } else { this->fpSeedKey = &returnDummyHashKey; } break; case 11: // Full sensitivie to 1 color + 1 base mis if (iKeyDigits > 0) { this->fpSeedKey = &getS1_1SeedKey; } else { this->fpSeedKey = &returnDummyHashKey; } break; case 20: // full sensitive to 2 pairs of consecutive mis. 
if (iKeyDigits > 0) { if (uiReadLength == 34) { this->fpSeedKey = &getS2_0SeedKey4ReadLength34; } else this->fpSeedKey = &getS2_0SeedKey; } else { this->fpSeedKey = &returnDummyHashKey; } break; case 3: // Full sensitivie to 3 mis if (iKeyDigits > 0) { if (uiReadLength == 34) { this->fpSeedKey = &getF3SeedKey4ReadLength34; } else if (uiReadLength == 32) { this->fpSeedKey = &getF3SeedKey4ReadLength32; } else this->fpSeedKey = &getF3SeedKey; } else { this->fpSeedKey = &returnDummyHashKey; // DEBUG } break; case 12: // Full sensitive to 1 color + 2 mis if (iKeyDigits > 0) { if (46 <= uiReadLength && uiReadLength <= 49) { this->fpSeedKey = &getS1_2SeedKey4ReadLength46_49; } else { this->fpSeedKey = &returnDummyHashKey; // DEFAULT } break; } // otherwise use F4 case 4: this->fpSeedKey = &returnDummyHashKey; // DEFAULT break; default: this->fpSeedKey = &returnDummyHashKey; // DEFAULT if (chosenSeedId > MAX_MISMATCH_THRESHOLD) { LOG_INFO("Info %d: No Seed key function is defined for\ full sensitive opt %d!\n", WARNING_LOG, chosenSeedId); return(-1); } } return (0); } ./Source/Genome_Index_Table.cpp0000644011075700120610000007031611720654362016564 0ustar yanghochmath-ar#include "Genome_Index_Table.h" // For test #define PRINT_GENOME_SUBSTRING(genomeIndex, slideWindowLength)\ do {\ char ref[FILENAME_MAX];\ this->pgenomeNTInBits->getSubstringInBits(genomeIndex, slideWindowLength).decode(ref);\ cout << genomeIndex << "\t" << ref << endl;\ } while(0) #define GENOME_SUBSTRING_CMP(genomeIndex1, genomeIndex2, slideWindowLength)\ do {\ char ref1[FILENAME_MAX];\ char ref2[FILENAME_MAX];\ this->pgenomeNTInBits->getSubstringInBits(genomeIndex1, slideWindowLength).decode(ref1);\ this->pgenomeNTInBits->getSubstringInBits(genomeIndex2, slideWindowLength).decode(ref2);\ if(strcmp(ref1, ref2) != 0) {\ cout << genomeIndex1 << ',' << ref1 << "\t" << endl;\ cout << genomeIndex2 << ',' << ref2 << "\t" << endl;\ }\ } while(0) CGenome_Index_Table::CGenome_Index_Table(void) { 
this->initialization(); } CGenome_Index_Table::~CGenome_Index_Table(void) { } int CGenome_Index_Table::initialization(void) { this->num_of_repeat_patterns = 0; this->pbaRepeatRepresentativeFlag = NULL; this->pbaRepeatMaskedFlag = NULL; return(0); } int CGenome_Index_Table::getGenomeNTdata(const char* genomeListfileName, string refFormat = "") { if (hasTheExtName(genomeListfileName, ".fasta") || hasTheExtName(genomeListfileName, ".fna") || hasTheExtName(genomeListfileName, ".mfa") || hasTheExtName(genomeListfileName, ".dat") || hasTheExtName(genomeListfileName, ".fa") || refFormat == "fasta") { const bool bFastaFormat = true; this->pgenomeNT = new CGenomeNTdata(); this->pgenomeNT->addChromosome(genomeListfileName, bFastaFormat); } else { this->pgenomeNT = new CGenomeNTdata(genomeListfileName); } return(0); } bool CGenome_Index_Table::read_index_table(const char* indexFilePath, bool bPrintErrMsg) { bool sucessfuallyReadTable = true; if (!fileExist(indexFilePath)) { if (hasTheExtName(indexFilePath, ".index")) { if (bPrintErrMsg) { LOG_INFO("Info %d: Index file %s haven't been built in %s.\n",\ INFO_LOG, indexFilePath, get_working_directory().c_str()); } } return(false); } if (this->pgenomeNT == NULL) { this->pgenomeNT = new CGenomeNTdata(); } if (this->pgenomeNTInBits == NULL) { this->pgenomeNTInBits = new CGenomeInBits(); } FILE* fp = fopen(indexFilePath, "rb"); if (fscanf(fp, "%s\n", this->caRefName) == 0) { ERR; } if (readRefInBinFile(fp, this->pgenomeNTInBits, this->pgenomeNT)) { LOG_INFO("Info %d: Fail to readin ref from index file\n", INFO_LOG); return(false); } TIME_INFO(sucessfuallyReadTable = this->read_Hash_Table(fp), "Read in index "); TIME_INFO(check_masked_flags(), "Masked region built"); fclose(fp); if (this->chooseHashFunction(this->uiRead_Length, \ this->chosenSeedId, bMapReadInColors) < 0) { sucessfuallyReadTable = false; } // When table is built, this is set after bucket the index if (bEXTEND_SEED) { this->uiSeedLength = this->uiRead_Length; } if 
(sucessfuallyReadTable) { LOG_INFO("Info %d: Successfully read in the index\n", INFO_LOG); } else { LOG_INFO("Info %d: No saved index is found. Building a new index\n", INFO_LOG); } return(sucessfuallyReadTable); } bool CGenome_Index_Table::make_index_table(unsigned int uiReadLength, unsigned int uiSeedId, bool bMapReadInColors, bool makedMathRepeats) { if ( uiReadLength > MAX_READ_LENGTH) { uiReadLength /= 2; // Make sure the read length is in the range } delete this->pbaRepeatMaskedFlag; this->pbaRepeatMaskedFlag = new CboolFlagArray(this->pgenomeNT->iGenomeSize); delete this->pbaRepeatRepresentativeFlag; this->pbaRepeatRepresentativeFlag = new CboolFlagArray(this->pgenomeNT->iGenomeSize); if (makedMathRepeats) { LOG_INFO("Info %d: Identify Maskable mathmatical repeats\n", INFO_LOG); if (find_maskable_mathmatical_repeats(uiReadLength, uiSeedId) < 0) { LOG_INFO("Info %d: Fail to find mathmatical repeats\n", ERROR_LOG); return(false); } } const int USE_HASH_KEY_TO_SORT = 0; // Sort by the default hash key, by setting it to 0. if (this->chooseHashFunction(uiReadLength, uiSeedId, bMapReadInColors) < 0) { return(false); // fail to choose the hash function. } if (this->pHashIndexTable != NULL) { delete this->pHashIndexTable; this->pHashIndexTable = NULL; } this->pHashIndexTable = new CHashIndexT(NO_OF_BUCKET); // this->add_repeat_masked_flags(); // Read in the masked region from fixed file and set the flag TIME_INFO(this->countBucketSize(), "Count bucket size "); // counting each bucket size TIME_INFO(this->hashKmer2Bucket(), "Hash record "); // put each element into a bucket. TIME_INFO(this->sortTable(USE_HASH_KEY_TO_SORT), "Sort table "); // sort in each bucket, using TIME_INFO(this->check_masked_flags(), "Check masked loci "); // put each element into a bucket. 
LOG_INFO("Info %d: Successfully made the index\n", INFO_LOG); return(true); } bool CGenome_Index_Table::save_index_table(const char* indexFilePath, bool bPrintErrMsg) { this->indexFileName = get_index_path(string(indexFilePath)); remove( this->indexFileName.c_str() ); // delete the previous indexa string tmpIndexFileN = chExtName(this->indexFileName, ".tmp"); // save the incomplete index in *.tmp LOG_INFO("Info %d: Save index to %s as tmp\n", CONFIG_LOG, tmpIndexFileN.c_str()); FILE *fp = fopen(tmpIndexFileN.c_str(), "wb+"); fprintf(fp, "%s\n", this->caRefName); if (saveRefInBinFile(fp, this->pgenomeNTInBits, this->pgenomeNT)) { if (bPrintErrMsg) { LOG_INFO("Info %d: Fail to save ref in binary to %s\n", ERROR_LOG, tmpIndexFileN.c_str()); } return(false); } TIME_INFO(save_Hash_Table(fp), "Index table saved to disk"); fclose(fp); if (rename(tmpIndexFileN.c_str(), indexFileName.c_str())) { if (bPrintErrMsg) { LOG_INFO("Info %d: ERR to rename %s to index file\n"\ , ERROR_LOG, tmpIndexFileN.c_str()); } return(false); } // DEBUG this->read_index_table(this->indexFileName.c_str()); return(true); } string CGenome_Index_Table::get_index_path(string indexFilePath) { string defaultFileName = default_index_path(this->caRefName, this->bMapReadInColors,\ this->chosenSeedId, this->uiRead_Length); if (is_accessible_directory(indexFilePath.c_str())) { indexFilePath = getFullPath(indexFilePath, defaultFileName); } if (isPathWritable(indexFilePath.c_str())) { this->indexFileName = string(indexFilePath); } else { this->indexFileName = defaultFileName; // const char* pindexFileName = indexFilePath.c_str(); if (indexFilePath != "") { LOG_INFO("Info %d: Path %s is not writable.\nUse default Path %s instead\n"\ , WARNING_LOG, indexFilePath.c_str(), this->indexFileName.c_str()); } } return(this->indexFileName); } // this function will first build "suffix-like array" and compare the neighbor substring to find all mathamatical repeat in uiReadLength + uiNoOfShift int 
CGenome_Index_Table::find_maskable_mathmatical_repeats(unsigned int uiReadLength, unsigned int uiSubThreshold) { bool originalbMapReadInColors = this->bMapReadInColors; // Save the original setting this->bMapReadInColors = false; // Find the uiReadLength + this->uiNoOfShift repeat without consider color space or not if (this->chooseHashFunction(uiReadLength, uiSubThreshold, false) < 0) { return(-1); } if (this->pHashIndexTable != NULL) { delete this->pHashIndexTable; this->pHashIndexTable = NULL; } this->pHashIndexTable = new CHashIndexT(NO_OF_BUCKET); unsigned int originalReadLengthInBits = CReadInBits::iReadLength; CReadInBits::iReadLength = uiReadLength + this->uiNoOfShift; // (uiReadLength + this->uiNoOfShift) bp substring with N won't be bucket to sort. this->uiSeedLength = uiReadLength + this->uiNoOfShift; // (this->uiSeedLength was set in chooseHashFunction) TIME_INFO(this->countBucketSize(), "Count bucket size "); // counting each bucket size TIME_INFO(this->hashKmer2Bucket(), "Hash record created "); // put each element into a bucket. 
TIME_INFO(this->sortTable(this->uiNoOfShift + this->uiRead_Length), "Sort table "); // sort in each bucket TIME_INFO(this->find_mathmatical_repeats(), "Find math repeats"); // check the repeat and put into a flag array this->pbaRepeatMaskedFlag // TIME_INFO(this->check_masked_flags(), "Check masked loci "); // set the flag for each loci its following windows contain 'N' // The this->pHashIndexTable and this->pIndexTable will be deleted and new when make_index_table(); CReadInBits::iReadLength = originalReadLengthInBits; // Push back the original setting this->bMapReadInColors = originalbMapReadInColors; return(0); } int CGenome_Index_Table::chooseHashFunction(unsigned int uiReadLength, \ unsigned int chosenSeedId, \ bool bMapReadInColors) { CReadInBits::iReadLength = (int) uiReadLength; this->uiRead_Length = uiReadLength; this->chosenSeedId = chosenSeedId ; this->bMapReadInColors = bMapReadInColors; if (bMapReadInColors) { return(CGenome_Index::chooseHashFunction(uiReadLength - 1, chosenSeedId )); } else { return(CGenome_Index::chooseHashFunction(uiReadLength, chosenSeedId )); } } // Private function called by Construct_IndexTable, which do the counting for each bucket - Step 1 int CGenome_Index_Table::countBucketSize(void) { unsigned int kmer_length; // Length of the sliding windows on the reference. if (this->bMapReadInColors) {// Need one more base to get colors for index. 
kmer_length = this->uiSeedLength + 1; } else { kmer_length = this->uiSeedLength; } unsigned int uiNonMaskedLoci = 0; for (unsigned int chrId = 0; chrId < this->pgenomeNT->iNo_of_chromosome; chrId++) { uiNonMaskedLoci += countBucketSize4Chr(chrId, kmer_length); } return(this->bucketCount2Index(uiNonMaskedLoci)); } int CGenome_Index_Table::countBucketSize4Chr(int chrId, unsigned int kmer_length) { unsigned int uiNonMaskedLoci = 0; CchromosomeNTdata* pChr = this->pgenomeNT->paChromosomes[chrId]; unsigned int* counter = this->pHashIndexTable->aiIndexTable; unsigned int chrIndexStart = this->pgenomeNT->chrIndex2genomelocusID(chrId, 0); /* int th_id = 0, no_CPU = 1; unsigned int countersNoPerCpu = this->pHashIndexTable->uiSize; #ifdef _OPENMP // Parallelization no_CPU = omp_get_num_procs(); countersNoPerCpu /= no_CPU; cout << "Divide counters into " << countersNoPerCpu << "counters per CPU" << endl; // DEBUG #pragma omp parallel private(th_id) { th_id = omp_get_thread_num(); #endif */ for (unsigned int chrIndex = 0; chrIndex < pChr->iChromosome_size - this->uiSeedLength; chrIndex ++) { unsigned int genomeIndex = chrIndexStart + chrIndex; unsigned int uiHashValue = 0; CReadInBits kmerInBits; bool goodKmer = this->pgenomeNTInBits->fragACGTKmerInBits(kmerInBits, genomeIndex, kmer_length); if (goodKmer && !this->pbaRepeatMaskedFlag->b(genomeIndex) /* if not masked */) { /*char caRead[wordSize]; kmerInBits.decode(caRead); // check debug*/ if (this->bMapReadInColors) { // put in index for colors instead of bases. 
kmerInBits = bases2PureColors(kmerInBits); } uiHashValue = this->getHashValue(kmerInBits); // if (th_id == 0) uiNonMaskedLoci++; // if (uiHashValue / countersNoPerCpu == (unsigned int)th_id) counter[uiHashValue]++; // In crease the count to the bucket } } /* #ifdef _OPENMP // Parallelization } #endif */ return(uiNonMaskedLoci); } // private function only called by countBucketSize int CGenome_Index_Table::bucketCount2Index(unsigned int uiNonMaskedLoci) { bool error = false; // Let the bucket size counter become hash index to each bucket if (this->pHashIndexTable->aiIndexTable[this->pHashIndexTable->uiSize] != 0) { ERR; error = true; } this->pHashIndexTable->Counter2Index(); if (this->pHashIndexTable->aiIndexTable[this->pHashIndexTable->uiSize] != this->pHashIndexTable->aiIndexTable[this->pHashIndexTable->uiSize -1]) { ERR; error = true; } // The last record shows the total nubmer of Kmers that have been hashed to the buckets this->size = this->pHashIndexTable->aiIndexTable[this->pHashIndexTable->uiSize]; if (this->size != uiNonMaskedLoci) { ERR; cout << this->size; cout << uiNonMaskedLoci; error = true; } return((int)error); } // Hash the Kmer to the HashIndexTable bucket, without sorting the dHashkey int CGenome_Index_Table::hashKmer2Bucket(void) { delete [] this->pIndexTable; this->pIndexTable = new CIndex_Type[this->size]; // Note there are this->pHashIndexTable->uiSize buckets but this->pHashIndexTable->uiSize + 1 bucket pointers unsigned int uiNonMaskedLoci = 0; for (unsigned int chrId = 0; chrId < this->pgenomeNT->iNo_of_chromosome; chrId++) { unsigned int kmer_length; // Length of the sliding windows on the reference. if (this->bMapReadInColors) { // Need one more base to get colors for index. 
kmer_length = this->uiSeedLength + 1; } else { kmer_length = this->uiSeedLength; } uiNonMaskedLoci += this->hashKmer2Bucket4Chr(chrId, kmer_length); } if (this->size != uiNonMaskedLoci) { ERR; return(1); } return(0); } int CGenome_Index_Table::hashKmer2Bucket4Chr(int chrId, unsigned int kmer_length) { unsigned int uiNonMaskedLoci = 0; CchromosomeNTdata* pChr = this->pgenomeNT->paChromosomes[chrId]; unsigned int chrIndexStart = this->pgenomeNT->chrIndex2genomelocusID(chrId, 0); /* int th_id = 0, no_CPU = 1; unsigned int countersNoPerCpu = this->pHashIndexTable->uiSize; #ifdef _OPENMP // Parallelization no_CPU = omp_get_num_procs(); countersNoPerCpu /= no_CPU; cout << "Divide counters into " << countersNoPerCpu << " counters per CPU" << endl; // DEBUG #pragma omp parallel private(th_id) { th_id = omp_get_thread_num(); } #endif */ for (unsigned int chrIndex = 0; chrIndex < pChr->iChromosome_size - this->uiSeedLength; chrIndex++) { unsigned int genomeIndex = chrIndexStart + chrIndex; unsigned int uiHashValue = 0; unsigned int uiTableIndex = 0; CReadInBits kmerInBits; bool goodKmer = this->pgenomeNTInBits->fragACGTKmerInBits(kmerInBits, genomeIndex, kmer_length); if (goodKmer && !this->pbaRepeatMaskedFlag->b(genomeIndex) /* if not masked */) { if (this->bMapReadInColors) { // put in index for colors instead of bases.( kmerInBits = bases2PureColors(kmerInBits); } uiHashValue = this->getHashValue(kmerInBits); // Check, if the Bucket counting and the Hashing assign is consistent, it should be the same. if (this->pHashIndexTable->aiIndexTable[uiHashValue] < 0) { LOG_INFO("\nInfo %d: Inconsistent in bucket size. 
\n", WARNING_LOG); } else { // if (th_id == 0) uiNonMaskedLoci++; //if (uiHashValue / countersNoPerCpu == (unsigned int)th_id) { // Put the record at -1 position of the stack this->pHashIndexTable->aiIndexTable[uiHashValue]--; uiTableIndex = this->pHashIndexTable->aiIndexTable[uiHashValue]; this->pIndexTable[uiTableIndex] = genomeIndex; } } } } /* #ifdef _OPENMP // Parallelization } #endif */ return(uiNonMaskedLoci); } bool CGenome_Index_Table::compareKey(CIndex_Type I1, CIndex_Type I2) { CReadInBits ref1, ref2; if (this->bMapReadInColors) { ref1 = this->pgenomeNTInBits->getSubstringInBits(I1, this->uiSeedLength + 1); ref2 = this->pgenomeNTInBits->getSubstringInBits(I2, this->uiSeedLength + 1); ref1 = bases2PureColors(ref1); // ref1 is now in pure colors ref2 = bases2PureColors(ref2); // ref2 is now in pure colors } else { ref1 = this->pgenomeNTInBits->getSubstringInBits(I1, this->uiSeedLength); ref2 = this->pgenomeNTInBits->getSubstringInBits(I2, this->uiSeedLength); } unsigned int key1 = this->getSeedKey(ref1); unsigned int key2 = this->getSeedKey(ref2); return(key1 < key2); } // return true if the corresponding substring I1 < I2 bool CGenome_Index_Table::compareSubstring(CIndex_Type I1, CIndex_Type I2, unsigned int slidingWindows) { CReadInBits ref1, ref2; ref1 = this->pgenomeNTInBits->getSubstringInBits(I1, slidingWindows); ref2 = this->pgenomeNTInBits->getSubstringInBits(I2, slidingWindows); return(ref1 < ref2); } // return true if the corresponding substring is the same bool CGenome_Index_Table::sameSubstring(CIndex_Type I1, CIndex_Type I2, unsigned int slidingWindows) { CReadInBits ref1, ref2; ref1 = this->pgenomeNTInBits->getSubstringInBits(I1, slidingWindows); ref2 = this->pgenomeNTInBits->getSubstringInBits(I2, slidingWindows); return(ref1 == ref2); } int CGenome_Index_Table::sortTable(unsigned int substringLength) { if (bEXTEND_SEED) { // The previous seed Length is the "shortest seed length" to exclude 'N' when hash genome Index into bins. 
// The longest seed length in the exteneded seed mode is the read length. this->uiSeedLength = this->uiRead_Length; } CcompareFunctor4Sort sortFunctor(this, substringLength); #ifdef _OPENMP // Parallelization int numberOfCPUs = omp_get_num_procs(); LOG_INFO("\nInfo %d: Sortubg buckets using %d CPUs %s.\n",\ INFO_LOG, numberOfCPUs, BLANK_LINE); #pragma omp parallel for #endif // Because OpenMP 2.5 only support signed integer loop. Case the uiSize. The number of bucket is limited to signed int for (int i = 0; i < (int)this->pHashIndexTable->uiSize; i++) { unsigned int BucketStart = this->pHashIndexTable->aiIndexTable[i]; unsigned int NextBucketStart = this->pHashIndexTable->aiIndexTable[i + 1]; // No need to sort if bucket is empty or has only one element if (NextBucketStart > BucketStart + 1) { // Put the functor for sorting (sorting use the hashkey generated by the corresponding genome sequence) std::sort(&this->pIndexTable[BucketStart], &this->pIndexTable[NextBucketStart], sortFunctor); } else if (this->pHashIndexTable->aiIndexTable[i + 1] < BucketStart) { ERR; } // else empty bucket, no need to sort } return(0); } // Find math repeat by checking the neighbor of index array in a bucket. // Build an flags arrays for masked mathematical repeats. This is imcomplete. 
int CGenome_Index_Table::find_mathmatical_repeats(void) { int num_of_repeats = 0; unsigned int slideWindowLength = this->uiRead_Length + this->uiNoOfShift; // char outputFileName[FILENAME_MAX]; // sprintf(outputFileName, "maskable_repeat_%s_%d_%d.txt", this->caRefName, this->uiRead_Length, this->uiNoOfShift); ofstream ofile; // (outputFileName); Don't open the file for for (unsigned int i = 0; i < this->pHashIndexTable->uiSize; i++) { unsigned int BucketStart = this->pHashIndexTable->aiIndexTable[i]; unsigned int NextBucketStart = this->pHashIndexTable->aiIndexTable[i + 1]; num_of_repeats += find_mathmatical_repeats_in_a_bucket(ofile, BucketStart, NextBucketStart); } cout << num_of_repeats << " repeats with " << num_of_repeat_patterns << " patterns in " << slideWindowLength << " bp are found!" << endl; ofile.close(); return(num_of_repeats); } // Find math repeat by checking the neighbor of index array in a bucket. int CGenome_Index_Table::find_mathmatical_repeats_in_a_bucket(ofstream& ofile, unsigned int BucketStart, unsigned int NextBucketStart) { int num_of_repeats = 0; int num_of_repeats_in_the_pattern = 0; unsigned int slideWindowLength = this->uiRead_Length + this->uiNoOfShift; bool bRepeat = false; // No need to check if the bucket is empty or has only one element for (unsigned int j = BucketStart; j + 1 < NextBucketStart; j++) { //PRINT_GENOME_SUBSTRING(this->pIndexTable[j], 50); // check, print the string unsigned int genomeIndex1 = this->pIndexTable[j]; unsigned int genomeIndex2 = this->pIndexTable[j + 1]; unsigned int representativeRepeatStartGIndex; if (this->sameSubstring(genomeIndex1, genomeIndex2, slideWindowLength)) { // if there is repeat. 
if ( bRepeat == false ) { // new repeat pattern this->num_of_repeat_patterns++; num_of_repeats++; num_of_repeats_in_the_pattern = 1; // record the pattern char ref[FILENAME_MAX]; this->pgenomeNTInBits->getSubstringInBits(genomeIndex1, slideWindowLength).decode(ref); representativeRepeatStartGIndex = genomeIndex1 + this->uiNoOfShift; ofile << representativeRepeatStartGIndex << '\t' << &ref[this->uiNoOfShift] << '\t'; // set the "shifts" number is duplicated if mapped for (unsigned int i = 0; i < this->uiNoOfShift; i++) { unsigned int duplicatedLocus = genomeIndex1 + i; this->pbaRepeatRepresentativeFlag->setflag(duplicatedLocus, true); } } // PRINT_GENOME_SUBSTRING(genomeIndex, 50); // check, print the string // masked the duplicate pattern unsigned int maskableRepeatStartIndex = genomeIndex2 + this->uiNoOfShift; this->pbaRepeatMaskedFlag->setflag(maskableRepeatStartIndex, true); // mask the maskable repeat num_of_repeats++; num_of_repeats_in_the_pattern ++; bRepeat = true; ofile << ',' << maskableRepeatStartIndex; } else { if (bRepeat == true) { ofile << '\t' << num_of_repeats_in_the_pattern << endl; // end of an repeat pattern } bRepeat = false; num_of_repeats_in_the_pattern = 0; } } if (bRepeat == true) { ofile << '\t' << num_of_repeats_in_the_pattern << endl; // end of an repeat pattern } return(num_of_repeats); } // Set a flag for masked region, which is a read-length-long sliding windows with 'N' in it. 
int CGenome_Index_Table::check_masked_flags(void) { unsigned int uiNonMaskedLoci = 0; if (this->pbaMaskedFlag == NULL) { this->pbaMaskedFlag = new CboolFlagArray(this->pgenomeNT->iGenomeSize + 1); } // Default value of each flag is false for (unsigned int chrId = 0; chrId < this->pgenomeNT->iNo_of_chromosome; chrId++) { char* pChrNT = this->pgenomeNT->paChromosomes[chrId]->caChromosome; unsigned int chrLength = this->pgenomeNT->paChromosomes[chrId]->iChromosome_size; unsigned int last_masked_base_dis = 0; unsigned int genomeLoucsIndex = this->pgenomeNT->chrIndex2genomelocusID(chrId, 0); for (unsigned int i = 0; i < chrLength; i++, genomeLoucsIndex++) { // Set masked by default this->pbaMaskedFlag->setflag(genomeLoucsIndex, true); bool thisLocusIsN; if (pChrNT == NULL) { thisLocusIsN = this->pgenomeNTInBits->pNBits->b(genomeLoucsIndex); } else { thisLocusIsN = (pChrNT[i] == 'N'); } if (thisLocusIsN) { last_masked_base_dis = 0; } else { last_masked_base_dis ++; } if (last_masked_base_dis >= this->uiRead_Length) { // If the distance to the previous N is larger than read length // genomeLoucsIndex >= this->uiRead_Length becasue last_masked_base_dis was initialize to 0 this->pbaMaskedFlag->setflag(genomeLoucsIndex + 1 - this->uiRead_Length, false); uiNonMaskedLoci ++; } } // The tail of chromosome region are masked. } LOG_INFO("Info %d: %u out of %u loci are unmasked\n", FINE_LOG, uiNonMaskedLoci, this->pgenomeNT->iGenomeSize); return(0); } // add repeat masked region from fixed files and set another flags. The region won't be put in the index table. 
int CGenome_Index_Table::add_repeat_masked_flags(void) { if (this->pbaRepeatMaskedFlag == NULL) { this->pbaRepeatMaskedFlag = new CboolFlagArray(this->pgenomeNT->iGenomeSize + 1); } // Default value of each flag is false for (unsigned int chrId = 0; chrId < this->pgenomeNT->iNo_of_chromosome; chrId++) { unsigned int chrLength = this->pgenomeNT->paChromosomes[chrId]->iChromosome_size; unsigned int chrEndGenomeLocus = this->pgenomeNT->chrIndex2genomelocusID(chrId, 0) + chrLength - 1; // Open the files that contained the masked regions in (masked start, masked end) in each line char repeatMaskRegionsFile[FILENAME_MAX]; sprintf(repeatMaskRegionsFile, "deadzone_Maskable_%u_%u.txt", this->uiRead_Length, chrId); ifstream ifile(repeatMaskRegionsFile); if (!ifile.good()) { cout << "Can't open file " << repeatMaskRegionsFile << endl; continue; } while (true) { char caBuffer[FILENAME_MAX]; caBuffer[0] = '\0'; ifile.getline(caBuffer, FILENAME_MAX); if (caBuffer[0] == '\0') { break; // end of the file } // Get the masked region number and set the flag. Assume the index also starts from 0. unsigned maskStart = 0; unsigned maskEnd = 0; sscanf(caBuffer, "%d%d", &maskStart, &maskEnd); unsigned int maskSpan = maskEnd - maskStart + 1; unsigned int genomeLoucsIndex = this->pgenomeNT->chrIndex2genomelocusID(chrId, maskStart); if ( genomeLoucsIndex + maskSpan < chrEndGenomeLocus) { for (unsigned int j = genomeLoucsIndex; j < genomeLoucsIndex + maskSpan; j++) { this->pbaRepeatMaskedFlag->setflag(j, true); } } } } unsigned int uiRepeatMaskedLoci = 0; for (unsigned int i = 0; i < this->pgenomeNT->iGenomeSize; i++) { if (this->pbaRepeatMaskedFlag->b(i)) { uiRepeatMaskedLoci++; } } cout << uiRepeatMaskedLoci << " out of " << this->pgenomeNT->iGenomeSize; cout << " loci are skip as maskable repeats." 
<< endl; return(0); } bool testTable(CGenome_Index_Table& table) { // FOR TEST Purpose // (1) Each Index in the table contains no N wihtin the sliding windows of length (this->readlength - this->iNoOfShift) for (unsigned int i = 0; i < table.pHashIndexTable->uiSize; i++) { unsigned int BucketStart, NextBucketStart; BucketStart = table.pHashIndexTable->aiIndexTable[i]; NextBucketStart = table.pHashIndexTable->aiIndexTable[i + 1]; for (CIndex_Type* it = &(table.pIndexTable[BucketStart]); \ it < &(table.pIndexTable[NextBucketStart]); it++) { if (table.pgenomeNTInBits->pNBits->b(*it, table.uiRead_Length - table.uiNoOfShift)) { cout << "Wrongly put masked region to the index " << endl; ERR } /* Check if the reads is sorted CReadInBits kmerInBits = this->pgenomeNTInBits->getSubstringInBits(*it, this->uiRead_Length); kmerInBits = bases2PureColors(kmerInBits); printBitsStr(kmerInBits, this->uiRead_Length - 1); unsigned int uiHashValue = this->getHashValue(kmerInBits); unsigned int uiSeedKey = this->getSeedKey(kmerInBits); cout << "Got you " << *it << ',' << uiHashValue << ',' << this->getSeedKey(kmerInBits) << endl; */ } } return(0); } ./Source/Genome_Index_TableQ.cpp0000644011075700120610000005407411720654362016710 0ustar yanghochmath-ar#include "Genome_Index_TableQ.h" CGenome_Index_TableQ::CGenome_Index_TableQ(void) { initialization(); } CGenome_Index_TableQ::~CGenome_Index_TableQ(void) { // this->pgenomeNTInBits and this->pgenome will be deleted in parent class; } int CGenome_Index_TableQ::initialization(void) { this->bExcludeAmbiguous = true; return(0); } bool CGenome_Index_TableQ::getSeqFromFasta(const char* genomeListfileName, string refFormat) { getBasename(genomeListfileName, this->caRefName); if (fileExist(genomeListfileName)) { this->getGenomeNTdata(genomeListfileName, refFormat); TIME_INFO(this->pgenomeNTInBits = new CGenomeInBits(this->pgenomeNT), "Build genome in bits "); this->pgenomeNT->freeChromosomeSpace(); return(true); } else { cout << "File " << 
genomeListfileName << " not found!" << endl; return(false); } } bool CGenome_Index_TableQ::getSeqFromDS(CGenomeNTdata* pgenomeNT) { this->pgenomeNT = pgenomeNT; myStrCpy(this->caRefName, pgenomeNT->refName, FILENAME_MAX); TIME_INFO(this->pgenomeNTInBits = new CGenomeInBits(this->pgenomeNT), "Build genome in bits "); this->pgenomeNT->freeChromosomeSpace(); return(true); } // (1) Query each seed pattern for hit. (2) Check each hit for valid alignment and put in the palignmentsQ // (3) Return the minDiff for alignment pair CGenome_Index_TableQ::queryKmer\ (CReadInBits window, unsigned int shift) const { unsigned int BIN_SIZE_CHECK_THRESHOLD = 1; // If there are only this number in bin, check window = window.getSuffixStr(shift); unsigned int uiHashValue = this->getHashValue(window); unsigned int uiSeedKeyL = this->getSeedKey(window); unsigned int uiSeedKeyU; if (bEXTEND_SEED) { uiSeedKeyU = this->getSeedKeyUpperBound(window, shift); } else { uiSeedKeyU = uiSeedKeyL; } // cout << uiHashValue << ',' << uiSeedKey << endl; unsigned int bucketStart = this->pHashIndexTable->aiIndexTable[uiHashValue]; unsigned int nextBucketStart = this->pHashIndexTable->aiIndexTable[uiHashValue + 1]; if (bucketStart > nextBucketStart) ERR; // Error Case if (bucketStart >= nextBucketStart) { // empty bucket CIndex_Type* tmp = NULL; return(pair(tmp, tmp)); //No hit } CIndex_Type* hitStart = &this->pIndexTable[bucketStart]; CIndex_Type* hitEndPlus1 = &this->pIndexTable[nextBucketStart]; // If the bin is large enough, use binary search to find the correct hituiSeedKey if (bucketStart + BIN_SIZE_CHECK_THRESHOLD < nextBucketStart) { hitStart = lower_bound(hitStart, hitEndPlus1, uiSeedKeyL, CcompareFunctor4LowerBound(this)); // TODO fill the suffix digit for the uiSeedKey hitEndPlus1 = upper_bound(hitStart, hitEndPlus1, uiSeedKeyU, CcompareFunctor4UpperBound(this)); } // else assume everything in the bucket is a hit without equal_range check return(pair(hitStart, hitEndPlus1)); } unsigned int 
CGenome_Index_TableQ::getSeedKeyUpperBound(CReadInBits window, unsigned int shift) const { //WORD_SIZE padding = 0xffffffffffffffff; WORD_SIZE padding = -1; if (this->bMapReadInColors) { padding <<= (this->uiRead_Length - 1 - shift); } else { padding <<= (this->uiRead_Length - shift); } window.UpperBits |= padding; window.LowerBits |= padding; return(this->getSeedKey(window)); } // Query a read in bases (Illumina) for hit and check uiDiff and put the result into the given Queue unsigned int CGenome_Index_TableQ::queryReadBases(CReadInBits readInBases, CAlignmentsQ& aQue, bool bClearQ, bool bForward) const { if (bClearQ) { aQue.clearHits(); } if (!bForward) { //Query reverse complement read for alignment readInBases = reverseCompliment(this->uiRead_Length, readInBases); } bool bUseShortCut = this->bExcludeAmbiguous && !(aQue.qAllInThreshold()); bool qAllHits = aQue.qAllInThreshold(); for (unsigned int shift = 0; shift <= this->uiNoOfShift; shift++) { // If the best match is exact matched, no need to go to the next shift // for all exact matches must be found previously. if ((aQue.MinDiff == 0) && (shift > 0) && (aQue.load > 0) && !qAllHits) { break; } pair hits = queryKmer(readInBases, shift); if (hits.first != NULL) { for (CIndex_Type* it = hits.first; it < hits.second; it++) { if (*it >= shift) { unsigned int alignStart = *it - shift; if (isMasked(alignStart)) { continue; } else { CReadInBits ref = this->pgenomeNTInBits->getSubstringInBits(alignStart); unsigned int uiDiff = bitsStrNCompare(ref, readInBases, this->uiRead_Length); // The flag in alignmentsQ decide whether all alignments within uiDiff or // only the alignments with minimum Diff are queue. 
if (uiDiff <= this->uiSubDiffThreshold) { aQue.saveHits(alignStart, (unsigned short)uiDiff); } if (bUseShortCut) { // short cut to exclude ambiguous reads bool bMap2Repeat = this->pbaRepeatRepresentativeFlag->b(alignStart); if (bMap2Repeat) { aQue.AmbiguousFlag = true; } if (aQue.MinDiff == 0 && (aQue.load >= 2 || bMap2Repeat)) { aQue.setForwardLoad(bForward); return(0); } } } } } if (aQue.MinDiff == 0) { // short cut. Output no more than iMaxCapacity exaxt alignment /* if (aQue.load >= aQue.iMaxCapacity - 1) { aQue.setForwardLoad(bForward); return(0); }*/ // short cut. All exact matches will be found after first shift if (bUseShortCut && aQue.load >= 2) { aQue.setForwardLoad(bForward); return(1); } } } } /* DEBUG if(palignmentsQ.MinDiff <= this->uiSubDiffThreshold) { CReadInBits ref = this->pgenomeNTInBits->getSubstringInBits(aQue.aiHitIndex[0]); printBitsStrCompare(ref, readInBases, "Good Alignments!!"); }*/ // The records before are for the forward direction if this is a forward query aQue.setForwardLoad(bForward); return(aQue.MinDiff); } // Query a read in bases (Illumina) for hit and check uiDiff and put the result into the given Queue unsigned int CGenome_Index_TableQ::queryLongReadBases(CReadInBits r1, CReadInBits r2, bool oddReadLength, CAlignmentsQ& aQue, int queryHalf, bool bClearQ, bool bForward) const { const unsigned int firstPartLength = this->uiRead_Length; const unsigned int secondPartLength = oddReadLength ? this->uiRead_Length - 1 : this->uiRead_Length; if (bClearQ) { aQue.clearHits(); } if (!bForward) { //Query reverse complement read for alignment CReadInBits tmp = r2; r2 = reverseCompliment(firstPartLength, r1); r1 = reverseCompliment(firstPartLength, tmp); } CReadInBits readInBases = (queryHalf == 1) ? 
r1 : r2; bool bUseShortCut = this->bExcludeAmbiguous && !(aQue.qAllInThreshold()); bool qAllHits = aQue.qAllInThreshold(); for (unsigned int shift = 0; shift <= this->uiNoOfShift; shift++) { // If the best match is exact matched, no need to go to the next shift // for all exact matches must be found previously. if ((aQue.MinDiff == 0) && (shift > 0) && (aQue.load > 0) && !qAllHits) { break; } pair hits = queryKmer(readInBases, shift); if (hits.first != NULL) { for (CIndex_Type* it = hits.first; it < hits.second; it++) { if (*it >= shift) { unsigned int alignStart = *it - shift; if (queryHalf == 2 ) { if ( alignStart >= secondPartLength) { alignStart -= secondPartLength; } else { continue; } } if (isMasked(alignStart) || isMasked(alignStart + secondPartLength)) { continue; // if the two windows contain N or is in references' border } else { unsigned int uiDiff = checkAlignment(alignStart, r1, r2, oddReadLength); // The flag in alignmentsQ decide whether all alignments within uiDiff or // only the alignments with minimum Diff are queue. if (uiDiff <= this->uiSubDiffThreshold) { aQue.saveHits(alignStart, (unsigned short)uiDiff); } if (bUseShortCut) { // short cut to exclude ambiguous reads bool bMap2Repeat = this->pbaRepeatRepresentativeFlag->b(alignStart); if (bMap2Repeat) { aQue.AmbiguousFlag = true; } if (aQue.MinDiff == 0 && (aQue.load >= 2 || bMap2Repeat)) { aQue.setForwardLoad(bForward); return(0); } } } } } if (aQue.MinDiff == 0) { // short cut. Output no more than iMaxCapacity exaxt alignment /* if (aQue.load >= aQue.iMaxCapacity - 1) { aQue.setForwardLoad(bForward); return(0); }*/ // short cut. 
All exact matches will be found after first shift if (bUseShortCut && aQue.load >= 2) { aQue.setForwardLoad(bForward); return(1); } } } } // The records before are for the forward direction if this is a forward query aQue.setForwardLoad(bForward); return(aQue.MinDiff); } // TODO Complete the function // Query a read in bases (Illumina) for hit and check uiDiff and put the result into the given Queue unsigned int CGenome_Index_TableQ::queryLongReadColors(CReadInBits r1, CReadInBits r2, bool oddReadLength, CAlignmentsQ& aQue, int queryHalf, bool bClearQ, bool bForward) const { // const unsigned int firstPartLength = this->uiRead_Length; const unsigned int secondPartLength = oddReadLength ? this->uiRead_Length - 1 : this->uiRead_Length; if (bClearQ) { aQue.clearHits(); } if (!bForward) { // Query reverse complement read for alignment reverseLongColorRead(r1, r2, oddReadLength); } CReadInBits readInBases = (queryHalf == 1) ? r1 : r2; bool bUseShortCut = this->bExcludeAmbiguous && !(aQue.qAllInThreshold()); bool qAllHits = aQue.qAllInThreshold(); for (unsigned int shift = 0; shift <= this->uiNoOfShift; shift++) { // If the best match is exact matched, no need to go to the next shift // for all exact matches must be found previously. 
if ((aQue.MinDiff == 0) && (shift > 0) && (aQue.load > 0) && !qAllHits) { break; } pair hits = queryKmer(readInBases, shift); if (hits.first != NULL) { for (CIndex_Type* it = hits.first; it < hits.second; it++) { if (*it >= shift) { unsigned int alignStart = *it - shift; if (queryHalf == 2 ) { if ( alignStart >= secondPartLength) { alignStart -= secondPartLength; } else { continue; } } if (isMasked(alignStart) || isMasked(alignStart + secondPartLength)) { continue; // if the two windows contain N or is in references' border } else { // TODO implement the checkColor unsigned int uiDiff = checkColorAlignment(alignStart, r1, r2, oddReadLength); // The flag in alignmentsQ decide whether all alignments within uiDiff or // only the alignments with minimum Diff are queue. if (uiDiff <= this->uiSubDiffThreshold) { aQue.saveHits(alignStart, (unsigned short)uiDiff); } if (bUseShortCut) { // short cut to exclude ambiguous reads bool bMap2Repeat = this->pbaRepeatRepresentativeFlag->b(alignStart); if (bMap2Repeat) { aQue.AmbiguousFlag = true; } if (aQue.MinDiff == 0 && (aQue.load >= 2 || bMap2Repeat)) { aQue.setForwardLoad(bForward); return(0); } } } } } if (aQue.MinDiff == 0) { // short cut. 
All exact matches will be found after first shift if (bUseShortCut && aQue.load >= 2) { aQue.setForwardLoad(bForward); return(1); } } } } // The records before are for the forward direction if this is a forward query aQue.setForwardLoad(bForward); return(aQue.MinDiff); } // check match for long read unsigned int CGenome_Index_TableQ::checkAlignment(unsigned int alignStartGenomeIndex, CReadInBits& half1, CReadInBits& half2, bool oddReadLength) const { unsigned int uiDiff = 0; CReadInBits ref1, ref2; ref1 = this->pgenomeNTInBits->getSubstringInBits(alignStartGenomeIndex); uiDiff += bitsStrNCompare(ref1, half1, this->uiRead_Length); int secondPartStart; if (oddReadLength) { secondPartStart = alignStartGenomeIndex + this->uiRead_Length - 1; ref2 = this->pgenomeNTInBits->getSubstringInBits(secondPartStart); uiDiff += bitsStrMNCompare(ref2, half2, 1, this->uiRead_Length - 1); // skip the 1st (middle) base } else { secondPartStart = alignStartGenomeIndex + this->uiRead_Length; ref2 = this->pgenomeNTInBits->getSubstringInBits(secondPartStart); uiDiff += bitsStrNCompare(ref2, half2, this->uiRead_Length); } return(uiDiff); } // TODO complete the function for color alignment unsigned int CGenome_Index_TableQ::checkColorAlignment(unsigned int alignStartGenomeIndex, CReadInBits& half1, CReadInBits& half2, bool oddReadLength) const { unsigned int uiDiff = 0; CReadInBits ref1 = this->pgenomeNTInBits->getSubstringInBits(alignStartGenomeIndex); CReadInBits ref1InColors = bases2Colors(ref1); uiDiff += bitsStrNCompare(ref1InColors, half1, this->uiRead_Length); int secondPartStart = alignStartGenomeIndex + this->uiRead_Length - 1; if (oddReadLength) { CReadInBits ref2 = this->pgenomeNTInBits->getSubstringInBits(secondPartStart - 1); CReadInBits ref2InColor = bases2PureColors(ref2); uiDiff += bitsStrMNCompare(ref2, half2, 1, this->uiRead_Length - 1); // TODO Check the middle and last color signal. 
// Watch out the boundary } else { CReadInBits ref2 = this->pgenomeNTInBits->getSubstringInBits(secondPartStart - 1); CReadInBits ref2InColor = bases2PureColors(ref2); // Warning! The last base info is missing due to bases2PureColors uiDiff += bitsStrNCompare(ref2, half2, this->uiRead_Length - 1); } return(uiDiff); } // Given alignments in alignmentsQ, check reads can be also aligned in the extended position unsigned int CGenome_Index_TableQ::extendAlignment\ (CAlignmentsQ& alignmentsQ, CReadInBits extendedReadHalf) const { unsigned int i; int minDiff = this->uiRead_Length; bool extendForward = true; for (i = 0; i < alignmentsQ.ForwardAlignmentLoad; i++) { unsigned int alignStart = alignmentsQ.aiHitIndex[i] + this->uiRead_Length; if (alignStart + this->uiRead_Length >= this->pgenomeNT->iGenomeSize || this->isMasked(alignStart)) { alignmentsQ.asdiff[i] += (unsigned short)this->uiRead_Length; // no space to align } else { CReadInBits ref = this->pgenomeNTInBits->getSubstringInBits(alignStart, this->uiRead_Length); alignmentsQ.asdiff[i] += (unsigned short)bitsStrNCompare(ref, extendedReadHalf, this->uiRead_Length); if (alignmentsQ.asdiff[i] < minDiff) minDiff = alignmentsQ.asdiff[i]; } } extendForward = false; CReadInBits reverseComplimentRead = reverseCompliment(this->uiRead_Length, extendedReadHalf); for (; i < alignmentsQ.load; i++) { // reverse read if ( alignmentsQ.aiHitIndex[i] < this->uiRead_Length) { alignmentsQ.asdiff[i] += (unsigned short)this->uiRead_Length; // no space to align } else { unsigned int alignStart = alignmentsQ.aiHitIndex[i] - this->uiRead_Length; if ( this->isMasked(alignStart) ) { alignmentsQ.asdiff[i] += (unsigned short)this->uiRead_Length; // no space to align } else { CReadInBits ref = this->pgenomeNTInBits->getSubstringInBits(alignStart, this->uiRead_Length); alignmentsQ.asdiff[i] += (unsigned short)bitsStrNCompare(ref, reverseComplimentRead, this->uiRead_Length); alignmentsQ.aiHitIndex[i] = alignStart; if (alignmentsQ.asdiff[i] < 
minDiff) { minDiff = alignmentsQ.asdiff[i]; } } } } alignmentsQ.MinDiff = minDiff; // The alignments may exceed the threshold so it should be checked outside the function. return(alignmentsQ.MinDiff); } // Query a read in colors (SOLiD) for hit and check uiDiff and put the result into the given Queue unsigned int CGenome_Index_TableQ::queryReadColors(CReadInBits readInColors, CAlignmentsQ& alignmentsQ, bool bClearQ, bool bForward, bool bDEBUG) const { if (bClearQ) { alignmentsQ.clearHits(); } CReadInBits pureColors = readInColors.getSuffixStr(1); // query colors if (!bForward) { // Query alignment with reversed color. (Reverse complement reads has color in reversed direction) pureColors = reversePureColors(pureColors, this->uiRead_Length - 1); } bool bUseShortCut = this->bExcludeAmbiguous && !(alignmentsQ.qAllInThreshold()); bool qAllHits = alignmentsQ.qAllInThreshold(); for (unsigned int shift = 0; shift <= this->uiNoOfShift; shift++) { // If the best alignment is exact matched, no need to go to the next shift, for all exact matches must be found previously. if ((alignmentsQ.MinDiff == 0) && (shift > 0) && (alignmentsQ.load > 0) && !qAllHits) { break; } /* if(bDEBUG) { cout << shift << "-Shift" << endl; if(shift == 7) { cout << "Got you" << endl; } }*/ // printBitsStr(pureColors.getSuffixStr(shift), this->uiSeedLength);// DEBUG pair hits = queryKmer(pureColors, shift); if (hits.first != NULL) { // for each hit, check if it is a good alignment. 
for (CIndex_Type* it = hits.first; it < hits.second; it++) { if (*it >= shift) { unsigned int alignStart = *it - shift; if (isMasked(alignStart)) { continue; } else { CReadInBits ref = this->pgenomeNTInBits->getSubstringInBits(alignStart, this->uiRead_Length); unsigned int uiDiff; if (bForward) { CReadInBits refInColors = bases2Colors(ref); uiDiff = bitsStrNCompare(refInColors, readInColors, this->uiRead_Length); } else { //reverse complement the reference to compare in the read CReadInBits rcRefInColors = bases2Colors(reverseCompliment(this->uiRead_Length, ref)); uiDiff = bitsStrNCompare(rcRefInColors, readInColors, this->uiRead_Length); } // To queue all alignment within uiDiff or the alignments with min difference are set in flag of alignmentsQ if (uiDiff <= this->uiSubDiffThreshold) { alignmentsQ.saveHits(alignStart, (unsigned short)uiDiff); } if (bUseShortCut) { // short cut to exclude ambiguous reads bool bMap2Repeat = this->pbaRepeatRepresentativeFlag->b(alignStart); if (bMap2Repeat) { alignmentsQ.AmbiguousFlag = true; } if (alignmentsQ.MinDiff == 0 && (alignmentsQ.load >= 2 || bMap2Repeat)) { alignmentsQ.setForwardLoad(bForward); return(0); } } } } } if (alignmentsQ.MinDiff == 0) { /* // short cut to exclude reads that has too many ambiguous mapping if (alignmentsQ.load >= MAX_Q_CAPACITY) { alignmentsQ.setForwardLoad(bForward); return(alignmentsQ.MinDiff); } */ // short cut for ambiguous reads if (bUseShortCut && alignmentsQ.load >= 2) { alignmentsQ.setForwardLoad(bForward); return(1); } } } } alignmentsQ.setForwardLoad(bForward); return(alignmentsQ.MinDiff); } ./Source/HashIndexT.cpp0000644011075700120610000000307111720654362015105 0ustar yanghochmath-ar#include "stdafx.h" #include "HashIndexT.h" const unsigned int CHashIndexT::INDEX_BASES_LIMIT = 13; //Maximum 64M index array CHashIndexT::CHashIndexT(void) { this->initialization(0); } CHashIndexT::CHashIndexT(unsigned int uiBucketSize) { this->initialization(uiBucketSize); } 
CHashIndexT::~CHashIndexT(void) { // cout << "Free a " << this->uiSize << "Hash Index table" << endl; delete [] this->aiIndexTable; this->aiIndexTable = NULL; } int CHashIndexT::initialization(unsigned int BucketSize) { unsigned int i; if (BucketSize > 0) { i = BucketSize - 1;// 1024 map to 5 1023 map to 5, 1025 map to 6 for (uiWindowSize = 0; i > 0; uiWindowSize++) { i >>= 2; } // Get the number of base pair need to use as hashkey if (uiWindowSize > INDEX_BASES_LIMIT) uiWindowSize = INDEX_BASES_LIMIT; this->uiSize = (unsigned int) pow(4.0, (double)uiWindowSize); LOG_INFO("Info %d: Allocate a %u Hash Index table.\r", FINE_LOG, this->uiSize); } else { uiSize = 0; } if (uiSize > 0) { //One more record at the end, after filling the table, the record will shift 1 this->aiIndexTable = new unsigned int [uiSize + 1]; memset(this->aiIndexTable, 0x00, sizeof(unsigned int) *(this->uiSize + 1)); } else this->aiIndexTable = NULL; return(0); }; int CHashIndexT::Counter2Index(void) { unsigned int i; aiIndexTable[uiSize] = 0; for (i = 1; i <= this->uiSize; i++) { aiIndexTable[i] += aiIndexTable[i-1]; } return(0); } ./Source/Index_Table.cpp0000644011075700120610000001412211720654362015263 0ustar yanghochmath-ar#include "Index_Table.h" const char* CHECK_SUM = "End_of_Index"; CIndex_Table::CIndex_Table(void) { initialization(); } CIndex_Table::~CIndex_Table(void) { delete_index_table(); } int CIndex_Table::initialization() { // this->capacity = 0; this->size = 0; this->chosenSeedId = 3; this->uiSubDiffThreshold = 3; this->uiRead_Length = 0; this->bMapReadInColors = false; this->pIndexTable = NULL; this->pHashIndexTable = NULL; this->pbaMaskedFlag = NULL; return(0); } void CIndex_Table::delete_index_table(void) { delete [] this->pIndexTable; delete this->pbaMaskedFlag; delete this->pHashIndexTable; this->initialization(); // pointer to NULL; } void CIndex_Table::printInfo(void) const { LOG_INFO("Info %d: Read length is %u with size %u\n",\ CONFIG_LOG, this->uiRead_Length, 
this->size); if (this->bMapReadInColors) { LOG_INFO("Info %d: This index table is for SOLiD reads. %s\n",\ CONFIG_LOG, BLANK_LINE); } else { LOG_INFO("Info %d: This index table is for Illumina reads. %s\n",\ CONFIG_LOG, BLANK_LINE); } } // Save the index to a file with binary file. filePostfix should indicate which reference genome is preprocessed. bool CIndex_Table::save_Hash_Table(FILE* fp) const { bool sucessfullySaveTable = true; char colorReads = this->bMapReadInColors ? 'Y' : 'N'; // TODO: write the seed option into index // int seedOption = this->chosenSeedId; if (this->pIndexTable != NULL) { printInfo(); fprintf(fp, "%u\n%u\n%u\n%c", this->chosenSeedId, this->uiRead_Length, this->size, colorReads); // save hash table sucessfullySaveTable = myFwrite((void*)this->pIndexTable, sizeof(CIndex_Type), this->size, fp); if (!sucessfullySaveTable) ERR; if (this->pHashIndexTable != NULL) { fprintf(fp, "\n%u\n%u", \ this->pHashIndexTable->uiWindowSize, this->pHashIndexTable->uiSize); // The Hash Index table has uiSize + 1 record sucessfullySaveTable = myFwrite((void*)this->pHashIndexTable->aiIndexTable, \ sizeof(unsigned int), (this->pHashIndexTable->uiSize + 1), fp); if (!sucessfullySaveTable) ERR; } else { ERR; sucessfullySaveTable = false; } // Write math repeat flags if (this->pbaRepeatRepresentativeFlag != NULL) { fprintf(fp, "\n%u", this->pbaRepeatRepresentativeFlag->size); sucessfullySaveTable = myFwrite((void*) this->pbaRepeatRepresentativeFlag->bflag, sizeof(unsigned char), (this->pbaRepeatRepresentativeFlag->size / 8 + 1), fp); if (!sucessfullySaveTable) ERR; } fprintf(fp, "\n%s\n", CHECK_SUM); } else { LOG_INFO("Info %d: The Index is empty\n", ERROR_LOG); sucessfullySaveTable = false; } return(sucessfullySaveTable); } bool CIndex_Table::read_Hash_Table(FILE* fp) { bool sucessfullyReadTable = true; // int seedOption = this->chosenSeedId; if (fscanf(fp, "%u\n%u\n%u\n", \ &this->chosenSeedId, &this->uiRead_Length, &this->size) != 3) { ERR; // Read hash table } 
// Get a character until Y or N, indicating this is the SOliD reads. while (char c = (char)fgetc(fp)) { // This is a check sum of the correctness. if (c == 'Y' || c == 'N') { this->bMapReadInColors = (c == 'Y'); LOG_INFO("Info %d: Mapping SOLiD reads (Yes/No): %c%s\n", CONFIG_LOG, c, BLANK_LINE); break; } } LOG_INFO("Info %d: Anchor length: %u, Size: %u\n", CONFIG_LOG, this->uiRead_Length, this->size); if (this->pIndexTable == NULL) { this->pIndexTable = new CIndex_Type[this->size]; if (this->pIndexTable == NULL) ERR; // fail to new space for table } else LOG_INFO("Info %d: HashTable was not NULL\n", WARNING_LOG); sucessfullyReadTable = myFread((void*)this->pIndexTable, sizeof(CIndex_Type), this->size, fp); if (!sucessfullyReadTable) { LOG_INFO("Info %d: Fail to read index.\n", ERROR_LOG); delete this->pIndexTable; this->pIndexTable = NULL; return(!sucessfullyReadTable); } unsigned int uiWindowSize = 0, uiHashIndexTableSize = 0; if (fscanf(fp, "\n%u\n%u", &(uiWindowSize), &(uiHashIndexTableSize)) <=0) { ERR; } LOG_INFO("Info %d: Take the first %u bases for hashing(bucketing)\n", FINE_LOG, uiWindowSize); if (uiHashIndexTableSize > 0 && uiHashIndexTableSize > 0) { if (this->pHashIndexTable == NULL) { this->pHashIndexTable = new CHashIndexT(uiHashIndexTableSize); //Note there are one more counter } if (this->pHashIndexTable == NULL) { ERR; // Fail to new pHashIndexTable return(!sucessfullyReadTable); } sucessfullyReadTable = myFread((void*)this->pHashIndexTable->aiIndexTable,\ sizeof(unsigned int), (this->pHashIndexTable->uiSize + 1), fp); if (!sucessfullyReadTable) ERR; } else { ERR; } // Read math repeat flags unsigned int numOfFlags = 0; if (fscanf(fp, "\n%u", &(numOfFlags)) <=0 ) { ERR; } if (this->pbaRepeatRepresentativeFlag == NULL) { this->pbaRepeatRepresentativeFlag = new CboolFlagArray(numOfFlags); } sucessfullyReadTable = myFread((void*)this->pbaRepeatRepresentativeFlag->bflag, sizeof(unsigned char), (this->pbaRepeatRepresentativeFlag->size / 8 + 1), fp); if 
(!sucessfullyReadTable || !assertFile(fp, CHECK_SUM)) { ERR; } return(sucessfullyReadTable); } string default_index_path(string filePostfix, bool bColorReads,\ unsigned int seedOption, unsigned int uiReadLength) { char fileName[MAX_LINE]; const char* colorReads = bColorReads ? "SOLiD" : "Illumina"; sprintf(fileName, "%u_%u_%s_%s_v2.index", \ uiReadLength, seedOption, colorReads, filePostfix.c_str()); return(string(fileName)); } ./Source/LongReadsSet.cpp0000644011075700120610000001530111720654362015437 0ustar yanghochmath-ar#include "LongReadsSet.h" CLongReadsSet::CLongReadsSet(void) { } CLongReadsSet::CLongReadsSet(const char* InputFile, const char* fileFormat,\ unsigned int expReadStrLineLength, unsigned int allowedNumOfNinRead,\ unsigned int readStartIndex) : CPairedReadsSet(InputFile, fileFormat, expReadStrLineLength, false, allowedNumOfNinRead, readStartIndex) { this->longReadLength = expReadStrLineLength;// call parent constructor to open a file for reading long reads with bool in5to3cat3to5Format = false; } CLongReadsSet::~CLongReadsSet(void) { } int CLongReadsSet::size() { int size1 = this->R_Reads->pReadsSet->size(); int size2 = this->F_Reads->pReadsSet->size(); // Two half should be the same return(min(size1,size2)); } void CLongReadsSet::setBadReadOutputFile(FileOutputBuffer* pOut) { this->parser.pOBuf = pOut; } unsigned int CLongReadsSet::get_next_capacity_long_reads() { bool bStoreQS = (this->R_Reads->pQualScores != NULL) && (this->F_Reads->pQualScores != NULL); bool bSOLiDReadFormat = (this->cFileType == 'Q' || this->cFileType == 'S'); bool bGetQScores = (this->cFileType == 'Q' || this->cFileType == 'q') && bStoreQS; this->clearReads(); do { const char* caNextRead = parser.get_Next_Read(); // get next read and store in this->parser.caNextRead if (caNextRead[0] == '\0') { this->parser.pBuf->fflush(); break; // End of the file } else if (isBadRead(bSOLiDReadFormat, caNextRead, this->longReadLength)) { this->parser.print_Next_Read(); 
this->handleBadRead(); } else { this->save_next_long_read(bSOLiDReadFormat, bGetQScores, this->in5to3cat3to5Format); } } while (this->F_Reads->pReadsID->size() < this->F_Reads->pReadsSet->capacity()); printf("Deal read no. %u in %s.\r", this->uiNo_of_Reads, this->InputFile); this->removeExtraTags(); if(bStoreQS) { this->getQualityScoresFromQUAL(); } return((unsigned int)this->R_Reads->pReadsSet->size()); } // The private function store next read in the parser object // For reads longer than 64 and shorter than 128, reads are store as two parts in two CReadInBits // For odd read length, the two parts are overlapped with one base. bool CLongReadsSet::save_next_long_read(bool bSOLiDReadFormat, bool getQScores,\ bool in5to3cat3to5Format) { // bool bDiscardReadWithN = this->F_Reads->bDiscardReadWithN && this->R_Reads->bDiscardReadWithN; char* readSeq = this->parser.caNextRead; unsigned int fullReadLength = (unsigned int)strlen(readSeq); unsigned int expFullReadLength = getExpReadLength(fullReadLength); bool returnV; if(isBadRead(bSOLiDReadFormat, this->parser.caNextRead, expFullReadLength)) { return(false); } else { if(bSOLiDReadFormat) { returnV = save_next_long_SOLiD_read(fullReadLength, getQScores); } else { returnV = save_next_long_Illumina_read(fullReadLength, getQScores, in5to3cat3to5Format); } } this->save_next_read_id(this->parser.caNextReadTag); this->uiNo_of_Reads++; return(returnV); } bool CLongReadsSet::save_next_long_Illumina_read(unsigned int fullReadLength, bool getQScores, bool in5to3cat3to5Format) { const bool bSOLiDReadFormat = false; char* readSeq = this->parser.caNextRead; char* readQS = this->parser.caNextReadQSs; unsigned int eachPartLength = this->uiRead_Length; unsigned int secondPartStart = fullReadLength - eachPartLength; if (in5to3cat3to5Format) { reverseKmer(&readSeq[secondPartStart]); if (getQScores) { reverseKmer(&readQS[secondPartStart]); } } const char* rReadSeq = &readSeq[secondPartStart]; this->R_Reads->save_next_read(rReadSeq, 
bSOLiDReadFormat); this->parser.caNextRead[eachPartLength] = '\0'; this->F_Reads->save_next_read(readSeq, bSOLiDReadFormat); if (getQScores) { const char* rReadQS = &readQS[secondPartStart]; this->R_Reads->pQualScores->addQSs(rReadQS); readQS[eachPartLength] = '\0'; this->F_Reads->pQualScores->addQSs(readQS); } return(true); } // The first base and the following color signals are saved into two parts bool CLongReadsSet::save_next_long_SOLiD_read(unsigned int fullReadLength, bool getQScores) { const bool bSOLiDReadFormat = true; char* readSeq = this->parser.caNextRead; char* readQS = this->parser.caNextReadQSs; unsigned int eachPartLength = this->uiRead_Length; unsigned int secondPartStart = fullReadLength - eachPartLength; const char* rReadSeq = &readSeq[secondPartStart]; this->R_Reads->save_next_read(rReadSeq, bSOLiDReadFormat); this->parser.caNextRead[eachPartLength] = '\0'; this->F_Reads->save_next_read(readSeq, bSOLiDReadFormat); if (getQScores) { const char* rReadQS = &readQS[secondPartStart]; this->R_Reads->pQualScores->addQSs(rReadQS); readQS[eachPartLength] = '\0'; this->F_Reads->pQualScores->addQSs(readQS); } return(false); } int get_next_capacity_long_paired_reads(CLongReadsSet &set1, CLongReadsSet &set2) { bool bStoreQS = (set1.R_Reads->pQualScores != NULL) && (set1.F_Reads->pQualScores != NULL); bool bGetQScores = (set1.cFileType == 'Q' || set1.cFileType == 'q') && bStoreQS; bool bSOLiDReadFormat = (set1.cFileType == 'Q' || set1.cFileType == 'S'); set1.clearReads(); set2.clearReads(); do { const char* caNextRead1 = set1.parser.get_Next_Read(); const char* caNextRead2 = set2.parser.get_Next_Read(); if (caNextRead1[0] == '\0' || caNextRead2[0] == '\0') { set1.parser.pBuf->fflush(); set2.parser.pBuf->fflush(); break; // End of the file } else if (isBadRead(bSOLiDReadFormat, caNextRead1, set1.longReadLength) || isBadRead(bSOLiDReadFormat, caNextRead2, set2.longReadLength)) { set1.handleBadRead(); set2.handleBadRead(); } else { bool in5to3cat3to5Format = 
// currently not used
// Lists the positions at which `read` differs from `ref`.
//
// ref   reference sequence, NUL-terminated.
// read  read sequence, NUL-terminated.
//
// Returns a string made of one "<pos>_<refBase>," entry per mismatch, where
// <pos> is the 1-based position and <refBase> is the reference character at
// that position. Returns an empty string when there is no mismatch or when
// either argument is NULL. The comparison stops at the end of the shorter
// sequence.
//
// The result is built in a std::string, so there is no fixed-size buffer to
// overflow and nothing uninitialised is returned when there are no
// mismatches.
std::string get_diff_bases_bt_read_ref(const char* ref, const char* read)
{
    std::string diffBasesList;
    if (ref == NULL || read == NULL) {
        return(diffBasesList);
    }
    // Large enough for a 10-digit position, '_', one base, ',' and the NUL.
    char entry[32];
    for (unsigned int i = 0; read[i] != '\0' && ref[i] != '\0'; i++) {
        if (read[i] != ref[i]) {
            // Report the 1-based position together with the reference base
            // at that same position (index i, not i + 1).
            sprintf(entry, "%u_%c,", i + 1, ref[i]);
            diffBasesList += entry;
        }
    }
    return(diffBasesList);
}
case 4: doubleSNPcount++; break; default: break; } misMatchCount[mis]++; totalMapedRead++; } } int noReadsSupportValidSnp = complementSNPcount + transversionSNPCount + transitionSNPCount + doubleSNPcount; cout << "There are " << totalMapedRead << " reads mapped." << endl; cout << "Total " << noReadsSupportValidSnp << " reads indicating valid SNP " << endl; cout << complementSNPcount << " reads indicate complement SNPs " << endl; cout << transversionSNPCount << " reads indicate transversion SNPs " << endl; cout << transitionSNPCount << " reads indicate transition SNPs " << endl; cout << doubleSNPcount << " reads indicate more than one SNPs " << endl; cout << "Identify " << numOfSNP << endl; cout << " The color mismatches histogram:\n" << endl; for (int i = 0; i < maxTolerateSNP; i++) { cout << i << " mismatches " << misMatchCount[i] << endl; } // Print in the log file ofstream ofile("SNP.log", ofstream::app); if (ofile.good()) { ofile << dataSet << ',' << totalMapedRead << ','; ofile << misMatchCount[0] << ',' << misMatchCount[1] << ',' << misMatchCount[2] << ','\ << misMatchCount[3] << ',' << misMatchCount[4] << ','; ofile << noReadsSupportValidSnp << ',' << complementSNPcount << ',' << transversionSNPCount << ','\ << transitionSNPCount << ',' << doubleSNPcount << ',' << numOfSNP << endl; ofile.close(); } /* DEBUG, print the read id of the one mismatch. ofstream debugfile("debug.3sub", ofstream::app); { for(unsigned int i = 0; i < this->uiNoOfReads; i++) { if((int)this->mismatchScore[i] < 0) { debugfile << i << endl; } } } debugfile.close(); */ return(0); } // Currently mismatch Score are encoded as # of mismatches * 8 + the SNP type. // Only the best mapping are record. The order of "better mapping as follows. 
// Exact match << 1 Sub << 1 SNP << 2 Sub << 1 SNP 1Sub << 3 Sub << 2 SNP << 2 Sub 1 SNP << 4 Sub int CMismatchScores::normalUpdate(unsigned int readId, int score) { int updateFlag = 0; // No update if ((char)score < this->mismatchScore[readId]) { this->mismatchScore[readId] = (char)score; updateFlag = 1; } if (updateFlag > 0) { noOfBestMappings[readId] = 1; // New best record } else if ((char)score == this->mismatchScore[readId]) { // No update but same score addCounter(readId); //Ambiguous reads } // else the score is worse than the best return(updateFlag); } int CMismatchScores::solidUpdate(unsigned int readId, int score) { const int DIGITS4_SNP_TYPE = 3; const char SNP_TYPE_MASK = 0x07; // get last three digit enum UPDATED_FLAG { NEW_BEST_RECORD, SAME_BEST_RECORD, WORSE_RECORD }; int updateFlag = WORSE_RECORD; int noOfSub = (score >> DIGITS4_SNP_TYPE); int noOfBestSubInRecord = (this->mismatchScore[readId] >> DIGITS4_SNP_TYPE); if (noOfSub < noOfBestSubInRecord) { this->mismatchScore[readId] = (short)score; updateFlag = NEW_BEST_RECORD; } else if (noOfSub == noOfBestSubInRecord) { // If the new alignment has valid SNP while the recored alignment don't if (((score & 0x07) > 1) && ((this->mismatchScore[readId] & SNP_TYPE_MASK) == 0)) { this->mismatchScore[readId] = (short)score; updateFlag = NEW_BEST_RECORD; } else { updateFlag = SAME_BEST_RECORD; // has the same best record; } } // else no updated if (updateFlag == NEW_BEST_RECORD) { noOfBestMappings[readId] = 1; // New best record } else if (updateFlag == SAME_BEST_RECORD) { addCounter(readId); //Ambiguous reads } // else the score is worse than the best in record return(updateFlag); } int CMismatchScores::dummyUpdate(unsigned int readId , int score) { score = 0; readId = 0; return(0); } bool CMismatchScores::callIsBest(unsigned int readId , int score) { return((this->*isBest)(readId, score)); } bool CMismatchScores::isBestInRecords(unsigned int readId , int score) { return((char) score == 
this->mismatchScore[readId]); } inline bool CMismatchScores::compareRecord4Solid(unsigned int readId , int score) { const int DIGITS4_SNP_TYPE = 3; const char SNP_TYPE_MASK = 0x07; // get last three digit int diff = (score >> DIGITS4_SNP_TYPE); int minDiffInRecord = ((int)this->mismatchScore[readId]) >> DIGITS4_SNP_TYPE; if (diff > minDiffInRecord) { return(false); } else { bool isRecordSupportSNP = ((int)(this->mismatchScore[readId] & SNP_TYPE_MASK) > 0); bool isAlignmentSupportSNP = ((score & SNP_TYPE_MASK) > 0); return(isRecordSupportSNP == isAlignmentSupportSNP); } } bool CMismatchScores::isBestInRecords4Solid(unsigned int readId , int score) { if ((char) score == this->mismatchScore[readId]) { return(true); } else { if (score >= 0x08) { //SOliD encoding (TODO) find a better way to change this return(compareRecord4Solid(readId, score)); } else { return(false); } } } bool CMismatchScores::dummyBest(unsigned int readId , int score) { score = 0; readId = 0; return(true); } void CMismatchScores::switchUpdatesAndIsBest(bool firstRun, bool solidScore) { if (firstRun) { if (solidScore) { this->update = &CMismatchScores::solidUpdate; } else { this->update = &CMismatchScores::normalUpdate; } this->isBest = &CMismatchScores::dummyBest; } else { this->update = &CMismatchScores::dummyUpdate; if (solidScore) { this->isBest = &CMismatchScores::isBestInRecords4Solid; } else { this->isBest = &CMismatchScores::isBestInRecords; } } } int CMismatchScores::callUpdate(unsigned int readId , int score) { return((this->*update)(readId, score)); } inline unsigned char CMismatchScores::addCounter(int readId) { if (noOfBestMappings[readId] < UCHAR_MAX) { noOfBestMappings[readId]++; } return(noOfBestMappings[readId]); } ./Source/PairedReadsMapping.cpp0000644011075700120610000006100011720654362016601 0ustar yanghochmath-ar#include "PairedReadsMapping.h" // Given two mapped paired read set list and the index table, // this function maps reads in parallel int 
parallelMappingPairedReads(vector& readSetsList1, vector& readSetsList2,\ CGenome_Index_TableQ& indexTable, MappingOpts P) { unsigned int readStartIndex = P.truncatedReadPrefix;// ignore the first few base pair P.bDiscardReadWithN = false; ASSERT_TRUE(((int)(readSetsList1.size()) == (int)(readSetsList2.size())), "Read sets are not in paired"); P.clearOutputFileName(readSetsList1.size() > 1); int i; #ifdef _OPENMP int numberOfCPUs = omp_get_num_procs(); LOG_INFO("\nInfo %d: %d CPUs detected. %s.\n",\ INFO_LOG, numberOfCPUs, BLANK_LINE); #pragma omp parallel for #endif // __OPENMP_FOR_PARALLEL__(#, pragma) for (i = 0; i < min((int)readSetsList1.size(), (int)readSetsList2.size()); i++) { CPairedReadsMapping mapping(P); const char* readSetName1 = (readSetsList1.at(i)).c_str(); const char* readSetName2 = (readSetsList2.at(i)).c_str(); if (checkFileExist(readSetName1) && checkFileExist(readSetName2)) { CReadInBitsSet readSet1\ (readSetName1, P.readsFileFormat, readStartIndex, indexTable.uiRead_Length, P.allowedNumOfNinRead); CReadInBitsSet readSet2\ (readSetName2, P.readsFileFormat, readStartIndex, indexTable.uiRead_Length, P.allowedNumOfNinRead); if (P.bIgnoreQS) { readSet1.ignoreQScores(); readSet2.ignoreQScores(); } TIME_INFO(mapping.mapPairedReadsInPairedFiles(readSet1, readSet2, indexTable), "Mapping takes"); } } return(0); } // Given the list of paired read sets in the single file // format and the index table, this function maps reads int parallel. int parallelMappingPairedReads(vector& readSetsList,\ CGenome_Index_TableQ& indexTable, MappingOpts P) { P.bDiscardReadWithN = false; P.clearOutputFileName(readSetsList.size() > 1); int i; #ifdef _OPENMP int numberOfCPUs = omp_get_num_procs(); LOG_INFO("\nInfo %d: %d CPUs detected. 
%s.\n",\ INFO_LOG, numberOfCPUs, BLANK_LINE); #pragma omp parallel for #endif //__OPENMP_FOR_PARALLEL__(#pragma)#ifdef _OPENMP for (i = 0; i < (int)readSetsList.size(); i++) { CPairedReadsMapping mapping(P); const char* readSetName = (readSetsList.at(i)).c_str(); if (checkFileExist(readSetName)) { // assume the paired end reads is in the format // that concatenate paired read in 5'-3' and 3' to 5'. bool in5to3cat3to5Format = true; int allowedNumOfNinRead = P.readLength; // Don't throw reads with N in the paired-end mapping CPairedReadsSet pairedReadSet\ (readSetName, P.readsFileFormat, indexTable.uiRead_Length * 2, in5to3cat3to5Format, allowedNumOfNinRead); if (P.bIgnoreQS) { pairedReadSet.ignoreQScores(); } TIME_INFO(mapping.mapPairedReads(*pairedReadSet.F_Reads, *pairedReadSet.R_Reads,\ indexTable), "Mapping takes"); } } return(0); } int parallelMappingPairedLongReads(vector& readSetsList1, vector& readSetsList2,\ CGenome_Index_TableQ& indexTable, MappingOpts P) { unsigned int readStartIndex = P.truncatedReadPrefix;// ignore the first few base pair P.bDiscardReadWithN = false; ASSERT_TRUE(((int)(readSetsList1.size()) == (int)(readSetsList2.size())), "Read sets are not in paired"); P.clearOutputFileName(readSetsList1.size() > 1); int i; #ifdef _OPENMP int numberOfCPUs = omp_get_num_procs(); LOG_INFO("\nInfo %d: %d CPUs detected. 
%s.\n",\ INFO_LOG, numberOfCPUs, BLANK_LINE); #pragma omp parallel for #endif // __OPENMP_FOR_PARALLEL__(#, pragma) for (i = 0; i < min((int)readSetsList1.size(), (int)readSetsList2.size()); i++) { CPairedReadsMapping mapping(P); const char* readSetName1 = (readSetsList1.at(i)).c_str(); const char* readSetName2 = (readSetsList2.at(i)).c_str(); if (checkFileExist(readSetName1) && checkFileExist(readSetName2)) { int allowedNumOfNinRead = P.readLength; // Don't throw reads with N in the paired-end mapping CLongReadsSet readSet1(readSetName1, P.readsFileFormat, P.readLength, allowedNumOfNinRead, readStartIndex); CLongReadsSet readSet2(readSetName2, P.readsFileFormat, P.readLength, allowedNumOfNinRead, readStartIndex); if (P.bIgnoreQS) { readSet1.ignoreQScores(); readSet2.ignoreQScores(); } TIME_INFO(mapping.mapPairedLongReadsInBases(readSet1, readSet2, indexTable), "Mapping takes"); } } return(0); } CBestPairedMapping::CBestPairedMapping(void) { this->validMappingNo = 0; this->bestMappingNo = 0; this->minDiff = MAX_READ_LENGTH; } CBestPairedMapping::~CBestPairedMapping(void) { } inline void CBestPairedMapping::update\ (CMappingResult &m1, CMappingResult &m2, bool excludeAmbigousRead) { unsigned int diff = m1.uiDiff + m2.uiDiff; if (diff < this->minDiff) { this->bm1 = m1; this->bm2 = m2; minDiff = m1.uiDiff + m2.uiDiff; this->bestMappingNo = 1; } else if (diff == minDiff) { this->bestMappingNo++; } this->validMappingNo++; } CPairedReadsMapping::CPairedReadsMapping(void) { this->initialization(); } CPairedReadsMapping::~CPairedReadsMapping(void) { } CPairedReadsMapping::CPairedReadsMapping(const MappingOpts P): CReadsMapping( P ) { this->initialization(); } void CPairedReadsMapping::initialization(void) { this->noOfPairsInRange = 0; this->noOfSingle1stEndMapped = 0; this->noOfSingle2ndEndMapped = 0; this->noOfAmbiguousPairs = 0; this->noOfPairsSepLess = 0; this->noOfPairsSepMore = 0; this->noOfPairsSepMoreAndLess = 0; this->noOfExpMappedPairedStrand = 0; } /* int 
CPairedReadsMapping::mapPairedReadsInASingleFile\ (CPairedReadsSet& readSet, CGenome_Index_TableQ& table) { CReadInBitsSet& readSet1 = *readSet.F_Reads; CReadInBitsSet& readSet2 = *readSet.R_Reads; cout << "Start mapping " << readSet1.InputFile << " and " << readSet2.InputFile << endl; getReadsFileFormat(readSet.InputFile, opt.readsFileFormat); if (this->setUpIO4Aligment(readSet.InputFile, table) != 0) { LOG_INFO("\nInfo %d: Fail to setup I/O files.", ERROR_LOG); return(1); } while (readSet.get_next_capacity_reads_pairs_from_single_file()) { mapPairedReads(readSet1, readSet2, table); } this->printMappedPairStats(cout, readSet1, table.uiSubDiffThreshold * 2); this->tearDownIO4Aligment(); return(0); } */ int CPairedReadsMapping::mapPairedReadsInPairedFiles\ (CReadInBitsSet& readSet1, CReadInBitsSet& readSet2, CGenome_Index_TableQ& table) { cout << "Start mapping " << readSet1.InputFile << " and " << readSet2.InputFile << endl; getReadsFileFormat(readSet1.InputFile, opt.readsFileFormat); if (this->setUpIO4Aligment(readSet1.InputFile, table) != 0) { LOG_INFO("\nInfo %d: Fail to setup I/O files.", ERROR_LOG); } while (readSet1.get_next_capacity_reads(BUFFERED_READS_SIZE, opt.readtag_delimiter) && readSet2.get_next_capacity_reads(BUFFERED_READS_SIZE, opt.readtag_delimiter)) { mapPairedReads(readSet1, readSet2, table); } this->printMappedPairStats(cout, readSet1, table.uiSubDiffThreshold * 2); this->tearDownIO4Aligment(); return(0); } int CPairedReadsMapping::mapPairedReads(CReadInBitsSet& readSet1, CReadInBitsSet& readSet2, CGenome_Index_TableQ& table) { vector::iterator it1 = readSet1.pReadsSet->begin(); vector::iterator it2 = readSet2.pReadsSet->begin(); for (int i = 0; it1 != readSet1.pReadsSet->end() && it2 != readSet2.pReadsSet->end(); it1++, it2++, i++) { alignmentsQ[0].read = *it1; alignmentsQ[1].read = *it2; readSet1.get_read_id(i, alignmentsQ[0].tag); readSet2.get_read_id(i, alignmentsQ[1].tag); alignmentsQ[0].qualityScores = readSet1.getQScoresPtr(i); 
alignmentsQ[1].qualityScores = readSet2.getQScoresPtr(i); if (table.bMapReadInColors) { table.queryReadColors(*it1, alignmentsQ[0], true, true); table.queryReadColors(*it1, alignmentsQ[0], false, false); table.queryReadColors(*it2, alignmentsQ[1], true, true); table.queryReadColors(*it2, alignmentsQ[1], false, false); } else { table.queryReadBases(*it1, alignmentsQ[0], true, true); table.queryReadBases(*it1, alignmentsQ[0], false, false); table.queryReadBases(*it2, alignmentsQ[1], true, true); table.queryReadBases(*it2, alignmentsQ[1], false, false); } this->dealMappedPairedReads(table); } return(0); } inline bool isExpPairedMappedStrand(bool firstEndFirst, char end1Strand, char end2Strand, bool bSOLiD) { if (bSOLiD) { if (firstEndFirst && (end1Strand == '+') && (end2Strand == '+')) { return(true); } else if (!firstEndFirst && (end1Strand == '-') && (end2Strand == '-')) { return(true); } } else { if (firstEndFirst && (end1Strand == '+') && (end2Strand == '-')) { return(true); } else if (!firstEndFirst && (end1Strand == '-') && (end2Strand == '+')) { return(true); } } return(false); } inline bool isValidPaired(CMappingResult& m1, CMappingResult& m2, MappingOpts& opts) { bool allowSameStrand = !(opts.frOnly); bool allowDiffStrand = !(opts.ffOnly); if (strcmp(m1.RNAME, m2.RNAME) == 0) { if(m1.strand == m2.strand) { return(allowSameStrand); } else { return(allowDiffStrand); } } return(false); } // TODO becareful about the big separation that overflow to negative value inline int getSep(unsigned int uiPos1, unsigned int uiPos2, bool expM2gtM1) { int range = 0; if (uiPos1 < uiPos2) { range = (int)(uiPos2 - uiPos1); if(!expM2gtM1) { range *= -1; } } else { range = (int)(uiPos1 - uiPos2); if(expM2gtM1) { range *= -1; } } return(range); } inline int set_ISIZE(CMappingResult &m1, CMappingResult &m2, int readLength) { bool expM2gtM1 = (m1.strand = '+'); int sep = getSep(m1.uiPOS, m2.uiPOS, expM2gtM1); m2.ISIZE = m1.ISIZE = sep + readLength; return(sep); } int 
CPairedReadsMapping::printValidMappedPair(const CGenome_Index_TableQ& table, CMappingResult& m1, CMappingResult& m2, int validMappedPairNo) { for (unsigned int i = 0; i < this->alignmentsQ[0].load; i++) { for (unsigned int j = 0; j < this->alignmentsQ[1].load; j++) { getSingleMappingIndex(*table.pgenomeNT, this->alignmentsQ[0], i, m1); getSingleMappingIndex(*table.pgenomeNT, this->alignmentsQ[1], j, m2); // Require reads mapped to the same ref sequence if (isValidPaired(m1, m2, opt)) { int sep = set_ISIZE(m1, m2, (int)opt.readLength); if (opt.disLB <= sep && sep <= opt.disUB) { this->printAMappedPair(table, m1, m2, validMappedPairNo); } } } } return(validMappedPairNo); } int CPairedReadsMapping::printBestMappedPair(const CGenome_Index_TableQ& table, CMappingResult& m1, CMappingResult& m2, int minMismatchNo, int bestMappedPairNo) { for (unsigned int i = 0; i < this->alignmentsQ[0].load; i++) { for (unsigned int j = 0; j < this->alignmentsQ[1].load; j++) { getSingleMappingIndex(*table.pgenomeNT, this->alignmentsQ[0], i, m1); getSingleMappingIndex(*table.pgenomeNT, this->alignmentsQ[1], j, m2); // Require reads mapped to the same ref sequence if (isValidPaired(m1, m2, opt)) { int sep = set_ISIZE(m1, m2, (int)opt.readLength); if (opt.disLB <= sep && sep <= opt.disUB) { if ((int)(m1.uiDiff + m2.uiDiff) == minMismatchNo) { this->printAMappedPair(table, m1, m2, bestMappedPairNo); } } } } } return(bestMappedPairNo); } int CPairedReadsMapping::dealMappedPairedReads(CGenome_Index_TableQ& table) { bool mapSOLiDRead = table.bMapReadInColors; bool bNoMapping = true, sepMore = false, sepLess = false, pairedOnExpStrand = false; bool bPrintAllMapping = opt.bGetAllAlignments && !opt.bExcludeAmbiguousPaired; CBestPairedMapping bestMP; CMappingResult m1(this->alignmentsQ[0], opt.readLength); CMappingResult m2(this->alignmentsQ[1], opt.readLength); bool samFormat = (this->cOutputFormat == 's'); if (!table.bMapReadInColors) { // get mapping info /* 
getQscores4Solexa(this->alignmentsQ[0], m1, samFormat); getQscores4Solexa(this->alignmentsQ[1], m2, samFormat); */ getReadQscores4Solexa(this->alignmentsQ[0], m1, samFormat); getReadQscores4Solexa(this->alignmentsQ[1], m2, samFormat); } for (unsigned int i = 0; i < this->alignmentsQ[0].load; i++) { for (unsigned int j = 0; j < this->alignmentsQ[1].load; j++) { getSingleMappingIndex(*table.pgenomeNT, this->alignmentsQ[0], i, m1); getSingleMappingIndex(*table.pgenomeNT, this->alignmentsQ[1], j, m2); // Require reads mapped to the same ref sequence if (isValidPaired(m1, m2, opt)) { int sep = set_ISIZE(m1, m2, (int)opt.readLength); if (opt.disLB <= sep && sep <= opt.disUB) { bNoMapping = false; bestMP.update(m1, m2, opt.bExcludeAmbiguousReads); /* if (bPrintAllMapping) { this->printAMappedPair(table, m1, m2, ); }*/ } else { sepMore |= (sep > opt.disUB); sepLess |= (sep < opt.disLB); } bool firstEndFirst = m1.uiPOS > m2.uiPOS; pairedOnExpStrand |= (isExpPairedMappedStrand(firstEndFirst, m1.strand, m2.strand, mapSOLiDRead)); } } } if (bNoMapping) { dealNoMapping(table, m1, m2); this->bookNoMappedKeepPairs(sepMore, sepLess, pairedOnExpStrand); } else { bool isSamFormat = (this->cOutputFormat == 's'); this->getPairedRInfo(table, m1, m2, isSamFormat); if(bPrintAllMapping) { this->printValidMappedPair(table, m1, m2, bestMP.validMappingNo); } else { this->dealBestMapping(table, bestMP, m1, m2); } this->bookKeepMappedPairs(bestMP); } return(0); } void CPairedReadsMapping::dealNoMapping(const CGenome_Index_TableQ& table, CMappingResult& m1, CMappingResult& m2) { setSamFlags4OnlyOneEndMapped(m1, true); const int endId = 1; this->dealMappedSingleRead(table, this->alignmentsQ[0], m1, endId == 1); setSamFlags4OnlyOneEndMapped(m2,false); this->dealMappedSingleRead(table, this->alignmentsQ[1], m2, endId == 2); } int CPairedReadsMapping::dealBestMapping(const CGenome_Index_TableQ& table, CBestPairedMapping& bestMP, CMappingResult& m1, CMappingResult& m2) { if(bestMP.validMappingNo == 
1) { // -A -e & 1 mapping this->printAMappedPair(table, bestMP.bm1, bestMP.bm2, 1); } else if (opt.bExcludeAmbiguousPaired && !opt.bGetAllAlignments && bestMP.bestMappingNo == 1) { // -e 1 best mapping this->printAMappedPair(table, bestMP.bm1, bestMP.bm2, 1); } else if (!opt.bExcludeAmbiguousPaired && !opt.bGetAllAlignments) { // -B this->printBestMappedPair(table, m1, m2, bestMP.minDiff, bestMP.bestMappingNo); } return(bestMP.bestMappingNo); } inline void CPairedReadsMapping::getPairedRInfo(const CGenome_Index_TableQ& table, CMappingResult &m1, CMappingResult &m2, bool samFormat) { if (this->opt.bPrintAlignments) { if (table.bMapReadInColors) { getSingleMappingSeqAndQ4SOLiD(table, this->alignmentsQ[0], m1, samFormat); getSingleMappingSeqAndQ4SOLiD(table, this->alignmentsQ[1], m2, samFormat); } else { bool bNoRef = samFormat || !opt.bPrintRef4PairedInMapping; if(opt.bPrintPairedRQ) { // Get quality score getQscores4Solexa(this->alignmentsQ[0], m1, samFormat); getQscores4Solexa(this->alignmentsQ[1], m2, samFormat); } if(opt.bMappedLongRead) { getLongRefSeq(table, m1, bNoRef); getLongRefSeq(table, m2, bNoRef); } else { getSingleMappingSeq4Solexa(table, m1, bNoRef); getSingleMappingSeq4Solexa(table, m2, bNoRef); } } } } // F3read and R3read are index in AlignmentQ void CPairedReadsMapping::printAMappedPair\ (const CGenome_Index_TableQ& table, CMappingResult &m1, CMappingResult &m2, int noPairedLoc) { if (this->opt.bPrintAlignments) { set_ISIZE(m1, m2, (int)opt.readLength); bool samFormat = (this->cOutputFormat == 's'); // getPairedRInfo(table, m1, m2, samFormat); if (samFormat) { // sam format printAPairedMappingInSam(this->AlignResult, m1, m2); } else { // string category = getCategory(m1.strand,m2.strand, m1.ISIZE, opt.disLB, opt.disUB); printAPairedMappingInPerM(this->AlignResult, m1, m2, noPairedLoc, opt.bPrintNM); } } } int CPairedReadsMapping::dealMappedSingleRead\ (const CGenome_Index_TableQ& table, CAlignmentsQ &Q, CMappingResult &m, bool bFirstEnd) { bool 
samFormat = (this->cOutputFormat == 's'); if (Q.load > 0 && samFormat) { for (unsigned int i = 0; i < Q.load && i < Q.iMaxCapacity; i++) { if (this->opt.bPrintAlignments) { getSingleMappingInfo(table, Q, i, m, samFormat); this->printSingleEndReads(m); } } } else if (this->opt.bPrintUnMappedReads) { // check if the seq is not decoded for SOLiD read? // check if code has never reach here? this->dealMissedRead(m); } if (Q.load > 0) { if (bFirstEnd) { this->noOfSingle1stEndMapped++; } else { this->noOfSingle2ndEndMapped++; } } return(this->iMissReadCounter++); } void CPairedReadsMapping::printMappedPairStats\ (ostream& out, CReadInBitsSet& readSet, unsigned int uiSubThreshold) { string readSetName = getBasename(readSet.InputFile); out << '\n'; out << readSetName.c_str() << ", #Pairs, " << readSet.uiNo_of_Reads << ", "; out << "#Mapped Pairs, " << this->noOfPairsInRange << ", "; out << "#Multi-mapped Pairs, " << this->noOfAmbiguousPairs << "\n"; out << readSetName << ", "; out << "#Pairs sep more, " << this->noOfPairsSepMore << ", "; out << "#Pairs sep less, " << this->noOfPairsSepLess << ", "; out << "#Pairs sep more and less, " << this->noOfPairsSepMoreAndLess << endl; out << "#Pairs on exp strands " << this->noOfExpMappedPairedStrand << endl; out << readSetName << ", "; out << "#Mapped single 1st end, " << this->noOfSingle1stEndMapped << endl; out << "#Mapped single 2nd end, " << this->noOfSingle2ndEndMapped << endl; out << readSetName << ", "; unsigned int i; for (i = 0; i <= uiSubThreshold; i++) { out << "Sub" << i << ", " << iMapDiffCount[i] << ", " ; } out << endl; } unsigned int CPairedReadsMapping::getPairedReadSetSize (CReadInBitsSet& setA1, CReadInBitsSet& setA2, CReadInBitsSet& setB1, CReadInBitsSet& setB2) { int setAsize = this->checkPairedReadSetSize(setA1, setA2); int setBsize = this->checkPairedReadSetSize(setB1, setB2); return(min(setAsize, setBsize)); } int CPairedReadsMapping::dealMappedLongPairedRead (CAlignmentsQ& q1, CAlignmentsQ& q2, 
CMappingResult& m1, CMappingResult& m2, const CGenome_Index_TableQ& table) { bool bNoMapping = true, sepMore = false, sepLess = false, pairedOnExpStrand = false; bool bPrintAllMapping = opt.bGetAllAlignments && !opt.bExcludeAmbiguousPaired; CBestPairedMapping bestMP; // bool samFormat = (this->cOutputFormat == 's'); for (unsigned int i = 0; i < q1.load; i++) { for (unsigned int j = 0; j < q2.load; j++) { // get mapping info getSingleMappingIndex(*table.pgenomeNT, q1, i, m1); getSingleMappingIndex(*table.pgenomeNT, q2, j, m2); if (isValidPaired(m1, m2, opt)) { int sep = set_ISIZE(m1, m2, (int)opt.readLength); if (opt.disLB <= sep && sep <= opt.disUB) { bNoMapping = false; bestMP.update(m1, m2, opt.bExcludeAmbiguousPaired); /* if (bPrintAllMapping) { this->printAMappedPair(table, m1, m2, ); }*/ } else { sepMore |= (sep > opt.disUB); sepLess |= (sep < opt.disLB); } bool firstEndFirst = m1.uiPOS > m2.uiPOS; bool mapSOLiDRead = false; pairedOnExpStrand |= (isExpPairedMappedStrand(firstEndFirst, m1.strand, m2.strand, mapSOLiDRead)); } } } if (opt.bExcludeAmbiguousReads && bestMP.bestMappingNo == 1) { this->printAMappedPair(table, bestMP.bm1, bestMP.bm2, bestMP.bestMappingNo); } if (bNoMapping) { dealNoMapping(table, m1, m2); this->bookNoMappedKeepPairs(sepMore, sepLess, pairedOnExpStrand); } else { // TODO get quality score, read sequence? 
if(bPrintAllMapping) { this->printValidMappedPair(table, m1, m2, bestMP.validMappingNo); } else { this->dealBestMapping(table, bestMP, m1, m2); } this->bookKeepMappedPairs(bestMP); // Book keep mapping } return(0); } int CPairedReadsMapping::mapPairedLongReadsInBases (CLongReadsSet& longReadSet1, CLongReadsSet& longReadSet2, const CGenome_Index_TableQ& table) { unsigned int uiReadLength = this->opt.readLength; CAlignmentsQ& aQue1 = this->alignmentsQ[0]; CAlignmentsQ& aQue2 = this->alignmentsQ[1]; CReadInBitsSet& readSetA1stHalf = *(longReadSet1.F_Reads); CReadInBitsSet& readSetA2ndHalf = *(longReadSet1.R_Reads); CReadInBitsSet& readSetB1stHalf = *(longReadSet2.F_Reads); CReadInBitsSet& readSetB2ndHalf = *(longReadSet2.R_Reads); const char* readSetName = readSetA1stHalf.InputFile; // Flag that set the alignment is ambiguous or not getReadsFileFormat(readSetName, opt.readsFileFormat); string seedStr = seedSymbol(table.chosenSeedId); printf("Mapping %s (%u-bp reads) with %s seed.\n", \ readSetName, uiReadLength, seedStr.c_str()); this->initializeStatsCounter(); if (this->setUpIO4Aligment(readSetName, table) != 0) { LOG_INFO("\nInfo %d: Fail to setup I/O files.", ERROR_LOG); return(1); } // alignmentsQ[0].setQueue_All_Best_OneFlag('A'); while (get_next_capacity_long_paired_reads(longReadSet1, longReadSet2) > 0) { int bufferedReadNo = this->getPairedReadSetSize(readSetA1stHalf, readSetA2ndHalf, readSetB1stHalf, readSetB2ndHalf); vector::iterator itA1, itA2, itB1, itB2; itA1 = readSetA1stHalf.pReadsSet->begin(); itA2 = readSetA2ndHalf.pReadsSet->begin(); itB1 = readSetB1stHalf.pReadsSet->begin(); itB2 = readSetB2ndHalf.pReadsSet->begin(); for (int i = 0; i < bufferedReadNo; i++, itA1++, itA2++, itB1++, itB2++) { this->printCheckPointInfo(i); CMappingResult m1, m2; this->getLongBaseReadInfo(readSetA1stHalf, readSetA2ndHalf, i, *itA1, *itA2, m1); this->getLongBaseReadInfo(readSetB1stHalf, readSetB2ndHalf, i, *itB1, *itB2, m2); this->queryALongReadInBase(*itA1, *itA2, 
table, aQue1); this->queryALongReadInBase(*itB1, *itB2, table, aQue2); // statistics and output if (aQue1.load > 0 && aQue2.load > 0) { this->dealMappedLongPairedRead(aQue1, aQue2, m1, m2, table); } else if (this->opt.bPrintUnMappedReads) { // TODO to be done // dealMissedPairedRead(m1, m2); } } this->iReadCounter += bufferedReadNo; } this->tearDownIO4Aligment(); this->iBadReadCounter = longReadSet1.uiNo_of_Bad_Reads + longReadSet2.uiNo_of_Bad_Reads; this->printLogFile(readSetName); return(0); }./Source/PairedReadsSet.cpp0000644011075700120610000001532011720654362015745 0ustar yanghochmath-ar#include "PairedReadsSet.h" CPairedReadsSet::CPairedReadsSet() { this->initialization(); } CPairedReadsSet::CPairedReadsSet(const char* InputFile, const char* fileFormat,\ unsigned int expReadStrLineLength, bool in5to3cat3to5Format,\ unsigned int allowedNumOfNinRead, unsigned int readStartIndex) { bool bDiscardReadsWN = (allowedNumOfNinRead == 0); this->initialization(); // TODO rename the uiRead_Length variable to proper name if (expReadStrLineLength % 2 == 1) { this->uiRead_Length = (expReadStrLineLength + 1) / 2; } else { // The uiRead_Length length is the length of each end; not the total read length this->uiRead_Length = expReadStrLineLength / 2; } this->in5to3cat3to5Format = in5to3cat3to5Format; unsigned int uiCapacity = BUFFERED_READS_SIZE ; this->F_Reads = new CReadInBitsSet(uiCapacity, this->uiRead_Length); this->F_Reads->bDiscardReadWithN = bDiscardReadsWN; this->F_Reads->allowedNumOfNinRead = allowedNumOfNinRead; this->R_Reads = new CReadInBitsSet(uiCapacity, this->uiRead_Length); this->R_Reads->bDiscardReadWithN = bDiscardReadsWN; this->R_Reads->allowedNumOfNinRead = allowedNumOfNinRead; myStrCpy(this->InputFile, InputFile, FILENAME_MAX); myStrCpy(this->F_Reads->InputFile, InputFile, FILENAME_MAX); // chExtName(this->F_Reads->InputFile, "_F"); myStrCpy(this->R_Reads->InputFile, InputFile, FILENAME_MAX); // chExtName(this->R_Reads->InputFile, "_R"); // The length 
will be changed for long read with odd bases this->parser.caNextRead[expReadStrLineLength] = '\0'; this->openAFileReady2GetRead(InputFile, fileFormat, expReadStrLineLength, bDiscardReadsWN); // Haven't seen csfasta has QUAL file in the 5'-3'3'-5' form } CPairedReadsSet::CPairedReadsSet(unsigned int Capacity, unsigned int uiReadLength) { this->F_Reads = new CReadInBitsSet(Capacity, uiReadLength); this->R_Reads = new CReadInBitsSet(Capacity, uiReadLength); } CPairedReadsSet::~CPairedReadsSet(void) { delete this->F_Reads; delete this->R_Reads; delete this->pQualScoresF; delete this->pQualScoresR; } unsigned int CPairedReadsSet::openAFileReady2GetRead\ (const char* InputFile, const char* fileFormat, unsigned int uiExpReadsStrLength, bool bDiscardReadsWN) { // const unsigned int uiExpReadsStrLength = this->uiRead_Length * 2; const unsigned int uiReadStartIndex = 0; // In this format, no base can be removed from the 5' end myStrCpy(this->InputFile, InputFile, FILENAME_MAX); this->cFileType = this->parser.openAFileReady2GetRead\ (InputFile, fileFormat, uiReadStartIndex, uiExpReadsStrLength, bDiscardReadsWN); bool bGetQScores = (this->cFileType == 'Q' || this->cFileType == 'q'); bool bSOLiDReadFormat = (this->cFileType == 'S' || this->cFileType == 'Q'); if ( this->cFileType == 'N') { return(1); } else if (bSOLiDReadFormat) { string msg1 = "Currently, we don't know any SOLiD pair-end reads in the format,"; string msg2 = "that concatenate forward and backward trend together"; LOG_INFO("\nInfo %d: %s\n%s\n", WARNING_LOG, msg1.c_str(), msg2.c_str()); return(1); } else if (bGetQScores) { this->F_Reads->pQualScores = new CReadsQualScores(this->uiRead_Length, BUFFERED_READS_SIZE); this->R_Reads->pQualScores = new CReadsQualScores(this->uiRead_Length, BUFFERED_READS_SIZE); } // For csfasta reads, new the quality score space when QUAL files exists this->F_Reads->clear(BUFFERED_READS_SIZE); this->R_Reads->clear(BUFFERED_READS_SIZE); return(BUFFERED_READS_SIZE); } void 
CPairedReadsSet::clearReads(void) { this->F_Reads->clear(BUFFERED_READS_SIZE); this->R_Reads->clear(BUFFERED_READS_SIZE); } void CPairedReadsSet::ignoreQScores(void) { this->F_Reads->ignoreQScores(); this->R_Reads->ignoreQScores(); } void CPairedReadsSet::removeExtraTags(void) { for (unsigned int i = (unsigned int)(F_Reads->pReadsID->size()); i > this->uiNo_of_Reads; i--) { this->F_Reads->pReadsID->pop_back(); // remove extra tags this->R_Reads->pReadsID->pop_back(); // remove extra tags } } void CPairedReadsSet::getQualityScoresFromQUAL(void) { if (this->cFileType == 'S') { this->F_Reads->pQualScores->getQualityScoresFromQUAL(this->F_Reads->pReadsID); this->R_Reads->pQualScores->getQualityScoresFromQUAL(this->R_Reads->pReadsID); } } /* unsigned int CPairedReadsSet::get_next_capacity_reads_pairs_from_single_file() { bool bStoreQS = (this->R_Reads->pQualScores != NULL) && (this->F_Reads->pQualScores != NULL); bool bSOLiDReadFormat = (this->cFileType == 'Q' || this->cFileType == 'S'); bool bGetQScores = (this->cFileType == 'Q' || this->cFileType == 'q') && bStoreQS; this->clearReads(); do { const char* caNextRead = parser.get_Next_Read(); // get next read and store in this->parser.caNextRead if (caNextRead[0] == '\0') { this->parser.pBuf->fflush(); break; // End of the file } else if (isBadRead(bSOLiDReadFormat, caNextRead, this->longReadLength)) { this->handleBadread(); } else { this->save_next_long_read(bSOLiDReadFormat, bGetQScores, this->in5to3cat3to5Format); } } while (this->F_Reads->pReadsID->size() < this->F_Reads->pReadsSet->capacity()); printf("Deal read no. 
%u in %s.\r", this->uiNo_of_Reads, this->InputFile); this->removeExtraTags(); if(bStoreQS) { this->getQualityScoresFromQUAL(); } return((unsigned int)this->R_Reads->pReadsSet->size()); } */ unsigned int CPairedReadsSet::getExpReadLength(unsigned int fullReadLength) { bool oddReadLength = (fullReadLength % 2 == 1); unsigned int expFullReadLength = this->uiRead_Length * 2; if (oddReadLength) { expFullReadLength --; } return(expFullReadLength); } void CPairedReadsSet::save_next_read_id(const char* tagLine, char sep) { this->F_Reads->save_next_read_id(tagLine, sep); this->R_Reads->save_next_read_id(tagLine, sep); } int CPairedReadsSet::initialization(void) { this->uiNo_of_Bad_Reads = 0; this->uiNo_of_Reads = 0; this->uiRead_Length = 0; this->F_Reads = NULL; this->R_Reads = NULL; this->pQualScoresF = NULL; this->pQualScoresR = NULL; this->in5to3cat3to5Format = false; return(0); } void CPairedReadsSet::handleBadRead(void) { // Currently no message for Bad read if (this->R_Reads->pReadsID->size() > this->R_Reads->uiNo_of_Reads) { this->R_Reads->pReadsID->pop_back(); } if (this->F_Reads->pReadsID->size() > this->F_Reads->uiNo_of_Reads) { this->F_Reads->pReadsID->pop_back(); } this->uiNo_of_Bad_Reads ++; // double check if the counter is correct } ./Source/ParameterList.cpp0000644011075700120610000012302011720654362015657 0ustar yanghochmath-ar/* * This code is adopted from SOCS */ #include "ParameterList.h" using namespace std; const unsigned int DEFAULT_READ_LENGTH = 32; const unsigned int DEFAULT_MAPPING_PER_READ = 200; const unsigned int MAX_MAP_NO_PER_READ = 2000; // should match MAX_Q_CAPACITY; const unsigned int DEFAULT_UPPER_BOUND = 1000000; //const unsigned int MAX_MAP_NO_PER_READ = 1000000; // should match MAX_Q_CAPACITY; MappingOpts::MappingOpts(void) { this->setDefaults(); } MappingOpts::~MappingOpts(void) { ; } void MappingOpts::setDefaults(void) { this->fullCommand = string("perm"); this->readLength = DEFAULT_READ_LENGTH; this->anchorLength = 
DEFAULT_READ_LENGTH; // Mapping criteria this->ambiguousDiffThreshold = 0; this->maxAlignPerRead = DEFAULT_MAPPING_PER_READ; int mismatchScoreThreshold = -1; // No filtering by sum of mismatch score this->subDiffThreshold = 2; this->truncatedReadPrefix = 0; // Output all the best alignments in terms of # of mismatches by default. this->bMap2ForwardStrandOnly = false; this->bMap2ReverseStrandOnly = false; this->bExcludeAmbiguousReads = false; this->bGetAllAlignments = false; this->bIgnoreQS = false; this->bPrintAlignments = true; this->bPrintAmbiguousReadsOnly = false; this->bPrintBadReads = false; this->bPrintFirstAlignmentOnly = false; this->bPrintNM = false; this->bPrintSamHeader = true; this->bPrintAmbigReadsSeparately = false; this->bPrintUnMappedReads = false; this->bPrintAmbigReadsInOneLine = false; // I/O. this->readtag_delimiter = ','; strcpy(this->readsFileFormat, ""); strcpy(this->logFileN, ""); strcpy(this->outputDir, ""); strcpy(this->outputFileN, ""); strcpy(this->outputFormat, ""); strcpy(this->readsFileFormat, ""); strcpy(this->ambiguousReadFileN, ""); strcpy(this->badReadFileN, ""); strcpy(this->unmappedFileN, ""); // mate-pairs this->bExcludeAmbiguousPaired = false; this->bPrintBestPaired = false; this->disUB = DEFAULT_UPPER_BOUND; this->disLB = 0; this->frOnly = false; // Paired end can only aligned to the same strand this->ffOnly = false; // Paired end can only aligned to the same strand. this->bPrintRef4PairedInMapping = false; this->bPrintPairedRQ = false; // performance #ifdef _OPENMP // Parallelization this->maxThreadNum = omp_get_num_procs(); #else this->maxThreadNum = 1; #endif // Because OpenMP 2.5 only support signed integer loop. Case the uiSize. 
The number of bucket is limited to signed int } void MappingOpts::clearOutputFileName(bool clear) { if (clear) { strcpy(this->outputFileN, ""); strcpy(this->unmappedFileN, ""); } } ParameterList::ParameterList() { setDefaults(); } void ParameterList::setDefaults() { this->validFlag = true; this->refFormat == "unknown"; strcpy(refFile, "ref"); strcpy(readsFile, "reads"); strcpy(this->indexFileN, ""); strcpy(this->seedName, ""); this->seedId = 2; // Index this->bMaskedMathRepeat = false; this->bMakeIndex = false; this->bSaveIndex = false; // Mate-pair this->bMatePairedReads = false; strcpy(this->matePairFileN1, ""); strcpy(this->matePairFileN2, ""); } int selectSeedByReadLength(ParameterList& P) { if (P.subDiffThreshold == 0 && P.readLength < 30) { strcpy(P.seedName, "F0"); return(0); } if (P.bMappedSOLiDRead) { if (P.readLength <= 25) { return(2); } else if (P.readLength <= 35) { return(11); } } else { if (P.readLength <= 35) { if (P.readLength < 25 || P.subDiffThreshold < 2) { return(1); } else { return(2); } } else if (P.bMappedLongRead) { int potentialSeedId = P.subDiffThreshold / 2; if (potentialSeedId <= 4 && potentialSeedId > 0) { return(potentialSeedId); } } } if (P.subDiffThreshold <= 2) { return(2); } else { return(3); } } // This function decide options base on the ext name of some options int selectSeed(ParameterList& P) { string seed(P.seedName); if ( seed == "F0") { return(0); } else if ( seed == "F1") { return(1); } else if ( seed == "F2") { return(2); } else if (seed == "S11") { return(11); } else if (seed == "F3") { return(3); } else if (seed == "F4") { return(4); } else if (seed == "S20") { return(20); } else if (seed == "S12") { return(12); } else { // current don't provide higher full sensitivity seed return(selectSeedByReadLength(P)); } } inline string fileFormatSymbol2String(char cFileFormatSymbol) { string fileFormat; switch (cFileFormatSymbol) { case 'Q': fileFormat = string("csfastq"); return(fileFormat); case 'q': fileFormat = 
string("fastq"); return(fileFormat); case 'S': fileFormat = string("csfasta"); return(fileFormat); default: fileFormat = string("fasta"); return(fileFormat); } } // Print out the setting info void ParameterList::printSetting(void) { // Read type, allowing mismatches, allowing ambiguous read printf("Options Info:\n"); if (this->bMappedSOLiDRead) { printf("Expect reads in color space.\n"); } string fileType = fileFormatSymbol2String(this->cFileFormatSymbol); printf("Reads are processed as in %s format.\n", fileType.c_str()); if (this->bDiscardReadWithN) { printf("Reads with 'N' or unknown characters will be discarded.\n"); } else if (this->allowedNumOfNinRead > 0) { printf("Reads with %d 'N' or unknown characters will be discarded.\n", this->allowedNumOfNinRead); } printf("Results for reads that map to more than %d locations will not be reported.\n", this->maxAlignPerRead); if (this->truncatedReadPrefix > 0) { printf("The first %u bases will be excluded as barcode.\n", this->truncatedReadPrefix); } printf("The effective read length is %u.\n", this->readLength); printf("%d mismatches are allowed in the length.\n", this->subDiffThreshold); if (this->bPrintAmbiguousReadsOnly) { printf("Only reads mapped to multiple locations will be printed.\n"); } else if (this->bExcludeAmbiguousReads) { printf("Reads mapped to multiple locations will be exclueded.\n"); } else if (this->bGetAllAlignments && !this->bExcludeAmbiguousReads) { printf("Reads which have alignments within %u mismatches will be counted as mapped.\n", this->subDiffThreshold); printf("Reads which have more than %u alignments won't be printed.\n", this->maxAlignPerRead); } else { printf("Alignments with minimum mismatches will be collected.\n"); } if(this->bMap2ForwardStrandOnly) { printf("Reads are mapped to only the forward strand.\n"); } if (this->bMap2ReverseStrandOnly) { printf("Reads are mapped to only the reverse strand.\n"); } if(this->bMap2ForwardStrandOnly && this->bMap2ReverseStrandOnly) { 
printf("Conflict!\n"); } cout << endl; } // This function decide options base on the ext name of some options void ParameterList::getOptsByCheckingExtName(void) { // fixed the --readsFormat input char c = this->readsFileFormat[0]; if (c != '\0' && c != '.') { strcpy(&this->readsFileFormat[1], string(this->readsFileFormat).c_str()); this->readsFileFormat[0] = '.'; } // choose the output format by ext name // Also check .SAM .Mapping or .fastq (Not case sensitive) if ((hasTheExtName(this->outputFileN, ".sam"))) { if (this->outputFormat[0] == '\0') { strcpy(this->outputFormat, "sam"); } else if (strcmp(this->outputFormat,"sam") != 0 && strcmp(this->outputFormat,"SAM")) { LOG_INFO("Info %d: Conflict between ext file and --outputformat %s!\n", WARNING_LOG, this->outputFormat); } } if (hasTheExtName(this->outputFileN, ".mapping")) { if (this->outputFormat[0] == '\0') { strcpy(this->outputFormat, "mapping"); } else if (strcmp(this->outputFormat,"mapping") != 0 && strcmp(this->outputFormat,"MAPPING") != 0) { LOG_INFO("Info %d: Conflict between ext file and --outputformat %s!\n", WARNING_LOG, this->outputFormat); } } if (hasTheExtName(this->outputFileN, ".fastq") || hasTheExtName(this->outputFileN, ".fq") || hasTheExtName(this->outputFileN, ".FASTQ") || hasTheExtName(this->outputFileN, ".FQ") ) { if (this->outputFormat[0] == '\0') { strcpy(this->outputFormat, "fastq"); } else if (strcmp(this->outputFormat,"fastq") != 0 && strcmp(this->outputFormat,"FASTQ") != 0 && strcmp(this->outputFormat,"fq") != 0 && strcmp(this->outputFormat,"FQ") != 0) { LOG_INFO("Info %d: Conflict between ext file and --outputformat %s!\n", WARNING_LOG, this->outputFormat); } } } bool ParameterList::checkRefValidity(void) { if (!fileExist(this->refFile)) { LOG_INFO("Info %d: Reference file %s is not found.\n",\ ERROR_LOG, this->refFile); this->validFlag = false; } else if (!(withFastaExtFileName(this->refFile) || \ hasTheExtName(this->refFile, ".dat") || \ hasTheExtName(this->refFile, ".txt") || \ 
hasTheExtName(this->refFile, ".index")) &&\ !(this->refFormat == "fasta" ||\ this->refFormat == "list" ||\ this->refFormat == "index" )) { LOG_INFO("\nInfo %d: Reference %s has an unexpected ext name.\n",\ ERROR_LOG, this->readsFile); this->validFlag = false; printSynopsis(); } return(this->validFlag); } bool ParameterList::truncatReadLength(void) { // Truncate reads if opts are specified bool bTruncateRead = this->truncatedReadPrefix > 0; // Ex: if -t 5 -T 40 for a line with 50 char, cut the reads [5-44] bTruncateRead |= (this->readLength > (this->truncatedReadLength + this->truncatedReadPrefix)); // this->bMatePairedReads has a special format fro 5'-3' 3'-5' (Don't truncate in that case) if (bTruncateRead) { if (this->readLength > this->truncatedReadPrefix) { this->readLength = min(this->truncatedReadLength, this->readLength - this->truncatedReadPrefix); } if (this->truncatedReadPrefix > 0) { LOG_INFO("Info %d: The first %d bp of the read will be ignored.\n", INFO_LOG, this->truncatedReadPrefix); } LOG_INFO("Info %d: The following %d bp will be kept.\n", INFO_LOG, this->readLength); } this->anchorLength = this->readLength; if (this->readLength > MAX_READ_LENGTH * 2) { LOG_INFO("\nInfo %d: Read length %d is longer than the maximum read length %d.\n",\ ERROR_LOG, this->readLength, MAX_READ_LENGTH * 2); return(false); } else if (this->readLength < MIN_READ_LENGTH) { LOG_INFO("Info %d: Read length %d could be too short!\n", INFO_LOG, this->readLength); return(false); } else if (this->readLength > MAX_READ_LENGTH) { if ( this->bMappedSOLiDRead) { // TODO: Enable to map long solid read by not limit the read length this->readLength = MAX_READ_LENGTH; // Truncated to read length 64 for SOLiD read. this->anchorLength = MAX_READ_LENGTH; // Truncated to read length 64 for SOLiD read. /* this->bMappedLongRead = true; // long reads in Illumina // when build up the table, use half read length as seed. 
if (this->readLength % 2 == 1) { this->bOddReadLengthAndLongRead = true; this->anchorLength = (this->readLength + 1) / 2; } else { this->anchorLength = this->readLength / 2; } */ } else if (this->bMatePairedReads && this->readLength > MAX_READ_LENGTH * 2) { LOG_INFO("\nInfo %d: Currnetly PerM doesn't support pair-end reads with %d bp > %d bp.\n",\ WARNING_LOG, this->readLength, MAX_READ_LENGTH * 2); LOG_INFO("\nInfo %d: Truncated reads into %d bp\n", INFO_LOG, MAX_READ_LENGTH); this->readLength = MAX_READ_LENGTH; // Truncated to read length 64 for SOLiD read. this->anchorLength = MAX_READ_LENGTH; // Truncated to read length 64 for SOLiD read. } else { this->bMappedLongRead = true; // long reads in Illumina // when build up the table, use half read length as seed. if (this->readLength % 2 == 1) { this->bOddReadLengthAndLongRead = true; this->anchorLength = (this->readLength + 1) / 2; } else { this->anchorLength = this->readLength / 2; } } } return(true); } unsigned int nonSpaceCharCount(const char* str) { unsigned int returnValue = 0; for (int i = 0; str[i] != '\0'; i++) { if (!(isspace(str[i]))) { returnValue++; } } return(returnValue); } // Return the read length given a line in the reads input file unsigned int getReadLength(char* line, char cFileType) { unsigned int readLength = 0; switch (cFileType) { case 'F': // Given the reads file are fasta format case 'q': readLength = (unsigned int) nonSpaceCharCount(line); return (readLength); case 'S': // Given the reads file are SOLiD format case 'Q': readLength = (unsigned int) nonSpaceCharCount(line) - 1; return (readLength); default: return DEFAULT_READ_LENGTH; } } bool isSkipLines(const char* line) { return(!isNucleotide(line[0])); } // Must add new reads estimate function to support other reads format. 
unsigned int getReadLength(const char* readFile, char expFileFormat) { char cFileType; if (expFileFormat != 'N') { cFileType = expFileFormat; } else { cFileType = getReadsFileFormatSymbol(readFile); } ifstream ifile(readFile); char seqLine[MAX_LINE]; do { seqLine[0] = EOF; ifile.getline(seqLine, MAX_LINE, '\n'); if (seqLine[0] == EOF) { LOG_INFO("\nInfo %d: The read length can not be decided from the input format.\n", WARNING_LOG); return(DEFAULT_READ_LENGTH); } } while (isSkipLines(seqLine)); unsigned int readLength = getReadLength(seqLine, cFileType); // Add the extra read length if it is printed in the following second line. if (readLength >= MAX_READ_LENGTH) { ifile.getline(seqLine, MAX_LINE, '\n'); if (!(isSkipLines(seqLine))) { readLength += getReadLength(seqLine, cFileType); } } return (readLength); } void printSingleSynopsis(void) { cout << "For single end:" << endl; cout << "perm <-flag options>" << endl; cout << "Ex: perm Ref.fasta Reads.fa -v 5 -o out.sam " << endl; cout << "Ex: perm RefFilesList.txt ReadsFileList.txt -v 3 -A -m -s my.index" << endl; cout << "Ex: perm Ref.index ReadsSetFilesList.txt -E > my.log" << endl; cout << endl; } void printPairedSynopsis(void) { cout << "For paired-end reads:" << endl; cout << "perm -1 -2 <-flag options>" << endl; cout << "Ex: perm chrM.fa -1 F3.fq -2 R3.fq -U 500 -L 200 -o out.sam." 
<< endl;
    cout << "Ex: perm hg18.txt -1 F3.csfasta -2 R3.csfasta -U 500 -L 0 -e -m -s hg18.index" << endl;
    cout << "Ex: perm Transcriptom.index -1 F3.fa -2 R3.fa" << endl;
    cout << endl;
}

// Print the synopsis for building an index without mapping.
void printBuildInDexInfo(void)
{
    cout << "For build index only:" << endl;
    cout << "perm " << endl;
    cout << "Ex: perm hg18.fa 50 --readFormat csfasta --seed F3 -s 50F3hg18Solid.index" << endl;
    cout << "Ex: perm hg18.txt 50 --readFormat fastq --seed F2 -s 50F2hg18Illumina.index " << endl;
    cout << endl;
}

// Print a worked single-end example with an explanation of its flags.
void printSingleEndExample(void)
{
    cout << "Ex: ./perm Ref.fa Reads.fa -v 5 -E -T 36 -m -s -o out.sam" << endl;
    cout << "PerM will report only the uniquely mapped reads within 5 mismatches in the first 36 bases." << endl;
    cout << "PerM will reconstruct and save the indexes to the default path for future mappings." << endl;
    cout << "PerM will output the mappings file to out.sam in the SAM format." << endl;
    cout << endl;
}

// Print a worked paired-end example with an explanation of its flags.
void printPairedEndExample(void)
{
    cout << "Ex: ./PerM -1 F3.fq -2 R3.fq -U 500 -L 200 -m -s -o out.sam." << endl;
    cout << "PerM will report only the non-ambiguous mate pair in the range of [200bp, 500bp]" << endl;
    cout << "PerM will reconstruct and save the indexes to the default path for future mappings." << endl;
    cout << "PerM will output the mappings file to out.sam in the SAM format." << endl;
    cout << endl;
}

// Print the description of the reference, reads and output file formats.
// NOTE(review): the three empty-string lines look like section headings whose
// angle-bracketed text was lost in this copy of the source — confirm against
// the upstream file.
void printIoInfo(void)
{
    cout << "" << endl;
    cout << "The reference file should be in the fasta format, with *.fa or *.fasta ext filename." << endl;
    cout << "It can be transcriptom, with multiple seqs separated by lines start with '>' and the ref seq name." << endl;
    cout << "If you have multiple files, you can put each file path a line in a .txt file." << endl;
    cout << "ex: chr1.fa\\n chr2.fa\\n .. etc, with each path in one row." << endl;
    cout << "PerM also takes the txt file as ref, a file listing paths allreference paths." << endl;
    cout << "PerM parses a ref file according to it's ext name, which can be overwritten by --refFormat flag."<< endl;
    cout << endl;
    cout << "" << endl;
    cout << "The reads file should be in the .fasta, .fastq or .csfasta format." << endl;
    cout << "PerM parses a reads file according to it's ext name, which can be overwritten by --readFormat flag."<< endl;
    cout << "Large reads file can be split to small files. Make *.txt file with each line a path of the reads." << endl;
    cout << "Use the txt file as the reads input so PerM can map reads set in parallels." << endl;
    cout << endl;
    cout << "" << endl;
    cout << "PerM's default output format has the ext name *.mapping." << endl;
    cout << "To output in SAM format, specify flag -o path.sam, instead of -o path.mapping." << endl;
    cout << endl;
}

// Print a description of the default option values.
void printDefault(void)
{
    cout << "When you type:" << endl;
    cout << "\tperm for single end read mapping, or" << endl;
    cout << "\tperm -1 -2 for paired reads maping" << endl;
    cout << endl;
    cout << "PerM uses its default options, which can be overwritten by the options flags." << endl;
    cout << "The min and max of pair-end separations are 0 and 1Mb in a same ref, respectively" << endl;
    cout << "By default, PerM outputs all the best alignments in terms of the mismatches number." << endl;
    cout << "It throw away all reads with 'N' or '.' unless --includeReadsWN is specified" << endl;
    cout << "The default seeds varied in different the read lengthes and types." << endl;
    cout << "It won't save the index if -s is not specified." << endl;
    cout << "When -s is specified but not follow a path, a default filename is used to save the index." << endl;
    cout << "That path will be test for index reused for the mapping next time." << endl;
    cout << "The I/O formats are decided by the the ext name (.sam or .mapping) of the specified I/O file." << endl;
    cout << "unless special flags are set to overwrite the guess." << endl;
    cout << endl;
}

// Print the option reference shared by single-end and paired-end mapping,
// followed by the single-end example.
void printSingleEndOptions(void)
{
    cout << "Options for both single and pair-end:" << endl;
    cout << "The default is to output all best alignments in terms of # of mismatches." << endl;
    cout << "-A Output all alignments within the mismatch threshold (see -v option), end-to-end." << endl;
    cout << "-E Output only uniquely mapped reads remaining after the best down selection has been applied if applicable." << endl;
    cout << "-E -A Output only the uniquely mapped reads" << endl;
    cout << " ex: a read with a exact match and one mismatch, it is unique under only -E" << endl;
    cout << " but not unique under -E -A." << endl;
    // cout << "-a Print out ambiguous reads only. Those uniquely mapped reads will be ignored" << endl;
    cout << "-v Int Maximum number of mismatches allowed (or allowed in each end for pair-end reads)." << endl;
    cout << "" << endl;
    cout << "-t Number of bases at the 5' end of each read to ignore. For example, if the first 5 bases are used as a barcode or to index multiple samples together, use -t 5. If not specified, no initial bases will be ignored. " << endl;
    cout << "-T Int Number of bases in each read to use, starting after any bases ignored by -t option. " << endl;
    cout << " Later bases at the 3' of the read are ignored. For example, -T 30 means use only first 30 bases (signals) after the any bases ignored due to the -t option." << endl;
    // The two limits below are the file-level constants, so the help text
    // follows them automatically.
    cout << "-k Int The naxmimun num of mapping per read. The default is " << DEFAULT_MAPPING_PER_READ << '.' << "The maximun is " << MAX_MAP_NO_PER_READ << "." << endl;
    cout << " reads mapped to more than that location won't be printed by default, unless flag --ambiguosReadOnly is set." << endl;
    cout << "-m Create the reference index without reusing the saved index even if available. " << endl;
    cout << "-s Path Save the reference index to accelerate the mapping in the future. If path is not specified, the default path will be used. " << endl;
    cout << "-o Path Name of mapping output file when mapping a single read set. " << endl;
    cout << " The output file format will be either the .mapping tab delimited text format or the SAM format as determined by the extension of the output filename." << endl;
    cout << "-d Path Output directory for mapping output files when mapping multiple read sets (output files will be named automatically)." << endl;
    cout << "-a Path Create a FASTA (FASTQ) file for reads mapped to more positions than the threshold specified by -k or the default of 200. " << endl;
    cout << "-b Path Create a FASTA (FASTAQ) file of bad reads or reads shorter than expected." << endl;
    cout << "-u Path Create a FASTA (FASTAQ) file of unmapped reads." << endl;
    cout << "-u When multiple read sets are mapped, filename is irrelevant and should be omitted." << endl;
    cout << " the files of unmapped sequences will automatically be named and created in the directory PerM is run from." << endl;
    cout << "If only -a -u or -o is specified without giving a path, a default path will be used instead." << endl;
    cout << "--log Path Output the mapping count to a specified file" << endl;
    cout << endl;
    cout << "--forwardOnly Map reads to the forward strand only: (This is for SOLiD Strand specific sequencing) " << endl;
    cout << "--reverseOnly Map reads to the reverse strand only: (This is for SOLiD Strand specific sequencing) " << endl;
    cout << "--forwardOnly and --reverseOnly shouldn't use together" << endl;
    cout << "--ambiguosReadOnly Output only ambiguous mapping to find repeats (similar regions within substitution threshold)" << endl;
    cout << "--ambiguosReadInOneLine Output reads mapped to more than k places in one line" << endl;
    cout << ". when this option is specified, reads that mapped to over mapping number threshold that specified by -k will still be printed." << endl;
    cout << "--noSamHeader Do not print the SAM header. This makes it easier to concatenate multiple SAM output files." << endl;
    cout << "--includeReadsWN Int Map reads with at most given number of N or '.' bases by encoding N or '.' as A or 3." << endl;
    cout << " Normally such reads are ignored. " << endl;
    cout << "--statsOnly Output the mapping statistics to stdout only, without saving alignments to files. " << endl;
    cout << "--ignoreQS Ignore the quality scores in fastq or QUAL files." << endl;
    cout << "--printNM When quality scores are available, use this flag to print number of mismatches, instead of mismatch scores in mapping format. " << endl;
    cout << "--delimiter 'c', where c is the delimiter of the read id, for strange format of read files" << endl;
    cout << "--seed {F2 | S11 | F3 | F4}. Specify the seed pattern. The F0, F1, F2, F3, and F4 seeds are fully sensitive to 0-4 mismatches respectively." << endl;
    cout << "The S11 and S12 seeds are fully sensitive to one biological mismatch (SNP) and one or two SOLiD color mismatches respectively" << endl;
    cout << "See http://bioinformatics.oxfordjournals.org/cgi/content/abstract/25/19/2514" << endl;
    cout << "--refFormat {fasta | list | index }. Take refs file in the specified format," << endl;
    cout << "instead of gussing according to its ext name." << endl;
    cout << "--readFormat {fasta | fastq | csfasta | csfastq}. Take reads file in the specified format," << endl;
    cout << "instead of gussing according to its ext name." << endl;
    cout << "--outputFormat {sam | mapping | fastq}. Output mapping in the specified format," << endl;
    cout << "instead of gussing according to its ext name." << endl;
    cout << endl;
    printSingleEndExample();
}

// Print the options specific to mate-paired reads, followed by the
// paired-end example.
void printPairedEndOptions(void)
{
    cout << "Options for mate-paired reads:" << endl;
    cout << "-L I lower bound for mate-paired separation distance" << endl;
    cout << "-U I upper bound for mate-paired separation distance" << endl;
    cout << "-e Exclude ambiguous paired " << endl;
    cout << "-o P where P is the path (single paired read set) or the directory (multiple paired read sets)" << endl;
    cout << "The default is set as -L 0 -U 1000000" << endl;
    cout << "--fr Map paired-end reads to different strand only" << endl;
    cout << "--ff Map paired-end reads to the same strand only" << endl;
    cout << "The default option collects paried-end mappings in both the same and difference strands" << endl;
    cout << "--printRefSeq Print the reference sequence of paired-end mapping in .mapping format" << endl;
    cout << "--printQual Print the read qual scores for paired-end mapping in .mapping format" << endl;
    cout << endl;
    printPairedEndExample();
}

// Point the user at the two detailed option listings instead of printing them.
void printOptionsInfo(void)
{
    cout << "Type \"perm single\" to see options for single end mapping" << endl;
    cout << "Type \"perm paired\" to see options for paired end mapping" << endl;
    /*
    printSingleEndOptions();
    printPairedEndOptions();
    */
}

// Print the program banner, the three synopses and the list of help topics.
void printSynopsis(void)
{
    cout << "PerM (Efficient read mapping with periodic full sensitive seeds)" << endl;
    cout << "Version: 0.4.0\n" << endl;
    cout << "Synopsis:" << endl;
    printSingleSynopsis();
    printPairedSynopsis();
    printBuildInDexInfo();
    cout << "Please type" << endl;
    cout << "perm io (to see the I/O format)" << endl;
    cout << "perm default (to see the default opt setting)" << endl;
    cout << "perm single (to see options for single-end mapping)" << endl;
    cout << "perm paired (to see options for paired-end mapping)" << endl;
    cout << "For more info, please check: http://code.google.com/p/perm/" << endl;
    cout << endl;
}

// Dispatch a help topic name to the matching help printer; unknown topics
// fall back to the synopsis.
void printUsageInfo(string helepOpt)
{
    if (helepOpt == "io") {
        printIoInfo();
    } else if (helepOpt == "default") {
        printDefault();
    } else
if (helepOpt == "options") {
        printOptionsInfo();
    } else if(helepOpt == "single") {
        printSingleEndOptions();
    } else if(helepOpt == "paired") {
        printPairedEndOptions();
    } else {
        printSynopsis();
    }
}

/*
 * Rebuild the command line as one string: argv[0] followed by every
 * remaining argument, each preceded by a single space.
 * argv[0] is read unconditionally, so argv must hold at least one entry.
 */
string getFullCommand(int argc, const char** argv)
{
    string fullCommand = string(argv[0]);
    int i;
    for (i = 1; i < argc; i++) {
        string tmpStr(argv[i]);
        fullCommand = fullCommand.append(" ").append(tmpStr);
    }
    return(fullCommand);
}

/*
 * Map a seed name to its mismatch threshold:
 *   "F2" -> 2, "S11"/"F3" -> 3, "S12"/"F4" -> 4.
 * Any other name returns the threshold that was passed in, unchanged.
 */
static int assignMismatchThresholdBySeed(const char* seedName, int threshold)
{
    if (strcmp(seedName, "F2") == 0) {
        threshold = 2;
    } else if (strcmp(seedName, "S11") == 0) {
        threshold = 3;
    } else if (strcmp(seedName, "F3") == 0) {
        threshold = 3;
    } else if (strcmp(seedName, "S12") == 0) {
        threshold = 4;
    } else if (strcmp(seedName, "F4") == 0) {
        threshold = 4;
    }
    return(threshold);
}

/*
 * Parse the command line into a ParameterList.
 *
 *   argc <= 1 (or argv == NULL) : print the synopsis.
 *   argc == 2                   : print the usage text selected by argv[1].
 *   otherwise                   : argv[1] is the reference file, argv[2] the
 *                                 reads file, argv[3] (if numeric) the
 *                                 mismatch threshold, followed by flags.
 *
 * In the two help cases the returned object is the default-constructed
 * ParameterList. The file names are copied with strcpy, so they must fit
 * the destination buffers (sizes are declared elsewhere).
 */
ParameterList getParameterList(int argc, const char** argv)
{
    ParameterList parameters;
    if (argc <= 1 || argv == NULL) {
        printSynopsis();
    } else if (argc == 2) {
        printUsageInfo(argv[1]);
    } else {
        parameters.fullCommand = getFullCommand(argc, argv);
        CFlags f;
        strcpy(parameters.refFile, argv[1]);
        f.checkpStrOpt(argc, argv, "--refFormat", parameters.refFormat);
        strcpy(parameters.readsFile, argv[2]);
        f.checkpStrOpt(argc, argv, "--readFormat", parameters.readsFileFormat);
        if (argc > 3) {
            // Keep the default when argv[3] is not a number: atoi() yields 0
            // for non-numeric text, so 0 is only trusted if the text starts with '0'.
            int DEFAULT_VAR_T = parameters.subDiffThreshold;
            parameters.subDiffThreshold = atoi(argv[3]);
            if (argv[3][0] != '0' && atoi(argv[3]) == 0) {
                parameters.subDiffThreshold = DEFAULT_VAR_T; // Arg not a number
            }
        }
        bool bAssignSeed = f.checkpStrOpt(argc, argv, "--seed", parameters.seedName);
        if (bAssignSeed) {
            parameters.subDiffThreshold = assignMismatchThresholdBySeed(parameters.seedName, parameters.subDiffThreshold);
        }
        // "-v" is queried after the seed-based assignment
        // (presumably so an explicit -v wins; confirm against CFlags::checkIntOpt).
        f.checkIntOpt(argc, argv, "-v", parameters.subDiffThreshold);
        f.checkIntOpt(argc, argv, "-q", parameters.mismatchScoreThreshold);
        f.checkIntOpt(argc, argv, "-k", parameters.maxAlignPerRead);
        if (parameters.maxAlignPerRead >= (int)MAX_MAP_NO_PER_READ) {
            parameters.maxAlignPerRead = (int)MAX_MAP_NO_PER_READ - 1; // Must save a space
        }
        // Special options to map strand specific reads to forward strand only
        parameters.bMap2ForwardStrandOnly = f.checkArg(argc, argv, "--forwardOnly");
        parameters.bMap2ReverseStrandOnly = f.checkArg(argc, argv, "--reverseOnly");
        // The default is to output the best alignments in terms of number of mismatches.
        parameters.bGetAllAlignments = f.checkArg(argc, argv, "-A");
        parameters.bExcludeAmbiguousReads = f.checkArg(argc, argv, "-E");
        parameters.bPrintBestPaired = f.checkArg(argc, argv, "-B");
        // Special hidden flag that prints only ambiguous reads. Should be used with -E
        parameters.bPrintAmbiguousReadsOnly = f.checkArg(argc, argv, "--ambiguosReadOnly");
        parameters.bPrintFirstAlignmentOnly = f.checkArg(argc, argv, "--1stAlignmentOnly");
        parameters.bPrintAmbigReadsInOneLine = f.checkArg(argc, argv, "--ambiguosReadInOneLine");
        // The default will try to use the saved index table, without attempting to save the table.
        parameters.bMakeIndex = f.checkArg(argc, argv, "-m");
        parameters.bSaveIndex = f.checkArg(argc, argv, "-s");
        f.checkpStrOpt(argc, argv, "-s", parameters.indexFileN);
        parameters.bPrintAmbigReadsSeparately = f.checkArg(argc, argv, "-a");
        f.checkpStrOpt(argc, argv, "-a", parameters.ambiguousReadFileN);
        parameters.bPrintBadReads = f.checkArg(argc, argv, "-b");
        f.checkpStrOpt(argc, argv, "-b", parameters.badReadFileN);
        parameters.bPrintUnMappedReads = f.checkArg(argc, argv, "-u");
        f.checkpStrOpt(argc, argv, "-u", parameters.unmappedFileN);
        f.checkpStrOpt(argc, argv, "-o", parameters.outputFileN);
        f.checkpStrOpt(argc, argv, "--outputFormat", parameters.outputFormat);
        parameters.bPrintSamHeader = !f.checkArg(argc, argv, "--noSamHeader");
        f.checkpStrOpt(argc, argv, "-d", parameters.outputDir);
        f.checkpStrOpt(argc, argv, "--log", parameters.logFileN);
        f.checkUnIntOpt(argc, argv, "-T", parameters.truncatedReadLength);
        f.checkUnIntOpt(argc, argv, "-t", parameters.truncatedReadPrefix);
        // read filtering
        // "--includeReadsWN" alone allows up to MAX_READ_LENGTH N's per read;
        // the same flag is then queried for a numeric value that replaces that limit.
        parameters.bDiscardReadWithN = !f.checkArg(argc, argv, "--includeReadsWN");
        if (!parameters.bDiscardReadWithN) {
            parameters.allowedNumOfNinRead = MAX_READ_LENGTH;
        }
        f.checkUnIntOpt(argc, argv, "--includeReadsWN", parameters.allowedNumOfNinRead);
        if (parameters.allowedNumOfNinRead > 0) {
            parameters.bDiscardReadWithN = false;
        }
        // truncate the read ID
        f.checkpCharOpt(argc, argv, "--delimiter", parameters.readtag_delimiter);
        // for quality score (incomplete)
        parameters.bPrintAlignments = !f.checkArg(argc, argv, "--statsOnly");
        parameters.bIgnoreQS = f.checkArg(argc, argv, "--ignoreQS");
        parameters.bPrintNM = f.checkArg(argc, argv, "--printNM");
        // Set max number of threads: "-p" can only lower the preset maximum,
        // and a value of 0 is ignored.
        unsigned int noMaxTreadNo = parameters.maxThreadNum;
        if (f.checkUnIntOpt(argc, argv, "-p", noMaxTreadNo)) {
            if (noMaxTreadNo > 0) {
                parameters.maxThreadNum = min(noMaxTreadNo, parameters.maxThreadNum);
            }
        }
        // for mate-pairs
        parameters.bExcludeAmbiguousPaired = f.checkArg(argc, argv, "-e");
        f.checkpStrOpt(argc, argv, "-1", parameters.matePairFileN1);
        f.checkpStrOpt(argc, argv, "-2", parameters.matePairFileN2);
        f.checkIntOpt(argc, argv, "--LowerBound", parameters.disLB);
        f.checkIntOpt(argc, argv, "-L", parameters.disLB);
        f.checkIntOpt(argc, argv, "--upperBound", parameters.disUB);
        f.checkIntOpt(argc, argv, "-U", parameters.disUB);
        // Pair end can only align to different strands
        parameters.frOnly = f.checkArg(argc, argv, "--fr");
        // Pair end can only align to the same strand.
        parameters.ffOnly = f.checkArg(argc, argv, "--ff");
        parameters.bPrintRef4PairedInMapping = f.checkArg(argc, argv, "--printRefSeq");
        parameters.bPrintPairedRQ = f.checkArg(argc, argv, "--printQual");
        // Check whether the first parameter is an option rather than a file
        if (argv[1][0] == '-') {
            // use the last two arguments as the reference and the reads file
            strcpy(parameters.refFile, argv[argc - 2]);
            strcpy(parameters.readsFile, argv[argc - 1]);
        }
        // Runs after every option above has been queried
        // (presumably CFlags tracks which arguments were consumed; confirm).
        f.checkUnrecognizedFlags(argc, argv);
        parameters.getOptsByCheckingExtName();
    }
    return(parameters);
}

/* Return true when fileName has one of the extensions .fasta, .fna, .mfa or .fa. */
bool withFastaExtFileName(const char* fileName)
{
    if (hasTheExtName(fileName, ".fasta") ||
        hasTheExtName(fileName, ".fna") ||
        hasTheExtName(fileName, ".mfa") ||
        hasTheExtName(fileName, ".fa")) {
        return (true);
    } else {
        return(false);
    }
}

/*
 * Return true when fileName has a FASTA extension (see withFastaExtFileName)
 * or one of .csfasta, .csfa, .fastqsanger, .fastq, .fq, .csfastq, .csfq.
 */
bool withSupportExtFileName(const char* fileName)
{
    if (withFastaExtFileName(fileName) ||
        hasTheExtName(fileName, ".csfasta") ||
        hasTheExtName(fileName, ".csfa") ||
        hasTheExtName(fileName, ".fastqsanger") ||
        hasTheExtName(fileName, ".fastq") ||
        hasTheExtName(fileName, ".fq") ||
        hasTheExtName(fileName, ".csfastq") ||
        hasTheExtName(fileName, ".csfq")) {
        return (true);
    } else {
        return(false);
    }
}

/*
 * This function gets the read set file names in a file list into vectors.
 * If reads are not paired, only readSetList1 will be filled.
 * If reads are paired, both readSetList1 and readSetList2 will be filled.
 * NOTE(review): the element type of the two vector parameters is not visible
 * in this copy (template arguments appear to have been stripped) - confirm.
 */
bool getReadSetsFilenames(ParameterList &P, \
                          vector& readSetList1,\
                          vector& readSetList2)
{
    P.bMatePairedReads = false;
    if (fileExist(P.readsFile)) {
        if (hasTheExtName(P.readsFile, ".txt") || P.refFormat == "list") {
            // read files are in a list.
LOG_INFO("Info %d: Reading the file as a read set list\n", FINE_LOG); char readSetFile1[FILENAME_MAX]; char readSetFile2[FILENAME_MAX]; /* ifstream readsFileList(P.readsFile); P.bMatePairedReads = GetNextFilenamePairFromListFile(readsFileList, readSetFile1, readSetFile2); readsFileList.close(); */ P.bMatePairedReads = GetNextFilenamePairFromListFile(P.readsFile, readSetFile1, readSetFile2); // When forward and backward paired reads are stored in separated files and listed in a row. if (P.bMatePairedReads) { ifstream readsFileList(P.readsFile); while (GetNextFilenamePairFromListFile(readsFileList, readSetFile1, readSetFile2)) { readSetList1.push_back(readSetFile1); readSetList2.push_back(readSetFile2); } readsFileList.close(); } else { ifstream readsFileList(P.readsFile); while (GetNextFilenameFromListFile(readsFileList, readSetFile1)) { readSetList1.push_back(readSetFile1); } readsFileList.close(); } } else if (withSupportExtFileName(P.readsFile) || withSupportExtFileName(P.readsFileFormat)) { readSetList1.push_back(string(P.readsFile)); } else { cout << " The reads file is not in a recognizable format (ext name).\n" << endl; return(false); } } else if (fileExist(P.matePairFileN1) && fileExist(P.matePairFileN2)) { if((hasTheExtName(P.matePairFileN1, ".txt") && hasTheExtName(P.matePairFileN2, ".txt")) || P.refFormat == "list") { ifstream readsFileList1(P.matePairFileN1); ifstream readsFileList2(P.matePairFileN2); char readSetFile1[FILENAME_MAX]; char readSetFile2[FILENAME_MAX]; while (GetNextFilenameFromListFile(readsFileList1, readSetFile1) && GetNextFilenameFromListFile(readsFileList2, readSetFile2)) { readSetList1.push_back(readSetFile1); readSetList2.push_back(readSetFile2); } readsFileList1.close(); readsFileList2.close(); } else if ((withSupportExtFileName(P.matePairFileN1)\ && withSupportExtFileName(P.matePairFileN2))\ || withSupportExtFileName(P.readsFileFormat)) { readSetList1.push_back(string(P.matePairFileN1)); 
readSetList2.push_back(string(P.matePairFileN2)); P.bMatePairedReads = true; } else { cout << " The reads files are not in a recognizable format (ext name).\n" << endl; return(false); } } else { if (P.matePairFileN1[0] != '\0' && !fileExist(P.matePairFileN1)) { printf("Can't open reads file %s.", P.matePairFileN1); } if (P.matePairFileN2[0] != '\0' && !fileExist(P.matePairFileN2)) { printf("Can't open reads file %s.", P.matePairFileN2); } if (atoi(P.readsFile) == 0 && !fileExist(P.readsFile)) { printf("Can't open reads file %s.", P.readsFile); } return(false); } return(true); } bool printOptWarning4PairedEndOpts(ParameterList &P) { bool optIsCorrect = true; if(!P.bMatePairedReads) { if(P.disLB > 0) { cout << "Unexpected -L option for single end mapping " << endl; optIsCorrect = false; } if(P.disUB != (int)DEFAULT_UPPER_BOUND) { cout << "Unexpected -U option for single end mapping " << endl; optIsCorrect = false; } if(P.frOnly) { cout << "Unexpected --fr option for single end mapping " << endl; optIsCorrect = false; } /* if(P.rfOnly) { cout << "Unexpected --rf option for single end mapping " << endl; optIsCorrect = false; } */ if(P.ffOnly) { cout << "Unexpected --ff option for single end mapping " << endl; optIsCorrect = false; } if(P.bExcludeAmbiguousPaired) { cout << "Unexpected --e option for single end mapping " << endl; optIsCorrect = false; } } return(optIsCorrect); } bool printOptWarning4SingleEndOpts(ParameterList &P) { bool optIsCorrect = true; if(P.bMatePairedReads) { if(P.bExcludeAmbiguousReads > 0) { cout << "Unexpected -E option for paired-end mapping " << endl; optIsCorrect = false; } } return(optIsCorrect); } // return false if the ext name of readSet is not expected bool checkFileListHasTheRightExt(vector& readSetList) { std::string extName; for ( vector::iterator it = readSetList.begin(); it != readSetList.end(); it++) { if ( it == readSetList.begin()) { extName = std::string(getExtName(it->c_str())); } else if ( *it == string("") ) { continue; } 
else { if (extName != std::string(getExtName(it->c_str()))) { cout << "Reads input file " << *it << " doesn't have the ext name " << extName << ".\n"; return(false); } } if (!withSupportExtFileName(it->c_str())) { cout << "Reads input file " << *it << " has a unexpected ext name\n"; return(false); } } return(true); } bool checkReadsSetNamesValidity(vector& readSetsList1, vector& readSetsList2) { bool valid_flag = true; if (readSetsList1.size() == 0) { cout << "There are no read sets to mapped " << endl; return(false); } else { valid_flag = checkFileListHasTheRightExt(readSetsList1); if (valid_flag && readSetsList2.size() > 0) { if (readSetsList1.size() != readSetsList2.size()) { cout << "The paired end read set has different size" << endl; valid_flag = false; } valid_flag = checkFileListHasTheRightExt(readSetsList2); } } if (!valid_flag) { cout << "Please check the input reads files.\n" << endl; } return(valid_flag); } ./Source/ParseReadsOpts.cpp0000644011075700120610000000103111720654362015777 0ustar yanghochmath-ar#include "ParseReadsOpts.h" CParseReadsOpts::CParseReadsOpts(void) { this->setDefaults(); } CParseReadsOpts::~CParseReadsOpts(void) { } void CParseReadsOpts::setDefaults(void) { strcpy(this->readsFile, ""); strcpy(this->qualityFile, ""); // Reads files this->cFileFormatSymbol = 'N'; this->truncatedReadLength = MAX_LINE; this->bDiscardReadWithN = true; this->allowedNumOfNinRead = 0; this->bMappedLongRead = false; this->bOddReadLengthAndLongRead = false; this->bMappedSOLiDRead = false; }./Source/PerM.cpp0000644011075700120610000001507711720654362013762 0ustar yanghochmath-ar//============================================================== // Name : PerM // Author : Yangho Chen at University of southern California. 
// Version : 0.4.0 // Copyright : Open source // Description : C++, Ansi-style //============================================================================ #include "PairedReadsMapping.h" #include "ReadsMapping.h" #include "Genome_Index_TableQ.h" #include "PairedReadsSet.h" #include "ReadInBitsSet.h" #include "ReadInBits.h" #include "ParameterList.h" #include "Filename.h" #include "chdir.h" #include "stdafx.h" #include #include #include using namespace std; time_t startt, endt; bool retriveReadSetsAndSettings\ (ParameterList& P, vector& readSetsList1, vector& readSetsList2) { // (1) check reference if (!P.checkRefValidity()) { return(false); } // (2) check reads files if (getReadSetsFilenames(P, readSetsList1, readSetsList2)) { if ((int)readSetsList1.size() > 0) { const char* firstReadSetFileName = readSetsList1.at(0).c_str(); P.bMappedSOLiDRead = P.bMappedSOLiDRead ||\ is_colorspace_reads(P.readsFileFormat) ||\ is_colorspace_reads(firstReadSetFileName); P.cFileFormatSymbol = getReadsFileFormatSymbol(firstReadSetFileName, P.readsFileFormat); P.readLength = getReadLength(firstReadSetFileName, P.cFileFormatSymbol); } else { LOG_INFO("Info %d: Cannot get read set from the read list.\n", ERROR_LOG); return(false); } } else if (atoi(P.readsFile) >= (int)MIN_READ_LENGTH) { P.bMakeIndex = true; P.bSaveIndex = true; P.bMappedSOLiDRead = is_colorspace_reads(P.readsFileFormat); P.readLength = atoi(P.readsFile); // Must call P.truncatReadLength() later. 
} else if (atoi(P.readsFile) > 0) { LOG_INFO("Info %d: Incorrect reads file or the read length is too short\n", ERROR_LOG); return(false); } else { return(false); // can not open the read file } P.truncatReadLength(); // truncated reads according to options or mapping long reads // (3) select seed according to the setting P.seedId = selectSeed(P); // (4) avoid to use the same name if (readSetsList1.size() > 1) P.outputFileN[0] = '\0'; // (5) TODO remove the mask repeat process P.bMaskedMathRepeat \ = (!P.bMatePairedReads && !P.bMappedLongRead && P.bExcludeAmbiguousReads); bool validSetting = true; validSetting &= printOptWarning4PairedEndOpts(P); validSetting &= printOptWarning4PairedEndOpts(P); return(validSetting); } string get_Index_Path(ParameterList P) { if (hasTheExtName(P.refFile, ".index") || P.refFormat == "index" ) { return(string(P.refFile)); } else { string indexPath = default_index_path(getBasename\ (P.refFile), P.bMappedSOLiDRead, P.seedId, P.readLength); return(indexPath); } } void setQueryIndexSubThreshold(CGenome_Index_TableQ& indexTable, unsigned int subDiffThreshold) { const unsigned int MAX_SUB_WHEN_QUERY_TABLE = 20; if (subDiffThreshold > MAX_SUB_WHEN_QUERY_TABLE) { indexTable.uiSubDiffThreshold = MAX_SUB_WHEN_QUERY_TABLE; } else { indexTable.uiSubDiffThreshold = subDiffThreshold; } } bool buildIndexTable(CGenome_Index_TableQ& indexTable, ParameterList& P) { if (P.refFormat == "index" || hasTheExtName(P.refFile, ".index")) { LOG_INFO("\nInfo %d: Index file %s has incorrect format\n", ERROR_LOG, P.refFile); } else if (indexTable.getSeqFromFasta(P.refFile, P.refFormat)) { bool bMaskedMathRepeat = P.bMaskedMathRepeat && (atoi(P.readsFile) != 0); bool bMakeIndexSucessful = indexTable.make_index_table(P.anchorLength, P.seedId, P.bMappedSOLiDRead, bMaskedMathRepeat); if (bMakeIndexSucessful) { if (P.bSaveIndex) { const bool bPrintErrMsg = true; indexTable.save_index_table(P.indexFileN, bPrintErrMsg); } indexTable.bExcludeAmbiguous = 
P.bExcludeAmbiguousReads; setQueryIndexSubThreshold(indexTable, P.subDiffThreshold); return(true); } else { LOG_INFO("\nInfo %d: Faile to build index file %s.\n", ERROR_LOG, P.indexFileN); } } return(false); } /* * The main function construct or read in the index of a set of reference genome. * It maps the reads */ int main(int argc, const char* argv[]) { // testLongBases2Colors(); // (1) Get parameters. ParameterList P = getParameterList(argc, argv); if (argc <= 2) return 0; vector readSetsList1, readSetsList2; bool validSetting = retriveReadSetsAndSettings(P, readSetsList1, readSetsList2); if (validSetting) { P.printSetting(); } else { LOG_INFO("\nInfo %d: Invalid or confusing command. Check the opt\n", CONFIG_LOG); #ifdef WIN32 STRIKE_KEY2CONTINUE; #endif return(-1); } // (2) Build or read index table CGenome_Index_TableQ indexTable; indexTable.uiSubDiffThreshold = (unsigned int)P.subDiffThreshold; string indexPath = get_Index_Path(P); const bool bPrintWarning = false; if (P.bMakeIndex || indexTable.read_index_table(indexPath.c_str(), bPrintWarning) == false) { if (buildIndexTable(indexTable, P) == false ) { return(-1); } } // testGenome_Index_TableQ(&indexTable); // testMappingLongRead(&indexTable); // (3) Mapped reads. 
if ( P.bMatePairedReads && !P.bMappedLongRead ) { if (readSetsList1.size() > 0 && (readSetsList1.size() == readSetsList2.size())) { TIME_INFO(parallelMappingPairedReads\ (readSetsList1, readSetsList2, indexTable, P),"Mapping paired reads\n"); } else { TIME_INFO(parallelMappingPairedReads\ (readSetsList1, indexTable, P), "Mapping paired reads"); } } else if (P.bMatePairedReads && P.bMappedLongRead ) { TIME_INFO(parallelMappingPairedLongReads(readSetsList1, readSetsList2, indexTable, P),\ "Mapped paired-ended long reads"); } else if ( !P.bMatePairedReads && P.bMappedLongRead ) { TIME_INFO(parallelMappingLongReads(readSetsList1, indexTable, P),\ "Mapped single-ended long reads"); } else if ( !P.bMatePairedReads && !P.bMappedLongRead ) { TIME_INFO(parallelMapping(readSetsList1, indexTable, P),\ "Mapped single-ended reads"); } cout << endl; return 0; } ./Source/PerMTest.cpp0000644011075700120610000000034011720654362014605 0ustar yanghochmath-ar// PerM.cpp : Defines the entry point for the console application. 
// #include "stdafx.h" #include "ColorSpaceRead.h" int main(int argc, char* argv[]) { //testShift64Bit(); testLongBases2Colors(); return 0; } ./Source/ReSeq_bits.cpp0000644011075700120610000000000011720654362015133 0ustar yanghochmath-ar./Source/ReadInBits.cpp0000644011075700120610000002356011720654362015077 0ustar yanghochmath-ar#include "stdafx.h" #include "ReadInBits.h" int CReadInBits::iReadLength = 0; //read length must <= WORDSIZE CReadInBits::CReadInBits(void) { } CReadInBits::~CReadInBits(void) { } CReadInBits::CReadInBits(const char* caRead) { encodeRead(caRead, CReadInBits::iReadLength, &this->UpperBits, &this->LowerBits); } CReadInBits::CReadInBits(const char* caRead, int readlength) { encodeRead(caRead, readlength, &this->UpperBits, &this->LowerBits); } unsigned int CReadInBits::encode(const char* caRead) { return(encodeRead(caRead, CReadInBits::iReadLength, &this->UpperBits, &this->LowerBits)); } unsigned int CReadInBits::encode(const char* caRead, int readlength) { return(encodeRead(caRead, readlength, &this->UpperBits, &this->LowerBits)); } // Encode 'N' as 'A' unsigned int CReadInBits::encodeRead_NasA(const char* caRead, int readlength) { return(encodeReadNasA(caRead, readlength, &this->UpperBits, &this->LowerBits)); } char* CReadInBits::decode(char* caRead) { decodeRead(caRead, CReadInBits::iReadLength, this->UpperBits, this->LowerBits); return(caRead); } int* CReadInBits::decode(int* iaRead) const { decodeRead(iaRead, CReadInBits::iReadLength, this->UpperBits, this->LowerBits); return(iaRead); } bool CReadInBits::operator==(const CReadInBits &other) const { /* int shift = (int)wordSize - iReadLength; return((this->UpperBits << shift) == (other.UpperBits << shift) && (this->LowerBits << shift) == (other.LowerBits << shift)); */ return(((this->UpperBits ^ other.UpperBits) | (this->LowerBits ^ other.LowerBits)) << (wordSize - iReadLength) == 0); } bool CReadInBits::operator<(const CReadInBits &other) const { unsigned int spareTail = wordSize - 
iReadLength; WORD_SIZE u1 = this->UpperBits << spareTail; WORD_SIZE u2 = other.UpperBits << spareTail; if (u1 < u2) { return(true); } else if (u1 == u2) { return((this->LowerBits << spareTail) < (other.LowerBits << spareTail)); } else { return(false); } } // uiReadLength must < WORD_SIZE which is 32 bp in 32big machine and 64 bp in 64 bit machine // Each base is encoded into 2 bits: A -> 00, C->01, G->10 and T->11. // These two digits are located in two word, for bits operation. // The first nucleotide is encoded as the last digit. unsigned int encodeRead(const char* caRead, int uiReadLength, WORD_SIZE* encodUpperBits, WORD_SIZE* encodedLowerBits) { WORD_SIZE UpperBits = 0; WORD_SIZE LowerBits = 0; int i = uiReadLength - 1; do { switch (caRead[i]) { case 'A': case 'a': break; case 'C': case 'c': LowerBits ++; break; case 'G': case 'g': UpperBits ++; break; case 'T': case 't': UpperBits ++; LowerBits ++; break; case 'N': case 'n': case '.': return (1); //invalid read default: cout << "Unexpected character: " << caRead[i] << BLANK_LINE<< endl; return (1); //invalid read } i--; if (i >= 0) { UpperBits <<= 1; //shift 1 LowerBits <<= 1; } } while ( i >= 0); *encodUpperBits = UpperBits; *encodedLowerBits = LowerBits; return (0); } // Encode 'N' as 'A' unsigned int encodeReadNasA(const char* caRead, int uiReadLength, WORD_SIZE* encodUpperBits, WORD_SIZE* encodedLowerBits) { WORD_SIZE UpperBits = 0; WORD_SIZE LowerBits = 0; int i = uiReadLength - 1; do { switch (caRead[i]) { case 'A': case 'a': case 'N': case '.': break; case 'C': case 'c': LowerBits ++; break; case 'G': case 'g': UpperBits ++; break; case 'T': case 't': UpperBits ++; LowerBits ++; break; default: cout << "Unexpected character: " << caRead[i] << " in " << caRead << endl; return (1); //invalid read } i--; if (i >= 0) { UpperBits <<= 1; //shift 1 LowerBits <<= 1; } } while ( i >= 0); *encodUpperBits = UpperBits; *encodedLowerBits = LowerBits; return (0); } unsigned int decodeRead(int* iaRead, int 
iReadLength, WORD_SIZE UpperBits, WORD_SIZE LowerBits)
{
    // Writes one 2-bit code (upper bit << 1 | lower bit) per base, starting
    // from the least significant bit, and terminates the array with -1.
    // iaRead must have room for iReadLength + 1 ints.
    int i;
    for (i = 0; i < iReadLength; i++) {
        WORD_SIZE c = (UpperBits & 0x01) << 1 | (LowerBits & 0x01);
        iaRead[i] = (int)c;
        LowerBits >>= 1;
        UpperBits >>= 1;
    }
    iaRead[i] = -1;
    return 0;
}

// Decode iReadLength bases into caRead as characters (00 -> 'A', 01 -> 'C',
// 10 -> 'G', 11 -> 'T') and NUL-terminate it. caRead must have room for
// iReadLength + 1 chars. Always returns 0.
unsigned int decodeRead(char* caRead, int iReadLength, WORD_SIZE UpperBits, WORD_SIZE LowerBits)
{
    int i;
    for (i = 0; i < iReadLength; i++) {
        WORD_SIZE c = (UpperBits & 0x01) << 1 | (LowerBits & 0x01);
        switch (c) {
        case 0x00:
            caRead[i] = 'A';
            break;
        case 0x01:
            caRead[i] = 'C';
            break;
        case 0x02:
            caRead[i] = 'G';
            break;
        case 0x03:
            caRead[i] = 'T';
            break;
        default: // not reachable: c is built from two single bits
            caRead[i] = 'N';
        }
        LowerBits >>= 1;
        UpperBits >>= 1;
    }
    caRead[i] = '\0';
    return 0;
}

// Reverse-complement an encoded read in place. Inverting both words maps each
// 2-bit code to its bitwise complement; reversing the whole word and shifting
// right by (wordSize - uiReadLength) then brings the reversed read back to the
// low bit positions.
void reverseCompliment(unsigned int uiReadLength, WORD_SIZE* pUpperBits, WORD_SIZE* pLowerBits)
{
    WORD_SIZE LowerBits = ~(*pLowerBits);
    WORD_SIZE UpperBits = ~(*pUpperBits);
#ifdef __32BITS__
    UpperBits = reverse32bits(UpperBits);
    LowerBits = reverse32bits(LowerBits);
    unsigned int shifts = (wordSize - uiReadLength);
#else
    UpperBits = reverse64bits(UpperBits);
    LowerBits = reverse64bits(LowerBits);
    unsigned int shifts = (wordSize - uiReadLength);
#endif
    (*pUpperBits) = UpperBits >> shifts;
    (*pLowerBits) = LowerBits >> shifts;
}

// Value-returning overload: r is passed by value, reverse-complemented and returned.
CReadInBits reverseCompliment(unsigned int uiReadLength, CReadInBits r)
{
    reverseCompliment(uiReadLength, &(r.UpperBits), &(r.LowerBits));
    return(r);
}

// return number of bits set
// WIN32 and _WIN64 builds use a loop that clears the lowest set bit on each
// pass; every other build uses __builtin_popcountll.
inline unsigned int bitsSetCount(WORD_SIZE bits)
{
    // magic function to calculate how many ones are there
#ifdef WIN32
    unsigned int c; // c accumulates the total bits set in v
    for (c = 0; bits; c++) {
        bits &= bits - 1; // clear the least significant bit set
    }
    return (c);
#else
#ifdef _WIN64
    unsigned int c; // c accumulates the total bits set in v
    for (c = 0; bits; c++) {
        bits &= bits - 1; // clear the least significant bit set
    }
    return (c);
    /*
    bits = ((bits & 0xAAAAAAAAAAAAAAAA) >> 1) + (bits & 0x5555555555555555);
    bits = ((bits & 0xCCCCCCCCCCCCCCCC) >> 2) + (bits & 0x3333333333333333);
    bits = ((bits & 0xF0F0F0F0F0F0F0F0) >> 4) + (bits & 0x0F0F0F0F0F0F0F0F);
    bits = ((bits & 0xFF00FF00FF00FF00) >> 8) + (bits & 0x00FF00FF00FF00FF);
    bits = ((bits & 0xFFFF0000FFFF0000) >> 16) + (bits & 0x0000FFFF0000FFFF);
    bits = ((bits & 0xFFFFFFFF00000000) >> 32) + (bits & 0x00000000FFFFFFFF);
    return (unsigned int)(bits);
    */
#else
    return(__builtin_popcountll(bits));
#endif
#endif
}

// Count the positions at which r1 and r2 differ. A position differs when its
// upper bit or its lower bit differs. All bit positions of the word are compared.
unsigned int bitsStrCompare(CReadInBits r1, CReadInBits r2)
{
    WORD_SIZE bits = (r1.UpperBits ^ r2.UpperBits) | (r1.LowerBits ^ r2.LowerBits);
    return(bitsSetCount(bits));
}

// compare only the last N bases (bits)
// The difference word is shifted left by (wordSize - N), so only its lowest N
// bit positions are counted.
unsigned int bitsStrNCompare(CReadInBits r1, CReadInBits r2, unsigned int N)
{
    WORD_SIZE bits = (r1.UpperBits ^ r2.UpperBits) | (r1.LowerBits ^ r2.LowerBits);
    bits <<= (wordSize - N);
    return(bitsSetCount(bits));
    /*
    // magic function to calculate how many ones are there
    #ifdef WIN32
    unsigned int c; // c accumulates the total bits set in v
    for (c = 0; bits; c++) {
        bits &= bits - 1; // clear the least significant bit set
    }
    return (c);
    #else
    #ifdef _WIN64
    unsigned int c; // c accumulates the total bits set in v
    for (c = 0; bits; c++) {
        bits &= bits - 1; // clear the least significant bit set
    }
    return (c);
    bits = ((bits & 0xAAAAAAAAAAAAAAAA) >> 1) + (bits & 0x5555555555555555);
    bits = ((bits & 0xCCCCCCCCCCCCCCCC) >> 2) + (bits & 0x3333333333333333);
    bits = ((bits & 0xF0F0F0F0F0F0F0F0) >> 4) + (bits & 0x0F0F0F0F0F0F0F0F);
    bits = ((bits & 0xFF00FF00FF00FF00) >> 8) + (bits & 0x00FF00FF00FF00FF);
    bits = ((bits & 0xFFFF0000FFFF0000) >> 16) + (bits & 0x0000FFFF0000FFFF);
    bits = ((bits & 0xFFFFFFFF00000000) >> 32) + (bits & 0x00000000FFFFFFFF);
    return (unsigned int)(bits);
    #else
    return(__builtin_popcountll(bits));
    #endif
    #endif
    */
}

// skip the first M base and compare the following N base pairs
unsigned int bitsStrMNCompare(CReadInBits r1, CReadInBits r2, unsigned int M, unsigned int N)
{
    WORD_SIZE bits = (r1.UpperBits ^ r2.UpperBits) | (r1.LowerBits ^ r2.LowerBits);
    bits >>= M;
    bits <<=
(wordSize - N); return(bitsSetCount(bits)); } unsigned int encodeLongRead(const char* read, CReadInBits& firstHalf, CReadInBits& secondHalf) { int readLength = (int)strlen(read); int secondHalfStart = readLength - CReadInBits::iReadLength; if (read != NULL && readLength > (int)MAX_READ_LENGTH) { secondHalf.encode(&read[secondHalfStart]); firstHalf.encode(read); return(0); // Potential error for buffer overflow } else { return(1); } } unsigned int decodeLongRead(CReadInBits& firstHalf, CReadInBits& secondHalf, char* read, bool oddReadLength) { int secondHalfStart; if (oddReadLength) { secondHalfStart = CReadInBits::iReadLength - 1; } else { secondHalfStart = CReadInBits::iReadLength; } if (read != NULL) { firstHalf.decode(read); secondHalf.decode(&read[secondHalfStart]); read[secondHalfStart + CReadInBits::iReadLength] = '\0'; return(0); // Potential error for buffer overflow } else { return(1); } }./Source/ReadInBitsSet.cpp0000644011075700120610000002666711720654362015566 0ustar yanghochmath-ar/* * CReadsSet.cpp * * Created on: Jan 18, 2009 * Author: yanghoch * This class reads and store DNA short reads with fixed length * It will firstly get reads from a file (fasta, quality score or txt format), * by the extended name of the input files. 
*/ #include "ReadInBitsSet.h" #ifndef MAX_PATH const int MAX_PATH = 2048; #endif CReadInBitsSet::CReadInBitsSet() { this->initialization(); } CReadInBitsSet::~CReadInBitsSet() { delete this->pReadsSet; delete this->pReadsID; delete this->pMismatchScores; delete this->pQualScores; } CReadInBitsSet::CReadInBitsSet(unsigned int Capacity, unsigned int uiReadLength, unsigned int allowedNumOfNinRead) { this->initialization(); this->uiRead_Length = uiReadLength; this->allowedNumOfNinRead = allowedNumOfNinRead; this->bDiscardReadWithN = (allowedNumOfNinRead == 0); CReadInBits::iReadLength = (int)uiReadLength; // Set the Kmer has length this->pReadsSet = new vector(); this->pReadsSet->reserve(Capacity); this->pReadsID = new vector(); this->pReadsID->reserve(Capacity); if (this->pReadsSet == NULL || this->pReadsID == NULL) { ERR; // Fail to new space storing read } } /* * Given a input filename and the length of the input read. * Get reads and save in the data structure of Vector */ CReadInBitsSet::CReadInBitsSet(const char* InputFile, const char* fileFormat,\ unsigned int uiReadStartIndex, unsigned int uiReadLength, unsigned int allowedNumOfNinRead) { this->initialization(); myStrCpy(this->InputFile, InputFile, FILENAME_MAX); this->uiRead_Length = uiReadLength; this->parser.caNextRead[uiReadLength] = '\0'; this->allowedNumOfNinRead = allowedNumOfNinRead; this->bDiscardReadWithN = (allowedNumOfNinRead == 0); this->pReadsSet = new vector(); this->pReadsID = new vector(); if (this->pReadsSet == NULL || this->pReadsID == NULL) { ERR; // Fail to new space storing read } if (fileExist(InputFile) ) { this->openAFileReady2GetRead(InputFile, fileFormat, uiReadStartIndex); } else { LOG_INFO("Info %d: File %s is not available!\n", WARNING_LOG, InputFile); } } int CReadInBitsSet::initialization(void) { this->InputFile[0] = '\0'; this->uiRead_Length = 0; this->uiNo_of_Reads = 0; this->uiNo_of_Bad_Reads = 0; //Counting the reads with low quality score this->pReadsSet = NULL; 
this->pReadsID = NULL; this->pMismatchScores = NULL; this->pQualScores = NULL; this->bDiscardReadWithN = true; return(0); } // clear and reserve the capacity for read, readId and quality int CReadInBitsSet::clear(int capacity) { if (this->pReadsSet != NULL) { this->pReadsSet->clear(); this->pReadsSet->reserve(capacity); if (this->pReadsID != NULL) { this->pReadsID->clear(); this->pReadsID->reserve(capacity); } if (this->pQualScores != NULL) { this->pQualScores->clear(); this->pQualScores->reserve(capacity); } return(0); } return(1); } void CReadInBitsSet::handleBadread(void) { this->uiNo_of_Bad_Reads++; while (this->pReadsID->size() > this->pReadsSet->size()) { this->pReadsID->pop_back(); } } void CReadInBitsSet::setBadReadOutputFile(FileOutputBuffer* pOut) { this->parser.pOBuf = pOut; } // get reads from the file and store (append) in a vector. Return how many reads are read-in. unsigned int CReadInBitsSet::openAFileReady2GetRead(const char* InputFile, const char* fileFormat,\ unsigned int uiReadStartIndex) { this->cFileType = this->parser.openAFileReady2GetRead (InputFile, fileFormat, \ uiReadStartIndex, this->uiRead_Length, this->bDiscardReadWithN); bool bGetQScores = (this->cFileType == 'Q' || this->cFileType == 'q'); if ( this->cFileType == 'N') { return(0); } if (bGetQScores) { this->pQualScores = new CReadsQualScores(this->uiRead_Length, BUFFERED_READS_SIZE); } else if (this->cFileType == 'S') { // check if the quality score with the same basename exist or not this->openAFileReady2GetReadQSinQUAL(InputFile, this->uiRead_Length); } this->clear(BUFFERED_READS_SIZE); return(BUFFERED_READS_SIZE); } unsigned int CReadInBitsSet::get_next_capacity_reads(int capacity, char sep) { if (this->uiNo_of_Reads > 0) { printf("Deal read no. 
%u in %s.\r", this->uiNo_of_Reads, this->InputFile); } fflush(stdout); bool bGetQScoresinQUAL = (this->pQualScores != NULL && this->cFileType == 'S'); bool bGetQScoresInFastq = (this->pQualScores != NULL) && \ (this->cFileType == 'Q' || this->cFileType == 'q'); bool bSOLiDReadFormat = (this->cFileType == 'Q' || this->cFileType == 'S'); this->clear(capacity); do { parser.get_Next_Read(); // the next read are in this->parser.caNextRead bool isABadRead = isBadRead(bSOLiDReadFormat, this->parser.caNextRead, this->parser.uiRead_Length); if (this->parser.caNextRead[0] == '\0') { this->parser.pBuf->fflush(); break; // End of the file } else if (!isABadRead && this->save_next_read(this->parser.caNextRead, bSOLiDReadFormat)) { this->save_next_read_id(this->parser.caNextReadTag, sep); if (bGetQScoresInFastq) { this->pQualScores->addQSs(this->parser.caNextReadQSs); } } else { this->parser.print_Next_Read(); this->handleBadread(); } } while (this->pReadsSet->size() < this->pReadsSet->capacity()); for (unsigned int i = (unsigned int)this->pReadsID->size(); \ i > this->pReadsSet->size(); i--) { this->pReadsID->pop_back(); // remove extra tags } if (bGetQScoresinQUAL) { this->pQualScores->getQualityScoresFromQUAL(this->pReadsID); } return((unsigned int)this->pReadsSet->size()); } void CReadInBitsSet::ignoreQScores(void) { if (this->pQualScores != NULL) { delete this->pQualScores; this->pQualScores = NULL; } } inline void makeFakeReadId(char* readIdBuf, unsigned int readIdNo) { printf("\r%uth read has no tag.", readIdNo); sprintf(readIdBuf, "fakeTag%u", readIdNo); } void getReadIdFromTagLine(char* readIdStr, const char* tagLine, unsigned int readIdNo, char sep = ',') { int trimStart; for (trimStart = 1; tagLine[trimStart]!= '\0'; trimStart++) { if (!(isspace(tagLine[trimStart]) || tagLine[trimStart] == sep)) { break; } } if ( tagLine[trimStart] == '\0') { makeFakeReadId(readIdStr, readIdNo); } else { strncpy(readIdStr, &tagLine[trimStart], READ_ID_LENGTH - 1); 
readIdStr[READ_ID_LENGTH - 1] = '\0';
        formatReadId(readIdStr, sep);
    }
}

// Copy the id of read number `no` into readId, or write "Fake_Read_<no+1>"
// when no id is stored at that position. readId must be large enough for the id.
// NOTE(review): a negative `no` passes the size test and reaches at(no) - confirm
// callers never pass one.
void CReadInBitsSet::get_read_id(int no, char* readId)
{
    if ((int)this->pReadsID->size() > no) {
        strcpy(readId, this->pReadsID->at(no).id);
    } else {
        sprintf(readId, "Fake_Read_%d", no + 1);
    }
}

// Return true when the read holds at most `threshold` characters that are
// '.' or 'N' (uppercase only). Returns false as soon as the count exceeds it.
inline bool isReadHasNlessThan(const char* read, int threshold)
{
    int counter = 0;
    for (int i = 0; read[i] != '\0'; i++) {
        if (read[i] == '.' || read[i] == 'N') {
            counter ++;
            if (counter > threshold) {
                return(false);
            }
        }
    }
    return(true);
}

// Encode readSeq and append it to the read vector.
// Colour-space reads go through encodeColors / encodeColorsNas3, base-space
// reads through CReadInBits::encode / encodeRead_NasA. The second variant of
// each pair is used when reads containing N are kept, and only if the read
// passes isReadHasNlessThan(allowedNumOfNinRead).
// Returns true and increments uiNo_of_Reads when the read was stored.
bool CReadInBitsSet::save_next_read(const char* readSeq, bool bSOLiDReadFormat)
{
    bool goodRead;
    CReadInBits r;
    if (bSOLiDReadFormat) {
        if (this->bDiscardReadWithN) {
            goodRead = encodeColors(readSeq, r);
        } else {
            if (isReadHasNlessThan(readSeq, this->allowedNumOfNinRead)) {
                goodRead = encodeColorsNas3(readSeq, r);
            } else {
                goodRead = false;
            }
        }
        if (goodRead) {
            this->uiNo_of_Reads++;
            this->pReadsSet->push_back(r);
            return(true);
        }
    } else {
        if (this->bDiscardReadWithN) {
            goodRead = (r.encode(readSeq, this->uiRead_Length) == 0);
        } else {
            if (isReadHasNlessThan(readSeq, this->allowedNumOfNinRead)) {
                goodRead = (r.encodeRead_NasA(readSeq, this->uiRead_Length) == 0);
            } else {
                goodRead = false;
            }
        }
        if (goodRead) {
            this->uiNo_of_Reads++; // No of newly read-in reads
            this->pReadsSet->push_back(r);
            return(true);
        }
    }
    return(false);
}

// Store the id for the read that was just saved, keeping the id vector the
// same length as the read vector:
//  - exactly one id short: append the id from tagLine;
//  - already as many ids as reads (or more): replace the last id;
//  - more than one id short: pad with fake ids, then append the real one.
void CReadInBitsSet::save_next_read_id(const char* tagLine, char sep)
{
    CReadID tag; // Save the read ID in the vector
    unsigned int readIdNo = (unsigned int)this->pReadsID->size();
    if ((readIdNo + 1) == this->pReadsSet->size()) {
        getReadIdFromTagLine(tag.id, tagLine, readIdNo, sep);
        this->pReadsID->push_back(tag);
    } else if (readIdNo >= this->pReadsSet->size()) {
        printf("\r%uth read has more than one tag.", readIdNo - 1);
        getReadIdFromTagLine(tag.id, tagLine, readIdNo, sep);
        this->pReadsID->pop_back();
        this->pReadsID->push_back(tag);
    } else {
        // Make fake read Id tags
        for (int readID = readIdNo + 1; readID < (int)this->pReadsSet->size(); readID++) {
            makeFakeReadId(tag.id, readID);
            this->pReadsID->push_back(tag);
        }
        getReadIdFromTagLine(tag.id, tagLine, readIdNo, sep);
        this->pReadsID->push_back(tag);
    }
}

// Write every read whose mismatch score is >= missMatchScoreT to outputfile,
// one "<index>,<sequence>" line per read. Reads are decoded as colours when
// the file type is 'S'.
// Returns the number of reads written, or -1 when the set is empty or has no
// mismatch scores.
int printMissReads(const char* outputfile, CReadInBitsSet& readsSet, int missMatchScoreT)
{
    if (readsSet.uiNo_of_Reads > 0 && readsSet.pMismatchScores != NULL) {
        ofstream ofile(outputfile);
        int missReadsNo = 0;
        for (unsigned int i = 0; i < readsSet.pReadsSet->size(); i++) {
            char caRead[wordSize + 1];
            if ((int)readsSet.pMismatchScores->mismatchScore[i] >= missMatchScoreT) {
                if (readsSet.cFileType == 'S') { // csfasta format for solid read
                    decodeColors(caRead, readsSet.pReadsSet->at(i));
                } else {
                    readsSet.pReadsSet->at(i).decode(caRead);
                }
                ofile << i << ',' << caRead << endl;
                missReadsNo++;
            }
        }
        ofile.close();
        return(missReadsNo);
    } else {
        LOG_INFO("Info %d: Reads set is empty or haven't been mapped!\n", INFO_LOG);
    }
    return(-1);
}

// Locate and open the quality-score file that belongs to InputFile.
// Candidate names are tried in order by changing the extension: ".QUAL",
// ".qual", "_QV.QUAL", and then ".qual" applied to that last name.
// On success any previous score object is replaced and BUFFERED_READS_SIZE
// is returned; 0 is returned when no candidate exists.
unsigned int CReadInBitsSet::openAFileReady2GetReadQSinQUAL(const char* InputFile, unsigned int readQsLength)
{
    char qualFile[MAX_LINE];
    strcpy(qualFile, InputFile);
    chExtName(qualFile, ".QUAL");
    if (!fileExist(qualFile)) {
        chExtName(qualFile, ".qual");
        if (!fileExist(qualFile)) {
            chExtName(qualFile, "_QV.QUAL");
            if (!fileExist(qualFile)) {
                chExtName(qualFile, ".qual");
            }
        }
    }
    if (fileExist(qualFile)) {
        LOG_INFO("Info %d: Get quality scores from %s!\n", INFO_LOG, qualFile);
        if (this->pQualScores != NULL) {
            delete this->pQualScores;
            this->pQualScores = NULL;
        }
        if (this->pQualScores == NULL) {
            this->pQualScores = new CReadsQualScores(readQsLength, BUFFERED_READS_SIZE);
            this->pQualScores->openQUALfile(qualFile);
        }
        return(BUFFERED_READS_SIZE);
    } else {
        LOG_INFO("Info %d: Quality score file %s is not available!\n", INFO_LOG, qualFile);
    }
    return(0);
}
./Source/ReadsFileParser.cpp0000644011075700120610000004716711720654362016137 0ustar yanghochmath-ar
#include "ReadsFileParser.h"

// True for the two symbols that stand for an unknown base: '.' and 'N'.
inline bool isNorDot(char c)
{
    return(c == '.'
|| c == 'N'); } CReadsFileParser::CReadsFileParser(void) { this->initialization(); } CReadsFileParser::~CReadsFileParser(void) { ifile.close(); delete this->pBuf; // delete this->pOBuf; // Don't delete it. It is setup/destroy from outside. } int CReadsFileParser::initialization(void) { this->pBuf = NULL; this->pOBuf = NULL; this->cFileType = 'F'; myStrCpy(this->caNextRead,"\0", FILENAME_MAX); myStrCpy(this->caNextReadTag, "\0", FILENAME_MAX); myStrCpy(this->caNextReadQSs,"\0", FILENAME_MAX); myStrCpy(this->InputFile, "\0", FILENAME_MAX); this->uiRead_Length = 0; this->readStartIndex = 0; // The bases before the start index (5' end) will be removed. return(0); } char CReadsFileParser::openAFileReady2GetRead\ (const char* filename, const char* fileFormat, unsigned int readStartIndex, unsigned int uiRead_Length, bool bDiscardReadsWN, FileOutputBuffer* pBadReadBuf) { this->bDiscardReadWN = bDiscardReadsWN; this->readStartIndex = readStartIndex; this->uiRead_Length = uiRead_Length; this->cFileType = getReadsFileFormatSymbol(filename, fileFormat); myStrCpy(this->InputFile, filename, MAX_PATH); this->ifile.open(InputFile); if (this->ifile.bad() || !fileExist(InputFile)) { char tmpWorkDir[MAX_PATH]; get_working_directory(tmpWorkDir); LOG_INFO("\nInfo %d: Cannot open reads file %s in %s\n",\ WARNING_LOG, InputFile, tmpWorkDir); return(this->cFileType); } else { this->pBuf = new FileInputBuffer(READS_INPUT_BUFFER_SIZE, &ifile); if (this->pBuf == NULL) { ERR; // Fail to new FileInputBuffer return('N'); } // The initial of the output buffer is outside the class this->pOBuf = pBadReadBuf; } return(this->cFileType); } /* * This function read in the next read from the inputfile (buffer) this->pBuf. * Currently it only read the sequence (no quality) in the following format. 
*/ char* CReadsFileParser::get_Next_Read(void) { // If there are still something to read from file or buffer // The kmer will be read to this->caNextRead and return it this->caNextRead[0] = '\0'; if (this->pBuf == NULL) { LOG_INFO("\nInfo %d: Read File Buffer is NULL\n", WARNING_LOG); } else if (this->pBuf->ready2Read()) { switch (this->cFileType) { case 'F': get_Next_Read_From_Fasta(); break; case 'S': get_Next_Read_From_csFasta(); break; case 'q': get_Next_Read_From_Fastq(); break; case 'Q': get_Next_Read_From_csFastq(); break; default: get_Next_Read_From_Fasta(); } } if (ifile.eof() == true) { ifile.close(); } return(this->caNextRead); } void CReadsFileParser::print_Next_Read(void) { if (this->pOBuf == NULL) { LOG_INFO("\nInfo %d: Read Output Buffer is NULL\n", WARNING_LOG); } else { unsigned int theReadLength = (unsigned int)strlen(this->caNextRead); switch (this->cFileType) { case 'q': trQScores(theReadLength, SolexaScoreEncodingShift, this->caNextReadQSs, this->caNextReadQSs); sprintf(this->pOBuf->caBufp, "@%s\n%s\n+\n%s\n", this->caNextReadTag, this->caNextRead, this->caNextReadQSs); this->pOBuf->fflush(); break; case 'Q': trQScores(theReadLength, Phred_SCALE_QUAL_SHIFT, this->caNextReadQSs, this->caNextReadQSs); sprintf(this->pOBuf->caBufp, "@%s\n%s\n+\n%s\n", this->caNextReadTag, this->caNextRead, this->caNextReadQSs); this->pOBuf->fflush(); break; case 'F': case 'S': default: sprintf(this->pOBuf->caBufp, ">%s\n%s\n", this->caNextReadTag, this->caNextRead); this->pOBuf->fflush(); } } } char* CReadsFileParser::get_Next_Read_From_Fasta(void) { char caBuf[ MAX_CHAR_PER_LINE ]; char* pch; while (1) { // Line which don't start with '>' or nucleotide symbol will be ignored. 
if (this->pBuf->Getline(caBuf, MAX_CHAR_PER_LINE - 1) == 0) { this->caNextRead[0] = '\0'; // It is file end break; } else { pch = strtok(caBuf, " ,\t\n"); //This should be the name if (pch == NULL) { this->caNextRead[0] = '\0'; // It is file end break; } else if (pch[0] == '>' && !ifile.eof()) { // TODO fix the problem when a space after '>' by getReadIdFromTagLine() myStrCpy(this->caNextReadTag, pch, FILENAME_MAX); continue; } else if ( !isNucleotide(pch[0]) && !isNorDot(pch[0]) && !ifile.eof()) { continue; } unsigned int line_length = (unsigned int)strlen(pch); if (line_length >= uiRead_Length) { const char* readSeq = &(pch[this->readStartIndex]); strncpy(this->caNextRead, readSeq, uiRead_Length); this->caNextRead[uiRead_Length] = '\0'; break; } else if (line_length < uiRead_Length) { // May be the read is too long so is print in the next line this->pBuf->Getline(&caBuf[line_length], MAX_CHAR_PER_LINE - line_length); line_length = (unsigned int)strlen(caBuf); if (line_length < uiRead_Length) { LOG_INFO("Info %d: Read %s %s is %u bp < expected length %u bp.\n",\ WARNING_LOG, this->caNextReadTag, caBuf, line_length, uiRead_Length); this->caNextRead[0] = '\0'; continue; } else if (line_length > uiRead_Length) { int minUiReadLength = (int)uiRead_Length; int iBufSize = MAX_LINE; if ( minUiReadLength >= iBufSize) { minUiReadLength = iBufSize - 1; } strncpy(this->caNextRead, caBuf, minUiReadLength); this->caNextRead[minUiReadLength] = '\0'; break; } else { myStrCpy(this->caNextRead, pch, FILENAME_MAX); break; } } } } return(this->caNextRead); } #define GET_LINE_UNTIL(pBuf, caBuf, syntex) {\ while (1) {\ if(pBuf->Getline(caBuf, MAX_CHAR_PER_LINE - 1) == 0) {\ return(false);\ }\ syntex;\ }\ return(true);\ } inline bool CReadsFileParser::getNextSeqNameInFq(FileInputBuffer* pBuf, char* caBuf) { GET_LINE_UNTIL(pBuf, caBuf, { if ( caBuf[0] == '@' && !ifile.eof()) { myStrCpy(this->caNextReadTag, caBuf, FILENAME_MAX); return(true); } else if (caBuf[0] == '>' ) { LOG_INFO("Info 
%d: Invalid fastq file. Is it a fasta file?\n", WARNING_LOG); return(false); } }) } inline bool CReadsFileParser::getNextLine(FileInputBuffer* pBuf, char* caBuf, const char exp1stChar) { GET_LINE_UNTIL(pBuf, caBuf, { if ( caBuf[0] == exp1stChar && !ifile.eof()) { return(true); } }) } inline bool CReadsFileParser::getNextSeqInFq (FileInputBuffer* pBuf, char* caBuf, unsigned int expLength) { /* * The while loop provide certain input error correction that will read until * a line that start with a nucleotide or a dot. */ while (1) { if(pBuf->Getline(caBuf, MAX_CHAR_PER_LINE - 1) == 0) { return(false); } unsigned int line_length = 0; if ((isNucleotide(caBuf[0]) || isNorDot(caBuf[0])) && !ifile.eof()) { line_length = (unsigned int)strlen(caBuf); // TODO: double check when the readStartIndex > 0. (line_length - this->readStartIndex >= expLength ?) if ( line_length >= expLength && line_length > this->readStartIndex ) { unsigned int theReadLength = min(expLength, MAX_LONG_READ_LENGTH); myStrCpy(this->caNextRead, &caBuf[this->readStartIndex], MAX_LONG_READ_LENGTH + 1); this->caNextRead[theReadLength] = '\0'; return(true); } else { if(this->pOBuf == NULL) { LOG_INFO("Info %d: Read %s:%s has length %d < the expected %u bp\n",\ WARNING_LOG,this->caNextReadTag, caBuf, line_length, uiRead_Length); } // Keep the shorter read in buffer myStrCpy(this->caNextRead, &caBuf[this->readStartIndex], MAX_LONG_READ_LENGTH + 1); unsigned int theReadLength = min(line_length, MAX_LONG_READ_LENGTH); this->caNextRead[theReadLength] = '\0'; return(false); } } } } inline bool CReadsFileParser::getNextQScoreInFq(FileInputBuffer* pBuf, char* caBuf, unsigned int expLength) { GET_LINE_UNTIL(pBuf, caBuf, { unsigned int line_length = (unsigned int)strlen(caBuf); if (line_length > this->readStartIndex) { if (line_length >= expLength) { return(true); } else { return(false); } } else { return(false); } }) } char* CReadsFileParser::get_Next_Read_From_Fastq(void) { char caBuf[ MAX_CHAR_PER_LINE ]; bool 
bHasReadName = getNextSeqNameInFq(this->pBuf, caBuf); bool bHasLongReadSeq = getNextSeqInFq(this->pBuf, caBuf, this->uiRead_Length); bool bHas3rdLine = getNextLine(this->pBuf, caBuf,'+'); bool bHasLongEnoughQualityScore = getNextQScoreInFq(this->pBuf, caBuf, this->uiRead_Length + this->readStartIndex); if ( bHasReadName && bHasLongReadSeq && bHas3rdLine && bHasLongEnoughQualityScore) { const char* qScoreBuf = &caBuf[this->readStartIndex]; trQScores(this->uiRead_Length, -1 * SolexaScoreEncodingShift, qScoreBuf, this->caNextReadQSs); // sucessfully get a read } else if (!bHasLongReadSeq) { // Reads shorter than expected const char* qScoreBuf = &caBuf[this->readStartIndex]; unsigned int shorterReadLength = (unsigned int)strlen(this->caNextRead); trQScores(shorterReadLength, -1 * SolexaScoreEncodingShift, qScoreBuf, this->caNextReadQSs); caNextRead[shorterReadLength] = '\0'; } else if (!bHasLongEnoughQualityScore) { // Not long enough quality score for(unsigned int i = 0; i < this->uiRead_Length; i++) this->caNextReadQSs[i] = -1 * SolexaScoreEncodingShift; // use sudo score } else { // if EOF (file end) caNextRead[0] = '\0'; ifile.close(); } return(this->caNextRead); } char* CReadsFileParser::get_Next_Read_From_csFastq(void) { /* char caBuf[ MAX_CHAR_PER_LINE ]; if (getNextSeqNameInFq(this->pBuf, caBuf) &&\ getNextSeqInFq(this->pBuf, caBuf, this->uiRead_Length + 1) &&\ getNextLine(this->pBuf, caBuf,'+') &&\ getNextQScoreInFq(this->pBuf, caBuf, this->uiRead_Length)) { trQScores(this->uiRead_Length, -1 * Phred_SCALE_QUAL_SHIFT, caBuf, this->caNextReadQSs); // sucessfully get a read } else { // if EOF (file end) caNextRead[0] = '\0'; ifile.close(); }*/ char caBuf[ MAX_CHAR_PER_LINE ]; bool bHasReadName = getNextSeqNameInFq(this->pBuf, caBuf); bool bHasLongReadSeq = getNextSeqInFq(this->pBuf, caBuf, this->uiRead_Length + 1); bool bHas3rdLine = getNextLine(this->pBuf, caBuf,'+'); bool bHasLongEnoughQualityScore = getNextQScoreInFq(this->pBuf, caBuf, this->uiRead_Length + 
this->readStartIndex); if ( bHasReadName && bHasLongReadSeq && bHas3rdLine && bHasLongEnoughQualityScore) { const char* qScoreBuf = &caBuf[this->readStartIndex]; trQScores(this->uiRead_Length, -1 * Phred_SCALE_QUAL_SHIFT, qScoreBuf, this->caNextReadQSs); // sucessfully get a read } else if (!bHasLongReadSeq) { // Reads shorter than expected unsigned int shorterReadLength = (unsigned int)strlen(this->caNextRead); trQScores(shorterReadLength, -1 * Phred_SCALE_QUAL_SHIFT, caBuf, this->caNextReadQSs); caNextRead[shorterReadLength] = '\0'; } else if (!bHasLongEnoughQualityScore) { // Not long enough quality score for(unsigned int i = 0; i < this->uiRead_Length; i++) this->caNextReadQSs[i] = -1 * Phred_SCALE_QUAL_SHIFT; // use sudo score } else { // if EOF (file end) caNextRead[0] = '\0'; ifile.close(); } return(this->caNextRead); } inline char getNtBaseFromSOLiDRead(const char* SOLiDRead, int position) { const char* colors = &(SOLiDRead[1]); char nt = SOLiDRead[0]; return(getBaseFromColors(nt, colors, position)); } char* CReadsFileParser::get_Next_Read_From_csFasta(void) { // A polished version of et_Next_Read_From_fasta char caBuf[ MAX_CHAR_PER_LINE ]; while (1) { caBuf[0] = '\0'; //Note this->pBuf->Getline() will return 0 if EOF is meet O if (this->pBuf->Getline(caBuf, MAX_CHAR_PER_LINE - 1) == 0) { this->caNextRead[0] = '\0'; // It is file end ifile.close();// Must close the file break; } else { //If this line is header, new line, comment or null line however not EOF, read the next line if (caBuf[0] == '>' && !ifile.eof() ) { myStrCpy(this->caNextReadTag, caBuf, FILENAME_MAX); continue; } else if ( !isNucleotide(caBuf[0]) && !isNorDot(caBuf[0]) && !ifile.eof()) { continue; } else { unsigned int line_length = (unsigned int)strlen(caBuf); // Note the first base is from the primer not chromosome // The read length is counted as the # of colors 0, 1, 2, 3 if (line_length == (uiRead_Length + 1)) { myStrCpy(this->caNextRead, caBuf, MAX_LINE); break; } else if (line_length 
> (uiRead_Length + 1)) { char firstBase = getNtBaseFromSOLiDRead(caBuf, this->readStartIndex); char* readsStartInBuf = &caBuf[this->readStartIndex]; *readsStartInBuf = firstBase; // Truncated the prefix for SOLiD read strncpy(this->caNextRead, readsStartInBuf, uiRead_Length + 1); this->caNextRead[uiRead_Length + 1] = '\0'; break; } else { LOG_INFO("Info %d: Read %s %s is %u bp < expected length %u bp.\n",\ WARNING_LOG, this->caNextReadTag, caBuf, line_length, uiRead_Length); this->caNextRead[0] = '\0'; continue; } } } } return(this->caNextRead); } /* unsigned int estimateNoOfReads(const char* fileName, const char* fileFormat) { char fileType = getReadsFileFormatSymbol(fileName, fileFormat); switch (fileType) { case 'F': case 'S': return(estimateNoOfReads_From_Fasta(fileName)); case 'Q': case 'q': return(estimateNoOfReads_From_Fastq(fileName)); default: break; } return (MAX_READ_SET_CAPACITY); } unsigned int estimateNoOfReads_From_Fasta(const char* fileName) { // Assume each line has a header unsigned int uiNoOfRead = (unsigned int)getNumberOfLineInAFile(fileName) / 2; return(uiNoOfRead); } unsigned int estimateNoOfReads_From_Fastq(const char* fileName) { // Assume each line has a header if (fileExist(fileName)) { unsigned int uiNoOfRead = (unsigned int)getNumberOfLineInAFile(fileName) / 4; return(uiNoOfRead); } else { return(0); } } */ // check if the format string is for fastq inline bool isFqFormatStr(const char* fileFormat) { bool bFqFormatStr = (strncmp(fileFormat, "fq", 2) == 0) || \ (strncmp(fileFormat, ".fq", 3) == 0) || \ (strncmp(fileFormat, "fastq", 5) == 0) ||\ (strncmp(fileFormat, ".fastq", 6) == 0); return(bFqFormatStr); } // Return true fo csfastq or csfq. Judge by content for fastq. 
bool is_csFastq_format(const char* fileName, const char* fileFormat) { if (hasCsfqExtName(fileName) || hasCsfqExtName(fileFormat)) { return(true); } bool returnValue = false; if (hasFqExtName(fileName)) { // A simple check if it is a fastq format by if (isFqFormatStr(fileName)) { ; // this is a format string setting. It should be fastq format. } else if ( fileExist(fileName) ) { ifstream ifile(fileName); char caBuf[MAX_LINE]; do { caBuf[0] = '\0'; ifile.getline(caBuf, MAX_LINE); if (caBuf[0] == '@') { ifile.getline(caBuf, MAX_LINE); returnValue = isACGT(caBuf[0]) && is0123(caBuf[1]); break; } } while (!ifile.eof()); ifile.close(); } else { string msg = "Can't decide the format based on the ext name."; LOG_INFO("\nInfo %d: %s doesn't exist.\n %s",\ ERROR_LOG, fileName, msg.c_str()); } } return (returnValue); } char getReadsFileFormatSymbol(string InputFile, string fileFormat) { // The fileFormat string setting can overwrite the previous setting char formatSymbol = getReadsFileFormatSymbol(fileFormat.c_str()); if (formatSymbol == 'N') { formatSymbol = getReadsFileFormatSymbol(InputFile.c_str()); if (formatSymbol == 'N') { LOG_INFO("Info %d: Unknown reads format.\n", WARNING_LOG); } } return(formatSymbol); } char getReadsFileFormatSymbol(const char* InputFile, const char* fileFormat) { string formatStr; if (strcmp(InputFile, fileFormat) == 0 || fileFormat[0] == '\0') { string inputFileStr(InputFile); return(getReadsFileFormatSymbol(inputFileStr, inputFileStr)); } else { // in case it is fastq, fastq without '.', add a '.' 
for string formatStr = string (getExtName(fileFormat)); if (formatStr[0] != '.') { formatStr = string(".").append(InputFile); } return(getReadsFileFormatSymbol(string(InputFile), string(formatStr))); } } char getReadsFileFormatSymbol(const char* fileName) { if (hasTheExtName(fileName, ".fasta") || hasTheExtName(fileName, ".fa") || hasTheExtName(fileName, ".mfa") || hasTheExtName(fileName, ".fna")) { return('F'); } else if (hasTheExtName(fileName, ".csfasta") || hasTheExtName(fileName, ".csfa")) { return('S'); } else if (is_csFastq_format(fileName)) { // Note csFastq format can have a ext name .fastq or .fq return('Q'); } else if (hasTheExtName(fileName, ".fastqsanger") || hasTheExtName(fileName, ".fastq") || hasTheExtName(fileName, ".fq")) { return('q'); } else { return('N'); } } bool is_colorspace_reads(const char* fileName) { char fileFormat = getReadsFileFormatSymbol(fileName); return(fileFormat == 'S' || fileFormat == 'Q'); } void getReadsFileFormat(const char* fileName, char* fileFormat) { const char formatSymbol = getReadsFileFormatSymbol(fileName, fileFormat); switch (formatSymbol) { case 'F': strcpy(fileFormat, "fasta"); break; case 'q': strcpy(fileFormat, "fastq"); break; case 'Q': strcpy(fileFormat, "csfastq"); break; case 'S': strcpy(fileFormat, "csfasta"); break; default: LOG_INFO("Info %d: Unknown read file format.\n", WARNING_LOG); } } ./Source/ReadsMapping.cpp0000644011075700120610000012131411720654362015461 0ustar yanghochmath-ar#include "ReadsMapping.h" #include "MappingResult.h" const int MISQUERYCHECKPOINT = 100000; int parallelMappingLongReads(vector& readSetsList,\ CGenome_Index_TableQ& indexTable, MappingOpts P) { P.clearOutputFileName(readSetsList.size() > 1); indexTable.bExcludeAmbiguous = P.bExcludeAmbiguousReads; //__OPENMP_FOR_PARALLEL__(#pragma) int i; #ifdef _OPENMP int numberOfCPUs = omp_get_num_procs(); LOG_INFO("\nInfo %d: %d CPUs detected. 
%s.\n",\ INFO_LOG, numberOfCPUs, BLANK_LINE); #pragma omp parallel for #endif for (i = 0; i < (int)readSetsList.size(); i++) { CReadsMapping mapping(P); const char* readSetName = (readSetsList.at(i)).c_str(); if (checkFileExist(readSetName)) { CLongReadsSet longReadSet(readSetName, P.readsFileFormat, P.readLength,\ P.allowedNumOfNinRead, P.truncatedReadPrefix); if (P.bIgnoreQS) { longReadSet.ignoreQScores(); } TIME_INFO(mapping.mapLongReads(longReadSet, indexTable), "Mapping takes"); } } return(0); } // Given a read set list and the index table, this function maps reads in parallel int parallelMapping(vector& readSetsList, \ CGenome_Index_TableQ& indexTable, MappingOpts P) { P.clearOutputFileName(readSetsList.size() > 1); //__OPENMP_FOR_PARALLEL__(#pragma) int i; #ifdef _OPENMP int numberOfCPUs = omp_get_num_procs(); LOG_INFO("\nInfo %d: %d CPUs detected. %s.\n",\ INFO_LOG, numberOfCPUs, BLANK_LINE); #pragma omp parallel for #endif for (i = 0; i < (int)readSetsList.size(); i++) { CReadInBitsSet readSet (readSetsList.at(i).c_str(), \ P.readsFileFormat, P.truncatedReadPrefix,\ indexTable.uiRead_Length, P.allowedNumOfNinRead); if (P.bIgnoreQS) { readSet.ignoreQScores(); } CReadsMapping mapping(P); TIME_INFO(mapping.mapReads(readSet, indexTable), "Mapping completed"); } return(0); } CReadsMapping::CReadsMapping(void) { this->initialization(); } CReadsMapping::CReadsMapping(MappingOpts P) { this->initialization(); this->opt= P; this->iMultiMappedLocationThreshold = P.maxAlignPerRead; alignmentsQ[0].iMaxCapacity = P.maxAlignPerRead + 1; // store one more record alignmentsQ[1].iMaxCapacity = P.maxAlignPerRead + 1; // So the load can be used to judge if overflow if (P.bGetAllAlignments) { // WARNING if index indexTable.bExcludeAmbiguous will have some error alignmentsQ[0].setQueue_All_Best_OneFlag('A'); alignmentsQ[1].setQueue_All_Best_OneFlag('A'); } if (P.outputFormat[0] != '\0') { if (strcmp(P.outputFormat, "SAM") == 0 || strcmp(P.outputFormat, "sam") == 0 ) { 
this->cOutputFormat = 's'; } else if (strcmp(P.outputFormat, "MAPPING") == 0 || strcmp(P.outputFormat, "mapping") == 0 ) { this->cOutputFormat = 'm'; } else if (strcmp(P.outputFormat, "FASTQ") == 0 || strcmp(P.outputFormat, "fastq") == 0 ) { this->cOutputFormat = 'F'; this->opt.bPrintFirstAlignmentOnly = true; } else { LOG_INFO("Info %d: Specified output format %s is not recognizable.\n", WARNING_LOG, P.outputFormat); } } // check if the director exist, if not, create one if (P.outputDir[0] != '\0') { if (dirExist(P.outputDir) || createdir(P.outputDir) == 0) { #ifdef WIN32 sprintf(this->opt.outputDir, "%s\\", P.outputDir); #else sprintf(this->opt.outputDir, "%s/", P.outputDir); #endif } else { LOG_INFO("Info %d: Can't create dir %s.\n", WARNING_LOG, P.outputDir); this->opt.outputDir[0] = '\0'; } } } CReadsMapping::~CReadsMapping(void) { delete AlignResult; delete MissReads; } void CReadsMapping::initialization(void) { this->AlignResult = NULL; this->AmbiguousReads = NULL; this->MissReads = NULL; this->BadReads = NULL; this->cOutputFormat = 'm'; } int CReadsMapping::mapReadsSets(const char* ReadsSetsList, CGenome_Index_TableQ& table, bool bDiscardReadsWN) { // Counter for coverage is not initialize this->initializeStatsCounter(); char readsFile[FILENAME_MAX]; ifstream readsFileList(ReadsSetsList); while (GetNextFilenameFromListFile(readsFileList, readsFile)) { CReadInBitsSet readSet(readsFile, this->opt.readsFileFormat,\ this->opt.truncatedReadPrefix,\ table.uiRead_Length, bDiscardReadsWN); this->mapReads(readSet, table); } return(0); } int CReadsMapping::mapReads(CReadInBitsSet& readSet, const CGenome_Index_TableQ& table) { if (wrongIndex(readSet, table)) return(0); unsigned qsShift = table.bMapReadInColors ? 
Phred_SCALE_QUAL_SHIFT : SolexaScoreEncodingShift; getReadsFileFormat(readSet.InputFile, opt.readsFileFormat); string seedStr = seedSymbol(table.chosenSeedId); unsigned int uiReadLength = readSet.uiRead_Length; printf("Mapping %s (%u-bp reads) with %s seed.%s\n", \ readSet.InputFile, uiReadLength, seedStr.c_str(), BLANK_LINE); // if there are multiple gene in first chromosome. // bool bPrintGeneName = (table.pgenomeNT->paChromosomes[0]->geneVec.table.size() >= 0); this->initializeStatsCounter(); if (this->setUpIO4Aligment(readSet.InputFile, table) != 0) { LOG_INFO("\nInfo %d: Fail to setup I/O files.", ERROR_LOG); return(1); } readSet.setBadReadOutputFile(this->BadReads); CAlignmentsQ& aQue = this->alignmentsQ[0]; while (readSet.get_next_capacity_reads(BUFFERED_READS_SIZE, opt.readtag_delimiter) != 0) { vector::iterator it = readSet.pReadsSet->begin(); for (int i = 0; it != readSet.pReadsSet->end(); i++, it++) { this->printCheckPointInfo(i); readSet.get_read_id(i, aQue.tag); aQue.read = *it; aQue.qualityScores = readSet.getQScoresPtr(i); bool map2forwardStrand = !opt.bMap2ReverseStrandOnly; bool map2reverseStrand = !opt.bMap2ForwardStrandOnly; if (table.bMapReadInColors) { if(map2forwardStrand) { bool clearQ; table.queryReadColors(*it, aQue, clearQ = true, true); } if(map2reverseStrand) { bool clearQ = this->opt.bMap2ReverseStrandOnly ? true : false; table.queryReadColors(*it, aQue, clearQ, false); } } else { bool clearQ; if(map2forwardStrand) { table.queryReadBases(*it, aQue, clearQ = true, true); } if(map2reverseStrand) { clearQ = opt.bMap2ReverseStrandOnly ? 
true : false; table.queryReadBases(*it, aQue, clearQ, false); } } // statistics and output if (aQue.load > 0) { bookKeepMapping(aQue); bool bPrintAlignment = this->printAlignmentOrNot(aQue, this->opt.bExcludeAmbiguousReads, this->opt.bPrintAmbiguousReadsOnly); if (bPrintAlignment) { this->dealMappedRead(table, aQue); } } else if (this->opt.bPrintUnMappedReads) { char qs[MAX_READ_LENGTH]; qs[0] = '\0'; if (readSet.pQualScores != NULL) { const char* qsPtr = readSet.pQualScores->qScores((unsigned int)i); trQScores(table.uiRead_Length, qsShift, qsPtr, qs); } dealMissedRead(table.bMapReadInColors, aQue.tag, aQue.read, qs); } } iReadCounter += (unsigned int)readSet.pReadsSet->size(); } this->tearDownIO4Aligment(); this->iBadReadCounter = readSet.uiNo_of_Bad_Reads; this->printMappingStats(cout, readSet.InputFile, opt.subDiffThreshold); return(0); } // Map a single long read in two CReadInBits format int CReadsMapping::queryALongReadInBase(CReadInBits& r1stHalf, CReadInBits& r2ndHalf, const CGenome_Index_TableQ& table, CAlignmentsQ& aQue) const { // TODO: The reads should be query and "Check" immediately before que. // In this way, "short cut and filter can be applied right away. Should be changed /* table.queryReadBases(r1stHalf, aQue, true, true); table.queryReadBases(r1stHalf, aQue, false, false); table.extendAlignment(aQue, r2ndHalf); aQue.filterAlignments(this->opt.subDiffThreshold, this->opt.bGetAllAlignments); */ const bool bOddReadLength = this->opt.bOddReadLengthAndLongRead; const bool bAmbiguousOnly = this->opt.bPrintAmbiguousReadsOnly; const bool bShortCutAE = this->opt.bExcludeAmbiguousReads && aQue.qAllInThreshold() && !bAmbiguousOnly; const bool bShortCutE = this->opt.bExcludeAmbiguousReads && !bAmbiguousOnly; const bool bShortCutB = !this->opt.bExcludeAmbiguousReads && !aQue.qAllInThreshold() && !bAmbiguousOnly; const bool bMap2ForwardStrand = ! this->opt.bMap2ReverseStrandOnly; const bool bMap2ReverseStrand = ! 
this->opt.bMap2ForwardStrandOnly; if(bMap2ForwardStrand) { bool clearQ; table.queryLongReadBases(r1stHalf, r2ndHalf, bOddReadLength, aQue, 1, clearQ = true, true); if (aQue.load > 1) { if (bShortCutAE || (bShortCutE && aQue.MinDiff == 0)) { return (aQue.load); } } if (!(bShortCutB && aQue.MinDiff == 0)) { table.queryLongReadBases(r1stHalf, r2ndHalf, bOddReadLength, aQue, 2, clearQ = false, true); if (aQue.load > 1) { if (bShortCutAE || (bShortCutE && aQue.MinDiff == 0)) { return (aQue.load); } } } } if(bMap2ReverseStrand) { bool clearQ = this->opt.bMap2ReverseStrandOnly ? true : false; table.queryLongReadBases(r1stHalf, r2ndHalf, bOddReadLength, aQue, 1, clearQ, false); if (aQue.load > 1) { if (bShortCutAE || (bShortCutE && aQue.MinDiff == 0)) { return (aQue.load); } } if (!(bShortCutB && aQue.MinDiff == 0)) { table.queryLongReadBases(r1stHalf, r2ndHalf, bOddReadLength, aQue, 2, clearQ = false, false); } } return(aQue.load); } int CReadsMapping::queryALongReadInColors(CReadInBits& r1stHalf, CReadInBits& r2ndHalf, const CGenome_Index_TableQ& table, CAlignmentsQ& aQue) const { LOG_INFO("\nInfo %d: Currently PerM doesn't support mapping SOLiD reads longer than 64 base pairs.\n",\ ERROR_LOG); const bool bOddReadLength = this->opt.bOddReadLengthAndLongRead; const bool bAmbiguousOnly = this->opt.bPrintAmbiguousReadsOnly; const bool bShortCutAE = this->opt.bExcludeAmbiguousReads && aQue.qAllInThreshold() && !bAmbiguousOnly; const bool bShortCutE = this->opt.bExcludeAmbiguousReads && !bAmbiguousOnly; const bool bShortCutB = !this->opt.bExcludeAmbiguousReads && !aQue.qAllInThreshold() && !bAmbiguousOnly; const bool bMap2ForwardStrand = ! this->opt.bMap2ReverseStrandOnly; const bool bMap2ReverseStrand = ! 
this->opt.bMap2ForwardStrandOnly; if(bMap2ForwardStrand) { bool clearQ; table.queryLongReadColors(r1stHalf, r2ndHalf, bOddReadLength, aQue, 1, clearQ = true, true); if (aQue.load > 1) { if (bShortCutAE || (bShortCutE && aQue.MinDiff == 0)) { return (aQue.load); } } if ( !(bShortCutB && aQue.MinDiff == 0)) { table.queryLongReadColors(r1stHalf, r2ndHalf, bOddReadLength, aQue, 2, clearQ = false, true); if (aQue.load > 1) { if (bShortCutAE || (bShortCutE && aQue.MinDiff == 0)) { return (aQue.load); } } } } if(bMap2ReverseStrand) { bool clearQ = this->opt.bMap2ReverseStrandOnly ? true : false; table.queryLongReadColors(r1stHalf, r2ndHalf, bOddReadLength, aQue, 1, clearQ, false); if (aQue.load > 1) { if (bShortCutAE || (bShortCutE && aQue.MinDiff == 0)) { return (aQue.load); } } if ( !(bShortCutB && aQue.MinDiff == 0)) { table.queryLongReadColors(r1stHalf, r2ndHalf, bOddReadLength, aQue, 2, clearQ = false, false); } } return(aQue.load); } int CReadsMapping::printLogFile(const char* inputFile) { ofstream logFile(opt.logFileN, ofstream::app); if (logFile.good()) { this->printCommand(logFile, opt.fullCommand); this->printMappingStats(logFile, inputFile, opt.subDiffThreshold); this->printCommand(cout, opt.fullCommand); this->printMappingStats(cout, inputFile, opt.subDiffThreshold); } else { this->printCommand(cout, opt.fullCommand); this->printMappingStats(cout, inputFile, opt.subDiffThreshold); } logFile.close(); return(0); } // To map reads longer than MAX_READ_LENGTH, locate alignments using first half and check the mismatches in the second half int CReadsMapping::mapLongReads(CLongReadsSet& longReadSet, const CGenome_Index_TableQ& table) { unsigned int uiReadLength = this->opt.readLength; CAlignmentsQ& aQue = this->alignmentsQ[0]; CReadInBitsSet& readSet1stHalf = *(longReadSet.F_Reads); CReadInBitsSet& readSet2ndHalf = *(longReadSet.R_Reads); const char* readSetName = readSet1stHalf.InputFile; // Flag that set the alignment is ambiguous or not 
getReadsFileFormat(readSetName, opt.readsFileFormat); string seedStr = seedSymbol(table.chosenSeedId); printf("Mapping %s (%u-bp reads) with %s seed.\n", \ readSetName, uiReadLength, seedStr.c_str()); this->initializeStatsCounter(); if (this->setUpIO4Aligment(readSetName, table) != 0) { LOG_INFO("\nInfo %d: Fail to setup I/O files.", ERROR_LOG); return(1); } longReadSet.setBadReadOutputFile(this->BadReads); // alignmentsQ[0].setQueue_All_Best_OneFlag('A'); const bool bMapReadInColors = table.bMapReadInColors; while (longReadSet.get_next_capacity_long_reads() > 0) { int bufferedReadNo = this->checkPairedReadSetSize(readSet1stHalf, readSet2ndHalf); vector::iterator it1, it2; it1 = readSet1stHalf.pReadsSet->begin(); it2 = readSet2ndHalf.pReadsSet->begin(); for (int i = 0; i < bufferedReadNo; i++, it1++, it2++) { this->printCheckPointInfo(i); CMappingResult m; if (bMapReadInColors) { this->getLongColorReadInfo(readSet1stHalf, readSet2ndHalf, i, *it1, *it2, m); this->queryALongReadInColors(*it1, *it2, table, aQue); return(-1); } else { this->getLongBaseReadInfo(readSet1stHalf, readSet2ndHalf, i, *it1, *it2, m); this->queryALongReadInBase(*it1, *it2, table, aQue); } // statistics and output if (aQue.load > 0) { bookKeepMapping(aQue); bool bPrintAlignment = this->printAlignmentOrNot(aQue, this->opt.bExcludeAmbiguousReads, this->opt.bPrintAmbiguousReadsOnly); if (bPrintAlignment) { this->dealMappedLongRead(table, aQue, m); } } else if (this->opt.bPrintUnMappedReads) { dealMissedRead(m); } } this->iReadCounter += bufferedReadNo; } this->tearDownIO4Aligment(); this->iBadReadCounter = longReadSet.uiNo_of_Bad_Reads; this->printLogFile(readSetName); return(0); } int CReadsMapping::dealMissedRead(bool bMapReadInColors, const char* readName, CReadInBits r, const char* qs) { this->printRead(this->MissReads, bMapReadInColors, readName, r, qs); return(this->iMissReadCounter++); } int CReadsMapping::dealAmbiguousRead(bool bMapReadInColors, const char* readName, CReadInBits r, const 
char* qs) { this->printRead(this->AmbiguousReads, bMapReadInColors, readName, r, qs); return(this->iReadsMapped2tooManyLocations); } inline void CReadsMapping::printRead(FileOutputBuffer* FileBuf, bool bMapReadInColors, const char* readName, CReadInBits r, const char* qs) { char caRead[MAX_READ_LENGTH + 1]; if (bMapReadInColors) { decodeColorReadWithPrimer(caRead, r); } else { r.decode(caRead); } if (qs == NULL || qs[0] == '\0') { sprintf(FileBuf->caBufp, ">%s\n%s\n", readName, caRead); } else { sprintf(FileBuf->caBufp, "@%s\n%s\n+\n%s\n", readName, caRead, qs); } FileBuf->UpdateSize(); } inline void CReadsMapping::printRead(FileOutputBuffer* FileBuf, CMappingResult& m) { const char* readSeqOrColors; if (m.caRead[0] != '\0') { readSeqOrColors = m.caRead; } else { readSeqOrColors = m.TAG; } if (m.QScores[0] == '\0') { sprintf(FileBuf->caBufp, ">%s\n%s\n", m.QNAME, readSeqOrColors); FileBuf->UpdateSize(); } else { sprintf(FileBuf->caBufp, "@%s\n%s\n+\n%s\n", m.QNAME, readSeqOrColors, m.QScores); FileBuf->UpdateSize(); } } int CReadsMapping::dealMissedRead(CMappingResult& m) { printRead(this->MissReads, m); return(this->iMissReadCounter++); } int CReadsMapping::dealAmbiguousRead(CMappingResult& m) { printRead(this->AmbiguousReads, m); return(this->iReadsMapped2tooManyLocations); } unsigned int CReadsMapping::checkPairedReadSetSize\ (CReadInBitsSet& firstHalfSet, CReadInBitsSet& SecondHalfSet) { unsigned int size1 = (unsigned int)firstHalfSet.pReadsSet->size(); unsigned int size2 = (unsigned int)SecondHalfSet.pReadsSet->size(); if (size1 == size2) { return(size1); } else { LOG_INFO("Info %d: Not every read has the second half\n", WARNING_LOG); return(min(size1, size2)); } } void printSingleEndReads(char format, unsigned int uiReadLength); // This function print out alignments for each read and the corresponding information. 
int CReadsMapping::dealMappedRead(const CGenome_Index_TableQ& table, CAlignmentsQ& aQue) { bool samFormat = (this->cOutputFormat == 's'); CMappingResult m(aQue, opt.readLength); if (!table.bMapReadInColors) { // TODO: get read getReadQscores4Solexa(aQue, m, samFormat); } if (this->opt.bPrintAlignments) { if (aQue.load < aQue.iMaxCapacity || this->opt.bPrintAmbiguousReadsOnly) { for (unsigned int i = 0; i < aQue.load && i < aQue.iMaxCapacity; i++) { getSingleMappingInfo(table, aQue, i, m, samFormat); this->printSingleEndReads(m); if (this->opt.bPrintFirstAlignmentOnly) { break; } } } if (aQue.load >= aQue.iMaxCapacity) { // If a read is mapped to over threshold place, don't print it this->iReadsMapped2tooManyLocations++; if (this->opt.bPrintAmbigReadsSeparately) { this->dealAmbiguousRead(table.bMapReadInColors,aQue.tag, aQue.read, aQue.qualityScores); /* The alternative ambiguous print will not include the adapter const bool IF_COLOR2BASE = false; const int FIRST_INDEX = 0; getSingleMappingInfo(table, aQue, FIRST_INDEX, m, IF_COLOR2BASE); this->dealAmbiguousRead(m); */ } else if (this->opt.bPrintAmbigReadsInOneLine) { getSingleMappingInfo(table, aQue, 0, m, samFormat); this->printSingleEndReads(m); } } } return(aQue.load); } int CReadsMapping::dealMappedLongRead\ (const CGenome_Index_TableQ& table, CAlignmentsQ& aQue, CMappingResult& m) { bool samFormat = (this->cOutputFormat == 's'); if (aQue.load < aQue.iMaxCapacity || this->opt.bPrintAmbiguousReadsOnly) { if (this->opt.bPrintAlignments) { for (unsigned int i = 0; i < aQue.load && i < aQue.iMaxCapacity; i++) { // get and store all the output info in CMappingResult getLongMappingInfo(table, aQue, samFormat, i, m); // TODO make change for SOLiD long read this->printSingleEndReads(m); if (this->opt.bPrintFirstAlignmentOnly) { break; } } } } if (aQue.load >= aQue.iMaxCapacity) { // If a read is mapped to over threshold place, don't print it this->iReadsMapped2tooManyLocations++; if 
(this->opt.bPrintAmbigReadsSeparately) { this->dealAmbiguousRead(m); } else if (this->opt.bPrintAmbigReadsInOneLine) { getLongMappingInfo(table, aQue, samFormat, 0, m); this->printSingleEndReads(m); } } return(aQue.load); } int CReadsMapping::printSingleEndReads(CMappingResult& m) { string dummyStr = ""; int map_score = 0; switch (this->cOutputFormat) { case 's': printAMappingInSam(this->AlignResult, m); break; case 'F': printAMappingInFastq(this->AlignResult, m); break; case 'g': printAMappingInGff(this->AlignResult, m, map_score, dummyStr); break; default: printAMappingInPerM(this->AlignResult, m, this->opt.bPrintNM); break; } return(0); } /* int CReadsMapping::printMappingsInBEDformat(FileOutputBuffer &OBuf, char* caKmer, CMapOfSingleRead& mapPatternsSet, unsigned int ReadID) { const unsigned int SUB_LEVEL = 4; const char* RGB[SUB_LEVEL]; RGB[0] = "0,0,0"; RGB[1] = "0,0,255"; RGB[2] = "0,255,0"; RGB[3] = "255,0,0"; unsigned int readLength = this->uiKmer_Length; for(int i = 0; i < mapPatternsSet.size; i++) { // For each kind of pattern set* pSet = &(mapPatternsSet.mappingsSets[i]); set::iterator j; // Print each occurrence of a same pattern for(j = pSet->begin(); j != pSet->end(); j++) { unsigned int chrom = j->chrId; unsigned int chromStart = j->chrLocusId; unsigned int chromEnd = chromStart + readLength; int name = ReadID; int score = 1; if( score > 0) { char strand = j->bReverse ? '-' : '+'; unsigned int subNum = diNtStrWildCardComp(mapPatternsSet.caPattern[i], caKmer, readLength); const char* subRGB = (subNum > (SUB_LEVEL - 1)) ? 
RGB[subNum] : RGB[SUB_LEVEL - 1] ; sprintf(OBuf.caBufp,"chr%u %u %u read%d %d %c %d %d %s %d %d 0\n", chrom, chromStart, chromEnd, name, score, strand, chromStart, chromEnd, subRGB, subNum, readLength); OBuf.UpdateSize(); } else { // bad read; } } // delete each set if it has been report mapPatternsSet.mappingsSets[i].clear(); } return(mapPatternsSet.size); } */ // This private function returns a string as the prefix of the output file name for mapping string getOutputFileNamePrefix(const char* dir, const CGenome_Index_TableQ& table, MappingOpts& opt) { // The prefix contains 4 parts: (0) dir (1) ref file (2) ambiguous flag (3) mismatch threshold char outputPrefixStr[MAX_PATH]; char excludeAmbiguous = opt.bExcludeAmbiguousReads ? 'E' : 'B'; if (opt.bGetAllAlignments) excludeAmbiguous = 'A'; sprintf(outputPrefixStr, "%s%s_%c_%u_%d",\ dir, table.caRefName, excludeAmbiguous, table.chosenSeedId, opt.subDiffThreshold); return(string(outputPrefixStr)); } bool isGalaxyOutputPath(const char* fileName) { return(hasTheExtName(fileName, ".dat")); } bool isSupportedExtName(const char* fileN) { if (hasTheExtName(fileN, ".fasta") || hasTheExtName(fileN, ".fastq") || hasTheExtName(fileN, ".fq") || hasTheExtName(fileN, ".csfq") || hasTheExtName(fileN, ".csfastq") || hasTheExtName(fileN, ".csfasta") || hasTheExtName(fileN, ".dat") || hasTheExtName(fileN, ".txt")) { return(true); } return(false); } string rmSupportedExtName(const char* fileN) { char newFileN[FILENAME_MAX]; strcpy(newFileN, fileN); if (isSupportedExtName(fileN)) { for (int i = (int)strlen(newFileN); i > 0; i--) { if (newFileN[i] == '.') { newFileN[i] = '\0'; return(string(newFileN)); } } } return(string(fileN)); } // generate the unmapped FileN, if the original one is not null void getAmbiguousFileN(char* ambiguousFileN, const char* readFileN, const char* readFormat,\ const char* refN, bool bQual = false) { string refNStr = rmSupportedExtName(refN); string readFileNStr = rmSupportedExtName(readFileN); if 
(strcmp(ambiguousFileN, "") == 0) { // make sure the readFormat is valid if (strcmp(readFormat, "csfasta") == 0) { if (bQual) { sprintf(ambiguousFileN, "%s_ambig_%s.%s", readFileNStr.c_str(), refNStr.c_str(), "fastq"); } else { sprintf(ambiguousFileN, "%s_ambig_%s", readFileNStr.c_str(), refNStr.c_str()); // Not sure csfasta or not } } else { if (readFormat[0] == '\0') { sprintf(ambiguousFileN, "%s_ambig_%s", readFileNStr.c_str(), refNStr.c_str()); } else { sprintf(ambiguousFileN, "%s_ambig_%s.%s", readFileNStr.c_str(), refNStr.c_str(), readFormat); } } } } // generate the unmapped FileN, if the original one is not null void getUnmappedFileN(char* unmappedFileN, const char* readFileN, const char* readFormat,\ const char* refN, bool bQual = false) { string refNStr = rmSupportedExtName(refN); string readFileNStr = rmSupportedExtName(readFileN); if (strcmp(unmappedFileN, "") == 0) { // make sure the readFormat is valid if (strcmp(readFormat, "csfasta") == 0) { if (bQual) { sprintf(unmappedFileN, "%s_miss_%s.%s", readFileNStr.c_str(), refNStr.c_str(), "fastq"); } else { sprintf(unmappedFileN, "%s_miss_%s", readFileNStr.c_str(), refNStr.c_str()); // Not sure csfasta or not } } else { if (readFormat[0] == '\0') { sprintf(unmappedFileN, "%s_miss_%s", readFileNStr.c_str(), refNStr.c_str()); } else { sprintf(unmappedFileN, "%s_miss_%s.%s", readFileNStr.c_str(), refNStr.c_str(), readFormat); } } } } // generate the unmapped FileN, if the original one is not null void getBadReadsFileN(char* badFileN, const char* readFileN, const char* readFormat,\ const char* refN, bool bQual = false) { string refNStr = rmSupportedExtName(refN); string readFileNStr = rmSupportedExtName(readFileN); if (strcmp(badFileN, "") == 0) { // make sure the readFormat is valid if (strcmp(readFormat, "csfasta") == 0) { if (bQual) { sprintf(badFileN, "%s_bad.%s", readFileNStr.c_str(), "fastq"); } else { sprintf(badFileN, "%s_bad", readFileNStr.c_str()); // Not sure csfasta or not } } else { if 
(readFormat[0] == '\0') { sprintf(badFileN, "%s_bad", readFileNStr.c_str()); } else { sprintf(badFileN, "%s_bad.%s", readFileNStr.c_str(), readFormat); } } } } string CReadsMapping::getMappingFileN(const char* caReadsSetName, const CGenome_Index_TableQ& table) { if (this->cOutputFormat == 'm') { strcpy(opt.outputFormat, "mapping"); } else if (this->cOutputFormat == 's') { strcpy(opt.outputFormat, "sam"); } else if (this->cOutputFormat == 'F') { strcpy(opt.outputFormat, "fastq"); } char outputPath[MAX_LINE]; if (this->opt.outputFileN[0] == '\0') { // No output file has been set string outFileNPrefix = getOutputFileNamePrefix(this->opt.outputDir, table, opt); string fileNameOfReadSet = getBasename(caReadsSetName); const char* extN = opt.outputFormat; if (string(extN) == "sam" || string(extN) == "mapping" || string(extN) == "fastq") { sprintf(outputPath, "%s_%s.%s", outFileNPrefix.c_str(), fileNameOfReadSet.c_str(), extN); } else { if (extN[0] != '0') { char msg[MAX_LINE]; sprintf(msg, "The specified output format %s is unrecognizable.\n", extN); cout << msg; // LOG_INFO("%s", WARNING_LOG, msg); } sprintf(outputPath, "%s_%s", outFileNPrefix.c_str(), fileNameOfReadSet.c_str()); } } else { sprintf(outputPath, "%s%s", this->opt.outputDir, this->opt.outputFileN); } return(string(outputPath)); } int CReadsMapping::setUpIO4Aligment(const char* caReadsSetName, const CGenome_Index_TableQ& table) { if (this->opt.bGetAllAlignments) { // Find all alignments with in threshold, instead of queuing // only all best(with the fewest mismatches) alignments as default. 
alignmentsQ[0].setQueue_All_Best_OneFlag('A'); alignmentsQ[1].setQueue_All_Best_OneFlag('A'); } char outputPath[MAX_PATH]; strcpy(outputPath, this->getMappingFileN(caReadsSetName, table).c_str()); // (1)Initialize the I/O for output the alignments if (this->opt.bPrintAlignments) { if (this->AlignResult == NULL) { ofstream* AlignResultFile = new ofstream(outputPath); this->AlignResult = new FileOutputBuffer(ALIGNMENT_RESULT_FILE_BUFFER_SIZE, AlignResultFile); if (this->AlignResult == NULL) { ERR;//check new FileOutputBuffer return(1); } else if (this->cOutputFormat == 's') { if (this->opt.bPrintSamHeader) { string RG = getSamRG(caReadsSetName, table.bMapReadInColors); vector refs = table.pgenomeNT->getRefNamesLengths(); string commandLineStr = this->opt.fullCommand; printSamHeader(AlignResult, refs, RG.c_str(), commandLineStr.c_str()); } } } } // (2)Initialize the I/O for output the up-mapped reads. if (this->opt.bPrintUnMappedReads) { // Make sure opt.readsFileFormat has been set const char* refN = table.pgenomeNT->refName; getUnmappedFileN(opt.unmappedFileN, caReadsSetName, opt.readsFileFormat, refN); /* // This line force the output to be the right ext name (fasta, csfasta or fastq) if (!isGalaxyOutputPath(opt.unmappedFileN)) { string extName = string(".").append(string(opt.readsFileFormat)); chExtName(opt.unmappedFileN, extName.c_str()); }*/ if (this->MissReads == NULL) { ofstream* MissReadsFile = new ofstream(opt.unmappedFileN); this->MissReads = new FileOutputBuffer(ALIGNMENT_RESULT_FILE_BUFFER_SIZE, MissReadsFile); if (this->MissReads == NULL) { ERR; // check new FileOutputBuffer return(1); } } } // (3) Initialize the I/O for output the ambiguous reads over the threshold if (this->opt.bPrintAmbigReadsSeparately) { // Make sure opt.readsFileFormat has been set const char* refN = table.pgenomeNT->refName; getAmbiguousFileN(opt.ambiguousReadFileN, caReadsSetName, opt.readsFileFormat, refN); if (this->AmbiguousReads == NULL) { ofstream* ambiguousReadsFile = new 
ofstream(opt.ambiguousReadFileN); this->AmbiguousReads = new FileOutputBuffer(ALIGNMENT_RESULT_FILE_BUFFER_SIZE, ambiguousReadsFile); if (this->AmbiguousReads == NULL) { ERR; // check new FileOutputBuffer return(1); } } } // (4) Initialize the I/O for output the bad reads or reads shorter than expected if (this->opt.bPrintBadReads) { // Make sure opt.readsFileFormat has been set const char* refN = table.pgenomeNT->refName; getBadReadsFileN(opt.badReadFileN, caReadsSetName, opt.readsFileFormat, refN); if (this->BadReads == NULL) { ofstream* badReadsFile = new ofstream(opt.badReadFileN); this->BadReads = new FileOutputBuffer(ALIGNMENT_RESULT_FILE_BUFFER_SIZE, badReadsFile); if (this->BadReads == NULL) { ERR; // check new FileOutputBuffer return(1); } } } return(0); } int CReadsMapping::tearDownIO4Aligment(void) { // close file, which has been opened before if (this->opt.bPrintAlignments) { // this->AlignResult->removeEndBlankLine(); delete this->AlignResult; this->AlignResult = NULL; } if (this->opt.bPrintUnMappedReads) { // this->MissReads->removeEndBlankLine(); delete this->MissReads; this->MissReads = NULL; } if (this->opt.bPrintAmbigReadsSeparately) { delete this->AmbiguousReads; this->AmbiguousReads = NULL; } if (this->opt.bPrintBadReads) { delete this->BadReads; this->BadReads = NULL; } return(0); } void getSingleMappingSeq4Solexa(const CGenome_Index_TableQ& table, CMappingResult& m, bool noRef) { if (noRef) { // sam format doesn't need to know reference sequence m.caRef[0] = '\0'; } else { const unsigned int readLength = (unsigned int)strlen(m.caRead); // (1) Get ref in bits CReadInBits ref = table.pgenomeNTInBits->getSubstringInBits(m.uiGlobalMappedPos, readLength); if (m.strand == '-') { ref = reverseCompliment(readLength, ref); } // (2) Get ref seq ref.decode(m.caRef); } } void getQscores4Solexa(CAlignmentsQ& aQue, CMappingResult& m, bool samFormat) { if (aQue.qualityScores == NULL) { m.mismatchScore = (int)m.uiDiff; m.QScores[0] = '\0'; } else { // If 
quality score are available, get quality score m.mismatchScore = alignmentScore(m.caRead, m.caRef, m.uiReadLength, aQue.qualityScores); trQScores(m.uiReadLength, SolexaScoreEncodingShift, aQue.qualityScores, m.QScores); if (samFormat) { m.getReverseReadandQual(); } } } void getReadQscores4Solexa(CAlignmentsQ& aQue, CMappingResult& m, bool samFormat) { aQue.read.decode(m.caRead); getQscores4Solexa(aQue, m, samFormat); } inline CReadInBits getSingleMappingRef4SOLiD(const CGenome_Index_TableQ& table, CAlignmentsQ& aQue, CMappingResult& m, bool samFormat) { const unsigned int readlength = table.uiRead_Length; // (1) Get ref in bits CReadInBits ref = table.pgenomeNTInBits->getSubstringInBits(m.uiGlobalMappedPos, readlength); if (m.strand == '-') { ref = reverseCompliment(readlength, ref); } // (2) Get ref in color CReadInBits colorsInRef = bases2Colors(ref); m.SNPtype = returnSNPtype(aQue.read, colorsInRef); // (3) Get ref in sequence if(!samFormat) { decodeColors(m.caRef, colorsInRef); // Show the reference for validation purpose } return(colorsInRef); } inline void getQualityScore4SOLiD(unsigned int readlength, CReadInBits &colorsInRef, CAlignmentsQ& aQue, CMappingResult& m, bool samFormat) { if (samFormat) { // correctRead if (aQue.qualityScores == NULL) { char dummyScore = (char)(20); fillDummyQScores(readlength, dummyScore + Phred_SCALE_QUAL_SHIFT, m.QScores); } else { trQScores(readlength, Phred_SCALE_QUAL_SHIFT, aQue.qualityScores, m.QScores); } sprintf(m.TAG, "%s\tCS:Z:%s\tCQ:Z:%s", m.TAG, m.caRead, m.QScores); //sprintf(m.TAG, "%s\tX1:Z:%s\tCS:Z:%s\tCQ:Z:%s", m.TAG, m.caRef, m.caRead, m.QScores); correctAndDecodeRead(aQue.read, colorsInRef, samFormat, m.caRead, m.QScores); if (m.strand == '-') m.getReverseReadandQual(); } else { if (aQue.qualityScores == NULL) { m.mismatchScore = (int)m.uiDiff; m.QScores[0] = '\0'; m.rawScores[0] = '\0'; } else { m.mismatchScore = alignmentScore(m.caRead, m.caRef,\ readlength, aQue.qualityScores); 
printCommaSepScoresStr(readlength, aQue.qualityScores, m.QScores); } } } void getSingleMappingSeqAndQ4SOLiD\ (const CGenome_Index_TableQ& table, CAlignmentsQ& aQue, CMappingResult& m, bool samFormat) { /* const unsigned int readlength = table.uiRead_Length; // (1) Get ref in bits CReadInBits ref = table.pgenomeNTInBits->getSubstringInBits(m.uiGlobalMappedPos, readlength); if (m.strand == '-') { ref = reverseCompliment(readlength, ref); } // (2) Get ref (and read) in Seq decodeColors(m.caRead, aQue.read); CReadInBits colorsInRef = bases2Colors(ref); */ const unsigned int readlength = table.uiRead_Length; // (1) Get read in Seq decodeColors(m.caRead, aQue.read); // (2) Get ref in Seq CReadInBits colorsInRef = getSingleMappingRef4SOLiD(table, aQue, m, samFormat); // (3) Get nucleotide and quality sequence getQualityScore4SOLiD(readlength, colorsInRef, aQue, m, samFormat); /* if (samFormat) { // correctRead if (aQue.qualityScores == NULL) { char dummyScore = (char)(20); fillDummyQScores(readlength, dummyScore + Phred_SCALE_QUAL_SHIFT, m.QScores); } else { trQScores(readlength, Phred_SCALE_QUAL_SHIFT, aQue.qualityScores, m.QScores); } sprintf(m.TAG, "%s\tCS:Z:%s\tCQ:Z:%s", m.TAG, m.caRead, m.QScores); //sprintf(m.TAG, "%s\tX1:Z:%s\tCS:Z:%s\tCQ:Z:%s", m.TAG, m.caRef, m.caRead, m.QScores); correctAndDecodeRead(aQue.read, colorsInRef, samFormat, m.caRead, m.QScores); if (m.strand == '-') m.getReverseReadandQual(); } else { decodeColors(m.caRef, colorsInRef); // Show the reference for validation purpose m.SNPtype = returnSNPtype(aQue.read, colorsInRef); if (aQue.qualityScores == NULL) { m.mismatchScore = (int)m.uiDiff; m.QScores[0] = '\0'; m.rawScores[0] = '\0'; } else { m.mismatchScore = alignmentScore(m.caRead, m.caRef,\ readlength, aQue.qualityScores); printCommaSepScoresStr(readlength, aQue.qualityScores, m.QScores); } }*/ } unsigned int getNoOfDiff(const char* str1, const char* str2, unsigned int readLength) { unsigned int noOfDiff = 0; for (unsigned int i = 0; i < 
readLength; i++) { if (str1[i] != str2[i]) { noOfDiff ++; } } return(noOfDiff); } // For one read to one location, void getSingleMappingInfo(const CGenome_Index_TableQ& table, CAlignmentsQ& aQue,\ unsigned int mappingId, CMappingResult& m, bool samFormat) { getSingleMappingIndex(*(table.pgenomeNT), aQue, mappingId, m); if (samFormat) { // must set after the strand flag is set m.setSingleEndSamFields(); } // must set before the color seq and QUAL is set if (table.bMapReadInColors) { getSingleMappingSeqAndQ4SOLiD(table, aQue, m, samFormat); } else { getSingleMappingSeq4Solexa(table, m, samFormat); } if (aQue.qAllInThreshold() && !samFormat) { int noMisMatch = getNoOfDiff(m.caRead, m.caRef, m.uiReadLength); /* if(m.uiDiff != noMisMatch) { ERR; }*/ m.uiDiff = noMisMatch; } if (table.pbaRepeatRepresentativeFlag->b(aQue.aiHitIndex[mappingId])) { m.MultipleMappedNo++; } } void getLongMappingInfo(const CGenome_Index_TableQ& table, CAlignmentsQ& aQue, bool samFormat,\ unsigned int mappingId, CMappingResult& m) { // (1) Get index getSingleMappingIndex(*(table.pgenomeNT), aQue, mappingId, m); if (samFormat) { m.setSingleEndSamFields(); } else { // (2) Get Ref Sequence getLongRefSeq(table, m, samFormat); // (3) Get mismatch score if (m.QScores[0] == '\0') { m.mismatchScore = (int)m.uiDiff; } else { // If quality score are available, get quality score m.mismatchScore = alignmentScore(m.caRead, m.caRef, m.uiReadLength, m.rawScores); } if (m.strand == '-') { reverseComplementKmer(m.caRef); } } } bool wrongIndex(const CReadInBitsSet& readSet, const CGenome_Index_TableQ& table) { if (readSet.uiRead_Length != table.uiRead_Length) { LOG_INFO("Info %d: The index is not for read length %d, is for %d.\n",\ readSet.uiRead_Length, table.uiRead_Length, ERROR_LOG); return(true); } bool bColorRead = (readSet.cFileType == 'Q' || readSet.cFileType == 'S'); if ( bColorRead != table.bMapReadInColors) { if (bColorRead) { LOG_INFO("Info %d: The index is not for color read.\n", ERROR_LOG); } else { 
LOG_INFO("Info %d: The index is for color read.\n", ERROR_LOG); } return(true); } return(false); } ./Source/ReadsMappingFlags.cpp0000644011075700120610000000106311720654362016434 0ustar yanghochmath-ar#include "stdafx.h" #include "ReadsMappingFlags.h" CReadsMappingFlags::CReadsMappingFlags(void) { this->set_Default_Opt(); } CReadsMappingFlags::~CReadsMappingFlags(void) { } int CReadsMappingFlags::set_Default_Opt(void) { // OUTPUT // Default is print all the 'best' alignments with the fewest substitutions. this->bSearchAllAlignment = false; this->bPrintUnMappedReads = false; this->bPrintGeneName = true; this->bPrintAlignment = true; this->cOutputFormat = 'm'; // PREPROCESSING this->bSaveTable = false; return(0); } ./Source/ReadsMappingStats.cpp0000644011075700120610000000534111720654362016501 0ustar yanghochmath-ar#include "stdafx.h" #include "ReadsMappingStats.h" CReadsMappingStats::CReadsMappingStats(void) { this->initialization(); } CReadsMappingStats::~CReadsMappingStats(void) { } void CReadsMappingStats::initialization(void) { this->iReadsFileCount = 0; this->iMultiMappedLocationThreshold = 2000; // this->iMultiMappedLocationThreshold = 1000000; this->initializeStatsCounter(); // Counters } void CReadsMappingStats::initializeStatsCounter(void) { this->iBadReadCounter = 0; this->iMapCount = 0; this->iReadCounter = 0; this->iMissReadCounter = 0; this->iMultiMappedReads = 0; this->iReadsFileCount = 0; this->iReadsMapped2tooManyLocations = 0; for (unsigned int i = 0; i <= MAXTOLERATSUBMIS; i++) { // Temporarily assume there have same # of record this->iMapDiffCount[i] = 0; } } void CReadsMappingStats::printCommand(ostream& out, string command) { out << command << endl; } int CReadsMappingStats::printMappingStats(ostream& out, const char* readSetName,\ unsigned int uiSubThreshold) const { unsigned int i; out << endl; out << readSetName << ", Reads:, "; out << "Filtered#, " << iBadReadCounter << ", "; out << "Kept#, " << iReadCounter << ", "; out << "Mapped#, " << 
iMapCount << ", "; out << "Multimapped#, " << iMultiMappedReads << ", "; out << "Multimapped>" << iMultiMappedLocationThreshold << "#, "; out << iReadsMapped2tooManyLocations << BLANK_LINE << endl; out << readSetName << ",_ "; for (i = 0; i <= uiSubThreshold; i++) { out << "Sub" << i << ", " << iMapDiffCount[i] << ", " ; } out << endl; return(0); } // return true if print the alignments void CReadsMappingStats::bookKeepMapping(CAlignmentsQ& que) { this->iMapCount++; this->iMapDiffCount[que.MinDiff]++; if (que.AmbiguousFlag) { this->iMultiMappedReads++; } } bool CReadsMappingStats::printAlignmentOrNot(CAlignmentsQ& que, bool bExcludeAmbiguous, bool ambiguousOnly) const { bool ambiguousFlag = false; if (que.qAllInThreshold()) { // if bExcludeAmbiguous and qAllInThreshold (-E -A), print only if there are single record. if (que.load > 1) { ambiguousFlag = true; } } else { if (que.AmbiguousFlag) { ambiguousFlag = true; } } if (ambiguousFlag) { if (ambiguousOnly) { return(true); } else if (bExcludeAmbiguous) { return(false); } else { return(true); } } else { if (ambiguousOnly) { return(false); // If the mapping is not ambiguous but only ambiguous mapping are meant to be printed. 
} else { return(true); } } } ./Source/ReadsQualScores.cpp0000644011075700120610000001673111720654362016155 0ustar yanghochmath-ar#include "ReadsQualScores.h" CReadsQualScores::CReadsQualScores(void) { initialization(0, 0); } CReadsQualScores::CReadsQualScores(unsigned int readLength, unsigned int numOfReads) { this->initialization(readLength, numOfReads); } CReadsQualScores::~CReadsQualScores(void) { delete [] this->QSarray; this->QSarray = NULL; } void CReadsQualScores::clear(void) { this->load = 0; } void CReadsQualScores::reserve(unsigned int numOfReads) { if (numOfReads > this->numOfReads) { this->numOfReads = numOfReads; if (this->QSarray != NULL) { delete [] this->QSarray; // all stored quality score will be gone this->QSarray = NULL; } this->size = readLength * numOfReads; this->QSarray = new char[this->size]; memset(this->QSarray, '\0', this->size); } } void CReadsQualScores::initialization(unsigned int readLength, unsigned int numOfReads) { this->size = 0; // capacity in bases this->QSarray = NULL; this->scoreType = 'I'; this->load = 0; // how many reads' qualities score have been load this->numOfReads = 0; // must initialize to 0 and reserve this->readLength = readLength; this->reserve(numOfReads); } /* * This function open an QUAL file and get the first pReadsID->size quality scores */ bool CReadsQualScores::openQUALfile(const char* Filename) { if (fileExist(Filename)) { if (hasTheExtName(Filename, ".qual") || hasTheExtName(Filename, ".QUAL")) { const unsigned int FILE_INPUT_BUFFER_SIZE = 1000000; this->scoreType = 'S'; this->ifile.open(Filename); this->IBuf.initialize(FILE_INPUT_BUFFER_SIZE, &ifile); return(true); } } return(false); } unsigned int CReadsQualScores::getQualityScoresFromQUAL(vector* pReadsID) { char caBuffer[MAX_LINE]; char caReadTag[MAX_LINE]; caReadTag[0] = '\0'; while (this->load < (unsigned int)pReadsID->size()) { unsigned int qsIndex = this->load * this->readLength; if (this->load >= numOfReads) { const char* theReadId = 
pReadsID->at(this->load).id; LOG_INFO("Info %d: Read %s has no quality.\n", WARNING_LOG, theReadId); break; } caBuffer[0] = '\0'; if (this->IBuf.Getline(caBuffer, MAX_LINE) == 0) { break; // Note this->pBuf->Getline() will return 0 if EOF } else { //If this line is header, new line, comment or null line however not EOF, read the next line if (caBuffer[0] == '>' && ifile.eof() == false) { myStrCpy(caReadTag, &(caBuffer[1]), READ_ID_LENGTH - 2); formatReadId(caReadTag); } else if ((caBuffer[0] == ' ' || caBuffer[0] == '\n' || caBuffer[0] == '#' || caBuffer[0] == '\0') && ifile.eof() == false) { continue; } else { // No warning for the case that the number of quality score and the read length are not matched. // check the tag to see if it is the correct place int iReadId = alignReadId4QScores(caReadTag, this->load, pReadsID); if (iReadId < 0) { continue; // bad read, skip its quality } else { this->load = (unsigned int)iReadId; } int bufferIndex = 0; for (unsigned int pos = 0; pos < this->readLength; pos++) { //read the quality score for each base of the read this->QSarray[qsIndex + pos] = (char)atoi(&caBuffer[bufferIndex]); for (; caBuffer[bufferIndex] != ' '; bufferIndex++) { if (caBuffer[bufferIndex] == '\0') break; }; if (caBuffer[bufferIndex] == '\0') break; bufferIndex++; // mov to the start of next qscore in the buffer } this->load ++; // get qs for one more reads } } } if (this->load < (unsigned int)pReadsID->size()) { LOG_INFO("Info %d: Not every read has quality score.\n", WARNING_LOG); for (unsigned int i = this->load; i < (unsigned int)pReadsID->size(); i++) { cout << "Ex:" << pReadsID->at(i).id << " doesn't get QS" << endl; break; // show only one } LOG_INFO("Info %d: Try arg --delimiter to get the correct read Id.\n", INFO_LOG); } return(this->load); } int CReadsQualScores::alignReadId4QScores(char* tag, unsigned int searchPoint, vector* pReadsID) { if (pReadsID == NULL) { LOG_INFO("Info %d: No read Id is available.\n", WARNING_LOG); } else if 
(searchPoint >= (pReadsID->size())) { LOG_INFO("Info %d: Request quality scores over the read size %u.\n",\ WARNING_LOG, (unsigned int)pReadsID->size()); return(searchPoint); } else { // Currently, only check the previous and the next tag if the current one is not matched. char* expectedTag = pReadsID->at(searchPoint).id; if (strcmp(expectedTag, tag) == 0) { return(searchPoint); // read ID correctly aligned } // align to next tag if (pReadsID->size() > searchPoint + 1) { expectedTag = pReadsID->at(searchPoint + 1).id; if (strcmp(expectedTag, tag) == 0) { LOG_INFO("\nInfo %d: Read %s has no quality score.\n",\ WARNING_LOG, pReadsID->at(searchPoint).id); return(searchPoint + 1); } } // aligned to previous tag if (pReadsID->size() > 0 && searchPoint > 0) { expectedTag = pReadsID->at(searchPoint - 1).id; if (strcmp(expectedTag, tag) == 0) { LOG_INFO("\nInfo %d: Extra quality scores for read %d.\n",\ WARNING_LOG, searchPoint - 1); return(searchPoint - 1); } } // cout << "Bad read " << tag << endl; // STRIKE_KEY2CONTINUE; } return(-1); // bad read with N, should skip its quality. 
} void trQScores(unsigned int readLength, char qShift, const char* oldQSs, char* newQSs) { unsigned int i; for (i = 0; i < readLength; i++) { newQSs[i] = oldQSs[i] + qShift; } newQSs[i] = '\0'; } double getAverageQualityScores(CReadsQualScores& scores) { double sumOfQulityScore = 0; for (unsigned int i = 0; i < scores.load; i++) { for (unsigned int j = 0; j < scores.readLength; j++) { sumOfQulityScore += (double)scores.qs(i, j); } } return(sumOfQulityScore / (double)(scores.readLength * scores.load)); } int alignmentScore(char* str1, char* str2, unsigned int readLength, const char* sc) { int score = 0; for (unsigned int i = 0; i < readLength; i++) { // sum of mismatched scores if (str1[i] != str2[i]) { int baseQuality = (int)sc[i]; if ( baseQuality > 0) { score += baseQuality; } } } return(score); } void printCommaSepScoresStr(unsigned int readlength, const char* qScores, char* qScoresStr) { char scoreStr[MAX_LINE]; int scoreStrLength = 0; unsigned int i = 0, j = 0; for (i = 0, j = 0; i < readlength; i++, j+=scoreStrLength) { sprintf(scoreStr, "%d,", (int)qScores[i]); scoreStrLength = (int)strlen(scoreStr); sprintf(&(qScoresStr[j]), "%s", scoreStr); } if (j > 0) j--; // removed the last ',' qScoresStr[j] = '\0'; } ./Source/SeedPattern.cpp0000644011075700120610000012741111720654362015331 0ustar yanghochmath-ar#include "stdafx.h" #include "SeedPattern.h" // This function return the hash value of first 13 bits inline unsigned int getF0SeedHashValue(WORD_SIZE encodedRead) { const WORD_SIZE HASH_MASK = 0x1fff; return ((unsigned int)(encodedRead & HASH_MASK)); // Get the first 13 digit } unsigned int getF0SeedHashValue(CReadInBits r) { WORD_SIZE hashValue = getF0SeedHashValue(r.UpperBits); hashValue <<= BITS_FOR_HASHING; hashValue += getF0SeedHashValue(r.LowerBits); return ((unsigned int)hashValue); } ptHashFunc selectF0(int readlength) { if (BITS_FOR_HASHING <= (unsigned int) readlength ) { if (readlength <= 32) { return &getF0SeedHashValue; } else if (readlength <= 48) 
{ LOG_INFO("\nInfo %d: Use S1 seed instead %d\n", WARNING_LOG, readlength); return &getF1SeedHashValue; } else if (readlength <= 64) { LOG_INFO("\nInfo %d: Use S2 seed instead %d\n", WARNING_LOG, readlength); return &getF2SeedHashValue; } } else { // TODO deal with reads shorter than 13. ; } LOG_INFO("\nInfo %d: No hash function defined for read with length %d\n", WARNING_LOG, readlength); return (NULL); } // private function called by its overloaded getF1SeedHashKey(WORD_SIZE encoedRead, WORD_SIZE encoedRead); inline unsigned int getF0SeedKey(WORD_SIZE encoedRead, int keyWeight) { // Use the suffixLength bits based on the seed pattern 11*1**(111*1**)(111*1**)1 to create a value if (keyWeight > 0 ) { WORD_SIZE HASH_MASK = ((0x01 << keyWeight) - 1); WORD_SIZE uiKey = encoedRead & HASH_MASK; return ((unsigned int)uiKey); } else { return(0); } } unsigned int getF0SeedKey(CReadInBits r, int keyWeight) { r.UpperBits >>= BITS_FOR_HASHING; r.LowerBits >>= BITS_FOR_HASHING; unsigned int uiKey1 = getF0SeedKey(r.UpperBits, keyWeight); unsigned int uiKey2 = getF0SeedKey(r.LowerBits, keyWeight); #ifdef EXTEND_SEED unsigned int uiKey = InterleaveBits((unsigned short) uiKey2, (unsigned short) uiKey1); #else unsigned int uiKey = (uiKey1 << keyWeight) + uiKey2; #endif return (uiKey); } //////////////////////////////////////////////////////////////////////////////// inline unsigned int getF1SeedHashValue15(WORD_SIZE encodedRead) { WORD_SIZE hashValue = 0; hashValue += (encodedRead & 0x07); // Get the first 3 digit hashValue <<= 3; encodedRead >>= 4; hashValue += (encodedRead & 0x07); // Get the first 4-6 digit hashValue <<= 3; encodedRead >>= 4; hashValue += (encodedRead & 0x07); // Get the first 7-9 digit hashValue <<= 3; encodedRead >>= 4; return (unsigned int) hashValue; } // This function is based on the seed pattern (111*). 
inline unsigned int getF1SeedHashValue(WORD_SIZE encodedRead) { // To save the number of CPU instructions, the value encoded bits that doesn't follow the order of reads // It still gets the first digit and the fifth digit.. and so on to get total 13 digits WORD_SIZE hashValue = 0; hashValue += (encodedRead & 0x07); // Get the first 3 digit hashValue <<= 3; encodedRead >>= 4; hashValue += (encodedRead & 0x07); // Get the first 4-6 digit hashValue <<= 3; encodedRead >>= 4; hashValue += (encodedRead & 0x07); // Get the first 7-9 digit hashValue <<= 3; encodedRead >>= 4; hashValue += (encodedRead & 0x07); // Get the first 11-12 digit hashValue <<= 1; encodedRead >>= 4; hashValue += (encodedRead & 0x01); // Get the 13th position return (unsigned int) hashValue; } unsigned int getF1SeedHashValue15(CReadInBits r) { WORD_SIZE hashValue = getF1SeedHashValue15(r.UpperBits); hashValue <<= BITS_FOR_HASHING; hashValue += getF1SeedHashValue15(r.LowerBits); return ((unsigned int)hashValue); } unsigned int getF1SeedHashValue(CReadInBits r) { WORD_SIZE hashValue = getF1SeedHashValue(r.UpperBits); hashValue <<= BITS_FOR_HASHING; hashValue += getF1SeedHashValue(r.LowerBits); return ((unsigned int)hashValue); } ptHashFunc selectF1(int readlength) { if (15 <= readlength && readlength < 20) { return &getF1SeedHashValue15; } else if (20 <= readlength && readlength <= 60) { return &getF1SeedHashValue; } else { LOG_INFO("\nInfo %d: No hash function defined for read with length %d\n", WARNING_LOG, readlength); return (NULL); } } // private function callse by its overloaded getF1SeedHashKey(WORD_SIZE encoedRead, WORD_SIZE encoedRead); inline unsigned int getF1SeedKey(WORD_SIZE encoedRead, int keyWeight) { //index 234567 8901234 5678901 2 // Use the suffixLength bits based on the seed pattern 11*1**(111*1**)(111*1**)1 to create a value if (keyWeight > 0 ) { WORD_SIZE uiKey = encoedRead & 0x03; // Get the 14 and 15 digit encoedRead >>= 3; for (keyWeight -= 2; keyWeight > 0; keyWeight -= 3) { 
uiKey += encoedRead & 0x07; uiKey <<= 3; encoedRead >>= 4; } uiKey >>= (-1 * keyWeight); return ((unsigned int)uiKey); } else { return(0); } } unsigned int getF1SeedKey(CReadInBits r, int keyWeight) { r.UpperBits >>= 20; // The first 20 base are used as hash key (111*)(111*)(111*)(111*)1 r.LowerBits >>= 20; unsigned int uiKey1 = getF1SeedKey(r.UpperBits, keyWeight); unsigned int uiKey2 = getF1SeedKey(r.LowerBits, keyWeight); #ifdef EXTEND_SEED unsigned int uiKey = InterleaveBits((unsigned short) uiKey2, (unsigned short) uiKey1); #else unsigned int uiKey = (uiKey1 << keyWeight) + uiKey2; #endif return (uiKey); } //////////////////////////////////////////////////////////////////////////////// // This function is based on the seed pattern (111*1**)(111*1**)(111*1**)1. inline unsigned int getF2SeedHashValue(WORD_SIZE encodedRead) { // To save the number of CPU instructions, the value encoded bits that doesn't follow the order of reads // It still gets the first 3 digit and the fifth digit.. 
and so on to get total 13 digits WORD_SIZE hashValue = (encodedRead & 0x07) << 1; // Get the first 3 digit encodedRead >>= 4; hashValue += (encodedRead & 0x01); // Get the first 4th digit in the 5th position hashValue <<= 3; encodedRead >>= 3; hashValue += (encodedRead & 0x07); // Get the 5-7th digit hashValue <<= 1; encodedRead >>= 4; hashValue += (encodedRead & 0x01); // Get 8th digit hashValue <<= 3; encodedRead >>= 3; hashValue += (encodedRead & 0x07); // Get 9-11th digit hashValue <<= 1; encodedRead >>= 4; hashValue += (encodedRead & 0x01); // Get 12th digit hashValue <<= 1; encodedRead >>= 3; hashValue += (encodedRead & 0x01); // Get the 13th position return (unsigned int) hashValue; } unsigned int getF2SeedHashValue(CReadInBits r) { WORD_SIZE hashValue = getF2SeedHashValue(r.UpperBits); hashValue <<= BITS_FOR_HASHING; hashValue += getF2SeedHashValue(r.LowerBits); return ((unsigned int)hashValue); } unsigned int getF2SeedHashValue4ReadLength25_27(CReadInBits r) { const int weight = 12; const int eliminatedCarePossitionsNo = 1; unsigned int hashValue = getF2SeedHashValue(r.UpperBits); hashValue >>= eliminatedCarePossitionsNo; hashValue <<= weight; hashValue += (getF2SeedHashValue(r.LowerBits) >> eliminatedCarePossitionsNo); return hashValue; } unsigned int getF2SeedHashValue4ReadLength23_24(CReadInBits r) { const int weight = 11; const int eliminatedCarePossitionsNo = 2; unsigned int hashValue = getF2SeedHashValue(r.UpperBits); hashValue >>= eliminatedCarePossitionsNo; hashValue <<= weight; hashValue += (getF2SeedHashValue(r.LowerBits) >> eliminatedCarePossitionsNo); return hashValue; } unsigned int getF2SeedHashValue4ReadLength22(CReadInBits r) { const int weight = 10; const int eliminatedCarePossitionsNo = 3; unsigned int hashValue = getF2SeedHashValue(r.UpperBits); hashValue >>= eliminatedCarePossitionsNo; hashValue <<= weight; hashValue += (getF2SeedHashValue(r.LowerBits) >> eliminatedCarePossitionsNo); return hashValue; } unsigned int 
getF2SeedHashValue4ReadLength21(CReadInBits r) { const int weight = 9; const int eliminatedCarePossitionsNo = 4; unsigned int hashValue = getF2SeedHashValue(r.UpperBits); hashValue >>= eliminatedCarePossitionsNo; hashValue <<= weight; hashValue += (getF2SeedHashValue(r.LowerBits) >> eliminatedCarePossitionsNo); return hashValue; } unsigned int getF2SeedHashValue4ReadLength20(CReadInBits r) { const int weight = 8; const int eliminatedCarePossitionsNo = 5; unsigned int hashValue = getF2SeedHashValue(r.UpperBits); hashValue >>= eliminatedCarePossitionsNo; hashValue <<= weight; hashValue += (getF2SeedHashValue(r.LowerBits) >> eliminatedCarePossitionsNo); return hashValue; } ptHashFunc selectF2(int readlength) { if (readlength > 27) { return &getF2SeedHashValue; } else if (25 <= readlength && readlength <= 27) { return &getF2SeedHashValue4ReadLength25_27; } else if (23 <= readlength && readlength <= 24) { return &getF2SeedHashValue4ReadLength23_24; } else if (readlength == 22) { return &getF2SeedHashValue4ReadLength22; } else if (readlength == 21) { return &getF2SeedHashValue4ReadLength21; } else if (readlength == 20) { return &getF2SeedHashValue4ReadLength20; } else { LOG_INFO("\nInfo %d: No hash function defined for read with length %d\n", WARNING_LOG, readlength); return (NULL); } } // private function callse by its overloaded getF2SeedHashKey(WORD_SIZE encoedRead, WORD_SIZE encoedRead); inline unsigned int getF2SeedKey(WORD_SIZE encoedRead, int keyWeight) { //index 234567 8901234 5678901 2 // Use the suffixLength bits based on the seed pattern 11*1**(111*1**)(111*1**)1 to create a value WORD_SIZE uiKey = encoedRead & 0x03; // Get the 14 and 15 digit encoedRead >>= 3; uiKey <<= 1; // Get the 16 digit uiKey += (encoedRead & 0x01); for (keyWeight -= 3; keyWeight > 0; keyWeight -= 4) { encoedRead >>= 3; uiKey <<= 1; // Get 17th + 4i digit uiKey += (encoedRead & 0x01); encoedRead >>= 1; uiKey <<= 1; // Get 18th + 4i digit uiKey += (encoedRead & 0x01); encoedRead >>= 1; 
uiKey <<= 1; // Get 19th + 4i digit uiKey += (encoedRead & 0x01); encoedRead >>= 2; // don't care uiKey <<= 1; // Get 20th + 4i digit uiKey += (encoedRead & 0x01); } uiKey >>= (-1 * keyWeight); return ((unsigned int)uiKey); } unsigned int getF2SeedKey(CReadInBits r, int keyWeight) { r.UpperBits >>= 22; // The first 22 base are used as hash key (111*1**)(111*1**)(111*1**)1 r.LowerBits >>= 22; unsigned int uiKey1 = getF2SeedKey(r.UpperBits, keyWeight); unsigned int uiKey2 = getF2SeedKey(r.LowerBits, keyWeight); #ifdef EXTEND_SEED unsigned int uiKey = InterleaveBits((unsigned short) uiKey2, (unsigned short) uiKey1); #else unsigned int uiKey = (uiKey1 << keyWeight) + uiKey2; #endif return (uiKey); } //////////////////////////////////////////////////////////////////////////////// inline unsigned int getS1_1SeedHashValue(WORD_SIZE Bits) { // For read length >= 32 (SOLiD Read length 33), it has corresponding seed with weight >= 13 // 32 - (10-1) = 23 // (1111**1***) (1111**1***) 111 WORD_SIZE hashValue = 0; hashValue += (Bits & 0x0f); // Get the first 4 digit Bits >>= 6; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 5 th digit Bits >>= 4; hashValue <<= 4; hashValue += (Bits & 0x0f); // Get the first 6-9 digit Bits >>= 6; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 10 digit Bits >>= 4; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 11 digit Bits >>= 1; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 12 digit Bits >>= 1; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 13 digit return ((unsigned int)hashValue); } unsigned int getS1_1SeedHashValue(CReadInBits r) { unsigned int hashValue = getS1_1SeedHashValue(r.UpperBits); hashValue <<= BITS_FOR_HASHING; // BITS_FOR_HASHING is a fixed bits for 13 hashValue += getS1_1SeedHashValue(r.LowerBits); return hashValue; } // Get first 12 care bits and concatenate the key together. 
unsigned int getS1_1SeedHashValue4ReadLength31(CReadInBits r)
{
    // Drop the last packed bit of each 13-bit half, leaving 12 bits per half.
    const int droppedCareBits = 1;
    const int weight = 12;
    const unsigned int upper = getS1_1SeedHashValue(r.UpperBits) >> droppedCareBits;
    const unsigned int lower = getS1_1SeedHashValue(r.LowerBits) >> droppedCareBits;
    return (upper << weight) + lower;
}

// S1_1 hash reduced to 11 bits per half (last two packed bits dropped), then
// the UpperBits half is placed above the LowerBits half.
unsigned int getS1_1SeedHashValue4ReadLength30(CReadInBits r)
{
    const int droppedCareBits = 2;
    const int weight = 11;
    const unsigned int upper = getS1_1SeedHashValue(r.UpperBits) >> droppedCareBits;
    const unsigned int lower = getS1_1SeedHashValue(r.LowerBits) >> droppedCareBits;
    return (upper << weight) + lower;
}

// S1_1 hash reduced to 10 bits per half (last three packed bits dropped), then
// the UpperBits half is placed above the LowerBits half.
unsigned int getS1_1SeedHashValue4ReadLength26_29(CReadInBits r)
{
    const int droppedCareBits = 3;
    const int weight = 10;
    const unsigned int upper = getS1_1SeedHashValue(r.UpperBits) >> droppedCareBits;
    const unsigned int lower = getS1_1SeedHashValue(r.LowerBits) >> droppedCareBits;
    return (upper << weight) + lower;
}

// Get first 9 care bits and concatenate the key together.
unsigned int getS1_1SeedHashValue4ReadLength23_25(CReadInBits r) { const int weight = 9; const int eliminatedCarePossitionsNo = 4; unsigned int hashValue = getS1_1SeedHashValue(r.UpperBits); hashValue >>= eliminatedCarePossitionsNo; hashValue <<= weight; hashValue += (getS1_1SeedHashValue(r.LowerBits) >> eliminatedCarePossitionsNo); return hashValue; } ptHashFunc selectS1_1(int readlength) { if (readlength > 31) { return &getS1_1SeedHashValue; } else if (readlength == 31) { return &getS1_1SeedHashValue4ReadLength31; } else if (readlength == 30) { return &getS1_1SeedHashValue4ReadLength30; } else if (26 <= readlength && readlength <= 29) { return &getS1_1SeedHashValue4ReadLength26_29; } else if (23 <= readlength && readlength <= 25) { return &getS1_1SeedHashValue4ReadLength23_25; } else { LOG_INFO("\nInfo %d: No hash function defined for read with length %d\n", WARNING_LOG, readlength); return (NULL); } } inline unsigned int getS1_1SeedKey(WORD_SIZE Bits, int keyWeight) { // 1**1*** 1111**1*** 111 WORD_SIZE uiKey = 0; // For read lenght = 33 to 35, keyWeight == 1 uiKey += (Bits & 0x01); // Get the first 14 digit if (keyWeight > 1) { Bits >>= 3; uiKey <<= 1; uiKey += (Bits & 0x01); // Get the first 15 th digit, for read length 36 to 39 for (keyWeight -= 2; keyWeight > 0; keyWeight -= 5) { // For read lenght == 40 Bits >>= 4; uiKey <<= 1; uiKey += (Bits & 0x01); // Get 16 th + 5i; Bits >>= 1; uiKey <<= 1; uiKey += (Bits & 0x01); // Get 17 th + 5i; Bits >>= 1; uiKey <<= 1; uiKey += (Bits & 0x01); // Get 18 th + 5i; Bits >>= 1; uiKey <<= 1; uiKey += (Bits & 0x01); // Get 19 th + 5i; Bits >>= 3; uiKey <<= 1; uiKey += (Bits & 0x01); // Get 20 th + 5i; } uiKey >>= (-1 * keyWeight); } return ((unsigned int)uiKey); } unsigned int getS1_1SeedKey(CReadInBits r, int keyWeight) { // The seed pattern is (1111**1***)(1111**1***)111 // The first 13 care position, that is the first 23 positions are used for hash value. 
// keyWeight number of base used to generate key for binary search if (keyWeight > 0) { r.UpperBits >>= 23; r.LowerBits >>= 23; unsigned int uiKey1 = getS1_1SeedKey(r.UpperBits, keyWeight); unsigned int uiKey2 = getS1_1SeedKey(r.LowerBits, keyWeight); #ifdef EXTEND_SEED unsigned int uiKey = InterleaveBits((unsigned short) uiKey2, (unsigned short) uiKey1); #else unsigned int uiKey = (uiKey1 << keyWeight) + uiKey2; #endif return uiKey; } else return (0); } //////////////////////////////////////////////////////////////////////////////// inline unsigned int getS2_0SeedHashValue(WORD_SIZE Bits) { // For read length >= 35 or more, weight is 13. (Note SOliD read 35 actually has only color 34) // (1111**1****)(1111**1****)111 WORD_SIZE hashValue = 0; hashValue += (Bits & 0x0f); // Get the first 4 digit Bits >>= 6; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 5 digit Bits >>= 5; hashValue <<= 4; hashValue += (Bits & 0x0f); // Get the first 6-9 digit Bits >>= 6; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 10 digit Bits >>= 5; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 11 digit Bits >>= 1; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 12 digit Bits >>= 1; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 13 digit return ((unsigned int)hashValue); } unsigned int getS2_0SeedHashValue(CReadInBits r) { WORD_SIZE hashValue = getS2_0SeedHashValue(r.UpperBits); hashValue <<= BITS_FOR_HASHING; // BITS_FOR_HASHING is a fixed bits for 13 hashValue += getS2_0SeedHashValue(r.LowerBits); return ((unsigned int)hashValue); } inline unsigned int getS2_0SeedHashValue4ReadLength34(WORD_SIZE Bits) { // For read 34, the seed length is 24. 
// (1111**1****)(1111**1****)11 WORD_SIZE hashValue = 0; hashValue += (Bits & 0x0f); // Get the first 4 digit Bits >>= 6; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 5 digit Bits >>= 5; hashValue <<= 4; hashValue += (Bits & 0x0f); // Get the first 6-9 digit Bits >>= 6; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 10 digit Bits >>= 5; hashValue <<= 2; hashValue += (Bits & 0x03); // Get the first 11 - 12th digit return ((unsigned int)hashValue); } // For Solid Read, the read length typical read length is 34. // The seed weight for it is 12 unsigned int getS2_0SeedHashValue4ReadLength34(CReadInBits r) { const int weight = 12; WORD_SIZE hashValue = getS2_0SeedHashValue4ReadLength34(r.UpperBits); hashValue <<= weight; hashValue += getS2_0SeedHashValue4ReadLength34(r.LowerBits); return ((unsigned int)hashValue); } inline unsigned int getS2_0SeedHashValue4ReadLength33(WORD_SIZE Bits) { // For read 33, the seed length is 23. // (1111**1****)(1111**1****)1 WORD_SIZE hashValue = 0; hashValue += (Bits & 0x0f); // Get the first 4 digit Bits >>= 6; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 5 digit Bits >>= 5; hashValue <<= 4; hashValue += (Bits & 0x0f); // Get the first 6-9 digit Bits >>= 6; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 10 digit Bits >>= 5; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 11th digit return ((unsigned int)hashValue); } // For read length is 33, the seed weight for it is 11 unsigned int getS2_0SeedHashValue4ReadLength33(CReadInBits r) { const int weight = 11; WORD_SIZE hashValue = getS2_0SeedHashValue4ReadLength33(r.UpperBits); hashValue <<= weight; hashValue += getS2_0SeedHashValue4ReadLength33(r.LowerBits); return ((unsigned int)hashValue); } inline unsigned int getS2_0SeedHashValue4ReadLength28_32(WORD_SIZE Bits) { // For read 32, the seed length is 22. The weight is 10 // (1111**1****)(1111**1****) // For read 28, the seed length is 18. 
The weight is 10 // (1111**1****)(1111**1) WORD_SIZE hashValue = 0; hashValue += (Bits & 0x0f); // Get the first 4 digit Bits >>= 6; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 5 digit Bits >>= 5; hashValue <<= 4; hashValue += (Bits & 0x0f); // Get the first 6-9 digit Bits >>= 6; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 10 digit return ((unsigned int)hashValue); } // Hash Value for read length 28 to 32. Get the first 10 care bits and concatenate them together unsigned int getS2_0SeedHashValue4ReadLength28_32(CReadInBits r) { const int weight = 10; WORD_SIZE hashValue = getS2_0SeedHashValue4ReadLength28_32(r.UpperBits); hashValue <<= weight; hashValue += getS2_0SeedHashValue4ReadLength28_32(r.LowerBits); return ((unsigned int)hashValue); } inline unsigned int getS2_0SeedHashValue4ReadLength25_27(WORD_SIZE Bits) { // For read 25, the seed length is 15. The weight is 9 // (1111**1****)(1111) WORD_SIZE hashValue = 0; hashValue += (Bits & 0x0f); // Get the first 4 digit Bits >>= 6; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 5 digit Bits >>= 5; hashValue <<= 4; hashValue += (Bits & 0x0f); // Get the first 6-9 digit return ((unsigned int)hashValue); } // Hash Value for read length 25. 
Get the first 9 care bits and concatenate them together unsigned int getS2_0SeedHashValue4ReadLength25_27(CReadInBits r) { const int weight = 9; WORD_SIZE hashValue = getS2_0SeedHashValue4ReadLength25_27(r.UpperBits); hashValue <<= weight; hashValue += getS2_0SeedHashValue4ReadLength25_27(r.LowerBits); return ((unsigned int)hashValue); } ptHashFunc selectS2_0(int readlength) { if (readlength > 34) { return &getS2_0SeedHashValue; } else if (readlength == 34) { return &getS2_0SeedHashValue4ReadLength34; } else if (readlength == 33) { return &getS2_0SeedHashValue4ReadLength33; } else if (28 <= readlength && readlength <= 32) { return &getS2_0SeedHashValue4ReadLength28_32; } else if (25 <= readlength && readlength <= 27) { return &getS2_0SeedHashValue4ReadLength25_27; } else { LOG_INFO("\nInfo %d: No hash function defined for read with length %d\n", WARNING_LOG, readlength); return (NULL); } } inline unsigned int getS2_0SeedKey(WORD_SIZE Bits, int keyWeight) { // 1(**1****)1111**1****) WORD_SIZE uiKey = (Bits & 0x01);; // 14th bit for (keyWeight -= 1; keyWeight > 0; keyWeight -= 5) { Bits >>= 3; uiKey <<= 1; uiKey += (Bits & 0x01); // 15th bit Bits >>= 5; uiKey <<= 1; uiKey += (Bits & 0x01); // 16th bit Bits >>= 1; uiKey <<= 1; uiKey += (Bits & 0x01); // 17th bit Bits >>= 1; uiKey <<= 1; uiKey += (Bits & 0x01); // 18th bit Bits >>= 1; uiKey <<= 1; uiKey += (Bits & 0x01); // 19th bit } uiKey >>= (-1 * keyWeight); return ((unsigned int)uiKey); } unsigned int getS2_0SeedKey(CReadInBits r, int keyWeight) { // The seed pattern is (1111**1****)(11111**1****) // The first 13 care position, that is the first 25 position are used for hash value if (keyWeight > 0) { r.UpperBits >>= 25; r.LowerBits >>= 25; unsigned int uiKey1 = getS2_0SeedKey(r.UpperBits, keyWeight); unsigned int uiKey2 = getS2_0SeedKey(r.LowerBits, keyWeight); #ifdef EXTEND_SEED unsigned int uiKey = InterleaveBits((unsigned short) uiKey2, (unsigned short) uiKey1); #else unsigned int uiKey = (uiKey1 << keyWeight) 
+ uiKey2; #endif return uiKey; } else return (0); } inline unsigned int getS2_0SeedKey4ReadLength34(WORD_SIZE Bits, int keyWeight) { keyWeight = 0; // 11(**1****)1 WORD_SIZE uiKey = (Bits & 0x03);; // 13 14th bit Bits >>= 4; uiKey <<= 1; uiKey += (Bits & 0x01); // 15th bit Bits >>= 5; uiKey <<= 1; uiKey += (Bits & 0x01); // 16th bit return ((unsigned int)uiKey); } // EXTEND_SEED unsigned int getS2_0SeedKey4ReadLength34(CReadInBits r, int keyWeight) { // The seed pattern is (1111**1****)(1111**1****)11 // The first 13 care position, that is the first 25 position are used for hash value if (keyWeight > 0) { r.UpperBits >>= 24; r.LowerBits >>= 24; unsigned int uiKey1 = getS2_0SeedKey4ReadLength34(r.UpperBits, keyWeight); unsigned int uiKey2 = getS2_0SeedKey4ReadLength34(r.LowerBits, keyWeight); unsigned int uiKey = InterleaveBits((unsigned short) uiKey2, (unsigned short) uiKey1); return uiKey; } else return (0); } //////////////////////////////////////////////////////////////////////////////// inline unsigned int getF3SeedHashValue(WORD_SIZE Bits) { // To save the number of CPU instructions, the value enocoded bits that doesn't follow the order of reads // It still gets the first 3 digit and the fivth digit.. 
and so on to get total 13 digits WORD_SIZE hashValue = 0; hashValue += (Bits & 0x07); // Get the first 3 digit Bits >>= 4; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 4th digit Bits >>= 3; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 5th digit Bits >>= 4; hashValue <<= 3; hashValue += (Bits & 0x07); // Get the first 6-8 digit Bits >>= 4; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 9th digit Bits >>= 3; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 10th digit Bits >>= 4; hashValue <<= 3; hashValue += (Bits & 0x07); // Get the first 11-13th digit return ((unsigned int)hashValue); } unsigned int getF3SeedHashValue(CReadInBits r) { WORD_SIZE hashValue = getF3SeedHashValue(r.UpperBits); hashValue <<= BITS_FOR_HASHING; hashValue += getF3SeedHashValue(r.LowerBits); return ((unsigned int)hashValue); } inline unsigned int getF3SeedHashValue4ReadLength34(WORD_SIZE Bits) { // To save the number of CPU instructions, the value enocoded bits that doesn't follow the order of reads // It still gets the first 3 digit and the fivth digit.. and so on to get total 13 digits WORD_SIZE hashValue = 0; hashValue += (Bits & 0x07); // Get the first 3 digit Bits >>= 4; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 4th digit Bits >>= 3; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 5th digit Bits >>= 4; hashValue <<= 3; hashValue += (Bits & 0x07); // Get the first 6-8 digit Bits >>= 4; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 9th digit Bits >>= 3; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 10th digit Bits >>= 4; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 11th digit Bits >>= 1; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 12th digit return ((unsigned int)hashValue); } unsigned int getF3SeedHashValue4ReadLength34(CReadInBits r) { // The seed length is (34 - 10) = 24. // // (11101001000 11101001000 11. 
Has weight only 12 const int weight = 12; WORD_SIZE hashValue = getF3SeedHashValue4ReadLength34(r.UpperBits); hashValue <<= weight; hashValue += getF3SeedHashValue4ReadLength34(r.LowerBits); return ((unsigned int)hashValue); } unsigned int getF3SeedHashValue4ReadLength33(CReadInBits r) { // The seed length is (33 - 10) = 23. // // (11101001000 11101001000 1. Has weight only 11 const int eliminatedCarePossitionsNo = 1; const int weight = 11; unsigned int hashValue = getF3SeedHashValue4ReadLength34(r.UpperBits); hashValue >>= eliminatedCarePossitionsNo; hashValue <<= weight; hashValue += (getF3SeedHashValue4ReadLength34(r.LowerBits) >> eliminatedCarePossitionsNo); return hashValue; } inline unsigned int getF3SeedHashValue4ReadLength29_32(WORD_SIZE Bits) { // The seed length is (32 - 10) = 22. // (11101001000 11101001000. Has weight only 11 WORD_SIZE hashValue = 0; hashValue += (Bits & 0x07); // Get the first 3 digit Bits >>= 4; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 4th digit Bits >>= 3; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 5th digit Bits >>= 4; hashValue <<= 3; hashValue += (Bits & 0x07); // Get the first 6-8 digit Bits >>= 4; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 9th digit Bits >>= 3; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the first 10th digit return ((unsigned int)hashValue); } // Get first 10 care bits and concatenate together. unsigned int getF3SeedHashValue4ReadLength29_32(CReadInBits r) { // The seed length is (29 - 10) = 19. // 11101001000 11101001 Has weight only 10 const int weight = 10; WORD_SIZE hashValue = getF3SeedHashValue4ReadLength29_32(r.UpperBits); hashValue <<= weight; hashValue += getF3SeedHashValue4ReadLength29_32(r.LowerBits); return ((unsigned int)hashValue); } // Get first 9 care bits and concatenate together. unsigned int getF3SeedHashValue4ReadLength26_28(CReadInBits r) { // The seed length is (26 - 10) = 16. 
// 11101001000 11101 Has weight only 9 const int eliminatedCarePossitionsNo = 1; const int weight = 9; unsigned int hashValue = getF3SeedHashValue4ReadLength29_32(r.UpperBits); hashValue >>= eliminatedCarePossitionsNo; hashValue <<= weight; hashValue += (getF3SeedHashValue4ReadLength29_32(r.LowerBits) >> eliminatedCarePossitionsNo); return hashValue; } // Get first 8 care bits and concatenate together. unsigned int getF3SeedHashValue4ReadLength25(CReadInBits r) { // The seed length is (25 - 10) = 15. // 11101001000 11101 Has weight only 8 const int eliminatedCarePossitionsNo = 2; const int weight = 8; unsigned int hashValue = getF3SeedHashValue4ReadLength29_32(r.UpperBits); hashValue >>= eliminatedCarePossitionsNo; hashValue <<= weight; hashValue += (getF3SeedHashValue4ReadLength29_32(r.LowerBits) >> eliminatedCarePossitionsNo); return hashValue; } ptHashFunc selectF3(int readlength) { if (readlength > 34) { return &getF3SeedHashValue; } else if (readlength == 34) { return &getF3SeedHashValue4ReadLength34; } else if (readlength == 33) { return &getF3SeedHashValue4ReadLength33; } else if (29 <= readlength && readlength <= 32) { return &getF3SeedHashValue4ReadLength29_32; } else if (26 <= readlength && readlength <= 28) { return &getF3SeedHashValue4ReadLength26_28; } else { LOG_INFO("\nInfo %d: No hash function defined for read with length %d\n", WARNING_LOG, readlength); return (NULL); } } unsigned int getF3SeedKey(WORD_SIZE Bits, int keyWeight) { // Use the suffixLength bits based on the seed pattern (11101001000 11101001000 1110 ) (100 1000 1110) (100 1000 1110) to create a value WORD_SIZE uiKey = 0; for (; keyWeight > 0; keyWeight -= 5) { uiKey <<= 1; uiKey += (Bits & 0x01); Bits >>= 3; uiKey <<= 1; uiKey += (Bits & 0x01); Bits >>= 4; uiKey <<= 1; uiKey += (Bits & 0x01); Bits >>= 1; uiKey <<= 1; uiKey += (Bits & 0x01); Bits >>= 1; uiKey <<= 1; uiKey += (Bits & 0x01); // for read length 47 (Possition 36) Bits >>= 2; } uiKey >>= (-1 * keyWeight); return ((unsigned 
int)uiKey); } unsigned int getF3SeedKey(CReadInBits r, int keyWeight) { if (keyWeight > 0) { // The seed pattern is 11101001000 11101001000 111 01 // The first 13 care position, that is the first 25 position are used for hash value r.UpperBits >>= 26; r.LowerBits >>= 26; unsigned int uiKey1 = getF3SeedKey(r.UpperBits, keyWeight); unsigned int uiKey2 = getF3SeedKey(r.LowerBits, keyWeight); #ifdef EXTEND_SEED unsigned int uiKey = InterleaveBits((unsigned short) uiKey2, (unsigned short) uiKey1); #else unsigned int uiKey = (uiKey1 << keyWeight) + uiKey2; #endif return uiKey; } else { return(0); } } unsigned int getF3SeedKey4ReadLength34(WORD_SIZE Bits, int keyWeight) { // Use the suffixLength bits based on the seed pattern (11101001000 11101001000 11)10 100 1000 1 to create a value // the keyWeight under extended seed is 12 + 4. (Because the maximium weight is 16 add 4) keyWeight = 0; WORD_SIZE uiKey = 0; uiKey += (Bits & 0x01); uiKey <<= 1; Bits >>= 2; uiKey += (Bits & 0x01); uiKey <<= 1; Bits >>= 3; uiKey += (Bits & 0x01); uiKey <<= 1; Bits >>= 4; uiKey += (Bits & 0x01); return ((unsigned int)uiKey); } unsigned int getF3SeedKey4ReadLength34(CReadInBits r, int keyWeight) { if (keyWeight > 0) { // Use the suffixLength bits based on the seed pattern (11101001000 11101001000 11)10 100 1000 1 to create a value // the keyWeight under extended seed is 12 + 4. (Because the maximum weight is 16 add 4) r.UpperBits >>= 24; r.LowerBits >>= 24; unsigned int uiKey1 = getF3SeedKey4ReadLength34(r.UpperBits, keyWeight); unsigned int uiKey2 = getF3SeedKey4ReadLength34(r.LowerBits, keyWeight); unsigned int uiKey = InterleaveBits((unsigned short) uiKey2, (unsigned short) uiKey1); return uiKey; } else { return(0); } } unsigned int getF3SeedKey4ReadLength32(WORD_SIZE Bits, int keyWeight) { // Use the suffixLength bits based on the seed pattern (11101001000 11101001000) 1110 100 1000 // the keyWeight under extended seed is 10 + 5. 
(Because the maximum weight is 10 add 5) WORD_SIZE uiKey = 0; uiKey += (Bits & 0x07); uiKey <<= 1; Bits >>= 4; uiKey += (Bits & 0x01); uiKey <<= 1; Bits >>= 3; uiKey += (Bits & 0x01); return ((unsigned int)uiKey); } unsigned int getF3SeedKey4ReadLength32(CReadInBits r, int keyWeight) { // Use the suffixLength bits based on the seed pattern (11101001000 11101001000) 1110 100 1000 // the keyWeight under extended seed is 10 + 5. (Because the maximum weight is 10 add 5) if (keyWeight > 0) { r.UpperBits >>= 22; r.LowerBits >>= 22; unsigned int uiKey1 = getF3SeedKey4ReadLength32(r.UpperBits, keyWeight); unsigned int uiKey2 = getF3SeedKey4ReadLength32(r.LowerBits, keyWeight); unsigned int uiKey = InterleaveBits((unsigned short) uiKey2, (unsigned short) uiKey1); return uiKey; } else { return(0); } } //////////////////////////////////////////////////////////////////////////////// inline unsigned int getS1_2SeedHashValue(WORD_SIZE Bits) { // To save the number of CPU instructions, the value encoded bits that doesn't follow the order of reads // It still gets the first 3 digit and the fifth digit.. 
and so on to get total 13 digits // (11110010000000)(11110010000000)(111) WORD_SIZE hashValue = 0; hashValue += (Bits & 0x0f); // Get the first 4 digit Bits >>= 6; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the 5th digit Bits >>= 8; hashValue <<= 4; hashValue += (Bits & 0x0f); // Get the first 6-9th digit Bits >>= 6; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the 10th digit Bits >>= 8; hashValue <<= 3; hashValue += (Bits & 0x07); // Get the first 11-13 digit return ((unsigned int)hashValue); } unsigned int getS1_2SeedHashValue(CReadInBits r) { WORD_SIZE hashValue = getS1_2SeedHashValue(r.UpperBits); hashValue <<= BITS_FOR_HASHING; hashValue += getS1_2SeedHashValue(r.LowerBits); return ((unsigned int)hashValue); } unsigned int getS1_2SeedKey4ReadLength46_49(WORD_SIZE Bits, int keyWeight) { // 1001 WORD_SIZE uiKey = 0; uiKey = Bits & 0x01; if (keyWeight == 1) return ((unsigned int)uiKey); else { uiKey <<= 1; Bits >>= 3; uiKey += Bits & 0x01; return ((unsigned int)uiKey); } } // EXTEND_SEED unsigned int getS1_2SeedKey4ReadLength46_49(CReadInBits r, int keyWeight) { // The seed pattern is 11110010000000 11110010000000 111 100100 // The first 13 care position, that is the first 31 position are used for hash value r.UpperBits >>= 31; // The first 21 base are used as hash key (111*1**)(111*1**)(111*1**)1 r.LowerBits >>= 31; unsigned int uiKey1 = getS1_2SeedKey4ReadLength46_49(r.UpperBits, keyWeight); unsigned int uiKey2 = getS1_2SeedKey4ReadLength46_49(r.LowerBits, keyWeight); #ifdef EXTEND_SEED unsigned int uiKey = InterleaveBits((unsigned short) uiKey2, (unsigned short) uiKey1); #else unsigned int uiKey = (uiKey1 << keyWeight) + uiKey2; #endif return uiKey; } //////////////////////////////////////////////////////////////////////////////// inline unsigned int getF4SeedHashValue(WORD_SIZE Bits) { // To save the number of CPU instructions, the value encoded bits that doesn't follow the order of reads //50 - 9 = 41 // 
(1100010000)(1100010000)(1100010000)(1100010000) 1 The weight is 13 WORD_SIZE hashValue = 0; hashValue += (Bits & 0x03); // Get the first 2 digit Bits >>= 5; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the 3th digit Bits >>= 5; hashValue <<= 2; hashValue += (Bits & 0x03); // Get the 4 and 5th digit Bits >>= 5; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the 6th digit Bits >>= 5; hashValue <<= 2; hashValue += (Bits & 0x03); // Get the first 7 and 8th digit Bits >>= 5; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the 9th digit Bits >>= 5; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the 10th digit Bits >>= 1; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the 11th digit Bits >>= 4; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the 12th digit Bits >>= 5; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the 13th digit return ((unsigned int)hashValue); } // Get first 13 care bits and concatenate together, with read length = 50 unsigned int getF4SeedHashValue(CReadInBits r) { // The seed length is 50 - (10 - 1) = 41 // (1100010000)(1100010000)(1100010000) 1100010000. The weight is 13 const int weight = 13; unsigned int hashValue = getF4SeedHashValue(r.UpperBits); hashValue <<= weight; hashValue += getF4SeedHashValue(r.LowerBits); return hashValue; } // Get first 12 care bits and concatenate together. unsigned int getF4SeedHashValue4ReadLength45_49(CReadInBits r) { // The seed length is 45 - (10 - 1) = 36 // (1100010000)(1100010000)(1100010000) 110001 The weight is 12 const int eliminatedCarePossitionsNo = 1; const int weight = 12; unsigned int hashValue = getF4SeedHashValue(r.UpperBits); hashValue >>= eliminatedCarePossitionsNo; hashValue <<= weight; hashValue += (getF4SeedHashValue(r.LowerBits) >> eliminatedCarePossitionsNo); return hashValue; } // Get first 11 care bits and concatenate together. 
unsigned int getF4SeedHashValue4ReadLength41_44(CReadInBits r) { // The seed length is 41 - (10 - 1) = 32 // (1100010000)(1100010000)(1100010000) 11 The weight is 11 const int eliminatedCarePossitionsNo = 2; const int weight = 11; unsigned int hashValue = getF4SeedHashValue(r.UpperBits); hashValue >>= eliminatedCarePossitionsNo; hashValue <<= weight; hashValue += (getF4SeedHashValue(r.LowerBits) >> eliminatedCarePossitionsNo); return hashValue; } // Get first 10 care bits and concatenate together. unsigned int getF4SeedHashValue4ReadLength40(CReadInBits r) { // The seed length is 40 - (10 - 1) = 31 // (1100010000)(1100010000)(1100010000) 1 The weight is 10 const int eliminatedCarePossitionsNo = 3; const int weight = 10; unsigned int hashValue = getF4SeedHashValue(r.UpperBits); hashValue >>= eliminatedCarePossitionsNo; hashValue <<= weight; hashValue += (getF4SeedHashValue(r.LowerBits) >> eliminatedCarePossitionsNo); return hashValue; } inline unsigned int getF4SeedHashValue4ReadLength35_39(WORD_SIZE Bits) { // The seed length is 35 - (10 - 1) = 26; 39 - (10 - 1) = 30 // (1100010000)(1100010000)(1100010000) The weight is 9 WORD_SIZE hashValue = 0; hashValue += (Bits & 0x03); // Get the first 2 digit Bits >>= 5; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the 3th digit Bits >>= 5; hashValue <<= 2; hashValue += (Bits & 0x03); // Get the 4 and 5th digit Bits >>= 5; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the 6th digit Bits >>= 5; hashValue <<= 2; hashValue += (Bits & 0x03); // Get the first 7 and 8th digit Bits >>= 5; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the 9th digit return ((unsigned int)hashValue); } // Get first 9 care bits and concatenate together. 
unsigned int getF4SeedHashValue4ReadLength35_39(CReadInBits r) { // The seed length is 35 - (10 - 1) = 26; 39 - (10 - 1) = 30 // (1100010000)(1100010000)(110001) The weight is 9 const int SeedWeight = 9; WORD_SIZE hashValue = getF4SeedHashValue4ReadLength35_39(r.UpperBits); hashValue <<= SeedWeight; hashValue += getF4SeedHashValue4ReadLength35_39(r.LowerBits); return ((unsigned int)hashValue); } inline unsigned int getF4SeedHashValue4ReadLength31_34(WORD_SIZE Bits) { // The seed length is 34 - (10 - 1) = 25; 31 - (10 - 1) = 22 // (1100010000)(1100010000)(11000) The weight is 8 WORD_SIZE hashValue = 0; hashValue += (Bits & 0x03); // Get the first 2 digit Bits >>= 5; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the 3th digit Bits >>= 5; hashValue <<= 2; hashValue += (Bits & 0x03); // Get the 4 and 5th digit Bits >>= 5; hashValue <<= 1; hashValue += (Bits & 0x01); // Get the 6th digit Bits >>= 5; hashValue <<= 2; hashValue += (Bits & 0x03); // Get the first 7 and 8th digit return ((unsigned int)hashValue); } // Get first 8 care bits and concatenate together. 
unsigned int getF4SeedHashValue4ReadLength31_34(CReadInBits r) { // The seed length is 34 - (10 - 1) = 25; 31 - (10 - 1) = 22 // (1100010000)(1100010000)(11000) The weight is 8 const int SeedWeight = 8; WORD_SIZE hashValue = getF4SeedHashValue4ReadLength31_34(r.UpperBits); hashValue <<= SeedWeight; hashValue += getF4SeedHashValue4ReadLength31_34(r.LowerBits); return ((unsigned int)hashValue); } ptHashFunc selectF4(int readlength) { if (readlength >= 50) { return &getF4SeedHashValue; // weight 13 } else if (45 <= readlength && readlength <= 49) { return &getF4SeedHashValue4ReadLength45_49; // weight 12 } else if (41 <= readlength && readlength <= 44) { return &getF4SeedHashValue4ReadLength41_44; // weight 10 } else if (readlength == 40) { return &getF4SeedHashValue4ReadLength40; // weight 9 } else if (35 <= readlength && readlength <= 39) { return &getF4SeedHashValue4ReadLength35_39; // weight 8 } else if (31 <= readlength && readlength <= 34) { return &getF4SeedHashValue4ReadLength31_34; // weight 8 } else { LOG_INFO("\nInfo %d: No hash function defined for read with length %d\n", WARNING_LOG, readlength); return (NULL); } } unsigned int getNoOfCaredPositions(const char* caSeedRepeat, unsigned int uiReadLength) { unsigned int carePositions = 0; unsigned int seedRepeatLength = (unsigned int)strlen(caSeedRepeat); for (unsigned int i = 0; i < uiReadLength - seedRepeatLength + 1; i++) { if (caSeedRepeat[ i % seedRepeatLength ] == '1') carePositions++; } return(carePositions); } unsigned int getNoOfCaredPositions4FullRead(const char* caSeedRepeat, unsigned int uiReadLength) { unsigned int carePositions = 0; unsigned int seedRepeatLength = (unsigned int)strlen(caSeedRepeat); for (unsigned int i = 0; i < uiReadLength; i++) { if (caSeedRepeat[ i % seedRepeatLength ] == '1') carePositions++; } return(carePositions); } ./Source/ShortReadUtil.cpp0000644011075700120610000001443411720654362015644 0ustar yanghochmath-ar#include "stdafx.h" #include "ShortReadUtil.h" void 
toUpperCase(char* caArray, int length) { int i; for (i = 0; i < length; i++) { caArray[i] = (char)toupper(caArray[i]); } } char* mutateRead(char* Kmer , unsigned int No_of_mutation) { //This function simply generate mutations in a given string //To guarantee exactly No_of_mutation loci have been changed, transfer the original nt + a random number between 1-3 const int MAX_MUTATION = 64; int mutatedPosition[MAX_MUTATION]; unsigned int l = (unsigned int)strlen(Kmer); if (No_of_mutation > l) { cout << "More mutation than it could be" << endl; No_of_mutation = l; } //Pick the mutation position in advance and avoid picking the same position for (unsigned int i = 0; i < No_of_mutation; i++) { mutatedPosition[i] = rand() % l; for (unsigned int j = 0; j < i; j++) { if (mutatedPosition[i] == mutatedPosition[j]) { i--; break; } } } //Mutated the selected base for (unsigned int i = 0; i < No_of_mutation; i++) { int position = mutatedPosition[i]; mutateBase(&Kmer[position]); } return(Kmer); } void mutateBase(char* Base) { int nt = rand() % 3 + 1; switch (*Base) { case 'A': case 'a': nt = (nt + 0) % 4; break; case 'C': case 'c': nt = (nt + 1) % 4; break; case 'G': case 'g': nt = (nt + 2) % 4; break; case 'T': case 't': nt = (nt + 3) % 4; break; default: ;//cout<3', destroy the original kmer char* reverseComplementKmer(char* Kmer) { if (Kmer != NULL) { unsigned int length = 0; length = (unsigned int)strlen(Kmer); unsigned int i; for (i = 0; i < length / 2; i++) { swap(Kmer[i], Kmer[length-1-i]); } for (i = 0; i < length; i++) { Kmer[i] = complimentBase(Kmer[i]); } } return(Kmer); } unsigned int strComp(char* str1, char* str2, int l) { int i; unsigned int miscounter = 0; for (i = 0; i < l; i++) { if (str1[i] != str2[i]) miscounter++; } return(miscounter); } /* * This function compare two string. 
It lower-case the character in the * second string, if it mismatches the corresponding position in the first * string */ unsigned int strCompMarkDiff(char* str1, char* str2) { unsigned int iDiff = 0; for (int i = 0; str1[i] != '\0' && str2[i] != '\0'; i++) { if (str1[i] != str2[i]) { iDiff++; str2[i] = (char)tolower(str2[i]); } } return(iDiff); } /* return the # of wild-card mismatches bases between the two read */ unsigned int diNtStrWildCardComp(char* read1, char* read2, unsigned int readlength) { unsigned int uiDiff = 0; for (unsigned int i = 0; i < readlength ; i++) { if (!diNtWildCardComp(read1[i], read2[i])) { uiDiff++; } } return(uiDiff); } bool isDummyRead(const char* read, int dummyT) { if (dummyT > 0) { int dummyCount = 1; if (read[0] == '\0') { return(false); } for ( int i = 1; read[i] != '\0'; i++) { if (read[i] != read[i-1]) { dummyCount = 1; } else { dummyCount++; } if (dummyCount >= dummyT) { break; } } return(dummyCount >= dummyT); } else { return(false); } } char getBaseFromColors(char nt, const char* colors, int pos) { for (int i = 0; i < pos; i++) { if (is0123(colors[i])) { // TODO The function should be named as color2base // TODO It should be moved to color space read nt = base2color(nt, colors[i]); } else { return('N'); } } return(nt); }./Source/TestChromosomeNTdata.cpp0000644011075700120610000000340611720654362017157 0ustar yanghochmath-ar#include "TestChromosomeNTdata.h" TestChromosomeNTdata::TestChromosomeNTdata(const char* testInputChrFileN, const char* testOutputChrFileN) { cout << "Start TestChromosomeNTdata" << endl; bool bFasta = true; unsigned int ntPerLine = 96; // TODO manually test generateTestInput(testInputChrFileN); cout << "generateTestInput" << endl; CchromosomeNTdata* chr = new CchromosomeNTdata(testInputChrFileN, bFasta); cout << "Got Chr" << endl; outputFasta(testOutputChrFileN, ntPerLine, chr->caChromosome); cout << "end" << endl; delete chr; } TestChromosomeNTdata::~TestChromosomeNTdata(void) { } int 
TestChromosomeNTdata::outputFasta(const char* filename, unsigned int ntPerLine, const char* ntStr) { int lineCounter = 0; ofstream ofile(filename); int length = (int)strlen(ntStr); for(int ntCount = strlen(ntStr); ntCount > 0; ntCount = ntCount - ntPerLine) { for(int i = 0; i < ntPerLine; i++) { int index = lineCounter * ntPerLine + i; char c = ntStr[index]; if( c != EOF && c != '\0' && index != length) { ofile << c; } else { ofile << "\n"; break; } } ofile << endl; lineCounter ++; } ofile.close(); return(0); } int TestChromosomeNTdata::generateTestInput(const char* filename) { // defaultNtPerLine = 48; int lineNo = 500000; const char* line = "ACGTNNAnaCGTNNCnACGTNNGnAcGTNNTnACGTNNAnACgTNNCnACGtNNGnaCGTNNTnAcGTNNAnACgTNNCnACgTNNGnACGtNNTn"; ofstream ofile(filename); ofile << ">Test Ref" << endl; for(int i = 0; i < lineNo; i++) { ofile << line << endl; } ofile.close(); return(0); }./Source/TestGenome_Index_TableQ.cpp0000644011075700120610000003640111720654362017542 0ustar yanghochmath-ar#include "TestGenome_Index_TableQ.h" SimulateLongRead::SimulateLongRead(CGenomeInBits* pgenomeNTInBits, unsigned int startIndex) { unsigned int halfLength = this->uiReadLength / 2; this->goodRead = pgenomeNTInBits->fragACGTKmerInBits(this->half1st, startIndex, halfLength); this->goodRead = this->goodRead & (pgenomeNTInBits->fragACGTKmerInBits(half2nd, startIndex + halfLength, halfLength)); if (goodRead) { bool bOddReadLength = (this->uiReadLength % 2 == 1); decodeLongRead(this->half1st, this->half2nd, this->read, bOddReadLength); } } SimulateLongRead::~SimulateLongRead(void) { } void introduceMutation(char* kmer, int mutationPattern) { if (mutationPattern == 2 || mutationPattern == 3 || mutationPattern == 4) { mutateRead(kmer, mutationPattern); } else if (mutationPattern == (int)FULL_SENSITIVE_OPT_TO_ONE_BASE_ONE_COLOR_MIS) { mutatePairsOfConsecutiveBases(kmer, 1); mutateRead(kmer, 1); } else if (mutationPattern == (int)FULL_SENSITIVE_OPT_TO_TWO_BASE_MIS ) { 
mutatePairsOfConsecutiveBases(kmer, 2); } else if (mutationPattern == (int)FULL_SENSITIVE_OPT_TO_ONE_BASE_TWO_COLOR_MIS) { mutatePairsOfConsecutiveBases(kmer, 1); mutateRead(kmer, 2); } } /* int getTestChrId(CGenome_Index_TableQ* table) { } const char* getTestChr(CGenome_Index_TableQ* table) { }*/ /* * Test function which mutates every sliding windows on the reference genome and check if the all hits are found */ bool testGenome_Index_TableQ(CGenome_Index_TableQ* table) { bool pass = true; CAlignmentsQ alignmentsQ('B'); CchromosomeNTdata* testChr; unsigned int testChrId; if (table->pgenomeNT->iNo_of_chromosome > 1) { testChrId = 1; // If there are more than one chromosome, test the second one. testChr = table->pgenomeNT->paChromosomes[1]; } else { testChrId = 0; testChr = table->pgenomeNT->paChromosomes[0]; } unsigned int testGenomeIdStart = table->pgenomeNT->chrIndex2genomelocusID(testChrId, 0); unsigned int testGenomeIdEnd = table->pgenomeNT->chrIndex2genomelocusID(testChrId, 0) + testChr->iChromosome_size - table->uiRead_Length; for (unsigned int i = testGenomeIdStart; i < testGenomeIdEnd; i++) { CReadInBits slideWindows; bool goodRead; goodRead = table->pgenomeNTInBits->fragACGTKmerInBits(slideWindows, i, table->uiRead_Length); if (table->bMapReadInColors) { slideWindows = bases2Colors(slideWindows); } if (goodRead) { char read[wordSize + 1]; char originalRead[wordSize + 1]; slideWindows.decode(read); CReadInBits readInBits(read); alignmentsQ.clearHits(); // (1) Test exact match if (table->bMapReadInColors) { table->queryReadColors(readInBits, alignmentsQ, true, true); } else { table->queryReadBases(readInBits, alignmentsQ, true, true); } if (alignmentsQ.MinDiff != 0) { cout << "Miss exact match " << i << endl; pass = false; PRINT_MASKED_REPERAT_FLAG(table, i); } // (2) Test mutated reads myStrCpy(originalRead, read, MAX_READ_LENGTH); introduceMutation(read, table->chosenSeedId); CReadInBits mReadInBits(read); alignmentsQ.clearHits(); if 
(table->bMapReadInColors) { table->queryReadColors(mReadInBits, alignmentsQ, true, true); } else { table->queryReadBases(mReadInBits, alignmentsQ, true, true); } if (alignmentsQ.MinDiff > table->uiSubDiffThreshold) { cout << "Miss mutated read" << i << endl; pass = false; // PRINT_MASKED_REPERAT_FLAG(table, i); } else if (alignmentsQ.MinDiff < table->uiSubDiffThreshold) { // cout << "Found better alignment!" << endl; } // (3) Check reversed complement reads if (table->bMapReadInColors) { table->pgenomeNTInBits->fragACGTKmerInBits(slideWindows, i, table->uiRead_Length); CReadInBits rcRead = reverseCompliment(table->uiRead_Length, slideWindows); CReadInBits rcReadInColors = bases2Colors(rcRead); rcReadInColors.decode(read); myStrCpy(originalRead, read, MAX_READ_LENGTH); introduceMutation(read, table->chosenSeedId); CReadInBits rcMreadInBits(read); alignmentsQ.clearHits(); table->queryReadColors(rcMreadInBits, alignmentsQ, true, false); } else { reverseComplementKmer(read); CReadInBits rcMreadInBits(read); alignmentsQ.MinDiff = table->uiRead_Length; // SET A LARGE NUMBER table->queryReadBases(rcMreadInBits, alignmentsQ, true, false); } if (alignmentsQ.MinDiff > table->uiSubDiffThreshold) { cout << "Miss reversed mutated read" << i << endl; pass = false; // PRINT_MASKED_REPERAT_FLAG(table, i); } else if (alignmentsQ.MinDiff < table->uiSubDiffThreshold) { // cout << "Found better alignment!" << endl; } } } return(pass); } bool testMappingLongRead(CGenome_Index_TableQ* table) { bool bOddReadLength = false; // Currently only test even long read bool pass = true; CAlignmentsQ alignmentsQ('B'); CchromosomeNTdata* testChr; unsigned int testChrId; if (table->pgenomeNT->iNo_of_chromosome > 1) { testChrId = 1; // If there are more than one chromosome, test the second one. 
testChr = table->pgenomeNT->paChromosomes[1]; } else { testChrId = 0; testChr = table->pgenomeNT->paChromosomes[0]; } const unsigned halfLength = table->uiRead_Length; unsigned int testGenomeIdStart = table->pgenomeNT->chrIndex2genomelocusID(testChrId, 0); unsigned int testGenomeIdEnd = table->pgenomeNT->chrIndex2genomelocusID(testChrId, 0) + testChr->iChromosome_size - halfLength; for (unsigned int i = testGenomeIdStart; i < testGenomeIdEnd; i++) { CReadInBits half1st, half2nd; bool goodRead; goodRead = table->pgenomeNTInBits->fragACGTKmerInBits(half1st, i, halfLength); goodRead = goodRead & (table->pgenomeNTInBits->fragACGTKmerInBits(half2nd, i + halfLength, halfLength)); if (goodRead) { char read[MAX_LINE]; char originalRead[MAX_LINE]; decodeLongRead(half1st, half2nd, read, bOddReadLength); alignmentsQ.clearHits(); // (1) Test exact match table->queryLongReadBases(half1st, half2nd, bOddReadLength, alignmentsQ, 1, true, true); if (alignmentsQ.MinDiff != 0) { cout << "Miss exact match " << i << endl; pass = false; PRINT_MASKED_REPERAT_FLAG(table, i); } // (2) Test mutated reads myStrCpy(originalRead, read, MAX_LINE); introduceMutation(read, table->chosenSeedId * 2); encodeLongRead(read, half1st, half2nd); alignmentsQ.clearHits(); table->queryLongReadBases(half1st, half2nd, bOddReadLength, alignmentsQ, 1, true, true); if (alignmentsQ.MinDiff > table->uiSubDiffThreshold) { cout << "Miss mutated read" << i << endl; pass = false; // PRINT_MASKED_REPERAT_FLAG(table, i); } else if (alignmentsQ.MinDiff < table->uiSubDiffThreshold) { // cout << "Found better alignment!" 
<< endl; } // (3) Check reversed complement reads reverseComplementKmer(read); CReadInBits rcR1, rcR2; encodeLongRead(read, rcR1, rcR2); alignmentsQ.MinDiff = table->uiRead_Length; // SET A LARGE NUMBER table->queryLongReadBases(rcR1, rcR2, bOddReadLength, alignmentsQ, 1, true, false); table->queryLongReadBases(rcR1, rcR2, bOddReadLength, alignmentsQ, 2, true, false); if (alignmentsQ.MinDiff > table->uiSubDiffThreshold) { cout << "Miss reversed mutated read" << i << endl; pass = false; // PRINT_MASKED_REPERAT_FLAG(table, i); } else if (alignmentsQ.MinDiff < table->uiSubDiffThreshold) { // cout << "Found better alignment!" << endl; } } } return(pass); } bool testMappingLongSOLiDRead(CGenome_Index_TableQ* table) { bool bOddReadLength = false; bool pass = true; CAlignmentsQ alignmentsQ('B'); CchromosomeNTdata* testChr; unsigned int testChrId; if (table->pgenomeNT->iNo_of_chromosome > 1) { testChrId = 1; // If there are more than one chromosome, test the second one. testChr = table->pgenomeNT->paChromosomes[1]; } else { testChrId = 0; testChr = table->pgenomeNT->paChromosomes[0]; } const unsigned halfLength = table->uiRead_Length; unsigned int testGenomeIdStart = table->pgenomeNT->chrIndex2genomelocusID(testChrId, 0); unsigned int testGenomeIdEnd = table->pgenomeNT->chrIndex2genomelocusID(testChrId, 0) + testChr->iChromosome_size - halfLength; for (unsigned int i = testGenomeIdStart; i < testGenomeIdEnd; i++) { CReadInBits half1st, half2nd, half1stInColors, half2ndInColors; bool goodRead; goodRead = table->pgenomeNTInBits->fragACGTKmerInBits(half1st, i, halfLength); goodRead = goodRead & (table->pgenomeNTInBits->fragACGTKmerInBits(half2nd, i + halfLength, halfLength)); if (goodRead) { char read[MAX_LINE]; char originalRead[MAX_LINE]; decodeLongRead(half1st, half2nd, read, bOddReadLength); // TODO: encode each half in colors signals alignmentsQ.clearHits(); // (1) Test exact match table->queryLongReadColors(half1stInColors, half2ndInColors, bOddReadLength, alignmentsQ, 
1, true, true); if (alignmentsQ.MinDiff != 0) { cout << "Miss exact match " << i << endl; pass = false; PRINT_MASKED_REPERAT_FLAG(table, i); } // (2) Test mutated reads myStrCpy(originalRead, read, MAX_LINE); introduceMutation(read, table->chosenSeedId * 2); encodeLongRead(read, half1st, half2nd); // TODO: encode each half in colors signals alignmentsQ.clearHits(); table->queryLongReadColors(half1stInColors, half2ndInColors, bOddReadLength, alignmentsQ, 1, true, true); if (alignmentsQ.MinDiff > table->uiSubDiffThreshold) { cout << "Miss mutated read" << i << endl; pass = false; // PRINT_MASKED_REPERAT_FLAG(table, i); } else if (alignmentsQ.MinDiff < table->uiSubDiffThreshold) { // cout << "Found better alignment!" << endl; } // (3) Check reversed complement reads reverseComplementKmer(read); CReadInBits rcR1, rcR2; encodeLongRead(read, rcR1, rcR2); // TODO: encode each half in colors signals alignmentsQ.MinDiff = table->uiRead_Length; // SET A LARGE NUMBER table->queryLongReadBases(rcR1, rcR2, bOddReadLength, alignmentsQ, 1, true, false); table->queryLongReadBases(rcR1, rcR2, bOddReadLength, alignmentsQ, 2, true, false); if (alignmentsQ.MinDiff > table->uiSubDiffThreshold) { cout << "Miss reversed mutated read" << i << endl; pass = false; // PRINT_MASKED_REPERAT_FLAG(table, i); } else if (alignmentsQ.MinDiff < table->uiSubDiffThreshold) { // cout << "Found better alignment!" << endl; } } } return(pass); } bool testMappingLongPairedRead(CGenome_Index_TableQ* table) { bool bOddReadLength = false; bool pass = true; /* CAlignmentsQ alignmentsQ('B'); CchromosomeNTdata* testChr; unsigned int testChrId; if (table->pgenomeNT->iNo_of_chromosome > 1) { testChrId = 1; // If there are more than one chromosome, test the second one. 
testChr = table->pgenomeNT->paChromosomes[1]; } else { testChrId = 0; testChr = table->pgenomeNT->paChromosomes[0]; } int separation = 300; const unsigned halfLength = table->uiRead_Length; SimulateLongRead::uiReadLength = table->uiRead_Length * 2; unsigned int testGenomeIdStart = table->pgenomeNT->chrIndex2genomelocusID(testChrId, 0); unsigned int testGenomeIdEnd = table->pgenomeNT->chrIndex2genomelocusID(testChrId, 0) + testChr->iChromosome_size - halfLength = separation; for (unsigned int i = testGenomeIdStart; i < testGenomeIdEnd; i++) { CReadInBits half1st, half2nd, half1stInColors, half2ndInColors; SimulateLongRead r1(table->pgenomeNTInBits, i); SimulateLongRead r2(table->pgenomeNTInBits, i + separation); if (r1.goodRead && r2.goodRead) { char originalRead[MAX_LINE]; alignmentsQ.clearHits(); // (1) Test exact match table->queryLongReadColors(r1.half1st, r1.half2nd, bOddReadLength, alignmentsQ, 1, true, true); if (alignmentsQ.MinDiff != 0) { cout << "Miss exact match " << i << endl; pass = false; PRINT_MASKED_REPERAT_FLAG(table, i); } // (2) Test mutated reads myStrCpy(originalRead, r.read, MAX_LINE); introduceMutation(read, table->chosenSeedId * 2); encodeLongRead(read, half1st, half2nd); // TODO: encode each half in colors signals alignmentsQ.clearHits(); table->queryLongReadColors(half1stInColors, half2ndInColors, bOddReadLength, alignmentsQ, 1, true, true); if (alignmentsQ.MinDiff > table->uiSubDiffThreshold) { cout << "Miss mutated read" << i << endl; pass = false; // PRINT_MASKED_REPERAT_FLAG(table, i); } else if (alignmentsQ.MinDiff < table->uiSubDiffThreshold) { // cout << "Found better alignment!" 
<< endl; } // (3) Check reversed complement reads reverseComplementKmer(read); CReadInBits rcR1, rcR2; encodeLongRead(read, rcR1, rcR2); // TODO: encode each half in colors signals alignmentsQ.MinDiff = table->uiRead_Length; // SET A LARGE NUMBER table->queryLongReadBases(rcR1, rcR2, bOddReadLength, alignmentsQ, 1, true, false); table->queryLongReadBases(rcR1, rcR2, bOddReadLength, alignmentsQ, 2, true, false); if (alignmentsQ.MinDiff > table->uiSubDiffThreshold) { cout << "Miss reversed mutated read" << i << endl; pass = false; // PRINT_MASKED_REPERAT_FLAG(table, i); } else if (alignmentsQ.MinDiff < table->uiSubDiffThreshold) { // cout << "Found better alignment!" << endl; } } } */ return(pass); }./Source/bitsOperationUtil.cpp0000644011075700120610000000602211720654362016565 0ustar yanghochmath-ar#include "stdafx.h" #include "bitsOperationUtil.h" /* static const unsigned char BitReverseTable256[] = { 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, 0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, 0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE, 0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1, 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD, 
0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, 0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, 0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB, 0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7, 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF }; unsigned int reverse32bits(unsigned int word) { unsigned int c; unsigned char * p = (unsigned char *) &word; unsigned char * q = (unsigned char *) &c; q[3] = BitReverseTable256[p[0]]; q[2] = BitReverseTable256[p[1]]; q[1] = BitReverseTable256[p[2]]; q[0] = BitReverseTable256[p[3]]; return c; } unsigned long long reverse64bits(unsigned long long word) { //get the lower 32bits first unsigned long long returnValue = (unsigned long long) reverse32bits((unsigned int) (0xffffffff & word)); returnValue <<= 32; //shift to upper bits word >>= 32; //Shift upper bits to lower bits. returnValue += (unsigned long long) reverse32bits((unsigned int) 0xffffffff & word); return (returnValue); } void printBinaryString(unsigned long long word, unsigned int length) { for(unsigned int i = 0; i < length ; i++) { if((word & 0x01) == 1) { cout << '1'; } else { cout << '0'; } word >>= 1; } cout << '\n' << endl; } */ void printBinaryString(unsigned long long word, unsigned int length) { for (unsigned int i = 0; i < length ; i++) { if ((word & 0x01) == 1) { cout << '1'; } else { cout << '0'; } word >>= 1; } cout << '\n' << endl; } ./Source/boolFlagArray.cpp0000644011075700120610000000326111720654362015633 0ustar yanghochmath-ar#include "boolFlagArray.h" CboolFlagArray::CboolFlagArray(void) { this->bflag = NULL; this->size = 0; } CboolFlagArray::~CboolFlagArray(void) { delete [] this->bflag; } //Watch out the size is counted as bits. 
so size should be No of bits/8+1 CboolFlagArray::CboolFlagArray(unsigned int size) { this->bflag = new unsigned char[size2sizeInByte(size)]; this->initialization(size); } bool CboolFlagArray::b(unsigned int index) const//return the flag { if (this->bflag != NULL && index < this->size) { int bitID = index % 8; //Save unsigned char c = this->bflag[index/8], d = 0x01; d = d << bitID; if (c & d) return(true); else return(false); } else return(0); } // if there is a flag set within the widows bool CboolFlagArray::b(unsigned int index, unsigned int windowLength) const { for (unsigned int i = index; i < index + windowLength; i++) { if (this->b(i)) { return(true); } } return(false); } void CboolFlagArray::setflag(unsigned int index, bool flag) { if (this->bflag != NULL && index < this->size) { int bitID = index % 8; unsigned char d = (0x01 << bitID); if (flag) { this->bflag[index/8] |= d; } else { d ^= 0xff; //Complement this->bflag[index/8] &= d; } } else cout << "Wrongly set Flag" << endl; } unsigned int CboolFlagArray::initialization(unsigned int size) { this->size = size; //Default Setting of flags are 0 -> false memset(this->bflag, 0x00, sizeof(unsigned char)*(size2sizeInByte(this->size))); return(size2sizeInByte(this->size)); } ./Source/chdir.cpp0000644011075700120610000001116711720654362014204 0ustar yanghochmath-ar#include "chdir.h" #ifdef _WIN32 #include #else #ifdef _WIN64 #include #else #define _chdir chdir #define _getcwd getcwd #include #endif #endif int goto_working_directory(const char* path) { int returnV; if (path[0] == '\0') { returnV = -1; } else { returnV = tosomedir(path); } /** currently no warning message if(returnV == -1) { cout << "Stay in the current directory" << endl; print_working_directory(); } */ return(returnV); } char* get_working_directory(char* path) { if (_getcwd(path, FILENAME_MAX) == NULL) { perror("_getcwd error"); path[0] = '\0'; } return(path); } string get_working_directory(void) { char path[FILENAME_MAX]; if 
(get_working_directory(path)) { return(string(path)); } return(string("")); } bool is_accessible_directory(const char * path) { bool isAccessibleDir = false; string cwd = get_working_directory(); if (goto_working_directory(path) == 0) { isAccessibleDir = true; } // go back to original dir goto_working_directory(cwd.c_str()); return(isAccessibleDir); } int print_working_directory(void) { char path[FILENAME_MAX]; if (get_working_directory(path)) { cout << "\nThe working directory is " << path << endl; } return(0); } int toparentdir(void) { int i, l; char path[FILENAME_MAX]; if (_getcwd(path, FILENAME_MAX) == NULL) perror("_getcwd error"); l = (int)strlen(path); for (i = l - 1; i >= 0; i--) { #ifdef WIN32 if (path[i] == '\\') #else if (path[i] == '/') #endif break; else if (path[i] == ':') { printf("Already in the root"); return(-1); } } path[i] = '\0'; if (_chdir(path)) { printf("Unable to locate the directory: %s\n", path); return(-1); } else if (path[i-1] == ':') { printf("Disk root"); return(0);// in the root } else { return(1); } } int tochilddir(const char* dirname) { int l; char path[FILENAME_MAX]; if (_getcwd(path, FILENAME_MAX) == NULL) perror("_getcwd error"); l = (int)strlen(path); #ifdef WIN32 path[l] = '\\'; #elif defined _WIN64 path[l] = '\\'; #else path[l] = '/'; #endif strcpy(&path[l+1], dirname); if (_chdir(path)) { printf("Unable to locate the directory: %s\n", path); return(-1); } else { return(system("dir *.wri")); } } int tosomedir(const char* path) { if (_chdir(path)) { // printf("Unable to locate the directory: %s\n", path); return(-1); } else { return(0); } } int tosiblingdir(const char* dirname) { toparentdir(); tochilddir(dirname); return(0); } int str2Int(char* str) { int value; int i, j; //j is a value of a digit; for (value = 0, i = 0; (str[i] != '\0'); i++) { j = (int)(str[i] - '0'); if (j >= 0 && j <= 9) { value = value * 10 + j; } else { return(-1); } } return(value); } string pathLize(const char* path) //For windows { char 
dirPath[FILENAME_MAX]; #ifdef WIN32 strcpy(dirPath, path); #elif defined _WIN64 strcpy(dirPath, path); #else if (path[0] == '/') strcpy(dirPath, path); else sprintf(dirPath, "./%s", path); #endif return(string(dirPath)); } int createdir(const char* path) { struct stat st; #ifdef WIN32 if ( stat(path, &st) == 0 || \ _mkdir(path) == 0) { #else #ifdef _WIN64 if ( stat(path, &st) == 0 || \ _mkdir(path) == 0) { #else #define MODE_MASK 0777 string dirPath = pathLize(path); if ( stat(dirPath.c_str(), &st) == 0 || \ mkdir(dirPath.c_str(), MODE_MASK ) == 0) { #endif #endif return(0); } else { printf("Fail to create the dir %s", path); return(1); } } int deletedir(const char* path) { char systemcomand[FILENAME_MAX]; sprintf(systemcomand, "rmdir %s", path); return(system(systemcomand)); } int deletefile(const char* path) { char systemcomand[FILENAME_MAX]; sprintf(systemcomand, "del %s", path); return(system(systemcomand)); } char* getnamefrompath(char* Path, char* filename) { int i; for (i = (int)strlen(Path); i >= 0; i--) { #ifdef WIN32 if (Path[i] == '\\') #else if (Path[i] == '/') #endif { break; } } strcpy(filename, &(Path[i+1])); for (i = (int)strlen(filename); i > 0; i--) { if (filename[i] == '.') { filename[i] = '\0'; } } return(filename); } ./Source/chromosomeNTdata.cpp0000644011075700120610000002367611720654362016372 0ustar yanghochmath-ar#include "chromosomeNTdata.h" const int MAX_CHAR_PER_LINE = 5000; CchromosomeNTdata::CchromosomeNTdata(void) { this->initialization(); } CchromosomeNTdata::CchromosomeNTdata(const char* Filename, bool bFasta) { this->initialization(); LOG_INFO("Info %d: Enter %s\r", FINE_LOG, Filename); //Passing the address may have some problem so I use pointer and new if (hasTheExtName(Filename, ".fasta") || hasTheExtName(Filename, ".fna") || hasTheExtName(Filename, ".fa") || hasTheExtName(Filename, ".mfa") || hasTheExtName(Filename, ".dat") || bFasta) { this->Constructor_Fasta(Filename); } else { LOG_INFO("Info %d: %s is not parsed as a fasta 
file because of its ext name.\n", INFO_LOG, Filename); this->Consrructor_PreSeq(Filename); } //Change all the nt string to upper case toUpperCase(this->caChromosome, this->iChromosome_size); this->addFakeRefName(Filename); } CchromosomeNTdata::~CchromosomeNTdata(void) { delete [] this->caChromosome; } int CchromosomeNTdata::initialization(void) { this->caChromosome = NULL; this->SlideWindowStart = 0; this->iChromosome_size = 0; this->end = false; return(0); } int CchromosomeNTdata::Constructor_Fasta(const char* Filename) { filebuf *pbuf = NULL; long fileSize = 0; char *buffer = NULL; strcpy(caInputFileName, Filename); ifstream ifile(Filename); pbuf = ifile.rdbuf(); // fileSize is larger than this->iChromosome_size with ratio 41:40 in windows fileSize = pbuf->pubseekoff(0, ios::end, ios::in); this->iChromosome_size = fileSize; // GetsizeofChromsome will get exactly bp, which won't include space, but it is super slow DON"T USE IT // this->iChromosome_size = GetsizeofChromosome(Filename); this->caChromosome = new char[this->iChromosome_size + 1]; memset(this->caChromosome, 0x00, sizeof(char)*(this->iChromosome_size + 1)); this->caChromosome[0] = '\0'; pbuf->pubseekpos(0, ios::in); //Set to the beginning if ( this->iChromosome_size > 0) { time_t start, end; time(&start); // Load file directly to the buffer pbuf->sgetn(this->caChromosome, this->iChromosome_size); // move character within the array from i->j, to exclude bad char this->removedNonACGTNBaseAndCollectGeneName(); // Alternatively, read line by line is very slow. 
using this->readFastaFileLineByLine(&ifile); time(&end); LOG_INFO("Info %d: %u seconds consumed.\r", CONFIG_LOG, (unsigned int)(end - start)); } ifile.close(); delete[] buffer; return(this->iChromosome_size); } // Read line by line and concatenate // currently not used and need test int CchromosomeNTdata::readFastaFileLineByLine(ifstream &ifile) { char* pch; char caBuffer[MAX_CHAR_PER_LINE]; unsigned int length_counter = 0; ifile.getline(caBuffer, MAX_CHAR_PER_LINE - 1); pch = strtok(caBuffer, " ,\t|"); //This should be the name if (pch[0] == '>') { do { pch = NULL; pch = strtok(NULL, " ,\t"); //This should be the name } while (pch != NULL); // Get the information in the header lines. Assume each header is a tag of a new Gene. } do { ifile.getline(caBuffer, MAX_CHAR_PER_LINE - 1); length_counter += (int)strlen(caBuffer); strcat(this->caChromosome, caBuffer); } while (ifile.eof() == false && length_counter <= this->iChromosome_size); //double check this->caChromosome[length_counter] = '\0'; this->iChromosome_size = length_counter; return (0); } // This is the function that read-in pre-process sequence file by Sunje. int CchromosomeNTdata::Consrructor_PreSeq(const char* Filename) { FILE *fp = fopen(Filename, "r"); strcpy(caInputFileName, Filename); ch_header header; if (fp == NULL) { printf("fail to open %s contig file.\n", Filename); exit(-1); } size_t result; result = fread((void*)&header, sizeof(ch_header), 1, fp); if (result != sizeof(ch_header)) { LOG_INFO("Info %d: Unrecognize file format in %s.\n", ERROR_LOG, Filename); }; this->iChromosome_size = header.size; this->caChromosome = new char[this->iChromosome_size+1]; memset(this->caChromosome, 0x00, sizeof(char)*(this->iChromosome_size + 1)); //LOG_INFO("Info %d: Filename = %s\n", CONFIG_LOG, Filename); if (! 
// Private helper: copy one line out of a large NUL-terminated buffer.
//
// Copies characters from sourceBuf into destinationBuf until a '\n', an EOF
// marker or the terminating '\0' is reached, then NUL-terminates the copy.
// The delimiter itself is not copied. Returns the number of characters
// copied. destinationBuf must be large enough for the line plus its
// terminator; no bound is checked here.
//
// The '\0' test is the fix: without it, a last line with no trailing newline
// made the loop run past the end of the source buffer.
int sgetline(const char* sourceBuf, char* destinationBuf)
{
    int i = 0;
    while (sourceBuf[i] != '\0' && sourceBuf[i] != '\n' && sourceBuf[i] != EOF) {
        destinationBuf[i] = sourceBuf[i];
        i++;
    }
    destinationBuf[i] = '\0';
    return(i);
}
sgetline(&this->caChromosome[i], tagline); // get the geneName: skipping '>' and get first word char caGeneName[MAX_LINE]; caGeneName[0] = '\0'; sscanf(&tagline[1], "%s", caGeneName); if (strlen(caGeneName) == 0) { sprintf(caGeneName, "%s:%d", caGeneName, (int)this->geneVec.table.size()); } formatGeneName(caGeneName); if (j != 0) { // avoid adding N in the begining this->caChromosome[j] = 'N'; j++; // use an 'N' to separate the gene, to avoid mapping accrose the junction. } this->geneVec.table.push_back(CGene(string(caGeneName), j)); // record the gene name in a vector } else if (isACGT(this->caChromosome[i]) || this->caChromosome[i] == 'N' || this->caChromosome[i] == 'n') { this->caChromosome[j] = toupper(this->caChromosome[i]); j++; } else if (isNucleotide(this->caChromosome[i])) { this->caChromosome[j] = 'N'; // replace special nucleotide Symbol to 'N'; j++; } else if (this->caChromosome[i] == EOF) { this->caChromosome[j++] = 'N'; // add one more N break; } } if (j < i) { this->caChromosome[j++] = 'N'; // add one more N } this->caChromosome[j] = '\0'; this->iChromosome_size = j; return(j - i); } char* CchromosomeNTdata::fragKmer(unsigned int uiKmer_Length) { unsigned int window_end = this->SlideWindowStart + uiKmer_Length - 1; // Last possition of the sliding window if (window_end >= this->iChromosome_size || this->caChromosome[window_end] == '\0') { this->end = true; this->caKmer[0] = '\0'; } else { strncpy(this->caKmer, &(this->caChromosome[this->SlideWindowStart]), uiKmer_Length); this->caKmer[uiKmer_Length] = '\0'; this->SlideWindowStart++;//Only shift one } return(this->caKmer);//Simply return the } char* CchromosomeNTdata::fragACGTKmer(unsigned int uiKmer_Length) { unsigned int i = 0; while (i < uiKmer_Length) { if (isACGT(this->caChromosome[this->SlideWindowStart + i])) { this->caKmer[i] = this->caChromosome[this->SlideWindowStart + i]; i++; } else if ((int)i > _MAX_KMER_LENGTH_ ) { LOG_INFO("\nInfo %d: Buffer overflow.\n", WARNING_LOG); break; } else 
{//Meet some non ACGT base if (this->caChromosome[this->SlideWindowStart+i] == '\0' || this->SlideWindowStart + i >= this->iChromosome_size) { this->caKmer[0] = '\0'; this->end = true; return(this->caKmer);// The end of the chromosome } else { this->SlideWindowStart += (i + 1);//Skip the non ACGT base i = 0;//Start over again } } } this->caKmer[uiKmer_Length] = '\0'; this->SlideWindowStart++; return(this->caKmer);// The end of the chromosome } int CchromosomeNTdata::addFakeRefName(const char* filename) { if (this->geneVec.table.size() <= 0) { string refName = getBasename(filename).c_str(); LOG_INFO("Info %d: Use %s as the ref name\n", WARNING_LOG, refName.c_str()); CGene g(refName, 0); this->geneVec.table.push_back(g); } return(0); } ./Source/minQ.cpp0000644011075700120610000000175211720654362014016 0ustar yanghochmath-ar#include "minQ.h" #include minQ::minQ(void) { } minQ::~minQ(void) { } int minQ::push(int num) { this->q.push(num); while(this->min.size() > 0 && num < this->min.back()) { this->min.pop_back(); } this->min.push_back(num); return(this->q.size()); } int minQ::pop() { if(this->q.size() > 0) { int front = this->q.front(); this->q.pop(); if(this->min.front() == front) { this->min.pop_front(); } } else { cout << "empty que" << endl; } return(this->q.size()); } int minQ::front() { return(this->q.front()); } int minQ::getMin() { return(this->min.front()); } void testMinQ() { minQ q; q.push(2); cout << q.getMin() << endl; // 2 q.push(1); cout << q.getMin() << endl; // 1 q.push(3); q.pop(); cout << q.getMin() << endl; // 1 q.push(0); q.pop(); cout << q.getMin() << endl; // 0 } ./Source/refInBinFile.cpp0000644011075700120610000001173111720654362015404 0ustar yanghochmath-ar#include "refInBinFile.h" #ifndef REF_IN_BIN_FILE #define REF_IN_BIN_FILE const char* check_sum = "End_of_binary_reference"; int readGenomeInBits(FILE* fp, CGenomeInBits& gInBits) { int scannedItemNo = fscanf(fp, "%u\n%u\n", &(gInBits.uiGenomeLength), &(gInBits.uiGenomeLengthInWordSize)); if 
(scannedItemNo != 2) { printf("%d, %u, %u", scannedItemNo, gInBits.uiGenomeLength, gInBits.uiGenomeLengthInWordSize); ERR; } unsigned int gLength = gInBits.uiGenomeLength; unsigned int gLengthInWord = gInBits.uiGenomeLengthInWordSize; myFread(gInBits.pLowerBits, sizeof(WORD_SIZE), gLengthInWord, fp); myFread(gInBits.pUpperBits, sizeof(WORD_SIZE), gLengthInWord, fp); unsigned int sizeInByte = size2sizeInByte(gLength); myFread(gInBits.pNBits->bflag, sizeof(unsigned char), sizeInByte, fp); return(0); } int saveGenomeInBits(FILE* fp, const CGenomeInBits& gInBits) { fprintf(fp, "%u\n%u\n", gInBits.uiGenomeLength, gInBits.uiGenomeLengthInWordSize); unsigned int gLength = gInBits.uiGenomeLength; unsigned int gLengthInWord = gInBits.uiGenomeLengthInWordSize; myFwrite(gInBits.pLowerBits, sizeof(WORD_SIZE), gLengthInWord, fp); myFwrite(gInBits.pUpperBits, sizeof(WORD_SIZE), gLengthInWord, fp); unsigned int sizeInByte = size2sizeInByte(gLength); myFwrite(gInBits.pNBits->bflag, sizeof(unsigned char), sizeInByte, fp); return(0); } int readGenesNameVec(FILE* fp, vector& gNameV) { int noRefInVector = 0; if (fscanf(fp, "%d\n", &noRefInVector) == 0) { ERR; } gNameV.clear(); gNameV.reserve(noRefInVector); CGene g; for (int i = 0; i < noRefInVector; i++) { char geneName[FILENAME_MAX]; if (fscanf(fp, "%s\n%d\n", geneName, &(g.startIndex)) != 2) { ERR; } g.name = string(geneName); gNameV.push_back(g); } return(noRefInVector); } int saveGenesNameVec(FILE* fp, vector& gNameV) { int noRefInVector = (int)gNameV.size(); fprintf(fp, "%d\n", noRefInVector); vector::iterator it = gNameV.begin(); for (; it != gNameV.end(); it++) { fprintf(fp, "%s\n%d\n", it->name.c_str(), it->startIndex); } return(noRefInVector); } int readGIndexConvertTable(FILE* fp, CGenomeNTdata& g) { if (fscanf(fp, "%s\n%u\n%u\n", g.refName, &(g.iGenomeSize), &(g.iNo_of_chromosome)) != 3) { ERR; return(-1); } for (unsigned int i = 0; (i < g.iNo_of_chromosome) && (i < GENOME_CAPACITY); i++) { if (fscanf(fp, "%u\n", 
&(g.IndexCovertTable[i])) == 0) { ERR; return(-1); } } return(0); } int saveGIndexConvertTable(FILE* fp, const CGenomeNTdata& g) { fprintf(fp, "%s\n", g.refName); fprintf(fp, "%u\n", g.iGenomeSize); fprintf(fp, "%u\n", g.iNo_of_chromosome); for (unsigned int i = 0; (i < g.iNo_of_chromosome) && (i < GENOME_CAPACITY); i++) { fprintf(fp, "%u\n", g.IndexCovertTable[i]); } return(0); } int readRefInBinFile(FILE* fp, CGenomeInBits* gInBits, CGenomeNTdata* g) { if (gInBits == NULL || g == NULL) { ERR; return(1); } else { if (readGIndexConvertTable(fp, *g)) { ERR; return(1); } gInBits->allocBitStrSpace(g->iGenomeSize); if (readGenomeInBits(fp, *gInBits)) { ERR; return(1); } for (unsigned int i = 0; i < g->iNo_of_chromosome; i++) { // TODO save chromosome info if (g->paChromosomes[i] == NULL) { g->paChromosomes[i] = new CchromosomeNTdata(); } CchromosomeNTdata* pChr = g->paChromosomes[i]; if (fscanf(fp, "%s\n%u\n", pChr->caInputFileName, &(pChr->iChromosome_size)) != 2) { ERR; } if (readGenesNameVec(fp, pChr->geneVec.table) <= 0) { LOG_INFO("Info %d: ERR reading ref %d's names\n", ERROR_LOG, i); return(1); } } } if (!assertFile(fp, check_sum)) { ERR; } return(0); } int saveRefInBinFile(FILE* fp, const CGenomeInBits* gInBits, const CGenomeNTdata* g) { if (gInBits == NULL || g == NULL) { ERR; return(1); } else { if (saveGIndexConvertTable(fp, *g)) { ERR; return(1); } if (saveGenomeInBits(fp, *gInBits)) { ERR; return(1); } for (unsigned int i = 0; i < g->iNo_of_chromosome; i++) { CchromosomeNTdata* pChr = g->paChromosomes[i]; if (pChr == NULL) { ERR; return(1); } fprintf(fp, "%s\n", pChr->caInputFileName); fprintf(fp, "%u\n", pChr->iChromosome_size); if (saveGenesNameVec(fp, pChr->geneVec.table) <= 0) { LOG_INFO("Info %d: ERR saving ref %d's names\n", ERROR_LOG, i); return(1); } } } fprintf(fp, "%s\n", check_sum); return(0); } #endif ./Source/stdafx.cpp0000644011075700120610000000026311720654362014377 0ustar yanghochmath-ar// stdafx.cpp : source file that includes just the 
/*
 * Queue of alignments found for one read (it used to queue raw hits).
 * Each record is a hit start index (aiHitIndex) paired with its alignment
 * difference (asdiff). Depending on the queue flag it keeps the best set
 * (default), all alignments, or a single alignment.
 */
class CAlignmentsQ
{
public:
    CAlignmentsQ(unsigned int iMaxCapacity = MAX_Q_CAPACITY);
    CAlignmentsQ(char cFlag_of_Queue_All_Best_One, unsigned int iMaxCapacity = MAX_Q_CAPACITY);
    ~CAlignmentsQ(void);
    // Maximum number of records; pushHits drops records once load reaches it
    unsigned int iMaxCapacity;
    void setQueue_All_Best_OneFlag(char cFlag_of_Queue_All_Best_One);
    // When forward is true, remember the current load as the number of forward alignments
    void setForwardLoad(bool forward) {
        if (forward) ForwardAlignmentLoad = load;
    };
    char returnQueue_All_Best_OneFlag();
    unsigned int topHitsinList();
    unsigned int saveHits(unsigned int startindex, unsigned short diff);
    int replaceHits(unsigned int startindex, unsigned short diff);
    inline void pushHits(unsigned int startindex, unsigned short diff);
    inline bool checkHits(unsigned int startindex);
    int clearHits();
    int sortHitsByLocation();
    int filterAlignments(unsigned int mismatchThreshold, bool bKeepAllAlignmentsInThreshold);
    // True when the queue flag is 'A' (keep all alignments)
    inline bool qAllInThreshold(void) {
        return cFlag_of_Queue_All_Best_One == 'A';
    }
    static const unsigned int NULL_RECORD;
    static const unsigned short NULL_EDIT_DIS;
    int readID;
    CReadInBits read;
    char tag[MAX_TAG_LENGTH];
    const char* qualityScores;
    bool AmbiguousFlag;
    bool reverseIsBetter;
    // How many alignments are in the queue
    unsigned int load;
    // Assumes the first part of the records are forward alignments and the later part are reverse-complement alignments
    unsigned int ForwardAlignmentLoad;
    // Smallest difference seen so far (saveHits compares new records against it)
    unsigned int MinDiff;
    // NOTE(review): both arrays are released with delete[] in the destructor, and
    // no copy constructor or assignment is declared here - copying an instance
    // would presumably free them twice; confirm no caller copies this class.
    unsigned int* aiHitIndex;; // store the hit index
    unsigned short* asdiff; // store the alignment difference between the strings
private:
    int initialization(unsigned int iMaxCapacity = MAX_Q_CAPACITY);
    // cFlag_of_Queue_All_Best_One controls whether all alignment records are queued
    // or only the best set is kept (best is defined by the # of substitutions).
    char cFlag_of_Queue_All_Best_One;
};
*/ // The first base is encoded in first bit returned CReadInBits CReadInBits colors2Bases(CReadInBits readInColors); // Translate color read in B0123 format to CReadInBits bool encodeColors(const char* colorsStr, CReadInBits& readInColors); bool encodeColorsNas3(const char* colorsStr, CReadInBits& readInColors); // Translate color read in readInBits format to c_str in B0123 format char* decodeColors(char* colorsStr, CReadInBits readInColors); char* decodePureColors(char* colorsStr, CReadInBits readInColors); char* decodeLongColors(char* colorsStr, CReadInBits readInColors1stHalf, CReadInBits readInColors2ndHalf, bool oddRead); inline char* decodeColorReadWithPrimer(char* caRead, CReadInBits readInColors) { decodeColors(&(caRead[1]), readInColors); caRead[0] = caRead[1]; // Mimic the first base from the primer caRead[1] = '0'; return(caRead); } char* correctAndDecodeRead \ (CReadInBits readInColors, CReadInBits refInColors, bool correct, char* caRead, char* caQscore); const static char color_transfer_matrix[][4] = { { '0', '1' , '2', '3' }, { '1', '0' , '3', '2' }, { '2', '3', '0', '1' }, { '3', '2', '1', '0' } }; inline void basesStr2ColorStr(const char* readInBase, char* readInColor) { readInColor[0] = readInBase[0]; readInColor[1] = '0'; for (unsigned int i = 1; i < MAX_READ_LENGTH * 2; i++) { char b = readInBase[i - 1]; char c = readInBase[i]; if (c == '\0') { readInColor[i + 1] = '\0'; // TODO Be carefule about the input without '\0'; break; } else { readInColor[i + 1] = color_transfer_matrix[nt2Id(b)][nt2Id(c)]; } }// TODO Be carefule about the length 128bp input } // Encode the first base in the first bit inline CReadInBits bases2Colors(CReadInBits readInBase) { CReadInBits readInColor; //Note the first bits store the first bases of the read. 
The following bits are color transition readInColor.UpperBits = readInBase.UpperBits ^(readInBase.UpperBits << 1); readInColor.LowerBits = readInBase.LowerBits ^(readInBase.LowerBits << 1); //Note the most significant digit is useless return(readInColor); } // Don't encode the first base in the first bit, colors only inline CReadInBits bases2PureColors(CReadInBits readInBase) { CReadInBits readInColor; //Note bits are color transitions readInColor.UpperBits = readInBase.UpperBits ^(readInBase.UpperBits >> 1); readInColor.LowerBits = readInBase.LowerBits ^(readInBase.LowerBits >> 1); //Note the most significant digit is useless return(readInColor); } // reverse colors signals. (The reverse compliment of a read inline CReadInBits reversePureColors(CReadInBits readInPureColors, unsigned int colorsLength) { CReadInBits reverseColors; // Note the first bit (least significant) store the first bases of the read. The following bits are color transition unsigned int tailLength = (wordSize - colorsLength); reverseColors.UpperBits = (reverse64bits(readInPureColors.UpperBits) >> tailLength); reverseColors.LowerBits = (reverse64bits(readInPureColors.LowerBits) >> tailLength); //Note the most significant digit is useless return(reverseColors); } // Encode the first base in the first bit (least significant) inline void longBases2Colors(CReadInBits& readInBase1stHalf, CReadInBits& readInBase2ndHalf, CReadInBits& readInColor1stHalf, CReadInBits& readInColor2ndHalf, bool oddReadLength) { readInColor1stHalf = bases2Colors(readInBase1stHalf); readInColor2ndHalf = bases2Colors(readInBase2ndHalf); // TO set the transition color bit between the two part of the read // Get the last bit of first half // To get the first bit of the second half unsigned long long upperBit1, lowerBit1, upperBit2, lowerBit2, transitionUpperBit, transitionLowerBit; unsigned int shiftPos; if (oddReadLength) { shiftPos = (unsigned int)CReadInBits::iReadLength - 0x02; } else { shiftPos = (unsigned 
// Lookup table for decoding color space: base_transfer_matrix[b][c] is the
// base that follows base b (A=0, C=1, G=2, T=3) when the color is c.
const static char base_transfer_matrix[][4] = {
    { 'A', 'C', 'G', 'T' },
    { 'C', 'A', 'T', 'G' },
    { 'G', 'T', 'A', 'C' },
    { 'T', 'G', 'C', 'A' }
};

// Given a base and the color character ('0'..'3') of the next transition,
// return the next base. Returns 0 when base is not one of ACGT/acgt or when
// color is outside '0'..'3'. The color check is new: the character used to
// be turned into an array index unchecked, so anything else (for example
// '.') read outside the table.
inline char base2Color(char base, char color)
{
    if (color < '0' || color > '3') {
        return(0);
    }
    const int colorId = color - '0';
    switch (base) {
    case 'A':
    case 'a':
        return (base_transfer_matrix[0][colorId]);
    case 'C':
    case 'c':
        return (base_transfer_matrix[1][colorId]);
    case 'G':
    case 'g':
        return (base_transfer_matrix[2][colorId]);
    case 'T':
    case 't':
        return (base_transfer_matrix[3][colorId]);
    default:
        return(0);
    }
}
// Map an SNP flag character to its numeric code:
// 'C' -> 1, 'V' -> 2, 'T' -> 3, anything else -> 0.
inline int returnSNPtype(char c)
{
    if (c == 'C') {
        return(1);
    }
    if (c == 'V') {
        return(2);
    }
    if (c == 'T') {
        return(3);
    }
    return(0);
}
// Reverse a color read whose bit 0 holds the first base and whose remaining
// bits hold color transitions. The colors are reversed with
// reverseBitsSignals, then bit 0 of the result is overwritten with the bit
// at position iReadLength - 1 of the decoded bases, i.e. the last base of
// the original read becomes the leading base bit of the reversed read.
// NOTE(review): the exact semantics of reverseBitsSignals, isKthBitSet and
// setKthBit are not visible here - confirm against their definitions.
inline CReadInBits reverseColorRead(CReadInBits readInColors)
{
    // Leave one more bit to put the end base bit
    const int shiftBitsAfterReverse = wordSize - CReadInBits::iReadLength - 1;
    CReadInBits reverseColors = reverseBitsSignals(readInColors, shiftBitsAfterReverse);
    // The first base bit is not correct after the reverse; fix it using the
    // last base of the decoded read
    CReadInBits readInBases = colors2Bases(readInColors);
    bool lowerLastBaseBit = isKthBitSet(readInBases.LowerBits, CReadInBits::iReadLength - 1);
    bool upperLastBaseBit = isKthBitSet(readInBases.UpperBits, CReadInBits::iReadLength - 1);
    setKthBit(reverseColors.LowerBits, 0, lowerLastBaseBit);
    setKthBit(reverseColors.UpperBits, 0, upperLastBaseBit);
    return(reverseColors);
}
// Buffered reader over an ifstream: input is pulled into caBuffer and handed
// out one line at a time through Getline.
// NOTE(review): buffer refill and ownership rules live in the .cpp, which is
// not visible here - confirm before relying on them.
class FileInputBuffer
{
public:
    FileInputBuffer(void);
    FileInputBuffer(unsigned int uiCapacity, ifstream* pifile);
    ~FileInputBuffer(void);
    // Size of the buffer in characters
    unsigned int uiCapacity;
    unsigned int uiPtrIndex; // Index of the next char to read
    // Copy the next line into caArray (at most uiMax_Char_Per_Line chars);
    // returns the number of chars obtained
    unsigned int Getline(char* caArray, unsigned int uiMax_Char_Per_Line);
    void initialize(unsigned int uiCapacity, ifstream* pifile);
    void fflush(void);
    bool ready2Read(void);
private:
    char* caBuffer;
    char* caBufp;
    filebuf *pbuf;
};
// Output buffer that collects text in memory so it can be written to an
// ofstream in batches. The caller has to flush it before it overflows.
class FileOutputBuffer
{
    // Assumes uiCapacity is > 100 and each line is < 100 chars
public:
    FileOutputBuffer(unsigned int uiCapacity, ofstream* pofile);
    FileOutputBuffer(void);
    ~FileOutputBuffer(void);
    // Size of the buffer in characters
    unsigned int uiCapacity;
    // Number of characters currently buffered - presumably refreshed by
    // UpdateSize(); confirm in the .cpp
    unsigned int uiSize;
    void UpdateSize();
    void fflush();
    void removeEndBlankLine();
    char* caBuffer;
    char* caBufp;
    ofstream* pofile;
};
// A genome encoded in bits: two parallel word arrays (upper and lower bit of
// each base) plus a flag array, pNBits.
// NOTE(review): pNBits presumably marks 'N' positions - confirm in the .cpp.
class CGenomeInBits
{
public:
    CGenomeInBits(unsigned int uiGenomeSize = 0);
    CGenomeInBits(CGenomeNTdata* pGenome);
    ~CGenomeInBits(void);
    // Two arrays to store the chromosome string encoded with bits
    WORD_SIZE* pUpperBits;
    WORD_SIZE* pLowerBits;
    CboolFlagArray* pNBits;
    // Genome length in bases, and the number of words in each bit array
    unsigned int uiGenomeLength;
    unsigned int uiGenomeLengthInWordSize;
    CGenomeNTdata* pGenome; // Not owned: points to an outside object, don't delete it
    // Get the wordSize-long substring starting at uiGenomeIndex, encoded in bits
    CReadInBits getSubstringInBits(unsigned int uiGenomeIndex) const;
    // Same, but with the bits beyond uiSubstringLength cleared.
    // Note the length should be smaller than the word size
    CReadInBits getSubstringInBits(unsigned int uiGenomeIndex, unsigned int uiSubstringLength) const;
    // Call getSubstringInBits, transform the result to a DNA sequence and store it in caSubstring
    char* getSubstring(unsigned int uiGenomeIndex);
    // Same for a substring shorter than wordSize
    char* getSubstring(unsigned int uiGenomeIndex, unsigned int uiSubstringLength);
    // Result buffer for getSubstring
    char caSubstring[wordSize * 2 + 1];
    bool fragACGTKmerInBits(CReadInBits& kmerInBits, int startIndex, int kmerLength);
    int allocBitStrSpace(unsigned int uiGenomeSize = 0);
private:
    int initialization(unsigned int uiGenomeSize = 0);
    int encodeJunction(unsigned int ChrID);
};
CchromosomeNTdata* paChromosomes[GENOME_CAPACITY]; // delete the spaced used character string of each chromosome int freeChromosomeSpace(void); /* This function add one more chromosome in the genome set (Which is maintained by a pointer array) */ unsigned int addChromosome(const char* chromosomeFileName, bool bFastaFormat = true); // This function will store corresponds kmer in this->caKmer char* genomeLocusID2Kmer(unsigned int uiKmer_Length, unsigned int genomeLocusID); /* This function will covert the position described by number pair * (chromosome ID and local position locus ID) * to genome locus index recorded in the table list */ unsigned int chrIndex2genomelocusID(unsigned int iChrID, unsigned int iChrLocusID); unsigned int genomeIndex2chrID(unsigned iGenomeIndex); // This function will covert to genome locus index recorded in the table list // to the locus index recorded in the table list unsigned int genomeLocusID2chrIndex(unsigned int igenomeLocusID); // This table records the accumulated number of NT in the genome unsigned int IndexCovertTable[GENOME_CAPACITY]; void checkRefsNames(void); vector getRefNamesLengths(void); private: int initialization(); }; // For test purpose, exaxhustive search a kmer is in the genome unsigned int BruteForceSearch(CGenomeNTdata& genome, char* Kmer); ./Source/Genome_Index.h0000644011075700120610000000337411720654362015122 0ustar yanghochmath-ar#pragma once /* * This class is the base class of Genome_Index_Table. It contains the the CGenomeNTData and * the function pointers which generated the hashValue and SeedKey as index for each sliding windows */ #include "stdafx.h" #include "GenomeNTdata.h" #include "SeedPattern.h" #include "ReadInBits.h" #include "GenomeInBits.h" #include "seedOptions.h" const unsigned int MAX_MISMATCH_THRESHOLD = 10; class CGenome_Index { public: CGenome_Index(void); virtual ~CGenome_Index(void); // HashValue is to located the bucket. 
SeedKey is for binary sort in the bucket unsigned int getHashValue(char* slide_window) const; unsigned int getSeedKey(char* slide_window) const; unsigned int getHashValue(CReadInBits r) const; unsigned int getSeedKey(CReadInBits r) const; char caRefName[FILENAME_MAX]; // Name from the input List CGenomeNTdata* pgenomeNT; CGenomeInBits* pgenomeNTInBits; bool bEXTEND_SEED; // Number of digits (bases or colors) used for hashing int iHashDigits; // Number of digits (bases or colors) used for generating key for binary search int iKeyDigits; // Should be iHashDigits + iKeyDigits unsigned int uiSeedLength; // is the read length - uiSeedLength unsigned int uiNoOfShift; unsigned int NO_OF_BUCKET; protected: // Set function pointers to generate hashvalue and SeedKey (using seed pattern) int chooseHashFunction(unsigned int uiReadLength, unsigned int chosenSeedId); void setConsecutiveHashFunction(void); private: int initialization(void); int chooseSeedKeyFunction(unsigned int uiReadLength, unsigned int chosenSeedId); // fucntion pointers pointer unsigned int (* fpHashValue)(CReadInBits); unsigned int (* fpSeedKey)(CReadInBits, int); }; ./Source/Genome_Index_Table.h0000644011075700120610000001167411720654362016233 0ustar yanghochmath-ar#pragma once /* * This class is a base class of CGenome_Index_TableQ and * provides functions to construct the Genome Index Table from genome. * (1) Build index array to find the maskable repeats and marked with flags * (2) Build index array again and skipped all maskable repeats index. 
*/ #include "refInBinFile.h" #include "Genome_Index.h" #include "Index_Table.h" #include "ShortReadUtil.h" #include "ColorSpaceRead.h" #include #include using namespace std; // Use OpenMP is gcc version is later than 4.2 #ifdef __GNUC__ #ifdef __GNUC_PATCHLEVEL__ #define __GNUC_VERSION__ (__GNUC__ * 10000 \ + __GNUC_MINOR__ * 100 \ + __GNUC_PATCHLEVEL__) #else #define __GNUC_VERSION__ (__GNUC__ * 10000 \ + __GNUC_MINOR__ * 100) # endif #if __GNUC_VERSION__ >= 40200 #include #endif #else #ifdef _MSC_VER #if _MSC_VER >= 2000 #include #endif #endif #endif #define PRINT_MASKED_REPERAT_FLAG(table, genomeIndex)\ do {\ for(unsigned int j = 0; j <= table->uiNoOfShift; j++) {\ char masked = table->pbaRepeatMaskedFlag->b(genomeIndex + j) ? 'Y' : 'N';\ cout << masked << endl;\ }\ } while(0) class CGenome_Index_Table : public CIndex_Table, public CGenome_Index { public: CGenome_Index_Table(void); ~CGenome_Index_Table(void); int getGenomeNTdata(const char* genomeListfileName, string refFormat); bool make_index_table(unsigned int uiKmer_Length, unsigned int uiSeedId, bool bMapReadInColors, bool makedMathRepeats); bool read_index_table(const char* indexFilePath, bool bPrintErrMsg); bool save_index_table(const char* indexFilePath, bool bPrintErrMsg); // Compare two substring in the genome using their hashKey. Return true is I1.key < I2.key bool compareKey(CIndex_Type I1, CIndex_Type I2); // Compare two substring in the genome using the alphabetical order. Return true is I1.order < I2.order bool compareSubstring(CIndex_Type I1, CIndex_Type I2, unsigned int slidingWindows); // Return true if the two substring in the genome are the same. bool sameSubstring(CIndex_Type I1, CIndex_Type I2, unsigned int slidingWindows); // Flag array which mark the index skipped because of mathematical repeats. 
CboolFlagArray* pbaRepeatMaskedFlag; inline bool isMasked(unsigned int GenomeIndex) const { return(this->pbaMaskedFlag->b(GenomeIndex)); }; protected: string get_index_path(string index_path); // get default index path if the pass in is null. int chooseHashFunction(unsigned int uiReadLength, unsigned int \ fullSensitiveThreshold, bool bMapReadInColors); private: int num_of_repeat_patterns; int initialization(void); // check the sliding windows with N and mask them int check_masked_flags(void); ////// add repeat masked regions from files. Those regions will be skipped from the index table. int add_repeat_masked_flags(void); // Currently not used //// find mathematical repeats int find_maskable_mathmatical_repeats(unsigned int uiReadLength, unsigned int uiSubThreshold); // called by previous function that check the neighbor of index array int find_mathmatical_repeats(void); // called by previous function that check the neighbor of index array in a bucket. int find_mathmatical_repeats_in_a_bucket (ofstream& ofile, unsigned int BucketStart, unsigned int NextBucketStart); // Private function called Construct_IndexTable, which do the counting for each bucket - Step 1 int countBucketSize(void); int countBucketSize4Chr(int chrId, unsigned int kmer_length); int bucketCount2Index(unsigned int uiNonMaskedLoci); // Hash the Kmer to the HashIndexTable bucket, without sorting the dHashkey int hashKmer2Bucket(void); int hashKmer2Bucket4Chr(int chrId, unsigned int kmer_length); // Sort the index table. If substringLength == 0, sorted with pre-defined hash function. Else sort with corresponding substring. int sortTable(unsigned int substringLength); }; // functor to compare genome indices. 
If windowLength == 0, using the Seedkey, else compare 2 loci with the sliding windows class CcompareFunctor4Sort { public: static const unsigned int MAX_SUBSTRING_LENGTH = 64; CGenome_Index_Table* pGenomeIndexTable; unsigned int windowLength; CcompareFunctor4Sort(CGenome_Index_Table* pGenomeIndexTable, unsigned int windowLength) { this->pGenomeIndexTable = pGenomeIndexTable; this->windowLength = windowLength; if (windowLength >= MAX_SUBSTRING_LENGTH) { cout << "Warning!, Excess the limit!" << endl; } } bool operator()(CIndex_Type I1, CIndex_Type I2) { if (windowLength == 0) { return(pGenomeIndexTable->compareKey(I1, I2)); } else { return(pGenomeIndexTable->compareSubstring(I1, I2, windowLength)); } } }; bool testTable(CGenome_Index_Table& table); ./Source/Genome_Index_TableQ.h0000644011075700120610000001065211720654362016347 0ustar yanghochmath-ar#pragma once #include "Genome_Index_Table.h" #include "AlignmentsQ.h" #include "ColorSpaceRead.h" #include "ShortReadUtil.h" /* * This class has base class CGenome_Index_Table and add query function */ class CGenome_Index_TableQ : public CGenome_Index_Table { public: CGenome_Index_TableQ(void); ~CGenome_Index_TableQ(void); bool getSeqFromFasta(const char* genomeListfileName, string refFormat = ""); bool getSeqFromDS(CGenomeNTdata* pgenomeNT); bool getSeqFromIndex(char* indexFile); inline unsigned int extendAlignment(CAlignmentsQ& aQue, CReadInBits readInBases) const; // check the number of Diff between long read and ref inline unsigned int checkAlignment(unsigned int genomeStartIndex, CReadInBits& half1, CReadInBits& half2, bool oddReadLength) const; inline unsigned int checkColorAlignment(unsigned int alignStartGenomeIndex, CReadInBits& half1, CReadInBits& half2, bool oddReadLength) const; // Query a long read in bases (illumina) for hit and check uiDiff and put the result into the given Queue unsigned int queryLongReadBases(CReadInBits r1, CReadInBits r2, bool bOddReadLength, CAlignmentsQ& aQue, int queryRead, bool 
bClearQ, bool bForward) const; // Query a read in bases (illumina) for hit and check uiDiff and put the result into the given Queue unsigned int queryReadBases(CReadInBits readInBases, CAlignmentsQ& aQue, bool bClearQ, bool bForward) const; // Query a long read in colors (ABI) for hit and check uiDiff and put the result into the given Queue unsigned int queryLongReadColors(CReadInBits r1, CReadInBits r2, bool oddReadLength, CAlignmentsQ& aQue, int queryHalf, bool bClearQ, bool bForward) const; // Query a read in colors (SOLiD) for hit and check uiDiff and put the result into the given Queue // unsigned int queryReadColors(CReadInBits readInColors, CAlignmentsQ& aQue, bool bClearQ, bool bForward) const; unsigned int queryReadColors(CReadInBits readInColors, CAlignmentsQ& aQue, bool bClearQ, bool bForward, bool bDEBUG = false) const; // Given alignments in alignmentsQ, check reads can be also will aligned in the extended position bool bExcludeAmbiguous; // If ambiguous reads are exclude, queries can be accelerated. protected: // Query a read for hit and return the hit index range as pair. 
// (Input is pure base or pure color read's substring) pair queryKmer(CReadInBits window, unsigned int shift) const; private: int initialization(void); unsigned int getSeedKeyUpperBound(CReadInBits window, unsigned int shift) const; }; // Special functor used for lower_bound() class CcompareFunctor4LowerBound { public: const CGenome_Index_Table* pGenomeIndexTable; CcompareFunctor4LowerBound(const CGenome_Index_Table* pGenomeIndexTable) { this->pGenomeIndexTable = pGenomeIndexTable; } bool operator()(CIndex_Type I1, unsigned int uiQuerySeedKey) { unsigned int uiSeedKey; if (pGenomeIndexTable->bMapReadInColors) { CReadInBits ref = pGenomeIndexTable->pgenomeNTInBits->getSubstringInBits(I1 , pGenomeIndexTable->uiSeedLength + 1); CReadInBits refInColors = bases2PureColors(ref); uiSeedKey = this->pGenomeIndexTable->getSeedKey(refInColors); } else { CReadInBits ref = pGenomeIndexTable->pgenomeNTInBits->getSubstringInBits(I1 , pGenomeIndexTable->uiSeedLength); uiSeedKey = this->pGenomeIndexTable->getSeedKey(ref); } return(uiSeedKey < uiQuerySeedKey); } }; // Special functor used for upper_bound() class CcompareFunctor4UpperBound { public: const CGenome_Index_Table* pGenomeIndexTable; CcompareFunctor4UpperBound(const CGenome_Index_Table* pGenomeIndexTable) { this->pGenomeIndexTable = pGenomeIndexTable; } bool operator()(unsigned int uiQuerySeedKey, CIndex_Type I2) { unsigned int uiSeedKey; if (pGenomeIndexTable->bMapReadInColors) { CReadInBits ref = pGenomeIndexTable->pgenomeNTInBits->getSubstringInBits(I2 , pGenomeIndexTable->uiSeedLength + 1); CReadInBits refInColors = bases2PureColors(ref); uiSeedKey = this->pGenomeIndexTable->getSeedKey(refInColors); } else { CReadInBits ref = pGenomeIndexTable->pgenomeNTInBits->getSubstringInBits(I2, pGenomeIndexTable->uiSeedLength); uiSeedKey = this->pGenomeIndexTable->getSeedKey(ref); } return(uiQuerySeedKey < uiSeedKey); } }; ./Source/HashIndexT.h0000644011075700120610000000140411720654362014550 0ustar yanghochmath-ar#pragma once 
/***** * This class is a Index table for Hash Table. * When building the table, we hash * Before building the index, we will do a counting for the first run and set the index * When there is a record (Query), they will look up the index and bucket to a range and than do binary search */ class CHashIndexT { public: CHashIndexT(void); CHashIndexT(unsigned int uiBucketSize); ~CHashIndexT(void); unsigned int uiSize; unsigned int uiWindowSize;//The # of base we need to use as hash key unsigned int* aiIndexTable;//Just use a integer as index; int Counter2Index(void); static const unsigned int INDEX_BASES_LIMIT; private: unsigned int HashFunction(char* Kmer) const; int initialization(unsigned int BucketSize); }; ./Source/Index_Table.h0000644011075700120610000000300211720654362014723 0ustar yanghochmath-ar#pragma once /* * This function is a base clase for CGenome_Index_Table * It has an array of genome index, which is binned and sorted into buckets. * It has a CHashIndex, which is a hash index array pointing to the first reacord of each bin */ #include "HashIndexT.h" #include "boolFlagArray.h" #include "chdir.h" #include "stdafx.h" #include "seedOptions.h" #include #include using std::string; typedef unsigned int CIndex_Type; class CIndex_Table { public: CIndex_Table(void); ~CIndex_Table(void); void delete_index_table(void); // A index array represent the substring of a genome CIndex_Type* pIndexTable; // A hash index array pointing to the first reacord of each bin CHashIndexT* pHashIndexTable; // A bit array object indicating the neighborhood seed-pattern record are the same. CboolFlagArray* pbaMaskedFlag; // Flag array which mark the index as the representative copy of the mathmatical repeats. 
CboolFlagArray* pbaRepeatRepresentativeFlag; unsigned int size; // unsigned int capacity; unsigned int chosenSeedId; unsigned int uiSubDiffThreshold; unsigned int uiRead_Length; bool bMapReadInColors; protected: string indexFileName; bool save_Hash_Table(FILE* fp) const; bool read_Hash_Table(FILE* fp); private: int initialization(void); void printInfo(void) const; }; string default_index_path(string filePostfix, bool colorReads,\ unsigned int seedOption, unsigned int uiReadLength); ./Source/LongReadsSet.h0000644011075700120610000000225111720654362015104 0ustar yanghochmath-ar#pragma once #include "PairedReadsSet.h" #include "FileOutputBuffer.h" class CLongReadsSet : public CPairedReadsSet { public: CLongReadsSet(void); CLongReadsSet (const char* InputFile, const char* fileFormat,\ unsigned int expReadStrLineLength, unsigned int allowedNumOfNinRead, unsigned int readStartIndex); ~CLongReadsSet(void); unsigned int get_next_capacity_long_reads(); unsigned int get_next_capacity_long_reads(CLongReadsSet &set2); bool save_next_long_read(bool bSOLiDReadFormat, bool getQScores, bool in5to3cat3to5Format); // bool save_next_long_read(unsigned int fullReadLength, bool getQScores, bool in5to3cat3to5Format); inline bool save_next_long_Illumina_read(unsigned int fullReadLength, bool getQScores, bool in5to3cat3to5Format); inline bool save_next_long_SOLiD_read(unsigned int fullReadLength, bool getQScores); int size(); void setBadReadOutputFile(FileOutputBuffer* pOut); unsigned int longReadLength; friend int get_next_capacity_long_paired_reads(CLongReadsSet &set1, CLongReadsSet &set2); }; int get_next_capacity_long_paired_reads(CLongReadsSet &set1, CLongReadsSet &set2);./Source/MappingResult.h0000644011075700120610000002444411720654362015354 0ustar yanghochmath-ar#pragma once #include "stdafx.h" #include "FileOutputBuffer.h" #include "AlignmentsQ.h" #include "ShortReadUtil.h" #include "ColorSpaceRead.h" // TODO: Move that so Mapping Result is undependent to that // class conatins 
the result of a mapping const int READ_BUFFER_SIZE = MAX_READ_LENGTH * 2 + 1; const int SCORES_BUFFER_SIZE = MAX_READ_LENGTH * 10 + 1; class CMappingResult { public: CMappingResult(void); CMappingResult(CAlignmentsQ& que, unsigned int uiReadLength); ~CMappingResult(void); unsigned int uiReadLength; char QNAME[FILENAME_MAX]; char RNAME[FILENAME_MAX]; char caRef[READ_BUFFER_SIZE]; char caRead[READ_BUFFER_SIZE]; char rawScores[READ_BUFFER_SIZE]; // for calculate char QScores[SCORES_BUFFER_SIZE]; // for print unsigned int uiGlobalMappedPos, uiRefId, uiPOS; unsigned int uiDiff; int mismatchScore; //Sum of quality score of mismatches bases, if quality score are available. char strand; unsigned int MultipleMappedNo; char SNPtype; // If not SOLiD read, print a space. If SOLiD read, the SNP type indicated by N, C, V, T // Attribute only for SAM inline void getReverseReadandQual(void); inline void setSingleEndSamFields(void); inline void setSingleEndSamFlags(void); inline void setPairedEndSamFields(void); inline void setPairedEndSamFlags(bool firstRead, char mateStrand); // For SAM format, which prints read in ref direction char revComRead[READ_BUFFER_SIZE]; char revQScores[SCORES_BUFFER_SIZE]; int FLAG; int MAPQ; int ISIZE; unsigned int uiMPOS; char CIGAR[FILENAME_MAX]; char MRNM[FILENAME_MAX]; char TAG[FILENAME_MAX]; bool isColorRead; private: inline void initialization(void); }; inline void CMappingResult::getReverseReadandQual(void) { // if (strand == '-') { strcpy(revComRead, caRead); strcpy(revQScores, QScores); reverseComplementKmer(revComRead); reverseKmer(revQScores); } } inline void CMappingResult::setSingleEndSamFlags(void) { this->FLAG = 0; if (this->strand == '-') { this->FLAG += 0x10; } } inline void CMappingResult::setPairedEndSamFields(void) { // Assume flag, uiMPOS, and ISIZE are set outside this->MAPQ = 255; this->MRNM[0] = '='; this->MRNM[1] = '\0'; sprintf(this->CIGAR, "%uM", this->uiReadLength); if (this->TAG[0] != '\0' && this->TAG[0] != 'N') { 
sprintf(this->TAG, "NM:i:%d\t%s", this->uiDiff, string(this->TAG).c_str()); } else { sprintf(this->TAG, "NM:i:%d", this->uiDiff); } } inline void CMappingResult::setSingleEndSamFields(void) { this->setSingleEndSamFlags(); // Single strang + -, 64, 80 this->MAPQ = 255; this->MRNM[0] = '*'; this->MRNM[1] = '\0'; this->uiMPOS = 0; this->ISIZE = 0; sprintf(this->CIGAR, "%uM", this->uiReadLength); sprintf(this->TAG, "NM:i:%d", this->uiDiff); } inline int setFlag(int flags, int digit, bool value) { int flag = (0x01 << digit); if (value) { return( flags | flag); } else { return( flags & (~flag)); } } inline string getSamRG(const char* readsName, bool bSOLiD) { char RG[MAX_LINE]; const char* platform = bSOLiD ? "SOLiD" : "Illumina"; sprintf(RG, "@RG\tID:%s\tSM:%s\tPU:%s ", readsName, readsName, platform); return(string(RG)); } inline void setSamFlags(CMappingResult &m1, CMappingResult &m2, int m1No, int m2No, bool bInRange) { // potential error if one of the read is not mapped however the strand flag is still set int flag = 1 + 0x02*(1-(int)bInRange); m1.FLAG = flag + 0x04*((int)(m1No == 0)) + 0x08*((int)(m2No == 0)) + 0x40; m1.FLAG += (int)(m1.strand == '-') * 0x10 + (int)(m2.strand == '-') * 0x20; m2.FLAG = flag + 0x04*((int)(m2No == 0)) + 0x08*((int)(m1No == 0)) + 0x80; m2.FLAG += (int)(m2.strand == '-') * 0x10 + (int)(m1.strand == '-') * 0x20; } inline void resetStrandSamFlag(CMappingResult &m) { const int strandQDigit = 4; m.FLAG = setFlag(m.FLAG, strandQDigit, m.strand == '-'); } inline void resetStrandSamFlags(CMappingResult &m1, CMappingResult &m2) { const int strandQDigit = 4; const int strandMDigit = 5; m1.FLAG = setFlag(m1.FLAG, strandQDigit, m1.strand == '-'); m2.FLAG = setFlag(m2.FLAG, strandQDigit, m2.strand == '-'); m1.FLAG = setFlag(m1.FLAG, strandMDigit, m2.strand == '-'); m2.FLAG = setFlag(m2.FLAG, strandMDigit, m1.strand == '-'); } inline void setSamFlags4OnlyOneEndMapped(CMappingResult &m, bool firstRead) { int flag = 0x01; m.FLAG = flag + 0x08 + 
(int)(m.strand == '-') * 0x40; if (firstRead) { m.FLAG += 0x40; } else { m.FLAG += 0x80; } } inline void printAMappingInSam(FileOutputBuffer* AlignResult, CMappingResult& m) { char* SEQ = (m.strand == '+') ? m.caRead : m.revComRead; char* QUAL = (m.strand == '+') ? m.QScores : m.revQScores; resetStrandSamFlag(m); // fake quality score if (QUAL[0] == '\0') { memset(QUAL, 'I', sizeof(char) * m.uiReadLength); } QUAL[m.uiReadLength] = '\0'; unsigned int uiMPOS = (m.MRNM[0] == '*' ? 0 : m.uiMPOS + 1); sprintf(AlignResult->caBufp, "%s\t%d\t%s\t%u\t%d\t%s\t%s\t%u\t%d\t%s\t%s\t%s\n",\ m.QNAME, m.FLAG, m.RNAME, m.uiPOS + 1, m.MAPQ, m.CIGAR,\ m.MRNM, uiMPOS, m.ISIZE, SEQ, QUAL, m.TAG); AlignResult->UpdateSize(); } // TODO the quality score is not handled inline void printAMappingInFastq(FileOutputBuffer* AlignResult, CMappingResult& m) { char readInColor[FILENAME_MAX]; char colorReadQ[FILENAME_MAX]; // fake quality score if (m.QScores[0] == '\0') { if (m.isColorRead) { memset(colorReadQ, '!', sizeof(char) * m.uiReadLength); } else { memset(m.QScores, 'I', sizeof(char) * m.uiReadLength); } } // TODO: assign m.isColorRead when maaping // TODO: if it is color read, make color read output const char* SEQ; const char* QUAL; if (m.isColorRead) { // TODO: Move that so Mapping Result is undependent to that basesStr2ColorStr(m.caRead, readInColor); SEQ = readInColor; QUAL = colorReadQ; } else { SEQ = m.caRead; QUAL = m.QScores; } m.QScores[m.uiReadLength] = '\0'; sprintf(AlignResult->caBufp, "@%s\n%s\n+\n%s\n", m.QNAME, SEQ, QUAL); AlignResult->UpdateSize(); } // Assume both end are mapped to a same reference inline bool printAPairedMappingInSam\ (FileOutputBuffer* AlignResult, CMappingResult &m1, CMappingResult &m2) { const bool bMapped = true; setSamFlags(m1, m2, (int)(bMapped), (int)(bMapped), bMapped); m1.setPairedEndSamFields(); m2.setPairedEndSamFields(); m1.uiMPOS = m2.uiPOS; m2.uiMPOS = m1.uiPOS; strcpy(m2.QNAME, m1.QNAME); // sam format needs a same query name for a pair. 
printAMappingInSam(AlignResult, m1); printAMappingInSam(AlignResult, m2); return(true); } inline void printAMappingInPerM(FileOutputBuffer* AlignResult, CMappingResult& m, bool bPrintNM) { bool bPrintNoOfMis = (bPrintNM || m.QScores[0] == '\0'); int mis = (bPrintNoOfMis ? (int)m.uiDiff : m.mismatchScore); if (m.QScores[0] == '\0') { sprintf(AlignResult->caBufp, "%s\t%s\t%s\t%d\t%s\t%c\t%d\t%u\t%s\n",\ m.QNAME, m.caRead, m.RNAME, m.uiPOS, m.caRef,\ m.strand, mis, m.MultipleMappedNo,"*"); } else { sprintf(AlignResult->caBufp, "%s\t%s\t%s\t%d\t%s\t%c\t%d\t%u\t%s\n",\ m.QNAME, m.caRead, m.RNAME, m.uiPOS, m.caRef,\ m.strand, mis, m.MultipleMappedNo, m.QScores); } AlignResult->UpdateSize(); } inline string getCategory(char strand1, char strand2, int ISIZE, int disLB, int disUB) { char category[4]; category[3] = '\0'; category[0] = (strand1 == strand2) ? 'A' : 'B'; category[1] = (strand1 == '-') ? 'B' : 'A'; if (ISIZE > disUB) { category[2] = 'B'; } else if (ISIZE < disLB) { category[2] = 'C'; } else { category[2] = 'A'; } return(string(category)); } inline bool hasQScores(CMappingResult &m) { // TODO Has a better judgement return(m.QScores[0] != '\0'); } // inline bool printAPairedMappingInPerM\ (FileOutputBuffer* AlignResult, CMappingResult &m1, CMappingResult &m2, int mappedPairedNo, bool bPrintNM = false) { bool bPrintNoOfMis = (bPrintNM || !hasQScores(m1) || !hasQScores(m2)); int mis1 = (bPrintNoOfMis ? (int)m1.uiDiff : m1.mismatchScore); int mis2 = (bPrintNoOfMis ? 
(int)m2.uiDiff : m2.mismatchScore); int totalMis = mis1 + mis2; if(m1.caRef[0] == '\0' || m2.caRef[0] == '\0') { if(m1.QScores == '\0' || m2.QScores == '\0') { sprintf(AlignResult->caBufp, "%s\t%s\t%s\t%d\t%d\t%d\t%s\t%s\t%u\t%u\t%c\t%c\t%d\n", m1.QNAME, m1.caRead, m2.caRead, mis1, mis2, totalMis, m1.RNAME, m2.RNAME, m1.uiPOS, m2.uiPOS, m1.strand, m2.strand, mappedPairedNo); } else { sprintf(AlignResult->caBufp, "%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%s\t%s\t%u\t%u\t%c\t%c\t%d\n", m1.QNAME, m1.caRead, m2.caRead, m1.QScores, m2.QScores, mis1, mis2, totalMis, m1.RNAME, m2.RNAME, m1.uiPOS, m2.uiPOS, m1.strand, m2.strand, mappedPairedNo); } } else { if(m1.QScores == '\0' || m2.QScores == '\0') { sprintf(AlignResult->caBufp, "%s\t%s\t%s\t%d\t%d\t%d\t%s\t%s\t%u\t%u\t%c\t%c\t%d\t%s\t%s\n", m1.QNAME, m1.caRead, m2.caRead, mis1, mis2, totalMis, m1.RNAME, m2.RNAME, m1.uiPOS, m2.uiPOS, m1.strand, m2.strand, mappedPairedNo, m1.caRef, m2.caRef); } else { sprintf(AlignResult->caBufp, "%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%s\t%s\t%u\t%u\t%c\t%c\t%d\t%s\t%s\n", m1.QNAME, m1.caRead, m2.caRead, m1.QScores, m2.QScores, mis1, mis2, totalMis, m1.RNAME, m2.RNAME, m1.uiPOS, m2.uiPOS, m1.strand, m2.strand, mappedPairedNo, m1.caRef, m2.caRef); } } AlignResult->UpdateSize(); return(true); } inline void printAMappingInGff(FileOutputBuffer* AlignResult, CMappingResult& m,\ int score, string& rTag) { sprintf(AlignResult->caBufp, "%s\tsolid\tread\t%d\t%d\t%d\t%c\t.\tg=%s;q=%s;r=%s", m.QNAME, m.uiPOS + 1, m.uiPOS + m.uiReadLength, score, m.strand,\ m.caRead, m.QScores, rTag.c_str()); AlignResult->UpdateSize(); } ./Source/MismatchScores.h0000644011075700120610000000336311720654362015503 0ustar yanghochmath-ar#pragma once /* * A class used to do statistics in CReadInBitsSet for read mapping */ #include #include #include #include using namespace std; class CMismatchScores { public: CMismatchScores(void); CMismatchScores(unsigned int uiNoOfReads); ~CMismatchScores(void); int printArray(char* filename); int 
doStatistics(unsigned int uiNoOfReads, char* dataSet, int numOfSNP); // switch the function pointers to (1)Update records (2) Check if scores is the best alignment void switchUpdatesAndIsBest(bool firstRun, bool SOLiDRecord); // Function pointer point to the int (CMismatchScores::*update)(unsigned int, int); int callUpdate(unsigned int readId , int score); int normalUpdate(unsigned int readId , int score); int solidUpdate(unsigned int readId , int score); // special update for solid read mismatch score int dummyUpdate(unsigned int readId , int score); unsigned int uiNoOfReads; // return if an alignment is the best in record bool (CMismatchScores::*isBest)(unsigned int, int); bool callIsBest(unsigned int readId , int score); bool isBestInRecords(unsigned int readId , int score); bool isBestInRecords4Solid(unsigned int readId , int score); // special record lookup for solid read mismatch score bool dummyBest(unsigned int readId , int score); // The array that record the best mapping of each reads. short* mismatchScore; // The array that record the number of identical best mapping of each reads. 
unsigned char* noOfBestMappings; private: inline unsigned char addCounter(int readId); inline bool compareRecord4Solid(unsigned int readId , int score); // special record lookup for solid read mismatch score }; ./Source/PairedReadsMapping.h0000644011075700120610000001041611720654362016253 0ustar yanghochmath-ar#pragma once #include "ReadsMapping.h" #include "MappingResult.h" #include "LongReadsSet.h" class CBestPairedMapping { public: CBestPairedMapping(void); ~CBestPairedMapping(void); CMappingResult bm1, bm2; int bestMappingNo; int validMappingNo; unsigned int minDiff; inline void update(CMappingResult &m1, CMappingResult &m2, bool excludeAmbigousRead); }; class CPairedReadsMapping : public CReadsMapping { public: CPairedReadsMapping(void); CPairedReadsMapping(const MappingOpts P); ~CPairedReadsMapping(void); //int mapPairedReadsInASingleFile(CPairedReadsSet& readSet, CGenome_Index_TableQ& table); int mapPairedReadsInPairedFiles(CReadInBitsSet& readSet1, CReadInBitsSet& readSet2, CGenome_Index_TableQ& table); int mapPairedReads(CReadInBitsSet& readSet1, CReadInBitsSet& readSet2, CGenome_Index_TableQ& table); int mapPairedLongReadsInBases(CLongReadsSet& longReadSet1, CLongReadsSet& longReadSet2, const CGenome_Index_TableQ& table); int dealMappedSingleRead(const CGenome_Index_TableQ& table, CAlignmentsQ & Que, CMappingResult &m, bool bFirstEnd); int dealMappedPairedReads(CGenome_Index_TableQ& table); int dealMappedLongPairedRead(CAlignmentsQ& q1, CAlignmentsQ& q2, CMappingResult& m1, CMappingResult& m2, const CGenome_Index_TableQ& table); // F3read and R3read are index in AlignmentsQ void dealNoMapping(const CGenome_Index_TableQ& table, CMappingResult& m1, CMappingResult& m2); int dealBestMapping(const CGenome_Index_TableQ& table, CBestPairedMapping& bestMP, CMappingResult& m1, CMappingResult& m2); int printValidMappedPair(const CGenome_Index_TableQ& table, CMappingResult& m1, CMappingResult& m2, int validMappedPairNo); int printBestMappedPair(const 
CGenome_Index_TableQ& table, CMappingResult& m1, CMappingResult& m2, int minMismatchNo, int bestMappedPairNo); inline void getPairedRInfo(const CGenome_Index_TableQ& table, CMappingResult &m1, CMappingResult &m2, bool samFormat); void printAMappedPair(const CGenome_Index_TableQ& table, CMappingResult &m1, CMappingResult &m2, int noPairedLoc); void printMappedPairStats(ostream& out, CReadInBitsSet& readSet, unsigned int uiSubThreshold); protected: void initialization(void); inline void bookNoMappedKeepPairs(bool sepMore, bool sepLess, bool pairedOnExpStrand); inline void bookKeepMappedPairs(CBestPairedMapping& bestMP); unsigned int getPairedReadSetSize (CReadInBitsSet& setA1, CReadInBitsSet& setA2, CReadInBitsSet& setB1, CReadInBitsSet& setB2); int noOfPairsInRange; int noOfPairsSepMore; int noOfPairsSepLess; int noOfPairsSepMoreAndLess; int noOfSingle1stEndMapped; int noOfSingle2ndEndMapped; int noOfAmbiguousPairs; int noOfExpMappedPairedStrand; }; inline void CPairedReadsMapping::bookNoMappedKeepPairs(bool sepMore, bool sepLess, bool pairedOnExpStrand) { if (sepMore && sepLess) { noOfPairsSepMoreAndLess++; } else if (sepMore) { noOfPairsSepMore++; } else if (sepLess) { noOfPairsSepLess++; } if (pairedOnExpStrand) { this->noOfExpMappedPairedStrand++; } } inline void CPairedReadsMapping::bookKeepMappedPairs(CBestPairedMapping& bestMP) { this->noOfPairsInRange++; this->iMapCount++; this->iMapDiffCount[bestMP.minDiff]++; bool stricklyExcludeAmbiguous = opt.bExcludeAmbiguousPaired && opt.bGetAllAlignments; // -A -e bool bAmbiguous = (stricklyExcludeAmbiguous && bestMP.validMappingNo > 1) || bestMP.bestMappingNo > 1; if (bAmbiguous) { this->iMultiMappedReads++; } } inline void getSingleMappingInfo(CMappingResult &m, CAlignmentsQ &q, int mappingIndex); // Map mated paired reads parallelly int parallelMappingPairedReads(vector& readSetsList1, vector& readSetsList2, CGenome_Index_TableQ& indexTable, MappingOpts P); // For the paired read set in a file with 5'-3' 
concatenated with 3'-5'. int parallelMappingPairedReads(vector& readSetsList, CGenome_Index_TableQ& indexTable, MappingOpts P); int parallelMappingPairedLongReads(vector& readSetsList1, vector& readSetsList2,\ CGenome_Index_TableQ& indexTable, MappingOpts P); ./Source/PairedReadsSet.h0000644011075700120610000000376511720654362015424 0ustar yanghochmath-ar#pragma once #ifndef CPAIRED_READ_SET_H_ #define CPAIRED_READ_SET_H_ #include "FileOutputBuffer.h" #include "ReadsFileParser.h" #include "ReadInBitsSet.h" #include "ReadInBits.h" #include "ColorSpaceRead.h" #include "ShortReadUtil.h" #include "MismatchScores.h" #include "ReadsQualScores.h" #include #include #include using namespace std; class CPairedReadsSet { public: CPairedReadsSet(void); CPairedReadsSet(unsigned int Capacity, unsigned int uiReadLength); // Generate set given a file with reads and readLength (must be known in advance) CPairedReadsSet(const char* InputFile, const char* fileFormat,\ unsigned int uiReadLength, bool in5to3cat3to5Format, unsigned int allowedNumOfNinRead, unsigned int readStartIndex = 0); virtual ~CPairedReadsSet(); void ignoreQScores(void); // unsigned int get_next_capacity_reads_pairs_from_single_file(void); CReadInBitsSet* F_Reads; CReadInBitsSet* R_Reads; unsigned int allowedNumOfNinRead; unsigned int uiRead_Length; unsigned int uiNo_of_Reads; unsigned int uiNo_of_Bad_Reads; bool in5to3cat3to5Format; // A char flag used to record the input file type, example F for fasta, S for .seq.txt, A for .realign char cFileType; char InputFile[FILENAME_MAX]; // Keep the info of the quality score CReadsQualScores* pQualScoresF; CReadsQualScores* pQualScoresR; protected: // get reads from the file and store (append) in a vector. Return how many reads are read-in. 
unsigned int openAFileReady2GetRead(const char* InputFile, const char* fileFormat, unsigned int uiExpReadsStrLength, bool bDiscardReadsWN = false); void save_next_read_id(const char* tagLine, char sep = ','); CReadsFileParser parser; void clearReads(void); void removeExtraTags(void); void getQualityScoresFromQUAL(void); void handleBadRead(void); unsigned int getExpReadLength(unsigned int getExpReadLength); private: int initialization(void); }; #endif ./Source/ParameterList.h0000644011075700120610000001150211720654362015325 0ustar yanghochmath-ar#ifndef INCLUDED_ParameterList #define INCLUDED_ParameterList /* * This parameter file is modeifed from SOCS system. */ #include "ParseReadsOpts.h" #include "Flags.h" #include "ShortReadUtil.h" #include "ReadsFileParser.h" #include "stdafx.h" #include #include #include #include #include #include #include #include // Use OpenMP is gcc version is later than 4.2 #ifdef __GNUC__ #ifdef __GNUC_PATCHLEVEL__ #define __GNUC_VERSION__ (__GNUC__ * 10000 \ + __GNUC_MINOR__ * 100 \ + __GNUC_PATCHLEVEL__) #else #define __GNUC_VERSION__ (__GNUC__ * 10000 \ + __GNUC_MINOR__ * 100) # endif #if __GNUC_VERSION__ >= 40200 #include #endif #else #ifdef _MSC_VER #if _MSC_VER >= 2000 #include #endif #endif #endif using namespace std; class MappingOpts : public CParseReadsOpts { public: MappingOpts(void); virtual ~MappingOpts(void); void setDefaults(void); void clearOutputFileName(bool clear = true); unsigned int readLength; unsigned int anchorLength; string fullCommand; // I/O options bool bPrintSamHeader; // Default is true bool bIgnoreQS; // Default is false bool bPrintNM; // Default is false bool bPrintAlignments; // Default is false bool bPrintAmbigReadsSeparately; // Default is false bool bPrintUnMappedReads; // Default is false bool bExcludeAmbiguousReads; // Default is true bool bPrintAmbiguousReadsOnly; // Default is false bool bPrintBadReads; // Default is false bool bPrintFirstAlignmentOnly; // Default is false bool bGetAllAlignments; // 
Default is false bool bMap2ForwardStrandOnly; // Default is false bool bMap2ReverseStrandOnly; // Default is false bool bPrintAmbigReadsInOneLine; // Default is false char logFileN[FILENAME_MAX]; char outputDir[FILENAME_MAX]; char outputFileN[FILENAME_MAX]; char outputFormat[FILENAME_MAX]; char badReadFileN[FILENAME_MAX]; char ambiguousReadFileN[FILENAME_MAX]; char unmappedFileN[FILENAME_MAX]; char readsFileFormat[FILENAME_MAX]; int ambiguousDiffThreshold; int mismatchScoreThreshold; int maxAlignPerRead; int subDiffThreshold; bool bExcludeAmbiguousPaired; bool bPrintBestPaired; bool bPrintRef4PairedInMapping; bool bPrintPairedRQ; // The default is output all combinations paired end mappings bool frOnly; // Paired end can only align to different strand. bool ffOnly; // Paired end can only align to the same strand. int disLB; // distance lower bound int disUB; // distance upper bound unsigned int truncatedReadPrefix; char readtag_delimiter; unsigned int maxThreadNum; // for OpenMp }; class ParameterList : public MappingOpts { public: ParameterList(void) ; void setDefaults(void); bool checkRefValidity(void); bool truncatReadLength(void); void getOptsByCheckingExtName(void); void printSetting(void); bool validFlag; // Basic Input char refFile[FILENAME_MAX]; char indexFileN[FILENAME_MAX]; char seedName[FILENAME_MAX]; string refFormat; // index, fasta, list int seedId; // Index bool bMakeIndex; // Default is false bool bSaveIndex; // Default is false // For Pairend Read bool bMatePairedReads; char matePairFileN1[FILENAME_MAX]; char matePairFileN2[FILENAME_MAX]; // others bool bMaskedMathRepeat; }; bool printOptWarning4PairedEndOpts(ParameterList &P); bool printOptWarning4PairedEndOpts(ParameterList &P); ParameterList getParameterList(int argc, const char** argv); bool retriveReadSetsAndSettings(ParameterList& P, \ vector& readSetsList1,\ vector& readSetsList2); bool checkReadsSetNamesValidity(vector& readSetsList1,\ vector& readSetsList2); unsigned int 
// Overwrite str1 with str2 when str2 carries a non-empty string.
//   str1 - destination buffer; the caller must guarantee it can hold
//          strlen(str2) + 1 bytes (no length is available here to check).
//   str2 - source C string.
// Returns true when str1 was overwritten, false when nothing was copied
// (str2 is empty, or either pointer is NULL).
inline bool setStr(char* str1, const char* str2)
{
    // A NULL pointer is treated like "no value supplied" instead of being
    // dereferenced.
    if (str1 == NULL || str2 == NULL || str2[0] == '\0') {
        return(false);
    }
    strcpy(str1, str2);
    return(true);
}
* For a 64 bit machine, it can has only 64 bases */ #include "bitsOperationUtil.h" #include using namespace std; class CReadInBits { public: CReadInBits(void); CReadInBits(const char* caRead); CReadInBits(const char* caRead, int readlength); ~CReadInBits(void); WORD_SIZE UpperBits; WORD_SIZE LowerBits; static int iReadLength; unsigned int encode(const char* caRead); unsigned int encode(const char* caRead, int readlength); unsigned int encodeRead_NasA(const char* caRead, int readlength); char* decode(char* caRead); int* decode(int* iaRead) const; // return 0, 1, 2, 3 instead of A, C, G, T in the array // Return 0 for A, 1 for C, 2 for G, 3 for T in that position inline WORD_SIZE decode(int basePosition); inline CReadInBits getSuffixStr(unsigned int shift) const; inline CReadInBits getPrefixStr(unsigned int length) const; bool operator==(const CReadInBits &other) const; bool operator<(const CReadInBits &other) const; const static unsigned int MAX_READ_LENGTH_IN_BITS = 63; }; // Return 0 for A, 1 for C, 2 for G, 3 for T in that position inline WORD_SIZE CReadInBits::decode(int basePosition) { if (basePosition < CReadInBits::iReadLength) { WORD_SIZE upperbit = (this->UpperBits >> basePosition) & 0x01; WORD_SIZE lowerbit = (this->LowerBits >> basePosition) & 0x01; return((upperbit << 0x01) + lowerbit); } else { return 5; } } inline CReadInBits CReadInBits::getSuffixStr(unsigned int shift) const { CReadInBits r; r.UpperBits = this->UpperBits >> shift; r.LowerBits = this->LowerBits >> shift; return (r); } inline CReadInBits CReadInBits::getPrefixStr(unsigned int length) const { CReadInBits r; unsigned int maskedSuffixLength = (wordSize - length); r.UpperBits = this->UpperBits << maskedSuffixLength; r.UpperBits >>= maskedSuffixLength; r.LowerBits = this->LowerBits << maskedSuffixLength;; r.LowerBits >>= maskedSuffixLength; return (r); } inline WORD_SIZE getDiffBits (CReadInBits A, CReadInBits B) { WORD_SIZE upperBitsDiff = A.UpperBits ^ B.UpperBits; WORD_SIZE lowerBitsDiff 
= A.LowerBits ^ B.LowerBits; return(upperBitsDiff | lowerBitsDiff); } // reverse complement the two encoded read void reverseCompliment(unsigned int uiReadLength, WORD_SIZE* UpperBits, WORD_SIZE* LowerBits); CReadInBits reverseCompliment(unsigned int uiReadLength, CReadInBits r); // Calculate the difference of bits // unsigned int bitsStrCompare(WORD_SIZE UpperBits1, WORD_SIZE LowerBits1, WORD_SIZE UpperBits2, WORD_SIZE LowerBits2); unsigned int bitsStrCompare(CReadInBits r1, CReadInBits r2); // compare the last N bits only // unsigned int bitsStrNCompare(WORD_SIZE UpperBits1, WORD_SIZE LowerBits1, WORD_SIZE UpperBits2, WORD_SIZE LowerBits2); // count the diff of the first N base pairs. unsigned int bitsStrNCompare(CReadInBits r1, CReadInBits r2, unsigned int N); // skip the first M base pair and count the diff of the following N base pair unsigned int bitsStrMNCompare(CReadInBits r1, CReadInBits r2, unsigned int M, unsigned int N); inline CReadInBits reverseBitsSignals(CReadInBits& readInBits, int extraBitsNo) { // int extraBitsNo = wordSize - bitStrNo - 1; CReadInBits reverseBits = readInBits; reverseBits.LowerBits = reverse64bits(reverseBits.LowerBits); reverseBits.UpperBits = reverse64bits(reverseBits.UpperBits); reverseBits.LowerBits >>= extraBitsNo; reverseBits.UpperBits >>= extraBitsNo; return(reverseBits); } inline int printBitsStrCompare(CReadInBits exp, CReadInBits actual, const char* msg) { char caExp[wordSize + 1]; char caActual[wordSize + 1]; exp.decode(caExp); actual.decode(caActual); for (int i = 0; i < CReadInBits::iReadLength; i++) { if (caExp[i] != caActual[i]) { caActual[i] = (char)tolower(caActual[i]); } } cout << msg << endl; cout << caExp << endl; cout << caActual << endl; return(0); } inline int printBitsStr(CReadInBits strInBits, int length) { char caStr[wordSize + 1]; strInBits.decode(caStr); caStr[length] = '\0'; printf("%s\n", caStr); return(0); } inline int printBitsStr(WORD_SIZE strInBits, int length) { char caStr[wordSize + 1]; for 
(int i = 0; i < length; i++) { caStr[i] = (strInBits & 0x01) ? '1': '0'; strInBits >>= 0x01; } caStr[length] = '\0'; printf("%s\n", caStr); return(0); } inline WORD_SIZE SHIFT_LEFT(WORD_SIZE strInBits, int digit) { if (digit >= 32) { strInBits <<= 31; return(strInBits << (digit - 31)); } else { return(strInBits << digit); } } unsigned int encodeRead(const char* caRead, int iReadLength, WORD_SIZE* encodUpperBits, WORD_SIZE* encodedLowerBits); unsigned int encodeReadNasA(const char* caRead, int uiReadLength, WORD_SIZE* encodUpperBits, WORD_SIZE* encodedLowerBits); unsigned int encodeLongRead(const char* read, CReadInBits& firstHalf, CReadInBits& secondHalf); unsigned int decodeRead(char* caRead, int iReadLength, WORD_SIZE UpperBits, WORD_SIZE LowerBits); unsigned int decodeRead(int* iaRead, int iReadLength, WORD_SIZE UpperBits, WORD_SIZE LowerBits); unsigned int decodeLongRead(CReadInBits& firstHalf, CReadInBits& secondHalf, char* read, bool oddReadLength);./Source/ReadInBitsSet.h0000644011075700120610000000543611720654362015222 0ustar yanghochmath-ar#pragma once #ifndef READINBITSET_H_ #define READINBITSET_H_ #include "ReadsFileParser.h" #include "ReadInBits.h" #include "ColorSpaceRead.h" #include "ShortReadUtil.h" #include "MismatchScores.h" #include "ReadsQualScores.h" #include "Filename.h" #include "stdafx.h" #include #include #include using namespace std; // each time read in one million reads as buffering const unsigned int BUFFERED_READS_SIZE = 1000000; // const unsigned int BUFFERED_READS_SIZE = 1000; class CReadInBitsSet { public: CReadInBitsSet(void); CReadInBitsSet(unsigned int Capacity, unsigned int uiReadLength, unsigned int allowedNumOfNinRead = 0); // Generate set given a file with reads and readLength (must be known in advance) CReadInBitsSet(const char* InputFile, const char* fileFormat, \ unsigned int uiReadStartIndex, unsigned int uiReadLength, unsigned int allowedNumOfNinRead); virtual ~CReadInBitsSet(); int clear(int capacity = 0); // get reads 
from the file and store (append) in a vector. Return how many reads are read-in. unsigned int openAFileReady2GetRead(const char* InputFile, const char* fileFormat, unsigned int uiReadStartIndex); unsigned int openAFileReady2GetReadQSinQUAL(const char* InputFile, unsigned int readQsLength); unsigned int get_next_capacity_reads(int capacity, char sep = ','); void ignoreQScores(void); void get_read_id(int no, char* readId); void save_next_read_id(const char* tagLine, char sep = ','); bool save_next_read(const char* readSeq, bool bSOLiDReadFormat); inline const char* getQScoresPtr(int readId); vector* pReadsSet; vector* pReadsID; // The vector keep tags of the reads bool bDiscardReadWithN; unsigned int allowedNumOfNinRead; unsigned int uiRead_Length; unsigned int uiNo_of_Reads; unsigned int uiNo_of_Bad_Reads; //A char flag used to record the input file type, example F for fasta, S for .seq.txt, A for .realign char cFileType; char InputFile[FILENAME_MAX]; // Keep the info of the quality score CReadsQualScores* pQualScores; // Keep the info of the best alignments score + # of best alignments CMismatchScores* pMismatchScores; void setBadReadOutputFile(FileOutputBuffer* pOut); protected: CReadsFileParser parser; private: int initialization(void); void handleBadread(void); }; inline const char* CReadInBitsSet::getQScoresPtr(int readId) { if (this->pQualScores == NULL) { return(NULL); } else { return(this->pQualScores->qScores((unsigned int)readId)); } } // This function prints out the reads that have mapping worse than the missMatchScoreT int printMissReads(const char* outputfile, CReadInBitsSet& readSet, int missMatchScoreT); #endif /* CREADSSET_H_ */ ./Source/ReadsFileParser.h0000644011075700120610000000660711720654362015576 0ustar yanghochmath-ar#pragma once #ifndef READS_FILE_PARSER_H_ #define READS_FILE_PARSER_H_ #ifndef MAX_PATH #define MAX_PATH 2048 #endif #ifndef MAX_CHAR_PER_LINE const int MAX_CHAR_PER_LINE = 5000; #endif const int READS_INPUT_BUFFER_SIZE = 
20000000; #ifndef MAX_READ_SET_CAPACITY const int MAX_READ_SET_CAPACITY = 30000000; #endif #include "ReadsQualScores.h" #include "Filename.h" #include "FileInputBuffer.h" #include "FileOutputBuffer.h" #include "ShortReadUtil.h" #include "chdir.h" #include #include using namespace std; /* * This class provides function to read the next reads from different reads file format. */ class CReadsFileParser { public: CReadsFileParser(void); virtual ~CReadsFileParser(void); char openAFileReady2GetRead(const char* filename, const char* fileFormat,\ unsigned int readStartIndex, unsigned int uiRead_Length, bool bDiscardReadsWN, FileOutputBuffer* pBadReadBuf = NULL); // read in a short read file from different format virtual char* get_Next_Read(void); virtual void print_Next_Read(void); ifstream ifile; FileInputBuffer* pBuf; FileOutputBuffer* pOBuf; char InputFile[MAX_PATH]; char caNextReadTag[MAX_CHAR_PER_LINE]; char caNextRead[MAX_CHAR_PER_LINE]; char caNextReadQSs[MAX_CHAR_PER_LINE]; char cFileType; bool bDiscardReadWN; unsigned int readStartIndex; unsigned int uiRead_Length; protected: // get the universal read Id and store in vector // inline void save_next_read_id(const char* tagLine); // get a short read from fasta format char* get_Next_Read_From_Fasta(void); // get a short read from csfasta format char* get_Next_Read_From_csFasta(void); // get a short read from fastq format for Illumina read (seq only) char* get_Next_Read_From_Fastq(void); // get a short read from fastq format for SOLid read (seq only) char* get_Next_Read_From_csFastq(void); private: int initialization(void); // sub functions for get_Next_Read_From_(cs)Fastq inline bool getNextSeqNameInFq(FileInputBuffer* pBuf, char* caBuf); inline bool getNextSeqInFq(FileInputBuffer* pBuf, char* caBuf, unsigned int expLength); inline bool getNextLine(FileInputBuffer* pBuf, char* caBuf, const char exp1stChar); inline bool getNextQScoreInFq(FileInputBuffer* pBuf, char* caBuf, unsigned int expLength); }; /* //estimate 
number of read unsigned int estimateNoOfReads(const char* fileName, const char* fileFormat); unsigned int estimateNoOfReads_From_Fasta(const char* fileName); unsigned int estimateNoOfReads_From_Fastq(const char* fileName); */ void getReadsFileFormat(const char* fileName, char* fileFormat); char getReadsFileFormatSymbol(const char* InputFile, const char* fileFormat); char getReadsFileFormatSymbol(const char* InputFile); bool is_csFastq_format(const char* fileName, const char* fileFormat = ""); bool is_colorspace_reads(const char* fileName); inline bool hasCsfqExtName(const char* fileName) { if (hasTheExtName(fileName, ".csfastq") || \ hasTheExtName(fileName, ".csfq")) { return(true); } else { return(false); } } inline bool hasFqExtName(const char* fileName) { bool bFqFormat = hasTheExtName(fileName, ".fq") || \ hasTheExtName(fileName, ".fastq") || \ hasTheExtName(fileName, ".fastqsanger"); return(bFqFormat); } #endif ./Source/ReadsMapping.h0000644011075700120610000002607011720654362015131 0ustar yanghochmath-ar#pragma once /****** * Purpose: This program is design to do mapping SOLiD or Illumina Reads, * The input is a set of short reads file and a Genome_Indes_TableQ * The mapping result is output to a file * Author: Yangho Chen */ #include "LongReadsSet.h" #include "PairedReadsSet.h" #include "ReadsMappingStats.h" #include "MappingResult.h" #include "Genome_Index_TableQ.h" #include "ParameterList.h" #include "AlignmentsQ.h" #include "ReadInBitsSet.h" #include "ReadInBits.h" #include "ShortReadUtil.h" #include "ColorSpaceRead.h" #include "GenomeNTdata.h" #include "chromosomeNTdata.h" #include "stdafx.h" #include #include #ifdef WIN32 #include #endif // Use OpenMP is gcc version is later than 4.2 #ifdef __GNUC__ #ifdef __GNUC_PATCHLEVEL__ #define __GNUC_VERSION__ (__GNUC__ * 10000 \ + __GNUC_MINOR__ * 100 \ + __GNUC_PATCHLEVEL__) #else #define __GNUC_VERSION__ (__GNUC__ * 10000 \ + __GNUC_MINOR__ * 100) # endif #if __GNUC_VERSION__ >= 40200 #include #endif #else 
#ifdef _MSC_VER #if _MSC_VER >= 2000 #include #endif #endif #endif // Macro for Parallelization with OpenMP #ifndef _OPENMP #define __OPENMP_FOR_PARALLEL__(SHARP_SIGN, openmp_flag) \ {LOG_INFO("Info %d: Use single CPU because OpenMP is not available.\n", CONFIG_LOG);} #else #define __OPENMP_FOR_PARALLEL__(SHARP_SIGN, openmp_flag) {\ int numberOfCPUs = omp_get_num_procs();\ std::cout << numberOfCPUs << " CPUs." << BLANK_LINE << "\n";\ SHARP_SIGN##openmp_flag omp parallel for\ } #endif // For both counter for chromosome and tolerated substitution error const int ALIGNMENT_RESULT_FILE_BUFFER_SIZE = 1000000; int parallelMappingLongReads(vector& readSetsList,\ CGenome_Index_TableQ& indexTable, MappingOpts P); int parallelMapping(vector& readSetsList,\ CGenome_Index_TableQ& indexTable, MappingOpts P); /* * This class maps Illumina or SOliD short reads to reference genome. * It switches different ways to do read mapping, according to different options. */ class CReadsMapping: public CReadsMappingStats { public: CReadsMapping(void); CReadsMapping(MappingOpts P); ~CReadsMapping(void); MappingOpts opt; char cOutputFormat; int mapReadsSets(const char* ReadsSetsList, CGenome_Index_TableQ& table, bool bDiscardReadsWN = true); int mapReads(CReadInBitsSet& readsSet, const CGenome_Index_TableQ& table); int mapLongReads(CLongReadsSet& pairedReadSet, const CGenome_Index_TableQ& table); int queryALongReadInColors(CReadInBits& r1stHalf, CReadInBits& r2ndHalf, const CGenome_Index_TableQ& table, CAlignmentsQ& aQue) const; int queryALongReadInBase(CReadInBits& r1stHalf, CReadInBits& r2ndHalf, const CGenome_Index_TableQ& table, CAlignmentsQ& aQue) const; protected: int printMapInfo(CReadInBitsSet& readsSet, int seedOpt); int setUpIO4Aligment(const char* Solexafile, const CGenome_Index_TableQ& table); int tearDownIO4Aligment(void); FileOutputBuffer* AlignResult; FileOutputBuffer* AmbiguousReads; FileOutputBuffer* BadReads; FileOutputBuffer* MissReads; time_t start, end; const static 
unsigned int CHECK_POINTS = 1000000; int dealMappedRead(const CGenome_Index_TableQ& table, CAlignmentsQ& aQue); int dealMappedLongRead(const CGenome_Index_TableQ& table, CAlignmentsQ& aQue, CMappingResult& m); int dealMissedRead(CMappingResult& m); int dealMissedRead(bool bMapReadInColors, const char* readName, CReadInBits r, const char* qs = NULL); int dealAmbiguousRead(CMappingResult& m); int dealAmbiguousRead(bool bMapReadInColors, const char* readName, CReadInBits r, const char* qs = NULL); int printSingleEndReads(CMappingResult& m); int printLogFile(const char* inputFile); inline void printCheckPointInfo(int i); protected: void initialization(void); inline void printRead(FileOutputBuffer* FileBuf, CMappingResult& m); inline void printRead(FileOutputBuffer* FileBuf, bool bMapReadInColors, const char* readName, CReadInBits r, const char* qs); inline void getLongBaseReadInfo(CReadInBitsSet& readsSet1stHalf, CReadInBitsSet& readsSet2ndHalf, int readId,\ CReadInBits& r1stHalf, CReadInBits& r2ndHalf, CMappingResult& m); inline void getLongColorReadInfo(CReadInBitsSet& readSet1stHalf, CReadInBitsSet& readSet2ndHalf, int readId, \ CReadInBits& r1stHalf, CReadInBits& r2ndHalf, CMappingResult& m); string getMappingFileN(const char* caReadsSetName, const CGenome_Index_TableQ& table); unsigned int checkPairedReadSetSize(CReadInBitsSet& firstHalfSet, CReadInBitsSet& SecondHalfSet); }; inline void CReadsMapping::getLongBaseReadInfo\ (CReadInBitsSet& readSet1stHalf, CReadInBitsSet& readSet2ndHalf, int readId, \ CReadInBits& r1stHalf, CReadInBits& r2ndHalf, \ CMappingResult& m) { m.uiReadLength = this->opt.readLength; // Check the read length // Get read tag if ((int)readSet1stHalf.pReadsID->size() > readId) { strcpy(m.QNAME, readSet1stHalf.pReadsID->at(readId).id); } else { sprintf(m.QNAME, "Read_%d", this->iReadCounter + readId); } // Get read sequence //unsigned int halfReadLength = this->opt.readLength / 2; unsigned int halfReadLength = this->opt.anchorLength; unsigned 
int secondHalfStart = this->opt.readLength - this->opt.anchorLength; r1stHalf.decode(m.caRead); r2ndHalf.decode(&(m.caRead[secondHalfStart])); // Get the quality score if (readSet1stHalf.pQualScores != NULL) { const char* qS1stHalf = readSet1stHalf.pQualScores->qScores((unsigned int)readId); trQScores(halfReadLength, 0, qS1stHalf, m.rawScores); const char* qS2ndHalf = readSet2ndHalf.pQualScores->qScores((unsigned int)readId); trQScores(halfReadLength, 0, qS2ndHalf, &m.rawScores[secondHalfStart]); trQScores(m.uiReadLength, SolexaScoreEncodingShift, m.rawScores, m.QScores); } if (this->cOutputFormat == 's') { m.getReverseReadandQual(); } } inline void CReadsMapping::getLongColorReadInfo\ (CReadInBitsSet& readSet1stHalf, CReadInBitsSet& readSet2ndHalf, int readId, \ CReadInBits& r1stHalf, CReadInBits& r2ndHalf, \ CMappingResult& m) { m.uiReadLength = this->opt.readLength; // Check the read length // Get read tag if ((int)readSet1stHalf.pReadsID->size() > readId) { strcpy(m.QNAME, readSet1stHalf.pReadsID->at(readId).id); } else { sprintf(m.QNAME, "Read_%d", this->iReadCounter + readId); } // Get read sequence unsigned int halfReadLength = this->opt.anchorLength; unsigned int secondHalfStart = this->opt.readLength - this->opt.anchorLength; decodeColors(m.caRead, r1stHalf); decodePureColors(&(m.caRead[secondHalfStart]), r2ndHalf); // Get the quality score for SOLiD /* TODO fix the SOLiD score (base score and color space score*/ if (readSet1stHalf.pQualScores != NULL) { const char* qS1stHalf = readSet1stHalf.pQualScores->qScores((unsigned int)readId); trQScores(halfReadLength, 0, qS1stHalf, m.rawScores); const char* qS2ndHalf = readSet2ndHalf.pQualScores->qScores((unsigned int)readId); trQScores(halfReadLength, 0, qS2ndHalf, &m.rawScores[secondHalfStart]); trQScores(m.uiReadLength, Phred_SCALE_QUAL_SHIFT, m.rawScores, m.QScores); } // TODO for sam Format, one need reversed quality and Seq if (this->cOutputFormat == 's') { m.getReverseReadandQual(); } } inline void 
CReadsMapping::printCheckPointInfo(int readNo) { if (readNo % this->CHECK_POINTS == 0) { printf("Mapping no %u reads.\r", this->iReadCounter + readNo); fflush(stdout); } } // The following function fill different part of CMappingResult inline bool getSingleMappingIndex(CGenomeNTdata& pgenomeNT, CAlignmentsQ& aQue, int mappingIndex, CMappingResult &m) { m.uiDiff = aQue.asdiff[mappingIndex]; m.MultipleMappedNo = aQue.load; m.strand = (mappingIndex >= (int)aQue.ForwardAlignmentLoad) ? '-' : '+' ; m.uiGlobalMappedPos = aQue.aiHitIndex[mappingIndex]; m.uiRefId = pgenomeNT.genomeIndex2chrID(m.uiGlobalMappedPos); m.uiPOS = pgenomeNT.genomeLocusID2chrIndex(m.uiGlobalMappedPos); ChrIndex2GeneName& geneVec = pgenomeNT.paChromosomes[m.uiRefId]->geneVec; if (geneVec.table.size() > 0) { CGene g = geneVec.query(m.uiPOS); strcpy(m.RNAME, g.name.c_str()); if(!g.isValid) { // this is for handeling mapping exception for mapping 2 NULL_REGION char* chrName = pgenomeNT.paChromosomes[m.uiRefId]->caInputFileName; char* strBuf = &m.RNAME[strlen(g.name.c_str())]; myStrCpy(strBuf, chrName, FILENAME_MAX/2); return(false); } // g.startIndex is the translated index of m.uiPOS. 
Not the start index of gene m.uiPOS = g.startIndex; } else { sprintf(m.RNAME, "%d", m.uiRefId); } return(true); } void getQscores4Solexa(CAlignmentsQ& aQue, CMappingResult& m, bool samFormat); void getReadQscores4Solexa(CAlignmentsQ& aQue, CMappingResult& m, bool samFormat); void getSingleMappingSeqAndQ4SOLiD\ (const CGenome_Index_TableQ& table, CAlignmentsQ& aQue, CMappingResult& m, bool samFormat); void getSingleMappingSeq4Solexa(const CGenome_Index_TableQ& table, CMappingResult& m, bool samFormat); void getLongMappingInfo(const CGenome_Index_TableQ& table, CAlignmentsQ& aQue, bool samFormat,\ unsigned int mappingId, CMappingResult& m); void getSingleMappingInfo(const CGenome_Index_TableQ& table, CAlignmentsQ& aQue,\ unsigned int mappingId, CMappingResult& m, bool samFormat); inline void printSamHeader(FileOutputBuffer* AlignResult, vector&refs, const char* RG, const char* CL) { sprintf(AlignResult->caBufp, "@HD\tVN:0.1.5c\tSO:queryname\n"); AlignResult->UpdateSize(); for (vector::iterator it = refs.begin(); it != refs.end(); it++ ) { sprintf(AlignResult->caBufp, "@SQ\tSN:%s\tLN:%u\n", it->name.c_str(), it->startIndex); AlignResult->UpdateSize(); // startIndex is actually the length of reference. 
} sprintf(AlignResult->caBufp, "%s\n@PG\tID:PerM\tVN:0.4.0\tCL:\"%s\"\n", RG, CL); AlignResult->UpdateSize(); } inline const char* getLongRefSeq(const CGenome_Index_TableQ& table, CMappingResult& m, bool bNoRef) { if(bNoRef) { m.caRef[0] = '\0'; } else { unsigned int secondHalfStart = m.uiReadLength - table.uiRead_Length; CReadInBits ref1stHalf = table.pgenomeNTInBits->getSubstringInBits\ (m.uiGlobalMappedPos, table.uiRead_Length); CReadInBits ref2ndHalf = table.pgenomeNTInBits->getSubstringInBits\ (m.uiGlobalMappedPos + secondHalfStart, table.uiRead_Length); ref1stHalf.decode(m.caRef); ref2ndHalf.decode(&(m.caRef[secondHalfStart])); if(m.strand == '-') { reverseComplementKmer(m.caRef); // reverse complement reference } } return(m.caRef); } bool wrongIndex(const CReadInBitsSet& readsSet, const CGenome_Index_TableQ& table);./Source/ReadsMappingFlags.h0000644011075700120610000000110411720654362016075 0ustar yanghochmath-ar#pragma once #include "stdafx.h" // #include "ProgramOptions.h" class CReadsMappingFlags { public: CReadsMappingFlags(void); ~CReadsMappingFlags(void); protected: // int ptr_initialization(CProgramOptions *_param); int set_Default_Opt(void); //unsigned int uiSubThreshold; bool bSaveTable; bool bPrintUnMappedReads; bool bPrintAlignment; bool bPrintGeneName; // search not only the best in terms of mismatches, but all within the criteria. bool bSearchAllAlignment; // Currently set to be false by default char cOutputFormat; }; ./Source/ReadsMappingStats.h0000644011075700120610000000305511720654362016146 0ustar yanghochmath-ar#pragma once #include "stdafx.h" #include "GenomeNTdata.h" #include "AlignmentsQ.h" /* * This class is a base class for CReadsMapping, which collects the counters for statistics of mapping reads. 
*/ class CReadsMappingStats { public: CReadsMappingStats(void); virtual ~CReadsMappingStats(void); CAlignmentsQ alignmentsQ[2]; // Queue the founded alignments (Two Queues for paired end reads) protected: static const unsigned int SNP_TYPE_NUM = 4; // Complement, Transition, Transvertion and Mixed int iReadsFileCount; /* Basic mapping statistics counter */ void initializeStatsCounter(void); // return true if print the alignments void bookKeepMapping(CAlignmentsQ& que); // return to a bool value indicating the alignment should be printed or not bool printAlignmentOrNot(CAlignmentsQ& que, bool bExcludeAmbiguous, bool ambiguousOnly) const; // Print the counter in an order of Runs,Chr0,Chr1,Chr2,Total Hits,Sub0,Sub1,Sub2,Total Kmers int printMappingStats(ostream& out, const char* readSetName, unsigned int uiSubThreshold) const; void printCommand(ostream& out, string command); unsigned int iMapCount; unsigned int iMapDiffCount[MAXTOLERATSUBMIS + 1]; unsigned int iReadCounter; unsigned int iBadReadCounter; unsigned int iMissReadCounter; // # of reads that has multiple place mapping(repeats or ambiguous read) unsigned int iMultiMappedReads; unsigned int iReadsMapped2tooManyLocations; unsigned int iMultiMappedLocationThreshold; private: void initialization(void); }; ./Source/ReadsQualScores.h0000644011075700120610000001103111720654362015606 0ustar yanghochmath-ar#pragma once #include "stdafx.h" #include "Filename.h" #include "FileInputBuffer.h" #include using namespace std; const char SolexaScoreEncodingShift = 64; const char Phred_SCALE_QUAL_SHIFT = 33; const int READ_ID_LENGTH = 260; struct CReadID { char id[READ_ID_LENGTH]; }; class CReadsQualScores { public: CReadsQualScores(void); CReadsQualScores(unsigned int readLength, unsigned int numOfReads); ~CReadsQualScores(void); void clear(void); void reserve(unsigned int numOfReads); bool openQUALfile(const char* Filename); unsigned int getQualityScoresFromQUAL(vector* pReadsID); inline void addQSs(const char* QSs); inline 
// Fill QSs with `readLength` copies of `score` and NUL-terminate it.
//   readLength - number of score characters to write.
//   score      - the dummy quality-score character.
//   QSs        - destination buffer; must hold at least readLength + 1 bytes.
inline void fillDummyQScores(unsigned int readLength, char score, char* QSs)
{
    // memset takes (destination, fill value, byte count) in that order.
    memset(QSs, score, readLength * sizeof(char));
    QSs[readLength] = '\0';
}
return pointers of the quality scores for a read inline char* CReadsQualScores::qScores(unsigned int readId) { unsigned int qsId = readId * this->readLength; if (qsId < size) return(&this->QSarray[qsId]); else printf("\rAccess qscore out of bound!"); return(NULL); // dummy quality score } // return the quality score of a base signal or a color signal in SOLiD system inline char CReadsQualScores::qs(unsigned int readId, unsigned int possition) { unsigned int qsId = readId * this->readLength + possition; if (qsId < size) return(this->QSarray[qsId]); else printf("\rAccess qscore out of bound!"); return(0); // dummy quality score } // return the quality score of a base by average the two color signal quality score inline char CReadsQualScores::baseQS4SOLiD(unsigned int readId, unsigned int possition) { unsigned int qsId = readId * this->readLength + possition; if (qsId < size) { char qScore = this->QSarray[qsId]; // The first and the last base may have less accuracy if (possition != 0 && possition != this->readLength - 1) { qScore += this->QSarray[qsId + 1]; } return(qScore / 2); // Return the average } else printf("\rAccess qscore out of bound!"); return(0); //dummy quality score } // return sum of the quality score for mismatched bases. int alignmentScore(char* str1, char* str2, unsigned int readLength, const char* sc); double getAverageQualityScores(CReadsQualScores& Qscores); // preint comma separate scores string on qScoresStr void printCommaSepScoresStr(unsigned int readlength, const char* qScores, char* qScoresStr); ./Source/SeedPattern.h0000644011075700120610000001136411720654362014775 0ustar yanghochmath-ar#pragma once #include "bitsOperationUtil.h" #include "ReadInBits.h" #define EXTEND_SEED true /* * This file define the seed function including generating hashValue and key for short DNA read according to seed, * which is consisted of repeated pattern of selected position. 
* F2 seed is consist of repeat pattern (111*1**), which is full sensitive to two mismatches. * S1_1 seed is consist of repeat pattern (1111**1***), which is full sensitive to three mismatches. * S2_0 seed is consist of repeat pattern (1111**1****), which is full sensitive to two pairs of consecutive mismatches. * F3 seed is consist of repeat pattern (111*1**1***), which is full sensitive to three mismatches. * F4 seed is consist of repeat pattern (11***1****), which is full sensitive to three mismatches. */ typedef unsigned int(*ptHashFunc)(CReadInBits); // currently the bits for hashing is 13 const unsigned int BITS_FOR_HASHING = 13; // Corresponding to the seed that is full sensitive to exact match alignments unsigned int getF0SeedHashValue(CReadInBits r); unsigned int getF0SeedKey(CReadInBits r, int keyWeight); ptHashFunc selectF0(int readlength); // Corresponding to the seed that is full sensitive to one mismatch alignments unsigned int getF1SeedHashValue(CReadInBits r); unsigned int getF1SeedHashValue15(CReadInBits r); unsigned int getF1SeedKey(CReadInBits r, int keyWeight); ptHashFunc selectF1(int readlength); // Corresponding to the seed that is full sensitive to two mismatches alignments unsigned int getF2SeedHashValue(CReadInBits r); unsigned int getF2SeedHashValue4ReadLength25_27(CReadInBits r); unsigned int getF2SeedHashValue4ReadLength23_24(CReadInBits r); unsigned int getF2SeedKey(CReadInBits r, int keyWeight); ptHashFunc selectF2(int readlength); // Corresponding to the seed that is full sensitive to alignments with // two consecutive mismatches + a randome mismatches (For Solid) unsigned int getS1_1SeedHashValue(CReadInBits r); // Hash Value for differnet length unsigned int getS1_1SeedHashValue4ReadLength31(CReadInBits r); // get first 12 care positions unsigned int getS1_1SeedHashValue4ReadLength30(CReadInBits r); // get first 11 care positions unsigned int getS1_1SeedHashValue4ReadLength26_29(CReadInBits r); // get first 10 care positions unsigned 
int getS1_1SeedHashValue4ReadLength23_25(CReadInBits r); // get first 9 care positions unsigned int getS1_1SeedKey(CReadInBits r, int keyWeight); ptHashFunc selectS1_1(int readlength); // Seed that is full sensitive to alignments with two pairs of two consecutive mismatches unsigned int getS2_0SeedHashValue(CReadInBits r); unsigned int getS2_0SeedHashValue4ReadLength34(CReadInBits r); unsigned int getS2_0SeedHashValue4ReadLength33(CReadInBits r); unsigned int getS2_0SeedHashValue4ReadLength28_32(CReadInBits r); unsigned int getS2_0SeedHashValue4ReadLength25_27(CReadInBits r); unsigned int getS2_0SeedKey(CReadInBits r, int keyWeight); unsigned int getS2_0SeedKey4ReadLength34(CReadInBits r, int keyWeight); ptHashFunc selectS2_0(int readlength); // Sseed full sensitive to alignments with any three random mismatches. unsigned int getF3SeedHashValue(CReadInBits r); unsigned int getF3SeedHashValue4ReadLength34(CReadInBits r); unsigned int getF3SeedHashValue4ReadLength33(CReadInBits r); unsigned int getF3SeedHashValue4ReadLength29_32(CReadInBits r); unsigned int getF3SeedHashValue4ReadLength26_28(CReadInBits r); unsigned int getF3SeedHashValue4ReadLength25(CReadInBits r); unsigned int getF3SeedKey(CReadInBits r, int keyWeight); unsigned int getF3SeedKey4ReadLength34(CReadInBits r, int keyWeight); unsigned int getF3SeedKey4ReadLength32(CReadInBits r, int keyWeight); ptHashFunc selectF3(int readlength); // Seed that is full sensitive to alignments with one two consecutive mismatches pair + two random mismatches unsigned int getS1_2SeedHashValue(CReadInBits r); unsigned int getS1_2SeedKey4ReadLength46_49(CReadInBits r, int keyWeight); // Seed that is full sensitive to alignments with four random mismatches unsigned int getF4SeedHashValue(CReadInBits r); unsigned int getF4SeedHashValue4ReadLength41(CReadInBits r); unsigned int getF4SeedHashValue4ReadLength40(CReadInBits r); unsigned int getF4SeedHashValue4ReadLength35_39(CReadInBits r); unsigned int 
//TODO separate inline functions for single base and functions for short Read.
#define NT_SIZE 4

/**
 * Return true when nt is one of A, C, G, T in either case.
 * When DIPLOID is defined, the upper-case two-base ambiguity codes
 * R, Y, M, K, S and W are accepted as well.
 */
inline bool isACGT(char nt)
{
    switch (nt) {
    case 'a':
    case 'c':
    case 'g':
    case 't':
    case 'A':
    case 'C':
    case 'G':
    case 'T':
        return(true);
#ifdef DIPLOID
    case 'R': // G or A as puRine
    case 'Y': // T or C as pYrimidine
    case 'M': // aMino
    case 'K': // Keto
    case 'S': // Strong interaction
    case 'W': // Weak interaction
        return(true);
#endif
    default:
        return(false);
    }
}

/**
 * Map a base to its index: A=0, C=1, G=2, T=3 (case-insensitive).
 * Every other character maps to 4.
 */
inline int nt2Id(char nt)
{
    switch (nt) {
    case 'A':
    case 'a':
        return(0);
    case 'C':
    case 'c':
        return(1);
    case 'G':
    case 'g':
        return(2);
    case 'T':
    case 't':
        return(3);
    default:
        return(4);
    }
}

/** Return true when nt is one of the digit characters '0'..'3'. */
inline bool is0123(char nt)
{
    switch (nt) {
    case '0':
    case '1':
    case '2':
    case '3':
        return true;
    default:
        return false;
    }
}

/**
 * Return true when nt is a nucleotide letter in either case:
 * A, C, G, T, U, N or one of the ambiguity codes R, Y, M, K, S, W, B, D, H, V.
 */
inline bool isNucleotide(char nt)
{
    switch (nt) {
    case 'a':
    case 'c':
    case 'g':
    case 't':
    case 'A':
    case 'C':
    case 'G':
    case 'T':
    case 'U':
    case 'u':
    case 'R': // G or A as puRine
    case 'r':
    case 'Y': // T or C as pYrimidine
    case 'y':
    case 'M': // aMino
    case 'm':
    case 'N': // unknown
    case 'n':
    case 'K': // Keto
    case 'k':
    case 'S': // Strong interaction
    case 's':
    case 'W': // Weak interaction
    case 'w':
    case 'B': // GTC
    case 'b':
    case 'D': // GAT
    case 'd':
    case 'H': // ACT
    case 'h':
    case 'V': // GCA
    case 'v':
        return(true);
    default:
        return(false);
    }
}

/*
 * Perform the wildcard comparison between two bases. If they match return true.
 * Only nt1 is interpreted as a wildcard (upper-case R, Y, M, K, S, W); the
 * comparison is therefore not symmetric: ('R','G') matches, ('G','R') does not.
 */
inline bool diNtWildCardComp(char nt1, char nt2)
{
    if (nt1 == nt2)
        return true;
    switch (nt1) {
    case 'R':
        if (nt2 == 'G' || nt2 == 'A' || nt2 == 'R')
            return true;
        break;
    case 'Y':
        if (nt2 == 'T' || nt2 == 'C' || nt2 == 'Y')
            return true;
        break;
    case 'M':
        if (nt2 == 'A' || nt2 == 'C' || nt2 == 'M')
            return true;
        break;
    case 'K':
        if (nt2 == 'G' || nt2 == 'T' || nt2 == 'K')
            return true;
        break;
    case 'S':
        if (nt2 == 'G' || nt2 == 'C' || nt2 == 'S')
            return true;
        break;
    case 'W':
        if (nt2 == 'T' || nt2 == 'A' || nt2 == 'W')
            return true;
        break;
    default:
        return false;
    }
    return false;
}

unsigned int diNtStrWildCardComp(char* read1, char* read2, unsigned int readlength);
unsigned int strComp(char* str1, char* str2, int l);
unsigned int strCompMarkDiff(char* str1, char* str2);

/**
 * Return the complementary base (A<->T, C<->G), preserving case.
 * Any character other than a/c/g/t/A/C/G/T yields 'N'.
 */
inline char complimentBase(char ntbase)
{
    switch (ntbase) {
    case 'a':
        return('t');
    case 'c':
        return('g');
    case 'g':
        return('c');
    case 't':
        return('a');
    case 'A':
        return('T');
    case 'C':
        return('G');
    case 'G':
        return('C');
    case 'T':
        return('A');
    default:
        return('N');
    }
}

/**
 * Apply a color digit to an upper-case base and return the resulting base.
 *
 *   color '1': A<->C, G<->T
 *   color '2': A<->G, C<->T
 *   color '3': A<->T, C<->G
 *   any other color (including '0'): nt is returned unchanged.
 *
 * A base that is not upper-case A/C/G/T is returned unchanged.
 *
 * Fix: the inner switches used to test `color` again instead of `nt`, so no
 * inner case could ever match and the function always returned nt.
 */
inline char base2color(char nt, char color)
{
    switch (color) {
    case '1':
        switch (nt) {
        case 'A':
            return('C');
        case 'C':
            return('A');
        case 'G':
            return('T');
        case 'T':
            return('G');
        default:
            return(nt);
        }
    case '2':
        switch (nt) {
        case 'A':
            return('G');
        case 'C':
            return('T');
        case 'G':
            return('A');
        case 'T':
            return('C');
        default:
            return(nt);
        }
    case '3':
        switch (nt) {
        case 'A':
            return('T');
        case 'C':
            return('G');
        case 'G':
            return('C');
        case 'T':
            return('A');
        default:
            return(nt);
        }
    default: // include color 0
        return(nt);
    }
}

char getBaseFromColors(char nt, const char* colors, int pos);
void toUpperCase(char* caArray, int length);
void mutateBase(char* Base);
char* mutateRead(char* Kmer , unsigned int No_of_mutation);
mutatePairsOfConsecutiveBases(char* Kmer, unsigned int no_of_mutated_pairs); bool isBadRead(const char* tkmer, unsigned int KmerLength); bool isBadSOLiDRead(const char* Read, unsigned int ReadLength); bool isBadRead(bool isSOLiD, const char* Read, unsigned int ReadLength); //return the complement kmer from 5'->3', destroy the original kmer char* reverseComplementKmer(char* Kmer); char* reverseKmer(char* Kmer); #endif ./Source/TestChromosomeNTdata.h0000644011075700120610000000062611720654362016625 0ustar yanghochmath-ar#pragma once #include "chromosomeNTdata.h" class TestChromosomeNTdata { public: const static unsigned int defaultNtPerLine = 96; TestChromosomeNTdata(const char* testInputChrFileN, const char* testOutputChrFileN); ~TestChromosomeNTdata(void); int outputFasta(const char* filename, unsigned int ntPerLine, const char* ntStr); int generateTestInput(const char* filename); }; ./Source/TestGenome_Index_TableQ.h0000644011075700120610000000101611720654362017201 0ustar yanghochmath-ar#pragma once #include "Genome_Index_TableQ.h" class SimulateLongRead { public: unsigned int uiReadLength; SimulateLongRead(CGenomeInBits* pgenomeNTInBits, unsigned int startIndex); ~SimulateLongRead(void); bool goodRead; CReadInBits half1st; CReadInBits half2nd; char read[MAX_LINE]; char originalRead[MAX_LINE]; }; bool testGenome_Index_TableQ(CGenome_Index_TableQ* table); bool testMappingLongRead(CGenome_Index_TableQ* table); bool testMappingLongPairedRead(CGenome_Index_TableQ* table); ./Source/bitsOperationUtil.h0000644011075700120610000001556411720654362016245 0ustar yanghochmath-ar#pragma once #ifndef BITS_OPERATION_UTIL_H #define BITS_OPERATION_UTIL_H #include "stdafx.h" /* * Most bitwise operation are adopted from Sean Eron Anderson at Stanford */ /* #ifdef BIT32 #define WORD_SIZE unsigned int const unsigned int wordSize = 32; #else */ #define WORD_SIZE unsigned long long const unsigned int wordSize = 64; // #endif #ifdef __MSVC #endif #ifdef __GNUC__ #endif // unsigned int 
/* Lookup table: BitReverseTable256[b] is the byte b with its 8 bits mirrored. */
static const unsigned char BitReverseTable256[] = {
    0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0,
    0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8,
    0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4,
    0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC,
    0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2,
    0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,
    0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,
    0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE,
    0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,
    0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9,
    0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5,
    0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,
    0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3,
    0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
    0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7,
    0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF
};

/**
 * Mirror the bits of a 32-bit word (bit 0 becomes bit 31 and so on).
 * Each byte is mirrored through BitReverseTable256 and the byte order is
 * swapped. The code addresses exactly four bytes of the word.
 */
inline unsigned int reverse32bits(unsigned int word)
{
    unsigned int c;
    unsigned char * p = (unsigned char *) & word;
    unsigned char * q = (unsigned char *) & c;
    q[3] = BitReverseTable256[p[0]];
    q[2] = BitReverseTable256[p[1]];
    q[1] = BitReverseTable256[p[2]];
    q[0] = BitReverseTable256[p[3]];
    return c;
}

/**
 * Mirror the bits of a 64-bit word by mirroring each 32-bit half with
 * reverse32bits and swapping the halves.
 */
inline unsigned long long reverse64bits(unsigned long long word)
{
    //get the lower 32bits first
    unsigned long long returnValue = (unsigned long long) reverse32bits(0xffffffff & ((unsigned int) word));
    returnValue <<= 32; //shift to upper bits
    word >>= 32; //Shift upper bits to lower bits.
    returnValue += (unsigned long long) reverse32bits(0xffffffff & ((unsigned int) word));
    return (returnValue);
}

/* Lookup table: MortonTable256[b] spreads the 8 bits of b onto the even bit positions. */
static const unsigned short MortonTable256[] = {
    0x0000, 0x0001, 0x0004, 0x0005, 0x0010, 0x0011, 0x0014, 0x0015, 0x0040, 0x0041, 0x0044, 0x0045, 0x0050, 0x0051, 0x0054, 0x0055,
    0x0100, 0x0101, 0x0104, 0x0105, 0x0110, 0x0111, 0x0114, 0x0115, 0x0140, 0x0141, 0x0144, 0x0145, 0x0150, 0x0151, 0x0154, 0x0155,
    0x0400, 0x0401, 0x0404, 0x0405, 0x0410, 0x0411, 0x0414, 0x0415, 0x0440, 0x0441, 0x0444, 0x0445, 0x0450, 0x0451, 0x0454, 0x0455,
    0x0500, 0x0501, 0x0504, 0x0505, 0x0510, 0x0511, 0x0514, 0x0515, 0x0540, 0x0541, 0x0544, 0x0545, 0x0550, 0x0551, 0x0554, 0x0555,
    0x1000, 0x1001, 0x1004, 0x1005, 0x1010, 0x1011, 0x1014, 0x1015, 0x1040, 0x1041, 0x1044, 0x1045, 0x1050, 0x1051, 0x1054, 0x1055,
    0x1100, 0x1101, 0x1104, 0x1105, 0x1110, 0x1111, 0x1114, 0x1115, 0x1140, 0x1141, 0x1144, 0x1145, 0x1150, 0x1151, 0x1154, 0x1155,
    0x1400, 0x1401, 0x1404, 0x1405, 0x1410, 0x1411, 0x1414, 0x1415, 0x1440, 0x1441, 0x1444, 0x1445, 0x1450, 0x1451, 0x1454, 0x1455,
    0x1500, 0x1501, 0x1504, 0x1505, 0x1510, 0x1511, 0x1514, 0x1515, 0x1540, 0x1541, 0x1544, 0x1545, 0x1550, 0x1551, 0x1554, 0x1555,
    0x4000, 0x4001, 0x4004, 0x4005, 0x4010, 0x4011, 0x4014, 0x4015, 0x4040, 0x4041, 0x4044, 0x4045, 0x4050, 0x4051, 0x4054, 0x4055,
    0x4100, 0x4101, 0x4104, 0x4105, 0x4110, 0x4111, 0x4114, 0x4115, 0x4140, 0x4141, 0x4144, 0x4145, 0x4150, 0x4151, 0x4154, 0x4155,
    0x4400, 0x4401, 0x4404, 0x4405, 0x4410, 0x4411, 0x4414, 0x4415, 0x4440, 0x4441, 0x4444, 0x4445, 0x4450, 0x4451, 0x4454, 0x4455,
    0x4500, 0x4501, 0x4504, 0x4505, 0x4510, 0x4511, 0x4514, 0x4515, 0x4540, 0x4541, 0x4544, 0x4545, 0x4550, 0x4551, 0x4554, 0x4555,
    0x5000, 0x5001, 0x5004, 0x5005, 0x5010, 0x5011, 0x5014, 0x5015, 0x5040, 0x5041, 0x5044, 0x5045, 0x5050, 0x5051, 0x5054, 0x5055,
    0x5100, 0x5101, 0x5104, 0x5105, 0x5110, 0x5111, 0x5114, 0x5115, 0x5140, 0x5141, 0x5144, 0x5145, 0x5150, 0x5151, 0x5154, 0x5155,
    0x5400, 0x5401, 0x5404, 0x5405, 0x5410, 0x5411, 0x5414, 0x5415, 0x5440, 0x5441, 0x5444, 0x5445, 0x5450, 0x5451, 0x5454, 0x5455,
    0x5500, 0x5501, 0x5504, 0x5505, 0x5510, 0x5511, 0x5514, 0x5515, 0x5540, 0x5541, 0x5544, 0x5545, 0x5550, 0x5551, 0x5554, 0x5555
};

/**
 * Interleave the bits of x and y: bit i of x lands on bit 2*i of the result,
 * bit i of y on bit 2*i + 1.
 *
 * The table entries are widened to unsigned int before shifting; shifting the
 * int-promoted unsigned short left by 17 could overflow a signed int.
 */
// 11 bitwise operations
inline unsigned int InterleaveBits(unsigned short x, unsigned short y)
{
    const unsigned int yHigh = MortonTable256[y >> 8];
    const unsigned int xHigh = MortonTable256[x >> 8];
    const unsigned int yLow = MortonTable256[y & 0xFF];
    const unsigned int xLow = MortonTable256[x & 0xFF];
    unsigned int z = yHigh << 17 | xHigh << 16 | yLow << 1 | xLow;
    return(z);
}

/**
 * Shift word left by shiftBits.
 * The shift is done as (shiftBits mod 32) plus, when shiftBits >= 32, a further
 * 32 bits in two steps of 16, so a single shift never reaches the width of a
 * 32-bit type. As a consequence shiftBits >= 64 does NOT produce 0; it behaves
 * like 32 + (shiftBits mod 32).
 */
inline unsigned long long longlongShiftLeft(unsigned long long word, unsigned int shiftBits)
{
    const unsigned int BITS_PER_INT = 32;
    // Not all the compiler can support variable >> 32 directly
    unsigned long long returnValue = word << (shiftBits % BITS_PER_INT);
    if (shiftBits >= BITS_PER_INT) {
        returnValue <<= 16;
        returnValue <<= 16;
    }
    return(returnValue);
}

/**
 * Shift word right by shiftBits, using the same two-stage scheme (and the same
 * behaviour for shiftBits >= 64) as longlongShiftLeft.
 */
inline unsigned long long longlongShiftRight(unsigned long long word, unsigned int shiftBits)
{
    const unsigned int BITS_PER_INT = 32;
    unsigned long long returnValue = word >> (shiftBits % BITS_PER_INT); // shift (word mod 32);
    if (shiftBits >= BITS_PER_INT) {
        returnValue >>= 16;
        returnValue >>= 16;
    }
    return(returnValue);
}

/** Clear the least significant bit of bitsStr in place. */
inline void clearLastBit(unsigned long long& bitsStr)
{
    bitsStr >>= 0x01;
    bitsStr <<= 0x01;
}

/**
 * Return true when the k-th bit of data is set.
 * k is 1-based (k == 1 is the least significant bit); k == 0 is treated like k == 1.
 */
inline bool isKthBitSet(unsigned long long data, unsigned int k)
{
    unsigned long long flagMask = 0x01;
    if (k) {
        flagMask = longlongShiftLeft(flagMask, k - 1);
    }
    if (data & flagMask) {
        return(true);
    } else {
        return(false);
    }
}

/**
 * Set (bit == true) or clear (bit == false) the k-th bit of bitsStr in place.
 * k uses the same 1-based convention as isKthBitSet.
 *
 * Fix: clearing used `bitsStr &= (!flagMask)`. Logical NOT of a non-zero mask
 * is 0, so the whole word was wiped; the bitwise complement is required.
 */
inline void setKthBit(unsigned long long& bitsStr, unsigned int k, bool bit)
{
    unsigned long long flagMask = 0x01;
    if (k) {
        flagMask = longlongShiftLeft(flagMask, k - 1);
    }
    if (bit) {
        bitsStr |= flagMask;
    } else {
        bitsStr &= ~flagMask;
    }
}
"ReadInBits.h" class CChromosomeInBits { public: CChromosomeInBits(void); CChromosomeInBits(char* caChromosome, unsigned int uiChrLength); ~CChromosomeInBits(void); // Two arrays to store the chromosome string encoded with bits WORD_SIZE* pUpperBits; WORD_SIZE* pLowerBits; unsigned int uiChrLength; unsigned int uiChrLengthInWordSize; char* caChromosome; // Don't delete this pointer pointing outside // get the wordSize substring encoded in bits and store in upperBits and lowerBits CReadInBits getSubstringInBits(unsigned int uiGenomeIndex); // eliminate the bits beyond read length. Not the length should smaller than word_size CReadInBits getSubstringInBits(unsigned int uiGenomeIndex, unsigned int uiSubstringLength); // call getSubstringInBits and transform the info to DNA sequence and store in caSubstring char* getSubstring(unsigned int uiGenomeIndex); // get the substring encoded in bits which is shorter than wordSizea. char* getSubstring(unsigned int uiGenomeIndex, unsigned int uiSubstringLength); char caSubstring[128 + 1]; //WORD_SIZE upperBits; //WORD_SIZE lowerBits; private: int initialization(void); }; #endif ./Source/chromosomeNTdata.h0000644011075700120610000000373311720654362016027 0ustar yanghochmath-ar#pragma once #ifndef CHROMOSOME_NT_DATA_H #define CHROMOSOME_NT_DATA_H #include "ChrIndex2GeneName.h" #include "ShortReadUtil.h" #include "Filename.h" #include #include #include #include "stdafx.h" using namespace std; const int _MAX_KMER_LENGTH_ = 1024; /* * This class is designed to read in a big chromosome file (in fasta format) * and get the fragment of each kmer. */ class CchromosomeNTdata { public: CchromosomeNTdata(void); CchromosomeNTdata(const char* Filename, bool bFasta = false); ~CchromosomeNTdata(void); // Currently, the only accept .fasta or .fa which contains AaCcGgTt and N. Other character will be removed. 
char caInputFileName[FILENAME_MAX]; int Constructor_Fasta(const char* Filename); int Consrructor_PreSeq(const char* Filename); // Generate the next kmer starting from this->SlideWindowStart and put into this->caKmer char* fragKmer(unsigned int uiKmer_Length); // Generate the next kmer with only ACGT and put into this->caKmer char* fragACGTKmer(unsigned int uiKmer_Length); char* caChromosome; ChrIndex2GeneName geneVec; //This shouldn't be a free pointer. This is a buffer need to new space, fixed to some space, and release char caKmer[_MAX_KMER_LENGTH_]; unsigned int iChromosome_size; //bool flag to show it has generated the last fragment bool end; //The start index for generating k-mer fragment is set. unsigned int SlideWindowStart; protected: unsigned int removedNonACGTNBaseAndCollectGeneName(void); private: int initialization(void); int addFakeRefName(const char* Filename); //This will get chromosome size from a fasta file // Temporarily not used because of I/O speed on PC is slow int getsizeofChromosome(const char* Filename); int readFastaFileLineByLine(ifstream &ifile); }; // used for pre-processed chromosome typedef struct _ch_header { unsigned int totalSize; unsigned int size; } ch_header; #endif ./Source/minQ.h0000644011075700120610000000037411720654362013462 0ustar yanghochmath-ar#pragma once #include using namespace std; class minQ { public: minQ(void); ~minQ(void); queue q; deque min; int push(int num); int pop(); int front(); int getMin(); }; void testMinQ(); ./Source/refInBinFile.h0000644011075700120610000000055011720654362015046 0ustar yanghochmath-ar#pragma once #include "chromosomeNTdata.h" #include "GenomeInBits.h" #include "GenomeNTdata.h" #include "ShortReadUtil.h" #include "stdafx.h" #include #include #include int readRefInBinFile(FILE* fp, CGenomeInBits* gInBits, CGenomeNTdata* g); int saveRefInBinFile(FILE* fp, const CGenomeInBits* gInBits, const CGenomeNTdata* g); ./Source/refInBinary.h0000644011075700120610000000000011720654362014750 0ustar 
yanghochmath-ar./Source/resource.h0000644011075700120610000000056511720654362014407 0ustar yanghochmath-ar//{{NO_DEPENDENCIES}} // Microsoft Visual C++ generated include file. // Used by PerM_P.rc // 下一個新增物件的預設值 // #ifdef APSTUDIO_INVOKED #ifndef APSTUDIO_READONLY_SYMBOLS #define _APS_NEXT_RESOURCE_VALUE 101 #define _APS_NEXT_COMMAND_VALUE 40001 #define _APS_NEXT_CONTROL_VALUE 1001 #define _APS_NEXT_SYMED_VALUE 101 #endif #endif ./Source/seedOptions.h0000644011075700120610000000207711720654362015054 0ustar yanghochmath-ar#ifndef SEED_OPTIONS #define SEED_OPTIONS const unsigned int FULL_SENSITIVE_OPT_TO_TWO_BASE_MIS = 20; const unsigned int FULL_SENSITIVE_OPT_TO_ONE_BASE_ONE_COLOR_MIS = 11; const unsigned int FULL_SENSITIVE_OPT_TO_ONE_BASE_TWO_COLOR_MIS = 12; const unsigned int MAX_FULL_SENSITIVITY_OPT = 4; // used when allowing more mismatch with partial sensitivity, for higher mismatch threshold. const unsigned int DEFAULT_SEED_OPTION = 3; // used when allowing more mismatch with partial sensitivity, for higher mismatch threshold. #include using namespace std; inline string seedSymbol(unsigned seedId) { switch (seedId) { case 0: return(string("F0")); case 1: return(string("F1")); case 2: return(string("F2")); case 11: return(string("S11")); case 12: return(string("S12")); case 3: return(string("F3")); case 20: return(string("S20")); case 4: return(string("F4")); default: LOG_INFO("\nInfo %d: Unknown seed opts.\n", INFO_LOG); return(string("")); } } #endif ./Source/stdafx.h0000644011075700120610000001140011720654362014037 0ustar yanghochmath-ar// stdafx.h is a header that almost every class in the project will include // #pragma once #include #include #include #include #include #include #include #include #include #include "time.h" #include "Filename.h" //#ifdef WIN32 #include "chdir.h" //#else //#include "errno.h" //#endif using namespace std; // unmark the line for DEBUG // #define DEBUG true // Change the log level to see different level of information. 
#define LOG_LEVEL 3
#define ERROR_LOG 5
#define WARNING_LOG 4
#define INFO_LOG 3
#define CONFIG_LOG 2
#define FINE_LOG 1

#ifdef _WIN32
#ifndef WIN32
#define WIN32 _WIN32
#endif
#endif

#ifdef _WIN32
#define _CRT_SECURE_NO_WARNINGS true
#endif

/*
 * LOG_INFO(format, level, ...): print a message when level >= LOG_LEVEL.
 * The level itself is passed to the format as its first argument. Messages at
 * WARNING_LOG or above go to stderr prefixed with file, line and function;
 * everything else goes to stdout. On Windows the macro is plain printf.
 * (The level was previously tested twice in a row; one test is enough.)
 */
#ifdef LOG_LEVEL
#ifdef WIN32
#define LOG_INFO printf
#else
#ifdef _WIN64
#define LOG_INFO printf
#else
#define LOG_INFO(format, level, args...) {\
    if(level >= LOG_LEVEL) {\
        if (level >= WARNING_LOG) {\
            fprintf(stderr, "%s, %d (%s)", __FILE__, __LINE__, __FUNCTION__);\
            fprintf(stderr, format, level, ##args);\
            fflush(stderr);\
        } else {\
            fprintf(stdout, format, level, ##args);\
            fflush(stdout);\
        }\
    }\
}
#endif
#endif
#endif

#ifndef BLANK_LINE
#define BLANK_LINE " "
#endif

/* Print the current file and line as an error marker. */
#define ERR printf("ERR --- %s:%d\n", __FILE__, __LINE__);

/* Run statement a and print how many seconds it took. */
#define TIME_INFORMATION(a) {\
    time_t startt, endt;\
    time(&startt);\
    a;\
    time(&endt);\
    printf("%u seconds consumed.%s\r", (unsigned int)(endt - startt), BLANK_LINE);\
    fflush(stdout);\
}

/* Run statement a and print msg together with the seconds it took. */
#define TIME_INFO(a, msg) {\
    time_t startt, endt;\
    time(&startt);\
    a;\
    time(&endt);\
    printf("%s in %u seconds.%s\r", msg, (unsigned int)(endt - startt), BLANK_LINE);\
    fflush(stdout);\
}

/* Report message and wait for ENTER when expression is false. */
#define ASSERT_TRUE(expression, message)\
do {\
    if (!(expression))\
    { \
        printf("ERROR: %s. Press ENTER to continue", message);\
        fflush(stdout);\
        char crap[256];\
        cin.getline(crap, 256);\
    }\
} while(0)

/*
 * Report both values and wait for ENTER when they differ.
 * Fix: the stream expression contained "<< << endl", which does not compile
 * once the macro is expanded.
 */
#define ASSERT_EQUAL(expectation, real_value, message)\
do {\
    if (expectation != real_value)\
    { \
        cout << '\n' << expectation << " is not equal to " << real_value << endl;\
        printf("ERROR: %s. Press ENTER to continue", message);\
        fflush(stdout);\
        char crap[256];\
        cin.getline(crap, 256);\
    }\
} while(0)

/*
 * Print the memory figures read from /proc/self/statm, labelled with b.
 * Fixes: a comma was missing after CONFIG_LOG in the LOG_INFO call, fclose()
 * was called on a NULL stream when the file could not be opened, and the
 * fields were printed even when fscanf had not filled them.
 */
#ifndef WIN32
#define MEMORY_INFO(b)\
do {\
    FILE *xfp = fopen("/proc/self/statm", "r");\
    unsigned int l1, l2, l3, l4, l5, l6, l7;\
    int PageSize = getpagesize();\
    if(xfp == NULL) {\
        LOG_INFO("Info %d: We cannot open /proc/self/statm file (%d:%s)\n", CONFIG_LOG,\
                 errno, strerror(errno));\
    } else{\
        if (fscanf(xfp, "%u %u %u %u %u %u %u", &l1, &l2, &l3, &l4, &l5, &l6, &l7) == 7) {\
            printf("%s: %u MB memory (%u MB in stack) consumed.\n",\
                   (b), /*(unsigned int)(end - start),*/ \
                   (unsigned int)(l2*PageSize/1024/1024), (unsigned int)(l6*PageSize/1024/1024));\
        }\
        fclose(xfp);\
    }\
} while(0)
#endif

/* Prompt and block until one character can be read from stdin. */
#define STRIKE_KEY2CONTINUE\
do {\
    cout << "Strike a key to continue" << endl;\
    char c;\
    if(scanf("%c", &c) == 0) {\
        break;\
    }\
} while(0)

/**
 * fwrite wrapper: write c items of b bytes from a to fp.
 * @return true only when all c items were written.
 */
inline bool myFwrite(const void* a, size_t b, size_t c, FILE* fp)
{
    size_t stWriteByte = fwrite(a, b, c, fp);
    if (stWriteByte != c) {
        return (false);
    } else {
        return (true);
    }
}

/**
 * fread wrapper: read c items of b bytes from fp into a.
 * @return true only when all c items were read.
 */
inline bool myFread(void* a, size_t b, size_t c, FILE* fp)
{
    size_t stReadByte = fread(a, b, c, fp);
    if (stReadByte != c) {
        return (false);
    } else {
        return (true);
    }
}

/**
 * Read the next whitespace-delimited token from fp and compare it with
 * expCheckSum.
 * @return true when a token was read and it equals expCheckSum; otherwise the
 *         ERR marker is printed and false is returned.
 */
inline bool assertFile(FILE* fp, const char* expCheckSum)
{
    char CheckSum[FILENAME_MAX];
    // NOTE(review): "%s" has no field width, so a token longer than
    // FILENAME_MAX - 1 characters would overrun CheckSum.
    if (fscanf(fp, "\n%s\n", CheckSum) <= 0) {
        ERR;
        return(false);
    }
    if (strcmp(CheckSum, expCheckSum) != 0) {
        ERR;
        return(false);
    }
    return(true);
}

/**
 * Copy str into caBuf, truncating to iBufSize - 1 characters and always
 * terminating the result when iBufSize >= 1.
 *
 * @param caBuf    destination buffer; NULL prints the ERR marker and returns NULL
 * @param str      NUL-terminated source string
 * @param iBufSize capacity of caBuf in bytes
 * @return caBuf (NULL when caBuf is NULL)
 *
 * Fix: with iBufSize <= 0 the old code passed a negative count to strncpy,
 * which converts to a huge size_t and writes far past the buffer. A buffer
 * without capacity is now left untouched.
 */
inline char* myStrCpy(char* caBuf, const char* str, int iBufSize)
{
    if (caBuf == NULL) {
        ERR;
        return(NULL);
    }
    if (iBufSize <= 0) {
        // No room for even the terminator: nothing can be written safely.
        return(caBuf);
    }
    int iBufSizeMinus1 = iBufSize - 1;
    char* returnV = strncpy(caBuf, str, (size_t)iBufSizeMinus1);
    caBuf[iBufSizeMinus1] = '\0';
    return(returnV);
}

const int MAX_LINE = 2048;
const unsigned int MAXTOLERATSUBMIS = 20;
const unsigned int MIN_READ_LENGTH = 13;
const unsigned int MAX_READ_LENGTH = 64;
const unsigned int MAX_LONG_READ_LENGTH = 128;
# Build file for the PerM read mapper: compiles every listed source file into
# the single statically linked executable "perm".

CFLAGS = -ggdb -Wall -fopenmp -static
# CC already carries $(CFLAGS), so the link rule does not pass them again.
CC = g++ -O2 $(CFLAGS)
TARGETS = perm
LIBS = -lm -lstdc++
PER_M = AlignmentsQ.cpp Filename.cpp GenomeNTdata.cpp ReadInBits.cpp PerM.cpp chromosomeNTdata.cpp \
	bitsOperationUtil.cpp FileOutputBuffer.cpp HashIndexT.cpp ReadInBitsSet.cpp SeedPattern.cpp \
	boolFlagArray.cpp GenomeInBits.cpp Index_Table.cpp ReadsMapping.cpp PairedReadsMapping.cpp ShortReadUtil.cpp \
	chdir.cpp Genome_Index.cpp MismatchScores.cpp stdafx.cpp Flags.cpp ParseReadsOpts.cpp \
	ColorSpaceRead.cpp Genome_Index_Table.cpp ParameterList.cpp ReadsMappingStats.cpp \
	FileInputBuffer.cpp Genome_Index_TableQ.cpp ReadsQualScores.cpp ChrIndex2GeneName.cpp \
	ReadsFileParser.cpp PairedReadsSet.cpp MappingResult.cpp refInBinFile.cpp LongReadsSet.cpp

# These targets name actions, not files; declaring them phony keeps a stray
# file called "clean" or "all" from silently disabling the rule.
.PHONY: all install tar clean

all: $(TARGETS)
# -ctags *.[ch]

install: all
	strip $(TARGETS)
	cp $(TARGETS) /usr/local/sbin
	cp *.1 /usr/local/man/man1

# The tree is cleaned first, so every build is a full rebuild.
# $(MAKE) instead of a literal "make" keeps command-line flags and the
# jobserver working in the recursive call.
perm: $(PER_M)
	$(MAKE) clean
	$(CC) -o $@ $(LIB_PATH) $(PER_M) $(LIBS)
#	$(CC) -o $@ $(LIB_PATH) *.o $(LIBS)

tar: clean
	tar cvfz $(TARGETS).tar.gz *.cpp *.h makefile

clean:
	-rm -f *.o *.exe cut out $(TARGETS) $(TARGETS).tar.gz