pax_global_header00006660000000000000000000000064130246452370014520gustar00rootroot0000000000000052 comment=8d086d747e51a409f25481524e92e99750b14d59 blasr-smrtanalysis-4.0.0/000077500000000000000000000000001302464523700153535ustar00rootroot00000000000000blasr-smrtanalysis-4.0.0/.gitignore000066400000000000000000000003571302464523700173500ustar00rootroot00000000000000defines.mk *.d *.o *.err blasr /utils/bam2bax/build/ /utils/bam2bax/bin/ /utils/bam2bax/tests/bin/ /utils/bam2bax/tests/src/TestData.h /utils/bax2bam/build/ /utils/bax2bam/bin/ /utils/bax2bam/tests/bin/ /utils/bax2bam/tests/src/TestData.h blasr-smrtanalysis-4.0.0/.gitmodules000066400000000000000000000001401302464523700175230ustar00rootroot00000000000000[submodule "libcpp"] path = libcpp url = git://github.com/PacificBiosciences/blasr_libcpp.git blasr-smrtanalysis-4.0.0/Blasr.cpp000066400000000000000000001566141302464523700171370ustar00rootroot00000000000000// Author: Mark Chaisson #include "iblasr/BlasrMiscs.hpp" #include "iblasr/BlasrUtils.hpp" #include "iblasr/BlasrAlign.hpp" #include "iblasr/RegisterBlasrOptions.h" //#define USE_GOOGLE_PROFILER #ifdef USE_GOOGLE_PROFILER #include "gperftools/profiler.h" #endif using namespace std; // Declare global structures that are shared between threads. MappingSemaphores semaphores; ostream *outFilePtr = NULL; #ifdef USE_PBBAM PacBio::BAM::IRecordWriter * bamWriterPtr = NULL; // use IRecordWriter for both SAM ands BAM #endif HDFRegionTableReader *regionTableReader = NULL; ReaderAgglomerate *reader = NULL; // Add comment to version history for each version change ! 
// // Version history // // 5.0 - a new major version number // 5.1 - transiotion to POSIX notation - double sashes before multi-character flags // 5.2 - --sam no longer supported // 5.3 - --sam supported via pbbam/IRecordWriter // const string GetMajorVersion() { return "5.3"; } // version format is 3 numbers sparated by dots : Version.Subversion.SHA1 const string GetVersion(void) { string gitVersionString(SHA1_7); // gitVersionString is first 7 characters of SHA1 string version = GetMajorVersion(); // if (gitVersionString.size() == 7) { version.append("."); version.append(gitVersionString); // } return version; } /// Checks whether a smrtRead meets the following criteria /// (1) is within the search holeNumber range specified by params.holeNumberRanges. /// (2) its length greater than params.maxReadlength /// (3) its read score (rq) is greater than params.minRawSubreadScore /// (4) its qual is greater than params.minAvgQual. /// Change stop to false if /// HoleNumber of the smrtRead is greater than the search holeNumber range. bool IsGoodRead(const SMRTSequence & smrtRead, MappingParameters & params, bool & stop) { if (params.holeNumberRangesStr.size() > 0 and not params.holeNumberRanges.contains(smrtRead.HoleNumber())) { // Stop processing once the specified zmw hole number is reached. // Eventually this will change to just seek to hole number, and // just align one read anyway. if (smrtRead.HoleNumber() > params.holeNumberRanges.max()){ stop = true; return false; } return false; } // // Discard reads that are too small, or not labeled as having any // useable/good sequence. 
// if (smrtRead.highQualityRegionScore < params.minRawSubreadScore or (params.maxReadLength != 0 and smrtRead.length > UInt(params.maxReadLength)) or (int(smrtRead.length) < params.minReadLength)) { return false; } if (smrtRead.qual.Empty() != false and smrtRead.GetAverageQuality() < params.minAvgQual) { return false; } return true; } // Make primary intervals (which are intervals of subreads to align // in the first round) from none BAM file using region table. void MakePrimaryIntervals(RegionTable * regionTablePtr, SMRTSequence & smrtRead, vector & subreadIntervals, vector & subreadDirections, int & bestSubreadIndex, MappingParameters & params) { vector adapterIntervals; // // Determine endpoints of this subread in the main read. // if (params.useRegionTable == false) { // // When there is no region table, the subread is the entire // read. // ReadInterval wholeRead(0, smrtRead.length); // The set of subread intervals is just the entire read. subreadIntervals.push_back(wholeRead); } else { // // Grab the subread & adapter intervals from the entire region table to // iterate over. // assert(regionTablePtr->HasHoleNumber(smrtRead.HoleNumber())); subreadIntervals = (*regionTablePtr)[smrtRead.HoleNumber()].SubreadIntervals(smrtRead.length, params.byAdapter); adapterIntervals = (*regionTablePtr)[smrtRead.HoleNumber()].AdapterIntervals(); } // The assumption is that neighboring subreads must have the opposite // directions. So create directions for subread intervals with // interleaved 0s and 1s. CreateDirections(subreadDirections, subreadIntervals.size()); // // Trim the boundaries of subread intervals so that only high quality // regions are included in the intervals, not N's. Remove intervals // and their corresponding dirctions, if they are shorter than the // user specified minimum read length or do not intersect with hq // region at all. Finally, return index of the (left-most) longest // subread in the updated vector. 
// int longestSubreadIndex = GetHighQualitySubreadsIntervals( subreadIntervals, // a vector of subread intervals. subreadDirections, // a vector of subread directions. smrtRead.lowQualityPrefix, // hq region start pos. smrtRead.length - smrtRead.lowQualitySuffix, // hq end pos. params.minSubreadLength); // minimum read length. bestSubreadIndex = longestSubreadIndex; if (params.concordantTemplate == "longestsubread") { // Use the (left-most) longest full-pass subread as // template for concordant mapping int longestFullSubreadIndex = GetLongestFullSubreadIndex( subreadIntervals, adapterIntervals); if (longestFullSubreadIndex >= 0) { bestSubreadIndex = longestFullSubreadIndex; } } else if (params.concordantTemplate == "typicalsubread") { // Use the 'typical' full-pass subread as template for // concordant mapping. int typicalFullSubreadIndex = GetTypicalFullSubreadIndex( subreadIntervals, adapterIntervals); if (typicalFullSubreadIndex >= 0) { bestSubreadIndex = typicalFullSubreadIndex; } } else if (params.concordantTemplate == "mediansubread") { // Use the 'median-length' full-pass subread as template for // concordant mapping. int medianFullSubreadIndex = GetMedianLengthFullSubreadIndex( subreadIntervals, adapterIntervals); if (medianFullSubreadIndex >= 0) { bestSubreadIndex = medianFullSubreadIndex; } } else { assert(false); } } // Make primary intervals (which are intervals of subreads to align // in the first round) for BAM file, -concordant, void MakePrimaryIntervals(vector & subreads, vector & subreadIntervals, vector & subreadDirections, int & bestSubreadIndex) { MakeSubreadIntervals(subreads, subreadIntervals); CreateDirections(subreadDirections, subreadIntervals.size()); bestSubreadIndex = GetIndexOfConcordantTemplate(subreadIntervals); } /// Scan the next read from input. This may either be a CCS read, unrolled (Polymerase) read, /// or regular read (though this may be aligned in whole, or by /// subread). 
/// \params[in] reader: FASTA/FASTQ/BAX.H5/CCS.H5/BAM file reader /// \params[in] regionTablePtr: RGN.H5 region table pointer. /// \params[in] params: mapping parameters. /// \params[out] smrtRead: to save smrt sequence. /// \params[out] ccsRead: to save ccs sequence. /// \params[out] readIsCCS: read is CCSSequence. /// \params[out] readGroupId: associated read group id /// \params[out] associatedRandInt: random int associated with this zmw, /// required to for generating deterministic random /// alignments regardless of nproc. /// \params[out] stop: whether or not stop mapping remaining reads. /// \returns whether or not to skip mapping reads of this zmw. bool FetchReads(ReaderAgglomerate * reader, RegionTable * regionTablePtr, SMRTSequence & smrtRead, CCSSequence & ccsRead, vector & subreads, MappingParameters & params, bool & readIsCCS, std::string & readGroupId, int & associatedRandInt, bool & stop) { if ((reader->GetFileType() != FileType::PBBAM and reader->GetFileType() != FileType::PBDATASET) or not params.concordant) { if (reader->GetFileType() == FileType::HDFCCS || reader->GetFileType() == FileType::HDFCCSONLY) { if (GetNextReadThroughSemaphore(*reader, params, ccsRead, readGroupId, associatedRandInt, semaphores) == false) { stop = true; return false; } else { readIsCCS = true; smrtRead.Copy(ccsRead); ccsRead.SetQVScale(params.qvScaleType); smrtRead.SetQVScale(params.qvScaleType); } assert(ccsRead.zmwData.holeNumber == smrtRead.zmwData.holeNumber and ccsRead.zmwData.holeNumber == ccsRead.unrolledRead.zmwData.holeNumber); } else { if (GetNextReadThroughSemaphore(*reader, params, smrtRead, readGroupId, associatedRandInt, semaphores) == false) { stop = true; return false; } else { smrtRead.SetQVScale(params.qvScaleType); } } // // Only normal (non-CCS) reads should be masked. Since CCS reads store the raw read, that is masked. 
// bool readHasGoodRegion = true; if (params.useRegionTable and params.useHQRegionTable) { if (readIsCCS) { readHasGoodRegion = MaskRead(ccsRead.unrolledRead, ccsRead.unrolledRead.zmwData, *regionTablePtr); } else { readHasGoodRegion = MaskRead(smrtRead, smrtRead.zmwData, *regionTablePtr); } // // Store the high quality start and end of this read for masking purposes when printing. // int hqStart, hqEnd; int score; LookupHQRegion(smrtRead.zmwData.holeNumber, *regionTablePtr, hqStart, hqEnd, score); smrtRead.lowQualityPrefix = hqStart; smrtRead.lowQualitySuffix = smrtRead.length - hqEnd; smrtRead.highQualityRegionScore = score; } else { smrtRead.lowQualityPrefix = 0; smrtRead.lowQualitySuffix = 0; } if (not IsGoodRead(smrtRead, params, stop) or stop) return false; return readHasGoodRegion; } else { subreads.clear(); vector reads; if (GetNextReadThroughSemaphore(*reader, params, reads, readGroupId, associatedRandInt, semaphores) == false) { stop = true; return false; } for (const SMRTSequence & smrtRead: reads) { if (IsGoodRead(smrtRead, params, stop)) { subreads.push_back(smrtRead); } } if (subreads.size() != 0) { smrtRead.MadeFromSubreadsAsPolymerase(subreads); return true; } else { return false; } } } void MapReadsNonCCS(MappingData *mapData, MappingBuffers & mappingBuffers, SMRTSequence & smrtRead, SMRTSequence & smrtReadRC, vector & subreads, MappingParameters & params, const int & associatedRandInt, ReadAlignments & allReadAlignments, ofstream & threadOut) { DNASuffixArray sarray; TupleCountTable ct; SequenceIndexDatabase seqdb; T_GenomeSequence genome; BWT *bwtPtr; mapData->ShallowCopySuffixArray(sarray); mapData->ShallowCopyReferenceSequence(genome); mapData->ShallowCopySequenceIndexDatabase(seqdb); mapData->ShallowCopyTupleCountTable(ct); bwtPtr = mapData->bwtPtr; SeqBoundaryFtr seqBoundary(&seqdb); vector subreadIntervals; vector subreadDirections; int bestSubreadIndex; if ((mapData->reader->GetFileType() != FileType::PBBAM and 
mapData->reader->GetFileType() != FileType::PBDATASET) or not params.concordant) { MakePrimaryIntervals(mapData->regionTablePtr, smrtRead, subreadIntervals, subreadDirections, bestSubreadIndex, params); } else { MakePrimaryIntervals(subreads, subreadIntervals, subreadDirections, bestSubreadIndex); } // Flop all directions if direction of the longest subread is 1. if (bestSubreadIndex >= 0 and bestSubreadIndex < int(subreadDirections.size()) and subreadDirections[bestSubreadIndex] == 1) { UpdateDirections(subreadDirections, true); } int startIndex = 0; int endIndex = subreadIntervals.size(); if (params.concordant) { // Only the longest subread will be aligned in the first round. // VR , change the comment startIndex = max(startIndex, bestSubreadIndex); endIndex = min(endIndex, bestSubreadIndex + 1); if (params.verbosity >= 1) { cout << "Concordant template subread index: " << bestSubreadIndex << ", " << smrtRead.HoleNumber() << "/" << subreadIntervals[bestSubreadIndex] << endl; } } // // Make room for alignments. // allReadAlignments.Resize(subreadIntervals.size()); allReadAlignments.alignMode = Subread; for (int intvIndex = startIndex; intvIndex < endIndex; intvIndex++) { SMRTSequence subreadSequence, subreadSequenceRC; MakeSubreadOfInterval(subreadSequence, smrtRead, subreadIntervals[intvIndex], params); MakeSubreadRC(subreadSequenceRC, subreadSequence, smrtRead); // // Store the sequence that is being mapped in case no hits are // found, and missing sequences are printed. // allReadAlignments.SetSequence(intvIndex, subreadSequence); vector alignmentPtrs; mapData->metrics.numReads++; assert(subreadSequence.zmwData.holeNumber == smrtRead.zmwData.holeNumber); // // Try default and fast parameters to map the read. 
// MapRead(subreadSequence, subreadSequenceRC, genome, // possibly multi fasta file read into one sequence sarray, *bwtPtr, // The suffix array, and the bwt-fm index structures seqBoundary, // Boundaries of contigs in the // genome, alignments do not span // the ends of boundaries. ct, // Count table to use word frequencies in the genome to weight matches. seqdb, // Information about the names of // chromosomes in the genome, and // where their sequences are in the genome. params, // A huge list of parameters for // mapping, only compile/command // line values set. mapData->metrics, // Keep track of time/ hit counts, // etc.. Not fully developed, but // should be. alignmentPtrs, // Where the results are stored. mappingBuffers, // A class of buffers for structurs // like dyanmic programming // matrices, match lists, etc., that are not // reallocated between calls to // MapRead. They are cleared though. mapData, // Some values that are shared // across threads. semaphores); // // No alignments were found, sometimes parameters are // specified to try really hard again to find an alignment. // This sets some parameters that use a more sensitive search // at the cost of time. // if ((alignmentPtrs.size() == 0 or alignmentPtrs[0]->pctSimilarity < 80) and params.doSensitiveSearch) { MappingParameters sensitiveParams = params; sensitiveParams.SetForSensitivity(); MapRead(subreadSequence, subreadSequenceRC, genome, sarray, *bwtPtr, seqBoundary, ct, seqdb, sensitiveParams, mapData->metrics, alignmentPtrs, mappingBuffers, mapData, semaphores); } // // Store the mapping quality values. // if (alignmentPtrs.size() > 0 and alignmentPtrs[0]->score < params.maxScore and params.storeMapQV) { StoreMapQVs(subreadSequence, alignmentPtrs, params); } // // Select alignments for this subread. 
// vector selectedAlignmentPtrs = SelectAlignmentsToPrint(alignmentPtrs, params, associatedRandInt); allReadAlignments.AddAlignmentsForSeq(intvIndex, selectedAlignmentPtrs); // // Move reference from subreadSequence, which will be freed at // the end of this loop to the smrtRead, which exists for the // duration of aligning all subread of the smrtRead. // for (size_t a = 0; a < alignmentPtrs.size(); a++) { if (alignmentPtrs[a]->qStrand == 0) { alignmentPtrs[a]->qAlignedSeq.ReferenceSubstring(smrtRead, alignmentPtrs[a]->qAlignedSeq.seq - subreadSequence.seq, alignmentPtrs[a]->qAlignedSeqLength); } else { alignmentPtrs[a]->qAlignedSeq.ReferenceSubstring(smrtReadRC, alignmentPtrs[a]->qAlignedSeq.seq - subreadSequenceRC.seq, alignmentPtrs[a]->qAlignedSeqLength); } } // Fix for memory leakage bug due to undeleted Alignment Candidate objectts which wasn't selected // for printing // delete all AC which are in complement of SelectedAlignmemntPtrs vector // namely (SelectedAlignmentPtrs/alignmentPtrs) for (size_t ii = 0; ii < alignmentPtrs.size(); ii++) { int found =0; for (size_t jj = 0; jj < selectedAlignmentPtrs.size(); jj++) { if (alignmentPtrs[ii] == selectedAlignmentPtrs[jj] ) { found = 1; break; } } if (found == 0) delete alignmentPtrs[ii]; } subreadSequence.Free(); subreadSequenceRC.Free(); } // End of looping over subread intervals within [startIndex, endIndex). 
if (params.verbosity >= 3) allReadAlignments.Print(threadOut); // If not concordant , all done if (params.concordant) { allReadAlignments.read = smrtRead; allReadAlignments.alignMode = ZmwSubreads; if (startIndex >= 0 && startIndex < int(allReadAlignments.subreadAlignments.size())) { vector selectedAlignmentPtrs = allReadAlignments.CopySubreadAlignments(startIndex); for(int alignmentIndex = 0; alignmentIndex < int(selectedAlignmentPtrs.size()); alignmentIndex++) { FlankTAlignedSeq(selectedAlignmentPtrs[alignmentIndex], seqdb, genome, params.flankSize); } for (int intvIndex = 0; intvIndex < int(subreadIntervals.size()); intvIndex++) { if (intvIndex == startIndex) continue; int passDirection = subreadDirections[intvIndex]; int passStartBase = subreadIntervals[intvIndex].start; int passNumBases = subreadIntervals[intvIndex].end - passStartBase; if (passNumBases <= params.minReadLength) {continue;} mapData->metrics.numReads++; SMRTSequence subread; subread.ReferenceSubstring(smrtRead, passStartBase, passNumBases); subread.CopyTitle(smrtRead.title); // The unrolled alignment should be relative to the entire read. 
if (params.clipping == SAMOutput::subread) { SMRTSequence maskedSubread; MakeSubreadOfInterval(maskedSubread, smrtRead, subreadIntervals[intvIndex], params); allReadAlignments.SetSequence(intvIndex, maskedSubread); maskedSubread.Free(); } else { allReadAlignments.SetSequence(intvIndex, smrtRead); } for (size_t alnIndex = 0; alnIndex < selectedAlignmentPtrs.size(); alnIndex++) { T_AlignmentCandidate * alignment = selectedAlignmentPtrs[alnIndex]; if (alignment->score > params.maxScore) break; AlignSubreadToAlignmentTarget(allReadAlignments, subread, smrtRead, alignment, passDirection, subreadIntervals[intvIndex], intvIndex, params, mappingBuffers, threadOut); if (params.concordantAlignBothDirections) { AlignSubreadToAlignmentTarget(allReadAlignments, subread, smrtRead, alignment, ((passDirection==0)?1:0), subreadIntervals[intvIndex], intvIndex, params, mappingBuffers, threadOut); } } // End of aligning this subread to each selected alignment. subread.Free(); } // End of aligning each subread to where the template subread aligned to. 
for(size_t alignmentIndex = 0; alignmentIndex < selectedAlignmentPtrs.size(); alignmentIndex++) { if (selectedAlignmentPtrs[alignmentIndex]) delete selectedAlignmentPtrs[alignmentIndex]; } } // End of if startIndex >= 0 and < subreadAlignments.size() } // End of if params.concordant } // // invoked for mapping entire ZMW as a single entity // either for CCS reads : all subreads of a ZMW collapsed/merged into a single read // or Polymerase reads : all subreads of a ZMW stitched into a single read // void MapReadsCCS(MappingData *mapData, MappingBuffers & mappingBuffers, SMRTSequence & smrtRead, SMRTSequence & smrtReadRC, CCSSequence & ccsRead, const bool readIsCCS, MappingParameters & params, const int & associatedRandInt, ReadAlignments & allReadAlignments, ofstream & threadOut) { DNASuffixArray sarray; TupleCountTable ct; SequenceIndexDatabase seqdb; T_GenomeSequence genome; BWT *bwtPtr; mapData->ShallowCopySuffixArray(sarray); mapData->ShallowCopyReferenceSequence(genome); mapData->ShallowCopySequenceIndexDatabase(seqdb); mapData->ShallowCopyTupleCountTable(ct); bwtPtr = mapData->bwtPtr; SeqBoundaryFtr seqBoundary(&seqdb); // // The read must be mapped as a whole, even if it contains subreads. // vector alignmentPtrs; mapData->metrics.numReads++; smrtRead.SubreadStart(0).SubreadEnd(smrtRead.length); smrtReadRC.SubreadStart(0).SubreadEnd(smrtRead.length); MapRead(smrtRead, smrtReadRC, genome, sarray, *bwtPtr, seqBoundary, ct, seqdb, params, mapData->metrics, alignmentPtrs, mappingBuffers, mapData, semaphores); // // Store the mapping quality values. // if (alignmentPtrs.size() > 0 and alignmentPtrs[0]->score < params.maxScore and params.storeMapQV) { StoreMapQVs(smrtRead, alignmentPtrs, params); } // // Select de novo ccs-reference alignments for subreads to align to. // vector selectedAlignmentPtrs = SelectAlignmentsToPrint(alignmentPtrs, params, associatedRandInt); // // Just one sequence is aligned. There is one primary hit, and // all other are secondary. 
// // // Here unrolled reads are aligned // if (readIsCCS == false or params.useCcsOnly) { // if -noSplitSubreads or -useccsdenovo. // // Record some information for proper SAM Annotation. // allReadAlignments.Resize(1); allReadAlignments.AddAlignmentsForSeq(0, selectedAlignmentPtrs); if (params.useCcsOnly) { allReadAlignments.alignMode = CCSDeNovo; } else { allReadAlignments.alignMode = Fullread; } allReadAlignments.SetSequence(0, smrtRead); } // // Here CCS reads are aligned // else if (readIsCCS) { // if -useccsall or -useccs // Flank alignment candidates to both ends. for(size_t alignmentIndex = 0; alignmentIndex < selectedAlignmentPtrs.size(); alignmentIndex++) { FlankTAlignedSeq(selectedAlignmentPtrs[alignmentIndex], seqdb, genome, params.flankSize); } // // Align the ccs subread to where the denovo sequence mapped (explode). // CCSIterator ccsIterator; FragmentCCSIterator fragmentCCSIterator; CCSIterator *subreadIterator; // // Choose a different iterator over subreads depending on the // alignment mode. When the mode is allpass, include the // framgents that are not necessarily full pass. // if (params.useAllSubreadsInCcs) { // // Use all subreads even if they are not full pass fragmentCCSIterator.Initialize(&ccsRead, mapData->regionTablePtr); subreadIterator = &fragmentCCSIterator; allReadAlignments.alignMode = CCSAllPass; } else { // Use only full pass reads. ccsIterator.Initialize(&ccsRead); subreadIterator = &ccsIterator; allReadAlignments.alignMode = CCSFullPass; } allReadAlignments.Resize(subreadIterator->GetNumPasses()); int passDirection, passStartBase, passNumBases; SMRTSequence subread; // // The read was previously set to the smrtRead, which was the // de novo ccs sequence. Since the alignments of exploded // reads are reported, the unrolled read should be used as the // reference when printing. // allReadAlignments.read = ccsRead.unrolledRead; subreadIterator->Reset(); int subreadIndex; // // Realign all subreads to selected reference locations. 
// for (subreadIndex = 0; subreadIndex < subreadIterator->GetNumPasses(); subreadIndex++) { int retval = subreadIterator->GetNext(passDirection, passStartBase, passNumBases); assert(retval == 1); if (passNumBases <= params.minReadLength) { continue; } ReadInterval subreadInterval(passStartBase, passStartBase + passNumBases); subread.ReferenceSubstring(ccsRead.unrolledRead, passStartBase, passNumBases-1); subread.CopyTitle(ccsRead.title); // The unrolled alignment should be relative to the entire read. allReadAlignments.SetSequence(subreadIndex, ccsRead.unrolledRead); // // Align this subread to all the positions that the de novo // sequence has aligned to. // for (size_t alignmentIndex = 0; alignmentIndex < selectedAlignmentPtrs.size(); alignmentIndex++) { T_AlignmentCandidate *alignment = selectedAlignmentPtrs[alignmentIndex]; if (alignment->score > params.maxScore) break; AlignSubreadToAlignmentTarget(allReadAlignments, subread, ccsRead.unrolledRead, alignment, passDirection, subreadInterval, subreadIndex, params, mappingBuffers, threadOut); } // End of aligning this subread to where the de novo ccs has aligned to. subread.Free(); } // End of alignining all subreads to where the de novo ccs has aligned to. } // End of if readIsCCS and !params.useCcsOnly // Fix for memory leakage due to undeleted Alignment Candidate objectts not selected // for printing // delete all AC which are in complement of SelectedAlignmemntPtrs vector // namely (SelectedAlignmentPtrs/alignmentPtrs) for (size_t ii = 0; ii < alignmentPtrs.size(); ii++) { int found =0; for (size_t jj = 0; jj < selectedAlignmentPtrs.size(); jj++) { if (alignmentPtrs[ii] == selectedAlignmentPtrs[jj] ) { found = 1; break; } } if (found == 0) delete alignmentPtrs[ii]; } } void MapReads(MappingData *mapData) { // // Step 1, initialize local pointers to map data // for programming shorthand. 
// MappingParameters params = mapData->params; DNASuffixArray sarray; TupleCountTable ct; SequenceIndexDatabase seqdb; T_GenomeSequence genome; mapData->ShallowCopySuffixArray(sarray); mapData->ShallowCopyReferenceSequence(genome); mapData->ShallowCopySequenceIndexDatabase(seqdb); mapData->ShallowCopyTupleCountTable(ct); SeqBoundaryFtr seqBoundary(&seqdb); int numAligned = 0; SMRTSequence smrtRead, smrtReadRC; SMRTSequence unrolledReadRC; CCSSequence ccsRead; // Print verbose logging to pid.threadid.log for each thread. ofstream threadOut; if (params.verbosity >= 3) { stringstream ss; ss << getpid() << "." << pthread_self(); string threadLogFileName = ss.str() + ".log"; threadOut.open(threadLogFileName.c_str(), ios::out|ios::app); } // // Reuse the following buffers during alignment. Since these keep // storage contiguous, hopefully this will decrease memory // fragmentation. // MappingBuffers mappingBuffers; while (true) { // Fetch reads from a zmw bool readIsCCS = false; AlignmentContext alignmentContext; // Associate each sequence to read in with a determined random int. int associatedRandInt = 0; bool stop = false; vector subreads; bool readsOK = FetchReads(mapData->reader, mapData->regionTablePtr, smrtRead, ccsRead, subreads, params, readIsCCS, alignmentContext.readGroupId, associatedRandInt, stop); if (stop) break; if (not readsOK) continue; if (params.verbosity > 1) { cout << "aligning read: " << endl; smrtRead.PrintSeq(cout); } smrtRead.MakeRC(smrtReadRC); // important // 1. CCS and unrolled mode are mutually exclusive // 2. Reverse Complement Read is generated fort CCS only // if (readIsCCS) { ccsRead.unrolledRead.MakeRC(unrolledReadRC); } // // When aligning subreads separately, iterate over each subread, and // print the alignments for these. 
// ReadAlignments allReadAlignments; allReadAlignments.read = smrtRead; // currently 3 ways of mapping // regular, CCS , and Polymerase (unrolled) // // for regular subreads MapReadsNonCCS // for mapping ZMW as a whole (CCS or Polymerase) MapReadsCCS // For the future , change the name of functions to be more desriptive // noSplitSubreads is in essense unrolled - Polymerase read mode // if (readIsCCS == false and params.mapSubreadsSeparately) { // (not readIsCCS and not -noSplitSubreads) MapReadsNonCCS(mapData, mappingBuffers, smrtRead, smrtReadRC, subreads, params, associatedRandInt, allReadAlignments, threadOut); } // End of if (readIsCCS == false and params.mapSubreadsSeparately). else { // if (readIsCCS or (not readIsCCS and -noSplitSubreads) ) MapReadsCCS(mapData, mappingBuffers, smrtRead, smrtReadRC, ccsRead, readIsCCS, params, associatedRandInt, allReadAlignments, threadOut); } // End of if not (readIsCCS == false and params.mapSubreadsSeparately) PrintAllReadAlignments(allReadAlignments, alignmentContext, *mapData->outFilePtr, *mapData->unalignedFilePtr, params, subreads, #ifdef USE_PBBAM bamWriterPtr, #endif semaphores); allReadAlignments.Clear(); smrtReadRC.Free(); smrtRead.Free(); if (readIsCCS) { ccsRead.Free(); unrolledReadRC.Free(); } numAligned++; if(numAligned % 100 == 0) { mappingBuffers.Reset(); } } // End of while (true). smrtRead.Free(); smrtReadRC.Free(); unrolledReadRC.Free(); ccsRead.Free(); if (params.nProc > 1) { #ifdef __APPLE__ sem_wait(semaphores.reader); sem_post(semaphores.reader); #else sem_wait(&semaphores.reader); sem_post(&semaphores.reader); #endif } if (params.nProc > 1) { pthread_exit(NULL); } threadOut.close(); } int main(int argc, char* argv[]) { // // Configure parameters for refining alignments. 
// MappingParameters params; CommandLineParser clp; clp.SetHelp(BlasrHelp(params)); clp.SetConciseHelp(BlasrConciseHelp()); clp.SetProgramSummary(BlasrSummaryHelp()); clp.SetProgramName("blasr"); clp.SetVersion(GetVersion()); // Register Blasr options. RegisterBlasrOptions(clp, params); // Parse command line args. clp.ParseCommandLine(argc, argv, params.readsFileNames); string commandLine; clp.CommandLineToString(argc, argv, commandLine); if (params.printVerboseHelp) { cout << BlasrHelp(params) << endl; exit(0); // Not a failure. } if (params.printDiscussion) { cout << BlasrDiscussion(); exit(0); // Not a failure. } if (argc < 3) { cout << BlasrConciseHelp(); exit(1); // A failure. } int a, b; for (a = 0; a < 5; a++ ) { for (b = 0; b < 5; b++ ){ if (a != b) { SMRTDistanceMatrix[a][b] += params.mismatch; } else { SMRTDistanceMatrix[a][b] += params.match; } } } if (params.scoreMatrixString != "") { if (StringToScoreMatrix(params.scoreMatrixString, SMRTDistanceMatrix) == false) { cout << "ERROR. The string " << endl << params.scoreMatrixString << endl << "is not a valid format. It should be a quoted, space separated string of " << endl << "integer values. The matrix: " << endl << " A C G T N" << endl << " A 1 2 3 4 5" << endl << " C 6 7 8 9 10" << endl << " G 11 12 13 14 15" << endl << " T 16 17 18 19 20" << endl << " N 21 22 23 24 25" << endl << " should be specified as \"1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25\"" << endl; exit(1); } } cerr << "[INFO] " << GetTimestamp() << " [blasr] started." << endl; params.MakeSane(); // // The random number generator is used for subsampling for debugging // and testing consensus and selecting hits when hit policy is random // or randombest. // if (params.useRandomSeed == true) { InitializeRandomGenerator(params.randomSeed); } else { InitializeRandomGeneratorWithTime(); } // // Various aspects of timing are stored here. However this isn't // quite finished. 
// MappingMetrics metrics; ofstream fullMetricsFile; if (params.fullMetricsFileName != "") { CrucialOpen(params.fullMetricsFileName, fullMetricsFile, std::ios::out); metrics.SetStoreList(); } // // If reading a separate region table, there is a 1-1 correspondence // between region table and bas file. // if (params.readSeparateRegionTable) { if (FileOfFileNames::IsFOFN(params.regionTableFileName)) { FileOfFileNames::FOFNToList(params.regionTableFileName, params.regionTableFileNames); } else { params.regionTableFileNames.push_back(params.regionTableFileName); } } if (params.regionTableFileNames.size() != 0 and params.regionTableFileNames.size() != params.queryFileNames.size()) { cout << "Error, there are not the same number of region table files as input files." << endl; exit(1); } // If reading a separate ccs fofn, there is a 1-1 corresponence // between ccs fofn and base file. if (params.readSeparateCcsFofn) { if (FileOfFileNames::IsFOFN(params.ccsFofnFileName)) { FileOfFileNames::FOFNToList(params.ccsFofnFileName, params.ccsFofnFileNames); } else { params.ccsFofnFileNames.push_back(params.ccsFofnFileName); } } if (params.ccsFofnFileNames.size() != 0 and params.ccsFofnFileNames.size() != params.queryFileNames.size()) { cout << "Error, there are not the same number of ccs files as input files." << endl; exit(1); } SequenceIndexDatabase seqdb; SeqBoundaryFtr seqBoundary(&seqdb); // // Initialize the sequence index database if it used. If it is not // specified, it is initialized by default when reading a multiFASTA // file. // if (params.useSeqDB) { ifstream seqdbin; CrucialOpen(params.seqDBName, seqdbin); seqdb.ReadDatabase(seqdbin); } // // Make sure the reads file exists and can be opened before // trying to read any of the larger data structures. // FASTASequence fastaGenome; T_Sequence genome; FASTAReader genomeReader; // // The genome is in normal FASTA, or condensed (lossy homopolymer->unipolymer) // format. Both may be read in using a FASTA reader. 
// if (!genomeReader.Init(params.genomeFileName)) { cout << "Could not open genome file " << params.genomeFileName << endl; exit(1); } if (params.printSAM or params.printBAM) { genomeReader.computeMD5 = true; } // // If no sequence title database is supplied, initialize one when // reading in the reference, and consider a seqdb to be present. // if (!params.useSeqDB) { genomeReader.ReadAllSequencesIntoOne(fastaGenome, &seqdb); params.useSeqDB = true; } else { genomeReader.ReadAllSequencesIntoOne(fastaGenome); } genomeReader.Close(); // // The genome may have extra spaces in the fasta name. Get rid of those. // for (int t = 0; t < fastaGenome.titleLength; t++ ){ if (fastaGenome.title[t] == ' ') { fastaGenome.titleLength = t; fastaGenome.title[t] = '\0'; break; } } genome.seq = fastaGenome.seq; genome.length = fastaGenome.length; genome.title = fastaGenome.title; genome.deleteOnExit = false; genome.titleLength = fastaGenome.titleLength; genome.ToUpper(); DNASuffixArray sarray; TupleCountTable ct; ofstream outFile; outFile.exceptions(ostream::failbit); ofstream unalignedOutFile; BWT bwt; if (params.useBwt) { if (bwt.Read(params.bwtFileName) == 0) { cout << "ERROR! Could not read the BWT file. " << params.bwtFileName << endl; exit(1); } } else { if (!params.useSuffixArray) { // // There was no explicit specification of a suffix // array on the command line, so build it on the fly here. 
// genome.ToThreeBit(); vector alphabet; sarray.InitThreeBitDNAAlphabet(alphabet); sarray.LarssonBuildSuffixArray(genome.seq, genome.length, alphabet); if (params.minMatchLength > 0) { if (params.anchorParameters.useLookupTable == true) { if (params.lookupTableLength > params.minMatchLength) { params.lookupTableLength = params.minMatchLength; } sarray.BuildLookupTable(genome.seq, genome.length, params.lookupTableLength); } } genome.ConvertThreeBitToAscii(); params.useSuffixArray = 1; } else if (params.useSuffixArray) { if (sarray.Read(params.suffixArrayFileName)) { if (params.minMatchLength != 0) { params.listTupleSize = min(8, params.minMatchLength); } else { params.listTupleSize = sarray.lookupPrefixLength; } if (params.minMatchLength < int(sarray.lookupPrefixLength)) { cerr << "WARNING. The value of -minMatch " << params.minMatchLength << " is less than the smallest searched length of " << sarray.lookupPrefixLength << ". Setting -minMatch to " << sarray.lookupPrefixLength << "." << endl; params.minMatchLength = sarray.lookupPrefixLength; } } else { cout << "ERROR. " << params.suffixArrayFileName << " is not a valid suffix array. " << endl << " Make sure it is generated with the latest version of sawriter." << endl; exit(1); } } } if (params.minMatchLength < int(sarray.lookupPrefixLength)) { cerr << "WARNING. The value of -minMatch " << params.minMatchLength << " is less than the smallest searched length of " << sarray.lookupPrefixLength << ". Setting -minMatch to " << sarray.lookupPrefixLength << "." << endl; params.minMatchLength = sarray.lookupPrefixLength; } // // It is required to have a tuple count table // for estimating the background frequencies // for word matching. // If one is specified on the command line, simply read // it in. If not, this is operating under the mode // that everything is computed from scratch. 
// TupleMetrics saLookupTupleMetrics; if (params.useCountTable) { ifstream ctIn; CrucialOpen(params.countTableName, ctIn, std::ios::in | std::ios::binary); ct.Read(ctIn); saLookupTupleMetrics = ct.tm; } else { saLookupTupleMetrics.Initialize(params.lookupTableLength); ct.InitCountTable(saLookupTupleMetrics); ct.AddSequenceTupleCountsLR(genome); } TitleTable titleTable; if (params.useTitleTable) { ofstream titleTableOut; CrucialOpen(params.titleTableName, titleTableOut); // // When using a sequence index database, the title table is simply copied // from the sequencedb. // if (params.useSeqDB) { titleTable.Copy(seqdb.names, seqdb.nSeqPos-1); titleTable.ResetTableToIntegers(seqdb.names, seqdb.nameLengths, seqdb.nSeqPos-1); } else { // // No seqdb, so there is just one sequence. Still the user specified a title // table, so just the first sequence in the fasta file should be used. // titleTable.Copy(&fastaGenome.title, 1); titleTable.ResetTableToIntegers(&genome.title, &genome.titleLength, 1); fastaGenome.titleLength = strlen(genome.title); } titleTable.Write(titleTableOut); } else { if (params.useSeqDB) { // // When using a sequence index database, but not the titleTable, // it is necessary to truncate the titles at the first space to // be compatible with the way other alignment programs interpret // fasta titles. When printing the title table, there is all // sorts of extra storage space, so the full line is stored. 
// seqdb.SequenceTitleLinesToNames(); } } ostream *outFilePtr = &cout; ofstream outFileStrm; ofstream unalignedFile; ostream *unalignedFilePtr = NULL; ofstream metricsOut, lcpBoundsOut; ofstream anchorFileStrm; ofstream clusterOut, *clusterOutPtr; if (params.anchorFileName != "") { CrucialOpen(params.anchorFileName, anchorFileStrm, std::ios::out); } if (params.clusterFileName != "") { CrucialOpen(params.clusterFileName, clusterOut, std::ios::out); clusterOutPtr = &clusterOut; clusterOut << "total_size p_value n_anchors read_length align_score read_accuracy anchor_probability min_exp_anchors seq_length" << endl; } else { clusterOutPtr = NULL; } if (params.outFileName != "") { if (not params.printBAM) { CrucialOpen(params.outFileName, outFileStrm, std::ios::out); outFilePtr = &outFileStrm; } // otherwise, use bamWriter and initialize it later } if (params.printHeader) { switch(params.printFormat) { case(SummaryPrint): SummaryOutput::PrintHeader(*outFilePtr); break; case(Interval): IntervalOutput::PrintHeader(*outFilePtr); break; case(CompareSequencesParsable): CompareSequencesOutput::PrintHeader(*outFilePtr); break; } } if (params.printUnaligned == true) { CrucialOpen(params.unalignedFileName, unalignedFile, std::ios::out); unalignedFilePtr = &unalignedFile; } if (params.metricsFileName != "") { CrucialOpen(params.metricsFileName, metricsOut); } if (params.lcpBoundsFileName != "") { CrucialOpen(params.lcpBoundsFileName, lcpBoundsOut); // lcpBoundsOut << "pos depth width lnwidth" << endl; } // // Configure the mapping database. // MappingData *mapdb = new MappingData[params.nProc]; int procIndex; pthread_attr_t *threadAttr = new pthread_attr_t[params.nProc]; // MappingSemaphores semaphores; // // When there are multiple processes running along, sometimes there // are semaphores to worry about. 
// if (params.nProc > 1) { semaphores.InitializeAll(); } for (procIndex = 0; procIndex < params.nProc; procIndex++ ){ pthread_attr_init(&threadAttr[procIndex]); } // // Start the mapping jobs. // if (params.subsample < 1) { InitializeRandomGeneratorWithTime(); reader = new ReaderAgglomerate(params.subsample); } else { reader = new ReaderAgglomerate(params.startRead, params.stride); } // In case the input is fasta, make all bases in upper case. reader->SetToUpper(); regionTableReader = new HDFRegionTableReader; RegionTable regionTable; // // Store lists of how long it took to map each read. // metrics.clocks.SetStoreList(true); if (params.useCcs) { reader->UseCCS(); } string commandLineString; // Restore command. clp.CommandLineToString(argc, argv, commandLineString); if (params.printSAM or params.printBAM) { string so = "UNKNOWN"; // sorting order; string version = GetVersion(); //blasr version; SAMHeaderPrinter shp(so, seqdb, params.queryFileNames, params.queryReadType, params.samQVList, "BLASR", version, commandLineString); string headerString = shp.ToString();// SAM/BAM header if (params.printSAM) { // this is not going to be executed since sam is printed via bam *outFilePtr << headerString; } else if (params.printBAM) { // here both bam and sam are handled #ifdef USE_PBBAM PacBio::BAM::BamHeader header = PacBio::BAM::BamHeader(headerString); // Create bam header // Both file name and SAMHeader are required in order to create a BamWriter. // sam_via_bam changes if (params.sam_via_bam) { bamWriterPtr = new PacBio::BAM::SamWriter(params.outFileName, header); } else { bamWriterPtr = new PacBio::BAM::BamWriter(params.outFileName, header); } #else REQUIRE_PBBAM_ERROR(); #endif } } for (size_t readsFileIndex = 0; readsFileIndex < params.queryFileNames.size(); readsFileIndex++ ){ params.readsFileIndex = readsFileIndex; // // Configure the reader to use the correct read and region // file names. 
// reader->SetReadFileName(params.queryFileNames[params.readsFileIndex]); // if PBBAM , need to construct scrap file name and check if exist // // Initialize using already set file names. // // unrolled Need to pass unrolled option // unrolled If not PBDATASET also need to construct scrap file name and // test if it exists in the same directory, if not exit with error message // int initReturnValue; if ( ( (reader->GetFileType() == FileType::PBDATASET) || (reader->GetFileType() == FileType::PBBAM)) and not params.mapSubreadsSeparately) { if ( reader->GetFileType() == FileType::PBBAM ) { reader->SetScrapsFileName(params.scrapsFileNames[params.readsFileIndex]); } initReturnValue = reader->Initialize(true); } else { initReturnValue = reader->Initialize(); } if (initReturnValue <= 0) { cerr << "WARNING! Could not open file " << params.queryFileNames[params.readsFileIndex] << endl; continue; } // Check whether use ccs only. if (reader->GetFileType() == FileType::HDFCCSONLY) { params.useAllSubreadsInCcs = false; params.useCcs = params.useCcsOnly = true; } string changeListIdString; reader->hdfBasReader.GetChangeListID(changeListIdString); ChangeListID changeListId(changeListIdString); params.qvScaleType = DetermineQVScaleFromChangeListID(changeListId); if (reader->FileHasZMWInformation() and params.useRegionTable) { if (params.readSeparateRegionTable) { if (regionTableReader->Initialize(params.regionTableFileNames[params.readsFileIndex]) == 0) { cout << "ERROR! Could not read the region table " << params.regionTableFileNames[params.readsFileIndex] <HasRegionTable()) { if (regionTableReader->Initialize(params.queryFileNames[params.readsFileIndex]) == 0) { cout << "ERROR! Could not read the region table " << params.queryFileNames[params.readsFileIndex] <ReadTable(regionTable); regionTableReader->Close(); } // // Check to see if there is a separate ccs fofn. If there is a separate // ccs fofn, use that over the one in the bas file. 
// //if (params.readSeparateCcsFofn and params.useCcs) { // if (reader->SetCCS(params.ccsFofnFileNames[params.readsFileIndex]) == 0) { // cout << "ERROR! Could not read the ccs file " // << params.ccsFofnFileNames[params.readsFileIndex] << endl; // exit(1); // } // } if (reader->GetFileType() != FileType::HDFCCS and reader->GetFileType() != FileType::HDFBase and reader->GetFileType() != FileType::HDFPulse and reader->GetFileType() != FileType::PBBAM and reader->GetFileType() != FileType::PBDATASET and params.concordant) { cerr << "WARNING! Option concordant is only enabled when " << "input reads are in PacBio bax/pls.h5, bam or " << "dataset xml format." << endl; params.concordant = false; } #ifdef USE_GOOGLE_PROFILER char *profileFileName = getenv("CPUPROFILE"); if (profileFileName != NULL) { ProfilerStart(profileFileName); } else { ProfilerStart("google_profile.txt"); } #endif assert (initReturnValue > 0); if (params.nProc == 1) { mapdb[0].Initialize(&sarray, &genome, &seqdb, &ct, params, reader, ®ionTable, outFilePtr, unalignedFilePtr, &anchorFileStrm, clusterOutPtr); mapdb[0].bwtPtr = &bwt; if (params.fullMetricsFileName != "") { mapdb[0].metrics.SetStoreList(true); } if (params.lcpBoundsFileName != "") { mapdb[0].lcpBoundsOutPtr = &lcpBoundsOut; } else { mapdb[0].lcpBoundsOutPtr = NULL; } MapReads(&mapdb[0]); metrics.Collect(mapdb[0].metrics); } else { pthread_t *threads = new pthread_t[params.nProc]; for (procIndex = 0; procIndex < params.nProc; procIndex++ ){ // // Initialize thread-specific parameters. 
// mapdb[procIndex].Initialize(&sarray, &genome, &seqdb, &ct, params, reader, ®ionTable, outFilePtr, unalignedFilePtr, &anchorFileStrm, clusterOutPtr); mapdb[procIndex].bwtPtr = &bwt; if (params.fullMetricsFileName != "") { mapdb[procIndex].metrics.SetStoreList(true); } if (params.lcpBoundsFileName != "") { mapdb[procIndex].lcpBoundsOutPtr = &lcpBoundsOut; } else { mapdb[procIndex].lcpBoundsOutPtr = NULL; } if (params.outputByThread) { ofstream *outPtr =new ofstream; mapdb[procIndex].outFilePtr = outPtr; stringstream outNameStream; outNameStream << params.outFileName << "." << procIndex; mapdb[procIndex].params.outFileName = outNameStream.str(); CrucialOpen(mapdb[procIndex].params.outFileName, *outPtr, std::ios::out); } pthread_create(&threads[procIndex], &threadAttr[procIndex], (void* (*)(void*))MapReads, &mapdb[procIndex]); } for (procIndex = 0; procIndex < params.nProc; procIndex++) { pthread_join(threads[procIndex], NULL); } for (procIndex = 0; procIndex < params.nProc; procIndex++) { metrics.Collect(mapdb[procIndex].metrics); if (params.outputByThread) { delete mapdb[procIndex].outFilePtr; } } if (threads) { delete threads; threads = NULL; } } reader->Close(); } if (!reader) {delete reader; reader = NULL;} fastaGenome.Free(); #ifdef USE_GOOGLE_PROFILER ProfilerStop(); #endif if (mapdb != NULL) { delete[] mapdb; } if (threadAttr != NULL) { delete[] threadAttr; } seqdb.FreeDatabase(); if (regionTableReader) { delete regionTableReader; } if (params.metricsFileName != "") { metrics.PrintSummary(metricsOut); } if (params.fullMetricsFileName != "") { metrics.PrintFullList(fullMetricsFile); } if (params.outFileName != "") { if (params.printBAM) { #ifdef USE_PBBAM assert(bamWriterPtr); try { if (!params.sam_via_bam) { // no need to flush for SAM , but need to understand why bamWriterPtr->TryFlush(); } delete bamWriterPtr; bamWriterPtr = NULL; } catch (std::exception e) { cout << "Error, could not flush bam records to bam file." 
<< endl; exit(1); } #else REQUIRE_PBBAM_ERROR(); #endif } else { outFileStrm.close(); } } cerr << "[INFO] " << GetTimestamp() << " [blasr] ended." << endl; return 0; } blasr-smrtanalysis-4.0.0/LICENSE000066400000000000000000000033601302464523700163620ustar00rootroot00000000000000// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. // // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted (subject to the limitations in the // disclaimer below) provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // // * Neither the name of Pacific Biosciences nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE // GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC // BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE // DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF // USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND // ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT // OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF // SUCH DAMAGE. blasr-smrtanalysis-4.0.0/LICENSES.txt000066400000000000000000000031211302464523700173160ustar00rootroot00000000000000Copyright (c) 2011-2015, Pacific Biosciences of California, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Pacific Biosciences nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. blasr-smrtanalysis-4.0.0/README.INSTALL.md000066400000000000000000000054611302464523700177450ustar00rootroot00000000000000## Installation ### See a step by step Blasr installation example on Blasr wiki page https://github.com/PacificBiosciences/blasr/wiki/Step-by-step-blasr-installation-example ### Download source code * To pull this project from git hub to your local system: git clone git://github.com/PacificBiosciences/blasr.git blasr * To sync your code with the latest git code base: git pull --rebase origin master && git submodule update --init * To update the submodule: make update-submodule ### Requirements * To configure: ./configure.py --shared --sub --no-pbbam * or with HDF5 directories (and note that `HDF5_LIB` is a *directory* here): ./configure.py --shared --sub --no-pbbam HDF5_INCLUDE=... HDF5_LIB=... To build BLASR, you must have hdf 1.8.12 or above installed and configured with c++ support (you should have the library libhdf5_cpp.a). If you are intalling the entire PacBio secondary analysis software suite, appropriate hdf libraries are already distributed and no configuration is necessary. 
Otherwise, it is necessary to point two environment variables: + **HDF5_INCLUDE**, which points to directory of the HDF5 headers (e.g., hdf5.h) + **HDF5_LIB**, which points to the HDF5 library directory (e.g., hdf5*.a, and hdf5*.so) You may pass arguments to `configure.py` as above, or you may export them from command line: export HDF5_INC=path_to_your_hdf5_include && export HDF5_LIB=path_to_your_hdf5_lib * To configure submodule: make configure-submodule ### Build * To make the 'libcpp' libraries: make build-submodule * To make 'blasr' only: make blasr * To compile all tools, including blasr, pls2fasta, loadPulses, sawriter: make * Frequently used executables will be under utils. * To test (with **cram** installed): #make cramtests make cramfast ## Currently: ## Ran 22 tests, 0 skipped, 4 failed. * To clean all compiled tools and lib: make cleanall * To clean compiled tools without cleaning lib: make clean make blasr ./blasr ### CXXFLAGS * For optimized builds: ./configure.py CXXFLAGS=-O3 ... * For debug builds: ./configure.py CXXFLAGS=-g ... ## Other issues ### Static binaries If you want static binaries, drop `--shared` when you run configure.py. In that case, you might need to pass `-lsz` to make, if you built HDF5 with szlib support (`--with-szlib`). ./configure.py --with-szlib ... See [our issues](https://github.com/PacificBiosciences/blasr/issues/113#issuecomment-143981496). If you have macosx (Darwin), then you almost certainly want non-static binaries (--shared). ### blasr_libcpp If you have built and installed blasr_libcpp elsewhere, then drop `--sub` and do not run `make build-submodule`. blasr-smrtanalysis-4.0.0/README.MANUAL.md000066400000000000000000000022251302464523700176070ustar00rootroot00000000000000## Running BLASR Typing 'blasr -h' or 'blasr -help' on the command line will give you a list of options. At the least, provide a fasta, fastq, or bas.h5 file, and a genome. 
### Some typical use cases Align reads from reads.bas.h5 to ecoli_K12 genome, and output in SAM format. blasr reads.bas.h5 ecoli_K12.fasta -sam Same as above, but with soft clipping blasr reads.bas.h5 ecoli_K12.fasta -sam -clipping soft Use multiple threads blasr reads.bas.h5 ecoli_K12.fasta -sam -clipping soft -out alignments.sam -nproc 16 Include a larger minimal match, for faster but less sensitive alignments blasr reads.bas.h5 ecoli_K12.fasta -sam -clipping soft -minMatch 15 Produce alignments in a pairwise human readable format blasr reads.bas.h5 ecoli_K12.fasta -m 0 Use a precomputed suffix array for faster startup sawriter hg19.fasta.sa hg19.fasta #First precompute the suffix array blasr reads.bas.h5 hg19.fasta -sa hg19.fasta.sa Use a precomputed BWT-FM index for smaller runtime memory footprint, but slower alignments. sa2bwt hg19.fasta hg19.fasta.sa hg19.fasta.bwt blasr reads.bas.h5 hg19.fasta -bwt hg19.fasta.bwt blasr-smrtanalysis-4.0.0/README.md000066400000000000000000000001341302464523700166300ustar00rootroot00000000000000See Blasr [README.INSTALL.md](README.INSTALL.md) and [README.MANUAL.md](README.MANUAL.md). blasr-smrtanalysis-4.0.0/configure.py000077500000000000000000000206731302464523700177210ustar00rootroot00000000000000#!/usr/bin/env python """Configure the build. 
- Create defines.mk """ import commands import contextlib import optparse import os import sys import warnings #DEFAULTCXXFLAG := -O3 #DEBUGCXXFLAG := -g -ggdb -fno-inline #PROFILECXXFLAG := -Os -pg #GCXXFLAG := -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free -fno-omit-frame-pointer ROOT = os.path.abspath(os.path.dirname(__file__)) def log(msg): sys.stderr.write(msg) sys.stderr.write('\n') def shell(cmd): log('`%s`'%cmd) status, output = commands.getstatusoutput(cmd) if status: raise Exception('%d <- %r' %(status, cmd)) log(output) return output def system(cmd): log(cmd) status = os.system(cmd) if status: raise Exception('%d <- %r' %(status, cmd)) return def mkdirs(path): if not os.path.isdir(path): os.makedirs(path) @contextlib.contextmanager def cd(nwd): cwd = os.getcwd() log('cd %r -> %r' %(cwd, nwd)) os.chdir(nwd) yield os.chdir(cwd) log('cd %r <- %r' %(cwd, nwd)) def update_content(fn, content): current_content = open(fn).read() if os.path.exists(fn) else None if content != current_content: log('writing to %r:' %fn) log('"""\n' + content + '"""\n----') open(fn, 'w').write(content) def get_OS_STRING(): G_BUILDOS_CMD = """bash -c 'set -e; set -o pipefail; id=$(lsb_release -si | tr "[:upper:]" "[:lower:]"); rel=$(lsb_release -sr); case $id in ubuntu) printf "$id-%04d\n" ${rel/./};; centos) echo "$id-${rel%%.*}";; *) echo "$id-$rel";; esac' 2>/dev/null""" return shell(G_BUILDOS_CMD) def get_PREBUILT(): cmd = 'cd ../../../../prebuilt.out 2>/dev/null && pwd || echo -n notfound' return shell(cmd) def ifenvf(env, key, func): if key in env: return env[key] else: return func() def setifenvf(envout, envin, key, func): envout[key] = ifenvf(envin, key, func) def setifenv(envout, envin, key, val): envout[key] = envin.get(key, val) def setenv(envout, key, val): envout[key] = val def update_env_if(envout, envin, keys): for key in keys: if key in envin: envout[key] = envin[key] def compose_defs_env(env): # We disallow env overrides for some 
things with defaults from GNU make. nons = ['CXX', 'CC', 'AR'] # 'SHELL'? ovr = ['%-20s ?= %s' %(k, v) for k,v in env.items() if k not in nons] nonovr = ['%-20s := %s' %(k, v) for k,v in env.items() if k in nons] return '\n'.join(ovr + nonovr + ['']) def compose_defines_pacbio(envin): """ This is used by mobs via buildcntl.sh. """ env = dict() setenv(env, 'SHELL', 'bash') #setifenvf(env, envin, 'OS_STRING', get_OS_STRING) #setifenvf(env, envin, 'PREBUILT', get_PREBUILT) nondefaults = set([ 'CXX', 'CXXFLAGS', 'NOPBBAM', 'LIBPBDATA_INC', 'LIBPBDATA_LIB', 'LIBPBDATA_LIBFLAGS', 'LIBPBIHDF_INC', 'LIBPBIHDF_LIB', 'LIBPBIHDF_LIBFLAGS', 'LIBBLASR_INC', 'LIBBLASR_LIB', 'LIBBLASR_LIBFLAGS', 'HDF5_INC', 'HDF5_LIB', 'HDF5_LIBFLAGS', 'PBBAM_INC', 'PBBAM_LIB', 'PBBAM_LIBFLAGS', 'HTSLIB_INC', 'HTSLIB_LIB', 'HTSLIB_LIBFLAGS', 'BOOST_INC', 'GCC_LIB', 'ZLIB_LIB', 'ZLIB_LIBFLAGS', 'SZLIB_LIB', 'SZLIB_LIBFLAGS', 'PTHREAD_LIBFLAGS', 'DL_LIBFLAGS', 'RT_LIBFLAGS', ]) update_env_if(env, envin, nondefaults) return compose_defs_env(env) def configure_pacbio(envin, shared, build_dir): content1 = compose_defines_pacbio(envin) if not shared: content1 += 'LDFLAGS+=-static\n' update_content(os.path.join(build_dir, 'defines.mk'), content1) def set_defs_submodule_defaults(env, nopbbam): subdir = os.path.join(ROOT, 'libcpp') defaults = { 'LIBPBDATA_INC': os.path.join(subdir, 'pbdata'), 'LIBBLASR_INC': os.path.join(subdir, 'alignment'), #'LIBPBIHDF_INC': '' if nopbbam else os.path.join(subdir, 'hdf'), 'LIBPBDATA_LIB': os.path.join(subdir, 'pbdata'), 'LIBBLASR_LIB': os.path.join(subdir, 'alignment'), #'LIBPBIHDF_LIB': '' if nopbbam else os.path.join(subdir, 'hdf'), } for k in defaults: if k not in env: env[k] = defaults[k] def update_defaults_for_os(env): OS = shell('uname') if 'Darwin' in OS: #-lsz (for static builds?) 
env['RT_LIBFLAGS'] = '' def set_defs_defaults(env, nopbbam, with_szlib): defaults = { 'LIBBLASR_INC': os.path.join(ROOT, 'libcpp', 'alignment'), 'LIBPBDATA_INC': os.path.join(ROOT, 'libcpp', 'pbdata'), 'LIBPBIHDF_INC': os.path.join(ROOT, 'libcpp', 'hdf'), 'LIBBLASR_LIB': os.path.join(ROOT, 'libcpp', 'alignment'), 'LIBPBDATA_LIB': os.path.join(ROOT, 'libcpp', 'pbdata'), 'LIBPBIHDF_LIB': os.path.join(ROOT, 'libcpp', 'hdf'), 'LIBBLASR_LIBFLAGS': '-lblasr', 'LIBPBDATA_LIBFLAGS': '-lpbdata', 'LIBPBIHDF_LIBFLAGS': '-lpbihdf', 'HDF5_LIBFLAGS': '-lhdf5_cpp -lhdf5', 'RT_LIBFLAGS': '-lrt', 'ZLIB_LIBFLAGS': '-lz', 'PTHREAD_LIBFLAGS': '-lpthread', 'DL_LIBFLAGS': '-ldl', # neeeded by HDF5 always 'SHELL': 'bash -xe', } try: update_defaults_for_os(defaults) except Exception as e: warnings.warn(e) #setifenvf(defaults, env, 'OS_STRING', get_OS_STRING) #setifenvf(defaults, env, 'PREBUILT', get_PREBUILT) pbbam_defaults = { 'PBBAM_LIBFLAGS': '-lpbbam', 'HTSLIB_LIBFLAGS': '-lhts', 'ZLIB_LIBFLAGS': '-lz', #'PTHREAD_LIBFLAGS': '-lpthread', #'DL_LIBFLAGS': '-ldl', # neeeded by HDF5 always } if not nopbbam: defaults.update(pbbam_defaults) else: defaults['NOPBBAM'] = 1 szlib_defaults = { 'SZLIB_LIBFLAGS': '-lsz', #'ZLIB_LIBFLAGS': '-lz', # probably needed, but provided elsewhere } if with_szlib: defaults.update(szlib_defaults) for k in defaults: if k not in env: env[k] = defaults[k] def get_make_style_env(envin, args): envout = dict() for arg in args: if '=' in arg: k, v = arg.split('=') envout[k] = v envout.update(envin) return envout def parse_args(args): parser = optparse.OptionParser() parser.add_option('--no-pbbam', action='store_true', help='Avoid compiling anything which would need pbbam.') parser.add_option('--with-szlib', action='store_true', help='If HDF5 was built with --with-szlib, then -lsz is needed for static binaries.') parser.add_option('--submodules', action='store_true', help='Set variables to use our git-submodules, which must be pulled and built first. 
(Implies --no-pbbam.)') parser.add_option('--shared', action='store_true', help='Build for dynamic linking. (Non-static binaries.)') parser.add_option('--build-dir', help='Can be different from source directory, but only when *not* also building submodule.') return parser.parse_args(list(args)) def symlink_makefile(build_dir_root, src_dir_root, makefilename, relpath): src_dir = os.path.join(src_dir_root, relpath) build_dir = os.path.join(build_dir_root, relpath) src_name = os.path.join(src_dir, 'makefile') dst_name = os.path.join(build_dir, 'makefile') if os.path.lexists(dst_name): os.unlink(dst_name) print('%r <- %r' %(src_name, dst_name)) mkdirs(build_dir) os.symlink(src_name, dst_name) def symlink_makefiles(build_dir): symlink_makefile(build_dir, ROOT, 'makefile', '.') symlink_makefile(build_dir, ROOT, 'makefile', 'utils') symlink_makefile(build_dir, ROOT, 'makefile', 'extrautils') def main(prog, *args): """We are still deciding what env-vars to use, if any. """ # Set up an alias, until everything uses one consistently. conf, makevars = parse_args(args) if conf.build_dir is not None: symlink_makefiles(conf.build_dir) else: conf.build_dir = '.' 
conf.build_dir = os.path.abspath(conf.build_dir) envin = get_make_style_env(os.environ, makevars) if 'HDF5_INCLUDE' in envin and 'HDF5_INC' not in envin: envin['HDF5_INC'] = envin['HDF5_INCLUDE'] if conf.submodules: set_defs_submodule_defaults(envin, conf.no_pbbam) conf.no_pbbam = True set_defs_defaults(envin, conf.no_pbbam, conf.with_szlib) configure_pacbio(envin, conf.shared, conf.build_dir) if __name__=="__main__": main(*sys.argv) blasr-smrtanalysis-4.0.0/cram.mk000066400000000000000000000024511302464523700166300ustar00rootroot00000000000000FAST_CTESTS := \ ctest/ecoli.t \ ctest/fastMaxInterval.t \ ctest/aggressiveIntervalCut.t \ ctest/multipart.t \ ctest/affineAlign.t ctest/bamOut.t ctest/ccsH5.t ctest/filtercriteria.t ctest/m0-5.t \ ctest/fofn.t \ ctest/alignScore.t ctest/hitpolicy.t ctest/noSplitSubreads.t \ ctest/bamIn.t ctest/open_fail.t ctest/verbose.t ctest/deterministic.t MILD_CTESTS := \ ctest/concordant.t ctest/bug25766.t ctest/holeNumbers.t SLOW_CTESTS := ctest/bug25328.t # XXX: following tests sidelined, needs bam input after --sam option removed # MILD: ctest/useccsallBestN1.t # sidelined because of changes in directories # # needed to restore /mnt/data3/vol53/2450530/0014 # SLOW ctest/useccsallLargeGenome.t #BLASR_PATH=/mnt/secondary/builds/full/3.0.0/prod/current-build_smrtanalysis/private/otherbins/internalall/bin/ #export BLASR_PATH cramfast: cram -v --shell=/bin/bash ${FAST_CTESTS} crammild: cram -v --shell=/bin/bash ${MILD_CTESTS} cramslow: cram -v --shell=/bin/bash ${SLOW_CTESTS} cramtests: cram -v --shell=/bin/bash ${FAST_CTESTS} ${MILD_CTESTS} ${SLOW_CTESTS} cramqu: for test in ${FAST_CTESTS}; do \ qsub -pe smp 15 -V -cwd -b y -N cramqu $@cram -v --shell=bin/bash $$test;\ done clean: rm -f cramqu.* ctest/*.err 
blasr-smrtanalysis-4.0.0/ctest/000077500000000000000000000000001302464523700164755ustar00rootroot00000000000000blasr-smrtanalysis-4.0.0/ctest/.gitignore000066400000000000000000000000061302464523700204610ustar00rootroot00000000000000/out/ blasr-smrtanalysis-4.0.0/ctest/affineAlign.t000066400000000000000000000013061302464523700210650ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test affineAlign $ rm -rf $OUTDIR/affineAlign.m0 $ $EXEC $DATDIR/affineAlign.fofn $DATDIR/substr_with_ins.fasta -m 0 --out $OUTDIR/affineAlign.m0 --affineAlign --holeNumbers 493 --insertion 100 --deletion 100 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/affineAlign.m0 $STDDIR/affineAlign_2014_06_10.m0 $ rm -rf $OUTDIR/ecoli_affine.m0 $ $EXEC $DATDIR/ecoli_affine.fasta $DATDIR/ecoli_reference.fasta -m 0 --out $OUTDIR/ecoli_affine.m0 --affineAlign --insertion 100 --deletion 100 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/ecoli_affine.m0 $STDDIR/ecoli_affine_2014_06_10.m0 # Note that MapQV for --affineAlign has been fixed in 2014 04 18, bug 24363 blasr-smrtanalysis-4.0.0/ctest/aggressiveIntervalCut.t000066400000000000000000000010221302464523700231750ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test --aggressiveIntervalCut. $ rm -f $TMP1 $ BASFILE=/pbi/dept/secondary/siv/testdata/BlasrTestData/ctest/data/aggressiveIntervalCut/m130812_185809_42141_c100533960310000001823079711101380_s1_p0.bas.h5 $ REFFA=/pbi/dept/secondary/siv/testdata/BlasrTestData/ctest/data/references/Ecoli_BL21_O26/sequence/Ecoli_BL21_O26.fasta $ $EXEC $BASFILE $REFFA --holeNumbers 1--100 --out $TMP1 --aggressiveIntervalCut [INFO] * [blasr] started. (glob) [INFO] * [blasr] ended. (glob) $ echo $? 0 blasr-smrtanalysis-4.0.0/ctest/alignScore.t000066400000000000000000000004141302464523700207470ustar00rootroot00000000000000Set up $ . 
$TESTDIR/setup.sh Test alignment score $ rm -rf $OUTDIR/testscore.m0 $ $EXEC $DATDIR/read.fasta $DATDIR/ref.fasta --minReadLength 1 -m 0 --out $OUTDIR/testscore.m0 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/testscore.m0 $STDDIR/testscore.m0 blasr-smrtanalysis-4.0.0/ctest/bamConcordant.t000066400000000000000000000020401302464523700214300ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test using bam as input, use --concordant $ $EXEC $DATDIR/test_bam/tiny_bam.fofn $DATDIR/bamConcordantRef.fasta --bam --concordant --refineConcordantAlignments --bestn 1 --out $OUTDIR/bamConcordant.bam [INFO]* (glob) [INFO]* (glob) Check whether sam out and bam out have identical alignments, not checking qvs $ $SAMTOOLS view $OUTDIR/bamConcordant.bam |cut -f 4 1 1 8??? (glob) 86?? (glob) 86?? (glob) 86?? (glob) 86?? (glob) 86?? (glob) 86?? (glob) 86?? (glob) 86?? (glob) 86?? (glob) 86?? (glob) 86?? (glob) 86?? (glob) 86?? (glob) $ $EXEC /pbi/dept/secondary/siv/testdata/SA3-RS/lambda/2372215/0007_tiny/Analysis_Results/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam $DATDIR/lambda_ref.fasta -m 4 --concordant --bestn 1 --holeNumbers 17417 --out $OUTDIR/tmp.m4 -V 2 > $OUTDIR/bamConcordant.log [INFO]* (glob) [INFO]* (glob) $ grep "Concordant template" $OUTDIR/bamConcordant.log Concordant template subread index: 8, 17417/14708_16595 blasr-smrtanalysis-4.0.0/ctest/bamIn.t000066400000000000000000000013701302464523700177110ustar00rootroot00000000000000Set up $ . 
$TESTDIR/setup.sh Test using bam as input $ $EXEC $DATDIR/test_bam/tiny_bam.fofn $DATDIR/lambda_ref.fasta -m 4 --out $OUTDIR/tiny_bam_in.m4 [INFO]* (glob) [INFO]* (glob) Check whether blasr produces identical results taking fasta sequences of the bam as input $ $EXEC $DATDIR/test_bam/tiny_fasta.fofn $DATDIR/lambda_ref.fasta -m 4 --out $OUTDIR/tiny_fasta_in.m4 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/tiny_bam_in.m4 $OUTDIR/tiny_fasta_in.m4 Test bam in, bam out $ $EXEC $DATDIR/test_bam/tiny_bam.fofn $DATDIR/lambda_ref.fasta --bam --out $OUTDIR/tiny_bam_in.bam --clipping subread [INFO]* (glob) [INFO]* (glob) TODO: test --concordant, when pbbam API to query over ZMWs is available. TODO: test bam with ccs reads blasr-smrtanalysis-4.0.0/ctest/bamOut.t000066400000000000000000000021171302464523700201120ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test generating bam output Input is bam, clipping=soft and subread should produce identical results $ $EXEC $DATDIR/test_bam/tiny_bam.fofn $DATDIR/lambda_ref.fasta --bam --out $OUTDIR/tiny_bam_in_soft.bam --clipping soft [INFO]* (glob) [INFO]* (glob) $ $EXEC $DATDIR/test_bam/tiny_bam.fofn $DATDIR/lambda_ref.fasta --bam --out $OUTDIR/tiny_bam_in_subread.bam --clipping subread [INFO]* (glob) [INFO]* (glob) $ $SAMTOOLS view $OUTDIR/tiny_bam_in_soft.bam | sed -n '6,$p' > $TMP1.bam_in_soft $ $SAMTOOLS view $OUTDIR/tiny_bam_in_subread.bam | sed -n '6,$p' > $TMP2.bam_in_subread $ diff $TMP1.bam_in_soft $TMP2.bam_in_subread Test if bam cigar strings are correct $ head -2 $TMP1.bam_in_soft |cut -f 6 25=1I28=1I41=1I5=1D6=1X12=1I15=1I2=1I16=1D10=1I11=1I74=1D12=1D7=3I4=1I6=1D1=2D14=1D16=1I8=1D4=1D5=1D20=1I3=1I10=1I37=1I13=1I25=1I15=1I7=1I11=1I3=2I1=1I16=1I6=1I8=1I11=1X1=1I5=1I56=1I17= 28=1D7=1I1=1I9=2I12=1I3=1D13=1I15=1I2=1X49=1I19=1I14=1I5=1D17=1D20=1D86=1I21=1I9=1I24=1I6=1I1=1I2=1D11=1D4=1D3=1D31=1D6=1I6=1I9=1I57=2I24=1I26=1I8=1I43=1S 
blasr-smrtanalysis-4.0.0/ctest/bug25328.t000066400000000000000000000005571302464523700200520ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh bug_25328, unrolled resequencing test $ INFA=$DATDIR/bug_25328_zmw_38131.fasta $ REF=$DATDIR/All4mers_circular_72x_l50256.fasta $ OUTFA=$OUTDIR/bug_25328.m4 $ $EXEC $INFA $REF --bestn 1 --nCandidates 1 --forwardOnly --maxMatch 14 -m 4 --out $OUTFA [INFO]* (glob) [INFO]* (glob) $ awk '$7-$6 >= 15000' $OUTFA |wc -l 1 blasr-smrtanalysis-4.0.0/ctest/bug25766.t000066400000000000000000000005121302464523700200470ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh bug_25766, added an option --minRawSubreadScore $ BASFILE=$DATDIR/lambda_bax.fofn $ REF=$DATDIR/lambda_ref.fasta $ $EXEC $BASFILE $REF --out $TMP1 --minRawSubreadScore 700 --nproc 18 [INFO]* (glob) [INFO]* (glob) $ echo $? 0 $ sort $TMP1 > $TMP2 $ diff $TMP2 $STDDIR/bug_25766.m4 blasr-smrtanalysis-4.0.0/ctest/ccsH5.t000066400000000000000000000006471302464523700176360ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test using *.ccs.h5 as input # The results should be exactly the same as # blasr $DATDIR/ccsasinput_bas.fofn $DATDIR/ccsasinput.fasta -m 4 --out tmp.m4 --useccsdenovo $ rm -rf $OUTDIR/ccsasinput.m4 $ $EXEC $DATDIR/ccsasinput.fofn $DATDIR/ccsasinput.fasta -m 4 --out $OUTDIR/ccsasinput.m4 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/ccsasinput.m4 $STDDIR/ccsasinput_2014_06_10.m4 blasr-smrtanalysis-4.0.0/ctest/cigarAdjecentIndels.t000066400000000000000000000013321302464523700225430ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Without --allowAdjacentIndels, adjacent indels should not exist in SAM/BAM CIGAR strings $ $EXEC $DATDIR/test_dataset/nofilter.subreadset.xml $DATDIR/ecoli_reference.fasta --bam --out $OUTDIR/noAdjacentIndels.bam --concordant --refineConcordantAlignments --bestn 1 && echo $? 
[INFO]* (glob) [INFO]* (glob) 0 $ $SAMTOOLS view $OUTDIR/noAdjacentIndels.bam |cut -f 6 > $TMP1 $ grep 'ID' $TMP1 |wc -l 0 $ grep 'DI' $TMP1 |wc -l 0 With --allowAdjacentIndels $ $EXEC $DATDIR/test_dataset/nofilter.subreadset.xml $DATDIR/ecoli_reference.fasta --bam --out $OUTDIR/allowAdjacentIndels.bam --concordant --bestn 1 --allowAdjacentIndels && echo $? [INFO]* (glob) [INFO]* (glob) 0 blasr-smrtanalysis-4.0.0/ctest/concordant.t000066400000000000000000000036651302464523700210260ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test --concordant $ rm -rf $OUTDIR/concordant_subset.sam $OUTDIR/tmp1 $OUTDIR/tmp2 $ $EXEC $DATDIR/ecoli_lp.fofn $DATDIR/ecoli_reference.fasta --concordant --refineConcordantAlignments -m 4 --out $OUTDIR/concordant_subset.m4 --nproc 12 --holeNumbers 1--10000 --sa $DATDIR/ecoli_reference.sa [INFO]* (glob) [INFO]* (glob) $ sort $OUTDIR/concordant_subset.m4 > $OUTDIR/tmp1 Updated in 2016_10_05 --> changed output format from sam to m4, isolate concordant tests from file format tests $ diff $OUTDIR/tmp1 $STDDIR/2016_10_05/concordant_subset.m4 Test --concordant FMR1 case (the 'typical subread' is selected as template for concordant mapping) $ FOFN=$DATDIR/FMR1_concordant.fofn $ REF=$DATDIR/FMR1_130CGG.fasta $ $EXEC $FOFN $REF --concordant --refineConcordantAlignments --out $OUTDIR/FMR1_zmw_37927.m4 -m 4 --holeNumbers 37927 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/FMR1_zmw_37927.m4 $STDDIR/$UPDATEDATE/FMR1_zmw_37927.m4 #History #2014_05_28 --> changelist 135254, use MAX_BAND_SIZE to contrain GuidedAlign #2014_08_21 --> changelist 138516, added YS, YE, ZM tags. #2014_08_28 --> changelist 139176, update SAM MD5 #2014_09_12 --> changelist 140410, changed the default value of '--concordantTemplate' from 'longestsubread' to 'typicalsubread' #2014_09_17 --> changelist 140573, changed SDPFragment LessThan to make sure blasr compiled with gcc 4.4 and 4.8 can produce identical results. 
#2014_10_16 --> changelist 141378, changed the default value of '--concordantTemplate' from 'typicalsubread' to 'mediansubread' #2015_03_01 --> changelist 146599, reads from the same movie should have unique readGroupId #2015_03_28 --> changelist 148101, 148080 updated read group id, 148100 updated TLEN #2015_04_09 --> changelist 148796, updated read group id #2015_04_25 --> changelist 149721, update CIGAR string, replace M with X=. #2015_11_09 --> changelist 167117, added -refineConcordantAlignments blasr-smrtanalysis-4.0.0/ctest/dataset.t000066400000000000000000000032761302464523700203170ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test dataset.xml as input $ $EXEC $DATDIR/test_dataset/chunking.subreadset.xml $DATDIR/ecoli_reference.fasta -m 4 --out $OUTDIR/chunking.m4 --bestn 1 && echo $? [INFO]* (glob) [INFO]* (glob) 0 Test filters in dataset.xml is respected. $ cat $OUTDIR/chunking.m4 | wc -l 9 Test dataset.xml --bam output $ $EXEC $DATDIR/test_dataset/chunking.subreadset.xml $DATDIR/ecoli_reference.fasta --bam --out $OUTDIR/chunking.bam && echo $? [INFO]* (glob) [INFO]* (glob) 0 Test dataset.xml --concordant $ $EXEC $DATDIR/test_dataset/chunking.subreadset.xml $DATDIR/ecoli_reference.fasta --bam --out $OUTDIR/chunking.concordant.bam --concordant && echo $? [INFO]* (glob) [INFO]* (glob) 0 Test dataset with no filters (to make sure that an empty filter does not discard all bam records.) $ $EXEC $DATDIR/test_dataset/nofilter.subreadset.xml $DATDIR/ecoli_reference.fasta --bam --out $OUTDIR/nofilter.bam --concordant --bestn 1 && echo $? [INFO]* (glob) [INFO]* (glob) 0 $ $SAMTOOLS view $OUTDIR/nofilter.bam|wc -l 131 Test dataset with --concordant is on $ $EXEC $DATDIR/test_dataset/nofilter.subreadset.xml $DATDIR/bamConcordantRef.fasta --bam --concordant --refineConcordantAlignments --bestn 1 --out $OUTDIR/datasetConcordant.bam --holeNumbers 1898 && echo $? [INFO]* (glob) [INFO]* (glob) 0 $ $SAMTOOLS view $OUTDIR/datasetConcordant.bam | cut -f 4 ??? 
(glob) 3?? (glob) 3?? (glob) 3?? (glob) 3?? (glob) 3?? (glob) 3?? (glob) 3?? (glob) 3?? (glob) 3?? (glob) 3?? (glob) 3?? (glob) 3?? (glob) 3?? (glob) 3?? (glob) 3?? (glob) 3?? (glob) 3?? (glob) 3?? (glob) 3?? (glob) 3?? (glob) 3?? (glob) 3?? (glob) blasr-smrtanalysis-4.0.0/ctest/deterministic.t000066400000000000000000000023741302464523700215330ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test blasr with input bam which has: (1) insertionQV, deletionQV, deletionTag, substitutionQV, substitutionTag (2) insertionQV, deletionQV, deletionTag (3) no QV and then check if output is determined. (1) $ name=iq-dq-sub $ infile=$DATDIR/test_bam/$name.subreads.bam $ outfile=$OUTDIR/$name.m4 $ stdfile=$STDDIR/$name.m4 $ rm -f $outfile $ $EXEC $infile $DATDIR/lambda_ref.fasta -m 4 --out $outfile && echo $? [INFO]* (glob) [INFO]* (glob) 0 $ sort $outfile > $outfile.tmp && mv $outfile.tmp $outfile $ diff $outfile $stdfile (2) $ name=iq-dq $ infile=$DATDIR/test_bam/$name.subreads.bam $ outfile=$OUTDIR/$name.m4 $ stdfile=$STDDIR/$name.m4 $ rm -f $outfile $ $EXEC $infile $DATDIR/lambda_ref.fasta -m 4 --out $outfile && echo $? [INFO]* (glob) [INFO]* (glob) 0 $ sort $outfile > $outfile.tmp && mv $outfile.tmp $outfile $ diff $outfile $stdfile (3) $ name=no-iq-dq $ infile=$DATDIR/test_bam/$name.subreads.bam $ outfile=$OUTDIR/$name.m4 $ stdfile=$STDDIR/$name.m4 $ rm -f $outfile $ $EXEC $infile $DATDIR/lambda_ref.fasta -m 4 --out $outfile && echo $? [INFO]* (glob) [INFO]* (glob) 0 $ sort $outfile > $outfile.tmp && mv $outfile.tmp $outfile $ diff $outfile $stdfile blasr-smrtanalysis-4.0.0/ctest/ecoli.t000066400000000000000000000017501302464523700177600ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test blasr on ecoli. Test blasr with --bam # The following job takes a very long time to finish, let us use a subset of reads instead #See $STDOUT/ecoli_v1.4.sam for 1.4 output. # $STDOUT/ecoli_2014_03_28.sam for bug before mapQV for affineAlign/align without QV is fixed. 
$ rm -rf $OUTDIR/ecoli_subset.bam $ rm -rf $OUTDIR/ecoli_subset.sam $ $EXEC $DATDIR/ecoli_subset.fasta $DATDIR/ecoli_reference.fasta --bam --out $OUTDIR/ecoli_subset.bam --nproc 15 [INFO]* (glob) [INFO]* (glob) $ $SAMTOOLS view -h $OUTDIR/ecoli_subset.bam > $OUTDIR/ecoli_subset.sam $ sed -n '5,$ p' $OUTDIR/ecoli_subset.sam | sort | cut -f 1-11 > $OUTDIR/ecoli_subset_out $ sed -n '5,$ p' $STDDIR/2016_10_20/ecoli_subset.sam | sort | cut -f 1-11 > $OUTDIR/ecoli_subset_std $ diff $OUTDIR/ecoli_subset_out $OUTDIR/ecoli_subset_std # 2015_03_08 --> changelist 148101, 148080 updated read group id; 148100 updated TLEN # 2015_04_09 --> changelist 148796, updated read group id blasr-smrtanalysis-4.0.0/ctest/fastMaxInterval.t000066400000000000000000000010061302464523700217670ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test --fastMaxInterval. $ rm -f $TMP1 $ BASFILE=/pbi/dept/secondary/siv/testdata/BlasrTestData/ctest/data/aggressiveIntervalCut/m130812_185809_42141_c100533960310000001823079711101380_s1_p0.bas.h5 $ REFFA=/pbi/dept/secondary/siv/testdata/BlasrTestData/ctest/data/references/Ecoli_BL21_O26/sequence/Ecoli_BL21_O26.fasta $ $EXEC $BASFILE $REFFA --holeNumbers 1--100 --out $TMP1 --fastMaxInterval [INFO] * [blasr] started. (glob) [INFO] * [blasr] ended. (glob) $ echo $? 0 blasr-smrtanalysis-4.0.0/ctest/filtercriteria.t000066400000000000000000000011401302464523700216660ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh $ NAME=test_filtercriteria $ DATDIR=$DATDIR/$NAME $ OUTDIR=$OUTDIR/$NAME $ STDDIR=$STDDIR/$NAME $ mkdir -p $OUTDIR Test --minPctSimilarity $ I=$DATDIR/tiny_bam.fofn $ R=$DATDIR/lambdaNEB.fa $ O=$OUTDIR/min_pct_similarity_90.m4 $ $EXEC $I $R --out $O -m 4 --minPctSimilarity 90 [INFO]* (glob) [INFO]* (glob) $ echo $? 0 $ awk '$4 < 90 {print}' $O |wc -l |cut -f 1 -d ' ' 0 $ O=$OUTDIR/min_aln_len_1000.m4 $ $EXEC $I $R --out $O -m 4 --minAlnLength 1000 [INFO]* (glob) [INFO]* (glob) $ echo $? 
0 $ wc -l $O |cut -f 1 -d ' ' 14 blasr-smrtanalysis-4.0.0/ctest/fofn.t000066400000000000000000000014351302464523700176150ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test blasr with *.fofn input # $ rm -rf $OUTDIR/lambda_bax.m4 # $ $EXEC $DATDIR/lambda_bax.fofn $DATDIR/lambda_ref.fasta -m 4 --out lambda_bax_tmp.m4 --nproc 15 --minMatch 14 # [INFO]* (glob) # [INFO]* (glob) # $ sort lambda_bax_tmp.m4 > $OUTDIR/lambda_bax.m4 # $ diff $OUTDIR/lambda_bax.m4 $STDDIR/lambda_bax.m4 # This test takes a long time, use a subset instad. $ rm -rf $OUTDIR/lambda_bax_subset.m4 $ $EXEC $DATDIR/lambda_bax.fofn $DATDIR/lambda_ref.fasta -m 4 --out $OUTDIR/lambda_bax_tmp_subset.m4 --nproc 15 --minMatch 14 --holeNumbers 1--1000 --sa $DATDIR/lambda_ref.sa [INFO]* (glob) [INFO]* (glob) $ sort $OUTDIR/lambda_bax_tmp_subset.m4 > $OUTDIR/lambda_bax_subset.m4 $ diff $OUTDIR/lambda_bax_subset.m4 $STDDIR/lambda_bax_subset.m4 blasr-smrtanalysis-4.0.0/ctest/hitpolicy.t000066400000000000000000000052351302464523700206730ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh $ NAME=test_hitpolicy $ DATDIR=$DATDIR/$NAME $ OUTDIR=$OUTDIR/$NAME $ STDDIR=$STDDIR/$NAME $ mkdir -p $OUTDIR $ I=$DATDIR/tiny_bam.fofn $ R=$DATDIR/test_hitpolicy_target.fa $ O=$OUTDIR/hitpolicy_all.m4 $ X=$STDDIR/hitpolicy_all.m4 Test hitpolicy all $ $EXEC $I $R --out $O -m 4 --hitPolicy all [INFO]* (glob) [INFO]* (glob) $ echo $? 0 $ wc -l $O | cut -f 1 -d ' ' 683 Test hitpolicy allbest $ O=$OUTDIR/hitpolicy_allbest.m4 $ X=$STDDIR/hitpolicy_allbest.m4 $ $EXEC $I $R --out $O -m 4 --hitPolicy allbest && sort $O > $TMP1 && mv $TMP1 $O [INFO]* (glob) [INFO]* (glob) $ echo $? 0 $ sort $O > $TMP1 && mv $TMP1 $O $ diff $O $X && echo $? 
0 Test hitpolicy random $ O=$OUTDIR/hitpolicy_random.m4 $ O2=$OUTDIR/hitpolicy_random_2.m4 $ X=$STDDIR/hitpolicy_random.m4 $ $EXEC $I $R --out $O -m 4 --hitPolicy random --randomSeed 1 [INFO]* (glob) [INFO]* (glob) $ $EXEC $I $R --out $O2 -m 4 --hitPolicy random --randomSeed 1 [INFO]* (glob) [INFO]* (glob) $ sort $O > $TMP1 && mv $TMP1 $O $ sort $O2 > $TMP2 && mv $TMP2 $O2 $ diff $O $O2 && echo $? 0 Test hitpolicy randombest bam inputs, nproc > 1, fixed seed $ O=$OUTDIR/hitpolicy_randombest_bam_in.m4 $ O2=$OUTDIR/hitpolicy_randombest_bam_in_2.m4 $ X=$STDDIR/hitpolicy_randombest_bam_in.m4 $ $EXEC $I $R --out $O -m 4 --hitPolicy randombest --randomSeed 1 --nproc 10 [INFO]* (glob) [INFO]* (glob) $ $EXEC $I $R --out $O2 -m 4 --hitPolicy randombest --randomSeed 1 --nproc 10 [INFO]* (glob) [INFO]* (glob) $ sort $O > $TMP1 && mv $TMP1 $O $ sort $O2 > $TMP1 && mv $TMP1 $O2 $ diff $O $O2 && echo $? 0 Test hitpolicy randombest bax inputs, nproc > 1, fixed seed $ I=$DATDIR/tiny_bax.fofn $ O=$OUTDIR/hitpolicy_randombest_bax_in.m4 $ X=$STDDIR/hitpolicy_randombest_bax_in.m4 $ $EXEC $I $R --out $O -m 4 --hitPolicy randombest --randomSeed 1 --nproc 10 [INFO]* (glob) [INFO]* (glob) $ sort $O > $TMP1 && mv $TMP1 $O $ diff $O $X && echo $? 0 Test hitpolicy randombest fasta inputs, nproc > 1, fixed seed $ I=$DATDIR/tiny_fasta.fofn $ O=$OUTDIR/hitpolicy_randombest_fasta_in.m4 $ X=$STDDIR/hitpolicy_randombest_fasta_in.m4 $ $EXEC $I $R --out $O -m 4 --hitPolicy randombest --randomSeed 1 --nproc 10 [INFO]* (glob) [INFO]* (glob) $ sort $O > $TMP1 && mv $TMP1 $O $ diff $O $X && echo $? 0 Test hitpolicy leftmost $ O=$OUTDIR/hitpolicy_leftmost.m4 $ X=$STDDIR/hitpolicy_leftmost.m4 $ $EXEC $I $R --out $O -m 4 --hitPolicy leftmost --nproc 10 [INFO]* (glob) [INFO]* (glob) $ # target is lambda x 6, leftmost -> only map to the very first x. 
$ awk '$10 > 48502 {print}' $O |wc -l 0 blasr-smrtanalysis-4.0.0/ctest/holeNumbers.t000066400000000000000000000006011302464523700211420ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test --holeNumbers $ rm -f $OUTDIR/holeNumbers.m4 $ $EXEC $DATDIR/lambda_bax.fofn $DATDIR/lambda_ref.fasta -m 4 --out $OUTDIR/holeNumbers.m4 --holeNumbers 14798,55000--55100 --nproc 8 [INFO]* (glob) [INFO]* (glob) $ sort $OUTDIR/holeNumbers.m4 > $TMP1 $ sort $STDDIR/holeNumbers_2014_05_29.m4 > $TMP2 $ diff $TMP1 $TMP2 $ rm $TMP1 $TMP2 blasr-smrtanalysis-4.0.0/ctest/m0-5.t000066400000000000000000000016761302464523700173520ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test blasr with -m 0 ~ 5 $ rm -rf $OUTDIR/read.m0 $ $EXEC $DATDIR/read.fasta $DATDIR/ref.fasta -m 0 --out $OUTDIR/read.m0 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/read.m0 $STDDIR/read.m0 $ rm -rf $OUTDIR/read.m1 $ $EXEC $DATDIR/read.fasta $DATDIR/ref.fasta -m 1 --out $OUTDIR/read.m1 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/read.m1 $STDDIR/read_2014_05_29.m1 $ rm -rf $OUTDIR/read.m2 $ $EXEC $DATDIR/read.fasta $DATDIR/ref.fasta -m 2 --out $OUTDIR/read.m2 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/read.m2 $STDDIR/read.m2 $ rm -rf $OUTDIR/read.m3 $ $EXEC $DATDIR/read.fasta $DATDIR/ref.fasta -m 3 --out $OUTDIR/read.m3 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/read.m3 $STDDIR/read.m3 $ rm -rf $OUTDIR/read.m4 $ $EXEC $DATDIR/read.fasta $DATDIR/ref.fasta -m 4 --out $OUTDIR/read.m4 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/read.m4 $STDDIR/read.m4 blasr-smrtanalysis-4.0.0/ctest/multipart.t000066400000000000000000000011551302464523700207050ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test input.fofn containing a new bas.h5 file. Note that the new bas.h5 file does not contain any /PulseData, instead contains /MultiPart/Parts. 
$ rm -f $TMP1 $ BASFILE=/pbi/dept/secondary/siv/testdata/BlasrTestData/ctest/data/aggressiveIntervalCut/m130812_185809_42141_c100533960310000001823079711101380_s1_p0.bas.h5 $ REFFA=/pbi/dept/secondary/siv/testdata/BlasrTestData/ctest/data/references/Ecoli_BL21_O26/sequence/Ecoli_BL21_O26.fasta $ $EXEC $BASFILE $REFFA --holeNumbers 1--100 --out $TMP1 [INFO] * [blasr] started. (glob) [INFO] * [blasr] ended. (glob) $ echo $? 0 blasr-smrtanalysis-4.0.0/ctest/noSplitSubreads.t000066400000000000000000000042641302464523700220110ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test blasr with --noSplitSubreads # $ rm -rf $OUTDIR/lambda_bax_noSplitSubreads.m4 # $ $EXEC $DATDIR/lambda_bax.fofn $DATDIR/lambda_ref.fasta --noSplitSubreads -m 4 --out lambda_bax_noSplitSubreads_tmp.m4 --nproc 15 # [INFO]* (glob) # [INFO]* (glob) # $ sort lambda_bax_noSplitSubreads_tmp.m4 > $OUTDIR/lambda_bax_noSplitSubreads.m4 # $ diff $OUTDIR/lambda_bax_noSplitSubreads.m4 $STDDIR/lambda_bax_noSplitSubreads.m4 # This test takes a long time, use a subset instad. 
$ rm -rf $OUTDIR/lambda_bax_noSplitSubreads_subset.m4 $ $EXEC $DATDIR/lambda_bax.fofn $DATDIR/lambda_ref.fasta --noSplitSubreads -m 4 --out $OUTDIR/lambda_bax_noSplitSubreads_tmp_subset.m4 --nproc 15 --holeNumbers 1--1000 --sa $DATDIR/lambda_ref.sa [INFO]* (glob) [INFO]* (glob) $ sort $OUTDIR/lambda_bax_noSplitSubreads_tmp_subset.m4 > $OUTDIR/lambda_bax_noSplitSubreads_subset.m4 $ diff $OUTDIR/lambda_bax_noSplitSubreads_subset.m4 $STDDIR/lambda_bax_noSplitSubreads_subset.m4 # Test key command of unrolled resequencing, check bam header and alignments in output $ outbam=$OUTDIR/unrolled-4mer.bam $ outsam=$OUTDIR/unrolled-4mer.sam $ query=$DATDIR/unrolled/m54006_151021_185942.subreadset.xml $ ref=$DATDIR/unrolled/All4mer_V2_11_V2_13_V2_15_V2_44_circular_72x_l50256.fasta $ stdsam=$STDDIR/unrolled-4mer.sam $ rm -rf $outbam $outsam $ $EXEC $query $ref --out $outbam --noSplitSubreads --fastMaxInterval --bam [INFO]* (glob) [INFO]* (glob) $ $SAMTOOLS view -h $outbam -o $outsam $ grep -v '^@PG' $outsam > $TMP1 && grep -v '^@PG' $stdsam > $TMP2 && diff $TMP1 $TMP2 $ grep '@RG' $outsam @RG\tID:e6043908* (glob) $ grep 'RG:Z:e6043908' $outsam |wc -l 4 $ query=$DATDIR/unrolled/m54006_151021_185942.subreads.bam $ outbam=$OUTDIR/unrolled-4mer-bam-in.bam $ outsam=$OUTDIR/unrolled-4mer-bam-in.sam $ rm -rf $outbam $outsam $ $EXEC $query $ref --out $outbam --noSplitSubreads --fastMaxInterval --bam [INFO]* (glob) [INFO]* (glob) $ $SAMTOOLS view -h $outbam -o $outsam $ grep -v '^@PG' $outsam > $TMP1 && grep -v '^@PG' $stdsam > $TMP2 && diff $TMP1 $TMP2 $ grep '@RG' $outsam @RG\tID:e6043908* (glob) $ grep 'RG:Z:e6043908' $outsam |wc -l 4 blasr-smrtanalysis-4.0.0/ctest/open_fail.t000066400000000000000000000005101302464523700206120ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh If fail to open an bax/bas.h5 file because of unable to initialize required dataset, give an warning. 
$ $EXEC $DATDIR/open_fail_no_dyset.fofn $DATDIR/lambda_ref.fasta -m 4 [INFO]* (glob) Could not open /pbi/dept/secondary/siv/testdata/BlasrTestData/ctest/data/open_fail_no_dyset.fofn [1] blasr-smrtanalysis-4.0.0/ctest/setup.sh000077500000000000000000000022271302464523700201770ustar00rootroot00000000000000# Set up directories CURDIR=$TESTDIR REMOTEDIR=/pbi/dept/secondary/siv/testdata/BlasrTestData/ctest DATDIR=$REMOTEDIR/data OUTDIR=$CURDIR/out STDDIR=$REMOTEDIR/stdout # Set up the executable: blasr. #EXEC=${BLASR_PATH}/blasr EXEC=blasr # Define tmporary files TMP1=$OUTDIR/$$.tmp.out TMP2=$OUTDIR/$$.tmp.stdout # Make OUTDIR mkdir -p $OUTDIR #FIXME: make samtools independent of absolute build path. SAMTOOLS=/mnt/secondary/Smrtpipe/builds/Internal_Mainline_Nightly_LastSuccessfulBuild/analysis/bin/samtools #Update date UPDATEDATE=2015_11_09 # 2014_08_21 --> change 138516: added YS, YE, ZM tags # 2014_08_28 --> change 139176: Update SAM MD5 # 2015_03_28 --> change 148101: 148080 update read group id, 148100 update TLEN. # 2015_04_09 --> change 148796: update read group id # 2015_04_25 --> change 149721, update CIGAR string, replace M with X= # 2015_04_26 --> change 149749, add opiton -cigarUseSeqMatch (default: false). If -cigarUseSeqMatch is turned on, CIGAR strings use '=' and 'X' to represent sequence match and mismatch instead of 'M'. # 2015_11_05 --> change 166177, update CIGAR string, DO NOT allow adjacent indels unless -allowAdjacentIndels is ON. blasr-smrtanalysis-4.0.0/ctest/unaligned.t000066400000000000000000000024211302464523700206270ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test dataset.xml as input $ $EXEC $DATDIR/test_dataset/chunking.subreadset.xml $DATDIR/ecoli_reference.fasta --unaligned $OUTDIR/unaligned.txt --noPrintUnalignedSeqs --concordant 1>/dev/null && echo $? 
[INFO]* (glob) [INFO]* (glob) 0 $ head -5 $OUTDIR/unaligned.txt m150404_101626_42267_c100807920800000001823174110291514_s1_p0/55/0_1380 m150404_101626_42267_c100807920800000001823174110291514_s1_p0/55/1432_3136 m150404_101626_42267_c100807920800000001823174110291514_s1_p0/480/11699_11988 m150404_101626_42267_c100807920800000001823174110291514_s1_p0/480/12033_13456 m150404_101626_42267_c100807920800000001823174110291514_s1_p0/480/13519_14067 $ $EXEC $DATDIR/ecoli_subset.fasta $DATDIR/ecoli_reference.fasta --unaligned $OUTDIR/unaligned.txt --noPrintUnalignedSeqs 1>/dev/null && echo $? [INFO]* (glob) [INFO]* (glob) 0 $ head -5 $OUTDIR/unaligned.txt m121004_000921_42130_c100440700060000001523060402151341_s1_p0/8/2724_3021 m121004_000921_42130_c100440700060000001523060402151341_s1_p0/13/0_278 m121004_000921_42130_c100440700060000001523060402151341_s1_p0/13/327_954 m121004_000921_42130_c100440700060000001523060402151341_s1_p0/13/1004_1580 m121004_000921_42130_c100440700060000001523060402151341_s1_p0/13/1625_2202 blasr-smrtanalysis-4.0.0/ctest/useccsallBestN1.t000066400000000000000000000007321302464523700216570ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test --useccsall with bestn = 1 $ $EXEC $DATDIR/ccstest.fofn $DATDIR/ccstest_ref.fasta --bestn 1 --useccsall --bam --out $OUTDIR/useccsall.bam --holeNumbers 76772 [INFO]* (glob) [INFO]* (glob) $ $SAMTOOLS view -h $OUTDIR/useccsall.bam > $OUTDIR/useccsall.sam $ sed -n '9,$ p' $OUTDIR/useccsall.sam |cut -f 1-4 > $TMP1 $ sed -n '9,$ p' $STDDIR/$UPDATEDATE/useccsall.sam | cut -f 1-4 > $TMP2 $ diff $TMP1 $TMP2 $ rm $TMP1 $TMP2 blasr-smrtanalysis-4.0.0/ctest/useccsallLargeGenome.t000066400000000000000000000011251302464523700227450ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test --useccsall with Large genome. 
$ BASFILE=/mnt/data3/vol53/2450530/0014/Analysis_Results/m130507_052228_42161_c100519212550000001823079909281305_s1_p0.3.bax.h5 $ REFDIR=/mnt/secondary/Smrtpipe/repository/hg19_M_sorted/sequence $ REFFA=$REFDIR/hg19_M_sorted.fasta $ REFSA=$REFDIR/hg19_M_sorted.fasta.sa $ OUTFILE=$OUTDIR/intflow.m4 $ $EXEC $BASFILE $REFFA --out $OUTFILE -m 4 --sa $REFSA --holeNumbers 109020 [INFO]* (glob) [INFO]* (glob) $ sort $OUTFILE > $TMP1 && sort $STDDIR/intflow_2014_06_10.m4 > $TMP2 && diff $TMP1 $TMP2 && echo $? 0 blasr-smrtanalysis-4.0.0/ctest/verbose.t000066400000000000000000000003051302464523700203250ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test alignment score $ $EXEC $DATDIR/lambda_bax.fofn $DATDIR/lambda_ref.fasta --holeNumbers 1--200 -V 3 > $TMP1 [INFO]* (glob) [INFO]* (glob) $ echo $? 0 blasr-smrtanalysis-4.0.0/extrautils/000077500000000000000000000000001302464523700175575ustar00rootroot00000000000000blasr-smrtanalysis-4.0.0/extrautils/BuildSequenceDB.cpp000066400000000000000000000030761302464523700232270ustar00rootroot00000000000000#include #include "utils.hpp" #include "metagenome/SequenceIndexDatabase.hpp" #include "CommandLineParser.hpp" #include "FASTAReader.hpp" #include "utils/FileOfFileNames.hpp" using namespace std; int main(int argc, char* argv[]) { CommandLineParser clp; string fastaFileName, indexFileName; vector fastaFileNames; vector opts; clp.SetProgramName("bsdb"); clp.SetProgramSummary("Build an index database on a file of sequences.\n" " The index is used to map to reads given alignment positions.\n"); clp.RegisterStringOption("fasta", &fastaFileName, "A file with sequences to build an index."); clp.RegisterStringOption("index", &indexFileName, "The index file."); clp.RegisterPreviousFlagsAsHidden(); clp.ParseCommandLine(argc, argv, opts); ifstream fastaIn; ofstream indexOut; if (FileOfFileNames::IsFOFN(fastaFileName)) { FileOfFileNames::FOFNToList(fastaFileName, fastaFileNames); } else { fastaFileNames.push_back(fastaFileName); } 
CrucialOpen(indexFileName, indexOut, std::ios::out | std::ios::binary); SequenceIndexDatabase seqDB; int fileNameIndex; for (fileNameIndex = 0; fileNameIndex < fastaFileNames.size(); fileNameIndex++){ FASTAReader reader; FASTASequence seq; reader.Init(fastaFileNames[fileNameIndex]); int i = 0; while (reader.GetNext(seq)) { seqDB.AddSequence(seq); i++; } } seqDB.Finalize(); seqDB.WriteDatabase(indexOut); return 0; } blasr-smrtanalysis-4.0.0/extrautils/BwtToSuffixArray.cpp000066400000000000000000000013141302464523700235050ustar00rootroot00000000000000#include "bwt/BWT.hpp" #include "suffixarray/SuffixArray.hpp" #include "suffixarray/SuffixArrayTypes.hpp" #include #include #include using namespace std; int main(int argc, char* argv[]) { string bwtFileName, saFileName; if (argc < 3) { cout << "usage: bwt2sa bwtfile safile " << endl; exit(1); } bwtFileName = argv[1]; saFileName = argv[2]; Bwt bwt; DNASuffixArray suffixArray; bwt.Read(bwtFileName); suffixArray.AllocateSuffixArray(bwt.bwtSequence.length-1); SAIndex index; for (index = 1; index < bwt.bwtSequence.length+1; index++) { suffixArray.index[index-1] = bwt.Locate(index); } suffixArray.Write(saFileName); } blasr-smrtanalysis-4.0.0/extrautils/Evolve.cpp000066400000000000000000000155031302464523700215270ustar00rootroot00000000000000#include #include #include "utils.hpp" #include "FASTAReader.hpp" #include "FASTASequence.hpp" #include "CommandLineParser.hpp" #include "statistics/StatUtils.hpp" using namespace std; /* ref000001 . SNV 9454 9454 0.00 . . reference=C;confidence=0;Name=9454C>A;coverage=0;variantseq=A ref000001 . deletion 20223 20223 0.00 . . reference=T;length=1;confidence=0;coverage=0;Name=20222delT ref000001 . insertion 35089 35089 0.00 . . 
confidence=0;Name=35089_35090insC;reference=.;length=1;coverage=0;variantseq=C */ char ToLower(char c, bool useToLower) { if (useToLower) { return tolower(c); } else { return toupper(c); } } int main(int argc, char* argv[]) { CommandLineParser clp; string refGenomeName; string mutGenomeName; string gffFileName; float insRate = 0; float delRate = 0; float mutRate = 0; bool lower = false; gffFileName = ""; clp.RegisterStringOption("refGenome", &refGenomeName, "Reference genome.", true); clp.RegisterStringOption("mutGenome", &mutGenomeName, "Mutated genome.", true); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterStringOption("gff", &gffFileName, "GFF file describing the modifications made to the genome."); clp.RegisterFloatOption("i", &insRate, "Insertion rate: (0-1].", CommandLineParser::NonNegativeFloat, false); clp.RegisterFloatOption("d", &delRate, "Deletion rate: (0-1]", CommandLineParser::NonNegativeFloat, false); clp.RegisterFloatOption("m", &mutRate, "Mutation rate, even across all nucleotides: (0-1]", CommandLineParser::NonNegativeFloat, false); clp.RegisterFlagOption("lower", &lower, "Make mutations in lower case", false); vector leftovers; clp.ParseCommandLine(argc, argv, leftovers); FASTAReader reader; FASTASequence refGenome; reader.Init(refGenomeName); ofstream mutGenomeOut; CrucialOpen(mutGenomeName, mutGenomeOut, std::ios::out); ofstream gffOut; if (gffFileName != "") { CrucialOpen(gffFileName, gffOut, std::ios::out); } vector insIndices, delIndices, subIndices; int readIndex = 0; InitializeRandomGeneratorWithTime(); while (reader.GetNext(refGenome)) { insIndices.resize(refGenome.length); delIndices.resize(refGenome.length); subIndices.resize(refGenome.length); std::fill(insIndices.begin(), insIndices.end(), false); std::fill(delIndices.begin(), delIndices.end(), false); std::fill(subIndices.begin(), subIndices.end(), 0); enum ChangeType { Ins, Del, Mut, None}; float changeProb[4]; changeProb[Ins] = insRate; changeProb[Del] = changeProb[Ins] + 
delRate; changeProb[Mut] = changeProb[Del] + mutRate; changeProb[None] = 1; if (changeProb[Mut] > 1) { cout << "ERROR! The sum of the error probabilities must be less than 1" << endl; exit(1); } DNALength pos; float randomNumber; int numIns = 0; int numDel = 0; int numMut = 0; for (pos =0 ; pos < refGenome.length; pos++) { randomNumber = Random(); if (randomNumber < changeProb[Ins]) { insIndices[pos] = true; numIns++; } else if (randomNumber < changeProb[Del]) { delIndices[pos] = true; numDel++; } else if (randomNumber < changeProb[Mut]){ Nucleotide newNuc = TwoBitToAscii[RandomInt(4)]; int maxIts = 100000; int it = 0; while (newNuc == refGenome.seq[pos]) { newNuc = TwoBitToAscii[RandomInt(4)]; if (it == maxIts) { cout << "ERROR, something is wrong with the random number generation, it took too many tries to generate a new nucleotide" << endl; exit(1); } } subIndices[pos] = refGenome[pos]; refGenome.seq[pos] = ToLower(newNuc,lower); ++numMut; } } // cout << readIndex << " m " << numMut << " i " << numIns << " d " << numDel << endl; if (readIndex % 100000 == 0 && readIndex > 0) { cout << readIndex << endl; } // // Now add the insertions and deletions. // FASTASequence newSequence; DNALength newPos; if (numIns - numDel + refGenome.length < 0) { cout << "ERROR, the genome has been deleted to nothing." << endl; exit(1); } ResizeSequence(newSequence, refGenome.length + (numIns - numDel)); newPos = 0; pos = 0; for (pos = 0; pos < refGenome.length; pos++) { assert(newPos < newSequence.length or delIndices[pos] == true); if (subIndices[pos] != 0 and gffFileName != "") { gffOut << refGenome.GetName() << " . SNV " << newPos << " " << newPos <<" 0.00 . . 
reference=" << (char)subIndices[pos] << ";confidence=10;Name=" << newPos << (char)subIndices[pos] << ">" << refGenome.seq[pos] <<";coverage=10;variantseq=" << refGenome.seq[pos] << endl; } if (insIndices[pos] == true) { newSequence.seq[newPos] = ToLower(TwoBitToAscii[RandomInt(4)], lower); newPos++; newSequence.seq[newPos] = refGenome.seq[pos]; assert(newSequence.seq[newPos] != '1'); assert(newSequence.seq[newPos] != 1); if (gffFileName != "") { gffOut << refGenome.GetName() << " . deletion " << newPos << " " << newPos << " 0.00 . . reference=" << newSequence.seq[newPos] << ";length=1;confidence=10;coverage=0;Name="<< newPos << "del" << newSequence.seq[newPos] << endl; } newPos++; } else if (delIndices[pos] == true) { // no-op, skip if (gffFileName != "") { gffOut << refGenome.GetName() << " . insertion " << newPos << " " << newPos << " 0.00 . . confidence=10;Name=" << newPos << "_ins" << refGenome.seq[pos] << ";reference=.;length=1;coverage=0;variantseq=" << refGenome.seq[newPos] << endl; //ref000001 . deletion 20223 20223 0.00 . . 
reference=T;length=1;confidence=0;coverage=0;Name=20222delT } } else { newSequence.seq[newPos] = refGenome.seq[pos]; newPos++; } } stringstream titlestrm; titlestrm << " mutated ins " << insRate << " del " << delRate << " mut " << mutRate; newSequence.CopyTitle(refGenome.title); newSequence.AppendToTitle(titlestrm.str()); newSequence.PrintSeq(mutGenomeOut); newSequence.Free(); readIndex++; } } blasr-smrtanalysis-4.0.0/extrautils/ExciseRepeats.cpp000066400000000000000000000031061302464523700230270ustar00rootroot00000000000000#include #include "FASTAReader.hpp" #include "FASTASequence.hpp" #include "utils.hpp" using namespace std; int main(int argc, char* argv[]) { string seqInName, seqOutName, dotOutName; if (argc < 4) { cout << "usage: exciseRepeats inName repMaskOutFile outName" << endl; exit(1); } seqInName = argv[1]; dotOutName = argv[2]; seqOutName = argv[3]; FASTAReader reader; reader.Initialize(seqInName); FASTASequence origSeq; reader.GetNext(origSeq); ifstream dotOutFile; CrucialOpen(dotOutName, dotOutFile); ofstream seqOutFile; ofstream seqOut; CrucialOpen(seqOutName, seqOut, std::ios::out); string dotOutLine; getline(dotOutFile, dotOutLine); getline(dotOutFile, dotOutLine); getline(dotOutFile, dotOutLine); while(getline(dotOutFile, dotOutLine)) { stringstream lineStrm(dotOutLine); int swScore; float pctDiv, pctDel, pctIns; string query; int qPosBegin, qPosEnd; string left; char strand; string matchingRepeat; string repClass; string repPos, repEnd, repLeft; int id; lineStrm >> swScore >> pctDiv >> pctDel >> pctIns >> query >> qPosBegin >> qPosEnd >> left >> strand >> matchingRepeat >> repClass >> repPos >> repEnd >> repLeft >> id; DNALength seqPos; for (seqPos = qPosBegin; seqPos < qPosEnd; seqPos++) { origSeq.seq[seqPos] = 'X'; } } DNALength seqPos, unexPos; unexPos = 0; for (seqPos = 0; seqPos < origSeq.length; seqPos++) { if (origSeq.seq[seqPos] != 'X') { origSeq.seq[unexPos] = origSeq.seq[seqPos]; unexPos++; } } origSeq.length = unexPos; 
origSeq.PrintSeq(seqOut); return 0; } blasr-smrtanalysis-4.0.0/extrautils/PrintTupleCountTable.cpp000066400000000000000000000036501302464523700243560ustar00rootroot00000000000000#include #include #include #include #include "utils.hpp" #include "FASTASequence.hpp" #include "FASTAReader.hpp" #include "CommandLineParser.hpp" #include "tuples/DNATuple.hpp" #include "tuples/CompressedDNATuple.hpp" #include "tuples/TupleMetrics.hpp" #include "tuples/TupleCountTable.hpp" #ifdef COMPRESSED typedef TupleCountTable > CountTable; #else typedef TupleCountTable CountTable; #endif int main(int argc, char* argv[]) { CommandLineParser clp; string tableFileName; vector sequenceFiles; TupleMetrics tm; int tupleSize = 8; clp.SetProgramName("printTupleCountTable"); clp.SetProgramSummary("Count the number of occurrences of every k-mer in a file."); clp.RegisterStringOption("table", &tableFileName, "Output table name.", true); clp.RegisterIntOption("wordsize", &tupleSize, "Size of words to count", CommandLineParser::NonNegativeInteger, false); clp.RegisterStringListOption("reads", &sequenceFiles, "All sequences.", false); clp.RegisterPreviousFlagsAsHidden(); vector opts; if (argc == 2) { string fastaFileName = argv[1]; sequenceFiles.push_back(fastaFileName); tableFileName = fastaFileName + ".ctab"; } else { clp.ParseCommandLine(argc, argv, opts); } tm.tupleSize = tupleSize; tm.InitializeMask(); ofstream tableOut; CrucialOpen(tableFileName, tableOut, std::ios::out| std::ios::binary); CountTable table; table.InitCountTable(tm); int i; FASTASequence seq; for (i = 0; i < sequenceFiles.size(); i++ ){ FASTAReader reader; reader.Init(sequenceFiles[i]); while (reader.GetNext(seq)) { seq.ToUpper(); table.AddSequenceTupleCountsLR(seq); } } table.Write(tableOut); return 0; } blasr-smrtanalysis-4.0.0/extrautils/SALS.cpp000066400000000000000000000017431302464523700210320ustar00rootroot00000000000000#include #include #include "utils.hpp" #include "suffixarray/SuffixArray.hpp" #include 
"suffixarray/SuffixArrayTypes.hpp" using namespace std; int main(int argc, char* argv[]) { if (argc <= 1) { cout << "sals checks if a suffix array has lookup table or not." < #include #include "NucConversion.hpp" #include "FASTASequence.hpp" #include "FASTAReader.hpp" #include "suffixarray/SuffixArray.hpp" #include "suffixarray/SuffixArrayTypes.hpp" #include "suffixarray/ssort.hpp" #include "algorithms/sorting/qsufsort.hpp" void PrintUsage() { cout << "samodify changes word size of input suffix array." << endl; cout << "Usage: samodify in.sa genome.fasta out.sa [-blt p]" << endl; cout << " -blt p Build a lookup table on prefixes of length 'p' " << endl; } int main(int argc, char* argv[]) { if (argc < 4) { PrintUsage(); exit(1); } int argi = 1; string saInFile = argv[argi++]; string genomeFileName = argv[argi++]; string saOutFile = argv[argi++]; vector inFiles; int doBLT = 0; int doBLCP = 0; int bltPrefixLength = 0; int lcpLength = 0; int parsingOptions = 0; while (argi < argc) { if (strcmp(argv[argi], "-blt") == 0) { doBLT = 1; bltPrefixLength = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-blcp") == 0) { doBLCP = 1; lcpLength = atoi(argv[++argi]); } else { PrintUsage(); cout << "Bad option: " << argv[argi] << endl; exit(1); } ++argi; } // // Read the suffix array to modify. // DNASuffixArray sa; sa.Read(saInFile); FASTAReader reader; reader.Initialize(genomeFileName); FASTASequence seq; reader.ReadAllSequencesIntoOne(seq); if (doBLT) { sa.BuildLookupTable(seq.seq, seq.length, bltPrefixLength); } if (doBLCP) { cout << "LCP Table not yet implemented." 
<< endl; } sa.Write(saOutFile); } blasr-smrtanalysis-4.0.0/extrautils/SWMatcher.cpp000066400000000000000000000130521302464523700221210ustar00rootroot00000000000000#include #include #include #include #include "FASTAReader.hpp" #include "FASTASequence.hpp" #include "algorithms/alignment/AlignmentUtils.hpp" #include "algorithms/alignment/DistanceMatrixScoreFunction.hpp" #include "algorithms/alignment/IDSScoreFunction.hpp" #include "algorithms/alignment/SWAlign.hpp" #include "format/StickAlignmentPrinter.hpp" using namespace std; int main(int argc, char* argv[]) { if (argc < 3) { cout << "usage: swMatcher query target [-indel i] [-local] [-showalign] " << endl << " [-type queryfit|overlap|global] [-match m ] [-mismatch m]" << endl << " or [-local] [-queryfit] [-overlap] [-fixedtarget] [-fixedquery]" << endl << " [-printmatrix]"<< endl << " Unless -showalign is specified, output is tabular and in the formt:"< scoreFn( SMRTDistanceMatrix, insertion, deletion); FASTASequence query, target; FASTAReader queryReader, targetReader; queryReader.Init(queryName); targetReader.Init(targetName); if (fixedTarget) { targetReader.GetNext(target); } if (fixedQuery) { queryReader.GetNext(query); } // // Prepare the target database; // // // Prepare the query match set. 
// int seqIndex = 0; vector scoreMat; vector pathMat; int alignScore; MatchedAlignment alignment; if (match != 0) { int i; for (i = 0; i < 4; i++ ) { LocalAlignLowMutationMatrix[i][i] = match; } } int i,j; for (i = 0; i < 5; i++) { for (j = 0; j < 5 ; j++) { if (i == j) continue; SMRTDistanceMatrix[i][j] += 3; } } cout << "qlen tlen score" << endl; while ((fixedQuery or queryReader.GetNext(query)) and (fixedTarget or targetReader.GetNext(target))) { alignment.qName.assign(query.title, query.titleLength); alignment.tName.assign(target.title, target.titleLength); alignment.blocks.clear(); alignment.qPos = 0; alignment.tPos = 0; alignment.qStart = 0; alignment.tStart = 0; if (query.length == 0 or target.length == 0) continue; alignScore = SWAlign(query, target, scoreMat, pathMat, alignment, scoreFn, alignType, false, printMatrix); cout << query.length << " " << target.length << " " << alignScore << endl; cout << alignment.qPos << " " << alignment.QEnd() << " " << alignment.tPos << " " << alignment.TEnd() << endl; if (showAlign) { ComputeAlignmentStats(alignment, query.seq, target.seq, scoreFn); //SMRTDistanceMatrix, indelCost, indelCost); PrintAlignmentStats(alignment, cout); StickPrintAlignment(alignment, query, target, cout); } ++seqIndex; } return 0; } blasr-smrtanalysis-4.0.0/extrautils/SimpleShredder.cpp000066400000000000000000000133651302464523700232050ustar00rootroot00000000000000#include #include #include "utils.hpp" #include "FASTAReader.hpp" #include "FASTQSequence.hpp" #include "FASTASequence.hpp" #include "CommandLineParser.hpp" #include "metagenome/FindRandomSequence.hpp" #include "statistics/StatUtils.hpp" using namespace std; int main(int argc, char* argv[]) { string inFileName, readsFileName; DNALength readLength; float coverage = 0; bool noRandInit = false; int numReads = -1; CommandLineParser clp; int qualityValue = 20; bool printFastq = false; int stratify = 0; string titleType = "pacbio"; string fastqType = "illumina"; // or "sanger" 
clp.RegisterStringOption("inFile", &inFileName, "Reference sequence", 0); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterIntOption("readLength", (int*) &readLength, "The length of reads to simulate. The length is fixed.", CommandLineParser::PositiveInteger, 0); clp.RegisterFloatOption("coverage", &coverage, "Total coverage (from which the number of reads is calculated", CommandLineParser::PositiveFloat, 0); clp.RegisterFlagOption("nonRandInit", &noRandInit, "Skip initializing the random number generator with time."); clp.RegisterIntOption("nReads", &numReads, "Total number of reads (from which coverage is calculated)", CommandLineParser::PositiveInteger, 0); clp.RegisterStringOption("readsFile", &readsFileName, "Reads output file", 0); clp.RegisterFlagOption("fastq", &printFastq, "Fake fastq output with constant quality value (20)"); clp.RegisterIntOption("quality", &qualityValue, "Value to use for fastq quality", CommandLineParser::PositiveInteger); clp.RegisterIntOption("stratify", &stratify, "Sample a read every 'stratify' bases, rather than randomly.", CommandLineParser::PositiveInteger); clp.RegisterStringOption("titleType", &titleType, "Set the name of the title: 'pacbio'|'illumina'"); clp.RegisterStringOption("fastqType", &fastqType, "Set the type of fastq: 'illumina'|'sanger'"); vector leftovers; clp.ParseCommandLine(argc, argv, leftovers); if (!noRandInit) { InitializeRandomGeneratorWithTime(); } FASTAReader inReader; inReader.Init(inFileName); vector reference; inReader.ReadAllSequences(reference); ofstream readsFile; if (readsFileName == "") { cout << "ERROR. You must specify a reads file." 
<< endl; exit(1); } CrucialOpen(readsFileName, readsFile, std::ios::out); ofstream sangerFastqFile; if (fastqType == "sanger") { string sangerFastqFileName = readsFileName + ".fastq"; CrucialOpen(sangerFastqFileName, sangerFastqFile, std::ios::out); } DNALength refLength = 0; int i; for (i = 0; i < reference.size(); i++) { refLength += reference[i].length; } if (numReads == -1 and coverage == 0 and stratify == 0) { cout << "ERROR, you must specify either coverage, nReads, or stratify." << endl; exit(1); } else if (numReads == -1) { numReads = (refLength / readLength) * coverage; } if (stratify) { if (!readLength) { cout << "ERROR. If you are using stratification, a read length must be specified." << endl; exit(1); } } DNASequence sampleSeq; sampleSeq.length = readLength; int maxRetry = 10000000; int retryNumber = 0; DNALength seqIndex, seqPos; if (stratify) { seqIndex = 0; seqPos = 0; } DNALength origReadLength = readLength; for (i = 0; stratify or i < numReads; i++) { if (stratify == 0) { FindRandomPos(reference, seqIndex, seqPos, readLength ); } else { // // find the next start pos, or bail if done // if (seqPos >= reference[seqIndex].length) { if (seqIndex == reference.size() - 1) { break; } else { seqIndex = seqIndex + 1; seqPos = 0; continue; } } readLength = min(reference[seqIndex].length - seqPos, origReadLength); } sampleSeq.seq = &reference[seqIndex].seq[seqPos]; int j; int gappedRead = 0; string title; stringstream titleStrm; if (titleType == "pacbio") { titleStrm << i << "|"<< reference[seqIndex].GetName() << "|" << seqPos << "|" << seqPos + readLength; } else if (titleType == "illumina") { titleStrm << "SE_" << i << "_0@" << seqPos << "-"<" << title << endl; sampleSeq.PrintSeq(readsFile); } else { FASTQSequence fastqSampleSeq; fastqSampleSeq.CopyTitle(title); fastqSampleSeq.seq = sampleSeq.seq; fastqSampleSeq.length = sampleSeq.length; fastqSampleSeq.qual.data = new unsigned char[sampleSeq.length]; fill(fastqSampleSeq.qual.data, fastqSampleSeq.qual.data 
/// An alignment score paired with the length of the subread it belongs to.
///
/// Instances order by score alone; length does not take part in comparison.
class ScoredLength {
public:
  int score, length;

  /// Strict ordering by score. Returns a bool, which still converts to int
  /// for any caller that stored the previous int result.
  bool operator<(const ScoredLength &rhs) const {
    return score < rhs.score;
  }

  ScoredLength(int s, int l) : score(s), length(l) {}

  /// Zero-initialize both fields so that a default-constructed value (for
  /// example one created by std::map::operator[]) never holds indeterminate
  /// data.
  ScoredLength() : score(0), length(0) {}
};
<< endl << " -onlyMaxLength" < maxLengthMap; OutputSampleListSet samples(contextLength); SMRTSequence read; ofstream sampleOut; CrucialOpen(outFileName, sampleOut, std::ios::out|std::ios::binary); int fileNameIndex; int numContextsReached = 0; int numContexts = 1 << (contextLength*2); ReaderAgglomerate reader; samples.keyLength = contextLength; HDFCmpFile cmpReader; cmpReader.IncludeField("QualityValue"); cmpReader.IncludeField("DeletionQV"); cmpReader.IncludeField("InsertionQV"); cmpReader.IncludeField("SubstitutionQV"); cmpReader.IncludeField("SubstitutionTag"); cmpReader.IncludeField("DeletionTag"); cmpReader.IncludeField("PulseIndex"); cmpReader.IncludeField("WidthInFrames"); cmpReader.IncludeField("PreBaseFrames"); if (cmpReader.Initialize(cmpH5FileName, H5F_ACC_RDWR) == 0) { cout << "ERROR, could not open the cmp file." << endl; exit(1); } cout << "Reading cmp file." << endl; CmpFile cmpFile; cmpReader.ReadAlignmentDescriptions(cmpFile); cmpReader.ReadStructure(cmpFile); cout << "done reading structure."< alignmentToBaseMap; for (alignmentIndex = 0; alignmentIndex < nAlignments and !samples.Sufficient(); alignmentIndex++) { // // For ease of use, store the length of the alignment to make another model. 
// ByteAlignment alignmentArray; cmpReader.ReadAlignmentArray(alignmentIndex, alignmentArray); Alignment alignment; ByteAlignmentToAlignment(alignmentArray, alignment); string readSequence, refSequence; readSequence.resize(alignmentArray.size()); refSequence.resize(alignmentArray.size()); DNASequence readDNA, refDNA; ByteAlignmentToQueryString(&alignmentArray[0], alignmentArray.size(), &readSequence[0]); ByteAlignmentToRefString(&alignmentArray[0], alignmentArray.size(), &refSequence[0]); RemoveGaps(readSequence, readSequence); RemoveGaps(refSequence, refSequence); readDNA.seq = (Nucleotide*) readSequence.c_str(); readDNA.length = readSequence.size(); refDNA.seq = (Nucleotide*) refSequence.c_str(); refDNA.length = refSequence.size(); CmpAlignment cmpAlignment; cmpReader.ImportReadFromCmpH5(alignmentIndex, cmpAlignment, read); CreateAlignmentToSequenceMap(alignmentArray, alignmentToBaseMap); if (read.length < contextLength) { continue; } int subreadLength = (cmpFile.alnInfo.alignments[alignmentIndex].GetQueryEnd() - cmpFile.alnInfo.alignments[alignmentIndex].GetQueryStart()); if (onlyMaxLength == false) { samples.lengths.push_back(subreadLength); } else { int score = (cmpAlignment.GetNMatch() - cmpAlignment.GetNMismatch() - cmpAlignment.GetNInsertions() - cmpAlignment.GetNDeletions()); stringstream nameStrm; nameStrm << cmpAlignment.GetMovieId() << "_" << cmpAlignment.GetHoleNumber(); string nameStr = nameStrm.str(); if (maxLengthMap.find(nameStr) == maxLengthMap.end()) { maxLengthMap[nameStr] = ScoredLength(score, subreadLength); } } int sampleEnd = alignmentArray.size() - contextLength/2; int a; for (a = contextLength/2; a < sampleEnd; a++) { // Make sure the context begins on a real nucleotide. 
while (a < sampleEnd and ((RefChar[alignmentArray[a]] == ' '))) {a++;} // // Move ab back to an index where there are contextLength/2 non-gap // characters, counted by nb // int ab; //num bases int ae; //alignment end ab = a-1; int nb = 0, ne = 0; while (true) { if (RefChar[alignmentArray[ab]] != ' ') { nb++; } if (ab == 0 or nb == contextLength/2) break; ab--; } // // Advance ae to an index where there are contextLength/2 non-gap // characters, counted by ne. // ae = a + 1; while (ae < alignmentArray.size() and ne < contextLength/ 2) { if (RefChar[alignmentArray[ae]] != ' ') { ne++; } ae++; } // // Make sure there are no edge effects that prevent a context of the correct length from being assigned. // if (nb + ne + 1 != contextLength) { continue; } int ai; string context; for (ai = ab; ai < ae; ai++) { if (RefChar[alignmentArray[ai]] != ' ') { context.push_back(RefChar[alignmentArray[ai]]); } } assert(context.size() == contextLength); // // Now create the context. // OutputSample sample; // // This context is a deletion, create that. // sample.type = OutputSample::Deletion; // // This context is either an insertion or substitution // // Look to see if the previous aligned position was an // insertion, and move back as far as the insertion extends. 
int aq = a-1; int sampleLength; if (QueryChar[alignmentArray[a]] == ' ') { sample.type = OutputSample::Deletion; sampleLength = 0; } else if (RefChar[alignmentArray[aq]] == ' ') { while (aq > 0 and RefChar[alignmentArray[aq]] == ' ' and QueryChar[alignmentArray[aq]] != ' ') { aq--; } sample.type = OutputSample::Insertion; sampleLength = a - aq; } else if (QueryChar[alignmentArray[a]] == RefChar[alignmentArray[aq]]) { sample.type = OutputSample::Match; sampleLength = 1; } else { sample.type = OutputSample::Substitution; sampleLength = 1; } sample.Resize(sampleLength); if (sampleLength > 0) { int seqPos = alignmentToBaseMap[aq]; if (seqPos < read.length) { sample.CopyFromSeq(read, seqPos, sampleLength); string nucs; int n; for (n = 0; n < sample.nucleotides.size(); n++) { char c = sample.nucleotides[n]; assert(c == 'A' or c == 'T' or c == 'G' or c == 'C'); nucs.push_back(sample.nucleotides[n]); } } } samples.AppendOutputSample(context, sample); } read.Free(); } if (onlyMaxLength) { map::iterator maxScoreIt; for (maxScoreIt = maxLengthMap.begin(); maxScoreIt != maxLengthMap.end(); ++maxScoreIt) { cout << maxScoreIt->second.length << endl; samples.lengths.push_back(maxScoreIt->second.length); } } samples.Write(sampleOut); return 0; } blasr-smrtanalysis-4.0.0/extrautils/SuffixArrayToBWT.cpp000066400000000000000000000020771302464523700234140ustar00rootroot00000000000000#include "FASTASequence.hpp" #include "FASTAReader.hpp" #include "suffixarray/SuffixArrayTypes.hpp" #include "suffixarray/SuffixArray.hpp" #include "bwt/BWT.hpp" #include #include using namespace std; int main(int argc, char* argv[]) { if (argc < 4) { cout << "usage: sa2bwt genomeFileName suffixArray bwt [-debug]" << endl; exit(1); } string genomeFileName = argv[1]; string suffixArrayFileName = argv[2]; string bwtFileName = argv[3]; int storeDebugInformation = 0; int argi = 4; while(argi < argc) { if (strcmp(argv[argi], "-debug") == 0) { storeDebugInformation = 1; } ++argi; } ofstream bwtOutFile; 
CrucialOpen(bwtFileName, bwtOutFile, std::ios::out|std::ios::binary); FASTAReader reader; reader.Init(genomeFileName); FASTASequence seq; reader.ReadAllSequencesIntoOne(seq); DNASuffixArray suffixArray; suffixArray.Read(suffixArrayFileName); Bwt bwt; bwt.InitializeFromSuffixArray(seq, suffixArray.index, storeDebugInformation ); bwt.Write(bwtOutFile); return 0; } blasr-smrtanalysis-4.0.0/extrautils/ctest/000077500000000000000000000000001302464523700207015ustar00rootroot00000000000000blasr-smrtanalysis-4.0.0/extrautils/ctest/bwt2sa.t000066400000000000000000000006611302464523700222730ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Noticed that pipeline fa->sa->bwt works OK, however fa->sa->bwt->sa does not generate identical suffix array. Set up the executable: bwt2sa. $ EXEC=$TESTDIR/../bwt2sa Define tmporary files $ TMP1=$OUTDIR/$$.tmp.out $ TMP2=$OUTDIR/$$.tmp.stdout Make OUTDIR $ mkdir -p $OUTDIR $ SA=$OUTDIR/ecoli_reference.bwt2sa.sa $ BWT=$DATDIR/ecoli_reference.bwt $ $EXEC $BWT $SA $ echo $? 0 blasr-smrtanalysis-4.0.0/extrautils/ctest/cmpH5StoreQualityByContext.t000066400000000000000000000006131302464523700262700ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Set up the executable: cmpH5StoreQualityByContext. $ EXEC=$TESTDIR/../cmpH5StoreQualityByContext Define tmporary files $ TMP1=$OUTDIR/$$.tmp.out $ TMP2=$OUTDIR/$$.tmp.stdout Make OUTDIR $ mkdir -p $OUTDIR $ $EXEC $DATDIR/ecoli_out.cmp.h5 $OUTDIR/ecoli_out.qbc -contextLength 8 -onlyMaxLength -minSamples 600 -maxSamples 1500 > $TMP1 $ echo $? 0 blasr-smrtanalysis-4.0.0/extrautils/ctest/printTupleCountTable.t000066400000000000000000000006351302464523700252210ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Set up the executable: printTupleCountTable. $ EXEC=$TESTDIR/../printTupleCountTable Define tmporary files $ TMP1=$OUTDIR/$$.tmp.out $ TMP2=$OUTDIR/$$.tmp.stdout Make OUTDIR $ mkdir -p $OUTDIR $ $EXEC $OUTDIR/ecoli_tuple.table 8 $DATDIR/ecoli_reference.fasta $ echo $? 
0 $ md5sum $OUTDIR/ecoli_tuple.table |cut -f 1 -d ' ' 3f1ae70fd009827d6d6e56050341b5df blasr-smrtanalysis-4.0.0/extrautils/ctest/sa2bwt.t000066400000000000000000000005441302464523700222730ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Set up the executable: sa2bwt. $ EXEC=$TESTDIR/../sa2bwt Define tmporary files $ TMP1=$OUTDIR/$$.tmp.out $ TMP2=$OUTDIR/$$.tmp.stdout Make OUTDIR $ mkdir -p $OUTDIR $ FA=$DATDIR/ecoli_reference.fasta $ SA=$DATDIR/ecoli_reference.sa $ BWT=$OUTDIR/ecoli_reference.bwt $ $EXEC $FA $SA $BWT $ echo $? 0 blasr-smrtanalysis-4.0.0/extrautils/ctest/sals.t000066400000000000000000000005051302464523700220300ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Set up the executable: sals. $ EXEC=$TESTDIR/../sals Define tmporary files $ TMP1=$OUTDIR/$$.tmp.out $ TMP2=$OUTDIR/$$.tmp.stdout Make OUTDIR $ mkdir -p $OUTDIR $ $EXEC $DATDIR/ecoli_reference.sa * has a suffix array. * has a lookup table for word size. 8 $ echo $? 0 blasr-smrtanalysis-4.0.0/extrautils/ctest/samodify.t000066400000000000000000000006641302464523700227070ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Set up the executable: samodify. $ EXEC=$TESTDIR/../samodify Define tmporary files $ TMP1=$OUTDIR/$$.tmp.out $ TMP2=$OUTDIR/$$.tmp.stdout Make OUTDIR $ mkdir -p $OUTDIR $ $EXEC $DATDIR/ecoli_reference.sa $DATDIR/ecoli_reference.fasta $OUTDIR/ecoli_reference_blt13.sa -blt 13 $ echo $? 
# Build file for the BLASR extra utilities.
#
# `all`, `cramtests` and `clean` name actions, not files, so they are
# declared phony. The previous `.PHONY=all cramtests` was a plain variable
# assignment and therefore declared nothing.
.PHONY: all cramtests clean

SRCDIR:=$(dir $(realpath $(lastword $(MAKEFILE_LIST))))
-include ${CURDIR}/../defines.mk
include ${SRCDIR}/../rules.mk

CXXOPTS := -std=c++0x -pedantic \
           -Wall -Wextra -Wno-overloaded-virtual \
           -MMD -MP
CXXFLAGS += ${CXXOPTS} ${GCXXFLAGS}

EXE = sa2bwt bwt2sa excrep evolve bsdb simpleShredder swMatcher \
      samodify sals printTupleCountTable cmpH5StoreQualityByContext

# Library search path exported to the recipes below.
LD_LIBRARY_PATH=${HDF5_LIB}:${LIBBLASR_LIB}:${LIBPBIHDF_LIB}:${LIBPBDATA_LIB}
export LD_LIBRARY_PATH

vpath %.cpp ${SRCDIR}

all: ${EXE}

# Shared link recipe; each executable's object file is listed below and is
# picked up here as the first prerequisite ($<).
${EXE}:
	${CXX} -o $@ $< ${CXXFLAGS} ${CPPFLAGS} -MF"${@:%=%.d}" ${STATIC} ${LDFLAGS} ${LDLIBS}

sa2bwt: SuffixArrayToBWT.o
bwt2sa: BwtToSuffixArray.o
excrep: ExciseRepeats.o
evolve: Evolve.o
bsdb: BuildSequenceDB.o
simpleShredder: SimpleShredder.o
swMatcher: SWMatcher.o
samodify: SAModify.o
sals: SALS.o
printTupleCountTable: PrintTupleCountTable.o
cmpH5StoreQualityByContext: StoreQualityByContextFromCmpH5.o

CTESTS := \
    ctest/printTupleCountTable.t ctest/sals.t ctest/swmatcher.t \
    ctest/bwt2sa.t ctest/cmpH5StoreQualityByContext.t ctest/sa2bwt.t ctest/samodify.t

# Run the cram regression tests against the freshly built executables.
cramtests: ${EXE}
	cram -v --shell=/bin/bash ${CTESTS}

clean:
	@rm -f ${EXE}
	@rm -f *.d *.o
Update alignment->tAlignedSeqPos, // alignment->tAlignedSeqLength and alignment->tAlignedSeq. void FlankTAlignedSeq(T_AlignmentCandidate * alignment, SequenceIndexDatabase &seqdb, DNASequence & genome, int flankSize); // Align a subread of a SMRT sequence to target sequence of an alignment. // Input: // subread - a subread of a SMRT sequence. // unrolledRead - the full SMRT sequence. // alignment - an alignment. // passDirection - whether or not the subread has the // same direction as query of the alignment. // 0 = true, 1 = false. // subreadInterval - [start, end) interval of the subread in the // SMRT read. // subreadIndex - index of the subread in allReadAlignments. // params - mapping paramters. // Output: // allReadAlignments - where the sequence and alignments of the // subread are saved. // threadOut - an out stream for debugging the current thread. void AlignSubreadToAlignmentTarget(ReadAlignments & allReadAlignments, SMRTSequence & subread, SMRTSequence & unrolledRead, T_AlignmentCandidate * alignment, int passDirection, ReadInterval & subreadInterval, int subreadIndex, MappingParameters & params, MappingBuffers & mappingBuffers, ostream & threadOut); #include "BlasrAlignImpl.hpp" blasr-smrtanalysis-4.0.0/iblasr/BlasrAlignImpl.hpp000066400000000000000000002106021302464523700222010ustar00rootroot00000000000000// Author: Mark Chaisson #pragma once template void MapRead(T_Sequence &read, T_Sequence &readRC, T_RefSequence &genome, T_SuffixArray &sarray, BWT &bwt, SeqBoundaryFtr &seqBoundary, T_TupleCountTable &ct, SequenceIndexDatabase &seqdb, MappingParameters ¶ms, MappingMetrics &metrics, vector &alignmentPtrs, MappingBuffers &mappingBuffers, MappingIPC *mapData, MappingSemaphores & semaphores) { bool matchFound; WeightedIntervalSet topIntervals(params.nCandidates); int numKeysMatched=0, rcNumKeysMatched=0; (void)(numKeysMatched); (void)(rcNumKeysMatched); int expand = params.minExpand; metrics.clocks.total.Tick(); int forwardNumBasesMatched = 0, 
reverseNumBasesMatched = 0; do { matchFound = false; mappingBuffers.matchPosList.clear(); mappingBuffers.rcMatchPosList.clear(); alignmentPtrs.clear(); topIntervals.clear(); params.anchorParameters.expand = expand; metrics.clocks.mapToGenome.Tick(); if (params.useSuffixArray) { params.anchorParameters.lcpBoundsOutPtr = mapData->lcpBoundsOutPtr; numKeysMatched = MapReadToGenome(genome, sarray, read, params.lookupTableLength, mappingBuffers.matchPosList, params.anchorParameters); // // Only print values for the read in forward direction (and only // the first read). // mapData->lcpBoundsOutPtr = NULL; if (!params.forwardOnly) { rcNumKeysMatched = MapReadToGenome(genome, sarray, readRC, params.lookupTableLength, mappingBuffers.rcMatchPosList, params.anchorParameters); } } else if (params.useBwt){ numKeysMatched = MapReadToGenome(bwt, read, read.SubreadStart(), read.SubreadEnd(), mappingBuffers.matchPosList, params.anchorParameters, forwardNumBasesMatched); if (!params.forwardOnly) { rcNumKeysMatched = MapReadToGenome(bwt, readRC, readRC.SubreadStart(), readRC.SubreadEnd(), mappingBuffers.rcMatchPosList, params.anchorParameters, reverseNumBasesMatched); } } // // Look to see if only the anchors are printed. 
if (params.anchorFileName != "") { size_t i; if (params.nProc > 1) { #ifdef __APPLE__ sem_wait(semaphores.writer); #else sem_wait(&semaphores.writer); #endif } *mapData->anchorFilePtr << read.title << endl; for (i = 0; i < mappingBuffers.matchPosList.size(); i++) { *mapData->anchorFilePtr << mappingBuffers.matchPosList[i] << endl; } *mapData->anchorFilePtr << readRC.title << " (RC) " << endl; for (i = 0; i < mappingBuffers.rcMatchPosList.size(); i++) { *mapData->anchorFilePtr << mappingBuffers.rcMatchPosList[i] << endl; } if (params.nProc > 1) { #ifdef __APPLE__ sem_post(semaphores.writer); #else sem_post(&semaphores.writer); #endif } } metrics.totalAnchors += mappingBuffers.matchPosList.size() + mappingBuffers.rcMatchPosList.size(); metrics.clocks.mapToGenome.Tock(); metrics.clocks.sortMatchPosList.Tick(); SortMatchPosList(mappingBuffers.matchPosList); SortMatchPosList(mappingBuffers.rcMatchPosList); metrics.clocks.sortMatchPosList.Tock(); PValueWeightor lisPValue(read, genome, ct.tm, &ct); MultiplicityPValueWeightor lisPValueByWeight(genome); LISSumOfLogPWeightor > lisPValueByLogSum(genome); LISSizeWeightor > lisWeightFn; IntervalSearchParameters intervalSearchParameters; intervalSearchParameters.globalChainType = params.globalChainType; intervalSearchParameters.advanceHalf = params.advanceHalf; intervalSearchParameters.warp = params.warp; intervalSearchParameters.fastMaxInterval = params.fastMaxInterval; intervalSearchParameters.aggressiveIntervalCut = params.aggressiveIntervalCut; intervalSearchParameters.verbosity = params.verbosity; // // If specified, only align a band from the anchors. 
// DNALength squareRefLength = read.length * 1.25 + params.limsAlign; if (params.limsAlign != 0) { size_t fi; for (fi = 0; fi < mappingBuffers.matchPosList.size(); fi++) { if (mappingBuffers.matchPosList[fi].t >= squareRefLength) { break; } } if (fi < mappingBuffers.matchPosList.size()) { mappingBuffers.matchPosList.resize(fi); } } metrics.clocks.findMaxIncreasingInterval.Tick(); // // For now say that something that has a 50% chance of happening // by chance is too high of a p value. This is probably many times // the size. // intervalSearchParameters.maxPValue = log(0.5); intervalSearchParameters.aboveCategoryPValue = -300; VarianceAccumulator accumPValue; VarianceAccumulator accumWeight; VarianceAccumulator accumNBases; mappingBuffers.clusterList.Clear(); mappingBuffers.revStrandClusterList.Clear(); // // Remove anchors that are fully encompassed by longer ones. This // speeds up limstemplate a lot. // RemoveOverlappingAnchors(mappingBuffers.matchPosList); RemoveOverlappingAnchors(mappingBuffers.rcMatchPosList); if (params.pValueType == 0) { if (params.printDotPlots) { ofstream dotPlotOut; string dotPlotName = string(read.title) + ".anchors"; CrucialOpen(dotPlotName, dotPlotOut, std::ios::out); for (size_t mp = 0; mp < mappingBuffers.matchPosList.size(); mp++ ){ dotPlotOut << mappingBuffers.matchPosList[mp].q << " " << mappingBuffers.matchPosList[mp].t << " " << mappingBuffers.matchPosList[mp].l << " " << endl; } dotPlotOut.close(); } /* This is an optimization that is being tested out that places a grid over the area where there are anchors, and then finds an increasing maximally weighted path through the grid. The weight of a cell in the grid is the sum of the number of anchors in it. All other anchors are to be removed. This will likely only work for LIMSTemplate sequences, or other sequences with little structural variation. 
FindBand(mappingBuffers.matchPosList, refCopy, read, 100); */ FindMaxIncreasingInterval(Forward, mappingBuffers.matchPosList, // allow for indels to stretch out the mapping of the read. (DNALength) ((read.SubreadLength()) * (1 + params.indelRate)), params.nCandidates, seqBoundary, lisPValue,//lisPValue2, lisWeightFn, topIntervals, genome, read, intervalSearchParameters, &mappingBuffers.globalChainEndpointBuffer, mappingBuffers.clusterList, accumPValue, accumWeight, accumNBases); // Uncomment when the version of the weight functor needs the sequence. mappingBuffers.clusterList.ResetCoordinates(); FindMaxIncreasingInterval(Reverse, mappingBuffers.rcMatchPosList, (DNALength) ((read.SubreadLength()) * (1 + params.indelRate)), params.nCandidates, seqBoundary, lisPValue,//lisPValue2 lisWeightFn, topIntervals, genome, readRC, intervalSearchParameters, &mappingBuffers.globalChainEndpointBuffer, mappingBuffers.revStrandClusterList, accumPValue, accumWeight, accumNBases); } else if (params.pValueType == 1) { FindMaxIncreasingInterval(Forward, mappingBuffers.matchPosList, // allow for indels to stretch out the mapping of the read. 
(DNALength) ((read.SubreadLength()) * (1 + params.indelRate)), params.nCandidates, seqBoundary, lisPValueByWeight, // different from pvaltype == 2 and 0 lisWeightFn, topIntervals, genome, read, intervalSearchParameters, &mappingBuffers.globalChainEndpointBuffer, mappingBuffers.clusterList, accumPValue, accumWeight, accumNBases); mappingBuffers.clusterList.ResetCoordinates(); FindMaxIncreasingInterval(Reverse, mappingBuffers.rcMatchPosList, (DNALength) ((read.SubreadLength()) * (1 + params.indelRate)), params.nCandidates, seqBoundary, lisPValueByWeight, // different from pvaltype == 2 and 0 lisWeightFn, topIntervals, genome, readRC, intervalSearchParameters, &mappingBuffers.globalChainEndpointBuffer, mappingBuffers.revStrandClusterList, accumPValue, accumWeight, accumNBases); } else if (params.pValueType == 2) { FindMaxIncreasingInterval(Forward, mappingBuffers.matchPosList, // allow for indels to stretch out the mapping of the read. (DNALength) ((read.SubreadLength()) * (1 + params.indelRate)), params.nCandidates, seqBoundary, lisPValueByLogSum, // different from pvaltype == 1 and 0 lisWeightFn, topIntervals, genome, read, intervalSearchParameters, &mappingBuffers.globalChainEndpointBuffer, mappingBuffers.clusterList, accumPValue, accumWeight, accumNBases); mappingBuffers.clusterList.ResetCoordinates(); FindMaxIncreasingInterval(Reverse, mappingBuffers.rcMatchPosList, (DNALength) ((read.SubreadLength()) * (1 + params.indelRate)), params.nCandidates, seqBoundary, lisPValueByLogSum, // different from pvaltype == 1 and 0 lisWeightFn, topIntervals, genome, readRC, intervalSearchParameters, &mappingBuffers.globalChainEndpointBuffer, mappingBuffers.revStrandClusterList, accumPValue, accumWeight, accumNBases); } mappingBuffers.clusterList.numBases.insert(mappingBuffers.clusterList.numBases.end(), mappingBuffers.revStrandClusterList.numBases.begin(), mappingBuffers.revStrandClusterList.numBases.end()); 
mappingBuffers.clusterList.numAnchors.insert(mappingBuffers.clusterList.numAnchors.end(), mappingBuffers.revStrandClusterList.numAnchors.begin(), mappingBuffers.revStrandClusterList.numAnchors.end()); metrics.clocks.findMaxIncreasingInterval.Tock(); // // Print verbose output. // WeightedIntervalSet::iterator topIntIt, topIntEnd; topIntEnd = topIntervals.end(); if (params.verbosity > 0) { int topintind = 0; cout << " intv: index start end qstart qend seq_boundary_start seq_boundary_end pvalue " << endl; for (topIntIt = topIntervals.begin();topIntIt != topIntEnd ; ++topIntIt) { cout << " intv: " << topintind << " " << (*topIntIt).start << " " << (*topIntIt).end << " " << (*topIntIt).qStart << " " << (*topIntIt).qEnd << " " << seqBoundary((*topIntIt).start) << " " << seqBoundary((*topIntIt).end) << " " << (*topIntIt).pValue << endl; if (params.verbosity > 2) { for (size_t m = 0; m < (*topIntIt).matches.size(); m++) { cout << " (" << (*topIntIt).matches[m].q << ", " << (*topIntIt).matches[m].t << ", " << (*topIntIt).matches[m].l << ") "; } cout << endl; } ++topintind; } } // // Allocate candidate alignments on the stack. Each interval is aligned. // alignmentPtrs.resize(topIntervals.size()); UInt i; for (i = 0; i < alignmentPtrs.size(); i++ ) { alignmentPtrs[i] = new T_AlignmentCandidate; } metrics.clocks.alignIntervals.Tick(); AlignIntervals( genome, read, readRC, topIntervals, SMRTDistanceMatrix, params.indel, params.indel, params.sdpTupleSize, params.useSeqDB, seqdb, alignmentPtrs, params, mappingBuffers, params.startRead ); /* cout << read.title << endl; for (i = 0; i < alignmentPtrs.size(); i++) { cout << alignmentPtrs[i]->clusterScore << " " << alignmentPtrs[i]->score << endl; } */ StoreRankingStats(alignmentPtrs, accumPValue, accumWeight); std::sort(alignmentPtrs.begin(), alignmentPtrs.end(), SortAlignmentPointersByScore()); metrics.clocks.alignIntervals.Tock(); // // Evalutate the matches that are found for 'good enough'. 
// matchFound = CheckForSufficientMatch(read, alignmentPtrs, params); // // When no proper alignments are found, the loop will resume. // Delete all alignments because they are bad. // if (expand < params.maxExpand and matchFound == false) { DeleteAlignments(alignmentPtrs, 0); } // // Record some metrics that show how long this took to run per base. // if (alignmentPtrs.size() > 0) { metrics.RecordNumAlignedBases(read.length); metrics.RecordNumCells(alignmentPtrs[0]->nCells); } if (matchFound == true) { metrics.totalAnchorsForMappedReads += mappingBuffers.matchPosList.size() + mappingBuffers.rcMatchPosList.size(); } ++expand; } while ( expand <= params.maxExpand and matchFound == false); metrics.clocks.total.Tock(); UInt i; int totalCells = 0; for (i = 0; i< alignmentPtrs.size(); i++) { totalCells += alignmentPtrs[i]->nCells; } metrics.clocks.AddCells(totalCells); int totalBases = 0; for (i = 0; i < alignmentPtrs.size(); i++) { totalBases += alignmentPtrs[i]->qLength; } metrics.clocks.AddBases(totalBases); // // Some of the alignments are to spurious regions. Delete the // references that have too small of a score. // int effectiveReadLength = 0; for (i = 0; i< read.length; i++) { if (read.seq[i] != 'N') effectiveReadLength++; } if (params.sdpFilterType == 0) { RemoveLowQualityAlignments(read, alignmentPtrs, params); } else if (params.sdpFilterType == 1) { RemoveLowQualitySDPAlignments(effectiveReadLength, alignmentPtrs, params); } // // Now remove overlapping alignments. // vector bothQueryStrands; bothQueryStrands.resize(2); bothQueryStrands[Forward] = &read; bothQueryStrands[Reverse] = &readRC; // // Possibly use banded dynamic programming to refine the columns // of an alignment and the alignment score. 
// if (params.refineAlignments) { RefineAlignments(bothQueryStrands, genome, alignmentPtrs, params, mappingBuffers); RemoveLowQualityAlignments(read,alignmentPtrs,params); RemoveOverlappingAlignments(alignmentPtrs, params); } // // Look to see if the number of anchors found for this read match // what is expected given the expected distribution of number of // anchors. // if (alignmentPtrs.size() > 0) { size_t clusterIndex; // // Compute some stats on the read. For now this is fixed but will // be updated on the fly soon. // float meanAnchorBasesPerRead, sdAnchorBasesPerRead; float meanAnchorsPerRead, sdAnchorsPerRead; int lookupValue; // // If a very short anchor size was used, or very long min match // size there may be no precomputed distributions for it. // Handle this by bounding the min match by the smallest and // largest values for which there are precomputed statistics. int boundedMinWordMatchLength = min(max(params.minMatchLength, PacBio::AnchorDistributionTable::anchorMinKValues[0]), PacBio::AnchorDistributionTable::anchorMinKValues[1]); // // Do a similar bounding for match length and accuracy. // int boundedMatchLength = min(max((int) alignmentPtrs[0]->qAlignedSeq.length, PacBio::AnchorDistributionTable::anchorReadLengths[0]), PacBio::AnchorDistributionTable::anchorReadLengths[1]); int boundedPctSimilarity = min(max((int)alignmentPtrs[0]->pctSimilarity, PacBio::AnchorDistributionTable::anchorReadAccuracies[0]), PacBio::AnchorDistributionTable::anchorReadAccuracies[1]); lookupValue = LookupAnchorDistribution(boundedMatchLength, boundedMinWordMatchLength, boundedPctSimilarity, meanAnchorsPerRead, sdAnchorsPerRead, meanAnchorBasesPerRead, sdAnchorBasesPerRead); float minExpAnchors = meanAnchorsPerRead - sdAnchorsPerRead; // // The number of standard deviations is just trial and error. 
float minExpAnchorBases = meanAnchorBasesPerRead - 2 * sdAnchorBasesPerRead; if (lookupValue < 0 or minExpAnchorBases < 0) { minExpAnchorBases = 0; } int numSignificantClusters = 0; int totalSignificantClusterSize = 0; int maxClusterSize = 0; int numAlnAnchorBases, numAlnAnchors; alignmentPtrs[0]->ComputeNumAnchors(boundedMinWordMatchLength, numAlnAnchors, numAlnAnchorBases); int totalAnchorBases = 0; if (numAlnAnchorBases > meanAnchorBasesPerRead + sdAnchorBasesPerRead) { numSignificantClusters = 1; } else { if (alignmentPtrs[0]->score < params.maxScore) { for (clusterIndex = 0; clusterIndex < mappingBuffers.clusterList.numBases.size(); clusterIndex++) { if (mappingBuffers.clusterList.numBases[clusterIndex] > maxClusterSize) { maxClusterSize = mappingBuffers.clusterList.numBases[clusterIndex]; } } int scaledExpectedClusterSize = maxClusterSize / ((float)numAlnAnchorBases) * minExpAnchorBases; for (clusterIndex = 0; clusterIndex < mappingBuffers.clusterList.numBases.size(); clusterIndex++) { if (mappingBuffers.clusterList.numBases[clusterIndex] >= scaledExpectedClusterSize) { // cout << mappingBuffers.clusterList.numBases[clusterIndex] << " " << scaledExpectedClusterSize << " " << meanAnchorBasesPerRead << " " << sdAnchorBasesPerRead << endl; ++numSignificantClusters; totalSignificantClusterSize += meanAnchorBasesPerRead; } // // The following output block is useful in debugging mapqv // calculation. It should be uncommented and examined when // mapqvs do not look correct. 
// totalAnchorBases += mappingBuffers.clusterList.numBases[clusterIndex]; } } if (lookupValue == 0) { alignmentPtrs[0]->ComputeNumAnchors(params.minMatchLength, numAlnAnchors, numAlnAnchorBases); } } for (i = 0; i < alignmentPtrs.size(); i++) { alignmentPtrs[i]->numSignificantClusters = numSignificantClusters; } if (mapData->clusterFilePtr != NULL and topIntervals.size() > 0 and alignmentPtrs.size() > 0) { WeightedIntervalSet::iterator intvIt = topIntervals.begin(); if (params.nProc > 1) { #ifdef __APPLE__ sem_wait(semaphores.hitCluster); #else sem_wait(&semaphores.hitCluster); #endif } *mapData->clusterFilePtr << (*intvIt).size << " " << (*intvIt).pValue << " " << (*intvIt).nAnchors << " " << read.length << " " << alignmentPtrs[0]->score << " " << alignmentPtrs[0]->pctSimilarity << " " << " " << minExpAnchors << " " << alignmentPtrs[0]->qAlignedSeq.length << endl; if (params.nProc > 1) { #ifdef __APPLE__ sem_post(semaphores.hitCluster); #else sem_post(&semaphores.hitCluster); #endif } } } // // Assign the query name and strand for each alignment. 
// for (i = 0; i < alignmentPtrs.size(); i++) { T_AlignmentCandidate *aref = alignmentPtrs[i]; if (aref->tStrand == 0) { aref->qName = read.GetName(); } else { aref->qName = readRC.GetName(); } } AssignRefContigLocations(alignmentPtrs, seqdb, genome); } /* MapRead (convenience overload): gathers the index structures held by mapData and forwards to the full MapRead overload. It declares local sarray, ct, seqdb and genome objects, fills them through mapData's ShallowCopySuffixArray, ShallowCopyReferenceSequence, ShallowCopySequenceIndexDatabase and ShallowCopyTupleCountTable calls, builds a SeqBoundaryFtr over seqdb, and passes everything on together with mapData->params and mapData->metrics. Parameters: read, readRC - the query and its RC counterpart, forwarded unchanged; alignmentPtrs - where the results are stored; mappingBuffers - reusable buffers, forwarded unchanged; mapData - source of the index data, parameters and metrics; semaphores - forwarded unchanged. */ template void MapRead(T_Sequence &read, T_Sequence &readRC, vector &alignmentPtrs, MappingBuffers &mappingBuffers, MappingIPC *mapData, MappingSemaphores & semaphores) { DNASuffixArray sarray; TupleCountTable ct; SequenceIndexDatabase seqdb; T_GenomeSequence genome; BWT *bwtPtr = mapData->bwtPtr; mapData->ShallowCopySuffixArray(sarray); mapData->ShallowCopyReferenceSequence(genome); mapData->ShallowCopySequenceIndexDatabase(seqdb); mapData->ShallowCopyTupleCountTable(ct); SeqBoundaryFtr seqBoundary(&seqdb); /* NOTE(review): bwtPtr is dereferenced unconditionally in the call below; confirm mapData->bwtPtr is always non-null, including when the BWT index is not in use. */ return MapRead(read, readRC, genome, // possibly multi fasta file read into one sequence sarray, *bwtPtr, // The suffix array, and the bwt-fm index structures seqBoundary, // Boundaries of contigs in the // genome, alignments do not span // the ends of boundaries. ct, // Count table to use word frequencies in the genome to weight matches. seqdb, // Information about the names of // chromosomes in the genome, and // where their sequences are in the genome. mapData->params,// A huge list of parameters for // mapping, only compile/command // line values set. mapData->metrics, // Keep track of time/ hit counts, // etc.. Not fully developed, but // should be. alignmentPtrs, // Where the results are stored. mappingBuffers, // A class of buffers for structures // like dynamic programming // matrices, match lists, etc., that are not // reallocated between calls to // MapRead. They are cleared though. mapData, // Some values that are shared // across threads. 
semaphores); } template void AlignIntervals(T_TargetSequence &genome, T_QuerySequence &read, T_QuerySequence &rcRead, WeightedIntervalSet &weightedIntervals, int mutationCostMatrix[][5], int ins, int del, int sdpTupleSize, int useSeqDB, SequenceIndexDatabase &seqDB, vector &alignments, MappingParameters ¶ms, MappingBuffers &mappingBuffers, int procId) { (void)(mutationCostMatrix); (void)(ins); (void)(del); (void)(procId); vector forrev; forrev.resize(2); forrev[Forward] = &read; forrev[Reverse] = &rcRead; // // Use an edit distance scoring function instead of IDS. Although // the IDS should be more accurate, it is more slow, and it is more // important at this stage to have faster alignments than accurate, // since all alignments are rerun using GuidedAlignment later on. // DistanceMatrixScoreFunction distScoreFn(SMRTDistanceMatrix, params.insertion, params.deletion); DistanceMatrixScoreFunction distScoreFn2(SMRTDistanceMatrix, ins, ins); // // Assume there is at least one interval. // if (weightedIntervals.size() == 0) return; WeightedIntervalSet::iterator intvIt = weightedIntervals.begin(); int alignmentIndex = 0; do { T_AlignmentCandidate *alignment = alignments[alignmentIndex]; alignment->clusterWeight= (*intvIt).size; // totalAnchorSize == size alignment->clusterScore = (*intvIt).pValue; // // Advance references. Intervals are stored in reverse order, so // go backwards in the list, and alignments are in forward order. // That should probably be changed. // ++alignmentIndex; // // Try aligning the read to the genome. 
// DNALength matchIntervalStart, matchIntervalEnd; matchIntervalStart = (*intvIt).start; matchIntervalEnd = (*intvIt).end; bool readOverlapsContigStart = false; bool readOverlapsContigEnd = false; int startOverlappedContigIndex = 0; int endOverlappedContigIndex = 0; (void)(readOverlapsContigStart); (void)(readOverlapsContigEnd); (void)(startOverlappedContigIndex); (void)(endOverlappedContigIndex); if (params.verbosity > 0) { cout << "aligning interval : " << read.length << " " << (*intvIt).start << " " << (*intvIt).end << " " << (*intvIt).qStart << " " << (*intvIt).qEnd << " " << matchIntervalStart << " to " << matchIntervalEnd << " " << params.approximateMaxInsertionRate << " " << endl; } assert(matchIntervalEnd >= matchIntervalStart); // // If using a sequence database, check to make sure that the // boundaries of the sequence windows do not overlap with // the boundaries of the reads. If the beginning is before // the boundary, move the beginning up to the start of the read. // If the end is past the end boundary of the read, similarly move // the window boundary to the end of the read boundary. int seqDBIndex = 0; // // Stretch the alignment interval so that it is close to where // the read actually starts. 
// DNALength subreadStart = read.SubreadStart(); DNALength subreadEnd = read.SubreadEnd(); if ((*intvIt).GetStrandIndex() == Reverse) { subreadEnd = read.MakeRCCoordinate(read.SubreadStart()) + 1; subreadStart = read.MakeRCCoordinate(read.SubreadEnd()-1); } DNALength lengthBeforeFirstMatch = ((*intvIt).qStart - subreadStart) * params.approximateMaxInsertionRate ; DNALength lengthAfterLastMatch = (subreadEnd - (*intvIt).qEnd) * params.approximateMaxInsertionRate; if (matchIntervalStart < lengthBeforeFirstMatch or params.doGlobalAlignment) { matchIntervalStart = 0; } else { matchIntervalStart -= lengthBeforeFirstMatch; } if (genome.length < matchIntervalEnd + lengthAfterLastMatch or params.doGlobalAlignment) { matchIntervalEnd = genome.length; } else { matchIntervalEnd += lengthAfterLastMatch; } DNALength intervalContigStartPos, intervalContigEndPos; if (useSeqDB) { // // The sequence db index is the one where the actual match is // contained. The matchIntervalStart might be before the sequence // index boundary due to the extrapolation of alignment start by // insertion rate. If this is the case, bump up the // matchIntervalStart to be at the beginning of the boundary. // Modify bounds similarly for the matchIntervalEnd and the end // of a boundary. // seqDBIndex = seqDB.SearchForIndex((*intvIt).start); intervalContigStartPos = seqDB.seqStartPos[seqDBIndex]; if (intervalContigStartPos > matchIntervalStart) { matchIntervalStart = intervalContigStartPos; } intervalContigEndPos = seqDB.seqStartPos[seqDBIndex+1] - 1; if (intervalContigEndPos < matchIntervalEnd) { matchIntervalEnd = intervalContigEndPos; } alignment->tName = seqDB.GetSpaceDelimitedName(seqDBIndex); alignment->tLength = intervalContigEndPos - intervalContigStartPos; // // When there are multiple sequences in the database, store the // index of this sequence. This lets one compare the contigs // that reads are mapped to, for instance. 
// alignment->tIndex = seqDBIndex; } else { alignment->tLength = genome.length; alignment->tName = genome.GetName(); intervalContigStartPos = 0; intervalContigEndPos = genome.length; // // When there are multiple sequences in the database, store the // index of this sequence. This lets one compare the contigs // that reads are mapped to, for instance. // } alignment->qName = read.title; // // Look to see if a read overhangs the beginning of a contig. // if (params.verbosity > 2) { cout << "Check for prefix/suffix overlap on interval: " << (*intvIt).qStart << " ?> " << (*intvIt).start - intervalContigStartPos < (*intvIt).start - intervalContigStartPos) { readOverlapsContigStart = true; startOverlappedContigIndex = seqDBIndex; } // // Look to see if the read overhangs the end of a contig. // if (params.verbosity > 2) { cout << "Check for suffix/prefix overlap on interval, read overhang: " << read.length - (*intvIt).qEnd << " ?> " << matchIntervalEnd - (*intvIt).end < matchIntervalEnd - (*intvIt).end) { if (params.verbosity > 2) { cout << "read overlaps genome end." << endl; } readOverlapsContigEnd = true; endOverlappedContigIndex = seqDBIndex; } int alignScore; alignScore = 0; alignment->tAlignedSeqPos = matchIntervalStart; alignment->tAlignedSeqLength = matchIntervalEnd - matchIntervalStart; if ((*intvIt).GetStrandIndex() == Forward) { alignment->tAlignedSeq.Copy(genome, alignment->tAlignedSeqPos, alignment->tAlignedSeqLength); alignment->tStrand = Forward; } else { DNALength rcAlignedSeqPos = genome.MakeRCCoordinate(alignment->tAlignedSeqPos + alignment->tAlignedSeqLength - 1); genome.CopyAsRC(alignment->tAlignedSeq, rcAlignedSeqPos, alignment->tAlignedSeqLength); // Map forward coordinates into reverse complement. 
intervalContigStartPos = genome.MakeRCCoordinate(intervalContigStartPos) + 1; intervalContigEndPos = genome.MakeRCCoordinate(intervalContigEndPos - 1); swap(intervalContigStartPos, intervalContigEndPos); alignment->tAlignedSeqPos = rcAlignedSeqPos; alignment->tStrand = Reverse; } // Configure the part of the query that is aligned. The entire // query should always be aligned. alignment->qAlignedSeqPos = 0; alignment->qAlignedSeq.ReferenceSubstring(read); alignment->qAlignedSeqLength = alignment->qAlignedSeq.length; alignment->qLength = read.length; alignment->qStrand = 0; if (params.verbosity > 1) { cout << "aligning read " << endl; static_cast(&(alignment->qAlignedSeq))->PrintSeq(cout); cout << endl << "aligning reference" << endl; static_cast(&(alignment->tAlignedSeq))->PrintSeq(cout); cout << endl; } // // The type of alignment that is performed depends on the mode // blasr is running in. If it is running in normal mode, local // aligment is performed and guided by SDP alignment. When // running in overlap mode, the alignments are forced to the ends // of reads. // int intervalSize = 0; // // Check to see if the matches to the genome are sufficiently // dense to allow them to be used instead of having to redo // sdp alignment. // // First count how much of the read matches the genome exactly. for (size_t m = 0; m < intvIt->matches.size(); m++) { intervalSize += intvIt->matches[m].l;} int subreadLength = forrev[(*intvIt).GetStrandIndex()]->SubreadEnd() - forrev[(*intvIt).GetStrandIndex()]->SubreadStart(); if ((1.0*intervalSize) / subreadLength < params.sdpBypassThreshold and !params.emulateNucmer) { // // Not enough of the read maps to the genome, need to use // sdp alignment to define the regions of the read that map. // if (params.refineBetweenAnchorsOnly) { // // Run SDP alignment only between the genomic anchors, // including the genomic anchors as part of the alignment. 
// size_t m; vector *matches; vector rcMatches; Alignment anchorsOnly; DNASequence tAlignedSeq; FASTQSequence qAlignedSeq; // // The strand bookkeeping is a bit confusing, so hopefully // this will set things straight. // // If the alignment is forward strand, the coordinates of the // blocks are relative to the forward read, starting at 0, not // the subread start. // If the alignment is reverse strand, the coordinates of the // blocks are relative to the reverse strand, starting at the // position of the subread on the reverse strand. // // The coordinates of the blocks in the genome are always // relative to the forward strand on the genome, starting at // 0. // // // The first step to refining between anchors only is to make // the anchors relative to the tAlignedSeq. matches = (vector*) &(*intvIt).matches; tAlignedSeq = alignment->tAlignedSeq; qAlignedSeq = alignment->qAlignedSeq; if (alignment->tStrand == 0) { for (m = 0; m < matches->size(); m++) { (*matches)[m].t -= alignment->tAlignedSeqPos; (*matches)[m].q -= alignment->qAlignedSeqPos; } } else { // // Flip the entire alignment if it is on the reverse strand. DNALength rcAlignedSeqPos = genome.MakeRCCoordinate(alignment->tAlignedSeqPos + alignment->tAlignedSeqLength - 1); for (m = 0; m < matches->size(); m++) { (*matches)[m].t -= rcAlignedSeqPos; (*matches)[m].q -= alignment->qAlignedSeqPos; } alignment->tAlignedSeq.CopyAsRC(tAlignedSeq); rcMatches.resize((*intvIt).matches.size()); // // Make the reverse complement of the match list. // // 1. Reverse complement the coordinates. 
for (m = 0; m < (*intvIt).matches.size(); m++) { int revCompIndex = rcMatches.size() - m - 1; rcMatches[revCompIndex].q = read.MakeRCCoordinate((*intvIt).matches[m].q + (*intvIt).matches[m].l - 1); rcMatches[revCompIndex].t = tAlignedSeq.MakeRCCoordinate((*intvIt).matches[m].t + (*intvIt).matches[m].l - 1); rcMatches[revCompIndex].l = (*intvIt).matches[m].l; } matches = &rcMatches; } /* Uncomment to get a dot plot ofstream matchFile; matchFile.open("matches.txt"); matchFile << "q t l " << endl; for (m = 0; matches->size() > 0 and m < matches->size() - 1; m++) { matchFile << (*matches)[m].q << " " << (*matches)[m].t << " " << (*matches)[m].l << endl; } */ DNASequence tSubSeq; FASTQSequence qSubSeq; for (m = 0; matches->size() > 0 and m < matches->size() - 1; m++) { Block block; block.qPos = (*matches)[m].q; block.tPos = (*matches)[m].t; block.length = (*matches)[m].l; // // Find the lengths of the gaps between anchors. // int tGap, qGap; tGap = (*matches)[m+1].t - ((*matches)[m].t + (*matches)[m].l); qGap = (*matches)[m+1].q - ((*matches)[m].q + (*matches)[m].l); if (tGap > 0 and qGap > 0) { DNALength tPos, qPos; tPos = block.tPos + block.length; qPos = block.qPos + block.length; tSubSeq.ReferenceSubstring(tAlignedSeq, tPos, tGap); qSubSeq.ReferenceSubstring(alignment->qAlignedSeq, qPos, qGap); Alignment alignmentInGap; /* The following code is experimental code for trying to do something like affine gap alignment in long gaps. It would eventually be used in cDNA alignment to align between exons, but for now is being tested here by using it to align when there is a big gap between anchors. */ if (params.separateGaps == true and qSubSeq.length > 0 and tSubSeq.length > 0 and ( (1.0*qSubSeq.length)/tSubSeq.length < 0.25 )) { OneGapAlign(qSubSeq, tSubSeq, distScoreFn, mappingBuffers, alignmentInGap); } else { /* This is the 'normal/default' way to align between gaps. It is more well tested than OneGapAlign. 
*/ SDPAlign(qSubSeq, tSubSeq, distScoreFn, params.sdpTupleSize, params.sdpIns, params.sdpDel, params.indelRate*2, alignmentInGap, mappingBuffers, Global, params.detailedSDPAlignment, params.extendFrontAlignment, params.recurseOver, params.fastSDP); } // // Now, splice the fragment alignment into the current // alignment. // if (alignmentInGap.blocks.size() > 0) { size_t b; // // Configure this block to be relative to the beginning // of the aligned substring. // for (b = 0; b < alignmentInGap.size(); b++) { alignmentInGap.blocks[b].tPos += tPos + alignmentInGap.tPos; alignmentInGap.blocks[b].qPos += qPos + alignmentInGap.qPos; assert(alignmentInGap.blocks[b].tPos < alignment->tAlignedSeq.length); assert(alignmentInGap.blocks[b].qPos < alignment->qAlignedSeq.length); } } // Add the original block alignment->blocks.push_back(block); anchorsOnly.blocks.push_back(block); // Add the blocks for the refined alignment alignment->blocks.insert(alignment->blocks.end(), alignmentInGap.blocks.begin(), alignmentInGap.blocks.end()); } } // Add the last block m = (*matches).size() - 1; Block block; block.qPos = (*matches)[m].q; block.tPos = (*matches)[m].t; assert(block.tPos <= alignment->tAlignedSeq.length); assert(block.qPos <= alignment->qAlignedSeq.length); block.length = (*matches)[m].l; alignment->blocks.push_back(block); anchorsOnly.blocks.push_back(block); // // By convention, blocks start at 0, and the // alignment->tPos,qPos give the start of the alignment. // Modify the block positions so that they are offset by 0. 
alignment->tPos = alignment->blocks[0].tPos; alignment->qPos = alignment->blocks[0].qPos; size_t b; size_t blocksSize = alignment->blocks.size(); for (b = 0; b < blocksSize ; b++) { assert(alignment->tPos <= alignment->blocks[b].tPos); assert(alignment->qPos <= alignment->blocks[b].qPos); alignment->blocks[b].tPos -= alignment->tPos; alignment->blocks[b].qPos -= alignment->qPos; } for (b = 0; b < anchorsOnly.blocks.size(); b++) { anchorsOnly.blocks[b].tPos -= alignment->tPos; anchorsOnly.blocks[b].qPos -= alignment->qPos; } anchorsOnly.tPos = alignment->tPos; anchorsOnly.qPos = alignment->qPos; ComputeAlignmentStats(*alignment, alignment->qAlignedSeq.seq, alignment->tAlignedSeq.seq, distScoreFn); tAlignedSeq.Free(); qAlignedSeq.Free(); tSubSeq.Free(); qSubSeq.Free(); } else { alignScore = SDPAlign(alignment->qAlignedSeq, alignment->tAlignedSeq, distScoreFn, sdpTupleSize, params.sdpIns, params.sdpDel, params.indelRate*3, *alignment, mappingBuffers, Local, params.detailedSDPAlignment, params.extendFrontAlignment, params.recurseOver, params.fastSDP); ComputeAlignmentStats(*alignment, alignment->qAlignedSeq.seq, alignment->tAlignedSeq.seq, distScoreFn); } } else { // // The anchors used to anchor the sequence are sufficient to extend the alignment. // size_t m; for (m = 0; m < (*intvIt).matches.size(); m++ ){ Block block; block.qPos = (*intvIt).matches[m].q - alignment->qAlignedSeqPos; block.tPos = (*intvIt).matches[m].t - alignment->tAlignedSeqPos; block.length = (*intvIt).matches[m].l; alignment->blocks.push_back(block); } } // // The anchors/sdp alignments may leave portions of the read // unaligned at the beginning and end. If the parameters // specify extending alignments, try and align extra bases at // the beginning and end of alignments. if (params.extendAlignments) { // // Modify the alignment so that the start and end of the // alignment strings are at the alignment boundaries. 
// // Since the query sequence is pointing at a subsequence of the // read (and is always in the forward direction), just reference // a new portion of the read. alignment->qAlignedSeqPos = alignment->qAlignedSeqPos + alignment->qPos; alignment->qAlignedSeqLength = alignment->QEnd(); alignment->qAlignedSeq.ReferenceSubstring(read, alignment->qAlignedSeqPos, alignment->qAlignedSeqLength ); alignment->qPos = 0; // // Since the target sequence may be on the forward or reverse // strand, a copy of the subsequence is made, and the original // sequence free'd. // DNASequence tSubseq; alignment->tAlignedSeqPos = alignment->tAlignedSeqPos + alignment->tPos; alignment->tAlignedSeqLength = alignment->TEnd(); tSubseq.Copy(alignment->tAlignedSeq, alignment->tPos, alignment->tAlignedSeqLength); alignment->tPos = 0; alignment->tAlignedSeq.Free(); alignment->tAlignedSeq.TakeOwnership(tSubseq); DNALength maximumExtendLength = 500; if (alignment->blocks.size() > 0 ) { int lastAlignedBlock = alignment->blocks.size() - 1; DNALength lastAlignedQPos = alignment->blocks[lastAlignedBlock].QEnd() + alignment->qPos + alignment->qAlignedSeqPos; DNALength lastAlignedTPos = alignment->blocks[lastAlignedBlock].TEnd() + alignment->tPos + alignment->tAlignedSeqPos; T_AlignmentCandidate extendedAlignmentForward, extendedAlignmentReverse; int forwardScore, reverseScore; SMRTSequence readSuffix; DNALength readSuffixLength; DNASequence genomeSuffix; DNALength genomeSuffixLength; SMRTSequence readPrefix; DNALength readPrefixLength; DNASequence genomePrefix; DNALength genomePrefixLength; // // Align the entire end of the read if it is short enough. 
// readSuffixLength = min(read.length - lastAlignedQPos, maximumExtendLength); if (readSuffixLength > 0) { readSuffix.ReferenceSubstring(read, lastAlignedQPos, readSuffixLength); } else { readSuffix.length = 0; } // // Align The entire end of the genome up to the maximum extend length; // genomeSuffixLength = min(intervalContigEndPos - lastAlignedTPos, maximumExtendLength); if (genomeSuffixLength > 0) { if (alignment->tStrand == Forward) { genomeSuffix.Copy(genome, lastAlignedTPos, genomeSuffixLength); } else { static_cast(&genome)->CopyAsRC(genomeSuffix, lastAlignedTPos, genomeSuffixLength); } } else { genomeSuffix.length = 0; } forwardScore = 0; if (readSuffix.length > 0 and genomeSuffix.length > 0) { forwardScore = ExtendAlignmentForward(readSuffix, 0, genomeSuffix, 0, params.extendBandSize, // Reuse buffers to speed up alignment mappingBuffers.scoreMat, mappingBuffers.pathMat, // Do the alignment in the forward direction. extendedAlignmentForward, distScoreFn, 1, // don't bother attempting // to extend the alignment // if one of the sequences // is less than 1 base long params.maxExtendDropoff); } if ( forwardScore < 0 ) { // // The extended alignment considers the whole genome, but // should be modified to be starting at the end of where // the original alignment left off. 
// if (params.verbosity > 0) { cout << "forward extended an alignment of score " << alignment->score << " with score " << forwardScore << " by " << extendedAlignmentForward.blocks.size() << " blocks and length " << extendedAlignmentForward.blocks[extendedAlignmentForward.blocks.size()-1].qPos << endl; } extendedAlignmentForward.tAlignedSeqPos = lastAlignedTPos; extendedAlignmentForward.qAlignedSeqPos = lastAlignedQPos; genomeSuffix.length = extendedAlignmentForward.tPos + extendedAlignmentForward.TEnd(); alignment->tAlignedSeq.Append(genomeSuffix); alignment->qAlignedSeq.length += extendedAlignmentForward.qPos + extendedAlignmentForward.QEnd(); assert(alignment->qAlignedSeq.length <= read.length); alignment->AppendAlignment(extendedAlignmentForward); } DNALength firstAlignedQPos = alignment->qPos + alignment->qAlignedSeqPos; DNALength firstAlignedTPos = alignment->tPos + alignment->tAlignedSeqPos; readPrefixLength = min(firstAlignedQPos, maximumExtendLength); if (readPrefixLength > 0) { readPrefix.ReferenceSubstring(read, firstAlignedQPos-readPrefixLength, readPrefixLength); } else { readPrefix.length = 0; } genomePrefixLength = min(firstAlignedTPos - intervalContigStartPos, maximumExtendLength); if (genomePrefixLength > 0) { if (alignment->tStrand == 0) { genomePrefix.Copy(genome, firstAlignedTPos - genomePrefixLength, genomePrefixLength); } else { static_cast(&genome)->MakeRC(genomePrefix, firstAlignedTPos - genomePrefixLength, genomePrefixLength); } } reverseScore = 0; if (readPrefix.length > 0 and genomePrefix.length > 0) { reverseScore = ExtendAlignmentReverse(readPrefix, readPrefix.length-1, genomePrefix, genomePrefixLength - 1, params.extendBandSize, //k mappingBuffers.scoreMat, mappingBuffers.pathMat, extendedAlignmentReverse, distScoreFn, 1, // don't bother attempting // to extend the alignment // if one of the sequences // is less than 1 base long params.maxExtendDropoff); } if (reverseScore < 0 ) { // // Make alignment->tPos relative to the beginning of 
the // extended alignment so that when it is appended, the // coordinates match correctly. if (params.verbosity > 0) { cout << "reverse extended an alignment of score " << alignment->score << " with score " << reverseScore << " by " << extendedAlignmentReverse.blocks.size() << " blocks and length " << extendedAlignmentReverse.blocks[extendedAlignmentReverse.blocks.size()-1].qPos << endl; } extendedAlignmentReverse.tAlignedSeqPos = firstAlignedTPos - genomePrefixLength; extendedAlignmentReverse.qAlignedSeqPos = firstAlignedQPos - readPrefixLength; extendedAlignmentReverse.AppendAlignment(*alignment); genomePrefix.Append(alignment->tAlignedSeq, genomePrefix.length - alignment->tPos); alignment->tAlignedSeq.Free(); alignment->tAlignedSeq.TakeOwnership(genomePrefix); alignment->blocks = extendedAlignmentReverse.blocks; alignment->tAlignedSeqPos = extendedAlignmentReverse.tAlignedSeqPos; alignment->tPos = extendedAlignmentReverse.tPos; alignment->qAlignedSeqPos = extendedAlignmentReverse.qAlignedSeqPos; alignment->qAlignedSeq.length = readPrefix.length + alignment->qAlignedSeq.length; alignment->qPos = extendedAlignmentReverse.qPos; alignment->qAlignedSeq.seq = readPrefix.seq; // // Make sure the two ways of accounting for aligned sequence // length are in sync. This needs to go. 
// if (alignment->blocks.size() > 0) { alignment->qAlignedSeqLength = alignment->qAlignedSeq.length; alignment->tAlignedSeqLength = alignment->tAlignedSeq.length; } else { alignment->qAlignedSeqLength = alignment->qAlignedSeq.length = 0; alignment->tAlignedSeqLength = alignment->tAlignedSeq.length = 0; } } // end of if (reverseScore < 0 ) readSuffix.Free(); readPrefix.Free(); genomePrefix.Free(); genomeSuffix.Free(); } tSubseq.Free(); } if (params.verbosity > 0) { cout << "interval align score: " << alignScore << endl; StickPrintAlignment(*alignment, (DNASequence&) alignment->qAlignedSeq, (DNASequence&) alignment->tAlignedSeq, cout, 0, alignment->tAlignedSeqPos); } ComputeAlignmentStats(*alignment, alignment->qAlignedSeq.seq, alignment->tAlignedSeq.seq, distScoreFn2); //SMRTDistanceMatrix, ins, del ); intvIt++; } while (intvIt != weightedIntervals.end()); } template void PairwiseLocalAlign(T_Sequence &qSeq, T_RefSequence &tSeq, int k, MappingParameters ¶ms, T_AlignmentCandidate &alignment, MappingBuffers &mappingBuffers, AlignmentType alignType) { // // Perform a pairwise alignment between qSeq and tSeq, but choose // the pairwise alignment method based on the parameters. The // options for pairwise alignment are: // - Affine KBanded alignment: usually used for sequences with no // quality information. // - KBanded alignment: For sequences with quality information. // Gaps are scored with quality values. 
// QualityValueScoreFunction scoreFn; scoreFn.del = params.indel; scoreFn.ins = params.indel; DistanceMatrixScoreFunction distScoreFn2( SMRTDistanceMatrix, params.indel, params.indel); IDSScoreFunction idsScoreFn; idsScoreFn.ins = params.insertion; idsScoreFn.del = params.deletion; idsScoreFn.substitutionPrior = params.substitutionPrior; idsScoreFn.globalDeletionPrior = params.globalDeletionPrior; idsScoreFn.InitializeScoreMatrix(SMRTDistanceMatrix); int kbandScore; int qvAwareScore; if (params.ignoreQualities || qSeq.qual.Empty() || !ReadHasMeaningfulQualityValues(qSeq) ) { kbandScore = AffineKBandAlign(qSeq, tSeq, SMRTDistanceMatrix, params.indel+2, params.indel - 3, // homopolymer insertion open and extend params.indel+2, params.indel - 1, // any insertion open and extend params.indel, // deletion k*1.2, mappingBuffers.scoreMat, mappingBuffers.pathMat, mappingBuffers.hpInsScoreMat, mappingBuffers.hpInsPathMat, mappingBuffers.insScoreMat, mappingBuffers.insPathMat, alignment, Global); alignment.score = kbandScore; if (params.verbosity >= 2) { cout << "align score: " << kbandScore << endl; } } else { if (qSeq.insertionQV.Empty() == false) { qvAwareScore = KBandAlign(qSeq, tSeq, SMRTDistanceMatrix, params.indel+2, // ins params.indel+2, // del k, mappingBuffers.scoreMat, mappingBuffers.pathMat, alignment, idsScoreFn, alignType); if (params.verbosity >= 2) { cout << "ids score fn score: " << qvAwareScore << endl; } } else { qvAwareScore = KBandAlign(qSeq, tSeq, SMRTDistanceMatrix, params.indel+2, // ins params.indel+2, // del k, mappingBuffers.scoreMat, mappingBuffers.pathMat, alignment, scoreFn, alignType); if (params.verbosity >= 2) { cout << "qv score fn score: " << qvAwareScore << endl; } } alignment.sumQVScore = qvAwareScore; alignment.score = qvAwareScore; alignment.probScore = 0; } // Compute stats and assign a default alignment score using an edit distance. 
ComputeAlignmentStats(alignment, qSeq.seq, tSeq.seq, distScoreFn2); if (params.scoreType == 1) { alignment.score = alignment.sumQVScore; } } // Extend target aligned sequence of the input alignement to both ends // by flankSize bases. Update alignment->tAlignedSeqPos, // alignment->tAlignedSeqLength and alignment->tAlignedSeq. void FlankTAlignedSeq(T_AlignmentCandidate * alignment, SequenceIndexDatabase &seqdb, DNASequence & genome, int flankSize) { assert(alignment != NULL and alignment->tIsSubstring); UInt forwardTPos, newTAlignedSeqPos, newTAlignedSeqLen; // New aligned start position relative to this chromosome, with // the same direction as alignment->tStrand. newTAlignedSeqPos = UInt((alignment->tAlignedSeqPos > UInt(flankSize))? (alignment->tAlignedSeqPos - flankSize): 0); newTAlignedSeqLen = min(alignment->tAlignedSeqPos + alignment->tAlignedSeqLength + flankSize, alignment->tLength) - newTAlignedSeqPos; if (alignment->tStrand ==0) { forwardTPos = newTAlignedSeqPos; } else { forwardTPos = alignment->tLength - newTAlignedSeqPos - 1; } // Find where this chromosome is in the genome. int seqIndex = seqdb.GetIndexOfSeqName(alignment->tName); assert(seqIndex != -1); UInt newGenomePos = seqdb.ChromosomePositionToGenome(seqIndex, forwardTPos); if (alignment->tIsSubstring == false) { alignment->tAlignedSeq.Free(); } alignment->tAlignedSeqPos = newTAlignedSeqPos; alignment->tAlignedSeqLength = newTAlignedSeqLen; if (alignment->tStrand == 0) { alignment->tAlignedSeq.ReferenceSubstring(genome, newGenomePos, newTAlignedSeqLen); } else { // Copy and then reverse complement. genome.MakeRC(alignment->tAlignedSeq, newGenomePos + 1 - alignment->tAlignedSeqLength, alignment->tAlignedSeqLength); alignment->tIsSubstring = false; } } // Align a subread of a SMRT sequence to target sequence of an alignment. // Input: // subread - a subread of a SMRT sequence. // unrolledRead - the full SMRT sequence. // alignment - an alignment. 
// passDirection - whether or not the subread has the // same direction as query of the alignment. // 0 = true, 1 = false. // subreadInterval - [start, end) interval of the subread in the // SMRT read. // subreadIndex - index of the subread in allReadAlignments. // params - mapping paramters. // Output: // allReadAlignments - where the sequence and alignments of the // subread are saved. // threadOut - an out stream for debugging the current thread. void AlignSubreadToAlignmentTarget(ReadAlignments & allReadAlignments, SMRTSequence & subread, SMRTSequence & unrolledRead, T_AlignmentCandidate * alignment, int passDirection, ReadInterval & subreadInterval, int subreadIndex, MappingParameters & params, MappingBuffers & mappingBuffers, ostream & threadOut) { assert(passDirection == 0 or passDirection == 1); // // Determine where in the genome the subread has mapped. // DNASequence alignedForwardRefSequence, alignedReverseRefSequence; if (alignment->tStrand == 0) { // This needs to be changed -- copy copies RHS into LHS, // CopyAsRC copies LHS into RHS alignedForwardRefSequence.Copy(alignment->tAlignedSeq); alignment->tAlignedSeq.CopyAsRC(alignedReverseRefSequence); } else { alignment->tAlignedSeq.CopyAsRC(alignedForwardRefSequence); alignedReverseRefSequence.Copy(alignment->tAlignedSeq); } IDSScoreFunction idsScoreFn; idsScoreFn.ins = params.insertion; idsScoreFn.del = params.deletion; idsScoreFn.InitializeScoreMatrix(SMRTDistanceMatrix); idsScoreFn.globalDeletionPrior = params.globalDeletionPrior; idsScoreFn.substitutionPrior = params.substitutionPrior; DistanceMatrixScoreFunction distScoreFn2( SMRTDistanceMatrix, params.indel, params.indel); // // Determine the strand to align the subread to. // T_AlignmentCandidate exploded; bool sameAlignmentPassDirection = (alignment->tStrand == passDirection); bool computeProbIsFalse = false; DNASequence & alignedRefSequence = (sameAlignmentPassDirection? 
alignedForwardRefSequence:alignedReverseRefSequence); // // In the original code, parameters: bandSize=10, alignType=Global, // sdpTupleSize=4 (instead of 12, Local and 6) were used when // alignment & pass have different directions. // int explodedScore = GuidedAlign(subread, alignedRefSequence, idsScoreFn, 12, params.sdpIns, params.sdpDel, params.indelRate, mappingBuffers, exploded, Local, computeProbIsFalse, 6); if (params.verbosity >= 3) { threadOut << "zmw " << unrolledRead.zmwData.holeNumber << ", subreadIndex " << subreadIndex << ", passDirection " << passDirection << ", subreadInterval [" << subreadInterval.start << ", " << subreadInterval.end << ")" << endl << "Exploded score " << explodedScore << endl << "StickPrintAlignment subread-reference alignment which has" << " the " << (sameAlignmentPassDirection?"same":"different") << " direction as the ccs-reference (or the " << "longestSubread-reference) alignment. " << endl << "subread: " << endl; static_cast(&subread)->PrintSeq(threadOut); threadOut << endl; threadOut << "alignedRefSeq: " << endl; static_cast(&alignedRefSequence)->PrintSeq(threadOut); StickPrintAlignment(exploded, (DNASequence&) subread, (DNASequence&) alignedRefSequence, threadOut, exploded.qAlignedSeqPos, exploded.tAlignedSeqPos); } if (exploded.blocks.size() > 0) { DistanceMatrixScoreFunction distScoreFn( SMRTDistanceMatrix, params.indel, params.indel); ComputeAlignmentStats(exploded, subread.seq, alignedRefSequence.seq, distScoreFn2); if (exploded.score <= params.maxScore) { // // The coordinates of the alignment should be // relative to the reference sequence (the specified chromosome, // not the whole genome). // exploded.qStrand = 0; exploded.tStrand = sameAlignmentPassDirection?0:1; exploded.qLength = unrolledRead.length; exploded.tLength = alignment->tLength; exploded.tAlignedSeq.Copy(alignedRefSequence); exploded.tAlignedSeqPos = (passDirection == 0)? 
(alignment->tAlignedSeqPos): (exploded.tLength - alignment->tAlignedSeqPos - alignment->tAlignedSeqLength); exploded.tAlignedSeqLength = alignment->tAlignedSeqLength; exploded.qAlignedSeq.ReferenceSubstring(subread); exploded.qAlignedSeqPos = subreadInterval.start; exploded.qAlignedSeqLength = subreadInterval.end - subreadInterval.start; exploded.mapQV = alignment->mapQV; exploded.tName = alignment->tName; exploded.tIndex = alignment->tIndex; stringstream namestrm; namestrm << "/" << subreadInterval.start << "_" << subreadInterval.end; exploded.qName = string(unrolledRead.title) + namestrm.str(); // // Don't call AssignRefContigLocation as the coordinates // of the alignment is already relative to the chromosome coordiantes. // // Save this alignment for printing later. // T_AlignmentCandidate *alignmentPtr = new T_AlignmentCandidate; // Refine concordant alignments if (params.refineConcordantAlignments) { vector vquery; vquery.push_back(&unrolledRead); RefineAlignment(vquery, alignedRefSequence, exploded, params, mappingBuffers); } *alignmentPtr = exploded; // // Check if need to be filtered // For now filtering only in concordant mode // Later add filtration in other modes // if (allReadAlignments.alignMode == ZmwSubreads) { if (params.filterCriteria.Satisfy(alignmentPtr)) { if (params.verbosity > 3) { std::cerr << " Filters passed. Adding slave alignment in concordant mode" << std::endl; } allReadAlignments.AddAlignmentForSeq(subreadIndex, alignmentPtr); } else { // delete alignment immediately if (params.verbosity > 3) { std::cerr << " Filters failed. Delete alignment immediately" << std::endl; } delete alignmentPtr; } } // for all modes except ZmwSubreads no filtering for now else { allReadAlignments.AddAlignmentForSeq(subreadIndex, alignmentPtr); } } // End of exploded score <= maxScore. 
if (params.verbosity >= 3) { threadOut << "exploded score: " << exploded.score << endl << "exploded alignment: "<< endl; exploded.Print(threadOut); threadOut << endl; } } // End of exploded.blocks.size() > 0. } blasr-smrtanalysis-4.0.0/iblasr/BlasrHeaders.h000066400000000000000000000075241302464523700213470ustar00rootroot00000000000000#pragma once #ifdef __GLIBC__ # include #endif #include #include #include #include #include #include #include #include #include #if defined(__GLIBC__) || defined(__APPLE__) # include #endif #define MAX_PHRED_SCORE 254 #define MAPQV_END_ALIGN_WIGGLE 5 using namespace std; #include #ifdef USE_PBBAM #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "MappingIPC.h" #include "MappingSemaphores.h" #include "MappingBuffers.hpp" #include "ReadAlignments.hpp" typedef SMRTSequence T_Sequence; typedef FASTASequence T_GenomeSequence; typedef DNASuffixArray T_SuffixArray; typedef DNATuple T_Tuple; typedef LISPValueWeightor > PValueWeightor; typedef LISSMatchFrequencyPValueWeightor > MultiplicityPValueWeightor; typedef MappingData MappingIPC; blasr-smrtanalysis-4.0.0/iblasr/BlasrMiscs.hpp000066400000000000000000000051461302464523700214100ustar00rootroot00000000000000// Author: Mark Chaisson #pragma once #include "BlasrHeaders.h" //-------------------------Fetch Reads----------------------------// template bool GetNextReadThroughSemaphore(ReaderAgglomerate 
&reader, MappingParameters ¶ms, T_Sequence &read, string & readGroupId, int & associatedRandInt, MappingSemaphores & semaphores); //---------------------MAKE & CHECK READS-------------------------// //FIXME: move to SMRTSequence bool ReadHasMeaningfulQualityValues(FASTQSequence &sequence); //FIXME: Move to SMRTSequence // Given a SMRT sequence and a subread interval, make the subread. // Input: // smrtRead - a SMRT sequence // subreadInterval - a subread interval // params - mapping parameters // Output: // subreadSequence - the constructed subread void MakeSubreadOfInterval(SMRTSequence & subreadSequence, SMRTSequence & smrtRead, ReadInterval & subreadInterval, MappingParameters & params); //FIXME: Move to SMRTSequence // Given a SMRT sequence and one of its subreads, make the // reverse complement of the subread in the coordinate of the // reverse complement sequence of the SMRT sequence. // Input: // smrtRead - a SMRT read // subreadSequence - a subread of smrtRead // Output: // subreadSequenceRC - the reverse complement of the subread // in the coordinate of the reverse // complement of the SMRT read. void MakeSubreadRC(SMRTSequence & subreadSequenceRC, SMRTSequence & subreadSequence, SMRTSequence & smrtRead); // Construct subreads invervals from subreads void MakeSubreadIntervals(vector & subreads, vector & subreadIntervals); // Return index of subread which will be used as concordant template. // If Zmw has exactly one subread, return index of the subread (i.e., 0). // If Zmw has exactly two subreads, return index of the longer subread. // If Zmw has three or more subreads, return index of the median-length // subread in range subreadIntervals[1:-1]. Avoid using the first and last // subreads (which are less likely to be full-pass) if possible. 
int GetIndexOfConcordantTemplate(const vector & subreadIntervals); //-------------------------MISC-----------------------------------// int CountZero(unsigned char *ptr, int length); #include "BlasrMiscsImpl.hpp" blasr-smrtanalysis-4.0.0/iblasr/BlasrMiscsImpl.hpp000066400000000000000000000136341302464523700222330ustar00rootroot00000000000000// Author: Mark Chaisson #pragma once #include template bool GetNextReadThroughSemaphore(ReaderAgglomerate &reader, MappingParameters ¶ms, T_Sequence &read, string & readGroupId, int & associatedRandInt, MappingSemaphores & semaphores) { // Wait on a semaphore if (params.nProc > 1) { #ifdef __APPLE__ sem_wait(semaphores.reader); #else sem_wait(&semaphores.reader); #endif } bool returnValue = true; // // CCS Reads are read differently from other reads. Do static casting here // of this. // if (reader.GetNext(read, associatedRandInt) == 0) { returnValue = false; } // // Set the read group id before releasing the semaphore, since other // threads may change the reader object to a new read group before // sending this alignment out to printing. readGroupId = reader.readGroupId; if (params.nProc > 1) { #ifdef __APPLE__ sem_post(semaphores.reader); #else sem_post(&semaphores.reader); #endif } return returnValue; } bool ReadHasMeaningfulQualityValues(FASTQSequence &sequence) { if (sequence.qual.Empty() == true) { return 0; } else { int numZero=0, numNonZero=0; if (sequence.qual.data == NULL) { return false; } numZero = CountZero(sequence.qual.data, sequence.length); numNonZero = sequence.length - numZero; int subNumZero = 0, subNonZero = 0; if (sequence.substitutionQV.data == NULL) { return false; } subNumZero = CountZero(sequence.substitutionQV.data, sequence.length); subNonZero = sequence.length - subNumZero; if (numZero < 0.5*numNonZero and subNumZero < 0.5 * subNonZero) { return true; } else { return false; } } } // Given a SMRT sequence and a subread interval, make the subread. 
// Input: // smrtRead - a SMRT sequence // subreadInterval - a subread interval // params - mapping parameters // Output: // subreadSequence - the constructed subread void MakeSubreadOfInterval(SMRTSequence & subreadSequence, SMRTSequence & smrtRead, ReadInterval & subreadInterval, MappingParameters & params) { int start = subreadInterval.start; int end = subreadInterval.end; assert(smrtRead.length >= subreadSequence.length); smrtRead.MakeSubreadAsMasked(subreadSequence, start, end); if (!params.preserveReadTitle) { smrtRead.SetSubreadTitle(subreadSequence, subreadSequence.SubreadStart(), subreadSequence.SubreadEnd()); } else { subreadSequence.CopyTitle(smrtRead.title); } subreadSequence.zmwData = smrtRead.zmwData; } // Given a SMRT sequence and one of its subreads, make the // reverse complement of the subread in the coordinate of the // reverse complement sequence of the SMRT sequence. // Input: // smrtRead - a SMRT read // subreadSequence - a subread of smrtRead // Output: // subreadSequenceRC - the reverse complement of the subread // in the coordinate of the reverse // complement of the SMRT read. void MakeSubreadRC(SMRTSequence & subreadSequenceRC, SMRTSequence & subreadSequence, SMRTSequence & smrtRead) { assert(smrtRead.length >= subreadSequence.length); // Reverse complement sequence of the subread. subreadSequence.MakeRC(subreadSequenceRC); // Update start and end positions of subreadSequenceRC in the // coordinate of reverse compelement sequence of the SMRT read. 
subreadSequenceRC.SubreadStart(smrtRead.length - subreadSequence.SubreadEnd()); subreadSequenceRC.SubreadEnd (smrtRead.length - subreadSequence.SubreadStart()); subreadSequenceRC.zmwData = smrtRead.zmwData; } int CountZero(unsigned char *ptr, int length) { int i; int nZero = 0; for (i = 0; i < length; i++) { if (ptr[i] == 0) { ++nZero; } } return nZero; } void MakeSubreadIntervals(vector & subreads, vector & subreadIntervals) { subreadIntervals.clear(); for (auto subread: subreads) { subreadIntervals.push_back(ReadInterval(subread.SubreadStart(), subread.SubreadEnd(), subread.highQualityRegionScore)); } } int GetIndexOfConcordantTemplate(const vector & subreadIntervals) { assert(subreadIntervals.size() != 0); if (subreadIntervals.size() == 1) return 0; // Zmw has exactly one subread. else if (subreadIntervals.size() == 2) { // Zmw has two subreads, return index of the longer one. const ReadInterval & first = subreadIntervals[0]; const ReadInterval & second = subreadIntervals[1]; if (first.Length() < second.Length()) return 1; else return 0; } else { // Zmw has more than two subreads, look for the median-length subread // in subreadIntervals[1:-1]. The first and last subreads are not // considered because they are usually non-full-pass. 
vector intervals; intervals.insert(intervals.begin(), subreadIntervals.begin() + 1, subreadIntervals.end() - 1); std::sort(intervals.begin(), intervals.end(), [](const ReadInterval& a, const ReadInterval& b)->bool {return a.Length() < b.Length();}); const ReadInterval & template_interval = intervals[int(intervals.size()/2)]; for (int pos = 1; pos < int(subreadIntervals.size()) -1; pos ++) { if (subreadIntervals[pos] == template_interval) { return pos; } } } return 0; } blasr-smrtanalysis-4.0.0/iblasr/BlasrUtils.hpp000066400000000000000000000160541302464523700214320ustar00rootroot00000000000000// Author: Mark Chaisson #pragma once #include "BlasrHeaders.h" //----------------------MODIFY ALIGNMENTS--------------------------// //FIXME: refactor class SequenceIndexDatabase void AssignRefContigLocation(T_AlignmentCandidate &alignment, SequenceIndexDatabase &seqdb, DNASequence &genome); //FIXME: refactor class SequenceIndexDatabase void AssignRefContigLocations(vector &alignmentPtrs, SequenceIndexDatabase &seqdb, DNASequence &genome); template //FIXME: refactor class SequenceIndexDatabase void AssignGenericRefContigName(vector &alignmentPtrs, T_RefSequence &genome); //FIXME: move to class ReadAlignments void StoreRankingStats(vector &alignments, VarianceAccumulator &accumPValue, VarianceAccumulator &accumWeight); //FIXME: mapQV should be assigned when alignments are created. 
void AssignMapQV(vector &alignmentPtrs); //FIXME: move to class ReadAlignments void ScaleMapQVByClusterSize(T_AlignmentCandidate &alignment, MappingParameters ¶ms); void StoreMapQVs(SMRTSequence &read, vector &alignmentPtrs, MappingParameters ¶ms); //--------------------SEARCH & CHECK ALIGNMENTS-------------------// //FIXME: move to class ReadAlignments template bool CheckForSufficientMatch(T_Sequence &read, vector &alignmentPtrs, MappingParameters ¶ms); //FIXME: move to class ReadAlignments int FindMaxLengthAlignment(vector alignmentPtrs, int &maxLengthIndex); //FIXME: move to class T_AlignmentCandidate void SumMismatches(SMRTSequence &read, T_AlignmentCandidate &alignment, int mismatchScore, int fullIntvStart, int fullIntvEnd, MappingParameters ¶ms, int &sum); //FIXME: move to class T_AlignmentCandidate /// \returns whether two alignments overlap by more than minPcercentOverlap% bool AlignmentsOverlap(T_AlignmentCandidate &alnA, T_AlignmentCandidate &alnB, float minPercentOverlap); /// \Partition overlapping alignments. void PartitionOverlappingAlignments(vector &alignmentPtrs, vector > &partitions, float minOverlap); //--------------------FILTER ALIGNMENTS---------------------------// //FIXME: move to class T_AlignmentCandidate and ReadAlignments int RemoveLowQualitySDPAlignments(int readLength, vector &alignmentPtrs, MappingParameters ¶ms); //FIXME: move to class ReadAlignments template int RemoveLowQualityAlignments(T_Sequence &read, vector &alignmentPtrs, MappingParameters ¶ms); //FIXME: move to class ReadAlignments int RemoveOverlappingAlignments(vector &alignmentPtrs, MappingParameters ¶ms); // FIXME: move to class ReadAlignments // Delete all alignments from index startIndex in vector, inclusive. 
void DeleteAlignments(vector &alignmentPtrs, int startIndex=0); //--------------------REFINE ALIGNMENTS---------------------------// template void RefineAlignment(vector &bothQueryStrands, T_RefSequence &genome, T_AlignmentCandidate &alignmentCandidate, MappingParameters ¶ms, MappingBuffers &mappingBuffers); template void RefineAlignments(vector &bothQueryStrands, T_RefSequence &genome, vector &alignmentPtrs, MappingParameters ¶ms, MappingBuffers &mappingBuffers); //--------------------PRINT ALIGNMENTS---------------------------// vector SelectAlignmentsToPrint(vector alignmentPtrs, MappingParameters & params, const int & associatedRandInt); // // The full read is not the subread, and does not have masked off characters. // void PrintAlignment(T_AlignmentCandidate &alignment, SMRTSequence &fullRead, MappingParameters ¶ms, AlignmentContext &alignmentContext, ostream &outFile #ifdef USE_PBBAM , SMRTSequence &subread , PacBio::BAM::IRecordWriter * bamWriterPtr #endif ); // Print all alignments in vector alignmentPtrs void PrintAlignments(vector alignmentPtrs, SMRTSequence &read, MappingParameters ¶ms, ostream &outFile, AlignmentContext alignmentContext, #ifdef USE_PBBAM SMRTSequence &subread, PacBio::BAM::IRecordWriter * bamWriterPtr, #endif MappingSemaphores & semaphores); void PrintAlignmentPtrs(vector & alignmentPtrs, ostream & out = cout); // Print an unaligned read, if noPrintUnalignedSeqs is True, print title only; // otherwise, print title and sequence of the read. void PrintUnaligned(const SMRTSequence & unalignedRead, ostream & unalignedFilePtr, const bool noPrintUnalignedSeqs); // Print all alignments for subreads in allReadAlignments. // Input: // allReadAlignments - contains a set of subreads, each of which // is associated with a group of alignments. // alignmentContext - an alignment context of each subread used // for printing in SAM format. // params - mapping parameters. // Output: // outFilePtr - where to print alignments for subreads. 
// unalignedFilePtr - where to print sequences for unaligned subreads. void PrintAllReadAlignments(ReadAlignments & allReadAlignments, AlignmentContext & alignmentContext, ostream & outFilePtr, ostream & unalignedFilePtr, MappingParameters & params, vector & subreads, #ifdef USE_PBBAM PacBio::BAM::IRecordWriter * bamWriterPtr, #endif MappingSemaphores & semaphores); #include "BlasrUtilsImpl.hpp" blasr-smrtanalysis-4.0.0/iblasr/BlasrUtilsImpl.hpp000066400000000000000000001347761302464523700222700ustar00rootroot00000000000000// Author: Mark Chaisson #pragma once #include "BlasrAlign.hpp" //----------------------MODIFY ALIGNMENTS--------------------------// void AssignRefContigLocation(T_AlignmentCandidate &alignment, SequenceIndexDatabase &seqdb, DNASequence &genome) { // // If the sequence database is used, the start position of // the alignment is relative to the start of the chromosome, // not the entire index. Subtract off the start position of // the chromosome to get the true position. // DNALength forwardTPos; int seqDBIndex; if (alignment.tStrand == 0) { forwardTPos = alignment.tAlignedSeqPos; seqDBIndex = seqdb.SearchForIndex(forwardTPos); alignment.tAlignedSeqPos -= seqdb.seqStartPos[seqDBIndex]; } else { // // Flip coordinates into forward strand in order to find the boundaries // of the contig, then reverse them in order to find offset. // // Find the reverse complement coordinate of the index of the last aligned base. assert(alignment.tAlignedSeqLength > 0); forwardTPos = genome.MakeRCCoordinate(alignment.tAlignedSeqPos + alignment.tAlignedSeqLength - 1); seqDBIndex = seqdb.SearchForIndex(forwardTPos); // // Find the reverse comlement coordinate of the last base of this // sequence. This would normally be the start of the next contig // -1 to get the length, but since an 'N' is added between every // pair of sequences, this is -2. 
// DNALength reverseTOffset; reverseTOffset = genome.MakeRCCoordinate(seqdb.seqStartPos[seqDBIndex+1]-2); alignment.tAlignedSeqPos -= reverseTOffset; } } void AssignRefContigLocations(vector &alignmentPtrs, SequenceIndexDatabase &seqdb, DNASequence &genome) { UInt i; for (i = 0; i < alignmentPtrs.size(); i++) { T_AlignmentCandidate *aref = alignmentPtrs[i]; AssignRefContigLocation(*aref, seqdb, genome); } } template void AssignGenericRefContigName(vector &alignmentPtrs, T_RefSequence &genome) { UInt i; for (i = 0; i < alignmentPtrs.size(); i++) { T_AlignmentCandidate *aref = alignmentPtrs[i]; aref->tName = genome.title; } } void StoreRankingStats(vector &alignments, VarianceAccumulator &accumPValue, VarianceAccumulator &accumWeight) { int i; for (i = 0; i < int(alignments.size()); i++) { alignments[i]->pvalVariance = accumPValue.GetVariance(); alignments[i]->pvalNStdDev = accumPValue.GetNStdDev(alignments[i]->clusterScore); alignments[i]->weightVariance = accumWeight.GetVariance(); alignments[i]->weightNStdDev = accumWeight.GetNStdDev(alignments[i]->clusterWeight); } } void AssignMapQV(vector &alignmentPtrs) { int i; int mapQV = 1; if (alignmentPtrs.size() > 1 and alignmentPtrs[0]->score == alignmentPtrs[1]->score) { // the top two alignments have the same score, don't consider them as mapped. mapQV = 0; } for (i = 0; i < int(alignmentPtrs.size()); i++) { alignmentPtrs[i]->mapQV = mapQV; } } void ScaleMapQVByClusterSize(T_AlignmentCandidate &alignment, MappingParameters ¶ms) { if (alignment.numSignificantClusters > int(params.nCandidates)) { alignment.mapQV = Phred((1-InversePhred(alignment.mapQV))* ((float)params.nCandidates / alignment.numSignificantClusters)); } else if (alignment.numSignificantClusters == 0) { alignment.mapQV = 0; } } void StoreMapQVs(SMRTSequence &read, vector &alignmentPtrs, MappingParameters ¶ms) { // // Only weight alignments for mapqv against eachother if they are overlapping. 
// int a; vector > partitions; // Each set contains alignments that overlap on the read. DistanceMatrixScoreFunction distScoreFn; distScoreFn.del = params.deletion; distScoreFn.ins = params.insertion; // bug 24363, set affineOpen and affineExtend for distScoreFn distScoreFn.affineOpen = params.affineOpen; distScoreFn.affineExtend = params.affineExtend; distScoreFn.InitializeScoreMatrix(SMRTLogProbMatrix); IDSScoreFunction idsScoreFn; idsScoreFn.ins = params.insertion; idsScoreFn.del = params.deletion; idsScoreFn.affineExtend = params.affineExtend; idsScoreFn.affineOpen = params.affineOpen; idsScoreFn.substitutionPrior = params.substitutionPrior; idsScoreFn.globalDeletionPrior = params.globalDeletionPrior; // // Rescore the alignment so that it uses probabilities. // for (a = 0; a < int(alignmentPtrs.size()); a++) { if (params.ignoreQualities == false) { // bug 24363, pass -affineAlign to compute correct alignment score. alignmentPtrs[a]->probScore = -ComputeAlignmentScore(*alignmentPtrs[a], alignmentPtrs[a]->qAlignedSeq, alignmentPtrs[a]->tAlignedSeq, idsScoreFn, params.affineAlign) / 10.0; } else { alignmentPtrs[a]->probScore = -ComputeAlignmentScore(*alignmentPtrs[a], alignmentPtrs[a]->qAlignedSeq, alignmentPtrs[a]->tAlignedSeq, distScoreFn, params.affineAlign) / 10.0; } } PartitionOverlappingAlignments(alignmentPtrs, partitions, params.minFractionToBeConsideredOverlapping); int p; set::iterator partIt, partEnd; // // For each partition, store where on the read it begins, and where // it ends. 
// vector partitionBeginPos, partitionEndPos; partitionBeginPos.resize(partitions.size()); partitionEndPos.resize(partitions.size()); fill(partitionBeginPos.begin(), partitionBeginPos.end(), -1); fill(partitionEndPos.begin(), partitionEndPos.end(), -1); vector assigned; assigned.resize( alignmentPtrs.size()); fill(assigned.begin(), assigned.end(), false); for (p = 0; p < int(partitions.size()); p++) { partEnd = partitions[p].end(); int alnStart, alnEnd; if (partitions[p].size() > 0) { partIt = partitions[p].begin(); alignmentPtrs[*partIt]->GetQInterval(alnStart, alnEnd); partitionBeginPos[p] = alnStart; partitionEndPos[p] = alnEnd; ++partIt; partEnd = partitions[p].end(); for (; partIt != partEnd; ++partIt) { // Comment out because all reads are now in the forward strand. // alignmentPtrs[*partIt]->GetQInterval(alnStart, alnEnd, convertToForwardStrand); alignmentPtrs[*partIt]->GetQInterval(alnStart, alnEnd); if (alnEnd - alnStart > partitionEndPos[p] - partitionBeginPos[p]) { partitionBeginPos[p] = alnStart; partitionEndPos[p] = alnEnd; } } } } // // For each partition, determine the widest parts of the read that // are aligned in the partition. All alignments will be extended to // the end of the widest parts of the partition. // const static bool convertToForwardStrand = true; UInt i; // // For now, just use the alignment score as the probability score. // Although it is possible to use the full forward probability, for // the most part it is pretty much the same as the Vitterbi // probability, but it takes a lot longer to compute. // // // Now estimate what the alignment scores would be if they were // extended past the ends of their current alignment. 
// for (p = 0; p < int(partitions.size()); p++) { partEnd = partitions[p].end(); int alnStart, alnEnd; for (partIt = partitions[p].begin(); partitions[p].size() > 0 and partIt != partEnd; ++partIt) { int mismatchSum = 0; alignmentPtrs[*partIt]->GetQInterval(alnStart, alnEnd, convertToForwardStrand); if (alnStart - partitionBeginPos[p] > MAPQV_END_ALIGN_WIGGLE or partitionEndPos[p] - alnEnd > MAPQV_END_ALIGN_WIGGLE) { // bug 24363, use updated SumMismatches to compute mismatch score when // no QV is available. SumMismatches(read, *alignmentPtrs[*partIt], 15, partitionBeginPos[p], partitionEndPos[p], params, mismatchSum); } // // Random sequence can be aligned with about 50% similarity due // to optimization, so weight the qv sum // alignmentPtrs[*partIt]->probScore += -(mismatchSum) * 0.5; } } // // Determine mapqv by summing qvscores in partitions float mapQVDenominator = 0; for (p = 0; p < int(partitions.size()); p++) { set::iterator nextIt; if (partitions[p].size() == 0) { continue; } int index = *partitions[p].begin(); mapQVDenominator = alignmentPtrs[index]->probScore; if (partitions[p].size() > 1) { partIt = partitions[p].begin(); partEnd = partitions[p].end(); ++partIt; for (; partIt != partEnd; ++partIt) { index = *partIt; mapQVDenominator = LogSumOfTwo(mapQVDenominator, alignmentPtrs[index]->probScore); } } for (partIt = partitions[p].begin(); partIt != partitions[p].end(); ++partIt) { // // If only one alignment is found, assume maximum mapqv. // assigned[*partIt] = true; if (partitions[p].size() == 1) { alignmentPtrs[*partIt]->mapQV = MAX_PHRED_SCORE; } // // Look for overflow. 
// else if (alignmentPtrs[*partIt]->probScore - mapQVDenominator < -20) { alignmentPtrs[*partIt]->mapQV = 0; } else { double log10 = log(10); double sub = alignmentPtrs[*partIt]->probScore - mapQVDenominator; double expo = exp(log10*sub); double diff = 1.0 - expo; int phredValue; if (expo == 0) { phredValue = 0; } else if (diff == 0) { phredValue = MAX_PHRED_SCORE; } else { phredValue = Phred(diff); } if (phredValue > MAX_PHRED_SCORE) { phredValue = MAX_PHRED_SCORE; } alignmentPtrs[*partIt]->mapQV = phredValue; assigned[*partIt]=true; } if (params.scaleMapQVByNumSignificantClusters) { ScaleMapQVByClusterSize(*alignmentPtrs[*partIt], params); } } } for (i = 0; i < assigned.size(); i++) { assert(assigned[i]); } } //--------------------SEARCH & CHECK ALIGNMENTS-------------------// template bool CheckForSufficientMatch(T_Sequence &read, vector &alignmentPtrs, MappingParameters ¶ms) { (void)(read); if (alignmentPtrs.size() > 0 and alignmentPtrs[0]->score < params.maxScore) { return true; } else { return false; } } int FindMaxLengthAlignment(vector alignmentPtrs, int &maxLengthIndex) { int i; int maxLength = 0; maxLengthIndex = -1; for (i = 0; i < int(alignmentPtrs.size()); i++) { int qStart, qEnd; alignmentPtrs[i]->GetQInterval(qStart, qEnd); if (qEnd - qStart > maxLength) { maxLengthIndex = i; maxLength = qEnd - qStart; } } return (maxLength != -1); } void SumMismatches(SMRTSequence &read, T_AlignmentCandidate &alignment, int mismatchScore, int fullIntvStart, int fullIntvEnd, MappingParameters ¶ms, int &sum) { int alnStart, alnEnd; alignment.GetQIntervalOnForwardStrand(alnStart, alnEnd); int p; sum = 0; if (not params.ignoreQualities and read.substitutionQV.Empty() == false) { for (p = fullIntvStart; p < alnStart; p++) { sum += read.substitutionQV[p]; } for (p = alnEnd; p < fullIntvEnd; p++) { sum += read.substitutionQV[p]; } } else { // bug 24363, compute mismatch score when QV is not available. 
sum += mismatchScore * ((alnStart - fullIntvStart) + (fullIntvEnd - alnEnd)); } } bool AlignmentsOverlap(T_AlignmentCandidate &alnA, T_AlignmentCandidate &alnB, float minPercentOverlap) { int alnAStart, alnAEnd, alnBStart, alnBEnd; bool useForwardStrand=true; alnA.GetQInterval(alnAStart, alnAEnd, useForwardStrand); alnB.GetQInterval(alnBStart, alnBEnd, useForwardStrand); // Look if one alignment encompasses the other int ovp = 0; if (alnAStart <= alnBStart and alnAEnd >= alnBEnd) { return true; } else if (alnBStart <= alnAStart and alnBEnd >= alnAEnd) { return true; //ovp = alnAEnd - alnAStart; } else { // // Look to see if the alignments overlap // if (alnAEnd >= alnBStart and alnAEnd <= alnBEnd) { ovp = alnAEnd - alnBStart; } else if (alnAStart >= alnBStart and alnAStart <= alnBEnd) { ovp = alnBEnd - alnAStart; } } // float ovpPercent = (2.0*ovp) / ((alnAEnd - alnAStart) + (alnBEnd - alnBStart)); float ovpPercent = 0; if (alnAEnd - alnAStart > 0 and alnBEnd - alnBStart > 0) { // overlap percentage: maximum overlap percent in A and B. ovpPercent = max(float(ovp)/float(alnAEnd - alnAStart), float(ovp)/float(alnBEnd - alnBStart)); } // returns true when an overlap is found. 
return (ovpPercent > minPercentOverlap); } void PartitionOverlappingAlignments(vector &alignmentPtrs, vector > &partitions, float minOverlap) { if (alignmentPtrs.size() == 0) { partitions.clear(); return; } set::iterator setIt, setEnd; int i, p; bool overlapFound = false; for (i = 0; i < int(alignmentPtrs.size()); i++) { overlapFound = false; for (p = 0; p < int(partitions.size()) and overlapFound == false; p++) { setEnd = partitions[p].end(); for (setIt = partitions[p].begin(); setIt != partitions[p].end() and overlapFound == false; ++setIt) { if (AlignmentsOverlap(*alignmentPtrs[i], *alignmentPtrs[*setIt], minOverlap) or ((alignmentPtrs[i]->QAlignStart() <= alignmentPtrs[*setIt]->QAlignStart()) and (alignmentPtrs[i]->QAlignEnd() > alignmentPtrs[*setIt]->QAlignEnd()))) { partitions[p].insert(i); overlapFound = true; } } } // // If this alignment does not overlap any other, create a // partition with it as the first element. // if (overlapFound == false) { partitions.push_back(set()); partitions[partitions.size()-1].insert(i); } } } //--------------------FILTER ALIGNMENTS---------------------------// int RemoveLowQualitySDPAlignments(int readLength, vector &alignmentPtrs, MappingParameters ¶ms) { // Just a hack. For now, assume there is at least 1 match per 50 bases. 
int totalBasesMatched = 0; int a; for (a = 0; a < int(alignmentPtrs.size()); a++) { int b; for (b = 0; b < int(alignmentPtrs[a]->blocks.size()); b++) { totalBasesMatched += alignmentPtrs[a]->blocks[b].length; } int expectedMatches = params.sdpTupleSize/50.0 * readLength; if (totalBasesMatched < expectedMatches) { delete alignmentPtrs[a]; alignmentPtrs[a] = NULL; } } int packedAlignmentIndex = 0; for (a = 0; a < int(alignmentPtrs.size()); a++) { if (alignmentPtrs[a] != NULL) { alignmentPtrs[packedAlignmentIndex] = alignmentPtrs[a]; packedAlignmentIndex++; } } alignmentPtrs.resize(packedAlignmentIndex); return packedAlignmentIndex; } template int RemoveLowQualityAlignments(T_Sequence &read, vector &alignmentPtrs, MappingParameters ¶ms) { PB_UNUSED(read); if (params.verbosity > 0) { cout << "checking at least " << alignmentPtrs.size() << " alignments to see if they are accurate." << endl; } for (size_t i = 0; i < MIN(static_cast(params.nCandidates), alignmentPtrs.size()); i++) { if (params.verbosity > 0) { cout << "Quality check " << i << " " << alignmentPtrs[i]->score << endl; } if (alignmentPtrs[i]->blocks.size() == 0 or alignmentPtrs[i]->score > params.maxScore) { // // Since the alignments are sorted according to alignment // score, once one of the alignments is too low of a score, // all remaining alignments are also too low, and should be // removed as well. Do that all at once. // if (alignmentPtrs[i]->blocks.size() == 0 and params.verbosity > 0) { cout << "Removing empty alignment " << alignmentPtrs[i]->qName << endl; } if (params.verbosity > 0) { cout << alignmentPtrs[i]->qName << " alignment " << i << " is too low of a score." 
<< alignmentPtrs[i]->score << endl; } for (size_t deletedIndex = i; deletedIndex < alignmentPtrs.size(); deletedIndex++) { delete alignmentPtrs[deletedIndex]; alignmentPtrs[deletedIndex] = NULL; } alignmentPtrs.erase(i + alignmentPtrs.begin(), alignmentPtrs.end()); break; } else { if (params.verbosity > 0) { cout << "Keeping alignment " << i << " " << alignmentPtrs[i]->qPos << " " << alignmentPtrs[i]->qLength << " " << alignmentPtrs[i]->tName << " " << alignmentPtrs[i]->tPos << " " << alignmentPtrs[i]->tLength << " from score: " << alignmentPtrs[i]->score << endl; } } } return alignmentPtrs.size(); } //FIXME: move to class ReadAlignments int RemoveOverlappingAlignments(vector &alignmentPtrs, MappingParameters ¶ms) { vector alignmentIsContained; alignmentIsContained.resize(alignmentPtrs.size()); std::fill(alignmentIsContained.begin(), alignmentIsContained.end(), false); int j; int numContained = 0; int curNotContained = 0; if (alignmentPtrs.size() > 0) { UInt i; for (i = 0; i < alignmentPtrs.size()-1; i++ ){ T_AlignmentCandidate *aref = alignmentPtrs[i]; if (aref->pctSimilarity < params.minPctSimilarity) { continue; } for (j = i + 1; j < int(alignmentPtrs.size()); j++ ){ // // Make sure this alignment isn't already removed. // if (alignmentIsContained[j]) { continue; } // // Only check for containment if the two sequences are from the same contig. // if (alignmentPtrs[i]->tIndex != alignmentPtrs[j]->tIndex) { continue; } // // Check for an alignment that is fully overlapping another // alignment. if (aref->GenomicTBegin() <= alignmentPtrs[j]->GenomicTBegin() and aref->GenomicTEnd() >= alignmentPtrs[j]->GenomicTEnd() and alignmentPtrs[i]->tIndex == alignmentPtrs[j]->tIndex) { // // Alignment i is contained in j is only true if it has a worse score. 
// if (aref->score <= alignmentPtrs[j]->score) { alignmentIsContained[j] = true; } if (params.verbosity >= 2) { cout << "alignment " << i << " is contained in " << j << endl; cout << aref->tAlignedSeqPos << " " << alignmentPtrs[j]->tAlignedSeqPos << " " << aref->tAlignedSeqPos + aref->tAlignedSeqLength << " " << alignmentPtrs[j]->tAlignedSeqPos + alignmentPtrs[j]->tAlignedSeqLength << endl; } } else if (alignmentPtrs[j]->GenomicTBegin() <= aref->GenomicTBegin() and alignmentPtrs[j]->GenomicTEnd() >= aref->GenomicTEnd() and alignmentPtrs[i]->tIndex == alignmentPtrs[j]->tIndex) { if (params.verbosity >= 2) { cout << "ALIGNMENT " << j << " is contained in " << i << endl; cout << alignmentPtrs[j]->tAlignedSeqPos << " " << aref->tAlignedSeqPos << " " << alignmentPtrs[j]->tAlignedSeqPos + alignmentPtrs[j]->tAlignedSeqLength << " " << aref->tAlignedSeqPos + aref->tAlignedSeqLength << endl; } if (alignmentPtrs[j]->score <= aref->score) { alignmentIsContained[i] = true; } } } } for (i = 0; i < alignmentPtrs.size(); i++) { T_AlignmentCandidate *aref = alignmentPtrs[i]; if (alignmentIsContained[i]) { delete alignmentPtrs[i]; alignmentPtrs[i] = NULL; numContained++; } else { alignmentPtrs[curNotContained] = aref; ++curNotContained; } } alignmentPtrs.resize(alignmentPtrs.size() - numContained); } return alignmentPtrs.size(); } // Delete all alignments from index startIndex in vector, inclusive. 
void DeleteAlignments(vector &alignmentPtrs, int startIndex) { int i; for (i = startIndex; i < int(alignmentPtrs.size()); i++ ) { delete alignmentPtrs[i]; } alignmentPtrs.resize(0); } //--------------------REFINE ALIGNMENTS---------------------------// template void RefineAlignment(vector &bothQueryStrands, T_RefSequence &genome, T_AlignmentCandidate &alignmentCandidate, MappingParameters ¶ms, MappingBuffers &mappingBuffers) { (void)(genome); FASTQSequence qSeq; DNASequence tSeq; DistanceMatrixScoreFunction distScoreFn( SMRTDistanceMatrix, params.deletion, params.insertion); DistanceMatrixScoreFunction distScoreFn2( SMRTDistanceMatrix, params.indel, params.indel); QualityValueScoreFunction scoreFn; IDSScoreFunction idsScoreFn; idsScoreFn.InitializeScoreMatrix(SMRTDistanceMatrix); scoreFn.del = params.indel; scoreFn.ins = params.indel; idsScoreFn.ins = params.insertion; idsScoreFn.del = params.deletion; idsScoreFn.affineExtend = params.affineExtend; idsScoreFn.affineOpen = params.affineOpen; idsScoreFn.substitutionPrior = params.substitutionPrior; idsScoreFn.globalDeletionPrior = params.globalDeletionPrior; if (params.doGlobalAlignment) { SMRTSequence subread; subread.ReferenceSubstring(*bothQueryStrands[0], bothQueryStrands[0]->SubreadStart(), (bothQueryStrands[0]->SubreadLength())); int drift = ComputeDrift(alignmentCandidate); T_AlignmentCandidate refinedAlignment; KBandAlign(subread, alignmentCandidate.tAlignedSeq, SMRTDistanceMatrix, params.insertion, params.deletion, drift, mappingBuffers.scoreMat, mappingBuffers.pathMat, refinedAlignment, idsScoreFn, Global); refinedAlignment.RemoveEndGaps(); ComputeAlignmentStats(refinedAlignment, subread.seq, alignmentCandidate.tAlignedSeq.seq, distScoreFn2); //idsScoreFn); alignmentCandidate.blocks = refinedAlignment.blocks; alignmentCandidate.gaps = refinedAlignment.gaps; alignmentCandidate.tPos = refinedAlignment.tPos; alignmentCandidate.qPos = refinedAlignment.qPos + bothQueryStrands[0]->SubreadStart(); 
alignmentCandidate.score = refinedAlignment.score; subread.Free(); } else if (params.useGuidedAlign) { T_AlignmentCandidate refinedAlignment; int lastBlock = alignmentCandidate.blocks.size() - 1; if (alignmentCandidate.blocks.size() > 0) { /* * Refine the alignment without expanding past the current * boundaries of the sequences that are already aligned. */ // // NOTE** this only makes sense when // alignmentCandidate.blocks[0].tPos == 0. Otherwise the length // of the sequence is not correct. // tSeq.Copy(alignmentCandidate.tAlignedSeq, alignmentCandidate.tPos, (alignmentCandidate.blocks[lastBlock].tPos + alignmentCandidate.blocks[lastBlock].length - alignmentCandidate.blocks[0].tPos)); // qSeq.ReferenceSubstring(alignmentCandidate.qAlignedSeq, qSeq.ReferenceSubstring(*bothQueryStrands[0], alignmentCandidate.qAlignedSeqPos + alignmentCandidate.qPos, (alignmentCandidate.blocks[lastBlock].qPos + alignmentCandidate.blocks[lastBlock].length)); if (!params.ignoreQualities && ReadHasMeaningfulQualityValues(alignmentCandidate.qAlignedSeq)) { if (params.affineAlign) { AffineGuidedAlign(qSeq, tSeq, alignmentCandidate, idsScoreFn, params.bandSize, mappingBuffers, refinedAlignment, Global, false); } else { GuidedAlign(qSeq, tSeq, alignmentCandidate, idsScoreFn, params.guidedAlignBandSize, mappingBuffers, refinedAlignment, Global, false); } } else { if (params.affineAlign) { AffineGuidedAlign(qSeq, tSeq, alignmentCandidate, distScoreFn, params.bandSize, mappingBuffers, refinedAlignment, Global, false); } else { GuidedAlign(qSeq, tSeq, alignmentCandidate, distScoreFn, params.guidedAlignBandSize, mappingBuffers, refinedAlignment, Global, false); } } ComputeAlignmentStats(refinedAlignment, qSeq.seq, tSeq.seq, distScoreFn2, params.affineAlign); // // Copy the refine alignment, which may be a subsequence of the // alignmentCandidate into the alignment candidate. // // First copy the alignment block and gap (the description of // the base by base alignment). 
alignmentCandidate.blocks.clear(); alignmentCandidate.blocks = refinedAlignment.blocks; alignmentCandidate.CopyStats(refinedAlignment); alignmentCandidate.gaps = refinedAlignment.gaps; alignmentCandidate.score = refinedAlignment.score; alignmentCandidate.nCells = refinedAlignment.nCells; // Next copy the information that describes what interval was // aligned. Since the reference sequences of the alignment // candidate have been modified, they are reassigned. alignmentCandidate.tAlignedSeq.Free(); alignmentCandidate.tAlignedSeq.TakeOwnership(tSeq); alignmentCandidate.ReassignQSequence(qSeq); alignmentCandidate.tAlignedSeqPos += alignmentCandidate.tPos; alignmentCandidate.qAlignedSeqPos += alignmentCandidate.qPos; // // tPos and qPos are the positions within the interval where the // alignment begins. The refined alignment has adifferent tPos // and qPos from the alignment candidate. alignmentCandidate.tPos = refinedAlignment.tPos; alignmentCandidate.qPos = refinedAlignment.qPos; // The lengths of the newly aligned sequences may differ, update those. alignmentCandidate.tAlignedSeqLength = tSeq.length; alignmentCandidate.qAlignedSeqLength = qSeq.length; } } else { // // This assumes an SDP alignment has been performed to create 'alignmentCandidate'. // // Recompute the alignment using a banded smith waterman to // get rid of any spurious effects of usign the seeded gaps. // // // The k-banded alignment is over a subsequence of the first // (sparse dynamic programming, SDP) alignment. The SDP // alignment is over a large window that may contain the // candidate sequence. The k-band alignment is over a tighter // region. int drift = ComputeDrift(alignmentCandidate); // // Rescore the alignment with a banded alignment that has a // better model of sequencing error. 
// if (alignmentCandidate.blocks.size() == 0 ){ alignmentCandidate.score = 0; return; } int lastBlock = alignmentCandidate.blocks.size() - 1; // // Assign the sequences that are going to be realigned using // banded alignment. The SDP alignment does not give that great // of a score, but it does do a good job at finding a backbone // alignment that closely defines the sequence that is aligned. // Reassign the subsequences for alignment with a tight bound // around the beginning and ending of each sequence, so that // global banded alignment may be performed. // // // This section needs to be cleaned up substantially. Right now it // copies a substring from the ref to a temp, then from the temp // back to the ref. It may be possible to just keep one pointer per // read to the memory that was allocated, then allow the seq // parameter to float around. The reason for all the copying is // that in case there is a compressed version of the genome the // seqences must be transformed before alignment. 
// if (alignmentCandidate.qIsSubstring) { qSeq.ReferenceSubstring(*bothQueryStrands[0], // the original sequence alignmentCandidate.qPos + alignmentCandidate.qAlignedSeqPos, alignmentCandidate.blocks[lastBlock].qPos + alignmentCandidate.blocks[lastBlock].length); } else { qSeq.ReferenceSubstring(alignmentCandidate.qAlignedSeq, // the subsequence that the alignment points to alignmentCandidate.qPos + alignmentCandidate.qAlignedSeqPos, alignmentCandidate.blocks[lastBlock].qPos + alignmentCandidate.blocks[lastBlock].length - alignmentCandidate.blocks[0].qPos); } tSeq.Copy(alignmentCandidate.tAlignedSeq, // the subsequence the alignment points to alignmentCandidate.tPos, // ofset into the subsequence alignmentCandidate.blocks[lastBlock].tPos + alignmentCandidate.blocks[lastBlock].length - alignmentCandidate.blocks[0].tPos); T_AlignmentCandidate refinedAlignment; // // When the parameter bandSize is 0, set the alignment band size // to the drift off the diagonal, plus a little more for wiggle // room. When the parameteris nonzero, use that as a fixed band. 
// int k; if (params.bandSize == 0) { k = abs(drift) * 1.5; } else { k = params.bandSize; } if (params.verbosity > 0) { cout << "drift: " << drift << " qlen: " << alignmentCandidate.qAlignedSeq.length << " tlen: " << alignmentCandidate.tAlignedSeq.length << " k: " << k << endl; cout << "aligning in " << k << " * " << alignmentCandidate.tAlignedSeq.length << " " << k * alignmentCandidate.tAlignedSeq.length << endl; } if (k < 10) { k = 10; } alignmentCandidate.tAlignedSeqPos += alignmentCandidate.tPos; VectorIndex lastSDPBlock = alignmentCandidate.blocks.size() - 1; if (alignmentCandidate.blocks.size() > 0) { alignmentCandidate.tAlignedSeqLength = (alignmentCandidate.blocks[lastSDPBlock].tPos + alignmentCandidate.blocks[lastSDPBlock].length - alignmentCandidate.blocks[0].tPos); } else { alignmentCandidate.tAlignedSeqLength = 0; } alignmentCandidate.tPos = 0; alignmentCandidate.qAlignedSeqPos += alignmentCandidate.qPos; if (alignmentCandidate.blocks.size() > 0) { alignmentCandidate.qAlignedSeqLength = (alignmentCandidate.blocks[lastSDPBlock].qPos + alignmentCandidate.blocks[lastSDPBlock].length - alignmentCandidate.blocks[0].qPos); } else { alignmentCandidate.qAlignedSeqLength = 0; } alignmentCandidate.qPos = 0; alignmentCandidate.blocks.clear(); alignmentCandidate.tAlignedSeq.Free(); alignmentCandidate.tAlignedSeq.TakeOwnership(tSeq); alignmentCandidate.ReassignQSequence(qSeq); if (params.verbosity >= 2) { cout << "refining target: " << endl; alignmentCandidate.tAlignedSeq.PrintSeq(cout); cout << "refining query: " << endl; static_cast(&alignmentCandidate.qAlignedSeq)->PrintSeq(cout); cout << endl; } PairwiseLocalAlign(qSeq, tSeq, k, params, alignmentCandidate, mappingBuffers, Fit); } } template void RefineAlignments(vector &bothQueryStrands, T_RefSequence &genome, vector &alignmentPtrs, MappingParameters ¶ms, MappingBuffers &mappingBuffers) { UInt i; for (i = 0; i < alignmentPtrs.size(); i++ ) { RefineAlignment(bothQueryStrands, genome, *alignmentPtrs[i], params, 
mappingBuffers); } // // It's possible the alignment references change their order after running // the local alignments. This is made into a parameter rather than resorting // every time so that the performance gain by resorting may be measured. // if (params.sortRefinedAlignments) { std::sort(alignmentPtrs.begin(), alignmentPtrs.end(), SortAlignmentPointersByScore()); } } vector SelectAlignmentsToPrint(vector alignmentPtrs, MappingParameters & params, const int & associatedRandInt) { if (params.placeRandomly) {assert(params.hitPolicy.IsRandombest());} if (alignmentPtrs.size() == 0) {return vector({});} std::sort(alignmentPtrs.begin(), alignmentPtrs.end(), SortAlignmentPointersByScore()); // Apply filter criteria and hit policy. // Shallow copy AlignmentCandidate pointers. vector filtered; for (auto ptr: alignmentPtrs) { if (params.filterCriteria.Satisfy(ptr)) { filtered.push_back(ptr); if (int(filtered.size()) == params.nBest) break; } } return params.hitPolicy.Apply(filtered, false, associatedRandInt); } // The full read is not the subread, and does not have masked off characters. 
void PrintAlignment(T_AlignmentCandidate &alignment, SMRTSequence &fullRead, MappingParameters ¶ms, AlignmentContext &alignmentContext, ostream &outFile #ifdef USE_PBBAM , SMRTSequence & subread , PacBio::BAM::IRecordWriter * bamWriterPtr #endif ) { try { if (params.printFormat == StickPrint) { PrintAlignmentStats(alignment, outFile); StickPrintAlignment(alignment, (DNASequence&) alignment.qAlignedSeq, (DNASequence&) alignment.tAlignedSeq, outFile, alignment.qAlignedSeqPos, alignment.tAlignedSeqPos); } else if (params.printFormat == SAM) { SAMOutput::PrintAlignment(alignment, fullRead, outFile, alignmentContext, params.samQVList, params.clipping, params.cigarUseSeqMatch, params.allowAdjacentIndels); } else if (params.printFormat == BAM) { #ifdef USE_PBBAM BAMOutput::PrintAlignment(alignment, fullRead, subread, *bamWriterPtr, alignmentContext, params.samQVList, params.clipping, params.cigarUseSeqMatch, params.allowAdjacentIndels); #else REQUIRE_PBBAM_ERROR(); #endif } else if (params.printFormat == CompareXML) { XMLOutput::Print(alignment, (DNASequence&) alignment.qAlignedSeq, (DNASequence&) alignment.tAlignedSeq, outFile, alignment.qAlignedSeqPos, alignment.tAlignedSeqPos); } else if (params.printFormat == Vulgar) { PrintAlignmentStats(alignment, outFile); VulgarOutput::Print(alignment, outFile); } else if (params.printFormat == CompareSequencesParsable) { CompareSequencesOutput::Print(alignment, alignment.qAlignedSeq, alignment.tAlignedSeq, outFile); } else if (params.printFormat == Interval) { if (alignment.blocks.size() > 0) { IntervalOutput::Print(alignment, outFile); } } else if (params.printFormat == SummaryPrint) { if (alignment.blocks.size() > 0) { SummaryOutput::Print(alignment, outFile); } } } catch (ostream::failure f) { cout << "ERROR writing to output file. The output drive may be full, or you " << endl; cout << "may not have proper write permissions." 
<< endl; exit(1); } } // Print all alignments in vector alignmentPtrs void PrintAlignments(vector alignmentPtrs, SMRTSequence &read, MappingParameters ¶ms, ostream &outFile, AlignmentContext alignmentContext, #ifdef USE_PBBAM SMRTSequence &subread, PacBio::BAM::IRecordWriter * bamWriterPtr, #endif MappingSemaphores & semaphores) { if (params.nProc > 1) { #ifdef __APPLE__ sem_wait(semaphores.writer); #else sem_wait(&semaphores.writer); #endif } for (int i = 0; i < int(alignmentPtrs.size()); i++) { T_AlignmentCandidate *aref = alignmentPtrs[i]; if (aref->blocks.size() == 0) { // // If the SDP alignment finds nothing, there will be no // blocks. This may happen if the sdp block size is larger // than the anchor size found with the suffix array. When no // blocks are found there is no alignment, so zero-out the // score and continue. // aref->score = 0; if (params.verbosity > 0) { cout << "Zero blocks found for " << aref->qName << " " << aref->qAlignedSeqPos << " " << aref->tAlignedSeqPos << endl; } continue; } // // Configure some of the alignment context before printing. 
// if (i > 0 and params.placeRandomly == false) { alignmentContext.isPrimary = false; } else { alignmentContext.isPrimary = true; } if (params.printSAM or params.printBAM) { DistanceMatrixScoreFunction editdistScoreFn(EditDistanceMatrix, 1, 1); T_AlignmentCandidate & alignment = *alignmentPtrs[i]; alignmentContext.editDist = ComputeAlignmentScore(alignment, alignment.qAlignedSeq, alignment.tAlignedSeq, editdistScoreFn); } PrintAlignment(*alignmentPtrs[i], read, params, alignmentContext, outFile #ifdef USE_PBBAM , subread , bamWriterPtr #endif ); } if (params.nProc > 1) { #ifdef __APPLE__ sem_post(semaphores.writer); #else sem_post(&semaphores.writer); #endif } } void PrintAlignmentPtrs(vector & alignmentPtrs, ostream & out) { for(int alignmentIndex = 0; alignmentIndex < int(alignmentPtrs.size()); alignmentIndex++) { out << "["<< alignmentIndex << "/" << alignmentPtrs.size() << "]" << endl; T_AlignmentCandidate *alignment = alignmentPtrs[alignmentIndex]; alignment->Print(out); } out << endl; } void PrintUnaligned(const SMRTSequence & unalignedRead, ostream & unalignedFilePtr, const bool noPrintUnalignedSeqs) { if (noPrintUnalignedSeqs) { string s = unalignedRead.GetTitle(); SMRTTitle st(s); if (st.isSMRTTitle) unalignedFilePtr << st.ToString() << endl; else //size_t pos = s.rfind("/"); //if (pos != string::npos) // unalignedFilePtr << s.substr(0, pos) << std::endl; //else unalignedFilePtr << s << std::endl; } else unalignedRead.PrintSeq(unalignedFilePtr); } // Print all alignments for subreads in allReadAlignments. // Input: // allReadAlignments - contains a set of subreads, each of which // is associated with a group of alignments. // alignmentContext - an alignment context of each subread used // for printing in SAM format. // params - mapping parameters. // Output: // outFilePtr - where to print alignments for subreads. // unalignedFilePtr - where to print sequences for unaligned subreads. 
void PrintAllReadAlignments(ReadAlignments & allReadAlignments, AlignmentContext & alignmentContext, ostream & outFilePtr, ostream & unalignedFilePtr, MappingParameters & params, vector & subreads, #ifdef USE_PBBAM PacBio::BAM::IRecordWriter * bamWriterPtr, #endif MappingSemaphores & semaphores) { int subreadIndex; int nAlignedSubreads = allReadAlignments.GetNAlignedSeq(); // // Initialize the alignemnt context with information applicable to SAM output. // alignmentContext.alignMode = allReadAlignments.alignMode; for (subreadIndex = 0; subreadIndex < nAlignedSubreads; subreadIndex++) { if (allReadAlignments.subreadAlignments[subreadIndex].size() > 0) { alignmentContext.numProperlyAlignedSubreads++; } } if (alignmentContext.numProperlyAlignedSubreads == int(allReadAlignments.subreadAlignments.size())) { alignmentContext.allSubreadsProperlyAligned = true; } alignmentContext.nSubreads = nAlignedSubreads; for (subreadIndex = 0; subreadIndex < nAlignedSubreads; subreadIndex++) { alignmentContext.subreadIndex = subreadIndex; if (subreadIndex < nAlignedSubreads-1 and allReadAlignments.subreadAlignments[subreadIndex+1].size() > 0) { alignmentContext.nextSubreadPos = allReadAlignments.subreadAlignments[subreadIndex+1][0]->QAlignStart(); alignmentContext.nextSubreadDir = allReadAlignments.subreadAlignments[subreadIndex+1][0]->qStrand; alignmentContext.rNext = allReadAlignments.subreadAlignments[subreadIndex+1][0]->tName; alignmentContext.hasNextSubreadPos = true; } else { alignmentContext.nextSubreadPos = 0; alignmentContext.nextSubreadDir = 0; alignmentContext.rNext = ""; alignmentContext.hasNextSubreadPos = false; } SMRTSequence * sourceSubread = &(allReadAlignments.subreads[subreadIndex]); if (subreads.size() == allReadAlignments.subreads.size()) { sourceSubread = &subreads[subreadIndex]; } if (allReadAlignments.subreadAlignments[subreadIndex].size() > 0) { PrintAlignments(allReadAlignments.subreadAlignments[subreadIndex], allReadAlignments.subreads[subreadIndex], // for 
these alignments params, outFilePtr,//*mapData->outFilePtr, alignmentContext, #ifdef USE_PBBAM *sourceSubread, bamWriterPtr, #endif semaphores); } else { // // Print the unaligned sequences. // if (params.printUnaligned == true) { if (params.nProc == 1) { PrintUnaligned(*sourceSubread, unalignedFilePtr, params.noPrintUnalignedSeqs); } else { #ifdef __APPLE__ sem_wait(semaphores.unaligned); #else sem_wait(&semaphores.unaligned); #endif PrintUnaligned(*sourceSubread,//subreads[subreadIndex], unalignedFilePtr, params.noPrintUnalignedSeqs); #ifdef __APPLE__ sem_post(semaphores.unaligned); #else sem_post(&semaphores.unaligned); #endif } // End of nproc > 1. } // End of printing unaligned sequences. } // End of finding no alignments for the subread with subreadIndex. } // End of printing and processing alignmentContext for each subread. } blasr-smrtanalysis-4.0.0/iblasr/MappingBuffers.hpp000066400000000000000000000053411302464523700222530ustar00rootroot00000000000000// Author: Mark Chaisson #pragma once #include #include #include #include #include #include #include using namespace std; // // Define a list of buffers that are meant to grow to high-water // marks, and not shrink down past that. The memory is reused rather // than having multiple calls to new. 
// class MappingBuffers { public: vector hpInsScoreMat, insScoreMat; vector kbandScoreMat; vector hpInsPathMat, insPathMat; vector kbandPathMat; vector scoreMat; vector pathMat; vector affineScoreMat; vector affinePathMat; vector matchPosList; vector rcMatchPosList; vector > globalChainEndpointBuffer; vector sdpFragmentSet, sdpPrefixFragmentSet, sdpSuffixFragmentSet; TupleList sdpCachedTargetTupleList; TupleList sdpCachedTargetPrefixTupleList; TupleList sdpCachedTargetSuffixTupleList; std::vector sdpCachedMaxFragmentChain; vector probMat; vector optPathProbMat; vector lnSubPValueMat; vector lnInsPValueMat; vector lnDelPValueMat; vector lnMatchPValueMat; vector clusterNumBases; ClusterList clusterList; ClusterList revStrandClusterList; void Reset(void); }; inline void MappingBuffers::Reset(void) { vector().swap(hpInsScoreMat); vector().swap(insScoreMat); vector().swap(kbandScoreMat); vector().swap(hpInsPathMat); vector().swap(insPathMat); vector().swap(kbandPathMat); vector().swap(scoreMat); vector().swap(pathMat); vector().swap(matchPosList); vector().swap(rcMatchPosList); vector >().swap(globalChainEndpointBuffer); vector().swap(sdpFragmentSet); vector().swap(sdpPrefixFragmentSet); vector().swap(sdpSuffixFragmentSet); sdpCachedTargetTupleList.Reset(); sdpCachedTargetPrefixTupleList.Reset(); sdpCachedTargetSuffixTupleList.Reset(); vector().swap(sdpCachedMaxFragmentChain); vector().swap(probMat); vector().swap(optPathProbMat); vector().swap(lnSubPValueMat); vector().swap(lnInsPValueMat); vector().swap(lnDelPValueMat); vector().swap(lnMatchPValueMat); vector().swap(clusterNumBases); } blasr-smrtanalysis-4.0.0/iblasr/MappingIPC.h000066400000000000000000000072231302464523700207330ustar00rootroot00000000000000#pragma once #include #include "MappingParameters.h" #include #include #include #include #include #include #include #include #include #include #include #include /* * This structure contains pointers to all required data structures * for mapping reads to a suffix 
array and evaluating the significance
 * of the matches.
 *
 * The ShallowCopy* members copy fields of the pointed-to structures into a
 * caller-supplied object and then set that object's deleteStructures /
 * deleteOnExit member to false (presumably so the copy does not free the
 * shared data; confirm against the destination types).
 *
 * NOTE(review): bwtPtr and lcpBoundsOutPtr are not assigned by Initialize();
 * callers have to set them separately.
 */
template class MappingData {
public:
    // Shared index and reference structures (not owned by this object as far
    // as this class shows: nothing here deletes them).
    T_SuffixArray *suffixArrayPtr;
    BWT *bwtPtr;
    T_GenomeSequence *referenceSeqPtr;
    SequenceIndexDatabase *seqDBPtr;
    TupleCountTable *ctabPtr;
    // Copy of the mapping parameters (assigned by value in Initialize).
    MappingParameters params;
    MappingMetrics metrics;
    RegionTable *regionTablePtr;
    ReaderAgglomerate *reader;
    // Output streams.
    ostream *outFilePtr;
    ostream *unalignedFilePtr;
    ostream *anchorFilePtr;
    ostream *clusterFilePtr;
    ostream *lcpBoundsOutPtr;

    // Declare a semaphore for blocking on reading from the same hdhf file.

    // Copy the table pointers and sizes of *suffixArrayPtr into dest.
    void ShallowCopySuffixArray(T_SuffixArray &dest) {
        dest.index = suffixArrayPtr->index;
        dest.length = suffixArrayPtr->length;
        dest.target = suffixArrayPtr->target;
        dest.startPosTable = suffixArrayPtr->startPosTable;
        dest.endPosTable = suffixArrayPtr->endPosTable;
        dest.lookupTableLength = suffixArrayPtr->lookupTableLength;
        dest.lookupPrefixLength = suffixArrayPtr->lookupPrefixLength;
        dest.tm = suffixArrayPtr->tm;
        dest.deleteStructures = false;
        // dest.useLCPTable = suffixArrayPtr->useLCPTable;
    }

    // Copy the position and name tables of *seqDBPtr into dest.
    void ShallowCopySequenceIndexDatabase(SequenceIndexDatabase &dest) {
        dest.nSeqPos = seqDBPtr->nSeqPos;
        dest.seqStartPos = seqDBPtr->seqStartPos;
        dest.nameLengths = seqDBPtr->nameLengths;
        dest.names = seqDBPtr->names;
        dest.deleteStructures = false;
    }

    // Copy the count table, its length, the tuple count and tm of *ctabPtr
    // into dest.
    void ShallowCopyTupleCountTable( TupleCountTable &dest) {
        dest.countTable = ctabPtr->countTable;
        dest.countTableLength = ctabPtr->countTableLength;
        dest.nTuples = ctabPtr->nTuples;
        dest.tm = ctabPtr->tm;
        dest.deleteStructures = false;
    }

    // Make refSeq a shallow copy of *referenceSeqPtr.
    void ShallowCopyReferenceSequence(T_GenomeSequence &refSeq) {
        refSeq.ShallowCopy(*referenceSeqPtr);
        refSeq.deleteOnExit = false;
    }

    // Store the given pointers and a copy of paramsP in this object.
    // clusterFilePtrP may be omitted and then defaults to NULL.
    void Initialize(T_SuffixArray *saP, T_GenomeSequence *refP,
            SequenceIndexDatabase *seqDBP,
            TupleCountTable *ctabP,
            MappingParameters ¶msP,
            ReaderAgglomerate *readerP,
            RegionTable *regionTableP,
            ostream *outFileP,
            ostream *unalignedFileP,
            ostream *anchorFilePtrP,
            ostream *clusterFilePtrP=NULL) {
        suffixArrayPtr = saP;
        referenceSeqPtr = refP;
        seqDBPtr = seqDBP;
        ctabPtr = ctabP;
        regionTablePtr = regionTableP;
        params = paramsP;
        reader = readerP;
        outFilePtr = outFileP;
        unalignedFilePtr = unalignedFileP;
        anchorFilePtr = anchorFilePtrP;
        clusterFilePtr= clusterFilePtrP;
    }
};
blasr-smrtanalysis-4.0.0/iblasr/MappingParameters.h000066400000000000000000000622241302464523700224250ustar00rootroot00000000000000#pragma once
#define REQUIRE_PBBAM_ERROR() \
    assert("blasr must be compiled with lib pbbam to perform IO on bam." == 0);
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

class MappingParameters {
public:
    //
    // Parameters for global substitution, insertion, and deletion priors.
    //
    float minFractionToBeConsideredOverlapping;
    float indelRate;
    float minRatio;
    int indel;
    int idsIndel;
    int sdpIndel;
    int sdpIns, sdpDel;
    int insertion;
    int deletion;
    int mismatch;
    int sdpTupleSize;
    int match;
    int showAlign;
    bool useScoreCutoff;
    int maxScore;
    int argi;
    int nProc;
    int globalChainType;
    SAMOutput::Clipping clipping;
    string clippingString;
    QVScale qvScaleType;
    vector readsFileNames; // = queryFileNames, genomeFileName
    vector queryFileNames;
    vector scrapsFileNames; // needed for noSplitSubread flag in PBBAM, derived from queryFileNames
    string genomeFileName;
    // Query file type: FASTA/FASTQ/HDF*/PBBAM,
    // Note that mixed query file types is not allowed.
    FileType queryFileType;
    // Query read type, SUBREAD, CCS or UNKNOWN
    // Note that mixed read types is not allowed.
ReadType::ReadTypeEnum queryReadType; vector regionTableFileNames; vector ccsFofnFileNames; string tupleListName; string posTableName; string outFileName; string suffixArrayFileName; string bwtFileName; string indexFileName; string anchorFileName; string clusterFileName; int nBest; int printWindow; int doCondense; int do4BitComp; int cutoff; int useSuffixArray; int useBwt; int useReverseCompressIndex; int useTupleList; int useSeqDB; string seqDBName; int useCountTable; string countTableName; int minMatchLength; int listTupleSize; int printFormat; int maxExpand, minExpand; int startRead; int stride; int pValueType; float subsample; int sortRefinedAlignments; int verbosity; bool printSAM; bool cigarUseSeqMatch; bool printBAM; bool sam_via_bam; // for SAM output via pbbam using IRecordWriter bool storeMapQV; bool useRandomSeed; int randomSeed; bool placeRandomly; bool printHeader; bool samplePaths; bool warp, nowarp; //bool usePrefixLookupTable; bool doSensitiveSearch; bool emulateNucmer; bool refineBetweenAnchorsOnly; bool byAdapter; bool extendDenovoCCSSubreads; TupleMetrics saTupleMetrics; TupleMetrics sdpTupleMetrics; int lookupTableLength; //int branchQualityThreshold; int qualityLowerCaseThreshold; AnchorParameters anchorParameters; int readsFileIndex; //int numBranches; bool storeMetrics; bool ignoreQualities; bool extendFrontAlignment; bool extendAlignments; int maxExtendDropoff; int minReadLength; int maxReadLength; int minSubreadLength; int minRawSubreadScore; int minAvgQual; bool overlap; bool advanceHalf; int advanceExactMatches; float approximateMaxInsertionRate; float minPctSimilarity; // [0, 100] float minPctAccuracy; // [0, 100] bool refineAlignments; int nCandidates; bool doGlobalAlignment; string tempDirectory; bool useTitleTable; string titleTableName; bool readSeparateRegionTable; bool readSeparateCcsFofn; string regionTableFileName; string ccsFofnFileName; //float averageMismatchScore; bool mapSubreadsSeparately; bool concordant; bool 
refineConcordantAlignments; int flankSize; bool useRegionTable; bool setIgnoreRegions; bool useHQRegionTable; bool setIgnoreHQRegions; bool printUnaligned; bool noPrintUnalignedSeqs; // print unaligned reads names only. string unalignedFileName; string metricsFileName; string lcpBoundsFileName; string fullMetricsFileName; bool printSubreadTitle; bool useCcs; bool useAllSubreadsInCcs; bool useCcsOnly; bool detailedSDPAlignment, nouseDetailedSDPAlignment; int chunkSize; int sdpFilterType; bool useGuidedAlign; int guidedAlignBandSize; int bandSize; int extendBandSize; bool useQVScore; int scoreType; bool printVerboseHelp; bool printDiscussion; float sdpBypassThreshold; bool computeAlignProbability; float qvMatchWeight; float qvMismatchWeight; float qvInsWeight; float qvDelWeight; float readAccuracyPrior; bool printVersion; int substitutionPrior; int globalDeletionPrior; bool outputByThread; int recurseOver; bool allowAdjacentIndels; bool separateGaps; string scoreMatrixString; bool printDotPlots; bool preserveReadTitle; bool forwardOnly; bool printOnlyBest; bool affineAlign; int affineExtend; int affineOpen; bool scaleMapQVByNumSignificantClusters; int limsAlign; string holeNumberRangesStr; Ranges holeNumberRanges; int minAlnLength; bool printSAMQV; vector samQV; SupplementalQVList samQVList; bool fastMaxInterval; bool aggressiveIntervalCut; bool fastSDP; string concordantTemplate; bool concordantAlignBothDirections; FilterCriteria filterCriteria; string hitPolicyStr; HitPolicy hitPolicy; bool enableHiddenPaths; void Init() { qvMatchWeight = 1.0; qvMismatchWeight = 1.0; qvInsWeight = 1.0; qvDelWeight = 1.0; minFractionToBeConsideredOverlapping = 0.75; minRatio = 0.25; indelRate = 0.3; indel = 5; insertion = 4; // asymmetric indel parameters deletion = 5; idsIndel = 15; sdpIndel = 5; sdpIns = 5; sdpDel = 10; sdpTupleSize = 11; match = 0; mismatch = 0; showAlign = 1; useScoreCutoff = false; maxScore = -200; argi = 1; nProc = 1; readsFileNames.clear(); 
queryFileNames.clear(); genomeFileName = ""; queryReadType = ReadType::UNKNOWN; queryFileType = FileType::None; tupleListName = ""; posTableName = ""; suffixArrayFileName= ""; bwtFileName = ""; indexFileName = ""; anchorFileName = ""; outFileName = ""; nBest = 10; nCandidates = 10; printWindow = 0; doCondense = 0; do4BitComp = 0; pValueType = 0; cutoff = 0; useSuffixArray = 0; useBwt = 0; useReverseCompressIndex = 0; useTupleList = 0; useSeqDB = 0; seqDBName = ""; useCountTable = 0; countTableName = ""; lookupTableLength = 8; anchorParameters.minMatchLength = minMatchLength = 12; printFormat = SummaryPrint; maxExpand = 0; minExpand = 0; startRead = 0; stride = 1; subsample = 1.1; listTupleSize = 6; sortRefinedAlignments = 1; anchorParameters.verbosity = verbosity = 0; saTupleMetrics.Initialize(listTupleSize); sdpTupleMetrics.Initialize(sdpTupleSize); qualityLowerCaseThreshold = 0; anchorParameters.branchQualityThreshold = 0; readsFileIndex = 0; printSAM = false; printBAM = false; sam_via_bam = false; useRandomSeed = false; randomSeed = 0; placeRandomly = false; samplePaths = false; nowarp = false; storeMapQV = true; warp = true; extendDenovoCCSSubreads = false; storeMetrics = false; ignoreQualities = true; extendFrontAlignment = false; extendAlignments = false; maxExtendDropoff = 10; minReadLength = 50; maxReadLength = 0; // means no max read length minSubreadLength = 0; minRawSubreadScore = -1; // raw subread score in region table should be in range [0, 1000]. 
minAvgQual = 0; overlap = false; advanceHalf = false; refineAlignments = true; anchorParameters.advanceExactMatches = advanceExactMatches = 0; approximateMaxInsertionRate = 1.30; minPctSimilarity = 0; minPctAccuracy = 0; doGlobalAlignment = false; tempDirectory = ""; useTitleTable = false; titleTableName = ""; readSeparateRegionTable = false; readSeparateCcsFofn = false; regionTableFileName = ""; ccsFofnFileName = ""; mapSubreadsSeparately=true; concordant=false; refineConcordantAlignments=false; flankSize=40; useRegionTable = true; setIgnoreRegions = false; useHQRegionTable=true; setIgnoreHQRegions = false; printUnaligned = false; unalignedFileName = ""; noPrintUnalignedSeqs = false; globalChainType = 0; metricsFileName = ""; fullMetricsFileName = ""; doSensitiveSearch = false; emulateNucmer = false; refineBetweenAnchorsOnly = false; printSubreadTitle = true; detailedSDPAlignment = true; nouseDetailedSDPAlignment = false; useCcs = false; useCcsOnly = false; useAllSubreadsInCcs = false; chunkSize = 10000000; sdpFilterType = 0; anchorParameters.stopMappingOnceUnique = true; useGuidedAlign = true; bandSize = 0; extendBandSize = 10; guidedAlignBandSize = 10; useQVScore = false; printVerboseHelp = false; printDiscussion = false; sdpBypassThreshold = 1000000.0; scoreType = 0; byAdapter = false; qvScaleType = PHRED; printHeader = false; computeAlignProbability = false; readAccuracyPrior = 0.85; printVersion = false; clipping = SAMOutput::none; clippingString = ""; substitutionPrior = 20; globalDeletionPrior = 13; outputByThread = false; recurseOver = 10000; allowAdjacentIndels = false; separateGaps = false; scoreMatrixString = ""; printDotPlots = false; preserveReadTitle = false; forwardOnly = false; printOnlyBest = false; affineAlign = false; affineExtend = 0; affineOpen = 10; scaleMapQVByNumSignificantClusters = false; limsAlign = 0; holeNumberRangesStr = ""; minAlnLength = 0; printSAMQV = false; cigarUseSeqMatch = false; samQV.clear(); samQVList.clear(); 
fastMaxInterval = false; aggressiveIntervalCut = false; fastSDP = false; concordantTemplate = "mediansubread"; // typicalsubread or longestsubread concordantAlignBothDirections = false; hitPolicyStr = "all"; ResetFilterAndHit(); enableHiddenPaths = false; //turn off hidden paths. } MappingParameters() : filterCriteria(0, 0, 0, false, Score(0, ScoreSign::NEGATIVE)) , hitPolicy("all", ScoreSign::NEGATIVE) { Init(); } void MakeSane() { // Expand FOFN FileOfFileNames::ExpandFileNameList(readsFileNames); // Must have at least a query and a genome if (readsFileNames.size() <= 1) { cout << "Error, you must provide at least one reads file and a genome file." < string.scraps.bam substitute subreads to scraps // 2. string.bam -> string.scraps.bam insert .scraps before .bam // TODO loop over query check for each // not needed for xml since scraps specified explicetely // if (not mapSubreadsSeparately && (queryFileType == FileType::PBBAM) ) { const string dsubdb = ".subreads.bam"; const string dbam = ".bam"; // loop over all subread files and fill the vector or scraps files for (size_t i = 0; i < queryFileNames.size(); i++) { scrapsFileNames.push_back(queryFileNames[i]); size_t dsubdb_pos = scrapsFileNames[i].find(dsubdb); // find .subreads.bam if (dsubdb_pos != std::string::npos) { // TODO check that .subreads.bam is LAST occurence // replace subreads.bam with scraps.bam scrapsFileNames[i].replace(dsubdb_pos,dsubdb.length(),".scraps.bam"); } else { // insert scraps before .bam" // actually we can just replace last 4 characters // fix later size_t dbam_pos = scrapsFileNames[0].find(dbam); // find .bam scrapsFileNames[i].replace(dbam_pos,dbam.length(),".scraps.bam"); } } } // -useQuality can not be used in combination with a fasta input if (!ignoreQualities) { if (queryFileType == FileType::Fasta) { cout<<"ERROR, you can not use -useQuality option when any of the input reads files are in multi-fasta format."< 1) { cerr << "Warning: using new filter method for SDP alignments. 
The parameter is " << endl << "either 0 or 1, but " << sdpFilterType << " was specified." << endl; sdpFilterType = 1; } if (sdpFilterType == 0) { detailedSDPAlignment = true; nouseDetailedSDPAlignment = false; } if (detailedSDPAlignment == false) { sdpFilterType = 1; } if (useGuidedAlign == true and bandSize == 0) { bandSize = 16; } anchorParameters.minMatchLength = minMatchLength; if (suffixArrayFileName != "") { useSuffixArray = true; } if (bwtFileName != "") { useBwt = true; } if (useBwt and useSuffixArray) { cout << "ERROR, sa and bwt must be used independently." << endl; exit(1); } if (countTableName != "") { useCountTable = true; } if (metricsFileName != "" or fullMetricsFileName != "") { storeMetrics = true; } if (useCcsOnly) { useCcs = true; } if (useAllSubreadsInCcs == true) { useCcs = true; } if (titleTableName != "") { useTitleTable = true; } if (unalignedFileName != "") { printUnaligned = true; } if (regionTableFileName != "") { useRegionTable = true; readSeparateRegionTable = true; } bool isHDFFile = (queryFileType == FileType::HDFPulse or queryFileType == FileType::HDFBase or queryFileType == FileType::HDFCCSONLY); if ((setIgnoreRegions or setIgnoreHQRegions) and not isHDFFile) { cout << "ERROR: query must be HDF files in order to set ignoreRegions or ignoreHQRegions." << std::endl; exit(1); } if (ccsFofnFileName != "") { readSeparateCcsFofn = true; } if (nouseDetailedSDPAlignment == true) { detailedSDPAlignment = false; } if (nouseDetailedSDPAlignment == false) { detailedSDPAlignment = true; } if (anchorParameters.maxLCPLength != 0 and int(anchorParameters.maxLCPLength) < int(anchorParameters.minMatchLength)) { cerr << "ERROR: maxLCPLength is less than minLCPLength, which will result in no hits." << endl; } if (subsample < 1 and stride > 1) { cout << "ERROR, subsample and stride must be used independently." << endl; exit(1); } if (emulateNucmer) { SetEmulateNucmer(); } if (randomSeed != 0) { useRandomSeed = true; } // // Parse the clipping. 
// if (clippingString == "soft") { clipping = SAMOutput::soft; } else if (clippingString == "hard") { clipping = SAMOutput::hard; } else if (clippingString == "none") { clipping = SAMOutput::none; } else if (clippingString == "subread") { clipping = SAMOutput::subread; } else if (clippingString != "") { cout << "ERROR, clipping should either be soft, hard, or none." << endl; exit(1); } if (printSAM) { // since sam is printed via bam we need to use ifndef USE_PBBAM here #ifndef USE_PBBAM REQUIRE_PBBAM_ERROR(); #else printSAM = false; printBAM = true; sam_via_bam = true; // set to true for constructors and to avoid entering if (printBAM cigarUseSeqMatch = true; // ALWAYS true for BAM printFormat = BAM; // Not sure for sam_via_bam samQVList.SetDefaultQV(); printSAMQV = true; if (clipping != SAMOutput::soft) { // Only support two clipping methods: soft or subread. clipping = SAMOutput::subread; } // Turn on fa fa -> bam pipe /* if (queryFileType != FileType::PBBAM and queryFileType != FileType::PBDATASET and not enableHiddenPaths) { // bax|fasta|fastq -> bam paths are turned off by default cout << "ERROR, could not output alignments in BAM unless input reads are in PacBio BAM or DATASET files." << endl; exit(1); } */ if (outFileName == "") { cout << "ERROR, SAM output file must be specified." << endl; exit(1); } // VR Need to see what happens if printing SAM // VR Check with Derek regarding sam_via_bam if (outputByThread) { cout << "ERROR, could not output alignments by threads in BAM format." << endl; exit(1); } #endif } if (printBAM && !sam_via_bam) { // Need to check settings for SAM, #ifndef USE_PBBAM REQUIRE_PBBAM_ERROR(); #else cigarUseSeqMatch = true; // ALWAYS true for BAM printFormat = BAM; printSAM = false; samQVList.SetDefaultQV(); printSAMQV = true; if (clipping != SAMOutput::soft) { // Only support two clipping methods: soft or subread. 
clipping = SAMOutput::subread; } // Turn on fa fa -> bam pipe /* if (queryFileType != FileType::PBBAM and queryFileType != FileType::PBDATASET and not enableHiddenPaths) { // bax|fasta|fastq -> bam paths are turned off by default cout << "ERROR, could not output alignments in BAM unless input reads are in PacBio BAM or DATASET files." << endl; exit(1); } */ if (outFileName == "") { cout << "ERROR, BAM output file must be specified." << endl; exit(1); } // VR Need to see what happens if printing SAM // VR Check with Derek regarding sam_via_bam if (outputByThread) { cout << "ERROR, could not output alignments by threads in BAM format." << endl; exit(1); } #endif } if (limsAlign != 0) { mapSubreadsSeparately = false; forwardOnly = true; } if (holeNumberRangesStr.size() > 0) { if (not holeNumberRanges.setRanges(holeNumberRangesStr)) { cout << "ERROR, could not parse hole number ranges: " << holeNumberRangesStr << "." << endl; exit(1); } } if (printSAMQV) { if (samQV.size() == 0) { samQVList.SetDefaultQV(); } else { samQVList.UseQV(samQV); } } if (minRawSubreadScore > 1000) { cout << "ERROR, minimum raw subread score should be less than 1000." << endl; exit(1); } if (minRawSubreadScore != -1 and byAdapter) { cout << "ERROR, minRawSubreadScore and byAdapter should not be used together." 
<< endl; exit(1); } // Determine query read type queryReadType = DetermineQueryReadType(); // Pass verbosity anchorParameters.verbosity = verbosity; // Set filter criteria and hit policy ResetFilterAndHit(); } void ResetFilterAndHit(void) { filterCriteria = FilterCriteria(minAlnLength, minPctSimilarity, minPctAccuracy, true, Score(static_cast(maxScore), ScoreSign::NEGATIVE)); hitPolicy = HitPolicy(hitPolicyStr, ScoreSign::NEGATIVE); } ReadType::ReadTypeEnum DetermineQueryReadType() { if (useCcsOnly or queryFileType == FileType::HDFCCSONLY) { return ReadType::CCS; } if (queryFileType == FileType::PBBAM) { if (not mapSubreadsSeparately) { // specifal case: blasr subread.bam ref.fa --noSplitSubreads // input type seems like subread while infact is polymerase return ReadType::POLYMERASE; } // Read type in BAM may be CCS, SUBREAD, HQREGION or POLYMERASE. // Determine it later. return ReadType::UNKNOWN; } if (mapSubreadsSeparately) { return ReadType::SUBREAD; } else { if (useHQRegionTable and (queryFileType == FileType::HDFCCSONLY or queryFileType == FileType::HDFBase or queryFileType == FileType::HDFPulse)) { // Only HDF files can contain region table. 
return ReadType::HQREGION; } else { return ReadType::POLYMERASE; } } } void SetEmulateNucmer() { anchorParameters.stopMappingOnceUnique = true; anchorParameters.advanceExactMatches = 30; anchorParameters.maxAnchorsPerPosition = 1; sdpBypassThreshold = 0.75; sdpTupleSize = 15; anchorParameters.minMatchLength = 30; useGuidedAlign = true; refineAlignments = false; } void SetForSensitivity() { advanceExactMatches = 0; anchorParameters.numBranches = 1; anchorParameters.maxAnchorsPerPosition = 10000; } }; blasr-smrtanalysis-4.0.0/iblasr/MappingSemaphores.h000066400000000000000000000016421302464523700224250ustar00rootroot00000000000000#pragma once #include #include #include #ifndef __APPLE__ class MappingSemaphores { public: sem_t reader; sem_t writer; sem_t unaligned; sem_t hitCluster; void InitializeAll() { sem_init(&reader, 0, 1); sem_init(&writer, 0, 1); sem_init(&unaligned, 0, 1); sem_init(&hitCluster, 0, 1); } }; #else class MappingSemaphores { public: sem_t *reader; sem_t *writer; sem_t *unaligned; sem_t *hitCluster; void InitializeAll() { reader = sem_open("/reader", O_CREAT, 0644, 1); writer = sem_open("/writer", O_CREAT, 0644, 1); unaligned = sem_open("/unaligned", O_CREAT, 0644, 1); hitCluster = sem_open("/hitCluster", O_CREAT, 0644, 1); } }; #endif blasr-smrtanalysis-4.0.0/iblasr/ReadAlignments.hpp000066400000000000000000000116121302464523700222360ustar00rootroot00000000000000// Author: Mark Chaisson #pragma once #include #include #include #include #include using namespace std; class ReadAlignments { public: /* This class stores the alignments from a read. A read may be aligned in several different modes: 1. Fullread - Treat the read as a unit from start to end 2. Subread - Align each subread independently 3. CCSDeNovo - Only align the CCS sequence from a read 4. CCSAllPass - Align the de novo ccs sequences and then the subreads to where the denovo ccs aligned. 5. CCSFullPass - Same as allpass, except using only complete subreads. 6. 
ZmwSubreads - Align subreads of each zmw to where the longest subread of the zmw aligned to. The alignments are a raggad array of n sequences; n is 1 for cases 1 and 3, the number of subreads for cases 2 and 4, and the number of full length passes for case 5. A ReadAligments class must only have alignments for a single type of read in it. */ vector > subreadAlignments; vector subreads; AlignMode alignMode; SMRTSequence read; inline int GetNAlignedSeq(); inline bool AllSubreadsHaveAlignments(); inline void Clear(); inline void Resize(int nSeq); inline void CheckSeqIndex(int seqIndex); inline void SetSequence(int seqIndex, SMRTSequence &seq); inline void AddAlignmentForSeq(int seqIndex, T_AlignmentCandidate *alignmentPtr); inline void AddAlignmentsForSeq(int seqIndex, vector &seqAlignmentPtrs); // Copy all T_AlignmentCandidate objects (to which subreadAlignment[seqIndex] // is pointing) to newly created objects, and then return pointers to the new // objects. inline vector CopySubreadAlignments(int seqIndex); inline void Print(ostream &out=cout); inline ~ReadAlignments(); }; inline int ReadAlignments::GetNAlignedSeq() { return subreadAlignments.size(); } inline bool ReadAlignments::AllSubreadsHaveAlignments() { int i, nAlignedSeq; nAlignedSeq = subreadAlignments.size(); for (i = 0; i < nAlignedSeq; i++) { if (subreadAlignments[i].size() == 0) { return false; } } return true; } inline void ReadAlignments::Clear() { int i; int nAlignedSeq; for (i = 0, nAlignedSeq = subreadAlignments.size(); i < nAlignedSeq; i++) { int nAlignments; int a; for (a = 0, nAlignments = subreadAlignments[i].size(); a < nAlignments; a++) { delete subreadAlignments[i][a]; } subreadAlignments[i].clear(); } for (i = 0, nAlignedSeq = subreads.size(); i< nAlignedSeq; i++) { subreads[i].Free(); } subreadAlignments.clear(); read.Free(); } inline void ReadAlignments::Resize(int nSeq) { subreadAlignments.resize(nSeq); subreads.resize(nSeq); } inline void ReadAlignments::CheckSeqIndex(int seqIndex) { if 
( seqIndex < 0 or seqIndex >= int(subreads.size()) ) { cout << "ERROR, adding a sequence to an unallocated position." << endl; assert(0); } } inline void ReadAlignments::SetSequence(int seqIndex, SMRTSequence &seq) { CheckSeqIndex(seqIndex); subreads[seqIndex] = seq; } inline void ReadAlignments::AddAlignmentForSeq(int seqIndex, T_AlignmentCandidate *alignmentPtr) { CheckSeqIndex(seqIndex); subreadAlignments[seqIndex].push_back(alignmentPtr); } inline void ReadAlignments::AddAlignmentsForSeq(int seqIndex, vector &seqAlignmentPtrs) { CheckSeqIndex(seqIndex); subreadAlignments[seqIndex].insert(subreadAlignments[seqIndex].end(), seqAlignmentPtrs.begin(), seqAlignmentPtrs.end()); } inline vector ReadAlignments::CopySubreadAlignments(int seqIndex) { vector ret; for (int i=0; iPrint(out); } } out << " read: "; read.Print(out); out << endl << endl; } inline ReadAlignments::~ReadAlignments() { read.Free(); } blasr-smrtanalysis-4.0.0/iblasr/RegisterBlasrOptions.h000066400000000000000000001223031302464523700231250ustar00rootroot00000000000000#pragma once /* * ============================================================================ * * Filename: RegisterOptions.hpp * * Description: * * Version: 1.0 * Created: 04/29/2015 04:48:26 PM * Revision: none * Compiler: gcc * * Author: Yuan Li (yli), yli@pacificbiosciences.com * Company: Pacific Biosciences * * ============================================================================ */ #include #include #include #include "MappingParameters.h" #include "RegisterFilterOptions.h" using namespace std; void RegisterBlasrOptions(CommandLineParser & clp, MappingParameters & params) { int trashbinInt; float trashbinFloat; bool trashbinBool; clp.RegisterStringOption("-sa", ¶ms.suffixArrayFileName, ""); clp.RegisterStringOption("-ctab", ¶ms.countTableName, "" ); clp.RegisterStringOption("-regionTable", ¶ms.regionTableFileName, ""); clp.RegisterStringOption("-ccsFofn", ¶ms.ccsFofnFileName, ""); clp.RegisterIntOption("-bestn", (int*) 
¶ms.nBest, "", CommandLineParser::PositiveInteger); clp.RegisterIntOption("-limsAlign", ¶ms.limsAlign, "", CommandLineParser::PositiveInteger); clp.RegisterFlagOption("-printOnlyBest", ¶ms.printOnlyBest, ""); clp.RegisterFlagOption("-outputByThread", ¶ms.outputByThread, ""); clp.RegisterFlagOption("-rbao", ¶ms.refineBetweenAnchorsOnly, ""); clp.RegisterFlagOption("-onegap", ¶ms.separateGaps, ""); clp.RegisterFlagOption("-allowAdjacentIndels", ¶ms.allowAdjacentIndels, "", false); clp.RegisterFlagOption("-placeRepeatsRandomly", ¶ms.placeRandomly, ""); clp.RegisterIntOption("-randomSeed", ¶ms.randomSeed, "", CommandLineParser::Integer); clp.RegisterFlagOption("-extend", ¶ms.extendAlignments, ""); clp.RegisterIntOption("-branchExpand", ¶ms.anchorParameters.branchExpand, "", CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("-maxExtendDropoff", ¶ms.maxExtendDropoff, "", CommandLineParser::NonNegativeInteger); clp.RegisterFlagOption("-nucmer", ¶ms.emulateNucmer, ""); clp.RegisterIntOption("-maxExpand", ¶ms.maxExpand, "", CommandLineParser::PositiveInteger); clp.RegisterIntOption("-minExpand", ¶ms.minExpand, "", CommandLineParser::NonNegativeInteger); clp.RegisterStringOption("-seqdb", ¶ms.seqDBName, ""); clp.RegisterStringOption("-anchors", ¶ms.anchorFileName, ""); clp.RegisterStringOption("-clusters", ¶ms.clusterFileName, ""); clp.RegisterFlagOption("-samplePaths", (bool*) ¶ms.samplePaths, ""); clp.RegisterFlagOption("-noStoreMapQV", ¶ms.storeMapQV, ""); clp.RegisterFlagOption("-nowarp", (bool*) ¶ms.nowarp, ""); clp.RegisterFlagOption("-guidedAlign", (bool*)¶ms.useGuidedAlign, ""); clp.RegisterFlagOption("-useGuidedAlign", (bool*)&trashbinBool, ""); clp.RegisterFlagOption("-noUseGuidedAlign", (bool*)¶ms.useGuidedAlign, ""); clp.RegisterFlagOption("-header", (bool*)¶ms.printHeader, ""); clp.RegisterIntOption("-bandSize", ¶ms.bandSize, "", CommandLineParser::PositiveInteger); clp.RegisterIntOption("-extendBandSize", ¶ms.extendBandSize, "", 
CommandLineParser::PositiveInteger); clp.RegisterIntOption("-guidedAlignBandSize", ¶ms.guidedAlignBandSize, "", CommandLineParser::PositiveInteger); clp.RegisterIntOption("-maxAnchorsPerPosition", (int*) ¶ms.anchorParameters.maxAnchorsPerPosition, "", CommandLineParser::PositiveInteger); clp.RegisterIntOption("-stopMappingOnceUnique", (int*) ¶ms.anchorParameters.stopMappingOnceUnique, "", CommandLineParser::NonNegativeInteger); clp.RegisterStringOption("-out", ¶ms.outFileName, ""); clp.RegisterIntOption("-match", ¶ms.match, "", CommandLineParser::Integer); clp.RegisterIntOption("-mismatch", ¶ms.mismatch, "", CommandLineParser::Integer); clp.RegisterIntOption("-minMatch", ¶ms.minMatchLength, "", CommandLineParser::PositiveInteger); clp.RegisterIntOption("-maxMatch", ¶ms.anchorParameters.maxLCPLength, "", CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("-maxLCPLength", ¶ms.anchorParameters.maxLCPLength, "", CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("-indel", ¶ms.indel, "", CommandLineParser::Integer); clp.RegisterIntOption("-insertion", ¶ms.insertion, "", CommandLineParser::Integer); clp.RegisterIntOption("-deletion", ¶ms.deletion, "", CommandLineParser::Integer); clp.RegisterIntOption("-idsIndel", ¶ms.idsIndel, "", CommandLineParser::Integer); clp.RegisterIntOption("-sdpindel", ¶ms.sdpIndel, "", CommandLineParser::Integer); clp.RegisterIntOption("-sdpIns", ¶ms.sdpIns, "", CommandLineParser::Integer); clp.RegisterIntOption("-sdpDel", ¶ms.sdpDel, "", CommandLineParser::Integer); clp.RegisterFloatOption("-indelRate", ¶ms.indelRate, "", CommandLineParser::NonNegativeFloat); clp.RegisterFloatOption("-minRatio", ¶ms.minRatio, "", CommandLineParser::NonNegativeFloat); clp.RegisterFloatOption("-sdpbypass", ¶ms.sdpBypassThreshold, "", CommandLineParser::NonNegativeFloat); clp.RegisterFloatOption("-minFrac", &trashbinFloat, "", CommandLineParser::NonNegativeFloat); clp.RegisterIntOption("-maxScore", ¶ms.maxScore, "", 
CommandLineParser::Integer); clp.RegisterStringOption("-bwt", ¶ms.bwtFileName, ""); clp.RegisterIntOption("m", ¶ms.printFormat, "", CommandLineParser::NonNegativeInteger); #ifdef USE_PBBAM clp.RegisterFlagOption("-sam", ¶ms.printSAM, ""); clp.RegisterFlagOption("-bam", ¶ms.printBAM, ""); #endif clp.RegisterStringOption("-clipping", ¶ms.clippingString, ""); clp.RegisterIntOption("-sdpTupleSize", ¶ms.sdpTupleSize, "", CommandLineParser::PositiveInteger); clp.RegisterIntOption("-pvaltype", ¶ms.pValueType, "", CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("-start", ¶ms.startRead, "", CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("-stride", ¶ms.stride, "", CommandLineParser::NonNegativeInteger); clp.RegisterFloatOption("-subsample", ¶ms.subsample, "", CommandLineParser::PositiveFloat); clp.RegisterIntOption("-nproc", ¶ms.nProc, "", CommandLineParser::PositiveInteger); clp.RegisterFlagOption("-sortRefinedAlignments",(bool*) ¶ms.sortRefinedAlignments, ""); clp.RegisterIntOption("-quallc", ¶ms.qualityLowerCaseThreshold, "", CommandLineParser::Integer); clp.RegisterFlagOption("v", (bool*) ¶ms.verbosity, ""); clp.RegisterIntOption("V", ¶ms.verbosity, "Specify a level of verbosity.", CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("-contextAlignLength", ¶ms.anchorParameters.contextAlignLength, "", CommandLineParser::PositiveInteger); clp.RegisterFlagOption("-skipLookupTable", ¶ms.anchorParameters.useLookupTable, ""); clp.RegisterStringOption("-metrics", ¶ms.metricsFileName, ""); clp.RegisterStringOption("-lcpBounds", ¶ms.lcpBoundsFileName, ""); clp.RegisterStringOption("-fullMetrics", ¶ms.fullMetricsFileName, ""); clp.RegisterIntOption("-nbranch", ¶ms.anchorParameters.numBranches, "", CommandLineParser::NonNegativeInteger); clp.RegisterFlagOption("-divideByAdapter", ¶ms.byAdapter, ""); clp.RegisterFlagOption("-useQuality", ¶ms.ignoreQualities, ""); clp.RegisterFlagOption("-noFrontAlign", ¶ms.extendFrontAlignment, ""); 
clp.RegisterIntOption("-minReadLength", ¶ms.minReadLength, "", CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("-maxReadLength", ¶ms.maxReadLength, "", CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("-minSubreadLength", ¶ms.minSubreadLength, "", CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("-minRawSubreadScore", ¶ms.minRawSubreadScore, "", CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("-minAvgQual", ¶ms.minAvgQual, "", CommandLineParser::Integer); clp.RegisterFlagOption("-advanceHalf", ¶ms.advanceHalf, ""); clp.RegisterIntOption("-advanceExactMatches", ¶ms.anchorParameters.advanceExactMatches, "", CommandLineParser::NonNegativeInteger); clp.RegisterFlagOption("-useccs", ¶ms.useCcs, ""); clp.RegisterFlagOption("-useccsdenovo", ¶ms.useCcsOnly, ""); clp.RegisterFlagOption("-useccsall", ¶ms.useAllSubreadsInCcs, ""); clp.RegisterFlagOption("-extendDenovoCCSSubreads", ¶ms.extendDenovoCCSSubreads, ""); clp.RegisterFlagOption("-noRefineAlignments", ¶ms.refineAlignments, ""); clp.RegisterFlagOption("-refineConcordantAlignments", ¶ms.refineConcordantAlignments, ""); clp.RegisterIntOption("-nCandidates", ¶ms.nCandidates, "", CommandLineParser::NonNegativeInteger); clp.RegisterFlagOption("-useTemp", (bool*) ¶ms.tempDirectory, ""); clp.RegisterFlagOption("-noSplitSubreads", ¶ms.mapSubreadsSeparately, ""); clp.RegisterFlagOption("-concordant", ¶ms.concordant, ""); // When -concordant is turned on, blasr first selects a subread (e.g., the median length full-pass subread) // of a zmw as template, maps the template subread to a reference, then infers directions of all other subreads // of the same zmw based on direction of the template, and finally maps all other subreads to the same // genomic coordinates as the template. When -concordantAlignBothDirections is turned on, blasr will align // all other subreads both forwardly and backwardly, without infering their directions. 
This is a hidden // diagnostic option only useful for analyzing movies which have lots of un-identified or missed adapters such // that directions of subreads can not be inferred accurately. clp.RegisterFlagOption("-concordantAlignBothDirections", ¶ms.concordantAlignBothDirections, ""); clp.RegisterIntOption("-flankSize", ¶ms.flankSize, "", CommandLineParser::NonNegativeInteger); clp.RegisterStringOption("-titleTable", ¶ms.titleTableName, ""); clp.RegisterFlagOption("-useSensitiveSearch", ¶ms.doSensitiveSearch, ""); // ignoreRegions or ignoreHQRegions implies region table must exist (i.e., query is HDF). clp.RegisterFlagOption("-ignoreRegions", ¶ms.setIgnoreRegions, ""); clp.RegisterFlagOption("-ignoreHQRegions", ¶ms.setIgnoreHQRegions, ""); clp.RegisterFlagOption("-computeAlignProbability", ¶ms.computeAlignProbability, ""); clp.RegisterStringOption("-unaligned", ¶ms.unalignedFileName, ""); // Print unaligned reads names only clp.RegisterFlagOption("-noPrintUnalignedSeqs", ¶ms.noPrintUnalignedSeqs, ""); clp.RegisterFlagOption("-global", ¶ms.doGlobalAlignment, ""); clp.RegisterIntOption("-globalChainType", ¶ms.globalChainType, "", CommandLineParser::NonNegativeInteger); clp.RegisterFlagOption("-noPrintSubreadTitle", (bool*) ¶ms.printSubreadTitle, ""); clp.RegisterIntOption("-saLookupTableLength", ¶ms.lookupTableLength, "", CommandLineParser::PositiveInteger); clp.RegisterFlagOption("-useDetailedSDP", ¶ms.detailedSDPAlignment, ""); clp.RegisterFlagOption("-nouseDetailedSDP", &trashbinBool, ""); clp.RegisterIntOption("-sdpFilterType", ¶ms.sdpFilterType, "", CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("-scoreType", ¶ms.scoreType, "", CommandLineParser::NonNegativeInteger); clp.RegisterFlagOption("h", ¶ms.printVerboseHelp, ""); clp.RegisterFlagOption("-help", ¶ms.printDiscussion, ""); clp.RegisterFloatOption("-accuracyPrior", ¶ms.readAccuracyPrior, "", CommandLineParser::NonNegativeFloat); // holeNumberRangesStr is a string of comma-delimited hole 
number ranges, such as '1,2,3,10-15'. // Blasr only analyzes reads whose hole numbers are in the specified hole number ranges. clp.RegisterStringOption("-holeNumbers", ¶ms.holeNumberRangesStr, ""); clp.RegisterIntOption("-substitutionPrior", ¶ms.substitutionPrior, "", CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("-deletionPrior", ¶ms.globalDeletionPrior, "", CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("-recurseOver", ¶ms.recurseOver, "", CommandLineParser::NonNegativeInteger); clp.RegisterStringOption("-scoreMatrix", ¶ms.scoreMatrixString, ""); clp.RegisterFlagOption("-printDotPlots", ¶ms.printDotPlots, ""); clp.RegisterFlagOption("-preserveReadTitle", ¶ms.preserveReadTitle,""); clp.RegisterFlagOption("-forwardOnly", ¶ms.forwardOnly,""); clp.RegisterFlagOption("-affineAlign", ¶ms.affineAlign, ""); clp.RegisterIntOption("-affineOpen", ¶ms.affineOpen, "", CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("-affineExtend", ¶ms.affineExtend, "", CommandLineParser::NonNegativeInteger); clp.RegisterFlagOption("-scaleMapQVByNClusters", ¶ms.scaleMapQVByNumSignificantClusters, "", false); clp.RegisterFlagOption("-printSAMQV", ¶ms.printSAMQV, "", false); clp.RegisterFlagOption("-cigarUseSeqMatch", ¶ms.cigarUseSeqMatch, ""); clp.RegisterStringListOption("-samQV", ¶ms.samQV, ""); clp.RegisterFlagOption("-fastMaxInterval", ¶ms.fastMaxInterval, "", false); clp.RegisterFlagOption("-aggressiveIntervalCut", ¶ms.aggressiveIntervalCut, "", false); clp.RegisterFlagOption("-fastSDP", ¶ms.fastSDP, "", false); clp.RegisterStringOption("-concordantTemplate", ¶ms.concordantTemplate, "typicalsubread"); RegisterFilterOptions(clp, params.minAlnLength, params.minPctSimilarity, params.minPctAccuracy, params.hitPolicyStr, trashbinBool=true, trashbinInt, params.maxScore); } const string BlasrHelp(MappingParameters & params) { stringstream helpStream; helpStream << " Options for blasr " << endl << " Basic usage: 'blasr reads.{bam|fasta|bax.h5|fofn} 
genome.fasta [-options] " << endl << " option\tDescription (default_value)." << endl << endl << " Input Files." << endl << " reads.bam is a PacBio BAM file of reads." << endl << " This is the preferred input to blasr because rich quality" << endl << " value (insertion,deletion, and substitution quality values) information is " << endl << " maintained. The extra quality information improves variant detection and mapping"< 3." << endl << " --maxMatch l (inf)" << endl << " Stop mapping a read to the genome when the lcp length reaches l. " << endl << " This is useful when the query is part of the reference, for example when " < #include #include #include /// Register options for filtering alignments. void RegisterFilterOptions(CommandLineParser & clp, int & minAlnLength, float & minPctSimilarity, float & minPctAccuracy, std::string & hitPolicyStr, bool & useScoreCutoff, int & scoreSignInt, int & scoreCutoff) { ScoreSign ss = static_cast(scoreSignInt); Score sc(static_cast(scoreCutoff), ss); FilterCriteria fc(static_cast(minAlnLength), minPctSimilarity, minPctAccuracy, useScoreCutoff, sc); HitPolicy hp("randombest", ScoreSign::NEGATIVE); clp.RegisterIntOption("-minAlnLength", &minAlnLength, fc.MinAlnLengthHelp(), CommandLineParser::PositiveInteger); clp.RegisterIntOption("-minAlignLength", &minAlnLength, "Alias of --minAlnLength", CommandLineParser::PositiveInteger); clp.RegisterIntOption("-minLength", &minAlnLength, "Alias of --minAlnLength", CommandLineParser::PositiveInteger); clp.RegisterFloatOption("-minPctSimilarity", &minPctSimilarity, fc.MinPctSimilarityHelp(), CommandLineParser::PositiveFloat); clp.RegisterFloatOption("-minPctIdentity", &minPctSimilarity, "Alias of --minPctSimilarity", CommandLineParser::PositiveFloat); clp.RegisterFloatOption("-minPctAccuracy", &minPctAccuracy, fc.MinPctAccuracyHelp(), CommandLineParser::PositiveFloat); clp.RegisterFloatOption("-minAccuracy", &minPctAccuracy, "Alias of --minPctAccuracy", CommandLineParser::PositiveFloat); 
clp.RegisterStringOption("-hitPolicy", &hitPolicyStr, hp.Help()); clp.RegisterIntOption("-scoreSign", &scoreSignInt, fc.ScoreSignHelp(), CommandLineParser::Integer); clp.RegisterIntOption("-scoreCutoff", &scoreCutoff, fc.ScoreCutoffHelp(), CommandLineParser::Integer); } blasr-smrtanalysis-4.0.0/libcpp/000077500000000000000000000000001302464523700166245ustar00rootroot00000000000000blasr-smrtanalysis-4.0.0/makefile000066400000000000000000000051631302464523700170600ustar00rootroot00000000000000all: SRCDIR:=$(dir $(realpath $(firstword $(MAKEFILE_LIST)))) -include ${CURDIR}/defines.mk -include ${SRCDIR}/rules.mk foo: echo $(realpath $(firstword $(MAKEFILE_LIST))) echo $(firstword $(MAKEFILE_LIST)) echo $(MAKEFILE_LIST) echo ${SRCDIR} GET_SHA1 := $(shell git -C ${SRCDIR} describe --always --dirty='*') CXXFLAGS += -O3 -g -DSHA1_7=\"${GET_SHA1}\" CXXOPTS += \ -std=c++0x -pedantic \ -Wall -Wextra -Wno-div-by-zero -Wno-overloaded-virtual \ -MMD -MP GCXXFLAGS := -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free -fno-omit-frame-pointer override CXXFLAGS += ${CXXOPTS} ${GCXXFLAGS} #INC_DIRS:=${LIBBLASR_INC} ${LIBPBIHDF_INC} ${LIBPBDATA_INC} ${PBBAM_INC} ${HTSLIB_INC} ${HDF5_INC} ${ZLIB_INC} #LIB_DIRS:=${LIBBLASR_LIB} ${LIBPBIHDF_LIB} ${LIBPBDATA_LIB} ${PBBAM_LIB} ${HTSLIB_LIB} ${HDF5_LIB} ${ZLIB_LIB} #LDLIBS := \ # ${LIBBLASR_LIBFLAGS} ${LIBPBIHDF_LIBFLAGS} ${LIBPBDATA_LIBFLAGS} \ # ${PBBAM_LIBFLAGS} ${HTSLIB_LIBFLAGS} ${HDF5_LIBFLAGS} ${ZLIB_LIBFLAGS} \ # -ldl -lpthread # HDF5 needs -ldl, but mobs does not pass it in. SRCS := Blasr.cpp OBJS := ${SRCS:.cpp=.o} DEPS := ${SRCS:.cpp=.d} override BLASR_PATH=${SRCDIR}/ export BLASR_PATH override LD_LIBRARY_PATH:=${LIBBLASR_LIB}:${LIBPBIHDF_LIB}:${LIBPBDATA_LIB}:${HDF5_LIB}:${HTSLIB_LIB}:${PBBAM_LIB}:${ZLIB_LIB}:${LD_LIBRARY_PATH} export LD_LIBRARY_PATH # Note: On macosx, this would be DYLD_LIBRARY_PATH. 
vpath %.cpp ${SRCDIR} init-submodule: ${MAKE} update-submodule ${MAKE} configure-submodule ${MAKE} build-submodule update-submodule: git submodule update --init configure-submodule: ${MAKE} -f ${SRCDIR}/sub.mk configure-submodule build-submodule: ${MAKE} -C libcpp distclean-submodule: ${RM} -r libcpp # The rules above must be run separately. all: blasr makeutils #all: makeextrautils #This would require pbbam. blasr: ${OBJS} ${CXX} -o $@ ${CXXFLAGS} ${CPPFLAGS} -MF"${@:%=%.d}" ${OBJS} ${LDFLAGS} ${LDLIBS} @echo LD_LIBRARY_PATH=${LD_LIBRARY_PATH} makeutils: ${MAKE} -C utils makeextrautils: ${MAKE} -C extrautils cramtests: blasr utils ${MAKE} -f cram.mk cramtests ${MAKE} -C utils cramtests cramfast: blasr utils ${MAKE} -f cram.mk cramfast ${MAKE} -C utils cramfast crammild: blasr utils ${MAKE} -f cram.mk crammild ${MAKE} -C utils crammild gtest: blasr # This requires the submodule to be configured with gtest. ${MAKE} -C libcpp gtest check: gtest cramtests cleanall: cleanlib clean # cleanlib is only for submodule users cleanlib: libcpp/defines.mk ${MAKE} -C libcpp clean clean: ${RM} blasr ${OBJS} ${DEPS} blasr.d ${MAKE} -C utils clean ${MAKE} -C extrautils clean -include ${DEPS} blasr-smrtanalysis-4.0.0/rules.mk000066400000000000000000000014101302464523700170320ustar00rootroot00000000000000INCDIRS := \ ${LIBBLASR_INC} \ ${LIBPBIHDF_INC} \ ${LIBPBDATA_INC} SYSINCDIRS := \ ${PBBAM_INC} \ ${HDF5_INC} \ ${HTSLIB_INC} \ ${BOOST_INC} LIBDIRS := \ ${LIBBLASR_LIB} \ ${LIBPBIHDF_LIB} \ ${LIBPBDATA_LIB} \ ${PBBAM_LIB} \ ${HDF5_LIB} \ ${HTSLIB_LIB} \ ${GCC_LIB} \ ${SZLIB_LIB} \ ${ZLIB_LIB} LDLIBS+= \ ${LIBPBIHDF_LIBFLAGS} \ ${LIBBLASR_LIBFLAGS} \ ${LIBPBIHDF_LIBFLAGS} \ ${LIBPBDATA_LIBFLAGS} \ ${PBBAM_LIBFLAGS} \ ${HDF5_LIBFLAGS} \ ${HTSLIB_LIBFLAGS} \ ${SZLIB_LIBFLAGS} \ ${ZLIB_LIBFLAGS} \ ${RT_LIBFLAGS} \ ${PTHREAD_LIBFLAGS} \ ${DL_LIBFLAGS} # We repeat LIBPBIHDF_LIBFLAGS because of a circular dependency. See #77. 
CPPFLAGS+=$(patsubst %,-I%,${INCDIRS}) CPPFLAGS+=$(patsubst %,-I%,${SYSINCDIRS}) LDFLAGS+=$(patsubst %,-L%,${LIBDIRS}) blasr-smrtanalysis-4.0.0/sub.mk000066400000000000000000000002571302464523700165010ustar00rootroot00000000000000 SRCDIR:=$(dir $(realpath $(firstword ${MAKEFILE_LIST}))) -include ${CURDIR}/defines.mk -include ${SRCDIR}/rules.mk export configure-submodule: cd libcpp && ./configure.py blasr-smrtanalysis-4.0.0/travis.sh000077500000000000000000000005161302464523700172240ustar00rootroot00000000000000#!/usr/bin/env bash # This will not work within Travis until have have pre-compiled HDF5 # (or least headers?). But it shows the steps. set -ex # There is a bug without --shared. Working on it. See #77. ./configure.py --shared --sub --no-pbbam HDF5_INC=${HDF5_INC} HDF5_LIB=${HDF5_LIB} make -j4 init-submodule make --debug=b -j4 all blasr-smrtanalysis-4.0.0/utils/000077500000000000000000000000001302464523700165135ustar00rootroot00000000000000blasr-smrtanalysis-4.0.0/utils/.gitignore000066400000000000000000000001341302464523700205010ustar00rootroot00000000000000/loadPulses /pls2fasta /samFilter /samtoh5 /samtom4 /sawriter /sdpMatcher /toAfg *.swp tags blasr-smrtanalysis-4.0.0/utils/LoadPulses.cpp000066400000000000000000003506211302464523700213010ustar00rootroot00000000000000#define __FAST_MATH__ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include using namespace std; typedef map MovieNameToArrayIndex; typedef map MetricOptionsMap; typedef map > RequirementMap; char VERSION[] = "v1.1.0"; char PERFORCE_VERSION_STRING[] = "$Change: 126407 $"; // define default values for metrics const float NaN = 0.0/0.0; const UChar missingQualityValue = 255; const unsigned char maxQualityValue = 100; const HalfWord missingFrameRateValue = USHRT_MAX; const unsigned int missingPulseIndex = UINT_MAX; void 
CapQualityValue(QualityValueVector &vect, DNALength length, unsigned char maxQualityValue=100) { unsigned int i; if (vect.data == NULL) { return; } for (i = 0; i < length; i++) { vect.data[i] = min(vect.data[i], maxQualityValue); } } void CapQualityValues(SMRTSequence &seq, unsigned char maxQualityValue = 100) { CapQualityValue(seq.qual, seq.length, maxQualityValue); CapQualityValue(seq.deletionQV, seq.length, maxQualityValue); CapQualityValue(seq.preBaseDeletionQV, seq.length, maxQualityValue); CapQualityValue(seq.insertionQV, seq.length, maxQualityValue); CapQualityValue(seq.substitutionQV, seq.length, maxQualityValue); CapQualityValue(seq.mergeQV, seq.length, maxQualityValue); } int CheckCmpFileFormat(CmpFile &cmpFile) { if (cmpFile.readType != ReadType::Standard) { cout << "ERROR! Reading pulse information into a cmp.h5 file generated from circular " << endl << "consensus called sequences is not supported." << endl; exit(1); } return 1; } void BuildRequirementMap(RequirementMap &fieldRequirements) { fieldRequirements["StartTimeOffset"].push_back("StartFrame"); fieldRequirements["StartTimeOffset"].push_back("NumEvent"); fieldRequirements["StartFrame"].push_back("PreBaseFrames"); fieldRequirements["StartFrame"].push_back("WidthInFrames"); fieldRequirements["PulseWidth"].push_back("WidthInFrames"); fieldRequirements["pkmid"].push_back("MidSignal"); fieldRequirements["pkmid"].push_back("NumEvent"); fieldRequirements["IPD"].push_back("StartFrame"); fieldRequirements["IPD"].push_back("NumEvent"); fieldRequirements["IPD"].push_back("PreBaseFrames"); fieldRequirements["IPD"].push_back("WidthInFrames"); fieldRequirements["Light"].push_back("MeanSignal"); fieldRequirements["Light"].push_back("NumEvent"); fieldRequirements["Light"].push_back("WidthInFrames"); // Build requirementMap for sneaky metrics fieldRequirements["StartFrameBase"].push_back("PreBaseFrames"); fieldRequirements["StartFrameBase"].push_back("WidthInFrames"); 
fieldRequirements["StartFramePulse"].push_back("PreBaseFrames"); fieldRequirements["StartFramePulse"].push_back("WidthInFrames"); } void ExclusivelyAdd(const char *value, vector &vect) { if (find(vect.begin(), vect.end(), value) == vect.end()) { vect.push_back(value); } } bool AnyFieldRequiresFrameRate(vector &fields) { for (size_t i = 0; i < fields.size(); i++ ) { if (fields[i] == "PulseWidth" or fields[i] == "IPD" or fields[i] == "Light" or fields[i] == "StartTimeOffset" or fields[i] == "StartFrame" or fields[i] == "PulseWidth" or fields[i] == "PreBaseFrames" or fields[i] == "WidthInFrames") { return true; } } return false; } template void Free(T* &buf) { if (buf != NULL){ delete[] buf; } buf = NULL; } // Return all eighteen metrics that can be loaded. // StartTimeOffset QualityValue InsertionQV MergeQV // DeletionQV DeletionTag PulseIndex SubstitutionTag // SubstitutionQV ClassifierQV StartFrame PulseWidth // PreBaseFrames WidthInFrames pkmid IPD // Light WhenStarted vector GetAllSupportedMetrics(bool isSneakyMetricsIncluded = true) { // The order of metrics matters. With -bymetric option, all fields // which are required for computing a metric are cached before WriteMetric() // and cleared afterwards. If two neighboring metrics share a subset of // required fields, then the cached fields can be re-used. Arrange metrics // in an order that maximizes reuse of cached fields. 
vector supportedMetrics; supportedMetrics.push_back("WhenStarted"); supportedMetrics.push_back("QualityValue"); supportedMetrics.push_back("InsertionQV"); supportedMetrics.push_back("MergeQV"); supportedMetrics.push_back("DeletionQV"); supportedMetrics.push_back("DeletionTag"); supportedMetrics.push_back("SubstitutionTag"); supportedMetrics.push_back("SubstitutionQV"); supportedMetrics.push_back("PreBaseFrames"); // Sneaky metrics for internal use Only if (isSneakyMetricsIncluded) { supportedMetrics.push_back("StartFrameBase"); } supportedMetrics.push_back("IPD"); supportedMetrics.push_back("StartFrame"); if (isSneakyMetricsIncluded) { supportedMetrics.push_back("StartFramePulse"); } // Disable metric StartTimeOffset for now. // StartTimeOffset is placed at the same level as AlnArray, However, the // size of StartTimeOffset is far less than AlnArray, while cmp.h5 spec // requires all datasets at that level to have the same size. // supportedMetrics.push_back("StartTimeOffset"); supportedMetrics.push_back("PulseWidth"); supportedMetrics.push_back("WidthInFrames"); supportedMetrics.push_back("Light"); supportedMetrics.push_back("pkmid"); supportedMetrics.push_back("ClassifierQV"); supportedMetrics.push_back("PulseIndex"); return supportedMetrics; } // Return metrics to load by default. vector GetDefaultMetrics() { vector defaultMetrics; defaultMetrics.push_back("QualityValue"); defaultMetrics.push_back("ClassifierQV"); defaultMetrics.push_back("StartFrame"); defaultMetrics.push_back("PulseWidth"); defaultMetrics.push_back("WidthInFrames"); defaultMetrics.push_back("pkmid"); defaultMetrics.push_back("IPD"); return defaultMetrics; } // Return metrics that can be computed from PulseCalls. 
vector GetPulseMetrics() { vector pulseMetrics; pulseMetrics.push_back("StartFrame"); pulseMetrics.push_back("StartTimeOffset"); pulseMetrics.push_back("ClassifierQV"); pulseMetrics.push_back("PulseWidth"); pulseMetrics.push_back("WidthInFrames"); pulseMetrics.push_back("IPD"); pulseMetrics.push_back("pkmid"); pulseMetrics.push_back("Light"); pulseMetrics.push_back("StartFramePulse"); return pulseMetrics; } // Return true if this metric can be computed from PulseCalls. bool IsPulseMetric(const string & metric) { vector pulseMetrics = GetPulseMetrics(); for (size_t i = 0; i < pulseMetrics.size(); i++) { if (pulseMetrics[i] == metric) return true; } return false; } // Return all metrics that are // (1) supported, // (2) requested to load, and // (3) computable with all required fields available // in either bas.h5 or pls.h5. vector GetMetricsToLoad(map & metricOptions) { vector metricsToLoad; // Get all supported metrics. vector supportedMetrics = GetAllSupportedMetrics(); map::iterator metricIt; for (size_t i = 0; i < supportedMetrics.size(); i++) { string metric = supportedMetrics[i]; metricIt = metricOptions.find(metric); if (metricIt!=metricOptions.end() and metricIt->second) { // Get metrics that are required and computable metricsToLoad.push_back(metricIt->first); } } return metricsToLoad; } void StoreDatasetFieldsFromPulseFields(MetricOptionsMap &fieldSet, RequirementMap &fieldRequirements, vector &datasetFields) { size_t d; MetricOptionsMap::iterator optionsIt; for (optionsIt = fieldSet.begin(); optionsIt != fieldSet.end(); ++optionsIt) { if (optionsIt->second == true) { if (fieldRequirements.find(optionsIt->first) == fieldRequirements.end()) { ExclusivelyAdd(optionsIt->first.c_str(), datasetFields); } else { for (d = 0; d < fieldRequirements[optionsIt->first].size(); d++) { ExclusivelyAdd(fieldRequirements[optionsIt->first][d].c_str(), datasetFields ); } } } } } void ParseMetricsList(string metricListString, MetricOptionsMap &metricOptions) { vector metrics; 
Splice(metricListString, ",", metrics); for (size_t m = 0; m < metrics.size(); m++) { if (metricOptions.find(metrics[m]) != metricOptions.end()) { metricOptions[metrics[m]] = true; } else { cout << "ERROR! Metric " << metrics[m] << " is not supported." << endl; exit(1); } } } // Set default metric options to true void SetDefaultMetricOptions(map & metricOptions) { vector defaultMetrics = GetDefaultMetrics(); for (size_t i = 0; i < defaultMetrics.size(); i++) { metricOptions[defaultMetrics[i]] = true; } } // Initialize all supported metric options and set all to false void CreateMetricOptions(map &metricOptions) { vector supportedMetrics = GetAllSupportedMetrics(); for (size_t i = 0; i < supportedMetrics.size(); i++) { metricOptions[supportedMetrics[i]] = false; } } // Check whether all fields are available or not. bool AreAllFieldsAvailable( vector & requiredFields, HDFBasReader & hdfBasReader, HDFPlsReader & hdfPlsReader, const bool & useBaseFile, const bool & usePulseFile) { bool allAvailable = true; for (size_t i = 0; i < requiredFields.size(); i++) { Field field = requiredFields[i]; if (field.type == BasField) { if (!useBaseFile or !hdfBasReader.FieldIsIncluded(field.name) or !hdfBasReader.includedFields[field.name]) { allAvailable = false; break; } } else if (field.type == PlsField) { if (!usePulseFile or !hdfPlsReader.FieldIsIncluded(field.name) or !hdfPlsReader.includedFields[field.name]) { allAvailable = false; break; } } } return allAvailable; } // // Check whether a metric is computable or not. // fieldsToBeUsed = all fields that will be used for computing a metric. // If a metric can be computed from both bas and pls files (e.g. // StartFrame, IPD, PulseWidth, WidthInFrame), only compute it from pls. 
// bool CanThisMetricBeComputed ( const string & metricName, HDFBasReader & hdfBasReader, HDFPlsReader & hdfPlsReader, const bool & useBaseFile, const bool & usePulseFile, vector & fieldsToBeUsed) { fieldsToBeUsed.clear(); FieldsRequirement fieldsRequirement = FieldsRequirement(metricName); bool metricMayBeComputedFromPls = true; if (fieldsRequirement.fieldsUsePlsFile.size() != 0 && usePulseFile) { metricMayBeComputedFromPls = AreAllFieldsAvailable( fieldsRequirement.fieldsUsePlsFile, hdfBasReader, hdfPlsReader, useBaseFile, usePulseFile); } else { metricMayBeComputedFromPls = false; } bool metricMayBeComputedFromBas = true; if (fieldsRequirement.fieldsUseBasFile.size() != 0 && useBaseFile) { metricMayBeComputedFromBas = AreAllFieldsAvailable( fieldsRequirement.fieldsUseBasFile, hdfBasReader, hdfPlsReader, useBaseFile, usePulseFile); } else { metricMayBeComputedFromBas = false; } bool metricMayBeComputed = true; if (!metricMayBeComputedFromBas and !metricMayBeComputedFromPls) { metricMayBeComputed = false; } // Compute from pls if possible if (metricMayBeComputedFromPls) { fieldsToBeUsed = fieldsRequirement.fieldsUsePlsFile; } else if (metricMayBeComputedFromBas) { fieldsToBeUsed = fieldsRequirement.fieldsUseBasFile; } if (metricName == "StartTimeOffset") { metricMayBeComputed = false; // Disable StartTimeOffset for now. } if (metricName == "WhenStarted") { // WhenStarted requires no fields from neither bas nor pls. metricMayBeComputed = true; } return metricMayBeComputed; } // // Check whether metrics are computable or not. If a metric is not // computable, disable it with a warning or exit with an error. 
// void CanMetricsBeComputed( MetricOptionsMap & metricOptions, HDFBasReader & hdfBasReader, HDFPlsReader & hdfPlsReader, const bool & useBaseFile, const bool & usePulseFile, const bool & failOnMissingData, const string & movieName) { map::iterator metricIt; for (metricIt = metricOptions.begin(); metricIt != metricOptions.end(); ++metricIt) { string metricName = metricIt->first; if (metricName == "") { metricIt->second = false; } if (metricIt->second == false) { continue; } vector fieldsToBeUsed; bool metricMayBeComputed = CanThisMetricBeComputed(metricName, hdfBasReader, hdfPlsReader, useBaseFile, usePulseFile, fieldsToBeUsed); if (metricMayBeComputed == false) { if (failOnMissingData) { cout << "ERROR"; } else { cout << "WARNING"; } cout << ": There is insufficient data to compute metric: " << metricName << " in the file " << movieName << " "; cout << " It will be ignored." << endl; if (failOnMissingData) { exit(1); } metricOptions[metricName] = false; } } } // Return size of a single field in KB. UInt ComputeRequiredMemoryForThisField( Field & thisField, HDFBasReader & hdfBasReader, HDFPlsReader & hdfPlsReader, const bool & useBaseFile, const bool & usePulseFile) { if (thisField.type == BasField) { assert(useBaseFile); return hdfBasReader.GetFieldSize(thisField.name); } if (thisField.type == PlsField) { assert(usePulseFile); return hdfPlsReader.GetFieldSize(thisField.name); } assert(false); } // // Return estimated memory peak (in KB) for buffering all data using -bymetric. 
// UInt ComputeRequiredMemory( vector & metricsToLoad, HDFBasReader & hdfBasReader, HDFPlsReader & hdfPlsReader, const bool & useBaseFile, const bool & usePulseFile, HDFCmpFile & cmpReader, UInt & totalAlnLength) { UInt maxMemory = 0; for (size_t i = 0; i < metricsToLoad.size(); i++) { UInt memoryForThisMetric = 0; vector fieldsToBeUsed; bool canBeComputed = CanThisMetricBeComputed( metricsToLoad[i], hdfBasReader, hdfPlsReader, useBaseFile, usePulseFile, fieldsToBeUsed); (void)(canBeComputed); for (size_t j = 0; j < fieldsToBeUsed.size(); j++) { UInt memoryForThisField = ComputeRequiredMemoryForThisField( fieldsToBeUsed[j], hdfBasReader, hdfPlsReader, useBaseFile, usePulseFile); memoryForThisMetric += memoryForThisField; } maxMemory = max(maxMemory, memoryForThisMetric); } // // AlnIndex will be buffered. Some other datastructures also need // to be buffered for quick look up. Approximately double the size. // UInt totalAlnIndexMem = 2 * cmpReader.alnInfoGroup.GetAlnIndexSize(); // // AlnArray and metrics to load needs to be buffered in KB. // UInt totalAlnArrayMem = totalAlnLength / 1024 * (sizeof(unsigned int) + sizeof(unsigned char)); // // It's diffcult to estimate how much memory will be used by hdf5. // Assume memory consumed by hdf5 scales with AlnIndex and AlnArray datasets. // UInt hdf5Mem = totalAlnIndexMem / 2 + totalAlnLength / 1024 * sizeof(unsigned int); maxMemory += totalAlnIndexMem + totalAlnArrayMem + hdf5Mem; //cout << "The estimated peak memory for buffering fields is " // << maxMemory << " KB." << endl; //cout << "The estimated memory for buffering AlnIndex related data is " // << totalAlnIndexMem << " KB."<< endl; //cout << "The estimated memory for buffering AlnArray related data is " // << totalAlnArrayMem << " KB." << endl; //cout << "The estimated memory for hdf5 is " // << hdf5Mem << " KB." << endl; //cout << "The estimated total memory is " // << maxMemory << " KB." 
<< endl; return maxMemory; } // // Get aligned sequence for this alignment from cmpFile // string GetAlignedSequenceFromCmpFile( const HDFCmpFile & cmpReader, MovieAlnIndexLookupTable & lookupTable) { string alignedSequence; vector byteAlignment; int alignedSequenceLength = lookupTable.offsetEnd - lookupTable.offsetBegin; if (alignedSequenceLength >= 0 ) { alignedSequence.resize(alignedSequenceLength); byteAlignment.resize(alignedSequenceLength); } // // Read the alignment string. All alignments // cmpReader.refAlignGroups[lookupTable.refGroupIndex]->readGroups[lookupTable.readGroupIndex]->alignmentArray.Read( lookupTable.offsetBegin, lookupTable.offsetEnd, &byteAlignment[0]); // // Convert to something we can compare easily. // ByteAlignmentToQueryString(&byteAlignment[0], byteAlignment.size(), &alignedSequence[0]); return alignedSequence; } // // Store info necessary for loading pulses to lookupTable. // void BuildLookupTable( const int & movieAlignmentIndex, CmpFile & cmpFile, BaseFile & baseFile, const bool & usePulseFile, PulseFile & pulseFile, HDFCmpFile & cmpReader, const vector & movieAlnIndex, const vector< pair > & toFrom, const set & moviePartHoleNumbers, MovieAlnIndexLookupTable & lookupTable) { // // Query the cmp file for a way to look up a read based on // coordinate information. For Astro reads, the coords are // based on x and y. For Springfield, it is read index. The // base files should be able to look up reads by x,y or by // index. // if (cmpFile.platformId == Astro) { cout << "ASTRO pulse loading is deprecated." << endl; exit(1); } int alignmentIndex = movieAlnIndex[toFrom[movieAlignmentIndex].second]; // // Alignments are grouped by ref group id then movie id. 
// int refGroupId = cmpFile.alnInfo.alignments[alignmentIndex].GetRefGroupId(); int movieId = cmpFile.alnInfo.alignments[alignmentIndex].GetMovieId(); (void)(movieId); UInt holeNumber = cmpFile.alnInfo.alignments[alignmentIndex].GetHoleNumber(); int alnGroupId = cmpFile.alnInfo.alignments[alignmentIndex].GetAlnGroupId(); if (cmpReader.refGroupIdToArrayIndex.find(refGroupId) == cmpReader.refGroupIdToArrayIndex.end()) { cout << "ERROR! An alignment " << alignmentIndex << " is specified with reference group " << endl << refGroupId << " that is not found as an alignment group." << endl; exit(1); } int refGroupIndex = cmpReader.refGroupIdToArrayIndex[refGroupId]; // // Now find the group containing the alignment. // if (cmpReader.alnGroupIdToReadGroupName.find(alnGroupId) == cmpReader.alnGroupIdToReadGroupName.end()) { cout << "ERROR! An alignment " << alignmentIndex << " is specified with alignment group " << endl << alnGroupId << " that is not found." << endl; exit(1); } string readGroupName = cmpReader.alnGroupIdToReadGroupName[alnGroupId]; if (cmpReader.refAlignGroups[refGroupIndex]->experimentNameToIndex.find(readGroupName) == cmpReader.refAlignGroups[refGroupIndex]->experimentNameToIndex.end()) { cout << "ERROR! An alignment " << alignmentIndex << " is specified with read group name " << endl << readGroupName << " that is not found." << endl; exit(1); } int readGroupIndex = cmpReader.refAlignGroups[refGroupIndex]->experimentNameToIndex[readGroupName]; UInt offsetBegin = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetBegin(); UInt offsetEnd = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetEnd(); // // First pull out the bases corresponding to this read. 
// int queryStart = cmpFile.alnInfo.alignments[alignmentIndex].GetQueryStart(); int queryEnd = cmpFile.alnInfo.alignments[alignmentIndex].GetQueryEnd(); bool skip = false; int readIndex, readStart, readLength, plsReadIndex; readIndex = readStart = readLength = plsReadIndex = -1; // // Since the movie may be split into multiple parts, look to see // if this hole number is one of the ones covered by this // set. If it is not, just continue. It will be loaded on // another pass through a different movie part. // if (moviePartHoleNumbers.find(holeNumber) == moviePartHoleNumbers.end()) { skip = true; } else { if (!baseFile.LookupReadIndexByHoleNumber(holeNumber, readIndex)) { cout << "ERROR! Alignment has hole number " << holeNumber << " that is not in the movie. " << endl; exit(1); } readStart = baseFile.readStartPositions[readIndex]; readLength = baseFile.readStartPositions[readIndex+1] - baseFile.readStartPositions[readIndex]; if (usePulseFile) { if (!pulseFile.LookupReadIndexByHoleNumber(holeNumber, plsReadIndex)) { cout << "ERROR! Alignment has hole number " << holeNumber << " that is not in the movie. " << endl; exit(1); } assert(pulseFile.holeNumbers[plsReadIndex] == baseFile.holeNumbers[readIndex]); } } // Save info to lookupTable lookupTable.SetValue(skip, // Skip processing this or not movieAlignmentIndex, alignmentIndex, refGroupIndex, readGroupIndex, holeNumber, // cmp.h5 /AlnInfo/AlnIndex column 7 offsetBegin, // cmp.h5 /AlnInfo/AlnIndex column 18 offsetEnd, // cmp.h5 /AlnInfo/AlnIndex column 19 queryStart, // cmp.h5 /AlnInfo/AlnIndex column 11 queryEnd, // cmp.h5 /AlnInfo/AlnIndex column 12 readIndex, // hole Index in BaseCalls/ZMW/HoleNumber readStart, // readStart in BaseCalls/* (e.g. *=Basecall) readLength, // readLength in BaseCalls/* plsReadIndex); // readIndex in PulseCalls/ZMW/HoleNumber } // // Map bases of a read to pulse indices. 
// void MapBaseToPulseIndex( BaseFile & baseFile, PulseFile & pulseFile, MovieAlnIndexLookupTable & table, vector & baseToPulseIndexMap) { baseToPulseIndexMap.resize(table.readLength); int pulseStart = pulseFile.pulseStartPositions[table.plsReadIndex]; // // Copy the subset of pulses that correspond to the ones called as bases. // int i; for (i = 0; i < table.readLength; i++) { baseToPulseIndexMap[i] = pulseStart + baseFile.pulseIndex[table.readStart + i]; } } // // Get source read from the bas/pls file. // void GetSourceRead(CmpFile & cmpFile, BaseFile & baseFile, PulseFile & pulseFile, HDFBasReader & hdfBasReader, HDFPlsReader & hdfPlsReader, HDFCCSReader & hdfCcsReader, const bool & useBaseFile, const bool & usePulseFile, const bool & useCcsOnly, //const bool & byRead, MovieAlnIndexLookupTable & table, const string & alignedSequence, SMRTSequence & sourceRead, unsigned int & numPasses) { (void)(baseFile); (void)(pulseFile); (void)(alignedSequence); assert(!table.skip); // // These are not allocated in the regular allocate function // since they are only used in loadPulses. (maybe I should // subclass SMRTSequence here). // //if (byRead) { // Read in the data from the bas file if it exsts. if (useBaseFile) { hdfBasReader.GetReadAt(table.readIndex, sourceRead); if (cmpFile.readType == ReadType::CCS or useCcsOnly) { numPasses = hdfCcsReader.GetNumPasses(table.readIndex); } } // Read in the data from the pls file if it exists. if (usePulseFile) { hdfPlsReader.GetReadAt(table.plsReadIndex, sourceRead.pulseIndex, sourceRead); } // } // else { // This is deprecated // // // // The entire base/pulse file was read in, so copy data from that into a read // // For the data used in the read, it is possible to simply // // reference the data, but for the pls file it is necessary // // to copy since there is a packing of data. 
// // // if (useBaseFile) { // baseFile.CopyReadAt(table.readIndex, sourceRead); // if (cmpFile.readType == ReadType::CCS or useCcsOnly) { // numPasses = hdfCcsReader.GetNumPasses(table.readIndex); // } // } // if (usePulseFile) { // vector baseToPulseIndexMap; // MapBaseToPulseIndex(baseFile, pulseFile, table, baseToPulseIndexMap); // pulseFile.CopyReadAt(table.readIndex, &baseToPulseIndexMap[0], sourceRead); // } //} CapQualityValues(sourceRead); } // // Build lookup tables for all alignments whose indices in // AlnArray are saved in movieAlnIndex. // Also check whether the bas file and the cmp file match. // void BuildLookupTablesAndMakeSane( CmpFile & cmpFile, BaseFile & baseFile, PulseFile & pulseFile, HDFCmpFile & cmpReader, HDFBasReader & hdfBasReader, HDFPlsReader & hdfPlsReader, HDFCCSReader & hdfCcsReader, const bool & useBaseFile, const bool & usePulseFile, const bool & useCcsOnly, const vector & movieAlnIndex, const vector< pair > & toFrom, const set & moviePartHoleNumbers, vector & lookupTables) { (void)(hdfPlsReader); (void)(hdfCcsReader); (void)(useCcsOnly); (void)(useBaseFile); lookupTables.resize(movieAlnIndex.size()); size_t movieAlignmentIndex = 0; for (movieAlignmentIndex = 0; movieAlignmentIndex < movieAlnIndex.size(); movieAlignmentIndex++) { BuildLookupTable(movieAlignmentIndex, cmpFile, baseFile, usePulseFile, pulseFile, cmpReader, movieAlnIndex, toFrom, moviePartHoleNumbers, lookupTables[movieAlignmentIndex]); } // // Load entire Basecall from pls/bas to memory, and // check whether aligned sequences in cmp.h5 matches // sequences in pls/bas or not // hdfBasReader.ReadField(baseFile, "Basecall"); // // For each alignment, do sanity check and // cache aligned sequence in MovieAlnIndexLookupTable // for (movieAlignmentIndex = 0; movieAlignmentIndex < movieAlnIndex.size(); movieAlignmentIndex++) { MovieAlnIndexLookupTable & table = lookupTables[movieAlignmentIndex]; if (table.skip) continue; // // Get aligned sequence for this alignment from 
cmpFile // string alignedSequence = GetAlignedSequenceFromCmpFile(cmpReader, table); // Save the aligned sequence in the table table.alignedSequence = alignedSequence; RemoveGaps(alignedSequence, alignedSequence); // // Get sequence for this alignment from baseFile // Nucleotide * seq = new Nucleotide[table.readLength]; baseFile.CopyArray(baseFile.baseCalls, table.readStart, table.readLength, seq); string readSequence; readSequence.resize(table.queryEnd - table.queryStart); copy((char*) (seq + table.queryStart), (char*) (seq + table.queryEnd), readSequence.begin()); delete []seq; // // Do a sanity check to make sure the pulses and the alignment // make sense. The main check is to see if the query sequence // in the alignment is the same as the query sequence in the // read. // if (alignedSequence.size() != readSequence.size() or alignedSequence != readSequence) { cout << "ERROR, the query sequence does not match the aligned query sequence." << endl << "HoleNumber: " << cmpFile.alnInfo.alignments[table.alignmentIndex].GetHoleNumber() << ", MovieName: " << baseFile.GetMovieName() << ", ReadIndex: " << table.readIndex << ", qStart: " << table.queryStart << ", qEnd: " << table.queryEnd << endl << "Aligned sequence: " << endl << alignedSequence << endl << "Original sequence: "<< endl << readSequence << endl; exit(1); } } hdfBasReader.ClearField(baseFile, "Basecall"); } // Given a vector of lookupTables in which items with the same // refGroupIndex and readGroupIndex are grouped, find index boundaries // of each group and save these boundaries to groupedLookupTablesIndexPairs // The index boundary of each group consists of: // 1, index (0 based, inclusive) of the very first item of a group // 2, index (0 based, exclusive) of the very last item of a group // // Assume that lookupTables satisfy the following criteria. 
// 1, items are already grouped by refGroupIndex and readGroupIndex // 2, items which have the same alnGroupIndex, should have // the same refGroupIndex and readGroupIndex // Note that: // 1, alnGroupIndex represents index of AlnGroupID, (i.e. dataset // /AlnInfo/AlnIndex column 1); // refGroupIndex represents index of RefGroupID, (i.e. dataset // /AlnInfo/AlnIndex column 3); // readGroupIndex represents index of an experiment group within // a refGroup (e.g. if a refGroup /ref0001 contains two experiment // groups /ref0001/movie1 and /ref0001/movie2, then readGroupIndex // for these two groups are 0 and 1.). // 2, within each grouped item, offsetBegin may not begin from 0, // and offsets may not be continugous. // void GroupLookupTables( vector & lookupTables, vector > & groupedLookupTablesIndexPairs) { vector > refGroupIndexReadGroupIndexPairs; UInt movieAlignmentIndex = 0; size_t preRefGroupIndex = 0; size_t preReadGroupIndex = 0; UInt pairFirst = 0; bool isVeryFirstGroup = true; for (movieAlignmentIndex = 0; movieAlignmentIndex < lookupTables.size(); movieAlignmentIndex++) { MovieAlnIndexLookupTable & lookupTable = lookupTables[movieAlignmentIndex]; if (isVeryFirstGroup or (lookupTable.refGroupIndex != preRefGroupIndex or lookupTable.readGroupIndex != preReadGroupIndex)) { // Find a new group if (isVeryFirstGroup) { // This is the very first group isVeryFirstGroup = false; } else if (lookupTable.refGroupIndex == preRefGroupIndex && lookupTable.readGroupIndex != preReadGroupIndex) { // Assumption (1) has been violated cout << "ERROR! lookupTables should have been sorted by reference" << "group index and read group index." << endl; exit(1); } else { // Find the first lookupTable of a new group, save indices of [first and last) // lookupTables of the last group. 
groupedLookupTablesIndexPairs.push_back(pair (pairFirst, movieAlignmentIndex)); // Save refGroupIndex and readGroupIndex of the last group pair refGroupIndexReadGroupIndexPair(preRefGroupIndex, preReadGroupIndex); refGroupIndexReadGroupIndexPairs.push_back(refGroupIndexReadGroupIndexPair); } // Store index of the first lookupTable of the new group in lookupTables pairFirst = movieAlignmentIndex; // Store refGroupIndex and readGroupIndex of the new group preRefGroupIndex = lookupTable.refGroupIndex; preReadGroupIndex = lookupTable.readGroupIndex; } } if (not isVeryFirstGroup) { // Save indices of [first and last) lookupTables of the very last group groupedLookupTablesIndexPairs.push_back(pair (pairFirst, movieAlignmentIndex)); // Save refGroupIndex and readGroupIndex of the very last group pair refGroupIndexReadGroupIndexPair(preRefGroupIndex, preReadGroupIndex); refGroupIndexReadGroupIndexPairs.push_back(refGroupIndexReadGroupIndexPair); } // Do nothing, if no lookupTable exists // Double check all assumptions are met for (size_t i = 0; i < refGroupIndexReadGroupIndexPairs.size(); i++) { for (size_t j = i+1; j < refGroupIndexReadGroupIndexPairs.size(); j++) { // Assure that assumption (1) is met. If this assertion fails, // then alignments in the input cmp.h5 are not grouped by // reference. Check /AlnInfo/AlnIndex dataset column 3. 
assert(refGroupIndexReadGroupIndexPairs[i] != refGroupIndexReadGroupIndexPairs[j]); } } assert(groupedLookupTablesIndexPairs.size() == refGroupIndexReadGroupIndexPairs.size()); for (size_t i = 0; i < groupedLookupTablesIndexPairs.size(); i++) { UInt firstIndex = groupedLookupTablesIndexPairs[i].first; UInt lastIndex = groupedLookupTablesIndexPairs[i].second; UInt refGroupIndex = refGroupIndexReadGroupIndexPairs[i].first; UInt readGroupIndex = refGroupIndexReadGroupIndexPairs[i].second; for(UInt index = firstIndex; index < lastIndex; index++) { assert(lookupTables[index].refGroupIndex == refGroupIndex); assert(lookupTables[index].readGroupIndex == readGroupIndex); } } } // // Read all required fields for computing the specified metric into memory, // unless the fields have been cached. // void CacheRequiredFieldsForMetric( BaseFile & baseFile, PulseFile & pulseFile, HDFBasReader & hdfBasReader, HDFPlsReader & hdfPlsReader, HDFCCSReader & hdfCcsReader, const bool & useBaseFile, const bool & usePulseFile, const bool & useCcsOnly, vector & cachedFields, const string & curMetric) { (void)(hdfCcsReader); (void)(useCcsOnly); vector fieldsToBeUsed; bool canBeComputed = CanThisMetricBeComputed( curMetric, hdfBasReader, hdfPlsReader, useBaseFile, usePulseFile, fieldsToBeUsed); assert(canBeComputed); // Cache all required fields for (size_t i = 0; i < fieldsToBeUsed.size(); i++) { bool isFieldCached = false; for (size_t j = 0; j < cachedFields.size(); j++) { if (fieldsToBeUsed[i] == cachedFields[j]) { isFieldCached = true; break; } } if (isFieldCached) { continue; } string & curField = fieldsToBeUsed[i].name; FieldType & fieldType= fieldsToBeUsed[i].type; if (fieldType == BasField and useBaseFile and hdfBasReader.FieldIsIncluded(curField) and hdfBasReader.includedFields[curField]) { hdfBasReader.ReadField(baseFile, curField); cachedFields.push_back(fieldsToBeUsed[i]); } else if (fieldType == PlsField and usePulseFile and hdfPlsReader.FieldIsIncluded(curField) and 
hdfPlsReader.includedFields[curField]) { hdfPlsReader.ReadField(pulseFile, curField); cachedFields.push_back(fieldsToBeUsed[i]); } } } // // Clear cached fields unless they are also required for computing // the next metric. // void ClearCachedFields( BaseFile & baseFile, PulseFile & pulseFile, HDFBasReader & hdfBasReader, HDFPlsReader & hdfPlsReader, HDFCCSReader & hdfCcsReader, const bool & useBaseFile, const bool & usePulseFile, const bool & useCcsOnly, vector & cachedFields, const string & curMetric, const string & nextMetric) { (void)(hdfCcsReader); (void)(useCcsOnly); (void)(curMetric); vector nextRequiredFields; if (nextMetric != "") { bool canBeComputed = CanThisMetricBeComputed( nextMetric, hdfBasReader, hdfPlsReader, useBaseFile, usePulseFile, nextRequiredFields); assert(canBeComputed); } for (size_t i = 0; i < cachedFields.size(); i++) { bool isRequiredForNextMetric = false; for (size_t j = 0; j < nextRequiredFields.size(); j++) { if (cachedFields[i] == nextRequiredFields[j]) { isRequiredForNextMetric = true; break; } } if (isRequiredForNextMetric) { continue; } string & curField = cachedFields[i].name; FieldType & fieldType= cachedFields[i].type; if (fieldType == BasField and useBaseFile and hdfBasReader.FieldIsIncluded(curField) and hdfBasReader.includedFields[curField]) { hdfBasReader.ClearField(baseFile, curField); // Remove it from cachedFields cachedFields.erase(cachedFields.begin()+i); i--; } else if (fieldType == PlsField and usePulseFile and hdfPlsReader.FieldIsIncluded(curField) and hdfPlsReader.includedFields[curField]) { if (curField == "NumEvent") { // Always keep NumEvent continue; } hdfPlsReader.ClearField(pulseFile, curField); // Remove it from cachedFields cachedFields.erase(cachedFields.begin()+i); i--; } } } // Compute StartFrame from BaseCalls only. // Return true if succeed, false otherwise. 
bool ComputeStartFrameFromBase( BaseFile & baseFile, HDFBasReader & hdfBasReader, const bool & useBaseFile, MovieAlnIndexLookupTable & lookupTable, vector & newStartFrame) { newStartFrame.resize(lookupTable.readLength); if (useBaseFile and hdfBasReader.FieldIsIncluded("PreBaseFrames") and hdfBasReader.includedFields["PreBaseFrames"] and baseFile.preBaseFrames.size() > 0) { // baseFile.preBaseFrame data type = uint16 // startFrame data type = uint32 for (int i = 0; i < lookupTable.readLength; i++) { newStartFrame[i] = baseFile.preBaseFrames[lookupTable.readStart+i]; } for (int i = 0; i < lookupTable.readLength-1; i++) { newStartFrame[i+1] += baseFile.basWidthInFrames[lookupTable.readStart+i]; } partial_sum(&newStartFrame[0], &newStartFrame[lookupTable.readLength], &newStartFrame[0]); return true; } return false; } // Compute StartFrame from PulseCalls only. // Return true if succeed, false otherwise. bool ComputeStartFrameFromPulse( PulseFile & pulseFile, HDFPlsReader & hdfPlsReader, const bool & usePulseFile, MovieAlnIndexLookupTable & lookupTable, vector & baseToPulseIndexMap, vector & newStartFrame) { newStartFrame.resize(lookupTable.readLength); if (usePulseFile) { assert(pulseFile.startFrame.size() > 0); hdfPlsReader.CopyFieldAt(pulseFile, "StartFrame", lookupTable.plsReadIndex, &baseToPulseIndexMap[0], &newStartFrame[0], lookupTable.readLength); return true; } return false; } // Compute StartFrame from either (1) BaseCalls or (2) PulseCalls. // (1) Uses baseFile.preBaseFrames and baseFile.basWidthInFrames // (2) Uses pulseFile.startFrame // In theory, the generated results using both methods should // be exactly the same. However, they can be different in practice // because PreBaseFrames is of data type uint_16, while its // value can exceed maximum uint_16 (65535). // When possible, always use PulseCalls. 
void ComputeStartFrame( BaseFile & baseFile, PulseFile & pulseFile, HDFBasReader & hdfBasReader, HDFPlsReader & hdfPlsReader, bool useBaseFile, bool usePulseFile, MovieAlnIndexLookupTable & lookupTable, vector & baseToPulseIndexMap, vector & newStartFrame) { if (!ComputeStartFrameFromPulse(pulseFile, hdfPlsReader, usePulseFile, lookupTable, baseToPulseIndexMap, newStartFrame)) { if (!ComputeStartFrameFromBase(baseFile, hdfBasReader, useBaseFile, lookupTable, newStartFrame)) { cout << "ERROR! There is insufficient data to compute metric: StartFrame." << endl; exit(1); } } } // // Compute and write an entire metric to cmp.h5. // Assume that all required fields have been loaded. // void WriteMetric( CmpFile & cmpFile, BaseFile & baseFile, PulseFile & pulseFile, HDFCmpFile & cmpReader, HDFBasReader & hdfBasReader, HDFPlsReader & hdfPlsReader, HDFCCSReader & hdfCcsReader, const bool & useBaseFile, const bool & usePulseFile, const bool & useCcsOnly, vector & lookupTables, vector > & groupedLookupTablesIndexPairs, const string & curMetric ) { (void)(cmpFile); (void)(hdfCcsReader); (void)(useCcsOnly); for (size_t index = 0; index < groupedLookupTablesIndexPairs.size(); index++) { // Group[index] contains all items in lookupTables[firstIndex...lastIndex) UInt firstIndex = groupedLookupTablesIndexPairs[index].first; UInt lastIndex = groupedLookupTablesIndexPairs[index].second; assert(lookupTables.size() > firstIndex); UInt refGroupIndex = lookupTables[firstIndex].refGroupIndex; UInt readGroupIndex = lookupTables[firstIndex].readGroupIndex; // Obtain alignment array length from *.cmp.h5/refGroup/readGroup/AlnArray. HDFCmpExperimentGroup* expGroup = cmpReader.refAlignGroups[refGroupIndex]->readGroups[readGroupIndex]; UInt alnArrayLength = expGroup->alignmentArray.size(); // // Compute any necessary data fields. These usually involve // using differences of pulse indices, pulse widths, etc.. // Missing fields are stored as 0's. 
// vector startTimeOffsetMetric; // pulseIndex's data type is uint16 in ICD, // but I have seen it defined as uint32 in a bas file. vector pulseMetric; vector qvMetric; vector frameRateMetric; vector timeMetric; vector tagMetric; vector floatMetric; /* if (curMetric == "StartTimeOffset") { startTimeOffsetMetric.resize(alnNum); HDFArray * data = (HDFArray*) expGroup->fields[curMetric]; if (data->IsInitialized()) { assert(data->size() == alnNum); data->UpdateH5Dataspace(); data->Read(0, alnNum-1, &StartTimeOffsetMetric[0]); } else { data->Initialize(expGroup->experimentGroup, curMetric); fill(startTimeOffsetMetric.begin(), startTimeOffsetMetric.end(), ); } } else */ if (curMetric == "QualityValue" || curMetric == "InsertionQV" || curMetric == "DeletionQV" || curMetric == "MergeQV" || curMetric == "SubstitutionQV") { qvMetric.resize(alnArrayLength); HDFArray * data = (HDFArray*) expGroup->fields[curMetric]; if (data->IsInitialized()) { assert(data->size() == alnArrayLength); data->UpdateH5Dataspace(); data->Read(0, alnArrayLength-1, &qvMetric[0]); } else { data->Initialize(expGroup->experimentGroup, curMetric); //fill(qvMetric.begin(), qvMetric.end(), missingQualityValue); } } else if (curMetric == "ClassifierQV" || curMetric == "pkmid" ) { // Note that data type of pkmid=midSignal, which is uint_8 in bas/pls files, // has been changed to float in cmp.h5. Why? 
floatMetric.resize(alnArrayLength); HDFArray * data = (HDFArray*) expGroup->fields[curMetric]; if (data->IsInitialized()) { assert(data->size() == alnArrayLength); data->UpdateH5Dataspace(); data->Read(0, alnArrayLength-1, &floatMetric[0]); } else { data->Initialize(expGroup->experimentGroup, curMetric); //fill(floatMetric.begin(), floatMetric.end(), NaN); } } else if (curMetric == "PulseIndex" ) { pulseMetric.resize(alnArrayLength); HDFArray * data = (HDFArray*) expGroup->fields[curMetric]; if (data->IsInitialized()) { assert(data->size() == alnArrayLength); data->UpdateH5Dataspace(); data->Read(0, alnArrayLength-1, &pulseMetric[0]); } else { data->Initialize(expGroup->experimentGroup, curMetric); //fill(pulseMetric.begin(), pulseMetric.end(), 0); } } else if (curMetric == "DeletionTag" || curMetric == "SubstitutionTag") { tagMetric.resize(alnArrayLength); HDFArray * data = (HDFArray*) expGroup->fields[curMetric]; if (data->IsInitialized()) { assert(data->size() == alnArrayLength); data->UpdateH5Dataspace(); data->Read(0, alnArrayLength-1, &tagMetric[0]); } else { data->Initialize(expGroup->experimentGroup, curMetric); //fill(tagMetric.begin(), tagMetric.end(), '-'); } } else if (curMetric == "StartFrame" || curMetric == "StartFrameBase" || curMetric == "StartFramePulse") { timeMetric.resize(alnArrayLength); HDFArray * data = (HDFArray*) expGroup->fields[curMetric]; if (data->IsInitialized()) { assert(data->size() == alnArrayLength); data->UpdateH5Dataspace(); data->Read(0, alnArrayLength-1, &timeMetric[0]); } else { data->Initialize(expGroup->experimentGroup, curMetric); //fill(timeMetric.begin(), timeMetric.end(), missingPulseIndex); } } else if (curMetric == "PulseWidth" || curMetric == "PreBaseFrames" || curMetric == "WidthInFrames"|| curMetric == "IPD" || curMetric == "Light") { frameRateMetric.resize(alnArrayLength); HDFArray * data = (HDFArray*) expGroup->fields[curMetric]; if (data->IsInitialized()) { assert(data->size() == alnArrayLength); 
data->UpdateH5Dataspace(); data->Read(0, alnArrayLength-1, &frameRateMetric[0]); } else { data->Initialize(expGroup->experimentGroup, curMetric); //fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue); } } else { cout << "ERROR, metric " << curMetric << " is not supported." << endl; exit(1); } for (size_t movieAlignmentIndex = firstIndex; movieAlignmentIndex < lastIndex; movieAlignmentIndex++) { MovieAlnIndexLookupTable & lookupTable = lookupTables[movieAlignmentIndex]; if (lookupTable.skip) continue; const UInt alignedSequenceLength = lookupTable.offsetEnd - lookupTable.offsetBegin; const UInt ungappedAlignedSequenceLength = lookupTable.queryEnd - lookupTable.queryStart; const UInt & plsReadIndex = lookupTable.plsReadIndex; const UInt & readStart = lookupTable.readStart; const UInt & readLength = lookupTable.readLength; const UInt & queryStart = lookupTable.queryStart; const UInt & offsetBegin = lookupTable.offsetBegin; const UInt & offsetEnd = lookupTable.offsetEnd; assert (offsetEnd <= alnArrayLength); assert (offsetBegin+alignedSequenceLength <= alnArrayLength); // Condense gaps and get ungapped aligned sequence. string ungappedAlignedSequence = lookupTable.alignedSequence; RemoveGaps(ungappedAlignedSequence, ungappedAlignedSequence); vector baseToAlignmentMap; // Map bases in the aligned sequence to their positions in the alignment. CreateSequenceToAlignmentMap(lookupTable.alignedSequence, baseToAlignmentMap); vector baseToPulseIndexMap; if (usePulseFile && IsPulseMetric(curMetric)) { // Map bases in the read to pulse indices. 
MapBaseToPulseIndex(baseFile, pulseFile, lookupTable, baseToPulseIndexMap); } UInt i; if (curMetric == "QualityValue") { assert(baseFile.qualityValues.size() > 0 && baseFile.qualityValues.size() >= readStart + readLength); fill(&qvMetric[offsetBegin], &qvMetric[offsetEnd], missingPulseIndex); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { // cap quality value qvMetric[offsetBegin+baseToAlignmentMap[i]] = min(maxQualityValue, baseFile.qualityValues[readStart+queryStart+i]); } qvMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "InsertionQV") { assert(baseFile.insertionQV.size() > 0 && baseFile.insertionQV.size() >= readStart + readLength); fill(&qvMetric[offsetBegin], &qvMetric[offsetEnd], missingPulseIndex); for (i = 0; i < ungappedAlignedSequenceLength; i++) { // cap quality value qvMetric[offsetBegin+baseToAlignmentMap[i]] = min(maxQualityValue, baseFile.insertionQV[readStart+queryStart+i]); } qvMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "MergeQV") { assert(baseFile.mergeQV.size() > 0 && baseFile.mergeQV.size() >= readStart + readLength); fill(&qvMetric[offsetBegin], &qvMetric[offsetEnd], missingPulseIndex); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { // cap quality value qvMetric[offsetBegin+baseToAlignmentMap[i]] = min(maxQualityValue, baseFile.mergeQV[readStart+queryStart+i]); } qvMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "DeletionQV") { assert(baseFile.deletionQV.size() > 0 && baseFile.deletionQV.size() >= readStart + readLength); fill(&qvMetric[offsetBegin], &qvMetric[offsetEnd], missingPulseIndex); for (i = 0; i < ungappedAlignedSequenceLength; i++) { // cap quality value qvMetric[offsetBegin+baseToAlignmentMap[i]] = min(maxQualityValue, baseFile.deletionQV[readStart+queryStart+i]); } qvMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "DeletionTag") { assert(baseFile.deletionTag.size() > 0 && baseFile.deletionTag.size() >= readStart + 
readLength); fill(&tagMetric[offsetBegin], &tagMetric[offsetEnd], '-'); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { assert(offsetBegin+baseToAlignmentMap[i] < tagMetric.size()); tagMetric[offsetBegin+baseToAlignmentMap[i]] = baseFile.deletionTag[readStart+queryStart+i]; } tagMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "PulseIndex") { assert(baseFile.pulseIndex.size() > 0 && baseFile.pulseIndex.size() >= readStart + readLength); fill(&pulseMetric[offsetBegin], &pulseMetric[offsetEnd], 0); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { pulseMetric[offsetBegin+baseToAlignmentMap[i]] = baseFile.pulseIndex[readStart+queryStart+i]; } pulseMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "SubstitutionTag") { assert(baseFile.substitutionTag.size() > 0 && baseFile.substitutionTag.size() >= readStart + readLength); fill(&tagMetric[offsetBegin], &tagMetric[offsetEnd], '-'); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { tagMetric[offsetBegin+baseToAlignmentMap[i]] = baseFile.substitutionTag[readStart+queryStart+i]; } tagMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "SubstitutionQV") { assert(baseFile.substitutionQV.size() > 0 && baseFile.substitutionQV.size() >= readStart + readLength); fill(&qvMetric[offsetBegin], &qvMetric[offsetEnd], missingPulseIndex); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { qvMetric[offsetBegin+baseToAlignmentMap[i]] = min(maxQualityValue, baseFile.substitutionQV[readStart+queryStart+i]); } qvMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "ClassifierQV") { assert(pulseFile.classifierQV.size() > 0 && pulseFile.classifierQV.size() >= readStart + readLength); vector newClassifierQV; newClassifierQV.resize(ungappedAlignedSequenceLength); // For the data used for this table, it is possible to simply // reference the data for the bas file, but for the pls file, // it is necessary to copy since there is a packing of data. 
hdfPlsReader.CopyFieldAt(pulseFile, "ClassifierQV", plsReadIndex, &baseToPulseIndexMap[queryStart], &newClassifierQV[0], ungappedAlignedSequenceLength); fill(&floatMetric[offsetBegin], &floatMetric[offsetEnd], NaN); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { floatMetric[offsetBegin+baseToAlignmentMap[i]] = newClassifierQV[i]; } floatMetric[offsetBegin+alignedSequenceLength] = 0; /* } else if (curMetric == "StartTimeOffset") { // StartTimeOffset is a subset of StartFrame. vector newStartFrame; ComputeStartFrame(baseFile, pulseFile, hdfBasReader, hdfPlsReader, useBaseFile, usePulseFile, lookupTable, baseToPulseIndexMap, newStartFrame); startTimeOffsetMetric[offsetBegin] = newStartFrame[queryStart]; */ } else if (curMetric == "StartFrame") { vector newStartFrame; ComputeStartFrame(baseFile, pulseFile, hdfBasReader, hdfPlsReader, useBaseFile, usePulseFile, lookupTable, baseToPulseIndexMap, newStartFrame); fill(&timeMetric[offsetBegin], &timeMetric[offsetEnd], missingPulseIndex); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { timeMetric[offsetBegin+baseToAlignmentMap[i]] = newStartFrame[queryStart+i]; } timeMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "StartFrameBase") { // Sneaky metric, compute StartFrame from BaseCalls only. vector newStartFrame; ComputeStartFrameFromBase(baseFile, hdfBasReader, useBaseFile, lookupTable, newStartFrame); fill(&timeMetric[offsetBegin], &timeMetric[offsetEnd], missingPulseIndex); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { timeMetric[offsetBegin+baseToAlignmentMap[i]] = newStartFrame[queryStart+i]; } timeMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "StartFramePulse") { // Sneaky metric, compute StartFrame from PulseCalls only. 
vector newStartFrame; ComputeStartFrameFromPulse(pulseFile, hdfPlsReader, usePulseFile, lookupTable, baseToPulseIndexMap, newStartFrame); fill(&timeMetric[offsetBegin], &timeMetric[offsetEnd], missingPulseIndex); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { timeMetric[offsetBegin+baseToAlignmentMap[i]] = newStartFrame[queryStart+i]; } timeMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "PreBaseFrames") { // Directly load baseFile.PreBaseFrames. // DON'T compute it from PulseCalls even if you can. assert(baseFile.preBaseFrames.size() > 0 && baseFile.preBaseFrames.size() >= readStart + readLength); fill(&frameRateMetric[offsetBegin], &frameRateMetric[offsetEnd], missingFrameRateValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { frameRateMetric[offsetBegin+baseToAlignmentMap[i]] = baseFile.preBaseFrames[readStart+queryStart+i]; } frameRateMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "WidthInFrames" || curMetric == "PulseWidth") { // For legacy reasons, it's possible the width in frames is // stored in the bas file. If this is the case, use the width // in frames there. Otherwise, use the width in frames stored // in the pls file. 
vector newWidthInFrames; newWidthInFrames.resize(ungappedAlignedSequenceLength); if (usePulseFile) { hdfPlsReader.CopyFieldAt(pulseFile, "WidthInFrames", plsReadIndex, &baseToPulseIndexMap[queryStart], &newWidthInFrames[0], ungappedAlignedSequenceLength); } else if (useBaseFile) { // basWidthInFrames data type uint16 copy(&baseFile.basWidthInFrames[readStart+queryStart], &baseFile.basWidthInFrames[readStart+queryStart+ungappedAlignedSequenceLength], &newWidthInFrames[0]); } fill(&frameRateMetric[offsetBegin], &frameRateMetric[offsetEnd], missingFrameRateValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { frameRateMetric[offsetBegin+baseToAlignmentMap[i]] = newWidthInFrames[i]; } frameRateMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "pkmid") { // pkmid in cmp.h5 is MidSignal in pls.h5, but // data type of MidSignal is uint16 in pls files, // data type of pkmid is float in cmp files. assert(usePulseFile); vector newMidSignal; newMidSignal.resize(ungappedAlignedSequenceLength); hdfPlsReader.CopyFieldAt(pulseFile, "MidSignal", plsReadIndex, &baseToPulseIndexMap[queryStart], &newMidSignal[0], ungappedAlignedSequenceLength, ungappedAlignedSequence); fill(&floatMetric[offsetBegin], &floatMetric[offsetEnd], NaN); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { floatMetric[offsetBegin+baseToAlignmentMap[i]] = newMidSignal[i]; } floatMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "IPD") { fill(&frameRateMetric[offsetBegin], &frameRateMetric[offsetEnd], missingFrameRateValue); // IPD can be either (1) copied from baseFile.preBaseFrames // or (2) computed from pulseFile.StartFrame and pulseFile.WidthInFrames // Always use method (2) when possible as it is more accurate. 
if (usePulseFile) { // Need to read StartFrame & WidthInFrames for the entire read, // not only for a subset of bases in the alignment assert(pulseFile.startFrame.size() > 0); assert(pulseFile.plsWidthInFrames.size() > 0); vector newStartFrame; newStartFrame.resize(readLength); hdfPlsReader.CopyFieldAt(pulseFile, "StartFrame", plsReadIndex, &baseToPulseIndexMap[0], &newStartFrame[0], readLength); vector newWidthInFrames; newWidthInFrames.resize(readLength); hdfPlsReader.CopyFieldAt(pulseFile, "WidthInFrames", plsReadIndex, &baseToPulseIndexMap[0], &newWidthInFrames[0], readLength); for (i = 0; i < ungappedAlignedSequenceLength; i++) { // The IPD is undefined for the first base in a read. if (queryStart == 0 and i == 0) { frameRateMetric[offsetBegin+baseToAlignmentMap[i]] = 0; } else { frameRateMetric[offsetBegin+baseToAlignmentMap[i]] = newStartFrame[queryStart+i] - newStartFrame[i+queryStart-1] - newWidthInFrames[i+queryStart-1]; } } } else if (useBaseFile) { assert(baseFile.preBaseFrames.size() > 0); assert(baseFile.preBaseFrames.size() >= readStart + readLength); for (i = 0; i < ungappedAlignedSequenceLength; i++) { frameRateMetric[offsetBegin+baseToAlignmentMap[i]] = baseFile.preBaseFrames[readStart+queryStart+i]; } } frameRateMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "Light") { // Light can be computed from pulseFile.meanSignal and // pulseFile.plsWidthInFrames. Might have been deprecated. 
assert(usePulseFile); fill(&frameRateMetric[offsetBegin], &frameRateMetric[offsetEnd], missingFrameRateValue); vector newMeanSignal; newMeanSignal.resize(ungappedAlignedSequenceLength); hdfPlsReader.CopyFieldAt(pulseFile, "MeanSignal", plsReadIndex, &baseToPulseIndexMap[queryStart], &newMeanSignal[0], ungappedAlignedSequenceLength, ungappedAlignedSequence); vector newWidthInFrames; newWidthInFrames.resize(ungappedAlignedSequenceLength); hdfPlsReader.CopyFieldAt(pulseFile, "WidthInFrames", plsReadIndex, &baseToPulseIndexMap[queryStart], &newWidthInFrames[0], ungappedAlignedSequenceLength); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { frameRateMetric[offsetBegin+baseToAlignmentMap[i]] = newMeanSignal[i] * newWidthInFrames[i]; } frameRateMetric[offsetBegin+alignedSequenceLength] = 0; } else { cout << "ERROR, unknown metric " << curMetric << endl; exit(1); } } // Write the computed metric to cmp.h5. /*if (curMetric == "StartTimeOffset") { expGroup->startTimeOffset.WriteToPos(&startTimeOffsetMetric[0], startTimeOffsetMetric.size(), 0); } else */ if (curMetric == "QualityValue" || curMetric == "InsertionQV" || curMetric == "DeletionQV" || curMetric == "MergeQV" || curMetric == "SubstitutionQV") { HDFArray * data = (HDFArray *) expGroup->fields[curMetric]; data->WriteToPos(&qvMetric[0], qvMetric.size(), 0); } else if (curMetric == "ClassifierQV" || curMetric == "pkmid" ) { HDFArray * data = (HDFArray *) expGroup->fields[curMetric]; data->WriteToPos(&floatMetric[0], floatMetric.size(), 0); } else if (curMetric == "PulseIndex") { HDFArray * data = (HDFArray *) expGroup->fields[curMetric]; data->WriteToPos(&pulseMetric[0], pulseMetric.size(), 0); } else if (curMetric == "DeletionTag" || curMetric == "SubstitutionTag") { HDFArray * data = (HDFArray *) expGroup->fields[curMetric]; data->WriteToPos(&tagMetric[0], tagMetric.size(), 0); } else if (curMetric == "StartFrame" || curMetric == "StartFrameBase"|| curMetric == "StartFramePulse") { HDFArray * data = (HDFArray*) 
//
// Print metrics.
//
// Joins the metric names in 'metrics' into a single string for display,
// e.g. in the usage text and in the description of the -metrics option.
//
//   - A ',' is written after every name except the last one.
//   - A newline is written after every fourth name (indices 3, 7, 11, ...)
//     so that long lists wrap. The newline follows the comma, and it is
//     also written when the fourth name happens to be the last one.
//
// Returns an empty string when 'metrics' is empty.
//
std::string MetricsToString(const std::vector<std::string> & metrics) {
    std::string ret;
    for (size_t i = 0; i < metrics.size(); i++) {
        ret += metrics[i];
        // 'i + 1' avoids the unsigned wrap-around of 'size() - 1'.
        if (i + 1 != metrics.size()) ret += ",";
        if (i % 4 == 3) ret += "\n";
    }
    return ret;
}
<< endl; cout << " Valid metrics are: " << endl; cout << MetricsToString(GetAllSupportedMetrics(false)) << endl; // << " QualityValue, ClassifierQV, MergeQV," << endl // << " StartFrame, PulseWidth, pkmid, IPD, Light" << endl // << " WhenStarted, StartTimeOffset, PreBaseFrames," << endl // << " InsertionQV, DeletionQV, DeletionTag, SubstitutionQV" << endl // << " SubstitutionTag, PulseIndex, WidthInFrames" << endl; cout << " By default, " << MetricsToString(GetDefaultMetrics()) << " are added" << endl; // Deprecate -useccs, an option for old data. // cout << " -useccs This option is for older cmp.h5 files that do not have the read type " << endl // << " stored. Newer cmp.h5 files have a read type that indicates the cmp.h5 file " << endl // << " has alignments generated from de novo ccs sequences. Using this flag assuems"< metricOptions; int maxElements = 0; //Maximum Memory allowed for bymetric is 6 GB int maxMemory = 4; // // Default is all options are false // CreateMetricOptions(metricOptions); string metricList = ""; bool useCcsOnly = false; bool byRead = false; bool byMetric = false; bool failOnMissingData = false; CommandLineParser clp; clp.SetProgramName(program); clp.SetVersion(versionStr); clp.RegisterStringOption("basFileName", &movieFileName, "The input {bas,pls}.h5 or input.fofn.", true); clp.RegisterStringOption("cmpFileName", &cmpFileName, "The cmp.h5 file to load pulse information into.", true); clp.RegisterPreviousFlagsAsHidden(); string metricsDescription = "A comma separated list of metrics (with no spaces).\nValid options are:\n"; metricsDescription += MetricsToString(GetAllSupportedMetrics(false)); metricsDescription += "\nDefault options are:\n"; metricsDescription += MetricsToString(GetDefaultMetrics()); clp.RegisterStringOption("metrics", &metricList, metricsDescription); clp.RegisterFlagOption("failOnMissingData", &failOnMissingData, "Exit if any data fields are missing from the bas.h5 or pls.h5 " "input that are required to load a metric. 
Defualt is a warning."); clp.RegisterFlagOption("byread", &byRead, "Load pulse information by read rather than buffering metrics."); clp.RegisterFlagOption("bymetric", & byMetric, "Load pulse information by metric rather than by read. " "This uses more memory than -byread, but can be faster."); clp.RegisterIntOption("maxElements", &maxElements, "Set a limit on the size of pls/bas file to buffer in with -bymetric " "(default value: maximum int). Use -byread if the limit is exceeded.", CommandLineParser::PositiveInteger); clp.RegisterIntOption("maxMemory", & maxMemory, "Set a limit (in GB) on the memory to buffer data with -bymetric " "(default value: 4 GB). Use -byread if the limit is exceeded.", CommandLineParser::PositiveInteger); int metaNElements, rawChunkSize, rawNElements; metaNElements = 0; rawChunkSize = 0; metaNElements = 0; clp.RegisterIntOption("metaNElements", & metaNElements, "Set number of elements in meta data cache for reading bas/bax/pls.h5 file.", CommandLineParser::PositiveInteger); clp.RegisterIntOption("rawNElements", & rawNElements, "Set number of elements in raw data cache for reading bas/bax/pls.h5 file.", CommandLineParser::PositiveInteger); clp.RegisterIntOption("rawChunkSize", & rawChunkSize, "Set chunk size of raw data cache for reading bas/bax/pls.h5 file.", CommandLineParser::PositiveInteger); string progSummary = ("Loads pulse information such as inter pulse " "distance, or quality information into the cmp.h5 file. This allows " "one to analyze kinetic and quality information by alignment column."); clp.SetProgramSummary(progSummary); clp.ParseCommandLine(argc, argv); cerr << "[INFO] " << GetTimestamp() << " [" << program << "] started." << endl; //use byMetric by default unless byRead is specified. 
byMetric = true; if (byRead) { byMetric = false; } if (metricList == "") { SetDefaultMetricOptions(metricOptions); } else { ParseMetricsList(metricList, metricOptions); } // // Always read in basecalls since they are used to check the sanity // of the alignment indices. // metricOptions["Basecall"] = true; // // Translate from the metrics to be loaded to the ones that are // required to compute them. // Need to be refactored. // vector datasetFields; RequirementMap fieldRequirements; BuildRequirementMap(fieldRequirements); StoreDatasetFieldsFromPulseFields(metricOptions, fieldRequirements, datasetFields); //e.g. /PATH_TO_FILE/m120321_032600_42142_c100310572550000001523013208061210_s1_p0.bas.h5 // /PATH_TO_FILE/m120321_032600_42142_c100310572550000001523013208061210_s2_p0.bas.h5 vector movieFileNames; //e.g. m120321_032600_42142_c100310572550000001523013208061210_s1_p0 // m120321_032600_42142_c100310572550000001523013208061210_s2_p0 vector fofnMovieNames; FileOfFileNames::StoreFileOrFileList(movieFileName, movieFileNames); HDFBasReader hdfBasReader; HDFPlsReader hdfPlsReader; HDFCCSReader hdfCcsReader; vector baseFileFields, pulseFileFields; size_t fieldIndex; bool useBaseFile = false, usePulseFile = false; for (fieldIndex = 0; fieldIndex < datasetFields.size(); fieldIndex++) { if (hdfBasReader.ContainsField(datasetFields[fieldIndex])) { useBaseFile = true; baseFileFields.push_back(datasetFields[fieldIndex]); } } if (maxElements != 0) { hdfBasReader.maxAllocNElements = maxElements; hdfPlsReader.maxAllocNElements = maxElements; } // // For now, all runs will attempt to use information from a .bas // file, since it's assumed that if one has alignments, one has a // .bas file. // useBaseFile = true; // // Add some default fields. 
// hdfBasReader.IncludeField("Basecall"); hdfBasReader.IncludeField("PulseIndex"); hdfBasReader.InitializeFields(baseFileFields); for (fieldIndex = 0; fieldIndex < datasetFields.size(); fieldIndex++) { if (hdfPlsReader.ContainsField(datasetFields[fieldIndex])) { usePulseFile = true; pulseFileFields.push_back(datasetFields[fieldIndex]); } } if (usePulseFile) { // set hdfPlsReader.includedFields[fieldX] to true if fieldX is // in pulseFileFields hdfPlsReader.InitializeFields(pulseFileFields); } hdfPlsReader.IncludeField("NumEvent"); int nMovies = movieFileNames.size(); int movieIndex; MovieNameToArrayIndex movieNameMap; // // Initialize movies. This accomplishes two tasks. First, all movie // files are opened and initialized, so that if there are data // fields missing the program will exit now rather than in the // middle of loading pulses. // Next, a list of movie names is created in fofnMovieNames. The // cmp file does not necessarily index movies in the order of the // fofn, and so when loading pulses from a movie indexed by a cmp // file, one needs to look up the file name of the movie. This is // done by scanning the fofnMovieNames list in order until the movie // is found. // // h5 file access property list can be customized here. 
// H5::FileAccPropList fileAccPropList = H5::FileAccPropList::DEFAULT; // h5: number of items in meta data cache int mdc_nelmts = (metaNElements==0)?(4096):(metaNElements); // h5: number of items in raw data chunk cache size_t rdcc_nelmts = (rawNElements==0)?(4096):(rawNElements); // h5: raw data chunk cache size (in bytes) per dataset size_t rdcc_nbytes = (rawChunkSize==0)?(9192):(rawChunkSize); double rdcc_w0 = 0.75; // h5: preemption policy // fileAccPropList.getCache(mdc_nelmts, rdcc_nelmts, rdcc_nbytes, rdcc_w0); fileAccPropList.setCache(mdc_nelmts, rdcc_nelmts, rdcc_nbytes, rdcc_w0); // fileAccPropList.setCache(4096, 4096, 8388608, rdcc_w0); // If one of the h5 in the fofn is a ccs.h5 file, then only load pulse // information from group /PulseData/ConsensusBaseCalls. for (movieIndex = 0; movieIndex < nMovies; movieIndex++) { FileType fileType; BaseSequenceIO::DetermineFileTypeByExtension(movieFileNames[movieIndex], fileType, true); if (fileType == FileType::HDFCCSONLY) { useCcsOnly = true; } } for (movieIndex = 0; movieIndex < nMovies; movieIndex++) { if (useCcsOnly) { hdfCcsReader.SetReadBasesFromCCS(); hdfBasReader.SetReadBasesFromCCS(); } if (!hdfBasReader.Initialize(movieFileNames[movieIndex], fileAccPropList)) { cout << "ERROR, could not initialize HDF file " << movieFileNames[movieIndex] << " for reading bases." << endl; exit(1); } else { fofnMovieNames.push_back(hdfBasReader.GetMovieName()); movieNameMap[hdfBasReader.GetMovieName()] = movieIndex; hdfBasReader.Close(); } // // The pulse file is optional. // if (usePulseFile) { if (hdfPlsReader.Initialize(movieFileNames[movieIndex], fileAccPropList) == 0) { usePulseFile = false; } } } CmpFile cmpFile; // // These readers pull information from the same pls file. // HDFCmpFile cmpReader; if (cmpReader.Initialize(cmpFileName, H5F_ACC_RDWR) == 0) { cout << "ERROR, could not open the cmp file." << endl; exit(1); } if (cmpReader.HasNoAlignments()) { cout << "WARNING, there is no alignment in the cmp file." 
<< endl; if (useBaseFile) { hdfBasReader.Close(); } if (usePulseFile) { hdfPlsReader.Close(); } cmpReader.Close(); cerr << "[INFO] " << GetTimestamp() << " [" << program << "] ended." << endl; exit(0); } cmpReader.Read(cmpFile, false); // Sanity check: if there is a ccs.h5 file in the fofn and // cmp.h5 file's readType is not CCS, something is wrong. if (cmpFile.readType != ReadType::CCS and useCcsOnly) { cout << "ERROR, there is a ccs.h5 file in the fofn, while read type of" << " the cmp.h5 file is not CCS." << endl; exit(1); } string commandLine; clp.CommandLineToString(argc, argv, commandLine); cmpReader.fileLogGroup.AddEntry(commandLine, "Loading pulse metrics", program, GetTimestamp(), versionStr); // // Group alignment indices by movie so that they may be processed one movie at a time // later on. The movie indices set keeps track of all indices // listed in alignment files. This keeps a reference to all // alignments in memory at once. At the time of writing this, most // projects will have at most a few million alignments, and so the // size of this structure is modest. // Each movieIndexSets[$movieId] contains indices of all the alignments, which // are associated with a movie whose id in dataset /MovieInfo/ID equals $movieId // UInt alignmentIndex; map > movieIndexSets; for (alignmentIndex = 0; alignmentIndex < cmpFile.alnInfo.alignments.size(); alignmentIndex++) { movieIndexSets[cmpFile.alnInfo.alignments[alignmentIndex].GetMovieId()].push_back(alignmentIndex); } // // Load pulses from movies in order they appear in the input fofn. 
// int m; for (size_t fofnMovieIndex = 0; fofnMovieIndex < fofnMovieNames.size(); fofnMovieIndex++) { bool byMetricForThisMovie = byMetric; if (cmpFile.readType == ReadType::CCS or useCcsOnly) { hdfBasReader.SetReadBasesFromCCS(); hdfCcsReader.Initialize(movieFileNames[fofnMovieIndex], fileAccPropList); } hdfBasReader.Initialize(movieFileNames[fofnMovieIndex], fileAccPropList); BaseFile baseFile; PulseFile pulseFile; // // Deprecate reading the entire bas.h5 file. Reads are scanned // one by one or by metric, instead of caching all. // It is still necessary to read in some of the datasets entirely, // in particular the start positions and hole numbers. // hdfBasReader.ReadBaseFileInit(baseFile); set moviePartHoleNumbers; copy(baseFile.holeNumbers.begin(), baseFile.holeNumbers.end(), inserter(moviePartHoleNumbers, moviePartHoleNumbers.begin())); if (usePulseFile) { hdfPlsReader.Initialize(movieFileNames[fofnMovieIndex], fileAccPropList); hdfPlsReader.IncludeField("NumEvent"); hdfPlsReader.IncludeField("StartFrame"); // // Deprecate reading the entire pls.h5 file. // Reads are scanned by read or by metric instead of caching all. // It is still necessary to read in some of the datasets entirely, // in particular the start positions and hole numbers. // hdfPlsReader.ReadPulseFileInit(pulseFile); } string cmpFileMovieName; for (m = 0; m < static_cast(cmpFile.movieInfo.name.size()); m++) { // // First find the file name for the movie 'm' // cmpFileMovieName = cmpFile.movieInfo.name[m]; if (baseFile.GetMovieName() == cmpFileMovieName) { break; } } // // If the movie specified in the input.fofn is not found in the // cmp file, that indicates something bad is happeing. Either the // input.fofn was not used to generate the cmp.h5 file, or no // alignments were found between the input bas.h5 and the // reference. That shouldn't happen. 
// if (m == static_cast(cmpFile.movieInfo.name.size())) { cout << "WARNING: Could not find any alignments for file " << movieFileNames[fofnMovieIndex] << endl; continue; } // // Open the movie and load its pulses into memory. // movieIndex = cmpFile.movieInfo.id[m]; UInt movieAlignmentIndex; // // Since usePulseFile is set when the input file is a pulseFile, // and ReadType::CCS becomes the read type when the alignments are // ccs, when pulse files are specified for de novo ccs alignments, // they will be opened as pulse files. Since the de novo ccs // sequences do not have pulse file information, the auto-reading // of pulse files needs to be disabled. Do that here. // if (cmpFile.readType == ReadType::CCS or useCcsOnly) { usePulseFile = false; } // Check whether all metrics are computable or not. CanMetricsBeComputed(metricOptions, hdfBasReader, hdfPlsReader, useBaseFile, usePulseFile, failOnMissingData, movieFileNames[fofnMovieIndex]); // Get all metrics that are (1) supported, (2) required and (3) can be loaded. vector metricsToLoad = GetMetricsToLoad(metricOptions); // // An index set is a set of indices into the alignment array that // are of reads generated by this movie. Load pulses for all // alignments generated for this movie. // // Movie index sets should be sorted by alignment index. Build a lookup table for this. // std::vector > toFrom; UInt totalAlnLength = 0; for (movieAlignmentIndex = 0; movieAlignmentIndex < movieIndexSets[movieIndex].size(); movieAlignmentIndex++) { alignmentIndex = movieIndexSets[movieIndex][movieAlignmentIndex]; totalAlnLength += cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetEnd() - \ cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetBegin(); toFrom.push_back(std::pair(cmpFile.alnInfo.alignments[alignmentIndex].GetAlignmentId(), movieAlignmentIndex)); } // orders by first by default. 
std::sort(toFrom.begin(), toFrom.end()); // // Check metric dataset size in this movie and the required memory // consumption, if either limit is exceeded, switch to byread. // if (byMetricForThisMovie) { UInt requiredMem = ComputeRequiredMemory(metricsToLoad, hdfBasReader, hdfPlsReader, useBaseFile, usePulseFile, cmpReader, totalAlnLength); if (hdfBasReader.baseArray.arrayLength > static_cast(hdfBasReader.maxAllocNElements) or (usePulseFile and hdfPlsReader.GetStartFrameSize() > hdfPlsReader.maxAllocNElements) or ((float)requiredMem / 1024 / 1024) > maxMemory) { cout << "Either the number of elements exceeds maxElement (" << hdfPlsReader.maxAllocNElements << "). Or the estimated memory " << endl << "consumption exceeds maxMemory (" << maxMemory << " GB)." << endl << "Loading pulses from " << movieFileNames[fofnMovieIndex] << " by read." << endl; byMetricForThisMovie = false; } } if (((metricOptions.find("StartFrameBase") != metricOptions.end() and metricOptions["StartFrameBase"]) or (metricOptions.find("StartFramePulse")!= metricOptions.end() and metricOptions["StartFramePulse"])) and !byMetricForThisMovie) { // Sneaky metrics StartFrameBase and StartFramePulse can used // with -bymetric only cout << "ERROR: Internal metrics StartFrameBase and StartFramePulse " << "can only be loaded with -bymetric." << endl; exit(1); } // Load "WhenStarted" before processing the others. if (metricOptions["WhenStarted"]) { WriteMetricWhenStarted(cmpReader, hdfPlsReader, movieFileNames[fofnMovieIndex]); } // Now load frame rate. // if (AnyFieldRequiresFrameRate(datasetFields)) { // Load frame rate anyway to ensure that cmp.h5 files are consistent. if (useBaseFile) { cmpReader.movieInfoGroup.StoreFrameRate(m, baseFile.GetFrameRate()); } else if (usePulseFile) { cmpReader.movieInfoGroup.StoreFrameRate(m, pulseFile.GetFrameRate()); } // // Load metrics for alignments from movie 'movieIndex'. 
// cout << "loading " << movieIndexSets[movieIndex].size() << " alignments for movie " << movieIndex << endl; if (byMetricForThisMovie) { // // Build lookup tables for all alignments which // are generated by the movie and check whether // pls/bas.h5 and cmp.h5 match. // vector lookupTables; BuildLookupTablesAndMakeSane(cmpFile, baseFile, pulseFile, cmpReader, hdfBasReader, hdfPlsReader, hdfCcsReader, useBaseFile, usePulseFile, useCcsOnly, movieIndexSets[movieIndex], toFrom, moviePartHoleNumbers, lookupTables); // // Group lookup tables by refGroupIndex and readGroupIndex. // vector > groupedLookupTablesIndexPairs; GroupLookupTables(lookupTables, groupedLookupTablesIndexPairs); if (cmpFile.readType == ReadType::CCS or useCcsOnly) { vector numPassesMetric; numPassesMetric.resize(lookupTables.size()); UInt index = 0; for (index = 0; index < lookupTables.size(); index++) { if (lookupTables[index].skip) { continue; } numPassesMetric[index] = hdfCcsReader.GetNumPasses(lookupTables[index].readIndex); } if (!cmpReader.alnInfoGroup.numPasses.IsInitialized()) { cmpReader.alnInfoGroup.InitializeNumPasses(); // Clear /AlnInfo/NumPasses dataset. cmpReader.alnInfoGroup.numPasses.Resize(0); } // Append numPasses of this movie to the end of /AlnInfo/NumPasses. UInt numPassesSize = cmpReader.alnInfoGroup.numPasses.size(); cmpReader.alnInfoGroup.numPasses.WriteToPos( &numPassesMetric[0], numPassesMetric.size(), numPassesSize); } // Keep a list of currently cached fields. vector cachedFields; if (usePulseFile) { // PulseCalls/ZMW/NumEvent is always cached in plsFile. cachedFields.push_back(Field("NumEvent", PlsField)); } for (size_t metricsToLoadIndex = 0; metricsToLoadIndex < metricsToLoad.size(); metricsToLoadIndex++) { string curMetric = metricsToLoad[metricsToLoadIndex]; // Metric "WhenStarted" should have been loaded before getting here. if (curMetric == "WhenStarted") { continue; } // Get the next metric to load. 
string nextMetric = ""; if (metricsToLoadIndex+1 < metricsToLoad.size()) { nextMetric = metricsToLoad[metricsToLoadIndex+1]; } // Cache all required data for computing this metric. CacheRequiredFieldsForMetric(baseFile, pulseFile, hdfBasReader, hdfPlsReader, hdfCcsReader, useBaseFile, usePulseFile, useCcsOnly, cachedFields, curMetric); // Compute the metric and write it to cmp.h5. WriteMetric(cmpFile, baseFile, pulseFile, cmpReader, hdfBasReader, hdfPlsReader, hdfCcsReader, useBaseFile, usePulseFile, useCcsOnly, lookupTables, groupedLookupTablesIndexPairs, curMetric); // Clear cached fields unless they are required by the next metric. ClearCachedFields(baseFile, pulseFile, hdfBasReader, hdfPlsReader, hdfCcsReader, useBaseFile, usePulseFile, useCcsOnly, cachedFields, curMetric, nextMetric); } // Clear the default field "NumEvent" if (usePulseFile) { hdfPlsReader.ClearField(pulseFile, "NumEvent"); } } else { // byRead for this movie for (movieAlignmentIndex = 0; movieAlignmentIndex < movieIndexSets[movieIndex].size(); movieAlignmentIndex++) { MovieAlnIndexLookupTable lookupTable; BuildLookupTable(movieAlignmentIndex, cmpFile, baseFile, usePulseFile, pulseFile, cmpReader, movieIndexSets[movieIndex], toFrom, moviePartHoleNumbers, lookupTable); // Skip this alignment if it is not generated by this movie if (lookupTable.skip) { continue; } UInt & alignmentIndex = lookupTable.alignmentIndex; size_t & refGroupIndex = lookupTable.refGroupIndex; size_t & readGroupIndex = lookupTable.readGroupIndex; UInt & holeNumber = lookupTable.holeNumber; size_t & readIndex = lookupTable.readIndex; UInt & queryStart = lookupTable.queryStart; UInt & queryEnd = lookupTable.queryEnd; UInt & offsetBegin = lookupTable.offsetBegin; UInt & offsetEnd = lookupTable.offsetEnd; string alignedSequence = GetAlignedSequenceFromCmpFile(cmpReader, lookupTable); // Create a map of where. 
vector baseToAlignmentMap; CreateSequenceToAlignmentMap(alignedSequence, baseToAlignmentMap); // Condense gaps in the alignment for easy comparison. RemoveGaps(alignedSequence, alignedSequence); // Get source read. unsigned int numPasses; SMRTSequence sourceRead; GetSourceRead(cmpFile, baseFile , pulseFile , hdfBasReader, hdfPlsReader, hdfCcsReader, useBaseFile , usePulseFile, useCcsOnly , //byRead , lookupTable , alignedSequence, sourceRead , numPasses); string readSequence; readSequence.resize(queryEnd - queryStart); copy((char*) (sourceRead.seq + queryStart), (char*) (sourceRead.seq + queryEnd), readSequence.begin()); if (alignedSequence.size() != readSequence.size() or alignedSequence != readSequence) { cout << "ERROR, the query sequence does not match the aligned query sequence." << endl; cout << "HoleNumber: "<< holeNumber << ", MovieName: " << cmpFileMovieName; cout << ", ReadIndex: " << (int) readIndex; cout << ", qStart: "<< queryStart << ", qEnd: " << queryEnd << endl; cout << "Aligned sequence: "<< endl; cout << alignedSequence << endl; cout << "Original sequence: " << endl; cout << readSequence << endl; assert(0); } // // Compute any necessary data fields. These usually involve // using differences of pulse indices, pulse widths, etc.. // Missing fields are stored as 0's. 
// vector readPulseMetric; vector floatMetric; vector qvMetric; vector frameRateMetric; vector timeMetric; UInt ungappedAlignedSequenceLength = alignedSequence.size(); assert(ungappedAlignedSequenceLength == queryEnd - queryStart); UInt alignedSequenceLength = offsetEnd - offsetBegin; readPulseMetric.resize(alignedSequenceLength+1); qvMetric.resize(alignedSequenceLength+1); frameRateMetric.resize(alignedSequenceLength+1); timeMetric.resize(alignedSequenceLength+1); UInt i; HDFCmpExperimentGroup* expGroup = cmpReader.refAlignGroups[refGroupIndex]->readGroups[readGroupIndex]; UInt alnArrayLength = expGroup->alignmentArray.size(); if (cmpFile.readType == ReadType::CCS or useCcsOnly) { if (!cmpReader.alnInfoGroup.numPasses.IsInitialized()) { cmpReader.alnInfoGroup.InitializeNumPasses(); } cmpReader.alnInfoGroup.numPasses.WriteToPos(&numPasses, 1, alignmentIndex); } if (metricOptions["StartTimeOffset"] == true) { if (!expGroup->startTimeOffset.IsInitialized()) { expGroup->startTimeOffset.Initialize(expGroup->experimentGroup, "StartTimeOffset"); } unsigned int readStartTimeOffset = sourceRead.startFrame[queryStart]; expGroup->startTimeOffset.WriteToPos(&readStartTimeOffset, 1, alignmentIndex); } if (metricOptions["QualityValue"] == true) { if (!expGroup->qualityValue.IsInitialized()) { expGroup->qualityValue.Initialize(expGroup->experimentGroup, "QualityValue", true, alnArrayLength); } // Store QualityValue. fill(qvMetric.begin(), qvMetric.end(), missingQualityValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { qvMetric[baseToAlignmentMap[i]] = sourceRead.qual[queryStart + i]; } qvMetric[qvMetric.size()-1] = 0; expGroup->qualityValue.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin); } if (metricOptions["InsertionQV"] == true) { if (!expGroup->insertionQV.IsInitialized()) { expGroup->insertionQV.Initialize(expGroup->experimentGroup, "InsertionQV", true, alnArrayLength); } // Store InsertionQV. 
fill(qvMetric.begin(), qvMetric.end(), missingQualityValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { qvMetric[baseToAlignmentMap[i]] = sourceRead.insertionQV[queryStart+ i]; } qvMetric[qvMetric.size()-1] = 0; expGroup->insertionQV.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin); } if (metricOptions["MergeQV"] == true) { if (!expGroup->mergeQV.IsInitialized()) { expGroup->mergeQV.Initialize(expGroup->experimentGroup, "MergeQV", true, alnArrayLength); } // Store MergeQV. fill(qvMetric.begin(), qvMetric.end(), missingQualityValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { qvMetric[baseToAlignmentMap[i]] = sourceRead.mergeQV[queryStart+ i]; } qvMetric[qvMetric.size()-1] = 0; expGroup->mergeQV.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin); } if (metricOptions["DeletionQV"] == true) { if (!expGroup->deletionQV.IsInitialized()) { expGroup->deletionQV.Initialize(expGroup->experimentGroup, "DeletionQV", true, alnArrayLength); } // Store DeletionQV. fill(qvMetric.begin(), qvMetric.end(), missingQualityValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { qvMetric[baseToAlignmentMap[i]] = sourceRead.deletionQV[queryStart+i]; } qvMetric[qvMetric.size()-1] = 0; expGroup->deletionQV.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin); } if (metricOptions["DeletionTag"] == true) { if (!expGroup->deletionTag.IsInitialized()) { expGroup->deletionTag.Initialize(expGroup->experimentGroup, "DeletionTag", true, alnArrayLength); } vector readDeletionTagMetric; readDeletionTagMetric.resize(readPulseMetric.size()); // Store DeletionTag. 
for (i = 0; i < readDeletionTagMetric.size()-1; i++ ) { readDeletionTagMetric[i] = '-'; } readDeletionTagMetric[i] = '\0'; for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { assert(baseToAlignmentMap[i] < static_cast(readDeletionTagMetric.size())); readDeletionTagMetric[baseToAlignmentMap[i]] = sourceRead.deletionTag[queryStart+i]; } readDeletionTagMetric[readDeletionTagMetric.size()-1] = 0; expGroup->deletionTag.WriteToPos(&readDeletionTagMetric[0], readDeletionTagMetric.size(), offsetBegin); } if (metricOptions["PulseIndex"] == true) { if (!expGroup->pulseIndex.IsInitialized()) { expGroup->pulseIndex.Initialize(expGroup->experimentGroup, "PulseIndex", true, alnArrayLength); } vector readPulseIndexMetric; fill(readPulseIndexMetric.begin(), readPulseIndexMetric.end(), missingPulseIndex); readPulseIndexMetric.resize(readPulseMetric.size()); // Store Pulse Index. assert(readPulseIndexMetric.size() > 0); for (i = 0; i < readPulseIndexMetric.size(); i++ ) { readPulseIndexMetric[i] = 0; } for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { readPulseIndexMetric[baseToAlignmentMap[i]] = sourceRead.pulseIndex[queryStart+i]; } readPulseIndexMetric[readPulseIndexMetric.size()-1] = 0; expGroup->pulseIndex.WriteToPos(&readPulseIndexMetric[0], readPulseIndexMetric.size(), offsetBegin); } if (metricOptions["SubstitutionTag"] == true) { if (!expGroup->substitutionTag.IsInitialized()) { expGroup->substitutionTag.Initialize(expGroup->experimentGroup, "SubstitutionTag", true, alnArrayLength); } vector readSubstitutionTagMetric; readSubstitutionTagMetric.resize(readPulseMetric.size()); // Store substitutionTag for (i = 0; i < readSubstitutionTagMetric.size()-1; i++ ) { readSubstitutionTagMetric[i] = '-'; } readSubstitutionTagMetric[i] = '\0'; for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { readSubstitutionTagMetric[baseToAlignmentMap[i]] = sourceRead.substitutionTag[queryStart+i]; } readSubstitutionTagMetric[readSubstitutionTagMetric.size()-1] = 0; 
expGroup->substitutionTag.WriteToPos(&readSubstitutionTagMetric[0], readSubstitutionTagMetric.size(), offsetBegin); } if (metricOptions["SubstitutionQV"] == true) { if (!expGroup->substitutionQV.IsInitialized()) { expGroup->substitutionQV.Initialize(expGroup->experimentGroup, "SubstitutionQV", true, alnArrayLength); } // Store start time normalized to frame rate. fill(qvMetric.begin(), qvMetric.end(), missingQualityValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { qvMetric[baseToAlignmentMap[i]] = sourceRead.substitutionQV[queryStart+i]; } qvMetric[qvMetric.size()-1] = 0; expGroup->substitutionQV.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin); } if (metricOptions["ClassifierQV"] == true) { if (!expGroup->classifierQV.IsInitialized()) { expGroup->classifierQV.Initialize(expGroup->experimentGroup, "ClassifierQV", true, alnArrayLength); } fill(floatMetric.begin(), floatMetric.end(), NaN); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { floatMetric[baseToAlignmentMap[i]] = sourceRead.classifierQV[i+queryStart]; } qvMetric[qvMetric.size()-1] = 0; expGroup->classifierQV.WriteToPos(&floatMetric[0], floatMetric.size(), offsetBegin); } if (metricOptions["StartFrame"] == true) { if (!expGroup->startTime.IsInitialized()) { expGroup->startTime.Initialize(expGroup->experimentGroup, "StartFrame", true, alnArrayLength); } // StartFrame used to be computed from baseFile.preBaseFrame and // baseFile.basWidthInFrames, whenever possible. But a more accurate // way is to obtain StartFrame directly from pulseFile.StartFrame // when a pulseFile is provided. 
if (usePulseFile) { assert(sourceRead.startFrame); } else if (useBaseFile) { if (sourceRead.startFrame) { Free(sourceRead.startFrame); } sourceRead.startFrame = new unsigned int[sourceRead.length]; copy(sourceRead.preBaseFrames, &sourceRead.preBaseFrames[sourceRead.length], sourceRead.startFrame); for (i = 0; i < sourceRead.length-1; i++) { sourceRead.startFrame[i+1] += sourceRead.widthInFrames[i]; } partial_sum(sourceRead.startFrame, &sourceRead.startFrame[sourceRead.length], sourceRead.startFrame); } fill(timeMetric.begin(), timeMetric.end(), missingPulseIndex); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { timeMetric[baseToAlignmentMap[i]] = sourceRead.startFrame[i+queryStart]; } timeMetric[timeMetric.size()-1] = 0; expGroup->startTime.WriteToPos(&timeMetric[0], timeMetric.size(), offsetBegin); } if (metricOptions["PulseWidth"] == true) { if (!expGroup->pulseWidth.IsInitialized()) { expGroup->pulseWidth.Initialize(expGroup->experimentGroup, "PulseWidth", true, alnArrayLength); } fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue); // For legacy reasons, it's possible the width in frames is // stored in the bas file. If this is the case, use the width // in frames there. Otherwise, use the width in frames stored // in the pls file. 
for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { frameRateMetric[baseToAlignmentMap[i]] = sourceRead.widthInFrames[queryStart + i]; } frameRateMetric[frameRateMetric.size()-1] = 0; expGroup->pulseWidth.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin); } if (metricOptions["PreBaseFrames"] == true) { if (!expGroup->preBaseFrames.IsInitialized()) { expGroup->preBaseFrames.Initialize(expGroup->experimentGroup, "PreBaseFrames", true, alnArrayLength); } fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { frameRateMetric[baseToAlignmentMap[i]] = sourceRead.preBaseFrames[i+queryStart]; } frameRateMetric[frameRateMetric.size()-1] = 0; expGroup->preBaseFrames.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin); } if (metricOptions["WidthInFrames"] == true) { if (!expGroup->widthInFrames.IsInitialized()) { expGroup->widthInFrames.Initialize(expGroup->experimentGroup, "WidthInFrames", true, alnArrayLength); } // Compute width in frames. 
fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { frameRateMetric[baseToAlignmentMap[i]] = sourceRead.widthInFrames[i+queryStart]; } frameRateMetric[frameRateMetric.size()-1] = 0; expGroup->widthInFrames.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin); } if (metricOptions["pkmid"] == true) { if (!expGroup->pkmid.IsInitialized()) { expGroup->pkmid.Initialize(expGroup->experimentGroup, "pkmid", true, alnArrayLength); } for (i = 0; i < readPulseMetric.size(); i++ ) { readPulseMetric[i] = NaN; } for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { readPulseMetric[baseToAlignmentMap[i]] = sourceRead.midSignal[i+queryStart]; } readPulseMetric[readPulseMetric.size()-1] = 0; expGroup->pkmid.WriteToPos(&readPulseMetric[0], readPulseMetric.size(), offsetBegin); } if (metricOptions["IPD"] == true) { if (!expGroup->ipd.IsInitialized()) { expGroup->ipd.Initialize(expGroup->experimentGroup, "IPD", true, alnArrayLength); } fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { // // The IPD is undefined for the first base in a read. 
// if (usePulseFile ) { if (queryStart == 0 and i == 0) { frameRateMetric[baseToAlignmentMap[i]] = 0; } else { frameRateMetric[baseToAlignmentMap[i]] = (sourceRead.startFrame[i+queryStart] - sourceRead.startFrame[i+queryStart-1] - sourceRead.widthInFrames[i+queryStart-1]); } } else if (useBaseFile) { frameRateMetric[baseToAlignmentMap[i]] = sourceRead.preBaseFrames[i + queryStart]; } } frameRateMetric[frameRateMetric.size()-1] = 0; expGroup->ipd.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin); } if (metricOptions["Light"] == true) { if (!expGroup->light.IsInitialized()) { expGroup->light.Initialize(expGroup->experimentGroup, "Light", true, alnArrayLength); } fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { frameRateMetric[baseToAlignmentMap[i]] = sourceRead.meanSignal[i+queryStart]; frameRateMetric[baseToAlignmentMap[i]] = (frameRateMetric[baseToAlignmentMap[i]] * sourceRead.widthInFrames[i+queryStart]); } frameRateMetric[frameRateMetric.size()-1] = 0; expGroup->light.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin); } sourceRead.Free(); Free(sourceRead.meanSignal); Free(sourceRead.maxSignal); Free(sourceRead.midSignal); Free(sourceRead.startFrame); Free(sourceRead.classifierQV); Free(sourceRead.widthInFrames); } } if (useBaseFile) { hdfBasReader.Close(); } if (cmpFile.readType == ReadType::CCS or useCcsOnly) { hdfCcsReader.Close(); } if (usePulseFile) { hdfPlsReader.Close(); } } // Done loading movies. cmpReader.Close(); cerr << "[INFO] " << GetTimestamp() << " [" << program << "] ended." 
<< endl; } blasr-smrtanalysis-4.0.0/utils/PulseToFasta.cpp000066400000000000000000000304131302464523700215720ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include using namespace std; char VERSION[] = "v1.0.0"; char PERFORCE_VERSION_STRING[] = "$Change: 126414 $"; int main(int argc, char* argv[]) { string program = "pls2fasta"; string versionString = VERSION; AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString); string plsFileName, fastaOutName; vector plsFileNames; bool trimByRegion, maskByRegion; trimByRegion = false; maskByRegion = false; RegionTable regionTable; string regionsFOFNName = ""; vector regionFileNames; bool splitSubreads = true; int minSubreadLength = 0; bool addSimulatedData = false; bool printFastq = false; bool printCcs = false; int lineLength = 50; int minReadScore = 0; vector holeNumbers; CommandLineParser clp; bool printOnlyBest = false; clp.SetProgramName(program); clp.SetVersion(versionString); clp.RegisterStringOption("in.bax.h5", &plsFileName, "Input plx.h5/bax.h5/fofn file.", true); clp.RegisterStringOption("out.fasta", &fastaOutName, "Output fasta/fastq file.", true); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterFlagOption("trimByRegion", &trimByRegion, "Trim away low quality regions."); clp.RegisterFlagOption("maskByRegion", &maskByRegion, "Mask low quality regions with 'N'."); clp.RegisterStringOption("regionTable", ®ionsFOFNName, "Optional HDF file with a /PulseData/Regions dataset."); clp.RegisterIntOption("minSubreadLength", &minSubreadLength, "Do not write subreads less than the specified length.", CommandLineParser::PositiveInteger); clp.RegisterFlagOption("noSplitSubreads", &splitSubreads, "Do not split reads on adapter sequences."); clp.RegisterIntListOption("holeNumber", &holeNumbers, "Only print this hole number (or list of numbers)."); clp.RegisterFlagOption("fastq", &printFastq, "Print in FASTQ format 
with quality."); clp.RegisterFlagOption("ccs", &printCcs, "Print de novo CCS sequences"); clp.RegisterIntOption("lineLength", &lineLength, "Specify fasta/fastq line length", CommandLineParser::PositiveInteger); clp.RegisterIntOption("minReadScore", &minReadScore, "Minimum read score to print a read. The score is " "a number between 0 and 1000 and represents the expected accuracy percentage * 10. " "A typical value would be between 750 and 800. This does not apply to ccs reads.", CommandLineParser::NonNegativeInteger); clp.RegisterFlagOption("best", &printOnlyBest, "If a CCS sequence exists, print this. Otherwise, print the longest" "subread. This does not support fastq."); string description = ("Converts plx.h5/bax.h5/fofn files to fasta or fastq files. Although fasta files are provided" " with every run, they are not trimmed nor split into subreads. This program takes " "additional annotation information, such as the subread coordinates and high quality regions " "and uses them to create fasta sequences that are substrings of all bases called. Most of the time " "you will want to trim low quality reads, so you should specify -trimByRegion."); clp.SetProgramSummary(description); clp.ParseCommandLine(argc, argv); cerr << "[INFO] " << GetTimestamp() << " [" << program << "] started." << endl; if (trimByRegion and maskByRegion) { cout << "ERROR! You cannot both trim and mask regions. Use one or the other." << endl; exit(1); } if (printFastq) { // Setting lineLength to 0 flags to print on one line. 
lineLength = 0; } FileOfFileNames::StoreFileOrFileList(plsFileName, plsFileNames); if (regionsFOFNName == "") { regionFileNames = plsFileNames; } else { FileOfFileNames::StoreFileOrFileList(regionsFOFNName, regionFileNames); } ofstream fastaOut; CrucialOpen(fastaOutName, fastaOut); HDFRegionTableReader hdfRegionReader; sort(holeNumbers.begin(), holeNumbers.end()); vector pls2rgn = MapPls2Rgn(plsFileNames, regionFileNames); for (size_t plsFileIndex = 0; plsFileIndex < plsFileNames.size(); plsFileIndex++) { if (trimByRegion or maskByRegion or splitSubreads) { hdfRegionReader.Initialize(regionFileNames[pls2rgn[plsFileIndex]]); hdfRegionReader.ReadTable(regionTable); } ReaderAgglomerate reader; HDFBasReader ccsReader; if (printOnlyBest) { ccsReader.SetReadBasesFromCCS(); ccsReader.Initialize(plsFileNames[plsFileIndex]); } if (printCcs == false) { reader.IgnoreCCS(); } else { reader.hdfBasReader.SetReadBasesFromCCS(); } if (addSimulatedData) { reader.hdfBasReader.IncludeField("SimulatedCoordinate"); reader.hdfBasReader.IncludeField("SimulatedSequenceIndex"); } if (reader.SetReadFileName(plsFileNames[plsFileIndex]) == 0) { cout << "ERROR, could not determine file type." 
<< plsFileNames[plsFileIndex] << endl; exit(1); } if (reader.Initialize() == 0) { cout << "ERROR, could not initialize file " << plsFileNames[plsFileIndex] << endl; exit(1); } DNALength simulatedCoordinate; DNALength simulatedSequenceIndex; reader.SkipReadQuality(); SMRTSequence seq; vector subreadIntervals;; SMRTSequence ccsSeq; while (reader.GetNextBases(seq, printFastq)) { if (printOnlyBest) { ccsReader.GetNext(ccsSeq); } if (holeNumbers.size() != 0 and binary_search(holeNumbers.begin(), holeNumbers.end(), seq.zmwData.holeNumber) == false) { continue; } if (seq.length == 0) { continue; } if (addSimulatedData) { reader.hdfBasReader.simulatedCoordinateArray.Read(reader.hdfBasReader.curRead-1, reader.hdfBasReader.curRead, &simulatedCoordinate); reader.hdfBasReader.simulatedSequenceIndexArray.Read(reader.hdfBasReader.curRead-1, reader.hdfBasReader.curRead, &simulatedSequenceIndex); } if (printCcs == true) { if (printFastq == false) { seq.PrintSeq(fastaOut); } else { seq.PrintFastq(fastaOut, lineLength); } continue; } // // Determine the high quality boundaries of the read. This is // the full read is no hq regions exist, or it is stated to // ignore regions. // DNALength hqReadStart, hqReadEnd; int hqRegionScore; if (GetReadTrimCoordinates(seq, seq.zmwData, regionTable, hqReadStart, hqReadEnd, hqRegionScore) == false or (trimByRegion == false and maskByRegion == false)) { hqReadStart = 0; hqReadEnd = seq.length; } // // Mask off the low quality portions of the reads. // if (maskByRegion) { if (hqReadStart > 0) { fill(&seq.seq[0], &seq.seq[hqReadStart], 'N'); } if (hqReadEnd != seq.length) { fill(&seq.seq[hqReadEnd], &seq.seq[seq.length], 'N'); } } // // Now possibly print the full read with masking. This could be handled by making a // if (splitSubreads == false) { ReadInterval wholeRead(0, seq.length); // The set of subread intervals is just the entire read. 
subreadIntervals.clear(); subreadIntervals.push_back(wholeRead); } else { // // Print subread coordinates no matter whether or not reads have subreads. // if (regionTable.HasHoleNumber(seq.HoleNumber())) { subreadIntervals = regionTable[seq.HoleNumber()].SubreadIntervals(seq.length, false, true); } else { subreadIntervals = {}; } } // // Output all subreads as separate sequences. // SMRTSequence bestSubreadSequence; int bestSubreadScore = -1; int bestSubreadIndex = 0; SMRTSequence bestSubread; for (size_t intvIndex = 0; intvIndex < subreadIntervals.size(); intvIndex++) { SMRTSequence subreadSequence, subreadSequenceRC; subreadSequence.SubreadStart(subreadIntervals[intvIndex].start); subreadSequence.SubreadEnd (subreadIntervals[intvIndex].end); // // When trimming by region, only output the parts of the // subread that overlap the hq region. // if (trimByRegion == true) { subreadSequence.SubreadStart(max((DNALength) subreadIntervals[intvIndex].start, hqReadStart)); subreadSequence.SubreadEnd ( min((DNALength) subreadIntervals[intvIndex].end, hqReadEnd)); } if (subreadSequence.SubreadStart() >= subreadSequence.SubreadEnd() or subreadSequence.SubreadEnd() - subreadSequence.SubreadStart() <= DNALength(minSubreadLength)) { // // There is no high qualty portion of this subread. Skip it. // continue; } if (hqRegionScore < minReadScore) { continue; } // // Print the subread, adding the coordinates as part of the title. // subreadSequence.ReferenceSubstring(seq, subreadSequence.SubreadStart(), subreadSequence.SubreadLength()); stringstream titleStream; titleStream << seq.title; if (splitSubreads) { // // Add the subread coordinates if splitting on subread. // titleStream << "/" << subreadSequence.SubreadStart() << "_" << subreadSequence.SubreadEnd(); } // // If running on simulated data, add where the values were simulated from. 
// if (addSimulatedData) { titleStream << ((FASTASequence*)&seq)->title << "/chrIndex_" << simulatedSequenceIndex << "/position_"<< simulatedCoordinate; ((FASTASequence*)&seq)->CopyTitle(titleStream.str()); } subreadSequence.CopyTitle(titleStream.str()); // // Eventually replace with WriterAgglomerate. // if (printOnlyBest == false) { if (subreadSequence.length > 0) { if (printFastq == false) { ((FASTASequence*)&subreadSequence)->PrintSeq(fastaOut); } else { subreadSequence.PrintFastq(fastaOut, lineLength); } } } else { int subreadWeightedScore = subreadSequence.length * hqRegionScore; if (subreadWeightedScore > bestSubreadScore) { bestSubreadIndex = intvIndex; (void)(bestSubreadIndex); bestSubread = subreadSequence; bestSubreadScore = subreadWeightedScore; } } } if (printOnlyBest) { if (ccsSeq.length > 0) { if (printFastq == false) { ccsSeq.PrintSeq(fastaOut); } else { ccsSeq.PrintFastq(fastaOut, ccsSeq.length); } } else { if (bestSubreadScore >= 0) { if (printFastq == false) { bestSubread.PrintSeq(fastaOut); } else { bestSubread.PrintFastq(fastaOut, bestSubread.length); } bestSubread.Free(); } } ccsSeq.Free(); } seq.Free(); } reader.Close(); hdfRegionReader.Close(); } cerr << "[INFO] " << GetTimestamp() << " [" << program << "] ended." << endl; } blasr-smrtanalysis-4.0.0/utils/SAWriter.cpp000066400000000000000000000176061302464523700207310ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include void PrintUsage() { cout << "usage: sawriter saOut fastaIn [fastaIn2 fastaIn3 ...] [-blt p] [-larsson] [-4bit] [-manmy] [-kar]" << endl; cout << " or sawriter fastaIn (writes to fastIn.sa)." << endl; cout << " -blt p Build a lookup table on prefixes of length 'p'. This speeds " << endl << " up lookups considerably (more than the LCP table), but misses matches " << endl << " less than p when searching." << endl; cout << " -4bit Read in (one) fasta file as a compressed sequence file." 
<< endl; cout << " -larsson (default) Uses the method of Larsson and Sadakane to build the array." << endl; cout << " -mamy Uses the method of MAnber and MYers to build the array (slower than larsson, " << endl << " and produces the same result. This is mainly for double checking"< inFiles; int doBLT = 1; int bltPrefixLength = 8; int parsingOptions = 0; SAType saBuildType = larsson; int read4BitCompressed = 0; int diffCoverSize = 0; while (argi < argc) { if (strlen(argv[argi]) > 0 and argv[argi][0] == '-'){ parsingOptions = 1; } if (!parsingOptions) { inFiles.push_back(argv[argi]); } else { if (strcmp(argv[argi], "-blt") == 0) { doBLT = 1; if (argi < argc - 1) { bltPrefixLength = atoi(argv[++argi]); if (bltPrefixLength == 0) { cout << argv[argi] << " is not a valid lookup table length." << endl; exit(1); } } else { cout << "Please specify a lookup table length." << endl; exit(1); } } else if (strcmp(argv[argi], "-mamy") == 0) { saBuildType = manmy; } else if (strcmp(argv[argi], "-larsson") == 0) { saBuildType = larsson; } else if (strcmp(argv[argi], "-mcilroy") == 0) { saBuildType = mcilroy; } else if (strcmp(argv[argi], "-slow") == 0) { saBuildType = slow; } else if (strcmp(argv[argi], "-kark") == 0) { saBuildType = kark; } else if (strcmp(argv[argi], "-mafe") == 0) { saBuildType = mafe; } else if (strcmp(argv[argi], "-welter") == 0) { saBuildType = welter; } else if (strcmp(argv[argi], "-welterweight") == 0) { if (argi < argc-1) { diffCoverSize = atoi(argv[++argi]); } else { cout << "Please specify a difference cover size. Valid values are 7,32,64,111, and 2281. Larger values use less memory but may be slower." << endl; exit(1); } if ( ! (diffCoverSize == 7 or diffCoverSize == 32 or diffCoverSize == 64 or diffCoverSize == 111 or diffCoverSize == 2281) ) { cout << "The difference cover size must be one of 7,32,64,111, or 2281." << endl; cout << "Larger numbers use less space but are more slow." 
<< endl; exit(1); } } else if (strcmp(argv[argi], "-4bit") == 0) { read4BitCompressed = 1; } else if (strcmp(argv[argi], "-h") == 0 or strcmp(argv[argi], "-help") == 0 or strcmp(argv[argi], "--help") == 0) { PrintUsage(); exit(0); } else { PrintUsage(); cout << "ERROR, bad option: " << argv[argi] << endl; exit(1); } } ++argi; } if (inFiles.size() == 0) { // // Special use case: the input file is a fasta file. Write to that file + .sa // inFiles.push_back(saFile); saFile = saFile + ".sa"; } VectorIndex inFileIndex; FASTASequence seq; CompressedSequence compSeq; if (read4BitCompressed == 0) { for (inFileIndex = 0; inFileIndex < inFiles.size(); ++inFileIndex) { FASTAReader reader; reader.Init(inFiles[inFileIndex]); reader.SetSpacePadding(111); if (saBuildType == kark) { // // The Karkkainen sa building method requires a little extra // space at the end of the dna sequence so that counting may // be done mod 3 without adding extra logic for boundaries. // } if (inFileIndex == 0) { reader.ReadAllSequencesIntoOne(seq); reader.Close(); } else { while(reader.ConcatenateNext(seq)) { cout << "added " << seq.title << endl; } } } seq.ToThreeBit(); //seq.ToUpper(); } else { assert(inFiles.size() == 1); cout << "reading compressed sequence." << endl; compSeq.Read(inFiles[0]); seq.seq = compSeq.seq; seq.length = compSeq.length; compSeq.RemoveCompressionCounts(); cout << "done." << endl; } // // For now, do not allow creation of suffix arrays on sequences > 4G. // if (seq.length >= UINT_MAX) { cout << "ERROR, references greater than " << UINT_MAX << " bases are not supported." << endl; cout << "Consider breaking the reference into multiple files, running alignment. " << endl; cout << "against each file, and merging the result." 
<< endl; exit(1); } vector alphabet; SuffixArray > sa; // sa.InitTwoBitDNAAlphabet(alphabet); // sa.InitAsciiCharDNAAlphabet(alphabet); sa.InitThreeBitDNAAlphabet(alphabet); if (saBuildType == manmy) { sa.MMBuildSuffixArray(seq.seq, seq.length, alphabet); } else if (saBuildType == mcilroy) { sa.index = new SAIndex[seq.length+1]; DNALength i; for (i = 0; i < seq.length; i++) { sa.index[i] = seq.seq[i] + 1;} sa.index[seq.length] = 0; ssort(sa.index, NULL); for (i = 1; i < seq.length+1; i++ ){ sa.index[i-1] = sa.index[i];}; sa.length = seq.length; } else if (saBuildType == larsson) { sa.LarssonBuildSuffixArray(seq.seq, seq.length, alphabet); } else if (saBuildType == kark) { sa.index = new SAIndex[seq.length]; seq.ToThreeBit(); DNALength p; for (p = 0; p < seq.length; p++ ){ seq.seq[p]++; } KarkkainenBuildSuffixArray(seq.seq, sa.index, seq.length, 5); sa.length = seq.length; } else if (saBuildType == mafe) { // sa.MaFeBuildSuffixArray(seq.seq, seq.length); } else if (saBuildType == welter) { if (diffCoverSize == 0) { sa.LightweightBuildSuffixArray(seq.seq, seq.length); } else { sa.LightweightBuildSuffixArray(seq.seq, seq.length, diffCoverSize); } } if (doBLT) { sa.BuildLookupTable(seq.seq, seq.length, bltPrefixLength); } sa.Write(saFile); return 0; } blasr-smrtanalysis-4.0.0/utils/SDPMatcher.cpp000066400000000000000000000126101302464523700211510ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Performs sparse dynamic programming (SDP) between pairs of sequences as they * are given in two FASTA files, one called for convenience query, the other * target. k is the size of the k-mer used for the SDP algorithm. 
*/ void PrintUsage() { cout << "usage: sdpMatcher query target k [-indelRate delta] " "[-showalign] [-printsw] [-noRefine] [-indel i] [ -local ] " "[-match m] [-sdpIndel i]" << endl; } int main(int argc, char* argv[]) { if (argc < 4) { PrintUsage(); exit(1); } string queryName, targetName; queryName = argv[1]; targetName = argv[2]; TupleMetrics tm; tm.Initialize(atoi(argv[3])); int argi = 4; float indelRate = 0.25; int indel = 3; int match = 0; int printSW = 0; int printSimilarity = 0; int refineAlignments = 1; int showalign = 0; int fixedTarget = 0; int sdpIndel = indel; int sdpIns = 5; int sdpDel = 5; (void)(sdpIndel); (void)(sdpIns); (void)(sdpDel); // not yet used. AlignmentType alignType = Global; while (argi < argc) { if (strcmp(argv[argi], "-indelRate") == 0) { ++argi; indelRate = atof(argv[argi]); } else if (strcmp(argv[argi], "-printsw") == 0) { printSW = 1; } else if (strcmp(argv[argi], "-noRefine") == 0) { refineAlignments = 0; } else if (strcmp(argv[argi], "-indel") == 0) { indel = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-sdpIndel") == 0) { sdpIndel = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-sdpIns") == 0) { sdpIns = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-sdpDel") == 0) { sdpDel = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-showalign") == 0) { showalign = 1; } else if (strcmp(argv[argi], "-local") == 0) { alignType = Local; } else if (strcmp(argv[argi], "-match") == 0) { match = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-fixedtarget") == 0) { fixedTarget = 1; } else if (strcmp(argv[argi], "-printSimilarity") == 0) { printSimilarity = 1; } else { PrintUsage(); cout << "Bad option: " << argv[argi] << endl; exit(1); } ++argi; } FASTASequence query, target; FASTAReader queryReader, targetReader; queryReader.Init(queryName); targetReader.Init(targetName); if (match != 0) { int i; for (i = 0; i < 4; i++ ){ LocalAlignLowMutationMatrix[i][i] = match; } } int seqIndex = 0; Alignment alignment; vector scoreMat; 
vector pathMat; DistanceMatrixScoreFunction distScoreFn; distScoreFn.del = indel; distScoreFn.ins = indel; distScoreFn.InitializeScoreMatrix(SMRTDistanceMatrix); if (fixedTarget) { targetReader.GetNext(target); } cout << "qid,tid,qstart,qend,qlen,tstart,tend,tlen,score"; if (printSimilarity) cout << ",pctSimilarity"; cout << endl; while (queryReader.GetNext(query) and (fixedTarget or targetReader.GetNext(target))) { if (query.length == 0 or target.length == 0) continue; alignment.blocks.clear(); int alignScore; alignScore = SDPAlign(query, target, distScoreFn, tm.tupleSize, sdpIndel, sdpIndel, indelRate, alignment, alignType, refineAlignments, false, 0); ComputeAlignmentStats(alignment, query.seq, target.seq, distScoreFn); if (alignScore > 0){ // in rare cases the SDP returns positive. alignScore = 0; // this makes it more like a true local alignment } if (showalign) { StickPrintAlignment(alignment, query, target, cout); } if (printSW) { MatchedAlignment swAlignment; vector scoreMat; vector pathMat; SWAlign(query, target, scoreMat, pathMat, swAlignment, distScoreFn); StickPrintAlignment(swAlignment, query, target, cout); } cout << query.GetName() << "," << target.GetName() << "," << alignment.qPos << "," << alignment.QEnd() << "," << query.length << "," << alignment.tPos << "," << alignment.TEnd() << "," << target.length << "," << alignScore; if (printSimilarity) cout << "," << alignment.pctSimilarity; cout << endl; ++seqIndex; } return 0; } blasr-smrtanalysis-4.0.0/utils/SamFilter.cpp000066400000000000000000000503751302464523700211170ustar00rootroot00000000000000/* * ===================================================================================== * * Filename: SAMFilter.cpp * * Description: Filter SAM Hits according to * filteration criteria * minPctSimilarity, minAccuracy, * minLength, holeNumbers * and multiple-hit policy * random : a random hit * all : all hits * allbest : all hits with the best score * randombest: a random hit selected from all the hits * 
that have the best score * * Version: 1.0 * Created: 03/19/2013 01:19:43 PM * Revision: none * Compiler: gcc * * Author: Yuan Li (yli), yli@pacificbiosciences.com * Company: Pacific Biosciences * * ===================================================================================== */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "../iblasr/RegisterFilterOptions.h" //#define USE_GOOGLE_PROFILER #ifdef USE_GOOGLE_PROFILER #include #endif char VERSION[] = "v0.1.0"; char PERFORCE_VERSION_STRING[] = "$Change: 134995 $"; // By default negative score is better. ScoreSign scoreSign = ScoreSign::NEGATIVE; // Compare SAMAlignment objects by qName, score and // target positions. bool byQNameScoreTStart(const SAMAlignment & a, const SAMAlignment & b) { if (a.qName == b.qName) { if (a.score == b.score) return a.pos < b.pos; return Score(a.score, scoreSign).WorseThan(Score(b.score, scoreSign)); } return (a.qName < b.qName); } // Compare SAMAlignment objects by rName and qName bool byRNameQName(const SAMAlignment & a, const SAMAlignment & b) { if (a.rName == b.rName) { return a.qName < b.qName; } return (a.rName < b.rName); } // Get the next group of SAM alignments that have the same qName from // allSAMAlignments[groupBegin ... groupEnd) // Note that allSAMAlignments is already sorted by qName, score and tPos. void GetNextSAMAlignmentGroup(vector & allSAMAlignments, unsigned int groupBegin, unsigned int & groupEnd) { assert(groupBegin < allSAMAlignments.size()); groupEnd = groupBegin + 1; string queryName = allSAMAlignments[groupBegin].qName; while(groupEnd < allSAMAlignments.size()) { if (allSAMAlignments[groupEnd].qName == queryName) groupEnd ++; else break; } } // Get the best SAM alignments whose alignment score are the best. 
// Assume that alignments in allSAMAlignments[groupBegin, groupEnd) // all have the same queryName and are sorted by score and tPos // asscendingly: worst, ...., best void GetBestSAMAlignmentsInGroup(vector & allSAMAlignments, const unsigned int & groupBegin, const unsigned int & groupEnd, unsigned int & bestBegin, unsigned int & bestEnd) { assert(groupEnd <= allSAMAlignments.size() and groupBegin < groupEnd); bestEnd = groupEnd; bestBegin = groupEnd - 1; int groupBestScore = allSAMAlignments[bestBegin].score; string queryName = allSAMAlignments[bestBegin].qName; while (bestBegin >= groupBegin and bestBegin < groupEnd) { assert(allSAMAlignments[bestBegin].qName == queryName); if (allSAMAlignments[bestBegin].score == groupBestScore) bestBegin -= 1; else break; } bestBegin += 1; } // Apply hit policy to a group of SAM alignments and return indices // of the selected alignments. vector ApplyHitPolicy(HitPolicy & hitPolicy, vector & allSAMAlignments, const unsigned int & groupBegin, const unsigned int & groupEnd) { vector hitIndices; if (hitPolicy.IsAll()) { for(unsigned int i = groupBegin; i < groupEnd; i++){ hitIndices.push_back(i); } } else if (hitPolicy.IsRandom()) { hitIndices.push_back(rand()%(groupEnd - groupBegin) + groupBegin); } else { unsigned int bestBegin, bestEnd; GetBestSAMAlignmentsInGroup(allSAMAlignments, groupBegin, groupEnd, bestBegin, bestEnd); if (hitPolicy.IsAllbest()) { for(unsigned int i = bestBegin; i < bestEnd; i++){ hitIndices.push_back(i); } } else if (hitPolicy.IsRandombest()) { hitIndices.push_back(rand()%(bestEnd-bestBegin) + bestBegin); } else if (hitPolicy.IsLeftmost()) { hitIndices.push_back(bestBegin); } else { assert(false); } } return hitIndices; } // Convert references[...].title in reference.fasta to their corresponding // indices in the title table. 
void ConvertTitlesToTitleTableIndices(vector & references, string & titleTableName) { TitleTable tt; tt.Read(titleTableName); for(size_t i = 0; i < references.size(); i++) { string title = references[i].GetTitle(); int idx = -1; if (tt.Lookup(title, idx)) { stringstream ss; ss << idx; references[i].CopyTitle(ss.str()); } else { cout << "ERROR, reference " << title << " does not exist " << " in the title table " << titleTableName << ". The " << "reference fasta and the title table do not match." << endl; exit(1); } } tt.Free(); } // Return true if the alignment can only map to an adapter specified // in the adapter GFF file. // A sample record in adapter GFF file: // ref000001 . adapter 10955 10999 0.00 + . xxxx // ref000001 . adapter 32886 32930 0.00 + . xxxx // Note that the first field (e.g., 'ref000001') is id of sequence // in a reference repository, not sequence name, so we need to // reconstruct the mapping between sequence id and sequence name. bool CheckAdapterOnly(GFFFile & adapterGffFile, //Adapter gff file AlignmentCandidate<> & alignment, // An alignment map & refNameToIndex) { // Map target sequence name to its index in reference repository. if (refNameToIndex.find(alignment.tName) == refNameToIndex.end()) { // This should not happen ... cout << "ERROR, could not find alignment target name " << alignment.tName << " in the reference file." << endl; exit(1); } int refNameIndex = refNameToIndex[alignment.tName]; char buf [16]; sprintf(buf, "ref%06d", refNameIndex + 1); // Reconstruct ref id in the format "ref00000?". string refNameId(buf); int FUZZY_OVERLAP = 20; for(size_t eindex = 0; eindex < adapterGffFile.entries.size(); eindex++) { GFFEntry & entry = adapterGffFile.entries[eindex]; // Convert each GFF record from 1-based inclusive to // 0-based exclusive. 
if (entry.type == "adapter" and (entry.name == alignment.tName or entry.name == refNameId)) { UInt estart = entry.start - 1; UInt eend = entry.end; if (entry.strand == '-') { UInt tmp = estart; estart = alignment.tLength - 1 - eend; eend = alignment.tLength - 1 - tmp; } if (not (eend < alignment.GenomicTBegin() or estart > alignment.GenomicTEnd())) { UInt lengthUnion = max(eend, alignment.GenomicTEnd()) - min(estart, alignment.GenomicTBegin()); if (lengthUnion < eend - estart + FUZZY_OVERLAP) { return true; } } } } return false; } int main(int argc, char* argv[]) { #ifdef USE_GOOGLE_PROFILER char *profileFileName = getenv("CPUPROFILE"); if (profileFileName != NULL) { ProfilerStart(profileFileName); } else { ProfilerStart("google_profile.txt"); } #endif // Register inputs and outputs. string samFileName, refFileName, outFileName; CommandLineParser clp; clp.RegisterStringOption("file.sam", &samFileName, "Input SAM file."); clp.RegisterStringOption("reference.fasta", &refFileName, "Reference used to generate reads."); clp.RegisterStringOption("out.sam", &outFileName, "Output SAM file."); clp.RegisterPreviousFlagsAsHidden(); // Register filter criteria options. int minAlnLength = 50; float minPctSimilarity = 70, minPctAccuracy = 70; string hitPolicyStr = "randombest"; bool useScoreCutoff = false; int scoreCutoff = INF_INT; int scoreSignInt = -1; RegisterFilterOptions(clp, minAlnLength, minPctSimilarity, minPctAccuracy, hitPolicyStr, useScoreCutoff, scoreSignInt, scoreCutoff); int seed = 1; clp.RegisterIntOption("seed", &seed, "(1) Seed for random number generator.\n" "If seed is 0, then use current time as seed.", CommandLineParser::Integer); string holeNumberStr; Ranges holeNumberRanges; clp.RegisterStringOption("-holeNumbers", &holeNumberStr, "A string of comma-delimited hole number ranges to output hits, " "such as '1,2,10-12'. 
" "This requires hit titles to be in SMRT read title format."); bool parseSmrtTitle = false; clp.RegisterFlagOption("smrtTitle", &parseSmrtTitle, "Use this option when filtering alignments generated by " "programs other than blasr, e.g. bwa-sw or gmap. " " Parse read coordinates from the SMRT read title. " "The title is in the format /name/hole/coordinates, where" " coordinates are in the format \\d+_\\d+, and represent " "the interval of the read that was aligned."); /* This experimental option can be useful for metagenomics, in which case * there are hundreds of sequences in the target, of which many titles are * long and may contain white spaces (e.g., ' ', '\t'). * In order to save disc space and avoid the (possibly) none unique mapping * between full and short reference names, one may call blasr with * -titleTable option to represent all target sequences in the output * by their indices in the title table.*/ string titleTableName = ""; clp.RegisterStringOption("titleTable", &titleTableName, "Use this experimental option when filtering alignments generated by " "blasr with -titleTable titleTableName, in which case " "reference titles in SAM are represented by their " "indices (e.g., 0, 1, 2, ...) in the title table."); string adapterGffFileName = ""; clp.RegisterStringOption("filterAdapterOnly", &adapterGffFileName, "Use this option to remove reads which can only map to adapters " "specified in the GFF file."); bool verbose = false; clp.RegisterFlagOption("v", &verbose, "Be verbose."); clp.SetExamples( "Because SAM has optional tags that have different meanings" " in different programs, careful usage is required in order " "to have proper output. 
The \"xs\" tag in bwa-sw is used to " "show the suboptimal score, but in PacBio SAM (blasr) it is " "defined as the start in the query sequence of the alignment.\n" "When \"-smrtTitle\" is specified, the xs tag is ignored, but " "when it is not specified, the coordinates given by the xs and " "xe tags are used to define the interval of a read that is " "aligned. The CIGAR string is relative to this interval."); clp.ParseCommandLine(argc, argv); // Set random number seed. if (seed == 0) { srand(time(NULL)); } else { srand(seed); } scoreSign = (scoreSignInt == -1)?ScoreSign::NEGATIVE:ScoreSign::POSITIVE; Score s(static_cast(scoreCutoff), scoreSign); FilterCriteria filterCriteria(minAlnLength, minPctSimilarity, minPctAccuracy, true, s); filterCriteria.Verbose(verbose); HitPolicy hitPolicy(hitPolicyStr, scoreSign); string errMsg; if (not filterCriteria.MakeSane(errMsg)) { cout << errMsg << endl; exit(1); } // Parse hole number ranges. if (holeNumberStr.size() != 0) { if (not holeNumberRanges.setRanges(holeNumberStr)) { cout << "Could not parse hole number ranges: " << holeNumberStr << "." << endl; exit(1); } } // Open output file. ofstream outFileStrm; if (outFileName != "") { CrucialOpen(outFileName, outFileStrm, std::ios::out); } GFFFile adapterGffFile; if (adapterGffFileName != "") adapterGffFile.ReadAll(adapterGffFileName); SAMReader samReader; FASTAReader fastaReader; // // Initialize samReader and fastaReader. // samReader.Initialize(samFileName); fastaReader.Initialize(refFileName); // // Configure the file log. // string command; CommandLineParser::CommandLineToString(argc, argv, command); string log = "Filter sam hits."; string program = "samFilter"; string versionString = VERSION; AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString); // // Read necessary input. 
// vector references; fastaReader.ReadAllSequences(references); // If the SAM file is generated by blasr with -titleTable, // then references in the SAM are represented by // their corresponding indices in the title table. // In that case, we need to convert reference titles in fasta file // to their corresponding indices in the title table, such that // references in both SAM and fasta files are represented // by title table indices and therefore can match. if (titleTableName != "") { ConvertTitlesToTitleTableIndices(references, titleTableName); } AlignmentSet alignmentSet; vector allHeaders = samReader.ReadHeader(alignmentSet); // Process SAM Header. string commandLineString; clp.CommandLineToString(argc, argv, commandLineString); allHeaders.push_back("@PG\tID:SAMFILTER\tVN:" + versionString + \ "\tCL:" + program + " " + commandLineString); for (size_t i = 0; i < allHeaders.size(); i++) { outFileStrm << allHeaders[i] << endl; } // // The order of references in vector references and // AlignmentSet<, , >alignmentSet.references can be different. // Rearrange alignmentSet.references such that they are ordered in // exactly the same way as vector references. // alignmentSet.RearrangeReferences(references); // Map reference name obtained from SAM file to indices map refNameToIndex; for (size_t i = 0; i < references.size(); i++) { string refName = alignmentSet.references[i].GetSequenceName(); refNameToIndex[refName] = i; } // // Store the alignments. // SAMAlignment samAlignment; size_t alignIndex = 0; // // For 150K, each chip produces about 300M sequences // (not including quality values and etc.). // Let's assume that the sam file and reference data can // fit in the memory. // Need to scale for larger sequal data in the future. 
// vector allSAMAlignments; while (samReader.GetNextAlignment(samAlignment)) { if (samAlignment.rName == "*") { continue; } if (parseSmrtTitle and holeNumberStr.size() != 0) { string movieName; int thisHoleNumber; if (not ParsePBIReadName(samAlignment.qName, movieName, thisHoleNumber)) { cout << "ERROR, could not parse SMRT title: " << samAlignment.qName << "." << endl; exit(1); } if (not holeNumberRanges.contains(UInt(thisHoleNumber))) { if (verbose) cout << thisHoleNumber << " is not in range." << endl; continue; } } if (samAlignment.cigar.find('P') != string::npos) { cout << "WARNING. Could not process SAM record with 'P' in " << "its cigar string." << endl; continue; } vector > convertedAlignments; SAMAlignmentsToCandidates(samAlignment, references, refNameToIndex, convertedAlignments, parseSmrtTitle, false); if (convertedAlignments.size() > 1) { cout << "WARNING. Ignore multiple segments." << endl; continue; } for (int i = 0; i < 1; i++) { AlignmentCandidate<> & alignment = convertedAlignments[i]; //score func does not matter DistanceMatrixScoreFunction distFunc; ComputeAlignmentStats(alignment, alignment.qAlignedSeq.seq, alignment.tAlignedSeq.seq, distFunc); // Check whether this alignment can only map to adapters in // the adapter GFF file. if (adapterGffFileName != "" and CheckAdapterOnly(adapterGffFile, alignment, refNameToIndex)) { if (verbose) cout << alignment.qName << " filter adapter only." << endl; continue; } // Assign score to samAlignment. samAlignment.score = samAlignment.as; if (not filterCriteria.Satisfy(static_cast *>(&alignment))) { continue; } allSAMAlignments.push_back( samAlignment ); alignment.FreeSubsequences(); } ++alignIndex; } // Sort all SAM alignments by qName, score and target position. 
sort(allSAMAlignments.begin(), allSAMAlignments.end(), byQNameScoreTStart); unsigned int groupBegin = 0; unsigned int groupEnd = -1; vector filteredSAMAlignments; while(groupBegin < allSAMAlignments.size()) { // Get the next group of SAM alignments which have the same qName // from allSAMAlignments[groupBegin ... groupEnd) GetNextSAMAlignmentGroup(allSAMAlignments, groupBegin, groupEnd); vector hitIndices = ApplyHitPolicy( hitPolicy, allSAMAlignments, groupBegin, groupEnd); for(unsigned int i = 0; i < hitIndices.size(); i++) { filteredSAMAlignments.push_back(allSAMAlignments[hitIndices[i]]); } groupBegin = groupEnd; } // Sort all SAM alignments by reference name and query name sort(filteredSAMAlignments.begin(), filteredSAMAlignments.end(), byRNameQName); for(unsigned int i = 0; i < filteredSAMAlignments.size(); i++) { filteredSAMAlignments[i].PrintSAMAlignment(outFileStrm); } if (outFileName != "") { outFileStrm.close(); } #ifdef USE_GOOGLE_PROFILER ProfilerStop(); #endif return 0; } blasr-smrtanalysis-4.0.0/utils/SamToCmpH5.cpp000066400000000000000000000201011302464523700210710ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include char VERSION[] = "v1.0.0"; char PERFORCE_VERSION_STRING[] = "$Change: 141782 $"; int main(int argc, char* argv[]) { string program = "samtoh5"; string versionString = VERSION; AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString); string samFileName, cmpFileName, refFileName; bool parseSmrtTitle = false; bool useShortRefName = false; bool copyQVs = false; CommandLineParser clp; string readType = "standard"; int verbosity = 0; clp.SetProgramName(program); clp.SetProgramSummary("Converts in.sam file to out.cmp.h5 file."); clp.SetVersion(versionString); clp.RegisterStringOption("in.sam", &samFileName, "Input SAM file.", true); clp.RegisterStringOption("reference.fasta", &refFileName, "Reference used to generate reads.", true); 
clp.RegisterStringOption("out.cmp.h5", &cmpFileName, "Output cmp.h5 file.", true); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterFlagOption("smrtTitle", &parseSmrtTitle, "Use this option when converting alignments " "generated from reads produced by the " "pls2fasta from bas.h5 files by parsing read " "coordinates from the SMRT read title. The title " "is in the format /name/hole/coordinates, where " "coordinates are in the format \\d+_\\d+, and " "represent the interval of the read that was " "aligned."); clp.RegisterStringOption("readType", &readType, "Set the read type: 'standard', 'strobe', 'CCS', " "or 'cDNA'"); clp.RegisterIntOption("verbosity", &verbosity, "Set desired verbosity.", CommandLineParser::PositiveInteger); clp.RegisterFlagOption("useShortRefName", &useShortRefName, "Use abbreviated reference names obtained " "from file.sam instead of using full names " "from reference.fasta."); clp.RegisterFlagOption("copyQVs", ©QVs, "Copy all QVs available in the SAM file into the " "cmp.h5 file. This includes things like InsertionQV " "and DeletionTag."); string description = ("Because SAM has optional tags that have different " "meanings in different programs, careful usage is required in order to " "have proper output. The \"xs\" tag in bwa-sw is used to show the " "suboptimal score, but in PacBio SAM (blasr) it is defined as the start " "in the query sequence of the alignment.\nWhen \"-smrtTitle\" is " "specified, the xs tag is ignored, but when it is not specified, the " "coordinates given by the xs and xe tags are used to define the interval " "of a read that is aligned. The CIGAR string is relative to this interval."); clp.SetExamples(description); clp.ParseCommandLine(argc, argv); if (readType != "standard" and readType != "strobe" and readType != "cDNA" and readType != "CCS") { cout << "ERROR. Read type '" << readType << "' must be one of either 'standard', 'strobe', 'cDNA' or 'CCS'." 
<< endl; exit(1); } cerr << "[INFO] " << GetTimestamp() << " [" << program << "] started." << endl; SAMReader samReader; FASTAReader fastaReader; HDFCmpFile > cmpFile; // // Initialize input/output files. // samReader.Initialize(samFileName); fastaReader.Initialize(refFileName); cmpFile.Create(cmpFileName); // // Configure the file log. // string command; CommandLineParser::CommandLineToString(argc, argv, command); string log = "Convert sam to cmp.h5"; cmpFile.fileLogGroup.AddEntry(command, log, program, GetTimestamp(), versionString); // // Set the readType // cmpFile.SetReadType(readType); // // Read necessary input. // vector references; fastaReader.ReadAllSequences(references); // // This should probably be handled by the alignmentSetAdapter, but // time constraints... // AlignmentSet alignmentSet; samReader.ReadHeader(alignmentSet); // // The order of references in vector references and // AlignmentSet<, , >alignmentSet.references can be different. // Rearrange alignmentSet.references such that it is ordered in // exactly the same way as vector references. // alignmentSet.RearrangeReferences(references); // // Always recompute the MD5 values even if they exist in the input // sam file. Because MD5 is defined differently in sam and cmp.h5 files. // The SAM convention uppercases and normalizes before computing the MD5. // For cmp.h5, we compute the MD5 on the sequence 'as is'. 
// for(size_t i = 0; i < alignmentSet.references.size(); i++) { MakeMD5((const char*)&references[i].seq[0], (unsigned int)references[i].length, alignmentSet.references[i].md5); } // // Map short names for references obtained from file.sam to full names obtained from reference.fasta // map shortRefNameToFull; map::iterator it; assert(references.size() == alignmentSet.references.size()); if (!useShortRefName) { for (size_t i = 0; i < references.size(); i++) { string shortRefName = alignmentSet.references[i].GetSequenceName(); string fullRefName(references[i].title); if (shortRefNameToFull.find(shortRefName) != shortRefNameToFull.end()) { cout << "ERROR, Found more than one reference " << shortRefName << "in sam header" << endl; exit(1); } shortRefNameToFull[shortRefName] = fullRefName; alignmentSet.references[i].sequenceName = fullRefName; } } // // Start setting up the cmp.h5 file. // AlignmentSetToCmpH5Adapter > > alignmentSetAdapter; alignmentSetAdapter.Initialize(); alignmentSetAdapter.StoreReferenceInfo(alignmentSet.references, cmpFile); // // Store the alignments. // SAMAlignment samAlignment; while (samReader.GetNextAlignment(samAlignment)) { if (samAlignment.rName == "*") { continue; } if (!useShortRefName) { //convert shortRefName to fullRefName it = shortRefNameToFull.find(samAlignment.rName); if (it == shortRefNameToFull.end()) { cout << "ERROR, Could not find " << samAlignment.rName << " in the reference repository." << endl; exit(1); } samAlignment.rName = (*it).second; } vector > convertedAlignments; if (verbosity > 0) { cout << "Storing alignment for " << samAlignment.qName << endl; } SAMAlignmentsToCandidates(samAlignment, // Order of references and alignmentSetAdapter.RefInfoGroup // should be exactly the same. references, alignmentSetAdapter.refNameToRefInfoIndex, convertedAlignments, parseSmrtTitle, false, copyQVs); // -1: moleculeID will be computed dynamically. // o.w., the value will be assigned as moleculeID. 
alignmentSetAdapter.StoreAlignmentCandidateList(convertedAlignments, cmpFile, -1, copyQVs); for (size_t a = 0; a < convertedAlignments.size(); a++) { convertedAlignments[a].FreeSubsequences(); } } cerr << "[INFO] " << GetTimestamp() << " [" << program << "] ended." << endl; return 0; } blasr-smrtanalysis-4.0.0/utils/SamToM4.cpp000066400000000000000000000175761302464523700204630ustar00rootroot00000000000000/* * ===================================================================================== * * Filename: SamToM4.cpp * * Description: Convert a sam file to a blasr m4 file. * * Version: 1.0 * Created: 04/03/2013 01:19:43 PM * Revision: none * Compiler: gcc * * Author: Yuan Li (yli), yli@pacificbiosciences.com * Company: Pacific Biosciences * * ===================================================================================== */ #include #include #include #include #include #include #include #include #include #include #include char VERSION[] = "v0.1.0"; char PERFORCE_VERSION_STRING[] = "$Change: 126414 $"; int main(int argc, char* argv[]) { string program = "samtom4"; string versionString = VERSION; AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString); string samFileName, refFileName, outFileName; bool printHeader = false; bool parseSmrtTitle = false; bool useShortRefName = false; CommandLineParser clp; clp.SetProgramName(program); clp.SetVersion(versionString); clp.SetProgramSummary("Converts a SAM file generated by blasr to M4 format."); clp.RegisterStringOption("in.sam", &samFileName, "Input SAM file, which is produced by blasr."); clp.RegisterStringOption("reference.fasta", &refFileName, "Reference used to generate file.sam."); clp.RegisterStringOption("out.m4", &outFileName, "Output in blasr M4 format."); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterFlagOption("header", &printHeader, "Print M4 header."); clp.RegisterFlagOption("useShortRefName", &useShortRefName, "Use abbreviated reference names obtained " "from file.sam instead of using full 
names " "from reference.fasta."); //clp.SetExamples(program + " file.sam reference.fasta out.m4"); clp.ParseCommandLine(argc, argv); ostream * outFilePtr = &cout; ofstream outFileStrm; if (outFileName != "") { CrucialOpen(outFileName, outFileStrm, std::ios::out); outFilePtr = &outFileStrm; } SAMReader samReader; FASTAReader fastaReader; // // Initialize samReader and fastaReader. // samReader.Initialize(samFileName); fastaReader.Initialize(refFileName); // // Configure the file log. // string command; CommandLineParser::CommandLineToString(argc, argv, command); // // Read necessary input. // vector references; fastaReader.ReadAllSequences(references); AlignmentSet alignmentSet; samReader.ReadHeader(alignmentSet); // // The order of references in vector references and // AlignmentSet<, , >alignmentSet.references can be different. // Rearrange alignmentSet.references such that it is ordered in // exactly the same way as vector references. // alignmentSet.RearrangeReferences(references); // // Map short names for references obtained from file.sam to // full names obtained from reference.fasta // map shortRefNameToFull; map::iterator it; assert(references.size() == alignmentSet.references.size()); if (!useShortRefName) { for (size_t i = 0; i < references.size(); i++) { string shortRefName = alignmentSet.references[i].GetSequenceName(); string fullRefName(references[i].title); if (shortRefNameToFull.find(shortRefName) != shortRefNameToFull.end()) { cout << "ERROR, Found more than one reference " << shortRefName << "in sam header" << endl; exit(1); } shortRefNameToFull[shortRefName] = fullRefName; alignmentSet.references[i].sequenceName = fullRefName; } } // Map reference name obtained from SAM file to indices map refNameToIndex; for (size_t i = 0; i < references.size(); i++) { string refName = alignmentSet.references[i].GetSequenceName(); refNameToIndex[refName] = i; } // // Store the alignments. 
// SAMAlignment samAlignment; size_t alignIndex = 0; // // For 150K, each chip produces about 300M sequences // (not including quality values and etc.). // Let's assume that the sam file and reference data can // fit in the memory. // Need to scale for larger sequal data in the future. // if (printHeader) IntervalOutput::PrintHeader(*outFilePtr); // The socre matrix does not matter because we will use the // aligner's score from SAM file anyway. DistanceMatrixScoreFunction distScoreFn; while (samReader.GetNextAlignment(samAlignment)) { if (samAlignment.rName == "*") { continue; } if (!useShortRefName) { //convert shortRefName to fullRefName it = shortRefNameToFull.find(samAlignment.rName); if (it == shortRefNameToFull.end()) { cout << "ERROR, Could not find " << samAlignment.rName << " in the reference repository." << endl; exit(1); } samAlignment.rName = (*it).second; } // The padding character 'P' is not supported if (samAlignment.cigar.find('P') != string::npos) { cout << "WARNING. Could not process sam record with 'P' in its cigar string." << endl; continue; } vector > convertedAlignments; // // Keep reference as forward. // So if IsReverseComplement(sam.flag)==true, then qStrand is reverse // and tStrand is forward. // bool keepRefAsForward = false; SAMAlignmentsToCandidates(samAlignment, references, refNameToIndex, convertedAlignments, parseSmrtTitle, keepRefAsForward); if (convertedAlignments.size() > 1) { cout << "WARNING. Ignore an alignment which has multiple segments." << endl; continue; } //all alignments are unique single-ended alignments. for (int i = 0; i < 1; i++) { AlignmentCandidate<> & alignment = convertedAlignments[i]; ComputeAlignmentStats(alignment, alignment.qAlignedSeq.seq, alignment.tAlignedSeq.seq, distScoreFn); // Use aligner's score from SAM file anyway. alignment.score = samAlignment.as; alignment.mapQV = samAlignment.mapQV; // Since SAM only has the aligned sequence, many info of the // original query (e.g. 
the full length) is missing. // Overwrite alignment.qLength (which is length of the query // in the SAM alignment) with xq (which is the length of the // original query sequence saved by blasr) right before printing // the output so that one can reconstruct a blasr m4 record from // a blasr sam alignment. if (samAlignment.xq!=0) alignment.qLength = samAlignment.xq; IntervalOutput::PrintFromSAM(alignment, *outFilePtr); alignment.FreeSubsequences(); } ++alignIndex; } if (outFileName != "") { outFileStrm.close(); } return 0; } blasr-smrtanalysis-4.0.0/utils/ToAfg.cpp000066400000000000000000000144211302464523700202210ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include using namespace std; void PrintUsage() { cout << "usage: toAfg input.filetype output.filetype" << endl << " [-minSubreadLength l] " << endl << " [-regionTable regions_file] " << endl << " [-noSplitSubreads]" << endl << " [-useccsdenovo]" << endl << " [-uniformQV QV]" << endl << "Print reads stored in a file (pls|fasta|fastq) as an afg." 
<< endl; } int main(int argc, char* argv[]) { string inputFileName, outputFileName; if (argc < 2) { PrintUsage(); exit(1); } vector inputFileNames; inputFileName = argv[1]; outputFileName = argv[2]; int argi = 3; RegionTable regionTable; string regionsFOFNName = ""; vector regionFileNames; bool splitSubreads = true; bool useCCS = false; bool useUniformQV = false; int uniformQV = 7; int minSubreadLength = 1; while (argi < argc) { if (strcmp(argv[argi], "-regionTable") == 0) { regionsFOFNName = argv[++argi]; } else if (strcmp(argv[argi], "-noSplitSubreads") == 0) { splitSubreads = false; } else if (strcmp(argv[argi], "-minSubreadLength") == 0) { minSubreadLength = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-useccsdenovo") == 0) { useCCS = true; } else if (strcmp(argv[argi], "-uniformQV") == 0) { useUniformQV = true; uniformQV = atoi(argv[++argi]); } else { PrintUsage(); cout << "ERROR! Option " << argv[argi] << " is not supported." << endl; } argi++; } if (FileOfFileNames::IsFOFN(inputFileName)) { FileOfFileNames::FOFNToList(inputFileName, inputFileNames); } else { inputFileNames.push_back(inputFileName); } if (regionsFOFNName == "") { regionFileNames = inputFileNames; } else { if (FileOfFileNames::IsFOFN(regionsFOFNName)) { FileOfFileNames::FOFNToList(regionsFOFNName, regionFileNames); } else { regionFileNames.push_back(regionsFOFNName); } } ofstream fastaOut; CrucialOpen(outputFileName, fastaOut); HDFRegionTableReader hdfRegionReader; AfgBasWriter afgWriter; if (useUniformQV){ afgWriter.SetDefaultQuality(uniformQV); } afgWriter.Initialize(outputFileName); for (size_t plsFileIndex = 0; plsFileIndex < inputFileNames.size(); plsFileIndex++) { if (splitSubreads) { hdfRegionReader.Initialize(regionFileNames[plsFileIndex]); hdfRegionReader.ReadTable(regionTable); } ReaderAgglomerate reader; // reader.SkipReadQuality(); // should have been taken care of by *Filter modules if (useCCS){ reader.UseCCS(); } else { reader.IgnoreCCS(); } 
reader.Initialize(inputFileNames[plsFileIndex]); CCSSequence seq; int seqIndex = 0; vector subreadIntervals; while (reader.GetNext(seq)){ ++seqIndex; if (useUniformQV && seq.qual.data != NULL){ for (DNALength qvIndex = 0; qvIndex < seq.length; qvIndex++){ seq.qual[qvIndex] = uniformQV; } } if (splitSubreads == false) { if (seq.length >= static_cast(minSubreadLength)) { afgWriter.Write(seq); } seq.Free(); continue; } DNALength hqReadStart, hqReadEnd; int score; GetReadTrimCoordinates(seq, seq.zmwData, regionTable, hqReadStart, hqReadEnd, score); if (regionTable.HasHoleNumber(seq.HoleNumber())) { subreadIntervals = regionTable[seq.HoleNumber()].SubreadIntervals(seq.length, true, true); } else { subreadIntervals = {}; } if (seq.length == 0 and subreadIntervals.size() > 0) { cout << "WARNING! A high quality interval region exists for a read of length 0." <(subreadIntervals[intvIndex].start) > hqReadStart ? static_cast(subreadIntervals[intvIndex].start) : hqReadStart; DNALength subreadEnd = static_cast(subreadIntervals[intvIndex].end) < hqReadEnd ? 
static_cast(subreadIntervals[intvIndex].end) : hqReadEnd; DNALength subreadLength = subreadEnd - subreadStart; if (subreadLength < DNALength(minSubreadLength)) continue; subreadSequence.SubreadStart(subreadStart); subreadSequence.SubreadEnd (subreadEnd); subreadSequence.ReferenceSubstring(seq, subreadStart, subreadLength); stringstream titleStream; titleStream << seq.title << "/" << subreadIntervals[intvIndex].start << "_" << subreadIntervals[intvIndex].end; subreadSequence.CopyTitle(titleStream.str()); afgWriter.Write(subreadSequence); } seq.Free(); } reader.Close(); hdfRegionReader.Close(); } } blasr-smrtanalysis-4.0.0/utils/bam2bax/000077500000000000000000000000001302464523700200275ustar00rootroot00000000000000blasr-smrtanalysis-4.0.0/utils/bam2bax/BUILD.txt000066400000000000000000000071111302464523700214270ustar00rootroot00000000000000Build instructions for developers: Assuming that blasr and blaser_libcpp is placed under //depot/software/smrtanalysis/bioinformatics/ext/pi $ cd $ module load boost $ mkdir build; cd build; cmake .. $ make $ ../tests/bin/test_bam2bax # to test bam2bax exe Build instructions for users: If pbbam and htslib are prebuilt and included in blasr/defines.mk, set PacBioBAM_INCLUDE_DIRS, HTSLIB_INCLUDE_DIRS, PacBioBAM_LIBRARIES and HTSLIB_LIBRARIES as below. Otherwise, set PacBioBAM_RootDir instead. 
$ cd $ mkdir build; cd build; $ cmake -DPacBioBAM_INCLUDE_DIRS= \ -DHTSLIB_INCLUDE_DIRS= \ -DPacBioBAM_LIBRARIES= \ -DHTSLIB_LIBRARIES= \ -DPBDATA_INCLUDE_DIRS= -DPBDATA_LIBRARIES= \ -DPBIHDF_INCLUDE_DIRS= -DPBIHDF_LIBRARIES= \ -DBLASR_INCLUDE_DIRS= -DBLASR_LIBRARIES= \ -DHDF5_INCLUDE_DIRS= -DHDF5_CPP_LIBRARIES= \ -DHDF5_LIBRARIES= \ -DBam2Bax_EXE_LINKER_FLAGS="-Wl,--no-as-needed -ldl -pthread -lrt " \ ../ $ make $ ../tests/bin/test_bam2bax # to test bam2bax exe An example: $ cmake -DPacBioBAM_INCLUDE_DIRS=$smrtanalysis/bioinformatics/lib/cpp/pbbam/include \ -DHTSLIB_INCLUDE_DIRS=$smrtanalysis/bioinformatics/lib/cpp/htslib \ -DPacBioBAM_LIBRARIES=$smrtanalysis/bioinformatics/lib/cpp/pbbam/lib/libpbbam.a \ -DHTSLIB_LIBRARIES=$smrtanalysis/bioinformatics/lib/cpp/htslib/libhts.a \ -DPBDATA_INCLUDE_DIRS=$smrtanalysis/bioinformatics/ext/pi/blasr/libcpp/pbdata \ -DPBDATA_LIBRARIES=$smrtanalysis/bioinformatics/ext/pi/blasr/libcpp/pbdata/libpbdata.a \ -DPBIHDF_INCLUDE_DIRS=$smrtanalysis/bioinformatics/ext/pi/blasr/libcpp/hdf \ -DPBIHDF_LIBRARIES=$smrtanalysis/bioinformatics/ext/pi/blasr/libcpp/hdf/libpbihdf.a \ -DBLASR_INCLUDE_DIRS=$smrtanalysis/bioinformatics/ext/pi/blasr/libcpp/alignment/ \ -DBLASR_LIBRARIES=$smrtanalysis/bioinformatics/ext/pi/blasr/libcpp/alignment/libblasr.a \ -DHDF5_INCLUDE_DIRS=$smrtanalysis/prebuilt.out/hdf5/hdf5-1.8.12/ubuntu-1404/include \ -DHDF5_CPP_LIBRARIES=$smrtanalysis/prebuilt.out/hdf5/hdf5-1.8.12/ubuntu-1404/lib/libhdf5_cpp.a \ -DHDF5_LIBRARIES=$smrtanalysis/prebuilt.out/hdf5/hdf5-1.8.12/ubuntu-1404/lib/libhdf5.a \ -DBam2Bax_EXE_LINKER_FLAGS="-Wl,--no-as-needed -ldl -pthread -lrt " \ ../ Alternatively: $ cmake \ -DPacBioBAM_RootDir=$smrtanalsis/bioinformatics/lib/cpp/pbbam \ -DPBDATA_INCLUDE_DIRS=$smrtanalysis/bioinformatics/ext/pi/blasr/libcpp/pbdata \ -DPBDATA_LIBRARIES=$smrtanalysis/bioinformatics/ext/pi/blasr/libcpp/pbdata/libpbdata.a \ -DPBIHDF_INCLUDE_DIRS=$smrtanalysis/bioinformatics/ext/pi/blasr/libcpp/hdf \ 
-DPBIHDF_LIBRARIES=$smrtanalysis/bioinformatics/ext/pi/blasr/libcpp/hdf/libpbihdf.a \ -DBLASR_INCLUDE_DIRS=$smrtanalysis/bioinformatics/ext/pi/blasr/libcpp/alignment/ \ -DBLASR_LIBRARIES=$smrtanalysis/bioinformatics/ext/pi/blasr/libcpp/alignment/libblasr.a \ -DHDF5_INCLUDE_DIRS=$smrtanalysis/prebuilt.out/hdf5/hdf5-1.8.12/ubuntu-1404/include \ -DHDF5_CPP_LIBRARIES=$smrtanalysis/prebuilt.out/hdf5/hdf5-1.8.12/ubuntu-1404/lib/libhdf5_cpp.a \ -DHDF5_LIBRARIES=$smrtanalysis/prebuilt.out/hdf5/hdf5-1.8.12/ubuntu-1404/lib/libhdf5.a \ -DBam2Bax_EXE_LINKER_FLAGS="-Wl,--no-as-needed -ldl -pthread -lrt " \ ../ blasr-smrtanalysis-4.0.0/utils/bam2bax/CMakeLists.txt000066400000000000000000000113211302464523700225650ustar00rootroot00000000000000######################################################################## # CMake build script for Bam2Bax executable. ######################################################################## project(Bam2Bax CXX C) cmake_minimum_required(VERSION 2.8) # project version set(Bam2Bax_MAJOR_VERSION 0) set(Bam2Bax_MINOR_VERSION 0) set(Bam2Bax_PATCH_VERSION 1) set(Bam2Bax_VERSION "${Bam2Bax_MAJOR_VERSION}.${Bam2Bax_MINOR_VERSION}.${Bam2Bax_PATCH_VERSION}" ) # build-time options option(Bam2Bax_build_tests "Build Bam2Bax's unit tests." 
ON) # main project paths set(Bam2Bax_RootDir ${Bam2Bax_SOURCE_DIR}) set(Bam2Bax_DocsDir ${Bam2Bax_RootDir}/docs) set(Bam2Bax_SourceDir ${Bam2Bax_RootDir}/src) set(Bam2Bax_TestsDir ${Bam2Bax_RootDir}/tests) set(Bam2Bax_ThirdPartyDir ${Bam2Bax_RootDir}/third-party) if (NOT Bam2Bax_OutputDir) set(Bam2Bax_OutputDir ${Bam2Bax_RootDir}) endif() set(Bam2Bax_BinDir ${Bam2Bax_OutputDir}/bin) file(MAKE_DIRECTORY ${Bam2Bax_BinDir}) # shared & third-party paths if (NOT PBDATA_ROOT_DIR) set(PBDATA_ROOT_DIR ${Bam2Bax_RootDir}/../../../blasr_libcpp) endif() # find (existing) libraries needed by executable and tests if (NOT BLASR_INCLUDE_DIRS OR NOT BLASR_LIBRARIES) find_library(BLASR_LIBRARIES blasr ${PBDATA_ROOT_DIR}/alignment) set(BLASR_INCLUDE_DIRS ${PBDATA_ROOT_DIR}/alignment) endif() if (NOT PBIHDF_INCLUDE_DIRS OR NOT PBIHDF_LIBRARIES) find_library(PBIHDF_LIBRARIES pbihdf ${PBDATA_ROOT_DIR}/hdf) set(PBIHDF_INCLUDE_DIRS ${PBDATA_ROOT_DIR}/hdf) endif() if (NOT PBDATA_INCLUDE_DIRS OR NOT PBDATA_LIBRARIES) find_library(PBDATA_LIBRARIES pbdata ${PBDATA_ROOT_DIR}/pbdata) set(PBDATA_INCLUDE_DIRS ${PBDATA_ROOT_DIR}/pbdata) endif() if (NOT HDF5_INCLUDE_DIRS OR NOT HDF5_LIBRARIES) if (NOT HDF5_RootDir) set(HDF5_RootDir ${Bam2Bax_RootDir}/../../../../../../prebuilt.out/hdf5/hdf5-1.8.12/ubuntu-1404) endif() set(HDF5_INCLUDE_DIRS ${HDF5_RootDir}/include) set(HDF5_LibDir ${HDF5_RootDir}/lib) find_library(HDF5_LIBRARIES hdf5 ${HDF5_LibDir} NO_CMAKE_SYSTEM_PATH) find_library(HDF5_CPP_LIBRARIES hdf5_cpp ${HDF5_LibDir} NO_CMAKE_SYSTEM_PATH) endif() if (NOT PacBioBAM_INCLUDE_DIRS OR NOT PacBioBAM_LIBRARIES OR NOT HTSLIB_INCLUDE_DIRS OR NOT HTSLIB_LIBRARIES) set(PacBioBAM_LIBRARIES ) set(PacBioBAM_INCLUDE_DIRS ) set(HTSLIB_INCLUDE_DIRS ) set(HTSLIB_LIBRARIES ) if (NOT PacBioBAM_RootDir) message ("Must either set (PacBioBAM_INCLUDE_DIRS, PacBioBAM_LIBRARIES, HTSLIB_INCLUDE_DIRS, and HTSLIB_LIBRARIES) or PacBioBAM_RootDir!") endif() add_subdirectory(${PacBioBAM_RootDir} external/build/pbbam) 
set(PBBAM_LINK_FLAG pbbam) endif() if (NOT Boost_INCLUDE_DIRS) find_package(Boost REQUIRED) endif() if (NOT ZLIB_LIBRARIES OR NOT ZLIB_INCLUDE_DIRS) find_package(ZLIB REQUIRED) endif() # shared CXX flags for src & tests include(CheckCXXCompilerFlag) set(Bam2Bax_CXX_FLAGS "-g -std=c++11 -Wall") # quash warnings from pbdata check_cxx_compiler_flag("-Wno-overloaded-virtual" HAS_NO_OVERLOADED_VIRTUAL) if(HAS_NO_OVERLOADED_VIRTUAL) set(Bam2Bax_CXX_FLAGS "${Bam2Bax_CXX_FLAGS} -Wno-overloaded-virtual") endif() #check_cxx_compiler_flag("-Wno-unused-private-field" HAS_NO_UNUSED_PRIVATE_FIELD) #if(HAS_NO_UNUSED_PRIVATE_FIELD) # set(Bam2Bax_CXX_FLAGS "${Bam2Bax_CXX_FLAGS} -Wno-unused-private-field") #endif() check_cxx_compiler_flag("-Wno-unused-variable" HAS_NO_UNUSED_VARIABLE) if(HAS_NO_UNUSED_VARIABLE) set(Bam2Bax_CXX_FLAGS "${Bam2Bax_CXX_FLAGS} -Wno-unused-variable") endif() check_cxx_compiler_flag("-Wno-uninitialized" HAS_NO_UNINITIALIZED) if(HAS_NO_UNINITIALIZED) set(Bam2Bax_CXX_FLAGS "${Bam2Bax_CXX_FLAGS} -Wno-uninitialized") endif() check_cxx_compiler_flag("-Wunused-but-set-variable" HAS_UNUSED_BUT_SET_VARIABLE) if(HAS_UNUSED_BUT_SET_VARIABLE) set(Bam2Bax_CXX_FLAGS "${Bam2Bax_CXX_FLAGS} -Wunused-but-set-variable") endif() check_cxx_compiler_flag("-Wno-deprecated-declarations" HAS_NO_DEPRECATED_DECLARATIONS) if(HAS_NO_DEPRECATED_DECLARATIONS) set(Bam2Bax_CXX_FLAGS "${Bam2Bax_CXX_FLAGS} -Wno-deprecated-declarations") endif() # NOTE: -Wno-unused-local-typedefs used to quash clang warnings w/ Boost check_cxx_compiler_flag("-Wno-unused-local-typedef" HAS_NO_UNUSED_LOCAL_TYPEDEF) if(HAS_NO_UNUSED_LOCAL_TYPEDEF) set(Bam2Bax_CXX_FLAGS "${Bam2Bax_CXX_FLAGS} -Wno-unused-local-typedef") endif() SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${Bam2Bax_LINKER_FLAGS}" ) # main exe src add_subdirectory(src) # testing if(Bam2Bax_build_tests) enable_testing() if (NOT GTEST_SRC_DIR) set(GTEST_SRC_DIR ../gtest) endif() add_subdirectory(${GTEST_SRC_DIR} external/gtest/build) 
add_subdirectory(tests) endif() blasr-smrtanalysis-4.0.0/utils/bam2bax/README.md000066400000000000000000000002501302464523700213030ustar00rootroot00000000000000#bam2bax# ##Usage## bam2bax movie.subreads.bam movie.scrapes.bam -o movie # movie.bax.h5 will be generated. ##Example## tests/example/end-to-end.sh blasr-smrtanalysis-4.0.0/utils/bam2bax/makefile000066400000000000000000000046671302464523700215440ustar00rootroot00000000000000.PHONY=all SRCDIR:=$(dir $(realpath $(lastword $(MAKEFILE_LIST)))) -include ${CURDIR}/../../defines.mk include ${SRCDIR}/../../rules.mk # If pbbam and htslib are prebuilt and included in defines.mk, # set PacBioBAM_INCLUDE_DIRS, HTSLIB_INCLUDE_DIRS, PacBioBAM_LIBRARIES # and HTSLIB_LIBRARIES as below. Otherwise, set PacBioBAM_RootDir instead. all: ${CURDIR}/src/*.cpp ${CURDIR}/src/*.h ${CURDIR}/tests/src/*.cpp ${CURDIR}/tests/src/*.h @mkdir -p ${CURDIR}/build && \ cd ${CURDIR}/build && \ cmake -DBOOST_ROOT=${BOOST_ROOT} \ -DPacBioBAM_INCLUDE_DIRS=${PBBAM_INC} \ -DHTSLIB_INCLUDE_DIRS=${HTSLIB_INC} \ -DPacBioBAM_LIBRARIES=${PBBAM_LIB}/libpbbam${SH_LIB_EXT} \ -DHTSLIB_LIBRARIES=${HTSLIB_LIB}/libhts${SH_LIB_EXT} \ -DPBDATA_INCLUDE_DIRS=${LIBPBDATA_INC} \ -DPBDATA_LIBRARIES=${LIBPBDATA_LIB}/libpbdata${SH_LIB_EXT} \ -DPBIHDF_INCLUDE_DIRS=${LIBPBIHDF_INC} \ -DPBIHDF_LIBRARIES=${LIBPBIHDF_LIB}/libpbihdf${SH_LIB_EXT} \ -DBLASR_INCLUDE_DIRS=${LIBBLASR_INC}/ \ -DBLASR_LIBRARIES=${LIBBLASR_LIB}/libblasr${SH_LIB_EXT} \ -DHDF5_INCLUDE_DIRS=${HDF5_INC} \ -DHDF5_CPP_LIBRARIES=${HDF5_LIB}/libhdf5_cpp${SH_LIB_EXT} \ -DHDF5_LIBRARIES=${HDF5_LIB}/libhdf5${SH_LIB_EXT} \ -DBam2Bax_EXE_LINKER_FLAGS="-Wl,--no-as-needed -ldl -pthread -lrt " \ ../ && \ make # If pbbam is not prebuilt, just set PacBioBAM_RootDir #all: ${CURDIR}/src/*.cpp ${CURDIR}/src/*.h ${CURDIR}/tests/src/*.cpp ${CURDIR}/tests/src/*.h # @mkdir -p ${CURDIR}/build && \ # cd ${CURDIR}/build && \ # cmake -DBOOST_ROOT=${BOOST_ROOT} \ # 
-DPacBioBAM_RootDir=/home/UNIXHOME/yli/git/depot/software/smrtanalysis/bioinformatics/lib/cpp/pbbam \ # -DPBDATA_INCLUDE_DIRS=${LIBPBDATA_INC} \ # -DPBDATA_LIBRARIES=${LIBPBDATA_LIB}/libpbdata${SH_LIB_EXT} \ # -DPBIHDF_INCLUDE_DIRS=${LIBPBIHDF_INC} \ # -DPBIHDF_LIBRARIES=${LIBPBIHDF_LIB}/libpbihdf${SH_LIB_EXT} \ # -DBLASR_INCLUDE_DIRS=${LIBBLASR_INC}/ \ # -DBLASR_LIBRARIES=${LIBBLASR_LIB}/libblasr${SH_LIB_EXT} \ # -DHDF5_INCLUDE_DIRS=${HDF5_INC} \ # -DHDF5_CPP_LIBRARIES=${HDF5_LIB}/libhdf5_cpp${SH_LIB_EXT} \ # -DHDF5_LIBRARIES=${HDF5_LIB}/libhdf5${SH_LIB_EXT} \ # -DBam2Bax_EXE_LINKER_FLAGS="-Wl,--no-as-needed -ldl -pthread -lrt " \ # ../ && \ # make clean: @rm -rf ${CURDIR}/bin/ @rm -rf ${CURDIR}/build blasr-smrtanalysis-4.0.0/utils/bam2bax/src/000077500000000000000000000000001302464523700206165ustar00rootroot00000000000000blasr-smrtanalysis-4.0.0/utils/bam2bax/src/Bam2Bax.cpp000066400000000000000000000010241302464523700225330ustar00rootroot00000000000000// Author: Yuan Li #include // getcwd #include #include #include "Bam2Bax.h" #include "Converter.h" using namespace std; int Bam2Bax::Run(Settings& settings) { bool success = false; Converter converter(settings); if (converter.Run()) { success = true; } // return success/fail if (success) return EXIT_SUCCESS; else { for (const string& e : converter.Errors()) cerr << "ERROR: " << e << endl; return EXIT_FAILURE; } } blasr-smrtanalysis-4.0.0/utils/bam2bax/src/Bam2Bax.h000066400000000000000000000002411302464523700222000ustar00rootroot00000000000000// Author: Yuan Li #ifndef BAM2BAX_H #define BAM2BAX_H class Settings; class Bam2Bax { public: static int Run(Settings& settings); }; #endif // BAM2BAX_H blasr-smrtanalysis-4.0.0/utils/bam2bax/src/Bam2BaxConverter.h000066400000000000000000000015311302464523700240730ustar00rootroot00000000000000// Author: Yuan Li #ifndef _BAM2BAXCONVERTER_H_ #define _BAM2BAXCONVERTER_H_ #include #include #include #include #include #include #include #include #include #include #include #include 
"HDFFile.hpp" #include "RegionsAdapter.h" #include "IConverter.h" template class Bam2BaxConverter : public IConverter { public: Bam2BaxConverter(Settings & settings) :IConverter(settings) {} ~Bam2BaxConverter(void) {} bool Run(void) {return ConvertFile();} protected: bool ConvertFile(void); }; #include "Bam2BaxConverterImpl.hpp" #endif blasr-smrtanalysis-4.0.0/utils/bam2bax/src/Bam2BaxConverterImpl.hpp000066400000000000000000000040741302464523700252620ustar00rootroot00000000000000// Author: Yuan Li #ifndef BAM2BAX_CONVERTER_IMPL_HPP #define BAM2BAX_CONVERTER_IMPL_HPP #include #include "MetadataWriter.h" #include "Bam2BaxInternal.h" #include #include template bool Bam2BaxConverter::ConvertFile(void) { // Write metadata.xml to parent directory of Bax.h5. if (not settings_.outputMetadataFilename.empty()) MetadataWriter metaWriter_(settings_.outputMetadataFilename, rg, settings_.outputAnalysisDirname); T_HDFWRITER writer(outfn, rg.BasecallerVersion(), scandata.BaseMap(), qvs, Bam2BaxDefaults::Bax_Regions_RegionTypes); if (settings_.traceFilename.empty()) { writer.WriteScanData(scandata); } else { HDFFile traceFile; traceFile.Open(settings_.traceFilename, H5F_ACC_RDONLY); writer.CopyObject(traceFile, "/ScanData"); traceFile.Close(); } if (not settings_.subreadsBamFilename.empty() and not settings_.scrapsBamFilename.empty()) { // Stich subreads and scraps in order to reconstruct polymerase reads. PacBio::BAM::VirtualPolymeraseReader reader(settings_.subreadsBamFilename, settings_.scrapsBamFilename); while(reader.HasNext()) { // FIXME: pbbam should not crash when reading internal pulse features. 
const PacBio::BAM::VirtualPolymeraseBamRecord & record = reader.Next(); SMRTSequence smrt; smrt.Copy(record, true); std::vector ras = RegionsAdapter::ToRegionAnnotations(record, regionTypes); if (not writer.WriteOneZmw(smrt, ras) or not writer.Errors().empty()) { break; } writer.Flush(); } if (not settings_.ignoreQV) writer.WriteFakeDataSets(); for (auto error: writer.Errors()) { AddErrorMessage(error); } } return errors_.empty(); } #endif blasr-smrtanalysis-4.0.0/utils/bam2bax/src/Bam2BaxInternal.h000066400000000000000000000101131302464523700236740ustar00rootroot00000000000000// Author: Yuan Li #ifndef _BAM2BAXINTERNAL_H_ #define _BAM2BAXINTERNAL_H_ #include //namespace internal namespace internal { /// \name \{ static const std::vector QVEnums = { PacBio::BAM::BaseFeature::DELETION_QV , PacBio::BAM::BaseFeature::DELETION_TAG , PacBio::BAM::BaseFeature::INSERTION_QV , PacBio::BAM::BaseFeature::MERGE_QV , PacBio::BAM::BaseFeature::SUBSTITUTION_QV , PacBio::BAM::BaseFeature::SUBSTITUTION_TAG , PacBio::BAM::BaseFeature::IPD , PacBio::BAM::BaseFeature::PULSE_WIDTH , PacBio::BAM::BaseFeature::PKMID , PacBio::BAM::BaseFeature::PKMEAN , PacBio::BAM::BaseFeature::LABEL , PacBio::BAM::BaseFeature::LABEL_QV , PacBio::BAM::BaseFeature::ALT_LABEL , PacBio::BAM::BaseFeature::ALT_LABEL_QV , PacBio::BAM::BaseFeature::PULSE_MERGE_QV , PacBio::BAM::BaseFeature::PULSE_CALL , PacBio::BAM::BaseFeature::START_FRAME , PacBio::BAM::BaseFeature::PULSE_CALL_WIDTH }; /// \returns QVs contained by read group rg. /// FIXME: this function should be provided by pbbam.ReadGroupInfo /// FIXME: pbbam, ReadGroupInfo does not recognize internal pulse features such as AltLabelQV. 
inline std::vector QVEnumsInReadGroup(const PacBio::BAM::ReadGroupInfo & rg) { std::vector ret; for (auto it = internal::QVEnums.begin(); it != internal::QVEnums.end(); it++) { if (rg.HasBaseFeature(*it)) { ret.push_back(*it); } } return ret; } /// \} /// \returns QVs contained by the first record if it exists, otherwise, return {} /// FIXME: this function provides an alternative route to get QVs contained in the bam file now, /// because pbbam ReadGroupInfo does not recorgize internal pulse features such as AltLabelQV. /// Note: Ignore Label because it is neither base feature nor internal pulse feature. inline std::vector QVEnumsInFirstRecord(const PacBio::BAM::BamFile & bamFile) { std::vector ret; PacBio::BAM::EntireFileQuery query(bamFile); for (const PacBio::BAM::BamRecord & record: query) { if (record.HasDeletionQV()) {ret.push_back(PacBio::BAM::BaseFeature::DELETION_QV);} if (record.HasDeletionTag()) {ret.push_back(PacBio::BAM::BaseFeature::DELETION_TAG);} if (record.HasInsertionQV()) {ret.push_back(PacBio::BAM::BaseFeature::INSERTION_QV);} if (record.HasMergeQV()) {ret.push_back(PacBio::BAM::BaseFeature::MERGE_QV);} if (record.HasSubstitutionQV()) {ret.push_back(PacBio::BAM::BaseFeature::SUBSTITUTION_QV);} if (record.HasSubstitutionTag()) {ret.push_back(PacBio::BAM::BaseFeature::SUBSTITUTION_TAG);} if (record.HasIPD()) {ret.push_back(PacBio::BAM::BaseFeature::IPD);} if (record.HasPulseWidth()) {ret.push_back(PacBio::BAM::BaseFeature::PULSE_WIDTH);} if (record.HasPkmid()) {ret.push_back(PacBio::BAM::BaseFeature::PKMID);} if (record.HasPkmean()) {ret.push_back(PacBio::BAM::BaseFeature::PKMEAN);} if (record.HasLabelQV()) {ret.push_back(PacBio::BAM::BaseFeature::LABEL_QV);} if (record.HasAltLabelTag()) {ret.push_back(PacBio::BAM::BaseFeature::ALT_LABEL);} if (record.HasAltLabelQV()) {ret.push_back(PacBio::BAM::BaseFeature::ALT_LABEL_QV);} if (record.HasPulseMergeQV()) {ret.push_back(PacBio::BAM::BaseFeature::PULSE_MERGE_QV);} if (record.HasPulseCall()) 
{ret.push_back(PacBio::BAM::BaseFeature::PULSE_CALL);} if (record.HasStartFrame()) {ret.push_back(PacBio::BAM::BaseFeature::START_FRAME);} if (record.HasPulseCallWidth()) {ret.push_back(PacBio::BAM::BaseFeature::PULSE_CALL_WIDTH);} break; // only use the first record. } return ret; } }; #endif blasr-smrtanalysis-4.0.0/utils/bam2bax/src/Bam2BaxMain.cpp000066400000000000000000000052251302464523700233470ustar00rootroot00000000000000// Author: Yuan Li #include "Bam2Bax.h" #include "OptionParser.h" #include "Settings.h" #include #include #include using namespace std; int main(int argc, char* argv[]) { // setup help & options optparse::OptionParser parser; parser.description("bam2bax converts the PacBio BAM format into bax.h5 format."); parser.prog("bam2bax"); parser.version("1.0.0.170337"); parser.add_version_option(true); parser.add_help_option(true); auto ioGroup = optparse::OptionGroup(parser, "Input/output files"); ioGroup.add_option("") .dest(Settings::Option::input_) .metavar("movie.subreads.bam movie.scraps.bam") .help("A movie.subreads.bam and a movie.scraps.bam"); ioGroup.add_option("--trace") .dest(Settings::Option::trace_) .metavar("movie.trc.h5") .help("(Optional but recommended) Input trace file to copy ScanData from"); ioGroup.add_option("-o") .dest(Settings::Option::output_) .metavar("STRING") .help("Prefix of output filenames. 
Movie name will be used if no prefix provided"); ioGroup.add_option("--metadata") .dest(Settings::Option::metadata_) .action("store_true") .help("Write metadata.xml to the upper directory of output file."); parser.add_option_group(ioGroup); auto modeGroup = optparse::OptionGroup(parser, "Output file types (mutually exclusive:)"); modeGroup.add_option("--base") .dest(Settings::Option::baseMode_) .metavar("") .action("store_true") .help("Output bax.h5 (default)"); modeGroup.add_option("--pulse") .dest(Settings::Option::pulseMode_) .metavar("") .action("store_true") .help("Output pls.h5"); modeGroup.add_option("--baseMap") .dest(Settings::Option::baseMap_) .metavar(Settings::OptionValue::baseMap_) .help("Set /ScanData/DyeSet/BaseMap, mapping channels to bases."); modeGroup.add_option("--ignoreQV") .dest(Settings::Option::ignoreQV_) .metavar("") .action("store_true") .help("Don't save QVs in ouptut file."); parser.add_option_group(modeGroup); // parse command line Settings settings = Settings::FromCommandLine(parser, argc, argv); if (!settings.errors_.empty()) { cerr << endl; for (const auto e : settings.errors_) cerr << "ERROR: " << e << endl; cerr << endl; parser.print_help(); return EXIT_FAILURE; } // main conversion return Bam2Bax::Run(settings); } blasr-smrtanalysis-4.0.0/utils/bam2bax/src/Bam2PlxMain.cpp000066400000000000000000000043761302464523700234060ustar00rootroot00000000000000// Author: Yuan Li #include "Bam2Bax.h" #include "OptionParser.h" #include "Settings.h" #include #include #include using namespace std; int main(int argc, char* argv[]) { // setup help & options optparse::OptionParser parser; parser.description("bam2plx converts the PacBio Internal BAM format into plx.h5 format."); parser.prog("bam2plx"); parser.version("1.0.0.170337"); parser.add_version_option(true); parser.add_help_option(true); auto ioGroup = optparse::OptionGroup(parser, "Input/output files"); ioGroup.add_option("") .dest(Settings::Option::input_) .metavar("movie.subreads.bam 
movie.scraps.bam") .help("A movie.subreads.bam and a movie.scraps.bam"); ioGroup.add_option("-o") .dest(Settings::Option::output_) .metavar("STRING") .help("Prefix of output filenames. Movie name will be used if no prefix provided"); ioGroup.add_option("--metadata") .dest(Settings::Option::metadata_) .action("store_true") .help("Write metadata.xml to the upper directory of output file."); parser.add_option_group(ioGroup); auto modeGroup = optparse::OptionGroup(parser, "Output file types (mutually exclusive:)"); modeGroup.add_option("--baseMap") .dest(Settings::Option::baseMap_) .metavar(Settings::OptionValue::baseMap_) .help("Set /ScanData/DyeSet/BaseMap, mapping channels to bases."); modeGroup.add_option("--ignoreQV") .dest(Settings::Option::ignoreQV_) .metavar("") .action("store_true") .help("Don't save QVs in ouptut file."); parser.add_option_group(modeGroup); // parse command line Settings settings = Settings::FromCommandLine(parser, argc, argv, true); if (!settings.errors_.empty()) { cerr << endl; for (const auto e : settings.errors_) cerr << "ERROR: " << e << endl; cerr << endl; parser.print_help(); return EXIT_FAILURE; } // Reset settings.mode to pulse mode. settings.mode = Settings::PulseMode; settings.outputBaxFilename = settings.outputBaxPrefix + ".plx.h5"; // main conversion return Bam2Bax::Run(settings); } blasr-smrtanalysis-4.0.0/utils/bam2bax/src/CMakeLists.txt000066400000000000000000000032331302464523700233570ustar00rootroot00000000000000include_directories( . 
${BLASR_INCLUDE_DIRS} ${Boost_INCLUDE_DIRS} ${HDF5_INCLUDE_DIRS} ${HTSLIB_INCLUDE_DIRS} ${PacBioBAM_INCLUDE_DIRS} ${PBDATA_INCLUDE_DIRS} ${PBDATA_ROOT_DIR} ${PBIHDF_INCLUDE_DIRS} ${ZLIB_INCLUDE_DIRS} ) set(SOURCES Settings.h Settings.cpp Converter.h Converter.cpp Bam2BaxInternal.h RegionTypeAdapter.h RegionsAdapter.h Bam2BaxConverter.h Bam2BaxConverterImpl.hpp Bam2Bax.h Bam2Bax.cpp OptionParser.h OptionParser.cpp MetadataWriter.h MetadataWriter.cpp ) set(BAM2BAX_SOURCES Bam2BaxMain.cpp ${SOURCES} ) set(BAM2PLX_SOURCES Bam2PlxMain.cpp ${SOURCES} ) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${Bam2Bax_CXX_FLAGS}") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${Bam2Bax_EXE_LINKER_FLAGS}") add_executable(bam2bax ${BAM2BAX_SOURCES}) set_target_properties(bam2bax PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${Bam2Bax_BinDir} ) if (NOT APPLE) set(MY_LIBRT -lrt) else() endif() target_link_libraries(bam2bax ${PBBAM_LINK_FLAG} ${BLASR_LIBRARIES} ${PBIHDF_LIBRARIES} ${PBDATA_LIBRARIES} ${HDF5_CPP_LIBRARIES} ${HDF5_LIBRARIES} ${PacBioBAM_LIBRARIES} ${HTSLIB_LIBRARIES} ${ZLIB_LIBRARIES} ${MY_LIBRT} ) add_executable(bam2plx ${BAM2PLX_SOURCES} ) set_target_properties(bam2plx PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${Bam2Bax_BinDir} ) target_link_libraries(bam2plx ${PBBAM_LINK_FLAG} ${BLASR_LIBRARIES} ${PBIHDF_LIBRARIES} ${PBDATA_LIBRARIES} ${HDF5_CPP_LIBRARIES} ${HDF5_LIBRARIES} ${PacBioBAM_LIBRARIES} ${HTSLIB_LIBRARIES} ${ZLIB_LIBRARIES} ${MY_LIBRT} ) blasr-smrtanalysis-4.0.0/utils/bam2bax/src/Converter.cpp000066400000000000000000000123271302464523700232760ustar00rootroot00000000000000#include "Converter.h" Converter::Converter(Settings const& settings) :settings_(settings) { writer_ = NULL; scanData_ = NULL; std::string infn = settings_.subreadsBamFilename; bamfile_ = new PacBio::BAM::BamFile(infn); PacBio::BAM::BamHeader bamheader = bamfile_->Header(); if (bamheader.ReadGroups().size() != 1) { AddErrorMessage("Bam file must contain reads from exactly one SMRTCell."); // XXX: Throw 
initialization exception } PacBio::BAM::ReadGroupInfo rg = bamheader.ReadGroups()[0]; MockScanData(rg); // Write metadata.xml to parent directory of Bax.h5. if (not settings_.outputMetadataFilename.empty()) MetadataWriter metaWriter_(settings_.outputMetadataFilename, rg, settings_.outputAnalysisDirname); // FIXME: pbbam needs to provide an API which returns BaseFeatures in read group std::vector qvs = settings_.ignoreQV ? std::vector({}) : internal::QVEnumsInFirstRecord(*bamfile_); InitializeWriter(rg.BasecallerVersion(), qvs); } Converter::~Converter(void) { if (scanData_ != NULL) delete scanData_; if (writer_ != NULL) delete writer_; delete bamfile_; } std::vector Converter::Errors(void) const { return errors_; } bool Converter::Run() { if (settings_.traceFilename.empty()) { writer_->WriteScanData(*scanData_); } else { HDFFile traceFile; traceFile.Open(settings_.traceFilename, H5F_ACC_RDONLY); writer_->CopyObject(traceFile, "/ScanData"); if (settings_.mode == Settings::PulseMode) { SetInverseGain(traceFile); } traceFile.Close(); } // Regions attribute RegionTypes, which defines supported region types in ORDER. std::vector regionTypes = RegionTypeAdapter::ToRegionTypes(Bam2BaxDefaults::Bax_Regions_RegionTypes); if (not settings_.subreadsBamFilename.empty() and not settings_.scrapsBamFilename.empty()) { // Stich subreads and scraps in order to reconstruct polymerase reads. PacBio::BAM::VirtualPolymeraseReader reader(settings_.subreadsBamFilename, settings_.scrapsBamFilename); while(reader.HasNext()) { // FIXME: pbbam should not crash when reading internal pulse features. 
const PacBio::BAM::VirtualPolymeraseBamRecord & record = reader.Next(); SMRTSequence smrt; smrt.Copy(record, true); std::vector ras = RegionsAdapter::ToRegionAnnotations(record, regionTypes); if (not writer_->WriteOneZmw(smrt, ras) or not writer_->Errors().empty()) { break; } writer_->Flush(); } if (not settings_.ignoreQV) writer_->WriteFakeDataSets(); for (auto error: writer_->Errors()) { AddErrorMessage(error); } } return errors_.empty(); } void Converter::MockScanData(PacBio::BAM::ReadGroupInfo& rg) { // Construct AcqParams AcqParams acqParams(Bam2BaxDefaults::Bax_ScanData_AduGain, Bam2BaxDefaults::Bax_ScanData_CameraGain, Bam2BaxDefaults::Bax_ScanData_CameraType, Bam2BaxDefaults::Bax_ScanData_HotStartFrame, Bam2BaxDefaults::Bax_ScanData_LaserOnFrame); // Construct scandata. scanData_ = new ScanData(acqParams); scanData_->PlatformID(Sequel) // assume sequel movie .MovieName(rg.MovieName()) // should be reliable now .WhenStarted(rg.Date()) .RunCode(Bam2BaxDefaults::Bax_ScanData_RunCode) // bam does not contain RunCode .NumFrames(Bam2BaxDefaults::Bax_ScanData_NumFrames) // bam does not contain NumFrames .FrameRate(Bam2BaxDefaults::Bax_ScanData_FrameRate) // Ignore bam header FrameRate. .SequencingKit(rg.SequencingKit()) .BindingKit(rg.BindingKit()) .BaseMap(settings_.baseMap); } void Converter::InitializeWriter(const std::string& bcvers, const std::vector& qvs) { std::string outfn = settings_.outputBaxFilename; Settings::Mode mode = settings_.mode; if (mode == Settings::BaseMode) { std::cout << "Converting BAM to bax.h5." << std::endl; writer_ = new HDFBaxWriter(outfn, bcvers, scanData_->BaseMap(), qvs, Bam2BaxDefaults::Bax_Regions_RegionTypes); } else if (mode == Settings::PulseMode) { std::cout << "Converting BAM to plx.h5." << std::endl; writer_ = new HDFPulseWriter(outfn, bcvers, scanData_->BaseMap(), qvs, Bam2BaxDefaults::Bax_Regions_RegionTypes); } else { std::cerr << "UNKNOWN mode." 
<< settings_.mode << std::endl; throw std::exception(); } } void Converter::SetInverseGain(HDFFile& traceFile) { H5::Group acqGrp = traceFile.hdfFile.openGroup("/ScanData/AcqParams"); H5::Attribute aduAttr = acqGrp.openAttribute("AduGain"); float igain; H5::DataType* dt = new H5::DataType(H5::PredType::IEEE_F32LE); aduAttr.read(*dt, &igain); HDFPulseWriter* pw = static_cast(writer_); pw->SetInverseGain(igain); } blasr-smrtanalysis-4.0.0/utils/bam2bax/src/Converter.h000066400000000000000000000054071302464523700227440ustar00rootroot00000000000000// Author: Yuan Li #ifndef BAM2BAX_ICONVERTER_H_ #define BAM2BAX_ICONVERTER_H_ #include #include #include #include "pbdata/Enumerations.h" #include "pbbam/BamFile.h" #include "pbbam/BamHeader.h" #include "pbbam/ReadGroupInfo.h" #include "pbbam/virtual/VirtualPolymeraseReader.h" #include "pbbam/virtual/VirtualPolymeraseBamRecord.h" #include "pbbam/virtual/VirtualRegion.h" #include "pbbam/virtual/VirtualRegionType.h" #include "pbbam/virtual/VirtualRegionTypeMap.h" #include "HDFWriterBase.hpp" #include "HDFBaxWriter.hpp" #include "HDFPulseWriter.hpp" #include "RegionsAdapter.h" #include "Settings.h" #include "MetadataWriter.h" #include "Bam2BaxInternal.h" namespace Bam2BaxDefaults { // Default value of attribute /ScanData/AcqParams/NumFrames in Bax. static const unsigned int Bax_ScanData_NumFrames = 0; // Default value of attribute /ScanData/AcqParams/AduGain in Bax. static const float Bax_ScanData_AduGain = 1.0; // Default value of attribute /ScanData/AcqParams/CameraGain in Bax. static const float Bax_ScanData_CameraGain = 1.0; // Default value of attribute /ScanData/AcqParams/CameraType in Bax. static const int Bax_ScanData_CameraType = 0; // Default value of attribute /ScanData/AcqParams/HotStartFrame in Bax. static const UInt Bax_ScanData_HotStartFrame = 0; // Default value of attribute /ScanData/AcqParams/LaserOnFrame in Bax. 
static const UInt Bax_ScanData_LaserOnFrame = 0; // Default value of attribute /ScanData/AcqParams/FrameRate in Bax. static const float Bax_ScanData_FrameRate = 80.047035; // Default value of attribute /ScanData/RunInfo/RunCode in Bax. static const std::string Bax_ScanData_RunCode = "Bam2Bax_Run_Code"; // Default value of attribute /ScanData/DyeSet/BaseMap in Bax. static const std::string Bax_ScanData_BaseMap = PacBio::AttributeValues::ScanData::DyeSet::basemap; // Default value of attribute /Regions/RegionTypes in Bax. static const std::vector Bax_Regions_RegionTypes = PacBio::AttributeValues::Regions::regiontypes; } class Converter { public: Converter(Settings const& settings); ~Converter(void); public: std::vector Errors(void) const; bool Run(); protected: void AddErrorMessage(const std::string & errmsg) { errors_.push_back(errmsg); } protected: // protected variables Settings const& settings_; ScanData* scanData_; HDFWriterBase* writer_; PacBio::BAM::BamFile* bamfile_; std::vector errors_; private: void MockScanData(PacBio::BAM::ReadGroupInfo& rg); void InitializeWriter(const std::string& bcvers, const std::vector& qvs); void SetInverseGain(HDFFile& traceFile); }; #endif blasr-smrtanalysis-4.0.0/utils/bam2bax/src/MetadataWriter.cpp000066400000000000000000000032731302464523700242440ustar00rootroot00000000000000#include "MetadataWriter.h" std::string internal::Replace(const std::string & in_str, const std::string & to_find, const std::string & to_replace) { // Replace the first occurrence of to_find by to_replace. 
std::string ret = in_str; std::size_t pos = ret.find(to_find); if (pos != std::string::npos) { ret.replace(pos, to_find.size(), to_replace); } return ret; } MetadataWriter::MetadataWriter(const std::string & filename, const PacBio::BAM::ReadGroupInfo & rg, const std::string & analysisDir) { MetadataWriter(filename, rg.BasecallerVersion(), rg.SequencingKit(), rg.BindingKit(), analysisDir); } MetadataWriter::MetadataWriter(const std::string & filename, const std::string & basecallerVersion, const std::string & sequencingKit, const std::string & bindingKit, const std::string & analysisDir) { assert(analysisDir.find('/') == std::string::npos); std::ofstream ofile; ofile.open(filename, std::ofstream::out); std::string to_print = internal::META_CONTENT; to_print = internal::Replace(to_print, "__BASECALLERVERSION__", basecallerVersion); to_print = internal::Replace(to_print, "__SEQUENCINGKIT__", sequencingKit); to_print = internal::Replace(to_print, "__BINDINGKIT__", bindingKit); to_print = internal::Replace(to_print, "__ANALYSISDIR__", analysisDir); ofile << to_print << std::endl; ofile.close(); } blasr-smrtanalysis-4.0.0/utils/bam2bax/src/MetadataWriter.h000066400000000000000000000030371302464523700237070ustar00rootroot00000000000000// Author: Yuan Li #ifndef _BAM2BAX_METADATA_WRITER_H_ #define _BAM2BAX_METADATA_WRITER_H_ #include #include #include #include namespace internal{ const std::string DEFAULT_ANALYSIS_DIR = "Analysis_Results"; const std::string META_CONTENT = "__BASECALLERVERSION__31__BINDINGKIT____SEQUENCINGKIT__BasecallerV1__ANALYSISDIR__"; std::string Replace(const std::string & in_str, const std::string & to_find, const std::string & to_replace); } //namespace internal class MetadataWriter { public: MetadataWriter(const std::string & filename, const PacBio::BAM::ReadGroupInfo & rg, const std::string & analysisDir=internal::DEFAULT_ANALYSIS_DIR); MetadataWriter(const std::string & filename, const std::string & basecallerVersion, const std::string & 
sequencingKit, const std::string & bindingKit, const std::string & analysisDir); ~MetadataWriter(void) {} }; #endif blasr-smrtanalysis-4.0.0/utils/bam2bax/src/OptionParser.cpp000066400000000000000000000367751302464523700237710ustar00rootroot00000000000000/** * Copyright (C) 2010 Johannes Weißl * License: your favourite BSD-style license * * See OptionParser.h for help. */ #include "OptionParser.h" #include #include #include #include #if defined(ENABLE_NLS) && ENABLE_NLS # include # define _(s) gettext(s) #else # define _(s) ((const char *) (s)) #endif using namespace std; namespace optparse { ////////// auxiliary (string) functions { ////////// class str_wrap { public: str_wrap(const string& l, const string& r) : lwrap(l), rwrap(r) {} str_wrap(const string& w) : lwrap(w), rwrap(w) {} string operator() (const string& s) { return lwrap + s + rwrap; } const string lwrap, rwrap; }; template static string str_join_trans(const string& sep, InputIterator begin, InputIterator end, UnaryOperator op) { string buf; for (InputIterator it = begin; it != end; ++it) { if (it != begin) buf += sep; buf += op(*it); } return buf; } template static string str_join(const string& sep, InputIterator begin, InputIterator end) { return str_join_trans(sep, begin, end, str_wrap("")); } static string& str_replace(string& s, const string& patt, const string& repl) { size_t pos = 0, n = patt.length(); while (true) { pos = s.find(patt, pos); if (pos == string::npos) break; s.replace(pos, n, repl); pos += repl.size(); } return s; } static string str_replace(const string& s, const string& patt, const string& repl) { string tmp = s; str_replace(tmp, patt, repl); return tmp; } static string str_format(const string& s, size_t pre, size_t len, bool indent_first = true) { stringstream ss; string p; if (indent_first) p = string(pre, ' '); size_t pos = 0, linestart = 0; size_t line = 0; while (true) { bool wrap = false; size_t new_pos = s.find_first_of(" \n\t", pos); if (new_pos == string::npos) break; 
if (s[new_pos] == '\n') { pos = new_pos + 1; wrap = true; } if (line == 1) p = string(pre, ' '); if (wrap || new_pos + pre > linestart + len) { ss << p << s.substr(linestart, pos - linestart - 1) << endl; linestart = pos; line++; } pos = new_pos + 1; } ss << p << s.substr(linestart) << endl; return ss.str(); } static string str_inc(const string& s) { stringstream ss; string v = (s != "") ? s : "0"; long i; istringstream(v) >> i; ss << i+1; return ss.str(); } static unsigned int cols() { unsigned int n = 80; #ifndef _WIN32 const char *s = getenv("COLUMNS"); if (s) istringstream(s) >> n; #endif return n; } static string basename(const string& s) { string b = s; size_t i = b.find_last_not_of('/'); if (i == string::npos) { if (b[0] == '/') b.erase(1); return b; } b.erase(i+1, b.length()-i-1); i = b.find_last_of("/"); if (i != string::npos) b.erase(0, i+1); return b; } ////////// } auxiliary (string) functions ////////// ////////// class OptionParser { ////////// OptionParser::OptionParser() : _usage(_("%prog [options]")), _add_help_option(true), _add_version_option(true), _interspersed_args(true) {} Option& OptionParser::add_option(const string& opt) { const string tmp[1] = { opt }; return add_option(vector(&tmp[0], &tmp[1])); } Option& OptionParser::add_option(const string& opt1, const string& opt2) { const string tmp[2] = { opt1, opt2 }; return add_option(vector(&tmp[0], &tmp[2])); } Option& OptionParser::add_option(const string& opt1, const string& opt2, const string& opt3) { const string tmp[3] = { opt1, opt2, opt3 }; return add_option(vector(&tmp[0], &tmp[3])); } Option& OptionParser::add_option(const vector& v) { _opts.resize(_opts.size()+1); Option& option = _opts.back(); string dest_fallback; for (vector::const_iterator it = v.begin(); it != v.end(); ++it) { if (it->substr(0,2) == "--") { const string s = it->substr(2); if (option.dest() == "") option.dest(str_replace(s, "-", "_")); option._long_opts.insert(s); _optmap_l[s] = &option; } else if ( it->empty() ) 
{ continue; } else { const string s = it->substr(1,1); if (dest_fallback == "") dest_fallback = s; option._short_opts.insert(s); _optmap_s[s] = &option; } } if (option.dest() == "") option.dest(dest_fallback); return option; } OptionParser& OptionParser::add_option_group(const OptionGroup& group) { for (list