pax_global_header00006660000000000000000000000064126073765670014535gustar00rootroot0000000000000052 comment=8e668beae0dda1da6914586fb458182c6c3c7482 blasr-8e668beae0dda1da6914586fb458182c6c3c7482/000077500000000000000000000000001260737656700176605ustar00rootroot00000000000000blasr-8e668beae0dda1da6914586fb458182c6c3c7482/.gitignore000066400000000000000000000000311260737656700216420ustar00rootroot00000000000000defines.mk *.d *.o blasr blasr-8e668beae0dda1da6914586fb458182c6c3c7482/.gitmodules000066400000000000000000000001401260737656700220300ustar00rootroot00000000000000[submodule "libcpp"] path = libcpp url = git://github.com/PacificBiosciences/blasr_libcpp.git blasr-8e668beae0dda1da6914586fb458182c6c3c7482/Blasr.cpp000066400000000000000000001547331260737656700214440ustar00rootroot00000000000000// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. // // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted (subject to the limitations in the // disclaimer below) provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // // * Neither the name of Pacific Biosciences nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE // GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC // BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE // DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF // USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND // ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT // OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF // SUCH DAMAGE. // Author: Mark Chaisson #include "BlasrMiscs.hpp" #include "BlasrUtils.hpp" #include "BlasrAlign.hpp" #include "RegisterBlasrOptions.h" //#define USE_GOOGLE_PROFILER #ifdef USE_GOOGLE_PROFILER #include "gperftools/profiler.h" #endif using namespace std; // Declare global structures that are shared between threads. MappingSemaphores semaphores; ostream *outFilePtr = NULL; #ifdef USE_PBBAM PacBio::BAM::BamWriter * bamWriterPtr = NULL; #endif HDFRegionTableReader *regionTableReader = NULL; ReaderAgglomerate *reader = NULL; const string GetMajorVersion() { return "2.0.0"; } const string GetVersion(void) { string perforceVersionString("$Change$"); string version = GetMajorVersion(); if (perforceVersionString.size() > 12) { version.insert(version.size(), "."); version.insert(version.size(), perforceVersionString, 9, perforceVersionString.size() - 11); } return version; } /// Checks whether a smrtRead meets the following criteria /// (1) is within the search holeNumber range specified by params.holeNumberRanges. /// (2) its length greater than params.maxReadlength /// (3) its read score (rq) is greater than params.minRawSubreadScore /// (4) its qual is greater than params.minAvgQual. 
/// Change stop to false if /// HoleNumber of the smrtRead is greater than the search holeNumber range. bool IsGoodRead(const SMRTSequence & smrtRead, MappingParameters & params, bool & stop) { if (params.holeNumberRangesStr.size() > 0 and not params.holeNumberRanges.contains(smrtRead.HoleNumber())) { // Stop processing once the specified zmw hole number is reached. // Eventually this will change to just seek to hole number, and // just align one read anyway. if (smrtRead.HoleNumber() > params.holeNumberRanges.max()){ stop = true; return false; } return false; } // // Discard reads that are too small, or not labeled as having any // useable/good sequence. // if (smrtRead.highQualityRegionScore < params.minRawSubreadScore or (params.maxReadLength != 0 and smrtRead.length > UInt(params.maxReadLength)) or (smrtRead.length < params.minReadLength)) { return false; } if (smrtRead.qual.Empty() != false and smrtRead.GetAverageQuality() < params.minAvgQual) { return false; } return true; } // Make primary intervals (which are intervals of subreads to align // in the first round) from none BAM file using region table. void MakePrimaryIntervals(RegionTable * regionTablePtr, SMRTSequence & smrtRead, vector & subreadIntervals, vector & subreadDirections, int & bestSubreadIndex, MappingParameters & params) { vector adapterIntervals; // // Determine endpoints of this subread in the main read. // if (params.useRegionTable == false) { // // When there is no region table, the subread is the entire // read. // ReadInterval wholeRead(0, smrtRead.length); // The set of subread intervals is just the entire read. subreadIntervals.push_back(wholeRead); } else { // // Grab the subread & adapter intervals from the entire region table to // iterate over. 
// assert(regionTablePtr->HasHoleNumber(smrtRead.HoleNumber())); subreadIntervals = (*regionTablePtr)[smrtRead.HoleNumber()].SubreadIntervals(smrtRead.length, params.byAdapter); adapterIntervals = (*regionTablePtr)[smrtRead.HoleNumber()].AdapterIntervals(); } // The assumption is that neighboring subreads must have the opposite // directions. So create directions for subread intervals with // interleaved 0s and 1s. CreateDirections(subreadDirections, subreadIntervals.size()); // // Trim the boundaries of subread intervals so that only high quality // regions are included in the intervals, not N's. Remove intervals // and their corresponding dirctions, if they are shorter than the // user specified minimum read length or do not intersect with hq // region at all. Finally, return index of the (left-most) longest // subread in the updated vector. // int longestSubreadIndex = GetHighQualitySubreadsIntervals( subreadIntervals, // a vector of subread intervals. subreadDirections, // a vector of subread directions. smrtRead.lowQualityPrefix, // hq region start pos. smrtRead.length - smrtRead.lowQualitySuffix, // hq end pos. params.minSubreadLength); // minimum read length. bestSubreadIndex = longestSubreadIndex; if (params.concordantTemplate == "longestsubread") { // Use the (left-most) longest full-pass subread as // template for concordant mapping int longestFullSubreadIndex = GetLongestFullSubreadIndex( subreadIntervals, adapterIntervals); if (longestFullSubreadIndex >= 0) { bestSubreadIndex = longestFullSubreadIndex; } } else if (params.concordantTemplate == "typicalsubread") { // Use the 'typical' full-pass subread as template for // concordant mapping. int typicalFullSubreadIndex = GetTypicalFullSubreadIndex( subreadIntervals, adapterIntervals); if (typicalFullSubreadIndex >= 0) { bestSubreadIndex = typicalFullSubreadIndex; } } else if (params.concordantTemplate == "mediansubread") { // Use the 'median-length' full-pass subread as template for // concordant mapping. 
int medianFullSubreadIndex = GetMedianLengthFullSubreadIndex( subreadIntervals, adapterIntervals); if (medianFullSubreadIndex >= 0) { bestSubreadIndex = medianFullSubreadIndex; } } else { assert(false); } } // Make primary intervals (which are intervals of subreads to align // in the first round) for BAM file, -concordant, void MakePrimaryIntervals(vector & subreads, vector & subreadIntervals, vector & subreadDirections, int & bestSubreadIndex, MappingParameters & params) { MakeSubreadIntervals(subreads, subreadIntervals); CreateDirections(subreadDirections, subreadIntervals.size()); bestSubreadIndex = GetIndexOfMedian(subreadIntervals); } /// Scan the next read from input. This may either be a CCS read, /// or regular read (though this may be aligned in whole, or by /// subread). /// \params[in] reader: FASTA/FASTQ/BAX.H5/CCS.H5/BAM file reader /// \params[in] regionTablePtr: RGN.H5 region table pointer. /// \params[in] params: mapping parameters. /// \params[out] smrtRead: to save smrt sequence. /// \params[out] ccsRead: to save ccs sequence. /// \params[out] readIsCCS: read is CCSSequence. /// \params[out] readGroupId: associated read group id /// \params[out] associatedRandInt: random int associated with this zmw, /// required to for generating deterministic random /// alignments regardless of nproc. /// \params[out] stop: whether or not stop mapping remaining reads. /// \returns whether or not to skip mapping reads of this zmw. 
bool FetchReads(ReaderAgglomerate * reader, RegionTable * regionTablePtr, SMRTSequence & smrtRead, CCSSequence & ccsRead, vector & subreads, MappingParameters & params, bool & readIsCCS, std::string & readGroupId, int & associatedRandInt, bool & stop) { if (reader->GetFileType() != BAM or not params.concordant) { if (reader->GetFileType() == HDFCCS || reader->GetFileType() == HDFCCSONLY) { if (GetNextReadThroughSemaphore(*reader, params, ccsRead, readGroupId, associatedRandInt, semaphores) == false) { stop = true; return false; } else { readIsCCS = true; smrtRead.Copy(ccsRead); ccsRead.SetQVScale(params.qvScaleType); smrtRead.SetQVScale(params.qvScaleType); } assert(ccsRead.zmwData.holeNumber == smrtRead.zmwData.holeNumber and ccsRead.zmwData.holeNumber == ccsRead.unrolledRead.zmwData.holeNumber); } else { if (GetNextReadThroughSemaphore(*reader, params, smrtRead, readGroupId, associatedRandInt, semaphores) == false) { stop = true; return false; } else { smrtRead.SetQVScale(params.qvScaleType); } } // // Only normal (non-CCS) reads should be masked. Since CCS reads store the raw read, that is masked. // bool readHasGoodRegion = true; if (params.useRegionTable and params.useHQRegionTable) { if (readIsCCS) { readHasGoodRegion = MaskRead(ccsRead.unrolledRead, ccsRead.unrolledRead.zmwData, *regionTablePtr); } else { readHasGoodRegion = MaskRead(smrtRead, smrtRead.zmwData, *regionTablePtr); } // // Store the high quality start and end of this read for masking purposes when printing. 
// int hqStart, hqEnd; int score; LookupHQRegion(smrtRead.zmwData.holeNumber, *regionTablePtr, hqStart, hqEnd, score); smrtRead.lowQualityPrefix = hqStart; smrtRead.lowQualitySuffix = smrtRead.length - hqEnd; smrtRead.highQualityRegionScore = score; } else { smrtRead.lowQualityPrefix = 0; smrtRead.lowQualitySuffix = 0; } if (not IsGoodRead(smrtRead, params, stop) or stop) return false; return readHasGoodRegion; } else { subreads.clear(); vector reads; if (GetNextReadThroughSemaphore(*reader, params, reads, readGroupId, associatedRandInt, semaphores) == false) { stop = true; return false; } for (const SMRTSequence & smrtRead: reads) { if (IsGoodRead(smrtRead, params, stop)) { subreads.push_back(smrtRead); } } if (subreads.size() != 0) { MakeVirtualRead(smrtRead, subreads); return true; } else { return false; } } } void MapReadsNonCCS(MappingData *mapData, MappingBuffers & mappingBuffers, SMRTSequence & smrtRead, SMRTSequence & smrtReadRC, CCSSequence & ccsRead, vector & subreads, MappingParameters & params, const int & associatedRandInt, ReadAlignments & allReadAlignments, ofstream & threadOut) { DNASuffixArray sarray; TupleCountTable ct; SequenceIndexDatabase seqdb; T_GenomeSequence genome; BWT *bwtPtr; mapData->ShallowCopySuffixArray(sarray); mapData->ShallowCopyReferenceSequence(genome); mapData->ShallowCopySequenceIndexDatabase(seqdb); mapData->ShallowCopyTupleCountTable(ct); bwtPtr = mapData->bwtPtr; SeqBoundaryFtr seqBoundary(&seqdb); vector subreadIntervals; vector subreadDirections; int bestSubreadIndex; if (mapData->reader->GetFileType() != BAM or not params.concordant) { MakePrimaryIntervals(mapData->regionTablePtr, smrtRead, subreadIntervals, subreadDirections, bestSubreadIndex, params); } else { MakePrimaryIntervals(subreads, subreadIntervals, subreadDirections, bestSubreadIndex, params); } // Flop all directions if direction of the longest subread is 1. 
if (bestSubreadIndex >= 0 and bestSubreadIndex < int(subreadDirections.size()) and subreadDirections[bestSubreadIndex] == 1) { UpdateDirections(subreadDirections, true); } int startIndex = 0; int endIndex = subreadIntervals.size(); if (params.concordant) { // Only the longest subread will be aligned in the first round. startIndex = max(startIndex, bestSubreadIndex); endIndex = min(endIndex, bestSubreadIndex + 1); } // // Make room for alignments. // allReadAlignments.Resize(subreadIntervals.size()); allReadAlignments.alignMode = Subread; DNALength intvIndex; for (intvIndex = startIndex; intvIndex < endIndex; intvIndex++) { SMRTSequence subreadSequence, subreadSequenceRC; MakeSubreadOfInterval(subreadSequence, smrtRead, subreadIntervals[intvIndex], params); MakeSubreadRC(subreadSequenceRC, subreadSequence, smrtRead); // // Store the sequence that is being mapped in case no hits are // found, and missing sequences are printed. // allReadAlignments.SetSequence(intvIndex, subreadSequence); vector alignmentPtrs; mapData->metrics.numReads++; assert(subreadSequence.zmwData.holeNumber == smrtRead.zmwData.holeNumber); // // Try default and fast parameters to map the read. // MapRead(subreadSequence, subreadSequenceRC, genome, // possibly multi fasta file read into one sequence sarray, *bwtPtr, // The suffix array, and the bwt-fm index structures seqBoundary, // Boundaries of contigs in the // genome, alignments do not span // the ends of boundaries. ct, // Count table to use word frequencies in the genome to weight matches. seqdb, // Information about the names of // chromosomes in the genome, and // where their sequences are in the genome. params, // A huge list of parameters for // mapping, only compile/command // line values set. mapData->metrics, // Keep track of time/ hit counts, // etc.. Not fully developed, but // should be. alignmentPtrs, // Where the results are stored. 
mappingBuffers, // A class of buffers for structurs // like dyanmic programming // matrices, match lists, etc., that are not // reallocated between calls to // MapRead. They are cleared though. mapData, // Some values that are shared // across threads. semaphores); // // No alignments were found, sometimes parameters are // specified to try really hard again to find an alignment. // This sets some parameters that use a more sensitive search // at the cost of time. // if ((alignmentPtrs.size() == 0 or alignmentPtrs[0]->pctSimilarity < 80) and params.doSensitiveSearch) { MappingParameters sensitiveParams = params; sensitiveParams.SetForSensitivity(); MapRead(subreadSequence, subreadSequenceRC, genome, sarray, *bwtPtr, seqBoundary, ct, seqdb, sensitiveParams, mapData->metrics, alignmentPtrs, mappingBuffers, mapData, semaphores); } // // Store the mapping quality values. // if (alignmentPtrs.size() > 0 and alignmentPtrs[0]->score < params.maxScore and params.storeMapQV) { StoreMapQVs(subreadSequence, alignmentPtrs, params); } // // Select alignments for this subread. // vector selectedAlignmentPtrs = SelectAlignmentsToPrint(alignmentPtrs, params, associatedRandInt); allReadAlignments.AddAlignmentsForSeq(intvIndex, selectedAlignmentPtrs); // // Move reference from subreadSequence, which will be freed at // the end of this loop to the smrtRead, which exists for the // duration of aligning all subread of the smrtRead. 
// for (size_t a = 0; a < alignmentPtrs.size(); a++) { if (alignmentPtrs[a]->qStrand == 0) { alignmentPtrs[a]->qAlignedSeq.ReferenceSubstring(smrtRead, alignmentPtrs[a]->qAlignedSeq.seq - subreadSequence.seq, alignmentPtrs[a]->qAlignedSeqLength); } else { alignmentPtrs[a]->qAlignedSeq.ReferenceSubstring(smrtReadRC, alignmentPtrs[a]->qAlignedSeq.seq - subreadSequenceRC.seq, alignmentPtrs[a]->qAlignedSeqLength); } } // Fix for memory leakage bug due to undeleted Alignment Candidate objectts which wasn't selected // for printing // delete all AC which are in complement of SelectedAlignmemntPtrs vector // namely (SelectedAlignmentPtrs/alignmentPtrs) for (int ii = 0; ii < alignmentPtrs.size(); ii++) { int found =0; for (int jj = 0; jj < selectedAlignmentPtrs.size(); jj++) { if (alignmentPtrs[ii] == selectedAlignmentPtrs[jj] ) { found = 1; break; } } if (found == 0) delete alignmentPtrs[ii]; } subreadSequence.Free(); subreadSequenceRC.Free(); } // End of looping over subread intervals within [startIndex, endIndex). 
if (params.verbosity >= 3) allReadAlignments.Print(threadOut); if (params.concordant) { allReadAlignments.read = smrtRead; allReadAlignments.alignMode = ZmwSubreads; if (startIndex >= 0 && startIndex < int(allReadAlignments.subreadAlignments.size())) { vector selectedAlignmentPtrs = allReadAlignments.CopySubreadAlignments(startIndex); for(int alignmentIndex = 0; alignmentIndex < int(selectedAlignmentPtrs.size()); alignmentIndex++) { FlankTAlignedSeq(selectedAlignmentPtrs[alignmentIndex], seqdb, genome, params.flankSize); } for (intvIndex = 0; intvIndex < subreadIntervals.size(); intvIndex++) { if (intvIndex == startIndex) continue; int passDirection = subreadDirections[intvIndex]; int passStartBase = subreadIntervals[intvIndex].start; int passNumBases = subreadIntervals[intvIndex].end - passStartBase; if (passNumBases <= params.minReadLength) {continue;} mapData->metrics.numReads++; SMRTSequence subread; subread.ReferenceSubstring(smrtRead, passStartBase, passNumBases); subread.CopyTitle(smrtRead.title); // The unrolled alignment should be relative to the entire read. 
if (params.clipping == SAMOutput::subread) { SMRTSequence maskedSubread; MakeSubreadOfInterval(maskedSubread, smrtRead, subreadIntervals[intvIndex], params); allReadAlignments.SetSequence(intvIndex, maskedSubread); maskedSubread.Free(); } else { allReadAlignments.SetSequence(intvIndex, smrtRead); } for (int alnIndex = 0; alnIndex < selectedAlignmentPtrs.size(); alnIndex++) { T_AlignmentCandidate * alignment = selectedAlignmentPtrs[alnIndex]; if (alignment->score > params.maxScore) break; AlignSubreadToAlignmentTarget(allReadAlignments, subread, smrtRead, alignment, passDirection, subreadIntervals[intvIndex], intvIndex, params, mappingBuffers, threadOut); if (params.concordantAlignBothDirections) { AlignSubreadToAlignmentTarget(allReadAlignments, subread, smrtRead, alignment, ((passDirection==0)?1:0), subreadIntervals[intvIndex], intvIndex, params, mappingBuffers, threadOut); } } // End of aligning this subread to each selected alignment. subread.Free(); } // End of aligning each subread to where the template subread aligned to. 
for(int alignmentIndex = 0; alignmentIndex < selectedAlignmentPtrs.size(); alignmentIndex++) { if (selectedAlignmentPtrs[alignmentIndex]) delete selectedAlignmentPtrs[alignmentIndex]; } } // End of if startIndex >= 0 and < subreadAlignments.size() } // End of if params.concordant } void MapReadsCCS(MappingData *mapData, MappingBuffers & mappingBuffers, SMRTSequence & smrtRead, SMRTSequence & smrtReadRC, CCSSequence & ccsRead, const bool readIsCCS, MappingParameters & params, const int & associatedRandInt, ReadAlignments & allReadAlignments, ofstream & threadOut) { DNASuffixArray sarray; TupleCountTable ct; SequenceIndexDatabase seqdb; T_GenomeSequence genome; BWT *bwtPtr; mapData->ShallowCopySuffixArray(sarray); mapData->ShallowCopyReferenceSequence(genome); mapData->ShallowCopySequenceIndexDatabase(seqdb); mapData->ShallowCopyTupleCountTable(ct); bwtPtr = mapData->bwtPtr; SeqBoundaryFtr seqBoundary(&seqdb); // // The read must be mapped as a whole, even if it contains subreads. // vector alignmentPtrs; mapData->metrics.numReads++; smrtRead.SubreadStart(0).SubreadEnd(smrtRead.length); smrtReadRC.SubreadStart(0).SubreadEnd(smrtRead.length); MapRead(smrtRead, smrtReadRC, genome, sarray, *bwtPtr, seqBoundary, ct, seqdb, params, mapData->metrics, alignmentPtrs, mappingBuffers, mapData, semaphores); // // Store the mapping quality values. // if (alignmentPtrs.size() > 0 and alignmentPtrs[0]->score < params.maxScore and params.storeMapQV) { StoreMapQVs(smrtRead, alignmentPtrs, params); } // // Select de novo ccs-reference alignments for subreads to align to. // vector selectedAlignmentPtrs = SelectAlignmentsToPrint(alignmentPtrs, params, associatedRandInt); // // Just one sequence is aligned. There is one primary hit, and // all other are secondary. // if (readIsCCS == false or params.useCcsOnly) { // if -noSplitSubreads or -useccsdenovo. // // Record some information for proper SAM Annotation. 
// allReadAlignments.Resize(1); allReadAlignments.AddAlignmentsForSeq(0, selectedAlignmentPtrs); if (params.useCcsOnly) { allReadAlignments.alignMode = CCSDeNovo; } else { allReadAlignments.alignMode = Fullread; } allReadAlignments.SetSequence(0, smrtRead); } else if (readIsCCS) { // if -useccsall or -useccs // Flank alignment candidates to both ends. for(int alignmentIndex = 0; alignmentIndex < selectedAlignmentPtrs.size(); alignmentIndex++) { FlankTAlignedSeq(selectedAlignmentPtrs[alignmentIndex], seqdb, genome, params.flankSize); } // // Align the ccs subread to where the denovo sequence mapped (explode). // CCSIterator ccsIterator; FragmentCCSIterator fragmentCCSIterator; CCSIterator *subreadIterator; // // Choose a different iterator over subreads depending on the // alignment mode. When the mode is allpass, include the // framgents that are not necessarily full pass. // if (params.useAllSubreadsInCcs) { // // Use all subreads even if they are not full pass fragmentCCSIterator.Initialize(&ccsRead, mapData->regionTablePtr); subreadIterator = &fragmentCCSIterator; allReadAlignments.alignMode = CCSAllPass; } else { // Use only full pass reads. ccsIterator.Initialize(&ccsRead); subreadIterator = &ccsIterator; allReadAlignments.alignMode = CCSFullPass; } allReadAlignments.Resize(subreadIterator->GetNumPasses()); int passDirection, passStartBase, passNumBases; SMRTSequence subread; // // The read was previously set to the smrtRead, which was the // de novo ccs sequence. Since the alignments of exploded // reads are reported, the unrolled read should be used as the // reference when printing. // allReadAlignments.read = ccsRead.unrolledRead; subreadIterator->Reset(); int subreadIndex; // // Realign all subreads to selected reference locations. 
// for (subreadIndex = 0; subreadIndex < subreadIterator->GetNumPasses(); subreadIndex++) { int retval = subreadIterator->GetNext(passDirection, passStartBase, passNumBases); assert(retval == 1); if (passNumBases <= params.minReadLength) { continue; } ReadInterval subreadInterval(passStartBase, passStartBase + passNumBases); subread.ReferenceSubstring(ccsRead.unrolledRead, passStartBase, passNumBases-1); subread.CopyTitle(ccsRead.title); // The unrolled alignment should be relative to the entire read. allReadAlignments.SetSequence(subreadIndex, ccsRead.unrolledRead); int alignmentIndex; // // Align this subread to all the positions that the de novo // sequence has aligned to. // for (alignmentIndex = 0; alignmentIndex < selectedAlignmentPtrs.size(); alignmentIndex++) { T_AlignmentCandidate *alignment = selectedAlignmentPtrs[alignmentIndex]; if (alignment->score > params.maxScore) break; AlignSubreadToAlignmentTarget(allReadAlignments, subread, ccsRead.unrolledRead, alignment, passDirection, subreadInterval, subreadIndex, params, mappingBuffers, threadOut); } // End of aligning this subread to where the de novo ccs has aligned to. subread.Free(); } // End of alignining all subreads to where the de novo ccs has aligned to. } // End of if readIsCCS and !params.useCcsOnly // Fix for memory leakage due to undeleted Alignment Candidate objectts not selected // for printing // delete all AC which are in complement of SelectedAlignmemntPtrs vector // namely (SelectedAlignmentPtrs/alignmentPtrs) for (int ii = 0; ii < alignmentPtrs.size(); ii++) { int found =0; for (int jj = 0; jj < selectedAlignmentPtrs.size(); jj++) { if (alignmentPtrs[ii] == selectedAlignmentPtrs[jj] ) { found = 1; break; } } if (found == 0) delete alignmentPtrs[ii]; } } void MapReads(MappingData *mapData) { // // Step 1, initialize local pointers to map data // for programming shorthand. 
// MappingParameters params = mapData->params; DNASuffixArray sarray; TupleCountTable ct; SequenceIndexDatabase seqdb; T_GenomeSequence genome; BWT *bwtPtr; mapData->ShallowCopySuffixArray(sarray); mapData->ShallowCopyReferenceSequence(genome); mapData->ShallowCopySequenceIndexDatabase(seqdb); mapData->ShallowCopyTupleCountTable(ct); bwtPtr = mapData->bwtPtr; SeqBoundaryFtr seqBoundary(&seqdb); int numAligned = 0; SMRTSequence smrtRead, smrtReadRC; SMRTSequence unrolledReadRC; CCSSequence ccsRead; // Print verbose logging to pid.threadid.log for each thread. ofstream threadOut; if (params.verbosity >= 3) { stringstream ss; ss << getpid() << "." << pthread_self(); string threadLogFileName = ss.str() + ".log"; threadOut.open(threadLogFileName.c_str(), ios::out|ios::app); } // // Reuse the following buffers during alignment. Since these keep // storage contiguous, hopefully this will decrease memory // fragmentation. // MappingBuffers mappingBuffers; while (true) { // Fetch reads from a zmw bool readIsCCS = false; AlignmentContext alignmentContext; // Associate each sequence to read in with a determined random int. int associatedRandInt = 0; bool stop = false; vector subreads; bool readsOK = FetchReads(mapData->reader, mapData->regionTablePtr, smrtRead, ccsRead, subreads, params, readIsCCS, alignmentContext.readGroupId, associatedRandInt, stop); if (stop) break; if (not readsOK) continue; if (params.verbosity > 1) { cout << "aligning read: " << endl; smrtRead.PrintSeq(cout); } smrtRead.MakeRC(smrtReadRC); if (readIsCCS) { ccsRead.unrolledRead.MakeRC(unrolledReadRC); } // // When aligning subreads separately, iterate over each subread, and // print the alignments for these. 
// ReadAlignments allReadAlignments; allReadAlignments.read = smrtRead; if (readIsCCS == false and params.mapSubreadsSeparately) { // (not readIsCCS and not -noSplitSubreads) MapReadsNonCCS(mapData, mappingBuffers, smrtRead, smrtReadRC, ccsRead, subreads, params, associatedRandInt, allReadAlignments, threadOut); } // End of if (readIsCCS == false and params.mapSubreadsSeparately). else { // if (readIsCCS or (not readIsCCS and -noSplitSubreads) ) MapReadsCCS(mapData, mappingBuffers, smrtRead, smrtReadRC, ccsRead, readIsCCS, params, associatedRandInt, allReadAlignments, threadOut); } // End of if not (readIsCCS == false and params.mapSubreadsSeparately) PrintAllReadAlignments(allReadAlignments, alignmentContext, *mapData->outFilePtr, *mapData->unalignedFilePtr, params, subreads, #ifdef USE_PBBAM bamWriterPtr, #endif semaphores); allReadAlignments.Clear(); smrtReadRC.Free(); smrtRead.Free(); if (readIsCCS) { ccsRead.Free(); unrolledReadRC.Free(); } numAligned++; if(numAligned % 100 == 0) { mappingBuffers.Reset(); } } // End of while (true). smrtRead.Free(); smrtReadRC.Free(); unrolledReadRC.Free(); ccsRead.Free(); if (params.nProc > 1) { #ifdef __APPLE__ sem_wait(semaphores.reader); sem_post(semaphores.reader); #else sem_wait(&semaphores.reader); sem_post(&semaphores.reader); #endif } if (params.nProc > 1) { pthread_exit(NULL); } threadOut.close(); } int main(int argc, char* argv[]) { // // Configure parameters for refining alignments. // MappingParameters params; ReverseCompressIndex index; pid_t parentPID; pid_t *pids; CommandLineParser clp; clp.SetHelp(BlasrHelp(params)); clp.SetConciseHelp(BlasrConciseHelp()); clp.SetProgramSummary(BlasrSummaryHelp()); clp.SetProgramName("blasr"); clp.SetVersion(GetVersion()); // Register Blasr options. RegisterBlasrOptions(clp, params); // Parse command line args. 
clp.ParseCommandLine(argc, argv, params.readsFileNames); string commandLine; clp.CommandLineToString(argc, argv, commandLine); if (params.printVerboseHelp) { cout << BlasrHelp(params) << endl; exit(0); // Not a failure. } if (params.printDiscussion) { cout << BlasrDiscussion(); exit(0); // Not a failure. } if (argc < 3) { cout << BlasrConciseHelp(); exit(1); // A failure. } int a, b; for (a = 0; a < 5; a++ ) { for (b = 0; b < 5; b++ ){ if (a != b) { SMRTDistanceMatrix[a][b] += params.mismatch; } else { SMRTDistanceMatrix[a][b] += params.match; } } } if (params.scoreMatrixString != "") { if (StringToScoreMatrix(params.scoreMatrixString, SMRTDistanceMatrix) == false) { cout << "ERROR. The string " << endl << params.scoreMatrixString << endl << "is not a valid format. It should be a quoted, space separated string of " << endl << "integer values. The matrix: " << endl << " A C G T N" << endl << " A 1 2 3 4 5" << endl << " C 6 7 8 9 10" << endl << " G 11 12 13 14 15" << endl << " T 16 17 18 19 20" << endl << " N 21 22 23 24 25" << endl << " should be specified as \"1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25\"" << endl; exit(1); } } cerr << "[INFO] " << GetTimestamp() << " [blasr] started." << endl; params.MakeSane(); // // The random number generator is used for subsampling for debugging // and testing consensus and selecting hits when hit policy is random // or randombest. // if (params.useRandomSeed == true) { InitializeRandomGenerator(params.randomSeed); } else { InitializeRandomGeneratorWithTime(); } // // Various aspects of timing are stored here. However this isn't // quite finished. // MappingMetrics metrics; ofstream fullMetricsFile; if (params.fullMetricsFileName != "") { CrucialOpen(params.fullMetricsFileName, fullMetricsFile, std::ios::out); metrics.SetStoreList(); } // // If reading a separate region table, there is a 1-1 correspondence // between region table and bas file. 
// if (params.readSeparateRegionTable) { if (FileOfFileNames::IsFOFN(params.regionTableFileName)) { FileOfFileNames::FOFNToList(params.regionTableFileName, params.regionTableFileNames); } else { params.regionTableFileNames.push_back(params.regionTableFileName); } } if (params.regionTableFileNames.size() != 0 and params.regionTableFileNames.size() != params.queryFileNames.size()) { cout << "Error, there are not the same number of region table files as input files." << endl; exit(1); } // If reading a separate ccs fofn, there is a 1-1 corresponence // between ccs fofn and base file. if (params.readSeparateCcsFofn) { if (FileOfFileNames::IsFOFN(params.ccsFofnFileName)) { FileOfFileNames::FOFNToList(params.ccsFofnFileName, params.ccsFofnFileNames); } else { params.ccsFofnFileNames.push_back(params.ccsFofnFileName); } } if (params.ccsFofnFileNames.size() != 0 and params.ccsFofnFileNames.size() != params.queryFileNames.size()) { cout << "Error, there are not the same number of ccs files as input files." << endl; exit(1); } parentPID = getpid(); SequenceIndexDatabase seqdb; SeqBoundaryFtr seqBoundary(&seqdb); // // Initialize the sequence index database if it used. If it is not // specified, it is initialized by default when reading a multiFASTA // file. // if (params.useSeqDB) { ifstream seqdbin; CrucialOpen(params.seqDBName, seqdbin); seqdb.ReadDatabase(seqdbin); } // // Make sure the reads file exists and can be opened before // trying to read any of the larger data structures. // FASTASequence fastaGenome; T_Sequence genome; FASTAReader genomeReader; // // The genome is in normal FASTA, or condensed (lossy homopolymer->unipolymer) // format. Both may be read in using a FASTA reader. 
// if (!genomeReader.Init(params.genomeFileName)) { cout << "Could not open genome file " << params.genomeFileName << endl; exit(1); } if (params.printSAM or params.printBAM) { genomeReader.computeMD5 = true; } // // If no sequence title database is supplied, initialize one when // reading in the reference, and consider a seqdb to be present. // if (!params.useSeqDB) { genomeReader.ReadAllSequencesIntoOne(fastaGenome, &seqdb); params.useSeqDB = true; } else { genomeReader.ReadAllSequencesIntoOne(fastaGenome); } genomeReader.Close(); // // The genome may have extra spaces in the fasta name. Get rid of those. // VectorIndex t; for (t = 0; t < fastaGenome.titleLength; t++ ){ if (fastaGenome.title[t] == ' ') { fastaGenome.titleLength = t; fastaGenome.title[t] = '\0'; break; } } genome.seq = fastaGenome.seq; genome.length = fastaGenome.length; genome.title = fastaGenome.title; genome.deleteOnExit = false; genome.titleLength = fastaGenome.titleLength; genome.ToUpper(); DNASuffixArray sarray; TupleCountTable ct; int listTupleSize; ofstream outFile; outFile.exceptions(ostream::failbit); ofstream unalignedOutFile; BWT bwt; if (params.useBwt) { if (bwt.Read(params.bwtFileName) == 0) { cout << "ERROR! Could not read the BWT file. " << params.bwtFileName << endl; exit(1); } } else { if (!params.useSuffixArray) { // // There was no explicit specification of a suffix // array on the command line, so build it on the fly here. 
// genome.ToThreeBit(); vector alphabet; sarray.InitThreeBitDNAAlphabet(alphabet); sarray.LarssonBuildSuffixArray(genome.seq, genome.length, alphabet); if (params.minMatchLength > 0) { if (params.anchorParameters.useLookupTable == true) { if (params.lookupTableLength > params.minMatchLength) { params.lookupTableLength = params.minMatchLength; } sarray.BuildLookupTable(genome.seq, genome.length, params.lookupTableLength); } } genome.ConvertThreeBitToAscii(); params.useSuffixArray = 1; } else if (params.useSuffixArray) { if (sarray.Read(params.suffixArrayFileName)) { if (params.minMatchLength != 0) { params.listTupleSize = min(8, params.minMatchLength); } else { params.listTupleSize = sarray.lookupPrefixLength; } if (params.minMatchLength < sarray.lookupPrefixLength) { cerr << "WARNING. The value of -minMatch " << params.minMatchLength << " is less than the smallest searched length of " << sarray.lookupPrefixLength << ". Setting -minMatch to " << sarray.lookupPrefixLength << "." << endl; params.minMatchLength = sarray.lookupPrefixLength; } } else { cout << "ERROR. " << params.suffixArrayFileName << " is not a valid suffix array. " << endl << " Make sure it is generated with the latest version of sawriter." << endl; exit(1); } } } if (params.minMatchLength < sarray.lookupPrefixLength) { cerr << "WARNING. The value of -minMatch " << params.minMatchLength << " is less than the smallest searched length of " << sarray.lookupPrefixLength << ". Setting -minMatch to " << sarray.lookupPrefixLength << "." << endl; params.minMatchLength = sarray.lookupPrefixLength; } // // It is required to have a tuple count table // for estimating the background frequencies // for word matching. // If one is specified on the command line, simply read // it in. If not, this is operating under the mode // that everything is computed from scratch. 
// long l; TupleMetrics saLookupTupleMetrics; if (params.useCountTable) { ifstream ctIn; CrucialOpen(params.countTableName, ctIn, std::ios::in | std::ios::binary); ct.Read(ctIn); saLookupTupleMetrics = ct.tm; } else { saLookupTupleMetrics.Initialize(params.lookupTableLength); ct.InitCountTable(saLookupTupleMetrics); ct.AddSequenceTupleCountsLR(genome); } TitleTable titleTable; if (params.useTitleTable) { ofstream titleTableOut; CrucialOpen(params.titleTableName, titleTableOut); // // When using a sequence index database, the title table is simply copied // from the sequencedb. // if (params.useSeqDB) { titleTable.Copy(seqdb.names, seqdb.nSeqPos-1); titleTable.ResetTableToIntegers(seqdb.names, seqdb.nameLengths, seqdb.nSeqPos-1); } else { // // No seqdb, so there is just one sequence. Still the user specified a title // table, so just the first sequence in the fasta file should be used. // titleTable.Copy(&fastaGenome.title, 1); titleTable.ResetTableToIntegers(&genome.title, &genome.titleLength, 1); fastaGenome.titleLength = strlen(genome.title); } titleTable.Write(titleTableOut); } else { if (params.useSeqDB) { // // When using a sequence index database, but not the titleTable, // it is necessary to truncate the titles at the first space to // be compatible with the way other alignment programs interpret // fasta titles. When printing the title table, there is all // sorts of extra storage space, so the full line is stored. 
// seqdb.SequenceTitleLinesToNames(); } } ostream *outFilePtr = &cout; ofstream outFileStrm; ofstream unalignedFile; ostream *unalignedFilePtr = NULL; ofstream metricsOut, lcpBoundsOut; ofstream anchorFileStrm; ofstream clusterOut, *clusterOutPtr; if (params.anchorFileName != "") { CrucialOpen(params.anchorFileName, anchorFileStrm, std::ios::out); } if (params.clusterFileName != "") { CrucialOpen(params.clusterFileName, clusterOut, std::ios::out); clusterOutPtr = &clusterOut; clusterOut << "total_size p_value n_anchors read_length align_score read_accuracy anchor_probability min_exp_anchors seq_length" << endl; } else { clusterOutPtr = NULL; } if (params.outFileName != "") { if (not params.printBAM) { CrucialOpen(params.outFileName, outFileStrm, std::ios::out); outFilePtr = &outFileStrm; } // otherwise, use bamWriter and initialize it later } if (params.printHeader) { switch(params.printFormat) { case(SummaryPrint): SummaryOutput::PrintHeader(*outFilePtr); break; case(Interval): IntervalOutput::PrintHeader(*outFilePtr); break; case(CompareSequencesParsable): CompareSequencesOutput::PrintHeader(*outFilePtr); break; } } if (params.printUnaligned == true) { CrucialOpen(params.unalignedFileName, unalignedFile, std::ios::out); unalignedFilePtr = &unalignedFile; } if (params.metricsFileName != "") { CrucialOpen(params.metricsFileName, metricsOut); } if (params.lcpBoundsFileName != "") { CrucialOpen(params.lcpBoundsFileName, lcpBoundsOut); // lcpBoundsOut << "pos depth width lnwidth" << endl; } // // Configure the mapping database. // MappingData *mapdb = new MappingData[params.nProc]; int procIndex; pthread_attr_t *threadAttr = new pthread_attr_t[params.nProc]; // MappingSemaphores semaphores; // // When there are multiple processes running along, sometimes there // are semaphores to worry about. 
// if (params.nProc > 1) { semaphores.InitializeAll(); } for (procIndex = 0; procIndex < params.nProc; procIndex++ ){ pthread_attr_init(&threadAttr[procIndex]); } // // Start the mapping jobs. // int readsFileIndex = 0; if (params.subsample < 1) { InitializeRandomGeneratorWithTime(); reader = new ReaderAgglomerate(params.subsample); } else { reader = new ReaderAgglomerate(params.startRead, params.stride); } // In case the input is fasta, make all bases in upper case. reader->SetToUpper(); regionTableReader = new HDFRegionTableReader; RegionTable regionTable; // // Store lists of how long it took to map each read. // metrics.clocks.SetStoreList(true); if (params.useCcs) { reader->UseCCS(); } string commandLineString; // Restore command. clp.CommandLineToString(argc, argv, commandLineString); if (params.printSAM or params.printBAM) { string so = "UNKNOWN"; // sorting order; string version = GetVersion(); //blasr version; SAMHeaderPrinter shp(so, seqdb, params.queryFileNames, params.queryReadType, params.samQVList, "BLASR", version, commandLineString); string headerString = shp.ToString();// SAM/BAM header if (params.printSAM) { *outFilePtr << headerString; } else if (params.printBAM) { #ifdef USE_PBBAM PacBio::BAM::BamHeader header = PacBio::BAM::BamHeader(headerString); // Both file name and SAMHeader are required in order to create a BamWriter. bamWriterPtr = new PacBio::BAM::BamWriter(params.outFileName, header); #else REQUIRE_PBBAM_ERROR(); #endif } } for (readsFileIndex = 0; readsFileIndex < params.queryFileNames.size(); readsFileIndex++ ){ params.readsFileIndex = readsFileIndex; // // Configure the reader to use the correct read and region // file names. // reader->SetReadFileName(params.queryFileNames[params.readsFileIndex]); // // Initialize using already set file names. // int initReturnValue = reader->Initialize(); if (initReturnValue <= 0) { cerr << "WARNING! 
Could not open file " << params.queryFileNames[params.readsFileIndex] << endl; continue; } // Check whether use ccs only. if (reader->GetFileType() == HDFCCSONLY) { params.useAllSubreadsInCcs = false; params.useCcs = params.useCcsOnly = true; } string changeListIdString; reader->hdfBasReader.GetChangeListID(changeListIdString); ChangeListID changeListId(changeListIdString); params.qvScaleType = DetermineQVScaleFromChangeListID(changeListId); if (reader->FileHasZMWInformation() and params.useRegionTable) { if (params.readSeparateRegionTable) { if (regionTableReader->Initialize(params.regionTableFileNames[params.readsFileIndex]) == 0) { cout << "ERROR! Could not read the region table " << params.regionTableFileNames[params.readsFileIndex] <HasRegionTable()) { if (regionTableReader->Initialize(params.queryFileNames[params.readsFileIndex]) == 0) { cout << "ERROR! Could not read the region table " << params.queryFileNames[params.readsFileIndex] <ReadTable(regionTable); regionTableReader->Close(); } // // Check to see if there is a separate ccs fofn. If there is a separate // ccs fofn, use that over the one in the bas file. // //if (params.readSeparateCcsFofn and params.useCcs) { // if (reader->SetCCS(params.ccsFofnFileNames[params.readsFileIndex]) == 0) { // cout << "ERROR! Could not read the ccs file " // << params.ccsFofnFileNames[params.readsFileIndex] << endl; // exit(1); // } // } if (reader->GetFileType() != HDFCCS and reader->GetFileType() != HDFBase and reader->GetFileType() != HDFPulse and reader->GetFileType() != BAM and params.concordant) { cerr << "WARNING! Option concordant is only enabled when " << "input reads are in PacBio bax/pls.h5 or bam format." 
<< endl; params.concordant = false; } #ifdef USE_GOOGLE_PROFILER char *profileFileName = getenv("CPUPROFILE"); if (profileFileName != NULL) { ProfilerStart(profileFileName); } else { ProfilerStart("google_profile.txt"); } #endif assert (initReturnValue > 0); if (params.nProc == 1) { mapdb[0].Initialize(&sarray, &genome, &seqdb, &ct, &index, params, reader, ®ionTable, outFilePtr, unalignedFilePtr, &anchorFileStrm, clusterOutPtr); mapdb[0].bwtPtr = &bwt; if (params.fullMetricsFileName != "") { mapdb[0].metrics.SetStoreList(true); } if (params.lcpBoundsFileName != "") { mapdb[0].lcpBoundsOutPtr = &lcpBoundsOut; } else { mapdb[0].lcpBoundsOutPtr = NULL; } MapReads(&mapdb[0]); metrics.Collect(mapdb[0].metrics); } else { pthread_t *threads = new pthread_t[params.nProc]; for (procIndex = 0; procIndex < params.nProc; procIndex++ ){ // // Initialize thread-specific parameters. // mapdb[procIndex].Initialize(&sarray, &genome, &seqdb, &ct, &index, params, reader, ®ionTable, outFilePtr, unalignedFilePtr, &anchorFileStrm, clusterOutPtr); mapdb[procIndex].bwtPtr = &bwt; if (params.fullMetricsFileName != "") { mapdb[procIndex].metrics.SetStoreList(true); } if (params.lcpBoundsFileName != "") { mapdb[procIndex].lcpBoundsOutPtr = &lcpBoundsOut; } else { mapdb[procIndex].lcpBoundsOutPtr = NULL; } if (params.outputByThread) { ofstream *outPtr =new ofstream; mapdb[procIndex].outFilePtr = outPtr; stringstream outNameStream; outNameStream << params.outFileName << "." 
<< procIndex; mapdb[procIndex].params.outFileName = outNameStream.str(); CrucialOpen(mapdb[procIndex].params.outFileName, *outPtr, std::ios::out); } pthread_create(&threads[procIndex], &threadAttr[procIndex], (void* (*)(void*))MapReads, &mapdb[procIndex]); } for (procIndex = 0; procIndex < params.nProc; procIndex++) { pthread_join(threads[procIndex], NULL); } for (procIndex = 0; procIndex < params.nProc; procIndex++) { metrics.Collect(mapdb[procIndex].metrics); if (params.outputByThread) { delete mapdb[procIndex].outFilePtr; } } if (threads) { delete threads; threads = NULL; } } reader->Close(); } if (!reader) {delete reader; reader = NULL;} fastaGenome.Free(); #ifdef USE_GOOGLE_PROFILER ProfilerStop(); #endif if (mapdb != NULL) { delete[] mapdb; } if (threadAttr != NULL) { delete[] threadAttr; } seqdb.FreeDatabase(); if (regionTableReader) { delete regionTableReader; } if (params.metricsFileName != "") { metrics.PrintSummary(metricsOut); } if (params.fullMetricsFileName != "") { metrics.PrintFullList(fullMetricsFile); } if (params.outFileName != "") { if (params.printBAM) { #ifdef USE_PBBAM assert(bamWriterPtr); try { bamWriterPtr->TryFlush(); delete bamWriterPtr; bamWriterPtr = NULL; } catch (std::exception e) { cout << "Error, could not flush bam records to bam file." << endl; exit(1); } #else REQUIRE_PBBAM_ERROR(); #endif } else { outFileStrm.close(); } } cerr << "[INFO] " << GetTimestamp() << " [blasr] ended." << endl; return 0; } blasr-8e668beae0dda1da6914586fb458182c6c3c7482/LICENSES.txt000066400000000000000000000031211260737656700216230ustar00rootroot00000000000000Copyright (c) 2011-2015, Pacific Biosciences of California, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Pacific Biosciences nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. blasr-8e668beae0dda1da6914586fb458182c6c3c7482/README.INSTALL.md000066400000000000000000000046121260737656700222470ustar00rootroot00000000000000## Installation ### Download source code * To pull this project from git hub to your local system: git clone git://github.com/PacificBiosciences/blasr.git blasr * To sync your code with the latest git code base: git pull -u origin master && git submodule update --init ### Requirements * To configure: ./configure.py --shared --sub --no-pbbam * or with HDF5 directories (and note that `HDF5_LIB` is a *directory* here): ./configure.py --shared --sub --no-pbbam HDF5_INCLUDE=... HDF5_LIB=... 
To build BLASR, you must have hdf 1.8.12 or above installed and configured with c++ support (you should have the library libhdf5_cpp.a). If you are installing the entire PacBio secondary analysis software suite, appropriate hdf libraries are already distributed and no configuration is necessary. Otherwise, it is necessary to set two environment variables: + **HDF5_INCLUDE**, which points to the directory of the HDF5 headers (e.g., hdf5.h) + **HDF5_LIB**, which points to the HDF5 library directory (e.g., hdf5*.a, and hdf5*.so) You may pass arguments to `configure.py` as above, or you may export them from the command line: export HDF5_INC=path_to_your_hdf5_include && export HDF5_LIB=path_to_your_hdf5_lib ### Build * To make the 'libcpp' libraries: make build-submodule * To make 'blasr' only: make blasr * To compile all tools, including blasr, pls2fasta, loadPulses, sawriter: make * Frequently used executables will be under utils. * To test (with **cram** installed): #make cramtests make cramfast ## Currently: ## Ran 22 tests, 0 skipped, 4 failed. * To clean all compiled tools and lib: make cleanall * To clean compiled tools without cleaning lib: make clean make blasr ./blasr ## Other issues ### Static binaries If you want static binaries, drop `--shared` when you run configure.py. In that case, you might need to pass `-lsz` to make, if you built HDF5 with szlib support (`--with-szlib`). ./configure.py --with-szlib ... See [our issues](https://github.com/PacificBiosciences/blasr/issues/113#issuecomment-143981496). If you have macosx (Darwin), then you almost certainly want non-static binaries (--shared). ### blasr_libcpp If you have built and installed blasr_libcpp elsewhere, then drop `--sub` and do not run `make build-submodule`. blasr-8e668beae0dda1da6914586fb458182c6c3c7482/README.MANUAL.md000066400000000000000000000022251260737656700221140ustar00rootroot00000000000000## Running BLASR Typing 'blasr -h' or 'blasr -help' on the command line will give you a list of options.
At the least, provide a fasta, fastq, or bas.h5 file, and a genome. ### Some typical use cases Align reads from reads.bas.h5 to ecoli_K12 genome, and output in SAM format. blasr reads.bas.h5 ecoli_K12.fasta -sam Same as above, but with soft clipping blasr reads.bas.h5 ecoli_K12.fasta -sam -clipping soft Use multiple threads blasr reads.bas.h5 ecoli_K12.fasta -sam -clipping soft -out alignments.sam -nproc 16 Include a larger minimal match, for faster but less sensitive alignments blasr reads.bas.h5 ecoli_K12.fasta -sam -clipping soft -minMatch 15 Produce alignments in a pairwise human readable format blasr reads.bas.h5 ecoli_K12.fasta -m 0 Use a precomputed suffix array for faster startup sawriter hg19.fasta.sa hg19.fasta #First precompute the suffix array blasr reads.bas.h5 hg19.fasta -sa hg19.fasta.sa Use a precomputed BWT-FM index for smaller runtime memory footprint, but slower alignments. sa2bwt hg19.fasta hg19.fasta.sa hg19.fasta.bwt blasr reads.bas.h5 hg19.fasta -bwt hg19.fasta.bwt blasr-8e668beae0dda1da6914586fb458182c6c3c7482/README.md000066400000000000000000000001341260737656700211350ustar00rootroot00000000000000See Blasr [README.INSTALL.md](README.INSTALL.md) and [README.MANUAL.md](README.MANUAL.md). blasr-8e668beae0dda1da6914586fb458182c6c3c7482/configure.py000077500000000000000000000206541260737656700222250ustar00rootroot00000000000000#!/usr/bin/env python """Configure the build. 
- Create defines.mk """ import commands import contextlib import optparse import os import sys import warnings #DEFAULTCXXFLAG := -O3 #DEBUGCXXFLAG := -g -ggdb -fno-inline #PROFILECXXFLAG := -Os -pg #GCXXFLAG := -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free -fno-omit-frame-pointer ROOT = os.path.abspath(os.path.dirname(__file__)) def log(msg): sys.stderr.write(msg) sys.stderr.write('\n') def shell(cmd): log('`%s`'%cmd) status, output = commands.getstatusoutput(cmd) if status: raise Exception('%d <- %r' %(status, cmd)) log(output) return output def system(cmd): log(cmd) status = os.system(cmd) if status: raise Exception('%d <- %r' %(status, cmd)) return def mkdirs(path): if not os.path.isdir(path): os.makedirs(path) @contextlib.contextmanager def cd(nwd): cwd = os.getcwd() log('cd %r -> %r' %(cwd, nwd)) os.chdir(nwd) yield os.chdir(cwd) log('cd %r <- %r' %(cwd, nwd)) def update_content(fn, content): current_content = open(fn).read() if os.path.exists(fn) else None if content != current_content: log('writing to %r:' %fn) log('"""\n' + content + '"""\n----') open(fn, 'w').write(content) def get_OS_STRING(): G_BUILDOS_CMD = """bash -c 'set -e; set -o pipefail; id=$(lsb_release -si | tr "[:upper:]" "[:lower:]"); rel=$(lsb_release -sr); case $id in ubuntu) printf "$id-%04d\n" ${rel/./};; centos) echo "$id-${rel%%.*}";; *) echo "$id-$rel";; esac' 2>/dev/null""" return shell(G_BUILDOS_CMD) def get_PREBUILT(): cmd = 'cd ../../../../prebuilt.out 2>/dev/null && pwd || echo -n notfound' return shell(cmd) def ifenvf(env, key, func): if key in env: return env[key] else: return func() def setifenvf(envout, envin, key, func): envout[key] = ifenvf(envin, key, func) def setifenv(envout, envin, key, val): envout[key] = envin.get(key, val) def setenv(envout, key, val): envout[key] = val def update_env_if(envout, envin, keys): for key in keys: if key in envin: envout[key] = envin[key] def compose_defs_env(env): # We disallow env overrides for anything 
with a default from GNU make. nons = ['CXX', 'CC', 'AR'] # 'SHELL'? ovr = ['%-20s ?= %s' %(k, v) for k,v in env.items() if k not in nons] nonovr = ['%-20s := %s' %(k, v) for k,v in env.items() if k in nons] return '\n'.join(ovr + nonovr + ['']) def compose_defines_pacbio(envin): """ This is used by mobs via buildcntl.sh. """ env = dict() setenv(env, 'SHELL', 'bash') #setifenvf(env, envin, 'OS_STRING', get_OS_STRING) #setifenvf(env, envin, 'PREBUILT', get_PREBUILT) nondefaults = set([ 'CXX', 'BLASR_INC', 'LIBPBDATA_INC', 'LIBPBDATA_LIB', 'LIBPBDATA_LIBFLAGS', 'LIBPBIHDF_INC', 'LIBPBIHDF_LIB', 'LIBPBIHDF_LIBFLAGS', 'LIBBLASR_INC', 'LIBBLASR_LIB', 'LIBBLASR_LIBFLAGS', 'HDF5_INC', 'HDF5_LIB', 'HDF5_LIBFLAGS', 'PBBAM_INC', 'PBBAM_LIB', 'PBBAM_LIBFLAGS', 'HTSLIB_INC', 'HTSLIB_LIB', 'HTSLIB_LIBFLAGS', 'BOOST_INC', 'GCC_LIB', 'ZLIB_LIB', 'ZLIB_LIBFLAGS', 'SZLIB_LIB', 'SZLIB_LIBFLAGS', 'PTHREAD_LIBFLAGS', 'DL_LIBFLAGS', 'RT_LIBFLAGS', ]) update_env_if(env, envin, nondefaults) return compose_defs_env(env) def configure_pacbio(envin, shared, build_dir): content1 = compose_defines_pacbio(envin) if not shared: content1 += 'LDFLAGS+=-static\n' update_content(os.path.join(build_dir, 'defines.mk'), content1) def set_defs_submodule_defaults(env, nopbbam): subdir = os.path.join(ROOT, 'libcpp') defaults = { 'LIBPBDATA_INC': os.path.join(subdir, 'pbdata'), 'LIBBLASR_INC': os.path.join(subdir, 'alignment'), #'LIBPBIHDF_INC': '' if nopbbam else os.path.join(subdir, 'hdf'), 'LIBPBDATA_LIB': os.path.join(subdir, 'pbdata'), 'LIBBLASR_LIB': os.path.join(subdir, 'alignment'), #'LIBPBIHDF_LIB': '' if nopbbam else os.path.join(subdir, 'hdf'), } for k in defaults: if k not in env: env[k] = defaults[k] def update_defaults_for_os(env): OS = shell('uname') if 'Darwin' in OS: #-lsz (for static builds?) 
env['RT_LIBFLAGS'] = '' def set_defs_defaults(env, nopbbam, with_szlib): defaults = { 'BLASR_INC': os.path.join(ROOT, 'include'), 'LIBBLASR_INC': os.path.join(ROOT, 'libcpp', 'alignment'), 'LIBPBDATA_INC': os.path.join(ROOT, 'libcpp', 'pbdata'), 'LIBPBIHDF_INC': os.path.join(ROOT, 'libcpp', 'hdf'), 'LIBBLASR_LIB': os.path.join(ROOT, 'libcpp', 'alignment'), 'LIBPBDATA_LIB': os.path.join(ROOT, 'libcpp', 'pbdata'), 'LIBPBIHDF_LIB': os.path.join(ROOT, 'libcpp', 'hdf'), 'LIBBLASR_LIBFLAGS': '-lblasr', 'LIBPBDATA_LIBFLAGS': '-lpbdata', 'LIBPBIHDF_LIBFLAGS': '-lpbihdf', 'HDF5_LIBFLAGS': '-lhdf5_cpp -lhdf5', 'RT_LIBFLAGS': '-lrt', 'ZLIB_LIBFLAGS': '-lz', 'PTHREAD_LIBFLAGS': '-lpthread', 'DL_LIBFLAGS': '-ldl', # neeeded by HDF5 always 'SHELL': 'bash -xe', } try: update_defaults_for_os(defaults) except Exception as e: warnings.warn(e) #setifenvf(defaults, env, 'OS_STRING', get_OS_STRING) #setifenvf(defaults, env, 'PREBUILT', get_PREBUILT) pbbam_defaults = { 'PBBAM_LIBFLAGS': '-lpbbam', 'HTSLIB_LIBFLAGS': '-lhts', 'ZLIB_LIBFLAGS': '-lz', #'PTHREAD_LIBFLAGS': '-lpthread', #'DL_LIBFLAGS': '-ldl', # neeeded by HDF5 always } if not nopbbam: defaults.update(pbbam_defaults) szlib_defaults = { 'SZLIB_LIBFLAGS': '-lsz', #'ZLIB_LIBFLAGS': '-lz', # probably needed, but provided elsewhere } if with_szlib: defaults.update(szlib_defaults) for k in defaults: if k not in env: env[k] = defaults[k] def get_make_style_env(envin, args): envout = dict() for arg in args: if '=' in arg: k, v = arg.split('=') envout[k] = v envout.update(envin) return envout def parse_args(args): parser = optparse.OptionParser() parser.add_option('--no-pbbam', action='store_true', help='Avoid compiling anything which would need pbbam.') parser.add_option('--with-szlib', action='store_true', help='If HDF5 was built with --with-szlib, then -lz is needed for static binaries.') parser.add_option('--submodules', action='store_true', help='Set variables to use our git-submodules, which must be pulled and built first. 
(Implies --no-pbbam.)') parser.add_option('--shared', action='store_true', help='Build for dynamic linking. (Non-static binaries.)') parser.add_option('--build-dir', help='Can be different from source directory, but only when *not* also building submodule.') return parser.parse_args(list(args)) def symlink_makefile(build_dir_root, src_dir_root, makefilename, relpath): src_dir = os.path.join(src_dir_root, relpath) build_dir = os.path.join(build_dir_root, relpath) src_name = os.path.join(src_dir, 'makefile') dst_name = os.path.join(build_dir, 'makefile') if os.path.lexists(dst_name): os.unlink(dst_name) print('%r <- %r' %(src_name, dst_name)) mkdirs(build_dir) os.symlink(src_name, dst_name) def symlink_makefiles(build_dir): symlink_makefile(build_dir, ROOT, 'makefile', '.') symlink_makefile(build_dir, ROOT, 'makefile', 'utils') symlink_makefile(build_dir, ROOT, 'makefile', 'extrautils') def main(prog, *args): """We are still deciding what env-vars to use, if any. """ # Set up an alias, until everything uses one consistently. conf, makevars = parse_args(args) if conf.build_dir is not None: symlink_makefiles(conf.build_dir) else: conf.build_dir = '.' conf.build_dir = os.path.abspath(conf.build_dir) envin = get_make_style_env(os.environ, makevars) if 'HDF5_INCLUDE' in envin and 'HDF5_INC' not in envin: envin['HDF5_INC'] = envin['HDF5_INCLUDE'] if conf.submodules: set_defs_submodule_defaults(envin, conf.no_pbbam) conf.no_pbbam = True set_defs_defaults(envin, conf.no_pbbam, conf.with_szlib) configure_pacbio(envin, conf.shared, conf.build_dir) if __name__=="__main__": main(*sys.argv) blasr-8e668beae0dda1da6914586fb458182c6c3c7482/ctest/000077500000000000000000000000001260737656700210025ustar00rootroot00000000000000blasr-8e668beae0dda1da6914586fb458182c6c3c7482/ctest/affineAlign.t000066400000000000000000000012741260737656700233760ustar00rootroot00000000000000Set up $ . 
$TESTDIR/setup.sh Test affineAlign $ rm -rf $OUTDIR/affineAlign.m0 $ $EXEC $DATDIR/affineAlign.fofn $DATDIR/substr_with_ins.fasta -m 0 -out $OUTDIR/affineAlign.m0 -affineAlign -holeNumbers 493 -insertion 100 -deletion 100 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/affineAlign.m0 $STDDIR/affineAlign_2014_06_10.m0 $ rm -rf $OUTDIR/ecoli_affine.m0 $ $EXEC $DATDIR/ecoli_affine.fasta $DATDIR/ecoli_reference.fasta -m 0 -out $OUTDIR/ecoli_affine.m0 -affineAlign -insertion 100 -deletion 100 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/ecoli_affine.m0 $STDDIR/ecoli_affine_2014_06_10.m0 # Note that MapQV for -affineAlign has been fixed in 2014 04 18, bug 24363 blasr-8e668beae0dda1da6914586fb458182c6c3c7482/ctest/aggressiveIntervalCut.t000066400000000000000000000007121260737656700255070ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test -aggressiveIntervalCut. $ rm -f $TMP1 $ BASFILE=/mnt/data3/vol53/2450598/0001/Analysis_Results/m130812_185809_42141_c100533960310000001823079711101380_s1_p0.bas.h5 $ REFFA=/mnt/secondary/Smrtpipe/repository/Ecoli_BL21_O26/sequence/Ecoli_BL21_O26.fasta $ $EXEC $BASFILE $REFFA -holeNumbers 1-100 -out $TMP1 -aggressiveIntervalCut [INFO] * [blasr] started. (glob) [INFO] * [blasr] ended. (glob) $ echo $? 0 blasr-8e668beae0dda1da6914586fb458182c6c3c7482/ctest/alignScore.t000066400000000000000000000004121260737656700232520ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test alignment score $ rm -rf $OUTDIR/testscore.m0 $ $EXEC $DATDIR/read.fasta $DATDIR/ref.fasta -minReadLength 1 -m 0 -out $OUTDIR/testscore.m0 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/testscore.m0 $STDDIR/testscore.m0 blasr-8e668beae0dda1da6914586fb458182c6c3c7482/ctest/bamIn.t000066400000000000000000000043571260737656700222260ustar00rootroot00000000000000Set up $ . 
$TESTDIR/setup.sh Test using bam as input $ $EXEC $DATDIR/test_bam/tiny_bam.fofn $DATDIR/lambda_ref.fasta -m 4 -out $OUTDIR/tiny_bam_in.m4 [INFO]* (glob) [INFO]* (glob) Check whether blasr produces identical results taking fasta sequences of the bam as input $ $EXEC $DATDIR/test_bam/tiny_fasta.fofn $DATDIR/lambda_ref.fasta -m 4 -out $OUTDIR/tiny_fasta_in.m4 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/tiny_bam_in.m4 $OUTDIR/tiny_fasta_in.m4 Test bam in, sam out $ $EXEC $DATDIR/test_bam/tiny_bam.fofn $DATDIR/lambda_ref.fasta -sam -out $OUTDIR/tiny_bam_in.sam -printSAMQV -clipping subread -cigarUseSeqMatch [INFO]* (glob) [INFO]* (glob) Test bam in, bam out $ $EXEC $DATDIR/test_bam/tiny_bam.fofn $DATDIR/lambda_ref.fasta -bam -out $OUTDIR/tiny_bam_in.bam -clipping subread [INFO]* (glob) [INFO]* (glob) Check whether sam out and bam out have identical alignments, not checking qvs $ $SAMTOOLS view -h $OUTDIR/tiny_bam_in.bam -o $OUTDIR/tiny_bam_in.bam.sam $ cut -f 2-11 $OUTDIR/tiny_bam_in.bam.sam |sed -n '6,$p' > $TMP1.aln $ cut -f 2-11 $OUTDIR/tiny_bam_in.sam |sed -n '6,$p' > $TMP2.aln $ diff $TMP1.aln $TMP2.aln Check whether sam out and bam out have identical read groups @RG $ awk '/^@RG/' $OUTDIR/tiny_bam_in.bam.sam > $TMP1.rg $ awk '/^@RG/' $OUTDIR/tiny_bam_in.sam > $TMP2.rg $ diff $TMP1.rg $TMP2.rg Compare iq produced with stdout $ sed -n '6,$p' $OUTDIR/tiny_bam_in.bam.sam | awk '{gsub(/\t/,"\n");}1' | awk '/^iq:Z:/' > $TMP1.iq $ sed -n '6,$p' $STDDIR/$UPDATEDATE/tiny_bam_in.bam.sam | awk '{gsub(/\t/,"\n");}1' | awk '/^iq:Z:/' > $TMP2.iq $ diff $TMP1.iq $TMP2.iq TODO:Check whether sam out and bam out have identical insertion qvs Currently QVs in bam are in 'native' orientation, and QVs in sam are in 'genomic' orientation. This needs to be fixed. 
$ sed -n '6,$p' $OUTDIR/tiny_bam_in.sam | awk '{gsub(/\t/,"\n");}1' | awk '/^iq:Z:/' > $TMP2.iq Test with multiple nproc $ $EXEC $DATDIR/test_bam/two_bam.fofn $DATDIR/lambda_ref.fasta -bam -nproc 15 -out $OUTDIR/two_bam_in.bam [INFO]* (glob) [INFO]* (glob) $ $SAMTOOLS view -h $OUTDIR/two_bam_in.bam -o $OUTDIR/two_bam_in.bam.sam TODO: test -concordant, when pbbam API to query over ZMWs is available. TODO: test bam with ccs reads blasr-8e668beae0dda1da6914586fb458182c6c3c7482/ctest/bamOut.t000066400000000000000000000021111260737656700224110ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test generating bam output Input is bam, clipping=soft and subread should produce identical results $ $EXEC $DATDIR/test_bam/tiny_bam.fofn $DATDIR/lambda_ref.fasta -bam -out $OUTDIR/tiny_bam_in_soft.bam -clipping soft [INFO]* (glob) [INFO]* (glob) $ $EXEC $DATDIR/test_bam/tiny_bam.fofn $DATDIR/lambda_ref.fasta -bam -out $OUTDIR/tiny_bam_in_subread.bam -clipping subread [INFO]* (glob) [INFO]* (glob) $ $SAMTOOLS view $OUTDIR/tiny_bam_in_soft.bam | sed -n '6,$p' > $TMP1.bam_in_soft $ $SAMTOOLS view $OUTDIR/tiny_bam_in_subread.bam | sed -n '6,$p' > $TMP2.bam_in_subread $ diff $TMP1.bam_in_soft $TMP2.bam_in_subread Test if bam cigar strings are correct $ head -2 $TMP1.bam_in_soft |cut -f 6 25=1I28=1I41=1I5=1D6=1X12=1I15=1I2=1I16=1D10=1I11=1I74=1D12=1D7=3I4=1I6=1D1=2D14=1D16=1I8=1D4=1D5=1D20=1I3=1I10=1I37=1I13=1I25=1I15=1I7=1I11=1I3=2I1=1I16=1I6=1I8=1I11=1X1=1I5=1I56=1I17= 28=1D7=1I1=1I9=2I12=1I3=1D13=1I15=1I2=1X49=1I19=1I14=1I5=1D17=1D20=1D86=1I21=1I9=1I24=1I6=1I1=1I2=1D11=1D4=1D3=1D31=1D6=1I6=1I9=1I57=2I24=1I26=1I8=1I43=1S blasr-8e668beae0dda1da6914586fb458182c6c3c7482/ctest/bug25328.t000066400000000000000000000005521260737656700223520ustar00rootroot00000000000000Set up $ . 
$TESTDIR/setup.sh bug_25328, unrolled resequencing test $ INFA=$DATDIR/bug_25328_zmw_38131.fasta $ REF=$DATDIR/All4mers_circular_72x_l50256.fasta $ OUTFA=$OUTDIR/bug_25328.m4 $ $EXEC $INFA $REF -bestn 1 -nCandidates 1 -forwardOnly -maxMatch 14 -m 4 -out $OUTFA [INFO]* (glob) [INFO]* (glob) $ awk '$7-$6 >= 15000' $OUTFA |wc -l 1 blasr-8e668beae0dda1da6914586fb458182c6c3c7482/ctest/bug25741.t000066400000000000000000000006111260737656700223450ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh bug_25741, if input bas.h5 does not contain mergeQV, blasr with -printSAMQV, -nproc>1 should not write garbage 'mq' values to output. $ $EXEC $DATDIR/bas_wo_mergeQV.fofn $DATDIR/lambda_ref.fasta -printSAMQV -sam -clipping subread -out $OUTDIR/out_printSAMQV.sam -nproc 12 [INFO]* (glob) [INFO]* (glob) $ grep 'mq' $OUTDIR/out_printSAMQV.sam |wc -l 1 blasr-8e668beae0dda1da6914586fb458182c6c3c7482/ctest/bug25766.t000066400000000000000000000005061260737656700223570ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh bug_25766, added an option -minRawSubreadScore $ BASFILE=$DATDIR/lambda_bax.fofn $ REF=$DATDIR/lambda_ref.fasta $ $EXEC $BASFILE $REF -out $TMP1 -minRawSubreadScore 700 -nproc 18 [INFO]* (glob) [INFO]* (glob) $ echo $? 0 $ sort $TMP1 > $TMP2 $ diff $TMP2 $STDDIR/bug_25766.m4 blasr-8e668beae0dda1da6914586fb458182c6c3c7482/ctest/ccsH5.t000066400000000000000000000006441260737656700221400ustar00rootroot00000000000000Set up $ . 
$TESTDIR/setup.sh Test using *.ccs.h5 as input # The results should be exactly the same as # blasr $DATDIR/ccsasinput_bas.fofn $DATDIR/ccsasinput.fasta -m 4 -out tmp.m4 -useccsdenovo $ rm -rf $OUTDIR/ccsasinput.m4 $ $EXEC $DATDIR/ccsasinput.fofn $DATDIR/ccsasinput.fasta -m 4 -out $OUTDIR/ccsasinput.m4 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/ccsasinput.m4 $STDDIR/ccsasinput_2014_06_10.m4 blasr-8e668beae0dda1da6914586fb458182c6c3c7482/ctest/concordant.t000066400000000000000000000035331260737656700233250ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test -concordant $ rm -rf $OUTDIR/concordant_subset.sam $ $EXEC $DATDIR/ecoli_lp.fofn $DATDIR/ecoli_reference.fasta -concordant -sam -out $OUTDIR/concordant_subset.sam -nproc 12 -holeNumbers 1-10000 -sa $DATDIR/ecoli_reference.sa [INFO]* (glob) [INFO]* (glob) $ sed -n 6,110864p $OUTDIR/concordant_subset.sam > $OUTDIR/tmp1 $ sort $OUTDIR/tmp1 > $OUTDIR/tmp11 $ sed -n 6,110864p $STDDIR/$UPDATEDATE/concordant_subset.sam > $OUTDIR/tmp2 $ sort $OUTDIR/tmp2 > $OUTDIR/tmp22 $ diff $OUTDIR/tmp11 $OUTDIR/tmp22 $ rm -rf $OUTDIR/tmp1 $OUTDIR/tmp2 $OUTDIR/tmp11 $OUTDIR/tmp22 #2014_05_28 --> changelist 135254, use MAX_BAND_SIZE to constrain GuidedAlign #2014_08_21 --> changelist 138516, added YS, YE, ZM tags. #2014_08_28 --> changelist 139176, update SAM MD5 #2014_09_12 --> changelist 140410, changed the default value of '-concordantTemplate' from 'longestsubread' to 'typicalsubread' #2014_09_17 --> changelist 140573, changed SDPFragment LessThan to make sure blasr compiled with gcc 4.4 and 4.8 can produce identical results.
#2014_10_16 --> changelist 141378, changed the default value of '-concordantTemplate' from 'typicalsubread' to 'mediansubread' #2015_03_01 --> changelist 146599, reads from the same movie should have unique readGroupId #2015_03_28 --> changelist 148101, 148080 updated read group id, 148100 updated TLEN #2015_04_09 --> changelist 148796, updated read group id #2015_04_25 --> changelist 149721, update CIGAR string, replace M with X=. Test -concordant FMR1 case (the 'typical subread' is selected as template for concordant mapping) $ FOFN=$DATDIR/FMR1_concordant.fofn $ REF=$DATDIR/FMR1_130CGG.fasta $ $EXEC $FOFN $REF -concordant -out $OUTDIR/FMR1_zmw_37927.m4 -m 4 -holeNumbers 37927 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/FMR1_zmw_37927.m4 $STDDIR/FMR1_zmw_37927.m4 blasr-8e668beae0dda1da6914586fb458182c6c3c7482/ctest/ecoli.t000066400000000000000000000014761260737656700222720ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test blasr on ecoli. Test blasr with -sam # The following job takes a very long time to finish, let us use a subset of reads instead #See $STDOUT/ecoli_v1.4.sam for 1.4 output. # $STDOUT/ecoli_2014_03_28.sam for bug before mapQV for affineAlign/align without QV is fixed. $ rm -rf $OUTDIR/ecoli_subset.sam $ $EXEC $DATDIR/ecoli_subset.fasta $DATDIR/ecoli_reference.fasta -sam -out $OUTDIR/ecoli_subset.sam -nproc 15 [INFO]* (glob) [INFO]* (glob) $ sed -n '5,$ p' $OUTDIR/ecoli_subset.sam | sort | cut -f 1-11 > $TMP1 $ sed -n '5,$ p' $STDDIR/$UPDATEDATE/ecoli_subset.sam | sort | cut -f 1-11 > $TMP2 $ diff $TMP1 $TMP2 $ rm $TMP1 $TMP2 # 2015_03_08 --> changelist 148101, 148080 updated read group id; 148100 updated TLEN # 2015_04_09 --> changelist 148796, updated read group id blasr-8e668beae0dda1da6914586fb458182c6c3c7482/ctest/fastMaxInterval.t000066400000000000000000000006761260737656700243100ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test -fastMaxInterval. 
$ rm -f $TMP1 $ BASFILE=/mnt/data3/vol53/2450598/0001/Analysis_Results/m130812_185809_42141_c100533960310000001823079711101380_s1_p0.bas.h5 $ REFFA=/mnt/secondary/Smrtpipe/repository/Ecoli_BL21_O26/sequence/Ecoli_BL21_O26.fasta $ $EXEC $BASFILE $REFFA -holeNumbers 1-100 -out $TMP1 -fastMaxInterval [INFO] * [blasr] started. (glob) [INFO] * [blasr] ended. (glob) $ echo $? 0 blasr-8e668beae0dda1da6914586fb458182c6c3c7482/ctest/filtercriteria.t000066400000000000000000000011331260737656700241750ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh $ NAME=test_filtercriteria $ DATDIR=$DATDIR/$NAME $ OUTDIR=$OUTDIR/$NAME $ STDDIR=$STDDIR/$NAME $ mkdir -p $OUTDIR Test -minPctSimilarity $ I=$DATDIR/tiny_bam.fofn $ R=$DATDIR/lambdaNEB.fa $ O=$OUTDIR/min_pct_similarity_90.m4 $ $EXEC $I $R -out $O -m 4 -minPctSimilarity 90 [INFO]* (glob) [INFO]* (glob) $ echo $? 0 $ awk '$4 < 90 {print}' $O |wc -l |cut -f 1 -d ' ' 0 $ O=$OUTDIR/min_aln_len_1000.m4 $ $EXEC $I $R -out $O -m 4 -minAlnLength 1000 [INFO]* (glob) [INFO]* (glob) $ echo $? 0 $ wc -l $O |cut -f 1 -d ' ' 12 blasr-8e668beae0dda1da6914586fb458182c6c3c7482/ctest/fofn.t000066400000000000000000000014241260737656700221200ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test blasr with *.fofn input # $ rm -rf $OUTDIR/lambda_bax.m4 # $ $EXEC $DATDIR/lambda_bax.fofn $DATDIR/lambda_ref.fasta -m 4 -out lambda_bax_tmp.m4 -nproc 15 -minMatch 14 # [INFO]* (glob) # [INFO]* (glob) # $ sort lambda_bax_tmp.m4 > $OUTDIR/lambda_bax.m4 # $ diff $OUTDIR/lambda_bax.m4 $STDDIR/lambda_bax.m4 # This test takes a long time, use a subset instad. 
$ rm -rf $OUTDIR/lambda_bax_subset.m4 $ $EXEC $DATDIR/lambda_bax.fofn $DATDIR/lambda_ref.fasta -m 4 -out $OUTDIR/lambda_bax_tmp_subset.m4 -nproc 15 -minMatch 14 -holeNumbers 1-1000 -sa $DATDIR/lambda_ref.sa [INFO]* (glob) [INFO]* (glob) $ sort $OUTDIR/lambda_bax_tmp_subset.m4 > $OUTDIR/lambda_bax_subset.m4 $ diff $OUTDIR/lambda_bax_subset.m4 $STDDIR/lambda_bax_subset.m4 blasr-8e668beae0dda1da6914586fb458182c6c3c7482/ctest/hitpolicy.t000066400000000000000000000052001260737656700231700ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh $ NAME=test_hitpolicy $ DATDIR=$DATDIR/$NAME $ OUTDIR=$OUTDIR/$NAME $ STDDIR=$STDDIR/$NAME $ mkdir -p $OUTDIR $ I=$DATDIR/tiny_bam.fofn $ R=$DATDIR/test_hitpolicy_target.fa $ O=$OUTDIR/hitpolicy_all.m4 $ X=$STDDIR/hitpolicy_all.m4 Test hitpolicy all $ $EXEC $I $R -out $O -m 4 -hitPolicy all [INFO]* (glob) [INFO]* (glob) $ echo $? 0 $ wc -l $O | cut -f 1 -d ' ' 683 Test hitpolicy allbest $ O=$OUTDIR/hitpolicy_allbest.m4 $ X=$STDDIR/hitpolicy_allbest.m4 $ $EXEC $I $R -out $O -m 4 -hitPolicy allbest && sort $O > $TMP1 && mv $TMP1 $O [INFO]* (glob) [INFO]* (glob) $ echo $? 0 $ sort $O > $TMP1 && mv $TMP1 $O $ diff $O $X && echo $? 0 Test hitpolicy random $ O=$OUTDIR/hitpolicy_random.m4 $ O2=$OUTDIR/hitpolicy_random_2.m4 $ X=$STDDIR/hitpolicy_random.m4 $ $EXEC $I $R -out $O -m 4 -hitPolicy random -randomSeed 1 [INFO]* (glob) [INFO]* (glob) $ $EXEC $I $R -out $O2 -m 4 -hitPolicy random -randomSeed 1 [INFO]* (glob) [INFO]* (glob) $ sort $O > $TMP1 && mv $TMP1 $O $ sort $O2 > $TMP2 && mv $TMP2 $O2 $ diff $O $O2 && echo $? 
0 Test hitpolicy randombest bam inputs, nproc > 1, fixed seed $ O=$OUTDIR/hitpolicy_randombest_bam_in.m4 $ O2=$OUTDIR/hitpolicy_randombest_bam_in_2.m4 $ X=$STDDIR/hitpolicy_randombest_bam_in.m4 $ $EXEC $I $R -out $O -m 4 -hitPolicy randombest -randomSeed 1 -nproc 10 [INFO]* (glob) [INFO]* (glob) $ $EXEC $I $R -out $O2 -m 4 -hitPolicy randombest -randomSeed 1 -nproc 10 [INFO]* (glob) [INFO]* (glob) $ sort $O > $TMP1 && mv $TMP1 $O $ sort $O2 > $TMP1 && mv $TMP1 $O2 $ diff $O $O2 && echo $? 0 Test hitpolicy randombest bax inputs, nproc > 1, fixed seed $ I=$DATDIR/tiny_bax.fofn $ O=$OUTDIR/hitpolicy_randombest_bax_in.m4 $ X=$STDDIR/hitpolicy_randombest_bax_in.m4 $ $EXEC $I $R -out $O -m 4 -hitPolicy randombest -randomSeed 1 -nproc 10 [INFO]* (glob) [INFO]* (glob) $ sort $O > $TMP1 && mv $TMP1 $O $ diff $O $X && echo $? 0 Test hitpolicy randombest fasta inputs, nproc > 1, fixed seed $ I=$DATDIR/tiny_fasta.fofn $ O=$OUTDIR/hitpolicy_randombest_fasta_in.m4 $ X=$STDDIR/hitpolicy_randombest_fasta_in.m4 $ $EXEC $I $R -out $O -m 4 -hitPolicy randombest -randomSeed 1 -nproc 10 [INFO]* (glob) [INFO]* (glob) $ sort $O > $TMP1 && mv $TMP1 $O $ diff $O $X && echo $? 0 Test hitpolicy leftmost $ O=$OUTDIR/hitpolicy_leftmost.m4 $ X=$STDDIR/hitpolicy_leftmost.m4 $ $EXEC $I $R -out $O -m 4 -hitPolicy leftmost -nproc 10 [INFO]* (glob) [INFO]* (glob) $ # target is lambda x 6, leftmost -> only map to the very first x. $ awk '$10 > 48502 {print}' $O |wc -l 0 blasr-8e668beae0dda1da6914586fb458182c6c3c7482/ctest/holeNumbers.t000066400000000000000000000005741260737656700234600ustar00rootroot00000000000000Set up $ . 
$TESTDIR/setup.sh Test -holeNumbers $ rm -f $OUTDIR/holeNumbers.m4 $ $EXEC $DATDIR/lambda_bax.fofn $DATDIR/lambda_ref.fasta -m 4 -out $OUTDIR/holeNumbers.m4 -holeNumbers 14798,55000-55100 -nproc 8 [INFO]* (glob) [INFO]* (glob) $ sort $OUTDIR/holeNumbers.m4 > $TMP1 $ sort $STDDIR/holeNumbers_2014_05_29.m4 > $TMP2 $ diff $TMP1 $TMP2 $ rm $TMP1 $TMP2 blasr-8e668beae0dda1da6914586fb458182c6c3c7482/ctest/m0-5.t000066400000000000000000000016711260737656700216520ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test blasr with -m 0 ~ 5 $ rm -rf $OUTDIR/read.m0 $ $EXEC $DATDIR/read.fasta $DATDIR/ref.fasta -m 0 -out $OUTDIR/read.m0 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/read.m0 $STDDIR/read.m0 $ rm -rf $OUTDIR/read.m1 $ $EXEC $DATDIR/read.fasta $DATDIR/ref.fasta -m 1 -out $OUTDIR/read.m1 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/read.m1 $STDDIR/read_2014_05_29.m1 $ rm -rf $OUTDIR/read.m2 $ $EXEC $DATDIR/read.fasta $DATDIR/ref.fasta -m 2 -out $OUTDIR/read.m2 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/read.m2 $STDDIR/read.m2 $ rm -rf $OUTDIR/read.m3 $ $EXEC $DATDIR/read.fasta $DATDIR/ref.fasta -m 3 -out $OUTDIR/read.m3 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/read.m3 $STDDIR/read.m3 $ rm -rf $OUTDIR/read.m4 $ $EXEC $DATDIR/read.fasta $DATDIR/ref.fasta -m 4 -out $OUTDIR/read.m4 [INFO]* (glob) [INFO]* (glob) $ diff $OUTDIR/read.m4 $STDDIR/read.m4 blasr-8e668beae0dda1da6914586fb458182c6c3c7482/ctest/multipart.t000066400000000000000000000010471260737656700232120ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test input.fofn containing a new bas.h5 file. Note that the new bas.h5 file does not contain any /PulseData, instead contains /MultiPart/Parts. 
$ rm -f $TMP1 $ BASFILE=/mnt/data3/vol53/2450598/0001/Analysis_Results/m130812_185809_42141_c100533960310000001823079711101380_s1_p0.bas.h5 $ REFFA=/mnt/secondary/Smrtpipe/repository/Ecoli_BL21_O26/sequence/Ecoli_BL21_O26.fasta $ $EXEC $BASFILE $REFFA -holeNumbers 1-100 -out $TMP1 [INFO] * [blasr] started. (glob) [INFO] * [blasr] ended. (glob) $ echo $? 0 blasr-8e668beae0dda1da6914586fb458182c6c3c7482/ctest/noSplitSubreads.t000066400000000000000000000017421260737656700243140ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test blasr with -noSplitSubreads # $ rm -rf $OUTDIR/lambda_bax_noSplitSubreads.m4 # $ $EXEC $DATDIR/lambda_bax.fofn $DATDIR/lambda_ref.fasta -noSplitSubreads -m 4 -out lambda_bax_noSplitSubreads_tmp.m4 -nproc 15 # [INFO]* (glob) # [INFO]* (glob) # $ sort lambda_bax_noSplitSubreads_tmp.m4 > $OUTDIR/lambda_bax_noSplitSubreads.m4 # $ diff $OUTDIR/lambda_bax_noSplitSubreads.m4 $STDDIR/lambda_bax_noSplitSubreads.m4 # This test takes a long time, use a subset instad. $ rm -rf $OUTDIR/lambda_bax_noSplitSubreads_subset.m4 $ $EXEC $DATDIR/lambda_bax.fofn $DATDIR/lambda_ref.fasta -noSplitSubreads -m 4 -out $OUTDIR/lambda_bax_noSplitSubreads_tmp_subset.m4 -nproc 15 -holeNumbers 1-1000 -sa $DATDIR/lambda_ref.sa [INFO]* (glob) [INFO]* (glob) $ sort $OUTDIR/lambda_bax_noSplitSubreads_tmp_subset.m4 > $OUTDIR/lambda_bax_noSplitSubreads_subset.m4 $ diff $OUTDIR/lambda_bax_noSplitSubreads_subset.m4 $STDDIR/lambda_bax_noSplitSubreads_subset.m4 blasr-8e668beae0dda1da6914586fb458182c6c3c7482/ctest/open_fail.t000066400000000000000000000005031260737656700231210ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh If fail to open an bax/bas.h5 file because of unable to initialize required dataset, give an warning. 
$ $EXEC $DATDIR/open_fail_no_dyset.fofn $DATDIR/lambda_ref.fasta -m 4 [INFO]* (glob) Could not open /mnt/secondary-siv/testdata/BlasrTestData/ctest/data/open_fail_no_dyset.fofn [1] blasr-8e668beae0dda1da6914586fb458182c6c3c7482/ctest/samNM.t000066400000000000000000000004141260737656700222010ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test Sam out nm tag $ rm -rf $OUTDIR/read.sam $ $EXEC $DATDIR/read.fasta $DATDIR/ref.fasta -sam -out $OUTDIR/read.sam [INFO]* (glob) [INFO]* (glob) $ tail -n+5 $OUTDIR/read.sam |cut -f 21 NM:i:2 NM:i:3 NM:i:2 NM:i:4 blasr-8e668beae0dda1da6914586fb458182c6c3c7482/ctest/setup.sh000077500000000000000000000020171260737656700225010ustar00rootroot00000000000000# Set up directories CURDIR=$TESTDIR REMOTEDIR=/mnt/secondary-siv/testdata/BlasrTestData/ctest DATDIR=$REMOTEDIR/data OUTDIR=$CURDIR/out STDDIR=$REMOTEDIR/stdout # Set up the executable: blasr. EXEC=$TESTDIR/../blasr # Define tmporary files TMP1=$OUTDIR/$$.tmp.out TMP2=$OUTDIR/$$.tmp.stdout # Make OUTDIR mkdir -p $OUTDIR #FIXME: make samtools independent of absolute build path. SAMTOOLS=/mnt/secondary/Smrtpipe/builds/Internal_Mainline_Nightly_LastSuccessfulBuild/analysis/bin/samtools #Update date UPDATEDATE=2015_04_27 # 2014_08_21 --> change 138516: added YS, YE, ZM tags # 2014_08_28 --> change 139176: Update SAM MD5 # 2015_03_28 --> change 148101: 148080 update read group id, 148100 update TLEN. # 2015_04_09 --> change 148796: update read group id # 2015_04_25 --> change 149721, update CIGAR string, replace M with X= # 2015_04_26 --> change 149749, add opiton -cigarUseSeqMatch (default: false). If -cigarUseSeqMatch is turned on, CIGAR strings use '=' and 'X' to represent sequence match and mismatch instead of 'M'. blasr-8e668beae0dda1da6914586fb458182c6c3c7482/ctest/useccsallBestN1.t000066400000000000000000000005671260737656700241720ustar00rootroot00000000000000Set up $ . 
$TESTDIR/setup.sh Test -useccsall with bestn = 1 $ $EXEC $DATDIR/ccstest.fofn $DATDIR/ccstest_ref.fasta -bestn 1 -useccsall -sam -out $OUTDIR/useccsall.sam -holeNumbers 76772 [INFO]* (glob) [INFO]* (glob) $ sed -n '9,$ p' $OUTDIR/useccsall.sam > $TMP1 $ sed -n '9,$ p' $STDDIR/$UPDATEDATE/useccsall.sam > $TMP2 $ diff $TMP1 $TMP2 $ rm $TMP1 $TMP2 blasr-8e668beae0dda1da6914586fb458182c6c3c7482/ctest/useccsallLargeGenome.t000066400000000000000000000011211260737656700252460ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test -useccsall with Large genome. $ BASFILE=/mnt/data3/vol53/2450530/0014/Analysis_Results/m130507_052228_42161_c100519212550000001823079909281305_s1_p0.3.bax.h5 $ REFDIR=/mnt/secondary/Smrtpipe/repository/hg19_M_sorted/sequence $ REFFA=$REFDIR/hg19_M_sorted.fasta $ REFSA=$REFDIR/hg19_M_sorted.fasta.sa $ OUTFILE=$OUTDIR/intflow.m4 $ $EXEC $BASFILE $REFFA -out $OUTFILE -m 4 -sa $REFSA -holeNumbers 109020 [INFO]* (glob) [INFO]* (glob) $ sort $OUTFILE > $TMP1 && sort $STDDIR/intflow_2014_06_10.m4 > $TMP2 && diff $TMP1 $TMP2 && echo $? 0 blasr-8e668beae0dda1da6914586fb458182c6c3c7482/ctest/verbose.t000066400000000000000000000003031260737656700226300ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Test alignment score $ $EXEC $DATDIR/lambda_bax.fofn $DATDIR/lambda_ref.fasta -holeNumbers 1-200 -V 3 > $TMP1 [INFO]* (glob) [INFO]* (glob) $ echo $? 
0 blasr-8e668beae0dda1da6914586fb458182c6c3c7482/extrautils/000077500000000000000000000000001260737656700220645ustar00rootroot00000000000000blasr-8e668beae0dda1da6914586fb458182c6c3c7482/extrautils/BasH5Simulator.cpp000066400000000000000000000516701260737656700254030ustar00rootroot00000000000000#include #include #include #include "utils.hpp" #include "Enumerations.h" #include "DNASequence.hpp" #include "FASTAReader.hpp" #include "CommandLineParser.hpp" #include "metagenome/TitleTable.hpp" #include "metagenome/FindRandomSequence.hpp" #include "statistics/StatUtils.hpp" #include "HDFBasWriter.hpp" #include "HDFRegionTableWriter.hpp" #include "simulator/LengthHistogram.hpp" #include "simulator/OutputSampleListSet.hpp" #include "simulator/ContextOutputList.hpp" using namespace std; void SetHelp(string & str) { stringstream helpStream; helpStream << "usage: alchemy outputModel [ options ]" << endl << " options: " << endl << " -genome genome.fasta" << endl << " Simulate reads from the reference genome 'genome.fasta'." << endl << endl << " -numBasesPerFile numBasesPerFile" << endl << " Limit the number of bases per output file to this." << endl << endl << " -sourceReads filename " << endl << " When set, simulate reads by reading from 'filename', " << endl << " rather than simulating from a genome." << endl << " The format of the fasta titles should be >read_index|chr|start_pos|end_pos" << endl << endl << " -lengthModel" << endl << " Use lengths from the alchemy model, rather than the read length. This " << endl << " is used in conjunction with the sourceReadsFile, to modulate the lenghts" << endl << " of the reads." << endl << endl << " -fixedLength length " << endl << " Set simulated read length to a fixed value of 'length', rather than " << endl << " sampling from a length mode." << endl << " -movieName name (\"simulated_movie\")" << endl << " Use 'name' for movies rather than m000_000..." 
<< endl << endl << " -titleTable name" < movieNames; bool useLengthModel = false; bool useFixedLength = false; ofstream posMapFile; int scaledLength = 0; int fixedLength = 0; int nBasFiles = 1; bool useLengthsModel = true; bool printHelp = false; // Look to see if the refAsReads flag is specified anywhere before // parsing the command line. CommandLineParser clp; string commandLine; string helpString; SetHelp(helpString); vector fns; clp.RegisterStringOption("genome", &refGenomeFileName, ""); clp.RegisterIntOption("numBasesPerFile", (int*)&numBasesPerFile, "", CommandLineParser::PositiveInteger); clp.RegisterStringOption("sourceReads", &sourceReadsFileName, ""); clp.RegisterStringOption("lengthModel", &lengthModelFileName, ""); clp.RegisterIntOption("fixedLength", &fixedLength, "", CommandLineParser::PositiveInteger); clp.RegisterFlagOption("lengthModel", &useLengthModel, ""); clp.RegisterStringOption("movieName", &movieName, ""); clp.RegisterStringOption("titleTable", &titleTableFileName, ""); clp.RegisterStringOption("baseFileName", &basH5BaseFileName, ""); clp.RegisterIntOption("nFiles", &nBasFiles, "", CommandLineParser::PositiveInteger); clp.RegisterIntOption("meanLength", &scaledLength, "", CommandLineParser::PositiveInteger); clp.RegisterStringOption("posMap", &posMapFileName, ""); clp.RegisterFlagOption("printPercentRepeat", &printPercentRepeat, ""); clp.RegisterFlagOption("h", &printHelp, ""); clp.SetHelp(helpString); clp.ParseCommandLine(argc, argv, fns); clp.CommandLineToString(argc, argv, commandLine); clp.SetProgramName("alchemy"); outputModelFileName = fns[0]; if (argc <= 1 or printHelp or outputModelFileName == "") { cout << helpString << endl; exit(0); } if (usePosMap) { CrucialOpen(posMapFileName, posMapFile, std::ios::out); } if (sourceReadsFileName == "" and fixedLength == 0) { useLengthModel = true; } if (useLengthModel and fixedLength != 0) { cout << "ERROR! You must either use a length model or a fixed length." 
<< endl; exit(1); } if (sourceReadsFileName == "" and numBasesPerFile == 0) { cout << "ERROR! You must specify either a set of read to use as " << endl << "original reads for simulation or the total number of bases " << endl << "to simulate in each bas.h5 file." << endl; exit(1); } if (sourceReadsFileName == "" and refGenomeFileName == "") { cout << "ERROR! You must specify a genome to sample reads from or a set of read "< alignmentLengths; int meanAlignmentLength; if (scaledLength != 0 and useLengthModel) { // // Scale the histogram so that the average length is 'scaledLength'. // // 1. Integrate histogram long totalLength = 0; long totalSamples = 0; int hi; for (hi = 0; hi < lengthHistogram.lengthHistogram.cdf.size()-1; hi++) { int ni; ni = lengthHistogram.lengthHistogram.cdf[hi+1] - lengthHistogram.lengthHistogram.cdf[hi]; totalLength += ni * lengthHistogram.lengthHistogram.data[hi]; } totalSamples = lengthHistogram.lengthHistogram.cdf[lengthHistogram.lengthHistogram.cdf.size()-1]; float meanSampleLength = totalLength / (1.0*totalSamples); float fractionIncrease = scaledLength / meanSampleLength; for (hi = 0; hi < lengthHistogram.lengthHistogram.cdf.size(); hi++) { lengthHistogram.lengthHistogram.data[hi] *= fractionIncrease; } } FASTAReader inReader, seqReader; vector reference; DNALength refLength = 0; int i; if (refGenomeFileName != "") { inReader.Init(refGenomeFileName); inReader.ReadAllSequences(reference); for (i = 0; i < reference.size(); i++) { refLength += reference[i].length; } } if (sourceReadsFileName != "") { seqReader.Init(sourceReadsFileName); } ofstream readsFile; // // Create and simulate bas.h5 files. // int baseFileIndex; bool readsRemain = true; for (baseFileIndex = 0; ((sourceReadsFileName == "" and baseFileIndex < nBasFiles) // case 1 is reads are generated by file or (sourceReadsFileName != "" and readsRemain)); // case 2 is reads are generated by an input file. baseFileIndex++) { // // Prep the base file for writing. 
// stringstream fileNameStrm, movieNameStrm; //string movieName = "m000000_000000_00000_cSIMULATED_s"; movieNameStrm << movieName << baseFileIndex << "_p0"; string fullMovieName = movieNameStrm.str(); fileNameStrm << fullMovieName << ".bas.h5"; HDFBasWriter basWriter; HDFRegionTableWriter regionWriter; // // This is mainly used to create the atributes. // RegionTable regionTable; regionTable.CreateDefaultAttributes(); basWriter.SetPlatform(Springfield); // // Use a fixed set of fields for now. // // These are all pulled from the outputModel. basWriter.IncludeField("Basecall"); basWriter.IncludeField("QualityValue"); basWriter.IncludeField("SubstitutionQV"); basWriter.IncludeField("SubstitutionTag"); basWriter.IncludeField("InsertionQV"); basWriter.IncludeField("DeletionQV"); basWriter.IncludeField("DeletionTag"); basWriter.IncludeField("WidthInFrames"); basWriter.IncludeField("PreBaseFrames"); basWriter.IncludeField("PulseIndex"); vector qualityValue, substitutionQV, substitutionTag, insertionQV, deletionQV, deletionTag; vector widthInFrames, preBaseFrames, pulseIndex; // Just go from 0 .. hole Number basWriter.IncludeField("HoleNumber"); // Fixed to 0. 
basWriter.IncludeField("HoleXY"); if (usePosMap == false) { basWriter.IncludeField("SimulatedSequenceIndex"); basWriter.IncludeField("SimulatedCoordinate"); } basWriter.SetChangeListID("1.3.0.50.104380"); DNALength numSimulatedBases = 0; FASTASequence sampleSeq; //sampleSeq.length = readLength; int maxRetry = 10000000; int retryNumber = 0; int numReads = 0; int readLength = 0; while (numBasesPerFile == 0 or numSimulatedBases < numBasesPerFile) { DNALength seqIndex, seqPos; if (useLengthModel or fixedLength) { if (useLengthModel) { lengthHistogram.GetRandomLength(readLength); } else { readLength = fixedLength; } } if (refGenomeFileName != "") { FindRandomPos(reference, seqIndex, seqPos, readLength + (outputModel.keyLength - 1)); sampleSeq.seq = &reference[seqIndex].seq[seqPos]; sampleSeq.length = readLength + (outputModel.keyLength - 1); assert(reference[seqIndex].length >= sampleSeq.length); } else if (sourceReadsFileName != "") { if (seqReader.GetNext(sampleSeq) == false) { readsRemain = false; break; } if (sampleSeq.length < outputModel.keyLength) { continue; } // // Now attempt to parse the position from the fasta title. // if (useLengthModel) { int tryNumber = 0; readLength = 0; int maxNTries = 1000; int tryBuffer[5] = {-1,-1,-1,-1,-1}; while (tryNumber < maxNTries and readLength < outputModel.keyLength) { lengthHistogram.GetRandomLength(readLength); readLength = sampleSeq.length = min(sampleSeq.length, (unsigned int) readLength); tryBuffer[tryNumber%5] = readLength; tryNumber++; } if (tryNumber >= maxNTries) { cout << "ERROR. Could not generate a read length greater than the " << outputModel.keyLength << " requried " < tokens; Splice(sampleSeq.title, "|", tokens); if (tokens.size() == 4) { seqPos = atoi(tokens[2].c_str()); if (titleTableFileName == "") { seqIndex = 0; } else { int index; titleTable.Lookup(tokens[1], index); seqIndex = index; } } else { seqPos = 0; } } // // If this is the first read printed to the base file, initialize it. 
// if (numSimulatedBases == 0) { basWriter.Initialize(fileNameStrm.str(), movieNameStrm.str(), Springfield); regionWriter.Initialize(basWriter.pulseDataGroup); } numSimulatedBases += readLength; int p; // create the sample sequence int contextLength = outputModel.keyLength; int contextMiddle = contextLength / 2; string outputString; int nDel = 0; int nIns = 0; // // Simulate to beyond the sample length. // qualityValue.clear(); substitutionQV.clear(); substitutionTag.clear(); insertionQV.clear(); deletionQV.clear(); deletionTag.clear(); pulseIndex.clear(); widthInFrames.clear(); preBaseFrames.clear(); assert(sampleSeq.length > contextMiddle + 1); for (p = contextMiddle; p < sampleSeq.length - contextMiddle - 1; p++) { string refContext; refContext.assign((const char*) &sampleSeq.seq[p-contextMiddle], contextLength); string outputContext; int contextWasFound; OutputSample sample; int i; for (i = 0; i < refContext.size(); i++) { refContext[i] = toupper(refContext[i]);} outputModel.SampleRandomSample(refContext, sample); if (sample.type == OutputSample::Deletion ) { // // There was a deletion. Advance in reference, then output // the base after the deletion. // p++; ++nDel; } int cp; // // Add the sampled context, possibly multiple characters because of an insertion. 
// for (i = 0; i < sample.nucleotides.size(); i++) { outputString.push_back(sample.nucleotides[i]); qualityValue.push_back(sample.qualities[i].qv[0]); deletionQV.push_back(sample.qualities[i].qv[1]); insertionQV.push_back(sample.qualities[i].qv[2]); substitutionQV.push_back(sample.qualities[i].qv[3]); deletionTag.push_back(sample.qualities[i].tags[0]); substitutionTag.push_back(sample.qualities[i].tags[1]); pulseIndex.push_back(sample.qualities[i].frameValues[0]); preBaseFrames.push_back(sample.qualities[i].frameValues[1]); widthInFrames.push_back(sample.qualities[i].frameValues[2]); } nIns += sample.qualities.size() - 1; } if (outputString.find('N') != outputString.npos or outputString.find('n') != outputString.npos) { cout << "WARNING! The sampled string " << endl << outputString << endl << "should not contain N's, but it seems to. This is being ignored "< #include "utils.hpp" #include "metagenome/SequenceIndexDatabase.hpp" #include "CommandLineParser.hpp" #include "FASTAReader.hpp" #include "utils/FileOfFileNames.hpp" using namespace std; int main(int argc, char* argv[]) { CommandLineParser clp; string fastaFileName, indexFileName; vector fastaFileNames; vector opts; clp.SetProgramName("bsdb"); clp.SetProgramSummary("Build an index database on a file of sequences.\n" " The index is used to map to reads given alignment positions.\n"); clp.RegisterStringOption("fasta", &fastaFileName, "A file with sequences to build an index."); clp.RegisterStringOption("index", &indexFileName, "The index file."); clp.RegisterPreviousFlagsAsHidden(); clp.ParseCommandLine(argc, argv, opts); ifstream fastaIn; ofstream indexOut; if (FileOfFileNames::IsFOFN(fastaFileName)) { FileOfFileNames::FOFNToList(fastaFileName, fastaFileNames); } else { fastaFileNames.push_back(fastaFileName); } CrucialOpen(indexFileName, indexOut, std::ios::out | std::ios::binary); SequenceIndexDatabase seqDB; int fileNameIndex; for (fileNameIndex = 0; fileNameIndex < fastaFileNames.size(); fileNameIndex++){ 
FASTAReader reader; FASTASequence seq; reader.Init(fastaFileNames[fileNameIndex]); int i = 0; while (reader.GetNext(seq)) { seqDB.AddSequence(seq); i++; } } seqDB.Finalize(); seqDB.WriteDatabase(indexOut); return 0; } blasr-8e668beae0dda1da6914586fb458182c6c3c7482/extrautils/BwtToSuffixArray.cpp000066400000000000000000000013141260737656700260120ustar00rootroot00000000000000#include "bwt/BWT.hpp" #include "suffixarray/SuffixArray.hpp" #include "suffixarray/SuffixArrayTypes.hpp" #include #include #include using namespace std; int main(int argc, char* argv[]) { string bwtFileName, saFileName; if (argc < 3) { cout << "usage: bwt2sa bwtfile safile " << endl; exit(1); } bwtFileName = argv[1]; saFileName = argv[2]; Bwt bwt; DNASuffixArray suffixArray; bwt.Read(bwtFileName); suffixArray.AllocateSuffixArray(bwt.bwtSequence.length-1); SAIndex index; for (index = 1; index < bwt.bwtSequence.length+1; index++) { suffixArray.index[index-1] = bwt.Locate(index); } suffixArray.Write(saFileName); } blasr-8e668beae0dda1da6914586fb458182c6c3c7482/extrautils/CCSH5ToBam.cpp000066400000000000000000000115021260737656700243170ustar00rootroot00000000000000#include "utils/FileOfFileNames.hpp" #include "datastructures/alignmentset/SAMSupplementalQVList.hpp" #include "format/SAMHeaderPrinter.hpp" #include "format/BAMPrinter.hpp" #include "pbbam/BamWriter.h" #include "CommandLineParser.hpp" using namespace PacBio::BAM; using namespace std; string DISCLAIM = "THIS TOOL IS CREATED FOR DEVELOPERS USE ONLY AND IT MAY OR MAY NOT " "BREAK AT ANY TIME. 
USE AT YOUR OWN RISK."; string GetVersion(void) { return "1.0"; } void CCSReadToBamRecord(CCSSequence & ccsRead, BamRecord & bamRecord, SupplementalQVList & samQVList) { //m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/1650/1920_2155 4 * 0 255 * * 0 0 bamRecord.Impl().Name(ccsRead.GetTitle()); bamRecord.Impl().Flag(static_cast(4)); string seqString; seqString.assign((char*)ccsRead.seq, ccsRead.length); bamRecord.Impl().SetSequenceAndQualities(seqString, ccsRead.qual.ToString()); // bamRecord.Impl().CigarData(Cigar::FromStdString("*")); bamRecord.Impl().Bin(0); bamRecord.Impl().InsertSize(0); bamRecord.Impl().MatePosition(static_cast(-1)); bamRecord.Impl().MateReferenceId(static_cast(-1)); bamRecord.Impl().Position(static_cast(-1)); bamRecord.Impl().ReferenceId(static_cast(-1)); TagCollection tags; tags["RG"] = ccsRead.GetReadGroupId(); tags["np"] = ccsRead.numPasses; tags["zm"] = ccsRead.zmwData.holeNumber; tags["qs"] = 0; tags["qe"] = ccsRead.length; samQVList.FormatQVOptionalFields(ccsRead); // Add QVs to BamRecordImpl. 
string insertionQVs, deletionQVs, substitutionQVs, mergeQVs, substitutionTags, deletionTags; if (ccsRead.GetQVs("InsertionQV", insertionQVs)) { tags["iq"] = insertionQVs; } if (ccsRead.GetQVs("DeletionQV", deletionQVs)) { tags["dq"] = deletionQVs; } if (ccsRead.GetQVs("SubstitutionQV", substitutionQVs)) { tags["sq"] = substitutionQVs; } if (ccsRead.GetQVs("MergeQV", mergeQVs)) { tags["mq"] = mergeQVs; } // substitutionTag is not included by default if (ccsRead.GetQVs("DeletionTag", deletionTags)) { tags["dt"] = deletionTags; } bamRecord.Impl().Tags(tags); } int main(int argc, char* argv[]) { string progName = "ccsh5tobam"; CommandLineParser clp; clp.SetHelp("Convert ccs.h5 to bam.\n" + DISCLAIM); clp.SetConciseHelp("ccsh5tobam ccs.h5|fofn out.bam\n" + DISCLAIM); clp.SetProgramName(progName); clp.SetVersion(GetVersion()); string fofn, bamOutName; clp.RegisterStringOption("in.ccs.h5", &fofn, "Input ccs.h5|fofn file.", true); clp.RegisterStringOption("out.bam", &bamOutName, "Output bam file.", true); clp.RegisterPreviousFlagsAsHidden(); clp.ParseCommandLine(argc, argv); //cerr << "[INFO] " << GetTimestamp() << " [" << progName << "] started." << endl; vector ccsFileNames; FileOfFileNames::StoreFileOrFileList(fofn, ccsFileNames); string so = "UNKNOWN"; // sorting order; string version = GetVersion(); string commandLineString; clp.CommandLineToString(argc, argv, commandLineString); SupplementalQVList samQVList; samQVList.SetDefaultQV(); SequenceIndexDatabase seqdb; SAMHeaderPrinter shp(so, seqdb, ccsFileNames, ReadType::ReadTypeEnum::CCS, samQVList, "ccsh52bam", version, commandLineString); string headerString = shp.ToString();// SAM/BAM header BamHeader header = BamHeader(headerString); // Both file name and SAMHeader are required in order to create a BamWriter. 
BamWriter * bamWriterPtr = new BamWriter(bamOutName, header); for (string ccsFileName: ccsFileNames) { ReaderAgglomerate reader; reader.SetReadFileName(ccsFileName); reader.SetReadType(ReadType::ReadTypeEnum::CCS); // Initialize using already set file names. int initReturnValue = reader.Initialize(); if (initReturnValue <= 0) { cerr << "WARNING! Could not open file " << ccsFileName << endl; continue; } // Check whether use ccs only. assert (reader.GetFileType() == HDFCCSONLY); int randint = 0; CCSSequence ccsRead; while(reader.GetNext(ccsRead, randint) != 0) { if (ccsRead.length > 0) { BamRecord bamRecord; CCSReadToBamRecord(ccsRead, bamRecord, samQVList); bamWriterPtr->Write(bamRecord); } } } try { bamWriterPtr->TryFlush(); delete bamWriterPtr; bamWriterPtr = NULL; } catch (std::exception e) { cout << "Error, could not flush bam records to bam file." << endl; exit(1); } //cerr << "[INFO] " << GetTimestamp() << " [" << progName << "] ended." << endl; return 0; } blasr-8e668beae0dda1da6914586fb458182c6c3c7482/extrautils/Evolve.cpp000066400000000000000000000155031260737656700240340ustar00rootroot00000000000000#include #include #include "utils.hpp" #include "FASTAReader.hpp" #include "FASTASequence.hpp" #include "CommandLineParser.hpp" #include "statistics/StatUtils.hpp" using namespace std; /* ref000001 . SNV 9454 9454 0.00 . . reference=C;confidence=0;Name=9454C>A;coverage=0;variantseq=A ref000001 . deletion 20223 20223 0.00 . . reference=T;length=1;confidence=0;coverage=0;Name=20222delT ref000001 . insertion 35089 35089 0.00 . . 
confidence=0;Name=35089_35090insC;reference=.;length=1;coverage=0;variantseq=C */ char ToLower(char c, bool useToLower) { if (useToLower) { return tolower(c); } else { return toupper(c); } } int main(int argc, char* argv[]) { CommandLineParser clp; string refGenomeName; string mutGenomeName; string gffFileName; float insRate = 0; float delRate = 0; float mutRate = 0; bool lower = false; gffFileName = ""; clp.RegisterStringOption("refGenome", &refGenomeName, "Reference genome.", true); clp.RegisterStringOption("mutGenome", &mutGenomeName, "Mutated genome.", true); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterStringOption("gff", &gffFileName, "GFF file describing the modifications made to the genome."); clp.RegisterFloatOption("i", &insRate, "Insertion rate: (0-1].", CommandLineParser::NonNegativeFloat, false); clp.RegisterFloatOption("d", &delRate, "Deletion rate: (0-1]", CommandLineParser::NonNegativeFloat, false); clp.RegisterFloatOption("m", &mutRate, "Mutation rate, even across all nucleotides: (0-1]", CommandLineParser::NonNegativeFloat, false); clp.RegisterFlagOption("lower", &lower, "Make mutations in lower case", false); vector leftovers; clp.ParseCommandLine(argc, argv, leftovers); FASTAReader reader; FASTASequence refGenome; reader.Init(refGenomeName); ofstream mutGenomeOut; CrucialOpen(mutGenomeName, mutGenomeOut, std::ios::out); ofstream gffOut; if (gffFileName != "") { CrucialOpen(gffFileName, gffOut, std::ios::out); } vector insIndices, delIndices, subIndices; int readIndex = 0; InitializeRandomGeneratorWithTime(); while (reader.GetNext(refGenome)) { insIndices.resize(refGenome.length); delIndices.resize(refGenome.length); subIndices.resize(refGenome.length); std::fill(insIndices.begin(), insIndices.end(), false); std::fill(delIndices.begin(), delIndices.end(), false); std::fill(subIndices.begin(), subIndices.end(), 0); enum ChangeType { Ins, Del, Mut, None}; float changeProb[4]; changeProb[Ins] = insRate; changeProb[Del] = changeProb[Ins] + 
delRate; changeProb[Mut] = changeProb[Del] + mutRate; changeProb[None] = 1; if (changeProb[Mut] > 1) { cout << "ERROR! The sum of the error probabilities must be less than 1" << endl; exit(1); } DNALength pos; float randomNumber; int numIns = 0; int numDel = 0; int numMut = 0; for (pos =0 ; pos < refGenome.length; pos++) { randomNumber = Random(); if (randomNumber < changeProb[Ins]) { insIndices[pos] = true; numIns++; } else if (randomNumber < changeProb[Del]) { delIndices[pos] = true; numDel++; } else if (randomNumber < changeProb[Mut]){ Nucleotide newNuc = TwoBitToAscii[RandomInt(4)]; int maxIts = 100000; int it = 0; while (newNuc == refGenome.seq[pos]) { newNuc = TwoBitToAscii[RandomInt(4)]; if (it == maxIts) { cout << "ERROR, something is wrong with the random number generation, it took too many tries to generate a new nucleotide" << endl; exit(1); } } subIndices[pos] = refGenome[pos]; refGenome.seq[pos] = ToLower(newNuc,lower); ++numMut; } } // cout << readIndex << " m " << numMut << " i " << numIns << " d " << numDel << endl; if (readIndex % 100000 == 0 && readIndex > 0) { cout << readIndex << endl; } // // Now add the insertions and deletions. // FASTASequence newSequence; DNALength newPos; if (numIns - numDel + refGenome.length < 0) { cout << "ERROR, the genome has been deleted to nothing." << endl; exit(1); } ResizeSequence(newSequence, refGenome.length + (numIns - numDel)); newPos = 0; pos = 0; for (pos = 0; pos < refGenome.length; pos++) { assert(newPos < newSequence.length or delIndices[pos] == true); if (subIndices[pos] != 0 and gffFileName != "") { gffOut << refGenome.GetName() << " . SNV " << newPos << " " << newPos <<" 0.00 . . 
reference=" << (char)subIndices[pos] << ";confidence=10;Name=" << newPos << (char)subIndices[pos] << ">" << refGenome.seq[pos] <<";coverage=10;variantseq=" << refGenome.seq[pos] << endl; } if (insIndices[pos] == true) { newSequence.seq[newPos] = ToLower(TwoBitToAscii[RandomInt(4)], lower); newPos++; newSequence.seq[newPos] = refGenome.seq[pos]; assert(newSequence.seq[newPos] != '1'); assert(newSequence.seq[newPos] != 1); if (gffFileName != "") { gffOut << refGenome.GetName() << " . deletion " << newPos << " " << newPos << " 0.00 . . reference=" << newSequence.seq[newPos] << ";length=1;confidence=10;coverage=0;Name="<< newPos << "del" << newSequence.seq[newPos] << endl; } newPos++; } else if (delIndices[pos] == true) { // no-op, skip if (gffFileName != "") { gffOut << refGenome.GetName() << " . insertion " << newPos << " " << newPos << " 0.00 . . confidence=10;Name=" << newPos << "_ins" << refGenome.seq[pos] << ";reference=.;length=1;coverage=0;variantseq=" << refGenome.seq[newPos] << endl; //ref000001 . deletion 20223 20223 0.00 . . 
reference=T;length=1;confidence=0;coverage=0;Name=20222delT } } else { newSequence.seq[newPos] = refGenome.seq[pos]; newPos++; } } stringstream titlestrm; titlestrm << " mutated ins " << insRate << " del " << delRate << " mut " << mutRate; newSequence.CopyTitle(refGenome.title); newSequence.AppendToTitle(titlestrm.str()); newSequence.PrintSeq(mutGenomeOut); newSequence.Free(); readIndex++; } } blasr-8e668beae0dda1da6914586fb458182c6c3c7482/extrautils/ExciseRepeats.cpp000066400000000000000000000031061260737656700253340ustar00rootroot00000000000000#include #include "FASTAReader.hpp" #include "FASTASequence.hpp" #include "utils.hpp" using namespace std; int main(int argc, char* argv[]) { string seqInName, seqOutName, dotOutName; if (argc < 4) { cout << "usage: exciseRepeats inName repMaskOutFile outName" << endl; exit(1); } seqInName = argv[1]; dotOutName = argv[2]; seqOutName = argv[3]; FASTAReader reader; reader.Initialize(seqInName); FASTASequence origSeq; reader.GetNext(origSeq); ifstream dotOutFile; CrucialOpen(dotOutName, dotOutFile); ofstream seqOutFile; ofstream seqOut; CrucialOpen(seqOutName, seqOut, std::ios::out); string dotOutLine; getline(dotOutFile, dotOutLine); getline(dotOutFile, dotOutLine); getline(dotOutFile, dotOutLine); while(getline(dotOutFile, dotOutLine)) { stringstream lineStrm(dotOutLine); int swScore; float pctDiv, pctDel, pctIns; string query; int qPosBegin, qPosEnd; string left; char strand; string matchingRepeat; string repClass; string repPos, repEnd, repLeft; int id; lineStrm >> swScore >> pctDiv >> pctDel >> pctIns >> query >> qPosBegin >> qPosEnd >> left >> strand >> matchingRepeat >> repClass >> repPos >> repEnd >> repLeft >> id; DNALength seqPos; for (seqPos = qPosBegin; seqPos < qPosEnd; seqPos++) { origSeq.seq[seqPos] = 'X'; } } DNALength seqPos, unexPos; unexPos = 0; for (seqPos = 0; seqPos < origSeq.length; seqPos++) { if (origSeq.seq[seqPos] != 'X') { origSeq.seq[unexPos] = origSeq.seq[seqPos]; unexPos++; } } origSeq.length = 
unexPos; origSeq.PrintSeq(seqOut); return 0; } blasr-8e668beae0dda1da6914586fb458182c6c3c7482/extrautils/PrintTupleCountTable.cpp000066400000000000000000000036501260737656700266630ustar00rootroot00000000000000#include #include #include #include #include "utils.hpp" #include "FASTASequence.hpp" #include "FASTAReader.hpp" #include "CommandLineParser.hpp" #include "tuples/DNATuple.hpp" #include "tuples/CompressedDNATuple.hpp" #include "tuples/TupleMetrics.hpp" #include "tuples/TupleCountTable.hpp" #ifdef COMPRESSED typedef TupleCountTable > CountTable; #else typedef TupleCountTable CountTable; #endif int main(int argc, char* argv[]) { CommandLineParser clp; string tableFileName; vector sequenceFiles; TupleMetrics tm; int tupleSize = 8; clp.SetProgramName("printTupleCountTable"); clp.SetProgramSummary("Count the number of occurrences of every k-mer in a file."); clp.RegisterStringOption("table", &tableFileName, "Output table name.", true); clp.RegisterIntOption("wordsize", &tupleSize, "Size of words to count", CommandLineParser::NonNegativeInteger, false); clp.RegisterStringListOption("reads", &sequenceFiles, "All sequences.", false); clp.RegisterPreviousFlagsAsHidden(); vector opts; if (argc == 2) { string fastaFileName = argv[1]; sequenceFiles.push_back(fastaFileName); tableFileName = fastaFileName + ".ctab"; } else { clp.ParseCommandLine(argc, argv, opts); } tm.tupleSize = tupleSize; tm.InitializeMask(); ofstream tableOut; CrucialOpen(tableFileName, tableOut, std::ios::out| std::ios::binary); CountTable table; table.InitCountTable(tm); int i; FASTASequence seq; for (i = 0; i < sequenceFiles.size(); i++ ){ FASTAReader reader; reader.Init(sequenceFiles[i]); while (reader.GetNext(seq)) { seq.ToUpper(); table.AddSequenceTupleCountsLR(seq); } } table.Write(tableOut); return 0; } blasr-8e668beae0dda1da6914586fb458182c6c3c7482/extrautils/SALS.cpp000066400000000000000000000017431260737656700233370ustar00rootroot00000000000000#include #include #include "utils.hpp" #include 
"suffixarray/SuffixArray.hpp" #include "suffixarray/SuffixArrayTypes.hpp" using namespace std; int main(int argc, char* argv[]) { if (argc <= 1) { cout << "sals checks if a suffix array has lookup table or not." < #include #include "NucConversion.hpp" #include "FASTASequence.hpp" #include "FASTAReader.hpp" #include "suffixarray/SuffixArray.hpp" #include "suffixarray/SuffixArrayTypes.hpp" #include "suffixarray/ssort.hpp" #include "algorithms/sorting/qsufsort.hpp" void PrintUsage() { cout << "samodify changes word size of input suffix array." << endl; cout << "Usage: samodify in.sa genome.fasta out.sa [-blt p]" << endl; cout << " -blt p Build a lookup table on prefixes of length 'p' " << endl; } int main(int argc, char* argv[]) { if (argc < 4) { PrintUsage(); exit(1); } int argi = 1; string saInFile = argv[argi++]; string genomeFileName = argv[argi++]; string saOutFile = argv[argi++]; vector inFiles; int doBLT = 0; int doBLCP = 0; int bltPrefixLength = 0; int lcpLength = 0; int parsingOptions = 0; while (argi < argc) { if (strcmp(argv[argi], "-blt") == 0) { doBLT = 1; bltPrefixLength = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-blcp") == 0) { doBLCP = 1; lcpLength = atoi(argv[++argi]); } else { PrintUsage(); cout << "Bad option: " << argv[argi] << endl; exit(1); } ++argi; } // // Read the suffix array to modify. // DNASuffixArray sa; sa.Read(saInFile); FASTAReader reader; reader.Initialize(genomeFileName); FASTASequence seq; reader.ReadAllSequencesIntoOne(seq); if (doBLT) { sa.BuildLookupTable(seq.seq, seq.length, bltPrefixLength); } if (doBLCP) { cout << "LCP Table not yet implemented." 
<< endl; } sa.Write(saOutFile); } blasr-8e668beae0dda1da6914586fb458182c6c3c7482/extrautils/SWMatcher.cpp000066400000000000000000000130521260737656700244260ustar00rootroot00000000000000#include #include #include #include #include "FASTAReader.hpp" #include "FASTASequence.hpp" #include "algorithms/alignment/AlignmentUtils.hpp" #include "algorithms/alignment/DistanceMatrixScoreFunction.hpp" #include "algorithms/alignment/IDSScoreFunction.hpp" #include "algorithms/alignment/SWAlign.hpp" #include "format/StickAlignmentPrinter.hpp" using namespace std; int main(int argc, char* argv[]) { if (argc < 3) { cout << "usage: swMatcher query target [-indel i] [-local] [-showalign] " << endl << " [-type queryfit|overlap|global] [-match m ] [-mismatch m]" << endl << " or [-local] [-queryfit] [-overlap] [-fixedtarget] [-fixedquery]" << endl << " [-printmatrix]"<< endl << " Unless -showalign is specified, output is tabular and in the formt:"< scoreFn( SMRTDistanceMatrix, insertion, deletion); FASTASequence query, target; FASTAReader queryReader, targetReader; queryReader.Init(queryName); targetReader.Init(targetName); if (fixedTarget) { targetReader.GetNext(target); } if (fixedQuery) { queryReader.GetNext(query); } // // Prepare the target database; // // // Prepare the query match set. 
// int seqIndex = 0; vector scoreMat; vector pathMat; int alignScore; MatchedAlignment alignment; if (match != 0) { int i; for (i = 0; i < 4; i++ ) { LocalAlignLowMutationMatrix[i][i] = match; } } int i,j; for (i = 0; i < 5; i++) { for (j = 0; j < 5 ; j++) { if (i == j) continue; SMRTDistanceMatrix[i][j] += 3; } } cout << "qlen tlen score" << endl; while ((fixedQuery or queryReader.GetNext(query)) and (fixedTarget or targetReader.GetNext(target))) { alignment.qName.assign(query.title, query.titleLength); alignment.tName.assign(target.title, target.titleLength); alignment.blocks.clear(); alignment.qPos = 0; alignment.tPos = 0; alignment.qStart = 0; alignment.tStart = 0; if (query.length == 0 or target.length == 0) continue; alignScore = SWAlign(query, target, scoreMat, pathMat, alignment, scoreFn, alignType, false, printMatrix); cout << query.length << " " << target.length << " " << alignScore << endl; cout << alignment.qPos << " " << alignment.QEnd() << " " << alignment.tPos << " " << alignment.TEnd() << endl; if (showAlign) { ComputeAlignmentStats(alignment, query.seq, target.seq, scoreFn); //SMRTDistanceMatrix, indelCost, indelCost); PrintAlignmentStats(alignment, cout); StickPrintAlignment(alignment, query, target, cout); } ++seqIndex; } return 0; } blasr-8e668beae0dda1da6914586fb458182c6c3c7482/extrautils/SimpleShredder.cpp000066400000000000000000000134161260737656700255070ustar00rootroot00000000000000#include #include #include "utils.hpp" #include "FASTAReader.hpp" #include "FASTQSequence.hpp" #include "FASTASequence.hpp" #include "CommandLineParser.hpp" #include "metagenome/FindRandomSequence.hpp" #include "statistics/StatUtils.hpp" using namespace std; int main(int argc, char* argv[]) { string inFileName, readsFileName; DNALength readLength; float coverage = 0; bool noRandInit = false; int numReads = -1; CommandLineParser clp; int qualityValue = 20; bool printFastq = false; int stratify = 0; string titleType = "pacbio"; string fastqType = "illumina"; // or 
"sanger" clp.RegisterStringOption("inFile", &inFileName, "Reference sequence", 0); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterIntOption("readLength", (int*) &readLength, "The length of reads to simulate. The length is fixed.", CommandLineParser::PositiveInteger, "Length of every read.", 0); clp.RegisterFloatOption("coverage", &coverage, "Total coverage (from which the number of reads is calculated", CommandLineParser::PositiveFloat, 0); clp.RegisterFlagOption("nonRandInit", &noRandInit, "Skip initializing the random number generator with time."); clp.RegisterIntOption("nReads", &numReads, "Total number of reads (from which coverage is calculated)", CommandLineParser::PositiveInteger, 0); clp.RegisterStringOption("readsFile", &readsFileName, "Reads output file", 0); clp.RegisterFlagOption("fastq", &printFastq, "Fake fastq output with constant quality value (20)"); clp.RegisterIntOption("quality", &qualityValue, "Value to use for fastq quality", CommandLineParser::PositiveInteger); clp.RegisterIntOption("stratify", &stratify, "Sample a read every 'stratify' bases, rather than randomly.", CommandLineParser::PositiveInteger); clp.RegisterStringOption("titleType", &titleType, "Set the name of the title: 'pacbio'|'illumina'"); clp.RegisterStringOption("fastqType", &fastqType, "Set the type of fastq: 'illumina'|'sanger'"); vector leftovers; clp.ParseCommandLine(argc, argv, leftovers); if (!noRandInit) { InitializeRandomGeneratorWithTime(); } FASTAReader inReader; inReader.Init(inFileName); vector reference; inReader.ReadAllSequences(reference); ofstream readsFile; if (readsFileName == "") { cout << "ERROR. You must specify a reads file." 
<< endl; exit(1); } CrucialOpen(readsFileName, readsFile, std::ios::out); ofstream sangerFastqFile; if (fastqType == "sanger") { string sangerFastqFileName = readsFileName + ".fastq"; CrucialOpen(sangerFastqFileName, sangerFastqFile, std::ios::out); } DNALength refLength = 0; int i; for (i = 0; i < reference.size(); i++) { refLength += reference[i].length; } if (numReads == -1 and coverage == 0 and stratify == 0) { cout << "ERROR, you must specify either coverage, nReads, or stratify." << endl; exit(1); } else if (numReads == -1) { numReads = (refLength / readLength) * coverage; } if (stratify) { if (!readLength) { cout << "ERROR. If you are using stratification, a read length must be specified." << endl; exit(1); } } DNASequence sampleSeq; sampleSeq.length = readLength; int maxRetry = 10000000; int retryNumber = 0; DNALength seqIndex, seqPos; if (stratify) { seqIndex = 0; seqPos = 0; } DNALength origReadLength = readLength; for (i = 0; stratify or i < numReads; i++) { if (stratify == 0) { FindRandomPos(reference, seqIndex, seqPos, readLength ); } else { // // find the next start pos, or bail if done // if (seqPos >= reference[seqIndex].length) { if (seqIndex == reference.size() - 1) { break; } else { seqIndex = seqIndex + 1; seqPos = 0; continue; } } readLength = min(reference[seqIndex].length - seqPos, origReadLength); } sampleSeq.seq = &reference[seqIndex].seq[seqPos]; int j; int gappedRead = 0; string title; stringstream titleStrm; if (titleType == "pacbio") { titleStrm << i << "|"<< reference[seqIndex].GetName() << "|" << seqPos << "|" << seqPos + readLength; } else if (titleType == "illumina") { titleStrm << "SE_" << i << "_0@" << seqPos << "-"<" << title << endl; sampleSeq.PrintSeq(readsFile); } else { FASTQSequence fastqSampleSeq; fastqSampleSeq.CopyTitle(title); fastqSampleSeq.seq = sampleSeq.seq; fastqSampleSeq.length = sampleSeq.length; fastqSampleSeq.qual.data = new unsigned char[sampleSeq.length]; fill(fastqSampleSeq.qual.data, fastqSampleSeq.qual.data 
+ sampleSeq.length, qualityValue); if (fastqType == "illumina") { fastqSampleSeq.PrintFastq(readsFile, fastqSampleSeq.length+1); } else { fastqSampleSeq.PrintSeq(readsFile); fastqSampleSeq.PrintQual(sangerFastqFile); } delete[] fastqSampleSeq.qual.data; delete[] fastqSampleSeq.title; } if (stratify) { seqPos += readLength; } } return 0; } blasr-8e668beae0dda1da6914586fb458182c6c3c7482/extrautils/StoreQualityByContextFromCmpH5.cpp000066400000000000000000000236741260737656700305720ustar00rootroot00000000000000#include "files/ReaderAgglomerate.hpp" #include "SMRTSequence.hpp" #include "utils/FileOfFileNames.hpp" #include "simulator/ContextSet.hpp" #include "simulator/OutputSampleListSet.hpp" #include "datastructures/alignment/CmpFile.hpp" #include "HDFCmpFile.hpp" #include "format/StickAlignmentPrinter.hpp" class ScoredLength { public: int score, length; int operator<(const ScoredLength &rhs) const { return score < rhs.score; } ScoredLength(int s, int l) : score(s), length(l) {} ScoredLength() {} }; void PrintUsage() { cout << "cmpH5StoreQualityByContext - grab quality values from cmp.h5 files until minimum requirements for the number of times a context has been sampled are met." << endl; cout << "usage: cmpH5StoreQualityByContext aligned_reads.cmp.h5 output.qbc [options] " << endl; cout << "options: " << endl << " -contextLength L The length of the context to sample (default: 5) " << endl << " -minSamples S(500) Report pass if all contexts are sampled" << endl << " at least S times." << endl << " -maxSamples S(1000) Stop sampling a context once it has reached" << endl << " S samples." 
<< endl << " -onlyMaxLength" < maxLengthMap; OutputSampleListSet samples(contextLength); SMRTSequence read; ofstream sampleOut; CrucialOpen(outFileName, sampleOut, std::ios::out|std::ios::binary); int fileNameIndex; int numContextsReached = 0; int numContexts = 1 << (contextLength*2); ReaderAgglomerate reader; samples.keyLength = contextLength; HDFCmpFile cmpReader; cmpReader.IncludeField("QualityValue"); cmpReader.IncludeField("DeletionQV"); cmpReader.IncludeField("InsertionQV"); cmpReader.IncludeField("SubstitutionQV"); cmpReader.IncludeField("SubstitutionTag"); cmpReader.IncludeField("DeletionTag"); cmpReader.IncludeField("PulseIndex"); cmpReader.IncludeField("WidthInFrames"); cmpReader.IncludeField("PreBaseFrames"); if (cmpReader.Initialize(cmpH5FileName, H5F_ACC_RDWR) == 0) { cout << "ERROR, could not open the cmp file." << endl; exit(1); } cout << "Reading cmp file." << endl; CmpFile cmpFile; cmpReader.ReadAlignmentDescriptions(cmpFile); cmpReader.ReadStructure(cmpFile); cout << "done reading structure."< alignmentToBaseMap; for (alignmentIndex = 0; alignmentIndex < nAlignments and !samples.Sufficient(); alignmentIndex++) { // // For ease of use, store the length of the alignment to make another model. 
// ByteAlignment alignmentArray; cmpReader.ReadAlignmentArray(alignmentIndex, alignmentArray); Alignment alignment; ByteAlignmentToAlignment(alignmentArray, alignment); string readSequence, refSequence; readSequence.resize(alignmentArray.size()); refSequence.resize(alignmentArray.size()); DNASequence readDNA, refDNA; ByteAlignmentToQueryString(&alignmentArray[0], alignmentArray.size(), &readSequence[0]); ByteAlignmentToRefString(&alignmentArray[0], alignmentArray.size(), &refSequence[0]); RemoveGaps(readSequence, readSequence); RemoveGaps(refSequence, refSequence); readDNA.seq = (Nucleotide*) readSequence.c_str(); readDNA.length = readSequence.size(); refDNA.seq = (Nucleotide*) refSequence.c_str(); refDNA.length = refSequence.size(); CmpAlignment cmpAlignment; cmpReader.ImportReadFromCmpH5(alignmentIndex, cmpAlignment, read); CreateAlignmentToSequenceMap(alignmentArray, alignmentToBaseMap); if (read.length < contextLength) { continue; } int subreadLength = (cmpFile.alnInfo.alignments[alignmentIndex].GetQueryEnd() - cmpFile.alnInfo.alignments[alignmentIndex].GetQueryStart()); if (onlyMaxLength == false) { samples.lengths.push_back(subreadLength); } else { int score = (cmpAlignment.GetNMatch() - cmpAlignment.GetNMismatch() - cmpAlignment.GetNInsertions() - cmpAlignment.GetNDeletions()); stringstream nameStrm; nameStrm << cmpAlignment.GetMovieId() << "_" << cmpAlignment.GetHoleNumber(); string nameStr = nameStrm.str(); if (maxLengthMap.find(nameStr) == maxLengthMap.end()) { maxLengthMap[nameStr] = ScoredLength(score, subreadLength); } } int sampleEnd = alignmentArray.size() - contextLength/2; int a; for (a = contextLength/2; a < sampleEnd; a++) { // Make sure the context begins on a real nucleotide. 
while (a < sampleEnd and ((RefChar[alignmentArray[a]] == ' '))) {a++;} // // Move ab back to an index where there are contextLength/2 non-gap // characters, counted by nb // int ab; //num bases int ae; //alignment end ab = a-1; int nb = 0, ne = 0; while (true) { if (RefChar[alignmentArray[ab]] != ' ') { nb++; } if (ab == 0 or nb == contextLength/2) break; ab--; } // // Advance ae to an index where there are contextLength/2 non-gap // characters, counted by ne. // ae = a + 1; while (ae < alignmentArray.size() and ne < contextLength/ 2) { if (RefChar[alignmentArray[ae]] != ' ') { ne++; } ae++; } // // Make sure there are no edge effects that prevent a context of the correct length from being assigned. // if (nb + ne + 1 != contextLength) { continue; } int ai; string context; for (ai = ab; ai < ae; ai++) { if (RefChar[alignmentArray[ai]] != ' ') { context.push_back(RefChar[alignmentArray[ai]]); } } assert(context.size() == contextLength); // // Now create the context. // OutputSample sample; // // This context is a deletion, create that. // sample.type = OutputSample::Deletion; // // This context is either an insertion or substitution // // Look to see if the previous aligned position was an // insertion, and move back as far as the insertion extends. 
int aq = a-1; int sampleLength; if (QueryChar[alignmentArray[a]] == ' ') { sample.type = OutputSample::Deletion; sampleLength = 0; } else if (RefChar[alignmentArray[aq]] == ' ') { while (aq > 0 and RefChar[alignmentArray[aq]] == ' ' and QueryChar[alignmentArray[aq]] != ' ') { aq--; } sample.type = OutputSample::Insertion; sampleLength = a - aq; } else if (QueryChar[alignmentArray[a]] == RefChar[alignmentArray[aq]]) { sample.type = OutputSample::Match; sampleLength = 1; } else { sample.type = OutputSample::Substitution; sampleLength = 1; } sample.Resize(sampleLength); if (sampleLength > 0) { int seqPos = alignmentToBaseMap[aq]; if (seqPos < read.length) { sample.CopyFromSeq(read, seqPos, sampleLength); string nucs; int n; for (n = 0; n < sample.nucleotides.size(); n++) { char c = sample.nucleotides[n]; assert(c == 'A' or c == 'T' or c == 'G' or c == 'C'); nucs.push_back(sample.nucleotides[n]); } } } samples.AppendOutputSample(context, sample); } read.Free(); } if (onlyMaxLength) { map::iterator maxScoreIt; for (maxScoreIt = maxLengthMap.begin(); maxScoreIt != maxLengthMap.end(); ++maxScoreIt) { cout << maxScoreIt->second.length << endl; samples.lengths.push_back(maxScoreIt->second.length); } } samples.Write(sampleOut); return 0; } blasr-8e668beae0dda1da6914586fb458182c6c3c7482/extrautils/SuffixArrayToBWT.cpp000066400000000000000000000020771260737656700257210ustar00rootroot00000000000000#include "FASTASequence.hpp" #include "FASTAReader.hpp" #include "suffixarray/SuffixArrayTypes.hpp" #include "suffixarray/SuffixArray.hpp" #include "bwt/BWT.hpp" #include #include using namespace std; int main(int argc, char* argv[]) { if (argc < 4) { cout << "usage: sa2bwt genomeFileName suffixArray bwt [-debug]" << endl; exit(1); } string genomeFileName = argv[1]; string suffixArrayFileName = argv[2]; string bwtFileName = argv[3]; int storeDebugInformation = 0; int argi = 4; while(argi < argc) { if (strcmp(argv[argi], "-debug") == 0) { storeDebugInformation = 1; } ++argi; } ofstream 
bwtOutFile; CrucialOpen(bwtFileName, bwtOutFile, std::ios::out|std::ios::binary); FASTAReader reader; reader.Init(genomeFileName); FASTASequence seq; reader.ReadAllSequencesIntoOne(seq); DNASuffixArray suffixArray; suffixArray.Read(suffixArrayFileName); Bwt bwt; bwt.InitializeFromSuffixArray(seq, suffixArray.index, storeDebugInformation ); bwt.Write(bwtOutFile); return 0; } blasr-8e668beae0dda1da6914586fb458182c6c3c7482/extrautils/ctest/000077500000000000000000000000001260737656700232065ustar00rootroot00000000000000blasr-8e668beae0dda1da6914586fb458182c6c3c7482/extrautils/ctest/alchemy.t000066400000000000000000000012621260737656700250160ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Set up the executable: alchemy. $ EXEC=$TESTDIR/../alchemy test_alchemy.cmp.h5 was generated by pbalign.py $DATDIR/test_alchemy_read.fa $DATDIR/test_alchemy_ref.fa test_alchemy.cmp.h5 $ ./cmpH5StoreQualityByContext $DATDIR/test_alchemy.cmp.h5 $OUTDIR/test_alchemy.qbc -contextLength 3 $ $EXEC $DATDIR/ecoli_out.qbc -genome $DATDIR/ecoli_reference.fasta -numBasesPerFile 100000 -baseFileName 'this_bas_file' -movieName $OUTDIR/alchemy_ $ echo $? 0 pls2fasta can be successfully applied to the simulated bas.h5 file. $ pls2fasta *.bas.h5 $OUTDIR/test_alchemy_pls2fasta.fa [INFO] * [pls2fasta] started. (glob) [INFO] * [pls2fasta] ended. (glob) blasr-8e668beae0dda1da6914586fb458182c6c3c7482/extrautils/ctest/bwt2sa.t000066400000000000000000000006611260737656700246000ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Noticed that pipeline fa->sa->bwt works OK, however fa->sa->bwt->sa does not generate identical suffix array. Set up the executable: bwt2sa. $ EXEC=$TESTDIR/../bwt2sa Define tmporary files $ TMP1=$OUTDIR/$$.tmp.out $ TMP2=$OUTDIR/$$.tmp.stdout Make OUTDIR $ mkdir -p $OUTDIR $ SA=$OUTDIR/ecoli_reference.bwt2sa.sa $ BWT=$DATDIR/ecoli_reference.bwt $ $EXEC $BWT $SA $ echo $? 
0 blasr-8e668beae0dda1da6914586fb458182c6c3c7482/extrautils/ctest/ccsh5tobam.t000066400000000000000000000005341260737656700254250ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Set up the executable: ccsh5tobam $ EXEC=$TESTDIR/../ccsh5tobam $ SMRTWRAP=/mnt/secondary/Smrtpipe/builds/Internal_Mainline_Nightly_LastSuccessfulBuild/smrtcmds/bin/smrtwrap $ $SMRTWRAP python $SCRIPTDIR/test_ccsh5tobam.py $EXEC $DATDIR/test_ccsh5tobam/input.fofn $OUTDIR/test_ccsh5tobam.bam $ echo $? 0 blasr-8e668beae0dda1da6914586fb458182c6c3c7482/extrautils/ctest/cmpH5StoreQualityByContext.t000066400000000000000000000006131260737656700305750ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Set up the executable: cmpH5StoreQualityByContext. $ EXEC=$TESTDIR/../cmpH5StoreQualityByContext Define tmporary files $ TMP1=$OUTDIR/$$.tmp.out $ TMP2=$OUTDIR/$$.tmp.stdout Make OUTDIR $ mkdir -p $OUTDIR $ $EXEC $DATDIR/ecoli_out.cmp.h5 $OUTDIR/ecoli_out.qbc -contextLength 8 -onlyMaxLength -minSamples 600 -maxSamples 1500 > $TMP1 $ echo $? 0 blasr-8e668beae0dda1da6914586fb458182c6c3c7482/extrautils/ctest/printTupleCountTable.t000066400000000000000000000006331260737656700275240ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Set up the executable: printTupleCountTable. $ EXEC=$TESTDIR/../printTupleCountTable Define tmporary files $ TMP1=$OUTDIR/$$.tmp.out $ TMP2=$OUTDIR/$$.tmp.stdout Make OUTDIR $ mkdir -p $OUTDIR $ $EXEC $OUTDIR/ecoli_tuple.table $DATDIR/ecoli_reference.fasta $ echo $? 0 $ md5sum $OUTDIR/ecoli_tuple.table |cut -f 1 -d ' ' 3f1ae70fd009827d6d6e56050341b5df blasr-8e668beae0dda1da6914586fb458182c6c3c7482/extrautils/ctest/sa2bwt.t000066400000000000000000000005441260737656700246000ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Set up the executable: sa2bwt. 
$ EXEC=$TESTDIR/../sa2bwt Define tmporary files $ TMP1=$OUTDIR/$$.tmp.out $ TMP2=$OUTDIR/$$.tmp.stdout Make OUTDIR $ mkdir -p $OUTDIR $ FA=$DATDIR/ecoli_reference.fasta $ SA=$DATDIR/ecoli_reference.sa $ BWT=$OUTDIR/ecoli_reference.bwt $ $EXEC $FA $SA $BWT $ echo $? 0 blasr-8e668beae0dda1da6914586fb458182c6c3c7482/extrautils/ctest/sals.t000066400000000000000000000005051260737656700243350ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Set up the executable: sals. $ EXEC=$TESTDIR/../sals Define tmporary files $ TMP1=$OUTDIR/$$.tmp.out $ TMP2=$OUTDIR/$$.tmp.stdout Make OUTDIR $ mkdir -p $OUTDIR $ $EXEC $DATDIR/ecoli_reference.sa * has a suffix array. * has a lookup table for word size. 8 $ echo $? 0 blasr-8e668beae0dda1da6914586fb458182c6c3c7482/extrautils/ctest/samodify.t000066400000000000000000000006641260737656700252140ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Set up the executable: samodify. $ EXEC=$TESTDIR/../samodify Define tmporary files $ TMP1=$OUTDIR/$$.tmp.out $ TMP2=$OUTDIR/$$.tmp.stdout Make OUTDIR $ mkdir -p $OUTDIR $ $EXEC $DATDIR/ecoli_reference.sa $DATDIR/ecoli_reference.fasta $OUTDIR/ecoli_reference_blt13.sa -blt 13 $ echo $? 0 $ md5sum $OUTDIR/ecoli_reference_blt13.sa | cut -f 1 -d ' ' ac70eef5a6e03ae8177f27b3aeacc4c5 blasr-8e668beae0dda1da6914586fb458182c6c3c7482/extrautils/ctest/setup.sh000077500000000000000000000004551260737656700247110ustar00rootroot00000000000000# Set up directories CURDIR=$TESTDIR REMOTEDIR=/mnt/secondary-siv/testdata/BlasrTestData/ctest DATDIR=$REMOTEDIR/data OUTDIR=$CURDIR/out STDDIR=$REMOTEDIR/stdout SCRIPTDIR=$REMOTEDIR/scripts/ # Define tmporary files TMP1=$OUTDIR/$$.tmp.out TMP2=$OUTDIR/$$.tmp.stdout # Make OUTDIR mkdir -p $OUTDIR blasr-8e668beae0dda1da6914586fb458182c6c3c7482/extrautils/ctest/swmatcher.t000066400000000000000000000005671260737656700254000ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Set up the executable: swMather. 
$ EXEC=$TESTDIR/../swMatcher Define tmporary files $ TMP1=$OUTDIR/$$.tmp.out $ TMP2=$OUTDIR/$$.tmp.stdout Make OUTDIR $ mkdir -p $OUTDIR $ FA=$DATDIR/ecoli_subset.fasta $ $EXEC $FA $FA 10 -local > $OUTDIR/swmatcher.out $ echo $? 0 $ diff $OUTDIR/swmatcher.out $STDDIR/swmatcher.stdout blasr-8e668beae0dda1da6914586fb458182c6c3c7482/extrautils/makefile000066400000000000000000000025471260737656700235740ustar00rootroot00000000000000.PHONY=all cramtests SRCDIR:=$(dir $(realpath $(lastword $(MAKEFILE_LIST)))) -include ${CURDIR}/../defines.mk include ${SRCDIR}/../rules.mk CXXOPTS := -std=c++0x -pedantic \ -Wall -Wuninitialized -Wno-div-by-zero \ -MMD -MP -w -fpermissive CXXFLAGS += ${CXXOPTS} ${GCXXFLAGS} EXE = sa2bwt bwt2sa alchemy excrep evolve bsdb simpleShredder swMatcher \ samodify sals printTupleCountTable cmpH5StoreQualityByContext ccsh5tobam LD_LIBRARY_PATH=${HDF5_LIB}:${LIBBLASR_LIB}:${LIBPBIHDF_LIB}:${LIBPBDATA_LIB} export LD_LIBRARY_PATH vpath %.cpp ${SRCDIR} all: ${EXE} ${EXE}: ${CXX} -o $@ $< ${CXXFLAGS} ${CPPFLAGS} -MF"${@:%=%.d}" ${STATIC} ${LDFLAGS} ${LDLIBS} sa2bwt: SuffixArrayToBWT.o bwt2sa: BwtToSuffixArray.o alchemy: BasH5Simulator.o excrep: ExciseRepeats.o evolve: Evolve.o bsdb: BuildSequenceDB.o simpleShredder: SimpleShredder.o swMatcher: SWMatcher.o samodify: SAModify.o sals: SALS.o printTupleCountTable: PrintTupleCountTable.o cmpH5StoreQualityByContext: StoreQualityByContextFromCmpH5.o ccsh5tobam: CCSH5ToBam.o CTESTS := \ ctest/alchemy.t ctest/ccsh5tobam.t ctest/printTupleCountTable.t ctest/sals.t ctest/swmatcher.t \ ctest/bwt2sa.t ctest/cmpH5StoreQualityByContext.t ctest/sa2bwt.t ctest/samodify.t cramtests: ${EXE} cram -v --shell=/bin/bash ${CTESTS} clean: @rm -f ${EXE} @rm -f *.d *.o 
blasr-8e668beae0dda1da6914586fb458182c6c3c7482/include/000077500000000000000000000000001260737656700213035ustar00rootroot00000000000000blasr-8e668beae0dda1da6914586fb458182c6c3c7482/include/BlasrAlign.hpp000066400000000000000000000127411260737656700240370ustar00rootroot00000000000000// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. // // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted (subject to the limitations in the // disclaimer below) provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // // * Neither the name of Pacific Biosciences nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE // GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC // BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE // DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF // USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND // ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT // OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF // SUCH DAMAGE. // Author: Mark Chaisson #ifndef __BLASR_ALIGN_HPP_ #define __BLASR_ALIGN_HPP_ #include "BlasrHeaders.h" #include "BlasrMiscs.hpp" //------------------MAP READS---------------------------------// template void MapRead(T_Sequence &read, T_Sequence &readRC, T_RefSequence &genome, T_SuffixArray &sarray, BWT &bwt, SeqBoundaryFtr &seqBoundary, T_TupleCountTable &ct, SequenceIndexDatabase &seqdb, MappingParameters ¶ms, MappingMetrics &metrics, vector &alignmentPtrs, MappingBuffers &mappingBuffers, MappingIPC *mapData, MappingSemaphores & semaphores); template void MapRead(T_Sequence &read, T_Sequence &readRC, vector &alignmentPtrs, MappingBuffers &mappingBuffers, MappingIPC *mapData, MappingSemaphores & semaphores); /* void MapReads(MappingData *mapData); */ //------------------MAKE ALIGNMENTS---------------------------// template void AlignIntervals(T_TargetSequence &genome, T_QuerySequence &read, T_QuerySequence &rcRead, WeightedIntervalSet &weightedIntervals, int mutationCostMatrix[][5], int ins, int del, int sdpTupleSize, int useSeqDB, SequenceIndexDatabase &seqDB, vector &alignments, MappingParameters ¶ms, MappingBuffers &mappingBuffers, int procId=0); template void PairwiseLocalAlign(T_Sequence &qSeq, T_RefSequence &tSeq, int k, MappingParameters ¶ms, T_AlignmentCandidate &alignment, MappingBuffers &mappingBuffers, AlignmentType alignType=Global); // Extend target aligned sequence of the input alignement to both ends // by flankSize 
bases. Update alignment->tAlignedSeqPos, // alignment->tAlignedSeqLength and alignment->tAlignedSeq. void FlankTAlignedSeq(T_AlignmentCandidate * alignment, SequenceIndexDatabase &seqdb, DNASequence & genome, int flankSize); // Align a subread of a SMRT sequence to target sequence of an alignment. // Input: // subread - a subread of a SMRT sequence. // unrolledRead - the full SMRT sequence. // alignment - an alignment. // passDirection - whether or not the subread has the // same direction as query of the alignment. // 0 = true, 1 = false. // subreadInterval - [start, end) interval of the subread in the // SMRT read. // subreadIndex - index of the subread in allReadAlignments. // params - mapping paramters. // Output: // allReadAlignments - where the sequence and alignments of the // subread are saved. // threadOut - an out stream for debugging the current thread. void AlignSubreadToAlignmentTarget(ReadAlignments & allReadAlignments, SMRTSequence & subread, SMRTSequence & unrolledRead, T_AlignmentCandidate * alignment, int passDirection, ReadInterval & subreadInterval, int subreadIndex, MappingParameters & params, MappingBuffers & mappingBuffers, ostream & threadOut); #include "BlasrAlignImpl.hpp" #endif blasr-8e668beae0dda1da6914586fb458182c6c3c7482/include/BlasrAlignImpl.hpp000066400000000000000000002127171260737656700246660ustar00rootroot00000000000000// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. // // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted (subject to the limitations in the // disclaimer below) provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. 
// // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // // * Neither the name of Pacific Biosciences nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE // GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC // BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE // DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF // USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND // ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT // OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF // SUCH DAMAGE. 
// Author: Mark Chaisson #ifndef _BLASR_ALIGN_IMPL_HPP_ #define _BLASR_ALIGN_IMPL_HPP_ template void MapRead(T_Sequence &read, T_Sequence &readRC, T_RefSequence &genome, T_SuffixArray &sarray, BWT &bwt, SeqBoundaryFtr &seqBoundary, T_TupleCountTable &ct, SequenceIndexDatabase &seqdb, MappingParameters ¶ms, MappingMetrics &metrics, vector &alignmentPtrs, MappingBuffers &mappingBuffers, MappingIPC *mapData, MappingSemaphores & semaphores) { bool matchFound; WeightedIntervalSet topIntervals(params.nCandidates); int numKeysMatched=0, rcNumKeysMatched=0; int expand = params.minExpand; metrics.clocks.total.Tick(); int nTotalCells = 0; int forwardNumBasesMatched = 0, reverseNumBasesMatched = 0; do { matchFound = false; mappingBuffers.matchPosList.clear(); mappingBuffers.rcMatchPosList.clear(); alignmentPtrs.clear(); topIntervals.clear(); params.anchorParameters.expand = expand; metrics.clocks.mapToGenome.Tick(); if (params.useSuffixArray) { params.anchorParameters.lcpBoundsOutPtr = mapData->lcpBoundsOutPtr; numKeysMatched = MapReadToGenome(genome, sarray, read, params.lookupTableLength, mappingBuffers.matchPosList, params.anchorParameters); // // Only print values for the read in forward direction (and only // the first read). // mapData->lcpBoundsOutPtr = NULL; if (!params.forwardOnly) { rcNumKeysMatched = MapReadToGenome(genome, sarray, readRC, params.lookupTableLength, mappingBuffers.rcMatchPosList, params.anchorParameters); } } else if (params.useBwt){ numKeysMatched = MapReadToGenome(bwt, read, read.SubreadStart(), read.SubreadEnd(), mappingBuffers.matchPosList, params.anchorParameters, forwardNumBasesMatched); if (!params.forwardOnly) { rcNumKeysMatched = MapReadToGenome(bwt, readRC, readRC.SubreadStart(), readRC.SubreadEnd(), mappingBuffers.rcMatchPosList, params.anchorParameters, reverseNumBasesMatched); } } // // Look to see if only the anchors are printed. 
if (params.anchorFileName != "") { int i; if (params.nProc > 1) { #ifdef __APPLE__ sem_wait(semaphores.writer); #else sem_wait(&semaphores.writer); #endif } *mapData->anchorFilePtr << read.title << endl; for (i = 0; i < mappingBuffers.matchPosList.size(); i++) { *mapData->anchorFilePtr << mappingBuffers.matchPosList[i] << endl; } *mapData->anchorFilePtr << readRC.title << " (RC) " << endl; for (i = 0; i < mappingBuffers.rcMatchPosList.size(); i++) { *mapData->anchorFilePtr << mappingBuffers.rcMatchPosList[i] << endl; } if (params.nProc > 1) { #ifdef __APPLE__ sem_post(semaphores.writer); #else sem_post(&semaphores.writer); #endif } } metrics.totalAnchors += mappingBuffers.matchPosList.size() + mappingBuffers.rcMatchPosList.size(); metrics.clocks.mapToGenome.Tock(); metrics.clocks.sortMatchPosList.Tick(); SortMatchPosList(mappingBuffers.matchPosList); SortMatchPosList(mappingBuffers.rcMatchPosList); metrics.clocks.sortMatchPosList.Tock(); PValueWeightor lisPValue(read, genome, ct.tm, &ct); MultiplicityPValueWeightor lisPValueByWeight(genome); LISSumOfLogPWeightor > lisPValueByLogSum(genome); LISSizeWeightor > lisWeightFn; IntervalSearchParameters intervalSearchParameters; intervalSearchParameters.globalChainType = params.globalChainType; intervalSearchParameters.advanceHalf = params.advanceHalf; intervalSearchParameters.warp = params.warp; intervalSearchParameters.fastMaxInterval = params.fastMaxInterval; intervalSearchParameters.aggressiveIntervalCut = params.aggressiveIntervalCut; intervalSearchParameters.verbosity = params.verbosity; // // If specified, only align a band from the anchors. 
// DNALength squareRefLength = read.length * 1.25 + params.limsAlign; if (params.limsAlign != 0) { int fi; for (fi = 0; fi < mappingBuffers.matchPosList.size(); fi++) { if (mappingBuffers.matchPosList[fi].t >= squareRefLength) { break; } } if (fi < mappingBuffers.matchPosList.size()) { mappingBuffers.matchPosList.resize(fi); } } metrics.clocks.findMaxIncreasingInterval.Tick(); // // For now say that something that has a 50% chance of happening // by chance is too high of a p value. This is probably many times // the size. // intervalSearchParameters.maxPValue = log(0.5); intervalSearchParameters.aboveCategoryPValue = -300; VarianceAccumulator accumPValue; VarianceAccumulator accumWeight; VarianceAccumulator accumNBases; mappingBuffers.clusterList.Clear(); mappingBuffers.revStrandClusterList.Clear(); // // Remove anchors that are fully encompassed by longer ones. This // speeds up limstemplate a lot. // RemoveOverlappingAnchors(mappingBuffers.matchPosList); RemoveOverlappingAnchors(mappingBuffers.rcMatchPosList); if (params.pValueType == 0) { int original = mappingBuffers.matchPosList.size(); int numMerged = 0; if (params.printDotPlots) { ofstream dotPlotOut; string dotPlotName = string(read.title) + ".anchors"; CrucialOpen(dotPlotName, dotPlotOut, std::ios::out); int mp; for (mp = 0; mp < mappingBuffers.matchPosList.size(); mp++ ){ dotPlotOut << mappingBuffers.matchPosList[mp].q << " " << mappingBuffers.matchPosList[mp].t << " " << mappingBuffers.matchPosList[mp].l << " " << endl; } dotPlotOut.close(); } /* This is an optimization that is being tested out that places a grid over the area where there are anchors, and then finds an increasing maximally weighted path through the grid. The weight of a cell in the grid is the sum of the number of anchors in it. All other anchors are to be removed. This will likely only work for LIMSTemplate sequences, or other sequences with little structural variation. 
FindBand(mappingBuffers.matchPosList, refCopy, read, 100); */ FindMaxIncreasingInterval(Forward, mappingBuffers.matchPosList, // allow for indels to stretch out the mapping of the read. (DNALength) ((read.SubreadLength()) * (1 + params.indelRate)), params.nCandidates, seqBoundary, lisPValue,//lisPValue2, lisWeightFn, topIntervals, genome, read, intervalSearchParameters, &mappingBuffers.globalChainEndpointBuffer, mappingBuffers.clusterList, accumPValue, accumWeight, accumNBases, read.title); // Uncomment when the version of the weight functor needs the sequence. mappingBuffers.clusterList.ResetCoordinates(); FindMaxIncreasingInterval(Reverse, mappingBuffers.rcMatchPosList, (DNALength) ((read.SubreadLength()) * (1 + params.indelRate)), params.nCandidates, seqBoundary, lisPValue,//lisPValue2 lisWeightFn, topIntervals, genome, readRC, intervalSearchParameters, &mappingBuffers.globalChainEndpointBuffer, mappingBuffers.revStrandClusterList, accumPValue, accumWeight, accumNBases, read.title); } else if (params.pValueType == 1) { FindMaxIncreasingInterval(Forward, mappingBuffers.matchPosList, // allow for indels to stretch out the mapping of the read. 
(DNALength) ((read.SubreadLength()) * (1 + params.indelRate)), params.nCandidates, seqBoundary, lisPValueByWeight, // different from pvaltype == 2 and 0 lisWeightFn, topIntervals, genome, read, intervalSearchParameters, &mappingBuffers.globalChainEndpointBuffer, mappingBuffers.clusterList, accumPValue, accumWeight, accumNBases, read.title); mappingBuffers.clusterList.ResetCoordinates(); FindMaxIncreasingInterval(Reverse, mappingBuffers.rcMatchPosList, (DNALength) ((read.SubreadLength()) * (1 + params.indelRate)), params.nCandidates, seqBoundary, lisPValueByWeight, // different from pvaltype == 2 and 0 lisWeightFn, topIntervals, genome, readRC, intervalSearchParameters, &mappingBuffers.globalChainEndpointBuffer, mappingBuffers.revStrandClusterList, accumPValue, accumWeight, accumNBases, read.title); } else if (params.pValueType == 2) { FindMaxIncreasingInterval(Forward, mappingBuffers.matchPosList, // allow for indels to stretch out the mapping of the read. (DNALength) ((read.SubreadLength()) * (1 + params.indelRate)), params.nCandidates, seqBoundary, lisPValueByLogSum, // different from pvaltype == 1 and 0 lisWeightFn, topIntervals, genome, read, intervalSearchParameters, &mappingBuffers.globalChainEndpointBuffer, mappingBuffers.clusterList, accumPValue, accumWeight, accumNBases, read.title); mappingBuffers.clusterList.ResetCoordinates(); FindMaxIncreasingInterval(Reverse, mappingBuffers.rcMatchPosList, (DNALength) ((read.SubreadLength()) * (1 + params.indelRate)), params.nCandidates, seqBoundary, lisPValueByLogSum, // different from pvaltype == 1 and 0 lisWeightFn, topIntervals, genome, readRC, intervalSearchParameters, &mappingBuffers.globalChainEndpointBuffer, mappingBuffers.revStrandClusterList, accumPValue, accumWeight, accumNBases, read.title); } mappingBuffers.clusterList.numBases.insert(mappingBuffers.clusterList.numBases.end(), mappingBuffers.revStrandClusterList.numBases.begin(), mappingBuffers.revStrandClusterList.numBases.end()); 
mappingBuffers.clusterList.numAnchors.insert(mappingBuffers.clusterList.numAnchors.end(), mappingBuffers.revStrandClusterList.numAnchors.begin(), mappingBuffers.revStrandClusterList.numAnchors.end()); metrics.clocks.findMaxIncreasingInterval.Tock(); // // Print verbose output. // WeightedIntervalSet::iterator topIntIt, topIntEnd; topIntEnd = topIntervals.end(); if (params.verbosity > 0) { int topintind = 0; cout << " intv: index start end qstart qend seq_boundary_start seq_boundary_end pvalue " << endl; for (topIntIt = topIntervals.begin();topIntIt != topIntEnd ; ++topIntIt) { cout << " intv: " << topintind << " " << (*topIntIt).start << " " << (*topIntIt).end << " " << (*topIntIt).qStart << " " << (*topIntIt).qEnd << " " << seqBoundary((*topIntIt).start) << " " << seqBoundary((*topIntIt).end) << " " << (*topIntIt).pValue << endl; if (params.verbosity > 2) { int m; for (m = 0; m < (*topIntIt).matches.size(); m++) { cout << " (" << (*topIntIt).matches[m].q << ", " << (*topIntIt).matches[m].t << ", " << (*topIntIt).matches[m].l << ") "; } cout << endl; } ++topintind; } } // // Allocate candidate alignments on the stack. Each interval is aligned. // alignmentPtrs.resize(topIntervals.size()); UInt i; for (i = 0; i < alignmentPtrs.size(); i++ ) { alignmentPtrs[i] = new T_AlignmentCandidate; } metrics.clocks.alignIntervals.Tick(); AlignIntervals( genome, read, readRC, topIntervals, SMRTDistanceMatrix, params.indel, params.indel, params.sdpTupleSize, params.useSeqDB, seqdb, alignmentPtrs, params, mappingBuffers, params.startRead ); /* cout << read.title << endl; for (i = 0; i < alignmentPtrs.size(); i++) { cout << alignmentPtrs[i]->clusterScore << " " << alignmentPtrs[i]->score << endl; } */ StoreRankingStats(alignmentPtrs, accumPValue, accumWeight); std::sort(alignmentPtrs.begin(), alignmentPtrs.end(), SortAlignmentPointersByScore()); metrics.clocks.alignIntervals.Tock(); // // Evalutate the matches that are found for 'good enough'. 
// matchFound = CheckForSufficientMatch(read, alignmentPtrs, params); // // When no proper alignments are found, the loop will resume. // Delete all alignments because they are bad. // if (expand < params.maxExpand and matchFound == false) { DeleteAlignments(alignmentPtrs, 0); } // // Record some metrics that show how long this took to run per base. // if (alignmentPtrs.size() > 0) { metrics.RecordNumAlignedBases(read.length); metrics.RecordNumCells(alignmentPtrs[0]->nCells); } if (matchFound == true) { metrics.totalAnchorsForMappedReads += mappingBuffers.matchPosList.size() + mappingBuffers.rcMatchPosList.size(); } ++expand; } while ( expand <= params.maxExpand and matchFound == false); metrics.clocks.total.Tock(); UInt i; int totalCells = 0; for (i = 0; i< alignmentPtrs.size(); i++) { totalCells += alignmentPtrs[i]->nCells; } metrics.clocks.AddCells(totalCells); int totalBases = 0; for (i = 0; i < alignmentPtrs.size(); i++) { totalBases += alignmentPtrs[i]->qLength; } metrics.clocks.AddBases(totalBases); // // Some of the alignments are to spurious regions. Delete the // references that have too small of a score. // int effectiveReadLength = 0; for (i = 0; i< read.length; i++) { if (read.seq[i] != 'N') effectiveReadLength++; } if (params.sdpFilterType == 0) { RemoveLowQualityAlignments(read, alignmentPtrs, params); } else if (params.sdpFilterType == 1) { RemoveLowQualitySDPAlignments(effectiveReadLength, alignmentPtrs, params); } // // Now remove overlapping alignments. // vector bothQueryStrands; bothQueryStrands.resize(2); bothQueryStrands[Forward] = &read; bothQueryStrands[Reverse] = &readRC; // // Possibly use banded dynamic programming to refine the columns // of an alignment and the alignment score. 
// if (params.refineAlignments) { RefineAlignments(bothQueryStrands, genome, alignmentPtrs, params, mappingBuffers); RemoveLowQualityAlignments(read,alignmentPtrs,params); RemoveOverlappingAlignments(alignmentPtrs, params); } if (params.forPicard) { int a; for (a = 0; a < alignmentPtrs.size(); a++ ) { alignmentPtrs[a]->OrderGapsByType(); } } // // Look to see if the number of anchors found for this read match // what is expected given the expected distribution of number of // anchors. // if (alignmentPtrs.size() > 0) { int clusterIndex; // // Compute some stats on the read. For now this is fixed but will // be updated on the fly soon. // float meanAnchorBasesPerRead, sdAnchorBasesPerRead; float meanAnchorsPerRead, sdAnchorsPerRead; int lookupValue; // // If a very short anchor size was used, or very long min match // size there may be no precomputed distributions for it. // Handle this by bounding the min match by the smallest and // largest values for which there are precomputed statistics. int boundedMinWordMatchLength = min(max(params.minMatchLength, anchorMinKValues[0]), anchorMinKValues[1]); // // Do a similar bounding for match length and accuracy. // int boundedMatchLength = min(max((int) alignmentPtrs[0]->qAlignedSeq.length, anchorReadLengths[0]), anchorReadLengths[1]); int boundedPctSimilarity = min(max((int)alignmentPtrs[0]->pctSimilarity, anchorReadAccuracies[0]), anchorReadAccuracies[1]); lookupValue = LookupAnchorDistribution(boundedMatchLength, boundedMinWordMatchLength, boundedPctSimilarity, meanAnchorsPerRead, sdAnchorsPerRead, meanAnchorBasesPerRead, sdAnchorBasesPerRead); float minExpAnchors = meanAnchorsPerRead - sdAnchorsPerRead; // // The number of standard deviations is just trial and error. 
float minExpAnchorBases = meanAnchorBasesPerRead - 2 * sdAnchorBasesPerRead; if (lookupValue < 0 or minExpAnchorBases < 0) { minExpAnchorBases = 0; } int numSignificantClusters = 0; int totalSignificantClusterSize = 0; int maxClusterSize = 0; int maxClusterIndex = 0; int numAlnAnchorBases, numAlnAnchors, scaledMaxClusterSize; alignmentPtrs[0]->ComputeNumAnchors(boundedMinWordMatchLength, numAlnAnchors, numAlnAnchorBases); int totalAnchorBases = 0; if (numAlnAnchorBases > meanAnchorBasesPerRead + sdAnchorBasesPerRead) { numSignificantClusters = 1; } else { if (alignmentPtrs[0]->score < params.maxScore) { for (clusterIndex = 0; clusterIndex < mappingBuffers.clusterList.numBases.size(); clusterIndex++) { if (mappingBuffers.clusterList.numBases[clusterIndex] > maxClusterSize) { maxClusterSize = mappingBuffers.clusterList.numBases[clusterIndex]; maxClusterIndex = clusterIndex; } } int scaledExpectedClusterSize = maxClusterSize / ((float)numAlnAnchorBases) * minExpAnchorBases; for (clusterIndex = 0; clusterIndex < mappingBuffers.clusterList.numBases.size(); clusterIndex++) { bool isSignificant = false; if (mappingBuffers.clusterList.numBases[clusterIndex] >= scaledExpectedClusterSize) { // cout << mappingBuffers.clusterList.numBases[clusterIndex] << " " << scaledExpectedClusterSize << " " << meanAnchorBasesPerRead << " " << sdAnchorBasesPerRead << endl; ++numSignificantClusters; totalSignificantClusterSize += meanAnchorBasesPerRead; isSignificant = true; } // // The following output block is useful in debugging mapqv // calculation. It should be uncommented and examined when // mapqvs do not look correct. 
// totalAnchorBases += mappingBuffers.clusterList.numBases[clusterIndex]; } } if (lookupValue == 0) { int scaledMaxClusterSize; alignmentPtrs[0]->ComputeNumAnchors(params.minMatchLength, numAlnAnchors, numAlnAnchorBases); scaledMaxClusterSize = ( ((float)numAlnAnchorBases )/ meanAnchorBasesPerRead) * maxClusterSize; } } for (i = 0; i < alignmentPtrs.size(); i++) { alignmentPtrs[i]->numSignificantClusters = numSignificantClusters; } if (mapData->clusterFilePtr != NULL and topIntervals.size() > 0 and alignmentPtrs.size() > 0) { WeightedIntervalSet::iterator intvIt = topIntervals.begin(); if (params.nProc > 1) { #ifdef __APPLE__ sem_wait(semaphores.hitCluster); #else sem_wait(&semaphores.hitCluster); #endif } *mapData->clusterFilePtr << (*intvIt).size << " " << (*intvIt).pValue << " " << (*intvIt).nAnchors << " " << read.length << " " << alignmentPtrs[0]->score << " " << alignmentPtrs[0]->pctSimilarity << " " << " " << minExpAnchors << " " << alignmentPtrs[0]->qAlignedSeq.length << endl; if (params.nProc > 1) { #ifdef __APPLE__ sem_post(semaphores.hitCluster); #else sem_post(&semaphores.hitCluster); #endif } } } // // Assign the query name and strand for each alignment. 
// for (i = 0; i < alignmentPtrs.size(); i++) { T_AlignmentCandidate *aref = alignmentPtrs[i]; if (aref->tStrand == 0) { aref->qName = read.GetName(); } else { aref->qName = readRC.GetName(); } } AssignRefContigLocations(alignmentPtrs, seqdb, genome); } template void MapRead(T_Sequence &read, T_Sequence &readRC, vector &alignmentPtrs, MappingBuffers &mappingBuffers, MappingIPC *mapData, MappingSemaphores & semaphores) { DNASuffixArray sarray; TupleCountTable ct; SequenceIndexDatabase seqdb; T_GenomeSequence genome; BWT *bwtPtr = mapData->bwtPtr; mapData->ShallowCopySuffixArray(sarray); mapData->ShallowCopyReferenceSequence(genome); mapData->ShallowCopySequenceIndexDatabase(seqdb); mapData->ShallowCopyTupleCountTable(ct); SeqBoundaryFtr seqBoundary(&seqdb); return MapRead(read, readRC, genome, // possibly multi fasta file read into one sequence sarray, *bwtPtr, // The suffix array, and the bwt-fm index structures seqBoundary, // Boundaries of contigs in the // genome, alignments do not span // the ends of boundaries. ct, // Count table to use word frequencies in the genome to weight matches. seqdb, // Information about the names of // chromosomes in the genome, and // where their sequences are in the genome. mapData->params,// A huge list of parameters for // mapping, only compile/command // line values set. mapData->metrics, // Keep track of time/ hit counts, // etc.. Not fully developed, but // should be. alignmentPtrs, // Where the results are stored. mappingBuffers, // A class of buffers for structurs // like dyanmic programming // matrices, match lists, etc., that are not // reallocated between calls to // MapRead. They are cleared though. mapData, // Some values that are shared // across threads. 
semaphores); } template void AlignIntervals(T_TargetSequence &genome, T_QuerySequence &read, T_QuerySequence &rcRead, WeightedIntervalSet &weightedIntervals, int mutationCostMatrix[][5], int ins, int del, int sdpTupleSize, int useSeqDB, SequenceIndexDatabase &seqDB, vector &alignments, MappingParameters ¶ms, MappingBuffers &mappingBuffers, int procId) { vector forrev; forrev.resize(2); forrev[Forward] = &read; forrev[Reverse] = &rcRead; // // Use an edit distance scoring function instead of IDS. Although // the IDS should be more accurate, it is more slow, and it is more // important at this stage to have faster alignments than accurate, // since all alignments are rerun using GuidedAlignment later on. // DistanceMatrixScoreFunction distScoreFn(SMRTDistanceMatrix, params.insertion, params.deletion); DistanceMatrixScoreFunction distScoreFn2(SMRTDistanceMatrix, ins, ins); // // Assume there is at least one interval. // if (weightedIntervals.size() == 0) return; WeightedIntervalSet::iterator intvIt = weightedIntervals.begin(); int alignmentIndex = 0; do { T_AlignmentCandidate *alignment = alignments[alignmentIndex]; alignment->clusterWeight= (*intvIt).size; // totalAnchorSize == size alignment->clusterScore = (*intvIt).pValue; // // Advance references. Intervals are stored in reverse order, so // go backwards in the list, and alignments are in forward order. // That should probably be changed. // ++alignmentIndex; // // Try aligning the read to the genome. 
// DNALength matchIntervalStart, matchIntervalEnd; matchIntervalStart = (*intvIt).start; matchIntervalEnd = (*intvIt).end; bool readOverlapsContigStart = false; bool readOverlapsContigEnd = false; int startOverlappedContigIndex = 0; int endOverlappedContigIndex = 0; if (params.verbosity > 0) { cout << "aligning interval : " << read.length << " " << (*intvIt).start << " " << (*intvIt).end << " " << (*intvIt).qStart << " " << (*intvIt).qEnd << " " << matchIntervalStart << " to " << matchIntervalEnd << " " << params.approximateMaxInsertionRate << " " << endl; } assert(matchIntervalEnd >= matchIntervalStart); // // If using a sequence database, check to make sure that the // boundaries of the sequence windows do not overlap with // the boundaries of the reads. If the beginning is before // the boundary, move the beginning up to the start of the read. // If the end is past the end boundary of the read, similarly move // the window boundary to the end of the read boundary. DNALength tAlignedContigStart = 0; int seqDBIndex = 0; // // Stretch the alignment interval so that it is close to where // the read actually starts. 
// DNALength subreadStart = read.SubreadStart(); DNALength subreadEnd = read.SubreadEnd(); if ((*intvIt).GetStrandIndex() == Reverse) { subreadEnd = read.MakeRCCoordinate(read.SubreadStart()) + 1; subreadStart = read.MakeRCCoordinate(read.SubreadEnd()-1); } DNALength lengthBeforeFirstMatch = ((*intvIt).qStart - subreadStart) * params.approximateMaxInsertionRate ; DNALength lengthAfterLastMatch = (subreadEnd - (*intvIt).qEnd) * params.approximateMaxInsertionRate; if (matchIntervalStart < lengthBeforeFirstMatch or params.doGlobalAlignment) { matchIntervalStart = 0; } else { matchIntervalStart -= lengthBeforeFirstMatch; } if (genome.length < matchIntervalEnd + lengthAfterLastMatch or params.doGlobalAlignment) { matchIntervalEnd = genome.length; } else { matchIntervalEnd += lengthAfterLastMatch; } DNALength intervalContigStartPos, intervalContigEndPos; if (useSeqDB) { // // The sequence db index is the one where the actual match is // contained. The matchIntervalStart might be before the sequence // index boundary due to the extrapolation of alignment start by // insertion rate. If this is the case, bump up the // matchIntervalStart to be at the beginning of the boundary. // Modify bounds similarly for the matchIntervalEnd and the end // of a boundary. // seqDBIndex = seqDB.SearchForIndex((*intvIt).start); intervalContigStartPos = seqDB.seqStartPos[seqDBIndex]; if (intervalContigStartPos > matchIntervalStart) { matchIntervalStart = intervalContigStartPos; } intervalContigEndPos = seqDB.seqStartPos[seqDBIndex+1] - 1; if (intervalContigEndPos < matchIntervalEnd) { matchIntervalEnd = intervalContigEndPos; } alignment->tName = seqDB.GetSpaceDelimitedName(seqDBIndex); alignment->tLength = intervalContigEndPos - intervalContigStartPos; // // When there are multiple sequences in the database, store the // index of this sequence. This lets one compare the contigs // that reads are mapped to, for instance. 
// alignment->tIndex = seqDBIndex; } else { alignment->tLength = genome.length; alignment->tName = genome.GetName(); intervalContigStartPos = 0; intervalContigEndPos = genome.length; // // When there are multiple sequences in the database, store the // index of this sequence. This lets one compare the contigs // that reads are mapped to, for instance. // } alignment->qName = read.title; // // Look to see if a read overhangs the beginning of a contig. // if (params.verbosity > 2) { cout << "Check for prefix/suffix overlap on interval: " << (*intvIt).qStart << " ?> " << (*intvIt).start - intervalContigStartPos < (*intvIt).start - intervalContigStartPos) { readOverlapsContigStart = true; startOverlappedContigIndex = seqDBIndex; } // // Look to see if the read overhangs the end of a contig. // if (params.verbosity > 2) { cout << "Check for suffix/prefix overlap on interval, read overhang: " << read.length - (*intvIt).qEnd << " ?> " << matchIntervalEnd - (*intvIt).end < matchIntervalEnd - (*intvIt).end) { if (params.verbosity > 2) { cout << "read overlaps genome end." << endl; } readOverlapsContigEnd = true; endOverlappedContigIndex = seqDBIndex; } int alignScore; alignScore = 0; alignment->tAlignedSeqPos = matchIntervalStart; alignment->tAlignedSeqLength = matchIntervalEnd - matchIntervalStart; if ((*intvIt).GetStrandIndex() == Forward) { alignment->tAlignedSeq.Copy(genome, alignment->tAlignedSeqPos, alignment->tAlignedSeqLength); alignment->tStrand = Forward; } else { DNALength rcAlignedSeqPos = genome.MakeRCCoordinate(alignment->tAlignedSeqPos + alignment->tAlignedSeqLength - 1); genome.CopyAsRC(alignment->tAlignedSeq, rcAlignedSeqPos, alignment->tAlignedSeqLength); // Map forward coordinates into reverse complement. 
intervalContigStartPos = genome.MakeRCCoordinate(intervalContigStartPos) + 1; intervalContigEndPos = genome.MakeRCCoordinate(intervalContigEndPos - 1); swap(intervalContigStartPos, intervalContigEndPos); alignment->tAlignedSeqPos = rcAlignedSeqPos; alignment->tStrand = Reverse; } // Configure the part of the query that is aligned. The entire // query should always be aligned. alignment->qAlignedSeqPos = 0; alignment->qAlignedSeq.ReferenceSubstring(read); alignment->qAlignedSeqLength = alignment->qAlignedSeq.length; alignment->qLength = read.length; alignment->qStrand = 0; if (params.verbosity > 1) { cout << "aligning read " << endl; static_cast(&(alignment->qAlignedSeq))->PrintSeq(cout); cout << endl << "aligning reference" << endl; static_cast(&(alignment->tAlignedSeq))->PrintSeq(cout); cout << endl; } // // The type of alignment that is performed depends on the mode // blasr is running in. If it is running in normal mode, local // aligment is performed and guided by SDP alignment. When // running in overlap mode, the alignments are forced to the ends // of reads. // int intervalSize = 0; int m; // // Check to see if the matches to the genome are sufficiently // dense to allow them to be used instead of having to redo // sdp alignment. // // First count how much of the read matches the genome exactly. for (m = 0; m < intvIt->matches.size(); m++) { intervalSize += intvIt->matches[m].l;} int subreadLength = forrev[(*intvIt).GetStrandIndex()]->SubreadEnd() - forrev[(*intvIt).GetStrandIndex()]->SubreadStart(); if ((1.0*intervalSize) / subreadLength < params.sdpBypassThreshold and !params.emulateNucmer) { // // Not enough of the read maps to the genome, need to use // sdp alignment to define the regions of the read that map. // if (params.refineBetweenAnchorsOnly) { // // Run SDP alignment only between the genomic anchors, // including the genomic anchors as part of the alignment. 
// int m; vector *matches; vector rcMatches; Alignment anchorsOnly; DNASequence tAlignedSeq; FASTQSequence qAlignedSeq; // // The strand bookkeeping is a bit confusing, so hopefully // this will set things straight. // // If the alignment is forward strand, the coordinates of the // blocks are relative to the forward read, starting at 0, not // the subread start. // If the alignment is reverse strand, the coordinates of the // blocks are relative to the reverse strand, starting at the // position of the subread on the reverse strand. // // The coordinates of the blocks in the genome are always // relative to the forward strand on the genome, starting at // 0. // // // The first step to refining between anchors only is to make // the anchors relative to the tAlignedSeq. matches = (vector*) &(*intvIt).matches; tAlignedSeq = alignment->tAlignedSeq; qAlignedSeq = alignment->qAlignedSeq; if (alignment->tStrand == 0) { for (m = 0; m < matches->size(); m++) { (*matches)[m].t -= alignment->tAlignedSeqPos; (*matches)[m].q -= alignment->qAlignedSeqPos; } } else { // // Flip the entire alignment if it is on the reverse strand. DNALength rcAlignedSeqPos = genome.MakeRCCoordinate(alignment->tAlignedSeqPos + alignment->tAlignedSeqLength - 1); for (m = 0; m < matches->size(); m++) { (*matches)[m].t -= rcAlignedSeqPos; (*matches)[m].q -= alignment->qAlignedSeqPos; } alignment->tAlignedSeq.CopyAsRC(tAlignedSeq); rcMatches.resize((*intvIt).matches.size()); // // Make the reverse complement of the match list. // // 1. Reverse complement the coordinates. 
for (m = 0; m < (*intvIt).matches.size(); m++) { int revCompIndex = rcMatches.size() - m - 1; rcMatches[revCompIndex].q = read.MakeRCCoordinate((*intvIt).matches[m].q + (*intvIt).matches[m].l - 1); rcMatches[revCompIndex].t = tAlignedSeq.MakeRCCoordinate((*intvIt).matches[m].t + (*intvIt).matches[m].l - 1); rcMatches[revCompIndex].l = (*intvIt).matches[m].l; } matches = &rcMatches; } /* Uncomment to get a dot plot ofstream matchFile; matchFile.open("matches.txt"); matchFile << "q t l " << endl; for (m = 0; matches->size() > 0 and m < matches->size() - 1; m++) { matchFile << (*matches)[m].q << " " << (*matches)[m].t << " " << (*matches)[m].l << endl; } */ DNASequence tSubSeq; FASTQSequence qSubSeq; for (m = 0; matches->size() > 0 and m < matches->size() - 1; m++) { Block block; block.qPos = (*matches)[m].q; block.tPos = (*matches)[m].t; block.length = (*matches)[m].l; // // Find the lengths of the gaps between anchors. // int tGap, qGap; tGap = (*matches)[m+1].t - ((*matches)[m].t + (*matches)[m].l); qGap = (*matches)[m+1].q - ((*matches)[m].q + (*matches)[m].l); float gapRatio = (1.0*tGap)/qGap; if (tGap > 0 and qGap > 0) { DNALength tPos, qPos; tPos = block.tPos + block.length; qPos = block.qPos + block.length; tSubSeq.ReferenceSubstring(tAlignedSeq, tPos, tGap); qSubSeq.ReferenceSubstring(alignment->qAlignedSeq, qPos, qGap); Alignment alignmentInGap; int alignScore; /* The following code is experimental code for trying to do something like affine gap alignment in long gaps. It would eventually be used in cDNA alignment to align between exons, but for now is being tested here by using it to align when there is a big gap between anchors. */ if (params.separateGaps == true and qSubSeq.length > 0 and tSubSeq.length > 0 and ( (1.0*qSubSeq.length)/tSubSeq.length < 0.25 )) { alignScore = OneGapAlign(qSubSeq, tSubSeq, distScoreFn, mappingBuffers, alignmentInGap); } else { /* This is the 'normal/default' way to align between gaps. It is more well tested than OneGapAlign. 
*/ alignScore = SDPAlign(qSubSeq, tSubSeq, distScoreFn, params.sdpTupleSize, params.sdpIns, params.sdpDel, params.indelRate*2, alignmentInGap, mappingBuffers, Global, params.detailedSDPAlignment, params.extendFrontAlignment, params.recurseOver, params.fastSDP); } // // Now, splice the fragment alignment into the current // alignment. // if (alignmentInGap.blocks.size() > 0) { int b; // // Configure this block to be relative to the beginning // of the aligned substring. // for (b = 0; b < alignmentInGap.size(); b++) { alignmentInGap.blocks[b].tPos += tPos + alignmentInGap.tPos; alignmentInGap.blocks[b].qPos += qPos + alignmentInGap.qPos; assert(alignmentInGap.blocks[b].tPos < alignment->tAlignedSeq.length); assert(alignmentInGap.blocks[b].qPos < alignment->qAlignedSeq.length); } } // Add the original block alignment->blocks.push_back(block); anchorsOnly.blocks.push_back(block); // Add the blocks for the refined alignment alignment->blocks.insert(alignment->blocks.end(), alignmentInGap.blocks.begin(), alignmentInGap.blocks.end()); } } // Add the last block m = (*matches).size() - 1; Block block; block.qPos = (*matches)[m].q; block.tPos = (*matches)[m].t; assert(block.tPos <= alignment->tAlignedSeq.length); assert(block.qPos <= alignment->qAlignedSeq.length); block.length = (*matches)[m].l; alignment->blocks.push_back(block); anchorsOnly.blocks.push_back(block); // // By convention, blocks start at 0, and the // alignment->tPos,qPos give the start of the alignment. // Modify the block positions so that they are offset by 0. 
alignment->tPos = alignment->blocks[0].tPos; alignment->qPos = alignment->blocks[0].qPos; int b; int blocksSize = alignment->blocks.size(); for (b = 0; b < blocksSize ; b++) { assert(alignment->tPos <= alignment->blocks[b].tPos); assert(alignment->qPos <= alignment->blocks[b].qPos); alignment->blocks[b].tPos -= alignment->tPos; alignment->blocks[b].qPos -= alignment->qPos; } for (b = 0; b < anchorsOnly.blocks.size(); b++) { anchorsOnly.blocks[b].tPos -= alignment->tPos; anchorsOnly.blocks[b].qPos -= alignment->qPos; } anchorsOnly.tPos = alignment->tPos; anchorsOnly.qPos = alignment->qPos; ComputeAlignmentStats(*alignment, alignment->qAlignedSeq.seq, alignment->tAlignedSeq.seq, distScoreFn); tAlignedSeq.Free(); qAlignedSeq.Free(); tSubSeq.Free(); qSubSeq.Free(); } else { alignScore = SDPAlign(alignment->qAlignedSeq, alignment->tAlignedSeq, distScoreFn, sdpTupleSize, params.sdpIns, params.sdpDel, params.indelRate*3, *alignment, mappingBuffers, Local, params.detailedSDPAlignment, params.extendFrontAlignment, params.recurseOver, params.fastSDP); ComputeAlignmentStats(*alignment, alignment->qAlignedSeq.seq, alignment->tAlignedSeq.seq, distScoreFn); } } else { // // The anchors used to anchor the sequence are sufficient to extend the alignment. // int m; for (m = 0; m < (*intvIt).matches.size(); m++ ){ Block block; block.qPos = (*intvIt).matches[m].q - alignment->qAlignedSeqPos; block.tPos = (*intvIt).matches[m].t - alignment->tAlignedSeqPos; block.length = (*intvIt).matches[m].l; alignment->blocks.push_back(block); } } // // The anchors/sdp alignments may leave portions of the read // unaligned at the beginning and end. If the parameters // specify extending alignments, try and align extra bases at // the beginning and end of alignments. if (params.extendAlignments) { // // Modify the alignment so that the start and end of the // alignment strings are at the alignment boundaries. 
// // Since the query sequence is pointing at a subsequence of the // read (and is always in the forward direction), just reference // a new portion of the read. alignment->qAlignedSeqPos = alignment->qAlignedSeqPos + alignment->qPos; alignment->qAlignedSeqLength = alignment->QEnd(); alignment->qAlignedSeq.ReferenceSubstring(read, alignment->qAlignedSeqPos, alignment->qAlignedSeqLength ); alignment->qPos = 0; // // Since the target sequence may be on the forward or reverse // strand, a copy of the subsequence is made, and the original // sequence free'd. // DNASequence tSubseq; alignment->tAlignedSeqPos = alignment->tAlignedSeqPos + alignment->tPos; alignment->tAlignedSeqLength = alignment->TEnd(); tSubseq.Copy(alignment->tAlignedSeq, alignment->tPos, alignment->tAlignedSeqLength); alignment->tPos = 0; alignment->tAlignedSeq.Free(); alignment->tAlignedSeq.TakeOwnership(tSubseq); DNALength maximumExtendLength = 500; if (alignment->blocks.size() > 0 ) { int lastAlignedBlock = alignment->blocks.size() - 1; DNALength lastAlignedQPos = alignment->blocks[lastAlignedBlock].QEnd() + alignment->qPos + alignment->qAlignedSeqPos; DNALength lastAlignedTPos = alignment->blocks[lastAlignedBlock].TEnd() + alignment->tPos + alignment->tAlignedSeqPos; T_AlignmentCandidate extendedAlignmentForward, extendedAlignmentReverse; int forwardScore, reverseScore; SMRTSequence readSuffix; DNALength readSuffixLength; DNASequence genomeSuffix; DNALength genomeSuffixLength; SMRTSequence readPrefix; DNALength readPrefixLength; DNASequence genomePrefix; DNALength genomePrefixLength; // // Align the entire end of the read if it is short enough. 
// readSuffixLength = min(read.length - lastAlignedQPos, maximumExtendLength); if (readSuffixLength > 0) { readSuffix.ReferenceSubstring(read, lastAlignedQPos, readSuffixLength); } else { readSuffix.length = 0; } // // Align The entire end of the genome up to the maximum extend length; // genomeSuffixLength = min(intervalContigEndPos - lastAlignedTPos, maximumExtendLength); if (genomeSuffixLength > 0) { if (alignment->tStrand == Forward) { genomeSuffix.Copy(genome, lastAlignedTPos, genomeSuffixLength); } else { static_cast(&genome)->CopyAsRC(genomeSuffix, lastAlignedTPos, genomeSuffixLength); } } else { genomeSuffix.length = 0; } forwardScore = 0; if (readSuffix.length > 0 and genomeSuffix.length > 0) { forwardScore = ExtendAlignmentForward(readSuffix, 0, genomeSuffix, 0, params.extendBandSize, // Reuse buffers to speed up alignment mappingBuffers.scoreMat, mappingBuffers.pathMat, // Do the alignment in the forward direction. extendedAlignmentForward, distScoreFn, 1, // don't bother attempting // to extend the alignment // if one of the sequences // is less than 1 base long params.maxExtendDropoff); } if ( forwardScore < 0 ) { // // The extended alignment considers the whole genome, but // should be modified to be starting at the end of where // the original alignment left off. 
// if (params.verbosity > 0) { cout << "forward extended an alignment of score " << alignment->score << " with score " << forwardScore << " by " << extendedAlignmentForward.blocks.size() << " blocks and length " << extendedAlignmentForward.blocks[extendedAlignmentForward.blocks.size()-1].qPos << endl; } extendedAlignmentForward.tAlignedSeqPos = lastAlignedTPos; extendedAlignmentForward.qAlignedSeqPos = lastAlignedQPos; genomeSuffix.length = extendedAlignmentForward.tPos + extendedAlignmentForward.TEnd(); alignment->tAlignedSeq.Append(genomeSuffix); alignment->qAlignedSeq.length += extendedAlignmentForward.qPos + extendedAlignmentForward.QEnd(); assert(alignment->qAlignedSeq.length <= read.length); alignment->AppendAlignment(extendedAlignmentForward); } DNALength firstAlignedQPos = alignment->qPos + alignment->qAlignedSeqPos; DNALength firstAlignedTPos = alignment->tPos + alignment->tAlignedSeqPos; readPrefixLength = min(firstAlignedQPos, maximumExtendLength); if (readPrefixLength > 0) { readPrefix.ReferenceSubstring(read, firstAlignedQPos-readPrefixLength, readPrefixLength); } else { readPrefix.length = 0; } genomePrefixLength = min(firstAlignedTPos - intervalContigStartPos, maximumExtendLength); if (genomePrefixLength > 0) { if (alignment->tStrand == 0) { genomePrefix.Copy(genome, firstAlignedTPos - genomePrefixLength, genomePrefixLength); } else { static_cast(&genome)->MakeRC(genomePrefix, firstAlignedTPos - genomePrefixLength, genomePrefixLength); } } reverseScore = 0; if (readPrefix.length > 0 and genomePrefix.length > 0) { reverseScore = ExtendAlignmentReverse(readPrefix, readPrefix.length-1, genomePrefix, genomePrefixLength - 1, params.extendBandSize, //k mappingBuffers.scoreMat, mappingBuffers.pathMat, extendedAlignmentReverse, distScoreFn, 1, // don't bother attempting // to extend the alignment // if one of the sequences // is less than 1 base long params.maxExtendDropoff); } if (reverseScore < 0 ) { // // Make alignment->tPos relative to the beginning of 
the // extended alignment so that when it is appended, the // coordinates match correctly. if (params.verbosity > 0) { cout << "reverse extended an alignment of score " << alignment->score << " with score " << reverseScore << " by " << extendedAlignmentReverse.blocks.size() << " blocks and length " << extendedAlignmentReverse.blocks[extendedAlignmentReverse.blocks.size()-1].qPos << endl; } extendedAlignmentReverse.tAlignedSeqPos = firstAlignedTPos - genomePrefixLength; extendedAlignmentReverse.qAlignedSeqPos = firstAlignedQPos - readPrefixLength; extendedAlignmentReverse.AppendAlignment(*alignment); genomePrefix.Append(alignment->tAlignedSeq, genomePrefix.length - alignment->tPos); alignment->tAlignedSeq.Free(); alignment->tAlignedSeq.TakeOwnership(genomePrefix); alignment->blocks = extendedAlignmentReverse.blocks; alignment->tAlignedSeqPos = extendedAlignmentReverse.tAlignedSeqPos; alignment->tPos = extendedAlignmentReverse.tPos; alignment->qAlignedSeqPos = extendedAlignmentReverse.qAlignedSeqPos; alignment->qAlignedSeq.length = readPrefix.length + alignment->qAlignedSeq.length; alignment->qPos = extendedAlignmentReverse.qPos; alignment->qAlignedSeq.seq = readPrefix.seq; // // Make sure the two ways of accounting for aligned sequence // length are in sync. This needs to go. 
// if (alignment->blocks.size() > 0) { int lastBlock = alignment->blocks.size() - 1; alignment->qAlignedSeqLength = alignment->qAlignedSeq.length; alignment->tAlignedSeqLength = alignment->tAlignedSeq.length; } else { alignment->qAlignedSeqLength = alignment->qAlignedSeq.length = 0; alignment->tAlignedSeqLength = alignment->tAlignedSeq.length = 0; } } // end of if (reverseScore < 0 ) readSuffix.Free(); readPrefix.Free(); genomePrefix.Free(); genomeSuffix.Free(); } tSubseq.Free(); } if (params.verbosity > 0) { cout << "interval align score: " << alignScore << endl; StickPrintAlignment(*alignment, (DNASequence&) alignment->qAlignedSeq, (DNASequence&) alignment->tAlignedSeq, cout, 0, alignment->tAlignedSeqPos); } ComputeAlignmentStats(*alignment, alignment->qAlignedSeq.seq, alignment->tAlignedSeq.seq, distScoreFn2); //SMRTDistanceMatrix, ins, del ); intvIt++; } while (intvIt != weightedIntervals.end()); } template void PairwiseLocalAlign(T_Sequence &qSeq, T_RefSequence &tSeq, int k, MappingParameters ¶ms, T_AlignmentCandidate &alignment, MappingBuffers &mappingBuffers, AlignmentType alignType) { // // Perform a pairwise alignment between qSeq and tSeq, but choose // the pairwise alignment method based on the parameters. The // options for pairwise alignment are: // - Affine KBanded alignment: usually used for sequences with no // quality information. // - KBanded alignment: For sequences with quality information. // Gaps are scored with quality values. 
// QualityValueScoreFunction scoreFn; scoreFn.del = params.indel; scoreFn.ins = params.indel; DistanceMatrixScoreFunction distScoreFn2( SMRTDistanceMatrix, params.indel, params.indel); IDSScoreFunction idsScoreFn; idsScoreFn.ins = params.insertion; idsScoreFn.del = params.deletion; idsScoreFn.substitutionPrior = params.substitutionPrior; idsScoreFn.globalDeletionPrior = params.globalDeletionPrior; idsScoreFn.InitializeScoreMatrix(SMRTDistanceMatrix); int kbandScore; int qvAwareScore; if (params.ignoreQualities || qSeq.qual.Empty() || !ReadHasMeaningfulQualityValues(qSeq) ) { kbandScore = AffineKBandAlign(qSeq, tSeq, SMRTDistanceMatrix, params.indel+2, params.indel - 3, // homopolymer insertion open and extend params.indel+2, params.indel - 1, // any insertion open and extend params.indel, // deletion k*1.2, mappingBuffers.scoreMat, mappingBuffers.pathMat, mappingBuffers.hpInsScoreMat, mappingBuffers.hpInsPathMat, mappingBuffers.insScoreMat, mappingBuffers.insPathMat, alignment, Global); alignment.score = kbandScore; if (params.verbosity >= 2) { cout << "align score: " << kbandScore << endl; } } else { if (qSeq.insertionQV.Empty() == false) { qvAwareScore = KBandAlign(qSeq, tSeq, SMRTDistanceMatrix, params.indel+2, // ins params.indel+2, // del k, mappingBuffers.scoreMat, mappingBuffers.pathMat, alignment, idsScoreFn, alignType); if (params.verbosity >= 2) { cout << "ids score fn score: " << qvAwareScore << endl; } } else { qvAwareScore = KBandAlign(qSeq, tSeq, SMRTDistanceMatrix, params.indel+2, // ins params.indel+2, // del k, mappingBuffers.scoreMat, mappingBuffers.pathMat, alignment, scoreFn, alignType); if (params.verbosity >= 2) { cout << "qv score fn score: " << qvAwareScore << endl; } } alignment.sumQVScore = qvAwareScore; alignment.score = qvAwareScore; alignment.probScore = 0; } // Compute stats and assign a default alignment score using an edit distance. 
ComputeAlignmentStats(alignment, qSeq.seq, tSeq.seq, distScoreFn2); if (params.scoreType == 1) { alignment.score = alignment.sumQVScore; } } // Extend target aligned sequence of the input alignement to both ends // by flankSize bases. Update alignment->tAlignedSeqPos, // alignment->tAlignedSeqLength and alignment->tAlignedSeq. void FlankTAlignedSeq(T_AlignmentCandidate * alignment, SequenceIndexDatabase &seqdb, DNASequence & genome, int flankSize) { assert(alignment != NULL and alignment->tIsSubstring); UInt forwardTPos, newTAlignedSeqPos, newTAlignedSeqLen; // New aligned start position relative to this chromosome, with // the same direction as alignment->tStrand. newTAlignedSeqPos = UInt((alignment->tAlignedSeqPos > UInt(flankSize))? (alignment->tAlignedSeqPos - flankSize): 0); newTAlignedSeqLen = min(alignment->tAlignedSeqPos + alignment->tAlignedSeqLength + flankSize, alignment->tLength) - newTAlignedSeqPos; if (alignment->tStrand ==0) { forwardTPos = newTAlignedSeqPos; } else { forwardTPos = alignment->tLength - newTAlignedSeqPos - 1; } // Find where this chromosome is in the genome. int seqIndex = seqdb.GetIndexOfSeqName(alignment->tName); assert(seqIndex != -1); UInt newGenomePos = seqdb.ChromosomePositionToGenome(seqIndex, forwardTPos); if (alignment->tIsSubstring == false) { alignment->tAlignedSeq.Free(); } alignment->tAlignedSeqPos = newTAlignedSeqPos; alignment->tAlignedSeqLength = newTAlignedSeqLen; if (alignment->tStrand == 0) { alignment->tAlignedSeq.ReferenceSubstring(genome, newGenomePos, newTAlignedSeqLen); } else { // Copy and then reverse complement. genome.MakeRC(alignment->tAlignedSeq, newGenomePos + 1 - alignment->tAlignedSeqLength, alignment->tAlignedSeqLength); alignment->tIsSubstring = false; } } // Align a subread of a SMRT sequence to target sequence of an alignment. // Input: // subread - a subread of a SMRT sequence. // unrolledRead - the full SMRT sequence. // alignment - an alignment. 
// passDirection - whether or not the subread has the // same direction as query of the alignment. // 0 = true, 1 = false. // subreadInterval - [start, end) interval of the subread in the // SMRT read. // subreadIndex - index of the subread in allReadAlignments. // params - mapping paramters. // Output: // allReadAlignments - where the sequence and alignments of the // subread are saved. // threadOut - an out stream for debugging the current thread. void AlignSubreadToAlignmentTarget(ReadAlignments & allReadAlignments, SMRTSequence & subread, SMRTSequence & unrolledRead, T_AlignmentCandidate * alignment, int passDirection, ReadInterval & subreadInterval, int subreadIndex, MappingParameters & params, MappingBuffers & mappingBuffers, ostream & threadOut) { assert(passDirection == 0 or passDirection == 1); // // Determine where in the genome the subread has mapped. // DNASequence alignedForwardRefSequence, alignedReverseRefSequence; if (alignment->tStrand == 0) { // This needs to be changed -- copy copies RHS into LHS, // CopyAsRC copies LHS into RHS alignedForwardRefSequence.Copy(alignment->tAlignedSeq); alignment->tAlignedSeq.CopyAsRC(alignedReverseRefSequence); } else { alignment->tAlignedSeq.CopyAsRC(alignedForwardRefSequence); alignedReverseRefSequence.Copy(alignment->tAlignedSeq); } IDSScoreFunction idsScoreFn; idsScoreFn.ins = params.insertion; idsScoreFn.del = params.deletion; idsScoreFn.InitializeScoreMatrix(SMRTDistanceMatrix); idsScoreFn.globalDeletionPrior = params.globalDeletionPrior; idsScoreFn.substitutionPrior = params.substitutionPrior; DistanceMatrixScoreFunction distScoreFn2( SMRTDistanceMatrix, params.indel, params.indel); // // Determine the strand to align the subread to. // T_AlignmentCandidate exploded; bool sameAlignmentPassDirection = (alignment->tStrand == passDirection); bool computeProbIsFalse = false; DNASequence & alignedRefSequence = (sameAlignmentPassDirection? 
alignedForwardRefSequence:alignedReverseRefSequence); // // In the original code, parameters: bandSize=10, alignType=Global, // sdpTupleSize=4 (instead of 12, Local and 6) were used when // alignment & pass have different directions. // int explodedScore = GuidedAlign(subread, alignedRefSequence, idsScoreFn, 12, params.sdpIns, params.sdpDel, params.indelRate, mappingBuffers, exploded, Local, computeProbIsFalse, 6); if (params.verbosity >= 3) { threadOut << "zmw " << unrolledRead.zmwData.holeNumber << ", subreadIndex " << subreadIndex << ", passDirection " << passDirection << ", subreadInterval [" << subreadInterval.start << ", " << subreadInterval.end << ")" << endl << "StickPrintAlignment subread-reference alignment which has" << " the " << (sameAlignmentPassDirection?"same":"different") << " direction as the ccs-reference (or the " << "longestSubread-reference) alignment. " << endl << "subread: " << endl; static_cast(&subread)->PrintSeq(threadOut); threadOut << endl; threadOut << "alignedRefSeq: " << endl; static_cast(&alignedRefSequence)->PrintSeq(threadOut); StickPrintAlignment(exploded, (DNASequence&) subread, (DNASequence&) alignedRefSequence, threadOut, exploded.qAlignedSeqPos, exploded.tAlignedSeqPos); } if (exploded.blocks.size() > 0) { DistanceMatrixScoreFunction distScoreFn( SMRTDistanceMatrix, params.indel, params.indel); ComputeAlignmentStats(exploded, subread.seq, alignedRefSequence.seq, distScoreFn2); //SMRTDistanceMatrix, params.indel, params.indel); if (exploded.score <= params.maxScore) { // // The coordinates of the alignment should be // relative to the reference sequence (the specified chromosome, // not the whole genome). // exploded.qStrand = 0; exploded.tStrand = sameAlignmentPassDirection?0:1; exploded.qLength = unrolledRead.length; exploded.tLength = alignment->tLength; exploded.tAlignedSeq.Copy(alignedRefSequence); exploded.tAlignedSeqPos = (passDirection == 0)? 
(alignment->tAlignedSeqPos): (exploded.tLength - alignment->tAlignedSeqPos - alignment->tAlignedSeqLength); exploded.tAlignedSeqLength = alignment->tAlignedSeqLength; exploded.qAlignedSeq.ReferenceSubstring(subread); exploded.qAlignedSeqPos = subreadInterval.start; exploded.qAlignedSeqLength = subreadInterval.end - subreadInterval.start; exploded.mapQV = alignment->mapQV; exploded.tName = alignment->tName; exploded.tIndex = alignment->tIndex; stringstream namestrm; namestrm << "/" << subreadInterval.start << "_" << subreadInterval.end; exploded.qName = string(unrolledRead.title) + namestrm.str(); // // Don't call AssignRefContigLocation as the coordinates // of the alignment is already relative to the chromosome coordiantes. // // Save this alignment for printing later. // T_AlignmentCandidate *alignmentPtr = new T_AlignmentCandidate; *alignmentPtr = exploded; allReadAlignments.AddAlignmentForSeq(subreadIndex, alignmentPtr); } // End of exploded score <= maxScore. if (params.verbosity >= 3) { threadOut << "exploded score: " << exploded.score << endl << "exploded alignment: "<< endl; exploded.Print(threadOut); threadOut << endl; } } // End of exploded.blocks.size() > 0. } #endif blasr-8e668beae0dda1da6914586fb458182c6c3c7482/include/BlasrHeaders.h000066400000000000000000000130411260737656700240120ustar00rootroot00000000000000// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. // // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted (subject to the limitations in the // disclaimer below) provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. 
// // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // // * Neither the name of Pacific Biosciences nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE // GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC // BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE // DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF // USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND // ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT // OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF // SUCH DAMAGE. 
#ifndef _BLASR_HEADERS_H_ #define _BLASR_HEADERS_H_ #ifdef __linux__ # include #endif #include #include #include #include #include #include #include #include #include #include #define MAX_PHRED_SCORE 254 #define MAPQV_END_ALIGN_WIGGLE 5 using namespace std; #include "libconfig.h" #ifdef USE_PBBAM #include #endif #include "CCSSequence.hpp" #include "SMRTSequence.hpp" #include "FASTASequence.hpp" #include "FASTAReader.hpp" #include "SeqUtils.hpp" #include "defs.h" #include "utils.hpp" #include "tuples/DNATuple.hpp" #include "tuples/HashedTupleList.hpp" #include "algorithms/compare/CompareStrings.hpp" #include "algorithms/alignment/AffineKBandAlign.hpp" #include "algorithms/alignment/GuidedAlign.hpp" #include "algorithms/alignment/AffineGuidedAlign.hpp" #include "algorithms/alignment/FullQVAlign.hpp" #include "algorithms/alignment/ExtendAlign.hpp" #include "algorithms/alignment/OneGapAlignment.hpp" #include "algorithms/alignment/AlignmentUtils.hpp" #include "algorithms/alignment/QualityValueScoreFunction.hpp" #include "algorithms/alignment/IDSScoreFunction.hpp" #include "algorithms/alignment/DistanceMatrixScoreFunction.hpp" #include "algorithms/alignment/StringToScoreMatrix.hpp" #include "algorithms/alignment/AlignmentFormats.hpp" #include "algorithms/anchoring/LISPValue.hpp" #include "algorithms/anchoring/LISPValueWeightor.hpp" #include "algorithms/anchoring/LISSizeWeightor.hpp" #include "algorithms/anchoring/LISQValueWeightor.hpp" #include "algorithms/anchoring/FindMaxInterval.hpp" #include "algorithms/anchoring/MapBySuffixArray.hpp" #include "datastructures/anchoring/ClusterList.hpp" #include "algorithms/anchoring/ClusterProbability.hpp" #include "algorithms/anchoring/BWTSearch.hpp" #include "metagenome/SequenceIndexDatabase.hpp" #include "metagenome/TitleTable.hpp" #include "suffixarray/SharedSuffixArray.hpp" #include "suffixarray/SuffixArrayTypes.hpp" #include "tuples/TupleCountTable.hpp" #include "datastructures/anchoring/WeightedInterval.hpp" #include 
"datastructures/anchoring/AnchorParameters.hpp" #include "datastructures/alignment/AlignmentCandidate.hpp" #include "datastructures/alignment/AlignmentContext.hpp" #include "MappingMetrics.hpp" #include "reads/ReadInterval.hpp" #include "utils/FileOfFileNames.hpp" #include "utils/RegionUtils.hpp" #include "utils/TimeUtils.hpp" #include "utils/SMRTTitle.hpp" #include "qvs/QualityTransform.hpp" #include "files/ReaderAgglomerate.hpp" #include "files/CCSIterator.hpp" #include "files/FragmentCCSIterator.hpp" #include "HDFRegionTableReader.hpp" #include "bwt/BWT.hpp" #include "PackedDNASequence.hpp" #include "CommandLineParser.hpp" #include "qvs/QualityValue.hpp" #include "statistics/VarianceAccumulator.hpp" #include "statistics/pdfs.hpp" #include "statistics/cdfs.hpp" #include "statistics/StatUtils.hpp" #include "statistics/LookupAnchorDistribution.hpp" #include "format/StickAlignmentPrinter.hpp" #include "format/SAMPrinter.hpp" #include "format/XMLPrinter.hpp" #include "format/CompareSequencesPrinter.hpp" #include "format/VulgarPrinter.hpp" #include "format/IntervalPrinter.hpp" #include "format/SummaryPrinter.hpp" #include "format/SAMHeaderPrinter.hpp" #include "format/BAMPrinter.hpp" #include "MappingIPC.h" #include "MappingSemaphores.h" #include "MappingBuffers.hpp" #include "ReadAlignments.hpp" typedef SMRTSequence T_Sequence; typedef FASTASequence T_GenomeSequence; typedef DNASuffixArray T_SuffixArray; typedef DNATuple T_Tuple; typedef LISPValueWeightor > PValueWeightor; typedef LISSMatchFrequencyPValueWeightor > MultiplicityPValueWeightor; typedef MappingData MappingIPC; #endif blasr-8e668beae0dda1da6914586fb458182c6c3c7482/include/BlasrMiscs.hpp000066400000000000000000000103161260737656700240570ustar00rootroot00000000000000// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. // // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted (subject to the limitations in the // disclaimer below) provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // // * Neither the name of Pacific Biosciences nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE // GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC // BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE // DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF // USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND // ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT // OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF // SUCH DAMAGE. 
// Author: Mark Chaisson #ifndef _BLASR_MISCS_HPP_ #define _BLASR_MISCS_HPP_ #include "BlasrHeaders.h" //-------------------------Fetch Reads----------------------------// template bool GetNextReadThroughSemaphore(ReaderAgglomerate &reader, MappingParameters ¶ms, T_Sequence &read, string & readGroupId, int & associatedRandInt, MappingSemaphores & semaphores); //---------------------MAKE & CHECK READS-------------------------// //FIXME: move to SMRTSequence bool ReadHasMeaningfulQualityValues(FASTQSequence &sequence); //FIXME: Move to SMRTSequence // Given a SMRT sequence and a subread interval, make the subread. // Input: // smrtRead - a SMRT sequence // subreadInterval - a subread interval // params - mapping parameters // Output: // subreadSequence - the constructed subread void MakeSubreadOfInterval(SMRTSequence & subreadSequence, SMRTSequence & smrtRead, ReadInterval & subreadInterval, MappingParameters & params); //FIXME: Move to SMRTSequence // Given a SMRT sequence and one of its subreads, make the // reverse complement of the subread in the coordinate of the // reverse complement sequence of the SMRT sequence. // Input: // smrtRead - a SMRT read // subreadSequence - a subread of smrtRead // Output: // subreadSequenceRC - the reverse complement of the subread // in the coordinate of the reverse // complement of the SMRT read. void MakeSubreadRC(SMRTSequence & subreadSequenceRC, SMRTSequence & subreadSequence, SMRTSequence & smrtRead); // Make a virtual SMRTSequence (polymerase reads) given all subreads. // NO QVs will be copied at this point. 
void MakeVirtualRead(SMRTSequence & smrtRead, const vector & subreads); // Construct subreads invervals from subreads void MakeSubreadIntervals(vector & subreads, vector & subreadIntervals); // Get index of median length interval int GetIndexOfMedian(const vector & subreadIntervals); //-------------------------MISC-----------------------------------// int CountZero(unsigned char *ptr, int length); #include "BlasrMiscsImpl.hpp" #endif blasr-8e668beae0dda1da6914586fb458182c6c3c7482/include/BlasrMiscsImpl.hpp000066400000000000000000000174041260737656700247060ustar00rootroot00000000000000// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. // // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted (subject to the limitations in the // disclaimer below) provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // // * Neither the name of Pacific Biosciences nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE // GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC // BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE // DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF // USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND // ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT // OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF // SUCH DAMAGE. // Author: Mark Chaisson #ifndef _BLASR_MISCS_IMPL_HPP_ #define _BLASR_MISCS_IMPL_HPP_ #include "utils/SMRTTitle.hpp" template bool GetNextReadThroughSemaphore(ReaderAgglomerate &reader, MappingParameters ¶ms, T_Sequence &read, string & readGroupId, int & associatedRandInt, MappingSemaphores & semaphores) { // Wait on a semaphore if (params.nProc > 1) { #ifdef __APPLE__ sem_wait(semaphores.reader); #else sem_wait(&semaphores.reader); #endif } bool returnValue = true; // // CCS Reads are read differently from other reads. Do static casting here // of this. // if (reader.GetNext(read, associatedRandInt) == 0) { returnValue = false; } // // Set the read group id before releasing the semaphore, since other // threads may change the reader object to a new read group before // sending this alignment out to printing. 
readGroupId = reader.readGroupId; if (params.nProc > 1) { #ifdef __APPLE__ sem_post(semaphores.reader); #else sem_post(&semaphores.reader); #endif } return returnValue; } bool ReadHasMeaningfulQualityValues(FASTQSequence &sequence) { if (sequence.qual.Empty() == true) { return 0; } else { int numZero=0, numNonZero=0; if (sequence.qual.data == NULL) { return false; } numZero = CountZero(sequence.qual.data, sequence.length); numNonZero = sequence.length - numZero; int subNumZero = 0, subNonZero = 0; if (sequence.substitutionQV.data == NULL) { return false; } subNumZero = CountZero(sequence.substitutionQV.data, sequence.length); subNonZero = sequence.length - subNumZero; if (numZero < 0.5*numNonZero and subNumZero < 0.5 * subNonZero) { return true; } else { return false; } } } // Given a SMRT sequence and a subread interval, make the subread. // Input: // smrtRead - a SMRT sequence // subreadInterval - a subread interval // params - mapping parameters // Output: // subreadSequence - the constructed subread void MakeSubreadOfInterval(SMRTSequence & subreadSequence, SMRTSequence & smrtRead, ReadInterval & subreadInterval, MappingParameters & params) { int start = subreadInterval.start; int end = subreadInterval.end; assert(smrtRead.length >= subreadSequence.length); smrtRead.MakeSubreadAsMasked(subreadSequence, start, end); if (!params.preserveReadTitle) { smrtRead.SetSubreadTitle(subreadSequence, subreadSequence.SubreadStart(), subreadSequence.SubreadEnd()); } else { subreadSequence.CopyTitle(smrtRead.title); } subreadSequence.zmwData = smrtRead.zmwData; } // Given a SMRT sequence and one of its subreads, make the // reverse complement of the subread in the coordinate of the // reverse complement sequence of the SMRT sequence. // Input: // smrtRead - a SMRT read // subreadSequence - a subread of smrtRead // Output: // subreadSequenceRC - the reverse complement of the subread // in the coordinate of the reverse // complement of the SMRT read. 
void MakeSubreadRC(SMRTSequence & subreadSequenceRC, SMRTSequence & subreadSequence, SMRTSequence & smrtRead) { assert(smrtRead.length >= subreadSequence.length); // Reverse complement sequence of the subread. subreadSequence.MakeRC(subreadSequenceRC); // Update start and end positions of subreadSequenceRC in the // coordinate of reverse compelement sequence of the SMRT read. subreadSequenceRC.SubreadStart(smrtRead.length - subreadSequence.SubreadEnd()); subreadSequenceRC.SubreadEnd (smrtRead.length - subreadSequence.SubreadStart()); subreadSequenceRC.zmwData = smrtRead.zmwData; } int CountZero(unsigned char *ptr, int length) { int i; int nZero = 0; for (i = 0; i < length; i++) { if (ptr[i] == 0) { ++nZero; } } return nZero; } void MakeVirtualRead(SMRTSequence & smrtRead, const vector & subreads) { assert(subreads.size() > 0); DNALength hqStart = 0, hqEnd = 0; for(auto subread: subreads) { hqStart = min(DNALength(subread.SubreadStart()), hqStart); hqEnd = max(DNALength(subread.SubreadEnd()), hqEnd); } smrtRead.Free(); smrtRead.Allocate(hqEnd); smrtRead.lowQualityPrefix = hqStart; smrtRead.lowQualitySuffix = smrtRead.length - hqEnd; smrtRead.highQualityRegionScore = subreads[0].highQualityRegionScore; stringstream ss; ss << SMRTTitle(subreads[0].GetTitle()).MovieName() << "/" << subreads[0].HoleNumber(); smrtRead.CopyTitle(ss.str()); for (auto subread: subreads) { memcpy(&smrtRead.seq[subread.SubreadStart()], &subread.seq[0], sizeof(char) * subread.length); } } void MakeSubreadIntervals(vector & subreads, vector & subreadIntervals) { subreadIntervals.clear(); for (auto subread: subreads) { subreadIntervals.push_back(ReadInterval(subread.SubreadStart(), subread.SubreadEnd(), subread.highQualityRegionScore)); } } int GetIndexOfMedian(const vector & subreadIntervals) { vector intervals = subreadIntervals; size_t n = intervals.size() / 2; nth_element(intervals.begin(), intervals.begin() + n, intervals.end(), [](const ReadInterval & a, const ReadInterval & b) -> bool 
{a.end - a.start < b.end - b.start;}); auto it = std::find(subreadIntervals.begin(), subreadIntervals.end(), intervals[n]); int pos = int(std::distance(subreadIntervals.begin(), it)); return pos; } #endif blasr-8e668beae0dda1da6914586fb458182c6c3c7482/include/BlasrUtils.hpp000066400000000000000000000206401260737656700241020ustar00rootroot00000000000000// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. // // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted (subject to the limitations in the // disclaimer below) provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // // * Neither the name of Pacific Biosciences nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE // GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC // BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE // DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF // USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND // ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT // OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF // SUCH DAMAGE. // Author: Mark Chaisson #ifndef _BLASR_INC_UTILS_HPP_ #define _BLASR_INC_UTILS_HPP_ #include "BlasrHeaders.h" //----------------------MODIFY ALIGNMENTS--------------------------// //FIXME: refactor class SequenceIndexDatabase void AssignRefContigLocation(T_AlignmentCandidate &alignment, SequenceIndexDatabase &seqdb, DNASequence &genome); //FIXME: refactor class SequenceIndexDatabase void AssignRefContigLocations(vector &alignmentPtrs, SequenceIndexDatabase &seqdb, DNASequence &genome); template //FIXME: refactor class SequenceIndexDatabase void AssignGenericRefContigName(vector &alignmentPtrs, T_RefSequence &genome); //FIXME: move to class ReadAlignments void StoreRankingStats(vector &alignments, VarianceAccumulator &accumPValue, VarianceAccumulator &accumWeight); //FIXME: mapQV should be assigned when alignments are created. 
void AssignMapQV(vector &alignmentPtrs); //FIXME: move to class ReadAlignments void ScaleMapQVByClusterSize(T_AlignmentCandidate &alignment, MappingParameters ¶ms); void StoreMapQVs(SMRTSequence &read, vector &alignmentPtrs, MappingParameters ¶ms); //--------------------SEARCH & CHECK ALIGNMENTS-------------------// //FIXME: move to class ReadAlignments template bool CheckForSufficientMatch(T_Sequence &read, vector &alignmentPtrs, MappingParameters ¶ms); //FIXME: move to class ReadAlignments int FindMaxLengthAlignment(vector alignmentPtrs, int &maxLengthIndex); //FIXME: move to class T_AlignmentCandidate void SumMismatches(SMRTSequence &read, T_AlignmentCandidate &alignment, int mismatchScore, int fullIntvStart, int fullIntvEnd, int &sum); //FIXME: move to class T_AlignmentCandidate /// \returns whether two alignments overlap by more than minPercentOverlap% bool AlignmentsOverlap(T_AlignmentCandidate &alnA, T_AlignmentCandidate &alnB, float minPercentOverlap); /// \brief Partition overlapping alignments. void PartitionOverlappingAlignments(vector &alignmentPtrs, vector > &partitions, float minOverlap); //--------------------FILTER ALIGNMENTS---------------------------// //FIXME: move to class T_AlignmentCandidate and ReadAlignments int RemoveLowQualitySDPAlignments(int readLength, vector &alignmentPtrs, MappingParameters ¶ms); //FIXME: move to class ReadAlignments template int RemoveLowQualityAlignments(T_Sequence &read, vector &alignmentPtrs, MappingParameters ¶ms); //FIXME: move to class ReadAlignments int RemoveOverlappingAlignments(vector &alignmentPtrs, MappingParameters ¶ms); // FIXME: move to class ReadAlignments // Delete all alignments from index startIndex in vector, inclusive.
void DeleteAlignments(vector &alignmentPtrs, int startIndex=0); //--------------------REFINE ALIGNMENTS---------------------------// template void RefineAlignment(vector &bothQueryStrands, T_RefSequence &genome, T_AlignmentCandidate &alignmentCandidate, MappingParameters ¶ms, MappingBuffers &mappingBuffers); template void RefineAlignments(vector &bothQueryStrands, T_RefSequence &genome, vector &alignmentPtrs, MappingParameters ¶ms, MappingBuffers &mappingBuffers); //--------------------PRINT ALIGNMENTS---------------------------// vector SelectAlignmentsToPrint(vector alignmentPtrs, MappingParameters & params, const int & associatedRandInt); // // The full read is not the subread, and does not have masked off characters. // void PrintAlignment(T_AlignmentCandidate &alignment, SMRTSequence &fullRead, MappingParameters ¶ms, AlignmentContext &alignmentContext, ostream &outFile #ifdef USE_PBBAM , PacBio::BAM::BamWriter * bamWriterPtr #endif ); // Print all alignments in vector alignmentPtrs void PrintAlignments(vector alignmentPtrs, SMRTSequence &read, MappingParameters ¶ms, ostream &outFile, AlignmentContext alignmentContext, #ifdef USE_PBBAM PacBio::BAM::BamWriter * bamWriterPtr, #endif MappingSemaphores & semaphores); void PrintAlignmentPtrs(vector & alignmentPtrs, ostream & out = cout); // Print all alignments for subreads in allReadAlignments. // Input: // allReadAlignments - contains a set of subreads, each of which // is associated with a group of alignments. // alignmentContext - an alignment context of each subread used // for printing in SAM format. // params - mapping parameters. // Output: // outFilePtr - where to print alignments for subreads. // unalignedFilePtr - where to print sequences for unaligned subreads. 
void PrintAllReadAlignments(ReadAlignments & allReadAlignments, AlignmentContext & alignmentContext, ostream & outFilePtr, ostream & unalignedFilePtr, MappingParameters & params, vector & subreads, #ifdef USE_PBBAM PacBio::BAM::BamWriter * bamWriterPtr, #endif MappingSemaphores & semaphores); #include "BlasrUtilsImpl.hpp" #endif blasr-8e668beae0dda1da6914586fb458182c6c3c7482/include/BlasrUtilsImpl.hpp000066400000000000000000001370431260737656700247320ustar00rootroot00000000000000// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. // // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted (subject to the limitations in the // disclaimer below) provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // // * Neither the name of Pacific Biosciences nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE // GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC // BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE // DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF // USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND // ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT // OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF // SUCH DAMAGE. // Author: Mark Chaisson #ifndef _BLASR_INC_UTILS_IMPL_HPP_ #define _BLASR_INC_UTILS_IMPL_HPP_ #include "BlasrAlign.hpp" //----------------------MODIFY ALIGNMENTS--------------------------// void AssignRefContigLocation(T_AlignmentCandidate &alignment, SequenceIndexDatabase &seqdb, DNASequence &genome) { // // If the sequence database is used, the start position of // the alignment is relative to the start of the chromosome, // not the entire index. Subtract off the start position of // the chromosome to get the true position. // DNALength forwardTPos; int seqDBIndex; if (alignment.tStrand == 0) { forwardTPos = alignment.tAlignedSeqPos; seqDBIndex = seqdb.SearchForIndex(forwardTPos); alignment.tAlignedSeqPos -= seqdb.seqStartPos[seqDBIndex]; } else { // // Flip coordinates into forward strand in order to find the boundaries // of the contig, then reverse them in order to find offset. // // Find the reverse complement coordinate of the index of the last aligned base. assert(alignment.tAlignedSeqLength > 0); forwardTPos = genome.MakeRCCoordinate(alignment.tAlignedSeqPos + alignment.tAlignedSeqLength - 1); seqDBIndex = seqdb.SearchForIndex(forwardTPos); // // Find the reverse comlement coordinate of the last base of this // sequence. This would normally be the start of the next contig // -1 to get the length, but since an 'N' is added between every // pair of sequences, this is -2. 
// DNALength reverseTOffset; reverseTOffset = genome.MakeRCCoordinate(seqdb.seqStartPos[seqDBIndex+1]-2); alignment.tAlignedSeqPos -= reverseTOffset; } } void AssignRefContigLocations(vector &alignmentPtrs, SequenceIndexDatabase &seqdb, DNASequence &genome) { UInt i; for (i = 0; i < alignmentPtrs.size(); i++) { T_AlignmentCandidate *aref = alignmentPtrs[i]; AssignRefContigLocation(*aref, seqdb, genome); } } template void AssignGenericRefContigName(vector &alignmentPtrs, T_RefSequence &genome) { UInt i; for (i = 0; i < alignmentPtrs.size(); i++) { T_AlignmentCandidate *aref = alignmentPtrs[i]; aref->tName = genome.title; } } void StoreRankingStats(vector &alignments, VarianceAccumulator &accumPValue, VarianceAccumulator &accumWeight) { int i; for (i = 0; i < int(alignments.size()); i++) { alignments[i]->pvalVariance = accumPValue.GetVariance(); alignments[i]->pvalNStdDev = accumPValue.GetNStdDev(alignments[i]->clusterScore); alignments[i]->weightVariance = accumWeight.GetVariance(); alignments[i]->weightNStdDev = accumWeight.GetNStdDev(alignments[i]->clusterWeight); } } void AssignMapQV(vector &alignmentPtrs) { int i; int mapQV = 1; if (alignmentPtrs.size() > 1 and alignmentPtrs[0]->score == alignmentPtrs[1]->score) { // the top two alignments have the same score, don't consider them as mapped. mapQV = 0; } for (i = 0; i < int(alignmentPtrs.size()); i++) { alignmentPtrs[i]->mapQV = mapQV; } } void ScaleMapQVByClusterSize(T_AlignmentCandidate &alignment, MappingParameters ¶ms) { if (alignment.numSignificantClusters > int(params.nCandidates)) { alignment.mapQV = Phred((1-InversePhred(alignment.mapQV))* ((float)params.nCandidates / alignment.numSignificantClusters)); } else if (alignment.numSignificantClusters == 0) { alignment.mapQV = 0; } } void StoreMapQVs(SMRTSequence &read, vector &alignmentPtrs, MappingParameters ¶ms) { // // Only weight alignments for mapqv against eachother if they are overlapping. 
// int a; vector > partitions; // Each set contains alignments that overlap on the read. DistanceMatrixScoreFunction distScoreFn; distScoreFn.del = params.deletion; distScoreFn.ins = params.insertion; // bug 24363, set affineOpen and affineExtend for distScoreFn distScoreFn.affineOpen = params.affineOpen; distScoreFn.affineExtend = params.affineExtend; distScoreFn.InitializeScoreMatrix(SMRTLogProbMatrix); IDSScoreFunction idsScoreFn; idsScoreFn.ins = params.insertion; idsScoreFn.del = params.deletion; idsScoreFn.affineExtend = params.affineExtend; idsScoreFn.affineOpen = params.affineOpen; idsScoreFn.substitutionPrior = params.substitutionPrior; idsScoreFn.globalDeletionPrior = params.globalDeletionPrior; // // Rescore the alignment so that it uses probabilities. // for (a = 0; a < int(alignmentPtrs.size()); a++) { if (params.ignoreQualities == false) { // bug 24363, pass -affineAlign to compute correct alignment score. alignmentPtrs[a]->probScore = -ComputeAlignmentScore(*alignmentPtrs[a], alignmentPtrs[a]->qAlignedSeq, alignmentPtrs[a]->tAlignedSeq, idsScoreFn, params.affineAlign) / 10.0; } else { alignmentPtrs[a]->probScore = -ComputeAlignmentScore(*alignmentPtrs[a], alignmentPtrs[a]->qAlignedSeq, alignmentPtrs[a]->tAlignedSeq, distScoreFn, params.affineAlign) / 10.0; } } PartitionOverlappingAlignments(alignmentPtrs, partitions, params.minFractionToBeConsideredOverlapping); int p; set::iterator partIt, partEnd; // // For each partition, store where on the read it begins, and where // it ends. 
// vector partitionBeginPos, partitionEndPos; partitionBeginPos.resize(partitions.size()); partitionEndPos.resize(partitions.size()); fill(partitionBeginPos.begin(), partitionBeginPos.end(), -1); fill(partitionEndPos.begin(), partitionEndPos.end(), -1); vector assigned; assigned.resize( alignmentPtrs.size()); fill(assigned.begin(), assigned.end(), false); for (p = 0; p < int(partitions.size()); p++) { partEnd = partitions[p].end(); int alnStart, alnEnd; if (partitions[p].size() > 0) { partIt = partitions[p].begin(); alignmentPtrs[*partIt]->GetQInterval(alnStart, alnEnd); partitionBeginPos[p] = alnStart; partitionEndPos[p] = alnEnd; ++partIt; partEnd = partitions[p].end(); for (; partIt != partEnd; ++partIt) { // Comment out because all reads are now in the forward strand. // alignmentPtrs[*partIt]->GetQInterval(alnStart, alnEnd, convertToForwardStrand); alignmentPtrs[*partIt]->GetQInterval(alnStart, alnEnd); if (alnEnd - alnStart > partitionEndPos[p] - partitionBeginPos[p]) { partitionBeginPos[p] = alnStart; partitionEndPos[p] = alnEnd; } } } } // // For each partition, determine the widest parts of the read that // are aligned in the partition. All alignments will be extended to // the end of the widest parts of the partition. // const static bool convertToForwardStrand = true; UInt i; // // For now, just use the alignment score as the probability score. // Although it is possible to use the full forward probability, for // the most part it is pretty much the same as the Vitterbi // probability, but it takes a lot longer to compute. // // // Now estimate what the alignment scores would be if they were // extended past the ends of their current alignment. 
// for (p = 0; p < int(partitions.size()); p++) { partEnd = partitions[p].end(); int alnStart, alnEnd; for (partIt = partitions[p].begin(); partitions[p].size() > 0 and partIt != partEnd; ++partIt) { int mismatchSum = 0; alignmentPtrs[*partIt]->GetQInterval(alnStart, alnEnd, convertToForwardStrand); if (alnStart - partitionBeginPos[p] > MAPQV_END_ALIGN_WIGGLE or partitionEndPos[p] - alnEnd > MAPQV_END_ALIGN_WIGGLE) { // bug 24363, use updated SumMismatches to compute mismatch score when // no QV is available. SumMismatches(read, *alignmentPtrs[*partIt], 15, partitionBeginPos[p], partitionEndPos[p], mismatchSum); } // // Random sequence can be aligned with about 50% similarity due // to optimization, so weight the qv sum // alignmentPtrs[*partIt]->probScore += -(mismatchSum) * 0.5; } } // // Determine mapqv by summing qvscores in partitions float mapQVDenominator = 0; for (p = 0; p < int(partitions.size()); p++) { set::iterator nextIt; if (partitions[p].size() == 0) { continue; } int index = *partitions[p].begin(); mapQVDenominator = alignmentPtrs[index]->probScore; if (partitions[p].size() > 1) { partIt = partitions[p].begin(); partEnd = partitions[p].end(); ++partIt; for (; partIt != partEnd; ++partIt) { index = *partIt; mapQVDenominator = LogSumOfTwo(mapQVDenominator, alignmentPtrs[index]->probScore); } } for (partIt = partitions[p].begin(); partIt != partitions[p].end(); ++partIt) { // // If only one alignment is found, assume maximum mapqv. // assigned[*partIt] = true; if (partitions[p].size() == 1) { alignmentPtrs[*partIt]->mapQV = MAX_PHRED_SCORE; } // // Look for overflow. 
// else if (alignmentPtrs[*partIt]->probScore - mapQVDenominator < -20) { alignmentPtrs[*partIt]->mapQV = 0; } else { double log10 = log(10); double sub = alignmentPtrs[*partIt]->probScore - mapQVDenominator; double expo = exp(log10*sub); double diff = 1.0 - expo; int phredValue; if (expo == 0) { phredValue = 0; } else if (diff == 0) { phredValue = MAX_PHRED_SCORE; } else { phredValue = Phred(diff); } if (phredValue > MAX_PHRED_SCORE) { phredValue = MAX_PHRED_SCORE; } alignmentPtrs[*partIt]->mapQV = phredValue; assigned[*partIt]=true; } if (params.scaleMapQVByNumSignificantClusters) { ScaleMapQVByClusterSize(*alignmentPtrs[*partIt], params); } } } for (i = 0; i < assigned.size(); i++) { assert(assigned[i]); } } //--------------------SEARCH & CHECK ALIGNMENTS-------------------// template bool CheckForSufficientMatch(T_Sequence &read, vector &alignmentPtrs, MappingParameters ¶ms) { if (alignmentPtrs.size() > 0 and alignmentPtrs[0]->score < params.maxScore) { return true; } else { return false; } } int FindMaxLengthAlignment(vector alignmentPtrs, int &maxLengthIndex) { int i; int maxLength = 0; maxLengthIndex = -1; for (i = 0; i < int(alignmentPtrs.size()); i++) { int qStart, qEnd; alignmentPtrs[i]->GetQInterval(qStart, qEnd); if (qEnd - qStart > maxLength) { maxLengthIndex = i; maxLength = qEnd - qStart; } } return (maxLength != -1); } void SumMismatches(SMRTSequence &read, T_AlignmentCandidate &alignment, int mismatchScore, int fullIntvStart, int fullIntvEnd, int &sum) { int alnStart, alnEnd; alignment.GetQIntervalOnForwardStrand(alnStart, alnEnd); int p; sum = 0; if (read.substitutionQV.Empty() == false) { for (p = fullIntvStart; p < alnStart; p++) { sum += read.substitutionQV[p]; } for (p = alnEnd; p < fullIntvEnd; p++) { sum += read.substitutionQV[p]; } } else { // bug 24363, compute mismatch score when QV is not available. 
sum += mismatchScore * ((alnStart - fullIntvStart) + (fullIntvEnd - alnEnd)); } } bool AlignmentsOverlap(T_AlignmentCandidate &alnA, T_AlignmentCandidate &alnB, float minPercentOverlap) { int alnAStart, alnAEnd, alnBStart, alnBEnd; bool useForwardStrand=true; alnA.GetQInterval(alnAStart, alnAEnd, useForwardStrand); alnB.GetQInterval(alnBStart, alnBEnd, useForwardStrand); // Look if one alignment encompasses the other int ovp = 0; if (alnAStart <= alnBStart and alnAEnd >= alnBEnd) { return true; } else if (alnBStart <= alnAStart and alnBEnd >= alnAEnd) { return true; //ovp = alnAEnd - alnAStart; } else { // // Look to see if the alignments overlap // if (alnAEnd >= alnBStart and alnAEnd <= alnBEnd) { ovp = alnAEnd - alnBStart; } else if (alnAStart >= alnBStart and alnAStart <= alnBEnd) { ovp = alnBEnd - alnAStart; } } // float ovpPercent = (2.0*ovp) / ((alnAEnd - alnAStart) + (alnBEnd - alnBStart)); float ovpPercent = 0; if (alnAEnd - alnAStart > 0 and alnBEnd - alnBStart > 0) { // overlap percentage: maximum overlap percent in A and B. ovpPercent = max(float(ovp)/float(alnAEnd - alnAStart), float(ovp)/float(alnBEnd - alnBStart)); } // returns true when an overlap is found. 
return (ovpPercent > minPercentOverlap); } void PartitionOverlappingAlignments(vector &alignmentPtrs, vector > &partitions, float minOverlap) { if (alignmentPtrs.size() == 0) { partitions.clear(); return; } set::iterator setIt, setEnd; int i, p; bool overlapFound = false; for (i = 0; i < int(alignmentPtrs.size()); i++) { overlapFound = false; for (p = 0; p < int(partitions.size()) and overlapFound == false; p++) { setEnd = partitions[p].end(); for (setIt = partitions[p].begin(); setIt != partitions[p].end() and overlapFound == false; ++setIt) { if (AlignmentsOverlap(*alignmentPtrs[i], *alignmentPtrs[*setIt], minOverlap) or ((alignmentPtrs[i]->QAlignStart() <= alignmentPtrs[*setIt]->QAlignStart()) and (alignmentPtrs[i]->QAlignEnd() > alignmentPtrs[*setIt]->QAlignEnd()))) { partitions[p].insert(i); overlapFound = true; } } } // // If this alignment does not overlap any other, create a // partition with it as the first element. // if (overlapFound == false) { partitions.push_back(set()); partitions[partitions.size()-1].insert(i); } } } //--------------------FILTER ALIGNMENTS---------------------------// int RemoveLowQualitySDPAlignments(int readLength, vector &alignmentPtrs, MappingParameters ¶ms) { // Just a hack. For now, assume there is at least 1 match per 50 bases. 
int totalBasesMatched = 0; int a; for (a = 0; a < int(alignmentPtrs.size()); a++) { int b; for (b = 0; b < int(alignmentPtrs[a]->blocks.size()); b++) { totalBasesMatched += alignmentPtrs[a]->blocks[b].length; } int expectedMatches = params.sdpTupleSize/50.0 * readLength; if (totalBasesMatched < expectedMatches) { delete alignmentPtrs[a]; alignmentPtrs[a] = NULL; } } int packedAlignmentIndex = 0; for (a = 0; a < int(alignmentPtrs.size()); a++) { if (alignmentPtrs[a] != NULL) { alignmentPtrs[packedAlignmentIndex] = alignmentPtrs[a]; packedAlignmentIndex++; } } alignmentPtrs.resize(packedAlignmentIndex); return packedAlignmentIndex; } template int RemoveLowQualityAlignments(T_Sequence &read, vector &alignmentPtrs, MappingParameters ¶ms) { if (params.verbosity > 0) { cout << "checking at least " << alignmentPtrs.size() << " alignments to see if they are accurate." << endl; } UInt i; for (i = 0; i < MIN(params.nCandidates, alignmentPtrs.size()); i++) { if (params.verbosity > 0) { cout << "Quality check " << i << " " << alignmentPtrs[i]->score << endl; } if (alignmentPtrs[i]->blocks.size() == 0 or alignmentPtrs[i]->score > params.maxScore) { // // Since the alignments are sorted according to alignment // score, once one of the alignments is too low of a score, // all remaining alignments are also too low, and should be // removed as well. Do that all at once. // if (alignmentPtrs[i]->blocks.size() == 0 and params.verbosity > 0) { cout << "Removing empty alignment " << alignmentPtrs[i]->qName << endl; } if (params.verbosity > 0) { cout << alignmentPtrs[i]->qName << " alignment " << i << " is too low of a score." 
<< alignmentPtrs[i]->score << endl; } int deletedIndex = i; for (; deletedIndex < alignmentPtrs.size(); deletedIndex++) { delete alignmentPtrs[deletedIndex]; alignmentPtrs[deletedIndex] = NULL; } alignmentPtrs.erase(i + alignmentPtrs.begin(), alignmentPtrs.end()); break; } else { if (params.verbosity > 0) { cout << "Keeping alignment " << i << " " << alignmentPtrs[i]->qPos << " " << alignmentPtrs[i]->qLength << " " << alignmentPtrs[i]->tName << " " << alignmentPtrs[i]->tPos << " " << alignmentPtrs[i]->tLength << " from score: " << alignmentPtrs[i]->score << endl; } } } return alignmentPtrs.size(); } //FIXME: move to class ReadAlignments int RemoveOverlappingAlignments(vector &alignmentPtrs, MappingParameters ¶ms) { vector alignmentIsContained; alignmentIsContained.resize(alignmentPtrs.size()); std::fill(alignmentIsContained.begin(), alignmentIsContained.end(), false); int j; int numContained = 0; int curNotContained = 0; if (alignmentPtrs.size() > 0) { UInt i; for (i = 0; i < alignmentPtrs.size()-1; i++ ){ T_AlignmentCandidate *aref = alignmentPtrs[i]; if (aref->pctSimilarity < params.minPctSimilarity) { continue; } for (j = i + 1; j < int(alignmentPtrs.size()); j++ ){ // // Make sure this alignment isn't already removed. // if (alignmentIsContained[j]) { continue; } // // Only check for containment if the two sequences are from the same contig. // if (alignmentPtrs[i]->tIndex != alignmentPtrs[j]->tIndex) { continue; } // // Check for an alignment that is fully overlapping another // alignment. if (aref->GenomicTBegin() <= alignmentPtrs[j]->GenomicTBegin() and aref->GenomicTEnd() >= alignmentPtrs[j]->GenomicTEnd() and alignmentPtrs[i]->tIndex == alignmentPtrs[j]->tIndex) { // // Alignment i is contained in j is only true if it has a worse score. 
// if (aref->score <= alignmentPtrs[j]->score) { alignmentIsContained[j] = true; } if (params.verbosity >= 2) { cout << "alignment " << i << " is contained in " << j << endl; cout << aref->tAlignedSeqPos << " " << alignmentPtrs[j]->tAlignedSeqPos << " " << aref->tAlignedSeqPos + aref->tAlignedSeqLength << " " << alignmentPtrs[j]->tAlignedSeqPos + alignmentPtrs[j]->tAlignedSeqLength << endl; } } else if (alignmentPtrs[j]->GenomicTBegin() <= aref->GenomicTBegin() and alignmentPtrs[j]->GenomicTEnd() >= aref->GenomicTEnd() and alignmentPtrs[i]->tIndex == alignmentPtrs[j]->tIndex) { if (params.verbosity >= 2) { cout << "ALIGNMENT " << j << " is contained in " << i << endl; cout << alignmentPtrs[j]->tAlignedSeqPos << " " << aref->tAlignedSeqPos << " " << alignmentPtrs[j]->tAlignedSeqPos + alignmentPtrs[j]->tAlignedSeqLength << " " << aref->tAlignedSeqPos + aref->tAlignedSeqLength << endl; } if (alignmentPtrs[j]->score <= aref->score) { alignmentIsContained[i] = true; } } } } for (i = 0; i < alignmentPtrs.size(); i++) { T_AlignmentCandidate *aref = alignmentPtrs[i]; if (alignmentIsContained[i]) { delete alignmentPtrs[i]; alignmentPtrs[i] = NULL; numContained++; } else { alignmentPtrs[curNotContained] = aref; ++curNotContained; } } alignmentPtrs.resize(alignmentPtrs.size() - numContained); } return alignmentPtrs.size(); } // Delete all alignments from index startIndex in vector, inclusive. 
void DeleteAlignments(vector &alignmentPtrs, int startIndex) { int i; for (i = startIndex; i < int(alignmentPtrs.size()); i++ ) { delete alignmentPtrs[i]; } alignmentPtrs.resize(0); } //--------------------REFINE ALIGNMENTS---------------------------// template void RefineAlignment(vector &bothQueryStrands, T_RefSequence &genome, T_AlignmentCandidate &alignmentCandidate, MappingParameters ¶ms, MappingBuffers &mappingBuffers) { FASTQSequence qSeq; DNASequence tSeq; DistanceMatrixScoreFunction distScoreFn( SMRTDistanceMatrix, params.deletion, params.insertion); DistanceMatrixScoreFunction distScoreFn2( SMRTDistanceMatrix, params.indel, params.indel); QualityValueScoreFunction scoreFn; IDSScoreFunction idsScoreFn; idsScoreFn.InitializeScoreMatrix(SMRTDistanceMatrix); scoreFn.del = params.indel; scoreFn.ins = params.indel; idsScoreFn.ins = params.insertion; idsScoreFn.del = params.deletion; idsScoreFn.affineExtend = params.affineExtend; idsScoreFn.affineOpen = params.affineOpen; idsScoreFn.substitutionPrior = params.substitutionPrior; idsScoreFn.globalDeletionPrior = params.globalDeletionPrior; if (params.doGlobalAlignment) { SMRTSequence subread; subread.ReferenceSubstring(*bothQueryStrands[0], bothQueryStrands[0]->SubreadStart(), (bothQueryStrands[0]->SubreadLength())); int drift = ComputeDrift(alignmentCandidate); T_AlignmentCandidate refinedAlignment; KBandAlign(subread, alignmentCandidate.tAlignedSeq, SMRTDistanceMatrix, params.insertion, params.deletion, drift, mappingBuffers.scoreMat, mappingBuffers.pathMat, refinedAlignment, idsScoreFn, Global); refinedAlignment.RemoveEndGaps(); ComputeAlignmentStats(refinedAlignment, subread.seq, alignmentCandidate.tAlignedSeq.seq, distScoreFn2); //idsScoreFn); alignmentCandidate.blocks = refinedAlignment.blocks; alignmentCandidate.gaps = refinedAlignment.gaps; alignmentCandidate.tPos = refinedAlignment.tPos; alignmentCandidate.qPos = refinedAlignment.qPos + bothQueryStrands[0]->SubreadStart(); alignmentCandidate.score = 
refinedAlignment.score; subread.Free(); } else if (params.useGuidedAlign) { T_AlignmentCandidate refinedAlignment; int lastBlock = alignmentCandidate.blocks.size() - 1; if (alignmentCandidate.blocks.size() > 0) { /* * Refine the alignment without expanding past the current * boundaries of the sequences that are already aligned. */ // // NOTE** this only makes sense when // alignmentCandidate.blocks[0].tPos == 0. Otherwise the length // of the sequence is not correct. // tSeq.Copy(alignmentCandidate.tAlignedSeq, alignmentCandidate.tPos, (alignmentCandidate.blocks[lastBlock].tPos + alignmentCandidate.blocks[lastBlock].length - alignmentCandidate.blocks[0].tPos)); // qSeq.ReferenceSubstring(alignmentCandidate.qAlignedSeq, qSeq.ReferenceSubstring(*bothQueryStrands[0], alignmentCandidate.qAlignedSeqPos + alignmentCandidate.qPos, (alignmentCandidate.blocks[lastBlock].qPos + alignmentCandidate.blocks[lastBlock].length)); if (!params.ignoreQualities && ReadHasMeaningfulQualityValues(alignmentCandidate.qAlignedSeq)) { if (params.affineAlign) { AffineGuidedAlign(qSeq, tSeq, alignmentCandidate, idsScoreFn, params.bandSize, mappingBuffers, refinedAlignment, Global, false); } else { GuidedAlign(qSeq, tSeq, alignmentCandidate, idsScoreFn, params.guidedAlignBandSize, mappingBuffers, refinedAlignment, Global, false); } } else { if (params.affineAlign) { AffineGuidedAlign(qSeq, tSeq, alignmentCandidate, distScoreFn, params.bandSize, mappingBuffers, refinedAlignment, Global, false); } else { GuidedAlign(qSeq, tSeq, alignmentCandidate, distScoreFn, params.guidedAlignBandSize, mappingBuffers, refinedAlignment, Global, false); } } ComputeAlignmentStats(refinedAlignment, qSeq.seq, tSeq.seq, distScoreFn2, params.affineAlign); // // Copy the refine alignment, which may be a subsequence of the // alignmentCandidate into the alignment candidate. // // First copy the alignment block and gap (the description of // the base by base alignment). 
alignmentCandidate.blocks.clear(); alignmentCandidate.blocks = refinedAlignment.blocks; alignmentCandidate.CopyStats(refinedAlignment); alignmentCandidate.gaps = refinedAlignment.gaps; alignmentCandidate.score = refinedAlignment.score; alignmentCandidate.nCells = refinedAlignment.nCells; // Next copy the information that describes what interval was // aligned. Since the reference sequences of the alignment // candidate have been modified, they are reassigned. alignmentCandidate.tAlignedSeq.Free(); alignmentCandidate.tAlignedSeq.TakeOwnership(tSeq); alignmentCandidate.ReassignQSequence(qSeq); alignmentCandidate.tAlignedSeqPos += alignmentCandidate.tPos; alignmentCandidate.qAlignedSeqPos += alignmentCandidate.qPos; // // tPos and qPos are the positions within the interval where the // alignment begins. The refined alignment has adifferent tPos // and qPos from the alignment candidate. alignmentCandidate.tPos = refinedAlignment.tPos; alignmentCandidate.qPos = refinedAlignment.qPos; // The lengths of the newly aligned sequences may differ, update those. alignmentCandidate.tAlignedSeqLength = tSeq.length; alignmentCandidate.qAlignedSeqLength = qSeq.length; } } else { // // This assumes an SDP alignment has been performed to create 'alignmentCandidate'. // // Recompute the alignment using a banded smith waterman to // get rid of any spurious effects of usign the seeded gaps. // // // The k-banded alignment is over a subsequence of the first // (sparse dynamic programming, SDP) alignment. The SDP // alignment is over a large window that may contain the // candidate sequence. The k-band alignment is over a tighter // region. int drift = ComputeDrift(alignmentCandidate); // // Rescore the alignment with a banded alignment that has a // better model of sequencing error. 
// if (alignmentCandidate.blocks.size() == 0 ){ alignmentCandidate.score = 0; return; } int lastBlock = alignmentCandidate.blocks.size() - 1; // // Assign the sequences that are going to be realigned using // banded alignment. The SDP alignment does not give that great // of a score, but it does do a good job at finding a backbone // alignment that closely defines the sequence that is aligned. // Reassign the subsequences for alignment with a tight bound // around the beginning and ending of each sequence, so that // global banded alignment may be performed. // // // This section needs to be cleaned up substantially. Right now it // copies a substring from the ref to a temp, then from the temp // back to the ref. It may be possible to just keep one pointer per // read to the memory that was allocated, then allow the seq // parameter to float around. The reason for all the copying is // that in case there is a compressed version of the genome the // seqences must be transformed before alignment. 
// if (alignmentCandidate.qIsSubstring) { qSeq.ReferenceSubstring(*bothQueryStrands[0], // the original sequence alignmentCandidate.qPos + alignmentCandidate.qAlignedSeqPos, alignmentCandidate.blocks[lastBlock].qPos + alignmentCandidate.blocks[lastBlock].length); } else { qSeq.ReferenceSubstring(alignmentCandidate.qAlignedSeq, // the subsequence that the alignment points to alignmentCandidate.qPos + alignmentCandidate.qAlignedSeqPos, alignmentCandidate.blocks[lastBlock].qPos + alignmentCandidate.blocks[lastBlock].length - alignmentCandidate.blocks[0].qPos); } tSeq.Copy(alignmentCandidate.tAlignedSeq, // the subsequence the alignment points to alignmentCandidate.tPos, // ofset into the subsequence alignmentCandidate.blocks[lastBlock].tPos + alignmentCandidate.blocks[lastBlock].length - alignmentCandidate.blocks[0].tPos); T_AlignmentCandidate refinedAlignment; // // When the parameter bandSize is 0, set the alignment band size // to the drift off the diagonal, plus a little more for wiggle // room. When the parameteris nonzero, use that as a fixed band. 
// int k; if (params.bandSize == 0) { k = abs(drift) * 1.5; } else { k = params.bandSize; } if (params.verbosity > 0) { cout << "drift: " << drift << " qlen: " << alignmentCandidate.qAlignedSeq.length << " tlen: " << alignmentCandidate.tAlignedSeq.length << " k: " << k << endl; cout << "aligning in " << k << " * " << alignmentCandidate.tAlignedSeq.length << " " << k * alignmentCandidate.tAlignedSeq.length << endl; } if (k < 10) { k = 10; } alignmentCandidate.tAlignedSeqPos += alignmentCandidate.tPos; VectorIndex lastSDPBlock = alignmentCandidate.blocks.size() - 1; if (alignmentCandidate.blocks.size() > 0) { DNALength prevLength = alignmentCandidate.tAlignedSeqLength -= alignmentCandidate.tPos; alignmentCandidate.tAlignedSeqLength = (alignmentCandidate.blocks[lastSDPBlock].tPos + alignmentCandidate.blocks[lastSDPBlock].length - alignmentCandidate.blocks[0].tPos); } else { alignmentCandidate.tAlignedSeqLength = 0; } alignmentCandidate.tPos = 0; alignmentCandidate.qAlignedSeqPos += alignmentCandidate.qPos; if (alignmentCandidate.blocks.size() > 0) { DNALength prevLength = alignmentCandidate.qAlignedSeqLength -= alignmentCandidate.qPos; alignmentCandidate.qAlignedSeqLength = (alignmentCandidate.blocks[lastSDPBlock].qPos + alignmentCandidate.blocks[lastSDPBlock].length - alignmentCandidate.blocks[0].qPos); } else { alignmentCandidate.qAlignedSeqLength = 0; } alignmentCandidate.qPos = 0; alignmentCandidate.blocks.clear(); alignmentCandidate.tAlignedSeq.Free(); alignmentCandidate.tAlignedSeq.TakeOwnership(tSeq); alignmentCandidate.ReassignQSequence(qSeq); if (params.verbosity >= 2) { cout << "refining target: " << endl; alignmentCandidate.tAlignedSeq.PrintSeq(cout); cout << "refining query: " << endl; static_cast(&alignmentCandidate.qAlignedSeq)->PrintSeq(cout); cout << endl; } PairwiseLocalAlign(qSeq, tSeq, k, params, alignmentCandidate, mappingBuffers, Fit); } } template void RefineAlignments(vector &bothQueryStrands, T_RefSequence &genome, vector &alignmentPtrs, 
MappingParameters ¶ms, MappingBuffers &mappingBuffers) { UInt i; for (i = 0; i < alignmentPtrs.size(); i++ ) { RefineAlignment(bothQueryStrands, genome, *alignmentPtrs[i], params, mappingBuffers); } // // It's possible the alignment references change their order after running // the local alignments. This is made into a parameter rather than resorting // every time so that the performance gain by resorting may be measured. // if (params.sortRefinedAlignments) { std::sort(alignmentPtrs.begin(), alignmentPtrs.end(), SortAlignmentPointersByScore()); } } vector SelectAlignmentsToPrint(vector alignmentPtrs, MappingParameters & params, const int & associatedRandInt) { if (params.placeRandomly) {assert(params.hitPolicy.IsRandombest());} if (alignmentPtrs.size() == 0) {return vector({});} std::sort(alignmentPtrs.begin(), alignmentPtrs.end(), SortAlignmentPointersByScore()); // Apply filter criteria and hit policy. // Shallow copy AlignmentCandidate pointers. vector filtered; for (auto ptr: alignmentPtrs) { if (params.filterCriteria.Satisfy(ptr)) { filtered.push_back(ptr); if (filtered.size() == params.nBest) break; } } return params.hitPolicy.Apply(filtered, false, associatedRandInt); } // The full read is not the subread, and does not have masked off characters. 
void PrintAlignment(T_AlignmentCandidate &alignment, SMRTSequence &fullRead, MappingParameters ¶ms, AlignmentContext &alignmentContext, ostream &outFile #ifdef USE_PBBAM , PacBio::BAM::BamWriter * bamWriterPtr #endif ) { try { int lastBlock = alignment.blocks.size() - 1; if (params.printFormat == StickPrint) { PrintAlignmentStats(alignment, outFile); StickPrintAlignment(alignment, (DNASequence&) alignment.qAlignedSeq, (DNASequence&) alignment.tAlignedSeq, outFile, alignment.qAlignedSeqPos, alignment.tAlignedSeqPos); } else if (params.printFormat == SAM) { SAMOutput::PrintAlignment(alignment, fullRead, outFile, alignmentContext, params.samQVList, params.clipping, params.cigarUseSeqMatch); } else if (params.printFormat == BAM) { #ifdef USE_PBBAM BAMOutput::PrintAlignment(alignment, fullRead, *bamWriterPtr, alignmentContext, params.samQVList, params.clipping, params.cigarUseSeqMatch); #else REQUIRE_PBBAM_ERROR(); #endif } else if (params.printFormat == CompareXML) { XMLOutput::Print(alignment, (DNASequence&) alignment.qAlignedSeq, (DNASequence&) alignment.tAlignedSeq, outFile, alignment.qAlignedSeqPos, alignment.tAlignedSeqPos); } else if (params.printFormat == Vulgar) { PrintAlignmentStats(alignment, outFile); VulgarOutput::Print(alignment, outFile); } else if (params.printFormat == CompareSequencesParsable) { CompareSequencesOutput::Print(alignment, alignment.qAlignedSeq, alignment.tAlignedSeq, outFile); } else if (params.printFormat == Interval) { if (alignment.blocks.size() > 0) { IntervalOutput::Print(alignment, outFile); } } else if (params.printFormat == SummaryPrint) { if (alignment.blocks.size() > 0) { SummaryOutput::Print(alignment, outFile); } } } catch (ostream::failure f) { cout << "ERROR writing to output file. The output drive may be full, or you " << endl; cout << "may not have proper write permissions." 
<< endl; exit(1); } } // Print all alignments in vector alignmentPtrs void PrintAlignments(vector alignmentPtrs, SMRTSequence &read, MappingParameters ¶ms, ostream &outFile, AlignmentContext alignmentContext, #ifdef USE_PBBAM PacBio::BAM::BamWriter * bamWriterPtr, #endif MappingSemaphores & semaphores) { if (params.nProc > 1) { #ifdef __APPLE__ sem_wait(semaphores.writer); #else sem_wait(&semaphores.writer); #endif } for (int i = 0; i < int(alignmentPtrs.size()); i++) { T_AlignmentCandidate *aref = alignmentPtrs[i]; if (aref->blocks.size() == 0) { // // If the SDP alignment finds nothing, there will be no // blocks. This may happen if the sdp block size is larger // than the anchor size found with the suffix array. When no // blocks are found there is no alignment, so zero-out the // score and continue. // aref->score = 0; if (params.verbosity > 0) { cout << "Zero blocks found for " << aref->qName << " " << aref->qAlignedSeqPos << " " << aref->tAlignedSeqPos << endl; } continue; } // // Configure some of the alignment context before printing. 
// if (i > 0 and params.placeRandomly == false) { alignmentContext.isPrimary = false; } else { alignmentContext.isPrimary = true; } if (params.printSAM or params.printBAM) { DistanceMatrixScoreFunction editdistScoreFn(EditDistanceMatrix, 1, 1); T_AlignmentCandidate & alignment = *alignmentPtrs[i]; alignmentContext.editDist = ComputeAlignmentScore(alignment, alignment.qAlignedSeq, alignment.tAlignedSeq, editdistScoreFn); } PrintAlignment(*alignmentPtrs[i], read, params, alignmentContext, outFile #ifdef USE_PBBAM , bamWriterPtr #endif ); } if (params.nProc > 1) { #ifdef __APPLE__ sem_post(semaphores.writer); #else sem_post(&semaphores.writer); #endif } } void PrintAlignmentPtrs(vector & alignmentPtrs, ostream & out) { for(int alignmentIndex = 0; alignmentIndex < int(alignmentPtrs.size()); alignmentIndex++) { out << "["<< alignmentIndex << "/" << alignmentPtrs.size() << "]" << endl; T_AlignmentCandidate *alignment = alignmentPtrs[alignmentIndex]; alignment->Print(out); } out << endl; } // Print all alignments for subreads in allReadAlignments. // Input: // allReadAlignments - contains a set of subreads, each of which // is associated with a group of alignments. // alignmentContext - an alignment context of each subread used // for printing in SAM format. // params - mapping parameters. // Output: // outFilePtr - where to print alignments for subreads. // unalignedFilePtr - where to print sequences for unaligned subreads. void PrintAllReadAlignments(ReadAlignments & allReadAlignments, AlignmentContext & alignmentContext, ostream & outFilePtr, ostream & unalignedFilePtr, MappingParameters & params, vector & subreads, #ifdef USE_PBBAM PacBio::BAM::BamWriter * bamWriterPtr, #endif MappingSemaphores & semaphores) { int subreadIndex; int nAlignedSubreads = allReadAlignments.GetNAlignedSeq(); // // Initialize the alignemnt context with information applicable to SAM output. 
// alignmentContext.alignMode = allReadAlignments.alignMode; for (subreadIndex = 0; subreadIndex < nAlignedSubreads; subreadIndex++) { if (allReadAlignments.subreadAlignments[subreadIndex].size() > 0) { alignmentContext.numProperlyAlignedSubreads++; } } if (alignmentContext.numProperlyAlignedSubreads == int(allReadAlignments.subreadAlignments.size())) { alignmentContext.allSubreadsProperlyAligned = true; } alignmentContext.nSubreads = nAlignedSubreads; for (subreadIndex = 0; subreadIndex < nAlignedSubreads; subreadIndex++) { alignmentContext.subreadIndex = subreadIndex; if (subreadIndex < nAlignedSubreads-1 and allReadAlignments.subreadAlignments[subreadIndex+1].size() > 0) { alignmentContext.nextSubreadPos = allReadAlignments.subreadAlignments[subreadIndex+1][0]->QAlignStart(); alignmentContext.nextSubreadDir = allReadAlignments.subreadAlignments[subreadIndex+1][0]->qStrand; alignmentContext.rNext = allReadAlignments.subreadAlignments[subreadIndex+1][0]->tName; alignmentContext.hasNextSubreadPos = true; } else { alignmentContext.nextSubreadPos = 0; alignmentContext.nextSubreadDir = 0; alignmentContext.rNext = ""; alignmentContext.hasNextSubreadPos = false; } SMRTSequence & sourceSubread = allReadAlignments.subreads[subreadIndex]; if (subreads.size() == allReadAlignments.subreads.size()) { sourceSubread = subreads[subreadIndex]; } if (allReadAlignments.subreadAlignments[subreadIndex].size() > 0) { PrintAlignments(allReadAlignments.subreadAlignments[subreadIndex], sourceSubread, // for these alignments params, outFilePtr,//*mapData->outFilePtr, alignmentContext, #ifdef USE_PBBAM bamWriterPtr, #endif semaphores); } else { // // Print the unaligned sequences. 
// if (params.printUnaligned == true) { if (params.nProc == 1) { //allReadAlignments.subreads[subreadIndex].PrintSeq(*mapData->unalignedFilePtr); allReadAlignments.subreads[subreadIndex].PrintSeq(unalignedFilePtr); } else { #ifdef __APPLE__ sem_wait(semaphores.unaligned); #else sem_wait(&semaphores.unaligned); #endif //allReadAlignments.subreads[subreadIndex].PrintSeq(*mapData->unalignedFilePtr); allReadAlignments.subreads[subreadIndex].PrintSeq(unalignedFilePtr); #ifdef __APPLE__ sem_post(semaphores.unaligned); #else sem_post(&semaphores.unaligned); #endif } // End of nproc > 1. } // End of printing unaligned sequences. } // End of finding no alignments for the subread with subreadIndex. } // End of printing and processing alignmentContext for each subread. } #endif blasr-8e668beae0dda1da6914586fb458182c6c3c7482/include/MappingBuffers.hpp000066400000000000000000000110211260737656700247170ustar00rootroot00000000000000// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. // // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted (subject to the limitations in the // disclaimer below) provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // // * Neither the name of Pacific Biosciences nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE // GRANTED BY THIS LICENSE. 
THIS SOFTWARE IS PROVIDED BY PACIFIC // BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE // DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF // USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND // ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT // OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF // SUCH DAMAGE. // Author: Mark Chaisson #ifndef __BLASR_MAPPING_BUFFERS__ #define __BLASR_MAPPING_BUFFERS__ #include #include "tuples/DNATuple.hpp" #include "tuples/TupleList.hpp" #include "algorithms/alignment/sdp/SDPFragment.hpp" #include "algorithms/anchoring/BasicEndpoint.hpp" #include "datastructures/anchoring/ClusterList.hpp" #include "datastructures/anchoring/MatchPos.hpp" using namespace std; // // Define a list of buffers that are meant to grow to high-water // marks, and not shrink down past that. The memory is reused rather // than having multiple calls to new. 
// class MappingBuffers { public: vector hpInsScoreMat, insScoreMat; vector kbandScoreMat; vector hpInsPathMat, insPathMat; vector kbandPathMat; vector scoreMat; vector pathMat; vector affineScoreMat; vector affinePathMat; vector matchPosList; vector rcMatchPosList; vector > globalChainEndpointBuffer; vector sdpFragmentSet, sdpPrefixFragmentSet, sdpSuffixFragmentSet; TupleList sdpCachedTargetTupleList; TupleList sdpCachedTargetPrefixTupleList; TupleList sdpCachedTargetSuffixTupleList; std::vector sdpCachedMaxFragmentChain; vector probMat; vector optPathProbMat; vector lnSubPValueMat; vector lnInsPValueMat; vector lnDelPValueMat; vector lnMatchPValueMat; vector clusterNumBases; ClusterList clusterList; ClusterList revStrandClusterList; void Reset(void); }; inline void MappingBuffers::Reset(void) { vector().swap(hpInsScoreMat); vector().swap(insScoreMat); vector().swap(kbandScoreMat); vector().swap(hpInsPathMat); vector().swap(insPathMat); vector().swap(kbandPathMat); vector().swap(scoreMat); vector().swap(pathMat); vector().swap(matchPosList); vector().swap(rcMatchPosList); vector >().swap(globalChainEndpointBuffer); vector().swap(sdpFragmentSet); vector().swap(sdpPrefixFragmentSet); vector().swap(sdpSuffixFragmentSet); sdpCachedTargetTupleList.Reset(); sdpCachedTargetPrefixTupleList.Reset(); sdpCachedTargetSuffixTupleList.Reset(); vector().swap(sdpCachedMaxFragmentChain); vector().swap(probMat); vector().swap(optPathProbMat); vector().swap(lnSubPValueMat); vector().swap(lnInsPValueMat); vector().swap(lnDelPValueMat); vector().swap(lnMatchPValueMat); vector().swap(clusterNumBases); } #endif blasr-8e668beae0dda1da6914586fb458182c6c3c7482/include/MappingIPC.h000066400000000000000000000073441260737656700234130ustar00rootroot00000000000000#ifndef MAPPING_IPC_H_ #define MAPPING_IPC_H_ #include #include "MappingParameters.h" #include "FASTASequence.hpp" #include "FASTQSequence.hpp" #include "tuples/TupleList.hpp" #include "tuples/DNATuple.hpp" #include 
"tuples/CompressedDNATuple.hpp" #include "tuples/TupleCountTable.hpp" #include "files/ReaderAgglomerate.hpp" #include "MappingMetrics.hpp" #include "suffixarray/SuffixArrayTypes.hpp" #include "metagenome/SequenceIndexDatabase.hpp" #include "reads/RegionTable.hpp" #include "bwt/BWT.hpp" /* * This structure contains pointers to all required data structures * for mapping reads to a suffix array and evaluating the significance * of the matches. */ template class MappingData { public: T_SuffixArray *suffixArrayPtr; BWT *bwtPtr; T_GenomeSequence *referenceSeqPtr; SequenceIndexDatabase *seqDBPtr; TupleCountTable *ctabPtr; MappingParameters params; MappingMetrics metrics; RegionTable *regionTablePtr; ReaderAgglomerate *reader; ostream *outFilePtr; ostream *unalignedFilePtr; ostream *anchorFilePtr; ostream *clusterFilePtr; ostream *lcpBoundsOutPtr; // Declare a semaphore for blocking on reading from the same hdhf file. void ShallowCopySuffixArray(T_SuffixArray &dest) { dest.index = suffixArrayPtr->index; dest.length = suffixArrayPtr->length; dest.target = suffixArrayPtr->target; dest.startPosTable = suffixArrayPtr->startPosTable; dest.endPosTable = suffixArrayPtr->endPosTable; dest.lookupTableLength = suffixArrayPtr->lookupTableLength; dest.lookupPrefixLength = suffixArrayPtr->lookupPrefixLength; dest.tm = suffixArrayPtr->tm; dest.deleteStructures = false; // dest.useLCPTable = suffixArrayPtr->useLCPTable; } void ShallowCopySequenceIndexDatabase(SequenceIndexDatabase &dest) { dest.nSeqPos = seqDBPtr->nSeqPos; dest.seqStartPos = seqDBPtr->seqStartPos; dest.nameLengths = seqDBPtr->nameLengths; dest.names = seqDBPtr->names; dest.deleteStructures = false; } void ShallowCopyTupleCountTable( TupleCountTable &dest) { dest.countTable = ctabPtr->countTable; dest.countTableLength = ctabPtr->countTableLength; dest.nTuples = ctabPtr->nTuples; dest.tm = ctabPtr->tm; dest.deleteStructures = false; } void ShallowCopyReferenceSequence(T_GenomeSequence &refSeq) { 
refSeq.ShallowCopy(*referenceSeqPtr); refSeq.deleteOnExit = false; } void Initialize(T_SuffixArray *saP, T_GenomeSequence *refP, SequenceIndexDatabase *seqDBP, TupleCountTable *ctabP, ReverseCompressIndex *rciP, MappingParameters ¶msP, ReaderAgglomerate *readerP, RegionTable *regionTableP, ostream *outFileP, ostream *unalignedFileP, ostream *anchorFilePtrP, ostream *clusterFilePtrP=NULL) { suffixArrayPtr = saP; referenceSeqPtr = refP; seqDBPtr = seqDBP; ctabPtr = ctabP; regionTablePtr = regionTableP; params = paramsP; reader = readerP; outFilePtr = outFileP; unalignedFilePtr = unalignedFileP; anchorFilePtr = anchorFilePtrP; clusterFilePtr= clusterFilePtrP; } }; #endif blasr-8e668beae0dda1da6914586fb458182c6c3c7482/include/MappingParameters.h000066400000000000000000000507651260737656700251100ustar00rootroot00000000000000#ifndef MAPPING_PARAMETERS_H_ #define MAPPING_PARAMETERS_H_ #define REQUIRE_PBBAM_ERROR() \ assert("blasr must be compiled with lib pbbam to perform IO on bam." == 0); #include #include "reads/ReadType.hpp" #include "utils/FileOfFileNames.hpp" #include "utils/RangeUtils.hpp" #include "tuples/TupleMetrics.hpp" #include "datastructures/anchoring/AnchorParameters.hpp" #include "qvs/QualityValue.hpp" #include "format/SAMPrinter.hpp" #include "algorithms/alignment/AlignmentFormats.hpp" #include "files/BaseSequenceIO.hpp" #include "datastructures/alignment/FilterCriteria.hpp" class MappingParameters { public: // // Parameters for global substitution, insertion, and deletion priors. 
// float minFractionToBeConsideredOverlapping; float indelRate; float minRatio; int indel; int idsIndel; int sdpIndel; int sdpIns, sdpDel; int insertion; int deletion; int mismatch; int sdpTupleSize; int match; int showAlign; int refineAlign; bool useScoreCutoff; int maxScore; int argi; int nProc; int globalChainType; SAMOutput::Clipping clipping; string clippingString; QVScale qvScaleType; vector readsFileNames; // = queryFileNames, genomeFileName vector queryFileNames; string genomeFileName; // Query file type: FASTA/FASTQ/HDF*/PBBAM, // Note that mixed query file types is not allowed. FileType queryFileType; // Query read type, SUBREAD, CCS or UNKNOWN // Note that mixed read types is not allowed. ReadType::ReadTypeEnum queryReadType; vector regionTableFileNames; vector ccsFofnFileNames; string tupleListName; string posTableName; string outFileName; string suffixArrayFileName; string bwtFileName; string indexFileName; string anchorFileName; string clusterFileName; VectorIndex nBest; int printWindow; int doCondense; int do4BitComp; int cutoff; int useSuffixArray; int useBwt; int useReverseCompressIndex; int useTupleList; int useSeqDB; string seqDBName; int useCountTable; string countTableName; int minMatchLength; int listTupleSize; int printFormat; int maxExpand, minExpand; int startRead; int stride; int pValueType; float subsample; int sortRefinedAlignments; int verbosity; bool printSAM; bool cigarUseSeqMatch; bool printBAM; bool storeMapQV; bool useRandomSeed; int randomSeed; bool placeRandomly; bool printHeader; bool samplePaths; bool warp, nowarp; //bool usePrefixLookupTable; bool doSensitiveSearch; bool emulateNucmer; bool refineBetweenAnchorsOnly; bool byAdapter; bool extendDenovoCCSSubreads; TupleMetrics saTupleMetrics; TupleMetrics sdpTupleMetrics; int lookupTableLength; //int branchQualityThreshold; int qualityLowerCaseThreshold; AnchorParameters anchorParameters; int readsFileIndex; //int numBranches; bool storeMetrics; bool ignoreQualities; bool 
extendFrontAlignment; bool extendAlignments; int maxExtendDropoff; int minReadLength; int maxReadLength; int minSubreadLength; int minRawSubreadScore; int minAvgQual; bool overlap; bool advanceHalf; int advanceExactMatches; float approximateMaxInsertionRate; float minPctSimilarity; // [0, 100] float minPctAccuracy; // [0, 100] bool refineAlignments; int nCandidates; bool doGlobalAlignment; string tempDirectory; bool useTitleTable; string titleTableName; bool readSeparateRegionTable; bool readSeparateCcsFofn; string regionTableFileName; string ccsFofnFileName; //float averageMismatchScore; bool mapSubreadsSeparately; bool concordant; int flankSize; bool useRegionTable; bool useHQRegionTable; bool printUnaligned; string unalignedFileName; string metricsFileName; string lcpBoundsFileName; string fullMetricsFileName; bool printSubreadTitle; bool useCcs; bool useAllSubreadsInCcs; bool useCcsOnly; bool detailedSDPAlignment, nouseDetailedSDPAlignment; int chunkSize; int sdpFilterType; bool useGuidedAlign; int guidedAlignBandSize; int bandSize; int extendBandSize; bool useQVScore; int scoreType; bool printVerboseHelp; bool printDiscussion; float sdpBypassThreshold; bool computeAlignProbability; float qvMatchWeight; float qvMismatchWeight; float qvInsWeight; float qvDelWeight; float readAccuracyPrior; bool printVersion; int substitutionPrior; int globalDeletionPrior; bool outputByThread; int recurseOver; bool forPicard; bool separateGaps; string scoreMatrixString; bool printDotPlots; bool preserveReadTitle; bool forwardOnly; bool printOnlyBest; bool affineAlign; int affineExtend; int affineOpen; bool scaleMapQVByNumSignificantClusters; int limsAlign; string holeNumberRangesStr; Ranges holeNumberRanges; int minAlnLength; bool printSAMQV; vector samQV; SupplementalQVList samQVList; bool fastMaxInterval; bool aggressiveIntervalCut; bool fastSDP; string concordantTemplate; bool concordantAlignBothDirections; FilterCriteria filterCriteria; string hitPolicyStr; HitPolicy 
hitPolicy; bool enableHiddenPaths; void Init() { qvMatchWeight = 1.0; qvMismatchWeight = 1.0; qvInsWeight = 1.0; qvDelWeight = 1.0; minFractionToBeConsideredOverlapping = 0.75; minRatio = 0.25; indelRate = 0.3; indel = 5; insertion = 4; // asymmetric indel parameters deletion = 5; idsIndel = 15; sdpIndel = 5; sdpIns = 5; sdpDel = 10; sdpTupleSize = 11; match = 0; mismatch = 0; showAlign = 1; refineAlign = 1; useScoreCutoff = false; maxScore = -200; argi = 1; nProc = 1; readsFileNames.clear(); queryFileNames.clear(); genomeFileName = ""; queryReadType = ReadType::UNKNOWN; queryFileType = FileType::None; tupleListName = ""; posTableName = ""; suffixArrayFileName= ""; bwtFileName = ""; indexFileName = ""; anchorFileName = ""; outFileName = ""; nBest = 10; nCandidates = 10; printWindow = 0; doCondense = 0; do4BitComp = 0; pValueType = 0; cutoff = 0; useSuffixArray = 0; useBwt = 0; useReverseCompressIndex = 0; useTupleList = 0; useSeqDB = 0; seqDBName = ""; useCountTable = 0; countTableName = ""; lookupTableLength = 8; anchorParameters.minMatchLength = minMatchLength = 12; printFormat = SummaryPrint; maxExpand = 0; minExpand = 0; startRead = 0; stride = 1; subsample = 1.1; listTupleSize = 6; sortRefinedAlignments = 1; anchorParameters.verbosity = verbosity = 0; saTupleMetrics.Initialize(listTupleSize); sdpTupleMetrics.Initialize(sdpTupleSize); qualityLowerCaseThreshold = 0; anchorParameters.branchQualityThreshold = 0; readsFileIndex = 0; printSAM = false; printBAM = false; useRandomSeed = false; randomSeed = 0; placeRandomly = false; samplePaths = false; nowarp = false; storeMapQV = true; warp = true; extendDenovoCCSSubreads = false; storeMetrics = false; ignoreQualities = true; extendFrontAlignment = false; extendAlignments = false; maxExtendDropoff = 10; minReadLength = 50; maxReadLength = 0; // means no max read length minSubreadLength = 0; minRawSubreadScore = -1; // raw subread score in region table should be in range [0, 1000]. 
minAvgQual = 0; overlap = false; advanceHalf = false; refineAlignments = true; anchorParameters.advanceExactMatches = advanceExactMatches = 0; approximateMaxInsertionRate = 1.30; minPctSimilarity = 0; minPctAccuracy = 0; doGlobalAlignment = false; tempDirectory = ""; useTitleTable = false; titleTableName = ""; readSeparateRegionTable = false; readSeparateCcsFofn = false; regionTableFileName = ""; ccsFofnFileName = ""; mapSubreadsSeparately=true; concordant=false; flankSize=40; useRegionTable = true; useHQRegionTable=true; printUnaligned = false; unalignedFileName = ""; globalChainType = 0; metricsFileName = ""; fullMetricsFileName = ""; doSensitiveSearch = false; emulateNucmer = false; refineBetweenAnchorsOnly = false; printSubreadTitle = true; detailedSDPAlignment = true; nouseDetailedSDPAlignment = false; useCcs = false; useCcsOnly = false; useAllSubreadsInCcs = false; chunkSize = 10000000; sdpFilterType = 0; anchorParameters.stopMappingOnceUnique = true; useGuidedAlign = true; bandSize = 0; extendBandSize = 10; guidedAlignBandSize = 10; useQVScore = false; printVerboseHelp = false; printDiscussion = false; sdpBypassThreshold = 1000000.0; scoreType = 0; byAdapter = false; qvScaleType = PHRED; printHeader = false; computeAlignProbability = false; readAccuracyPrior = 0.85; printVersion = false; clipping = SAMOutput::none; clippingString = ""; substitutionPrior = 20; globalDeletionPrior = 13; outputByThread = false; recurseOver = 10000; forPicard = false; separateGaps = false; scoreMatrixString = ""; printDotPlots = false; preserveReadTitle = false; forwardOnly = false; printOnlyBest = false; affineAlign = false; affineExtend = 0; affineOpen = 10; scaleMapQVByNumSignificantClusters = false; limsAlign = 0; holeNumberRangesStr = ""; minAlnLength = 0; printSAMQV = false; cigarUseSeqMatch = false; samQV.clear(); samQVList.clear(); fastMaxInterval = false; aggressiveIntervalCut = false; fastSDP = false; concordantTemplate = "mediansubread"; // typicalsubread or 
longestsubread concordantAlignBothDirections = false; hitPolicyStr = "all"; ResetFilterAndHit(); enableHiddenPaths = false; //turn off hidden paths. } MappingParameters() : filterCriteria(0, 0, 0, false, Score(0, ScoreSign::NEGATIVE)) , hitPolicy("all", ScoreSign::NEGATIVE) { Init(); } void MakeSane() { // Expand FOFN FileOfFileNames::ExpandFileNameList(readsFileNames); // Must have at least a query and a genome if (readsFileNames.size() <= 1) { cout << "Error, you must provide at least one reads file and a genome file." < 1) { cerr << "Warning: using new filter method for SDP alignments. The parameter is " << endl << "either 0 or 1, but " << sdpFilterType << " was specified." << endl; sdpFilterType = 1; } if (sdpFilterType == 0) { detailedSDPAlignment = true; nouseDetailedSDPAlignment = false; } if (detailedSDPAlignment == false) { sdpFilterType = 1; } if (useGuidedAlign == true and bandSize == 0) { bandSize = 16; } anchorParameters.minMatchLength = minMatchLength; if (suffixArrayFileName != "") { useSuffixArray = true; } if (bwtFileName != "") { useBwt = true; } if (useBwt and useSuffixArray) { cout << "ERROR, sa and bwt must be used independently." 
<< endl; exit(1); } if (countTableName != "") { useCountTable = true; } if (metricsFileName != "" or fullMetricsFileName != "") { storeMetrics = true; } if (useCcsOnly) { useCcs = true; } if (useAllSubreadsInCcs == true) { useCcs = true; } if (titleTableName != "") { useTitleTable = true; } if (unalignedFileName != "") { printUnaligned = true; } if (regionTableFileName != "") { useRegionTable = true; readSeparateRegionTable = true; } if (ccsFofnFileName != "") { readSeparateCcsFofn = true; } if (nouseDetailedSDPAlignment == true) { detailedSDPAlignment = false; } if (nouseDetailedSDPAlignment == false) { detailedSDPAlignment = true; } if (anchorParameters.maxLCPLength != 0 and anchorParameters.maxLCPLength < anchorParameters.minMatchLength) { cerr << "ERROR: maxLCPLength is less than minLCPLength, which will result in no hits." << endl; } if (subsample < 1 and stride > 1) { cout << "ERROR, subsample and stride must be used independently." << endl; exit(1); } if (emulateNucmer) { SetEmulateNucmer(); } if (randomSeed != 0) { useRandomSeed = true; } if (printSAM) { printFormat = SAM; forPicard = true; } // // Parse the clipping. // if (clippingString == "soft") { clipping = SAMOutput::soft; } else if (clippingString == "hard") { clipping = SAMOutput::hard; } else if (clippingString == "none") { clipping = SAMOutput::none; } else if (clippingString == "subread") { clipping = SAMOutput::subread; } else if (clippingString != "") { cout << "ERROR, clipping should either be soft, hard, or none." << endl; exit(1); } if (printBAM) { #ifndef USE_PBBAM REQUIRE_PBBAM_ERROR(); #else cigarUseSeqMatch = true; // ALWAYS true for BAM printFormat = BAM; forPicard = true; printSAM = false; samQVList.SetDefaultQV(); printSAMQV = true; if (clipping != SAMOutput::soft) { // Only support two clipping methods: soft or subread. 
clipping = SAMOutput::subread; } if (queryFileType != PBBAM and not enableHiddenPaths) { // bax|fasta|fastq -> bam paths are turned off by default cout << "ERROR, could not output alignments in BAM unless input reads are in PacBio BAM files." << endl; exit(1); } if (outFileName == "") { cout << "ERROR, BAM output file must be specified." << endl; exit(1); } if (outputByThread) { cout << "ERROR, could not output alignments by threads in BAM format." << endl; exit(1); } #endif } if (limsAlign != 0) { mapSubreadsSeparately = false; forwardOnly = true; } if (holeNumberRangesStr.size() > 0) { if (not holeNumberRanges.setRanges(holeNumberRangesStr)) { cout << "ERROR, could not parse hole number ranges: " << holeNumberRangesStr << "." << endl; exit(1); } } if (printSAMQV) { if (samQV.size() == 0) { samQVList.SetDefaultQV(); } else { samQVList.UseQV(samQV); } } if (minRawSubreadScore > 1000) { cout << "ERROR, minimum raw subread score should be less than 1000." << endl; exit(1); } if (minRawSubreadScore != -1 and byAdapter) { cout << "ERROR, minRawSubreadScore and byAdapter should not be used together." << endl; exit(1); } // Determine query read type queryReadType = DetermineQueryReadType(); // Pass verbosity anchorParameters.verbosity = verbosity; // Set filter criteria and hit policy ResetFilterAndHit(); } void ResetFilterAndHit(void) { filterCriteria = FilterCriteria(minAlnLength, minPctSimilarity, minPctAccuracy, true, Score(static_cast(maxScore), ScoreSign::NEGATIVE)); hitPolicy = HitPolicy(hitPolicyStr, ScoreSign::NEGATIVE); } ReadType::ReadTypeEnum DetermineQueryReadType() { if (useCcsOnly or queryFileType == HDFCCSONLY) { return ReadType::CCS; } if (queryFileType == PBBAM) { // Read type in BAM may be CCS, SUBREAD, HQREGION or POLYMERASE. // Determine it later. 
return ReadType::UNKNOWN; } if (mapSubreadsSeparately) { return ReadType::SUBREAD; } else { if (useHQRegionTable) { return ReadType::HQREGION; } else { return ReadType::POLYMERASE; } } } void SetEmulateNucmer() { anchorParameters.stopMappingOnceUnique = true; anchorParameters.advanceExactMatches = 30; anchorParameters.maxAnchorsPerPosition = 1; sdpBypassThreshold = 0.75; sdpTupleSize = 15; anchorParameters.minMatchLength = 30; useGuidedAlign = true; refineAlignments = false; } void SetForSensitivity() { advanceExactMatches = 0; anchorParameters.numBranches = 1; anchorParameters.maxAnchorsPerPosition = 10000; } }; #endif blasr-8e668beae0dda1da6914586fb458182c6c3c7482/include/MappingSemaphores.h000066400000000000000000000022611260737656700250770ustar00rootroot00000000000000#ifndef ALIGNMENT_MAPPING_SEMAPHORE_H_ #define ALIGNMENT_MAPPING_SEMAPHORE_H_ #include #include #include #ifndef __APPLE__ class MappingSemaphores { public: sem_t reader; sem_t writer; sem_t unaligned; sem_t hitCluster; MappingSemaphores& operator=(MappingSemaphores &rhs) { return *this; } void InitializeAll() { sem_init(&reader, 0, 1); sem_init(&writer, 0, 1); sem_init(&unaligned, 0, 1); sem_init(&hitCluster, 0, 1); } }; #else class MappingSemaphores { public: sem_t *reader; sem_t *writer; sem_t *unaligned; sem_t *hitCluster; MappingSemaphores& operator=(MappingSemaphores &rhs) { return *this; } void InitializeAll() { reader = sem_open("/reader", O_CREAT, 0644, 1); writer = sem_open("/writer", O_CREAT, 0644, 1); unaligned = sem_open("/unaligned", O_CREAT, 0644, 1); hitCluster = sem_open("/hitCluster", O_CREAT, 0644, 1); } }; #endif #endif blasr-8e668beae0dda1da6914586fb458182c6c3c7482/include/ReadAlignments.hpp000066400000000000000000000152731260737656700247210ustar00rootroot00000000000000// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc. // // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted (subject to the limitations in the // disclaimer below) provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // // * Neither the name of Pacific Biosciences nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE // GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC // BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE // DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF // USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND // ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT // OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF // SUCH DAMAGE. // Author: Mark Chaisson #ifndef __BLASR_READ_ALIGNMENTS__ #define __BLASR_READ_ALIGNMENTS__ #include #include #include #include "SMRTSequence.hpp" #include "datastructures/alignment/AlignmentCandidate.hpp" using namespace std; class ReadAlignments { public: /* This class stores the alignments from a read. 
A read may be aligned in several different modes: 1. Fullread - Treat the read as a unit from start to end 2. Subread - Align each subread independently 3. CCSDeNovo - Only align the CCS sequence from a read 4. CCSAllPass - Align the de novo ccs sequences and then the subreads to where the denovo ccs aligned. 5. CCSFullPass - Same as allpass, except using only complete subreads. 6. ZmwSubreads - Align subreads of each zmw to where the longest subread of the zmw aligned to. The alignments are a raggad array of n sequences; n is 1 for cases 1 and 3, the number of subreads for cases 2 and 4, and the number of full length passes for case 5. A ReadAligments class must only have alignments for a single type of read in it. */ vector > subreadAlignments; vector subreads; AlignMode alignMode; SMRTSequence read; inline int GetNAlignedSeq(); inline bool AllSubreadsHaveAlignments(); inline void Clear(); inline void Resize(int nSeq); inline void CheckSeqIndex(int seqIndex); inline void SetSequence(int seqIndex, SMRTSequence &seq); inline void AddAlignmentForSeq(int seqIndex, T_AlignmentCandidate *alignmentPtr); inline void AddAlignmentsForSeq(int seqIndex, vector &seqAlignmentPtrs); // Copy all T_AlignmentCandidate objects (to which subreadAlignment[seqIndex] // is pointing) to newly created objects, and then return pointers to the new // objects. 
inline vector CopySubreadAlignments(int seqIndex); inline void Print(ostream &out=cout); inline ~ReadAlignments(); }; inline int ReadAlignments::GetNAlignedSeq() { return subreadAlignments.size(); } inline bool ReadAlignments::AllSubreadsHaveAlignments() { int i, nAlignedSeq; nAlignedSeq = subreadAlignments.size(); for (i = 0; i < nAlignedSeq; i++) { if (subreadAlignments[i].size() == 0) { return false; } } return true; } inline void ReadAlignments::Clear() { int i; int nAlignedSeq; for (i = 0, nAlignedSeq = subreadAlignments.size(); i < nAlignedSeq; i++) { int nAlignments; int a; for (a = 0, nAlignments = subreadAlignments[i].size(); a < nAlignments; a++) { delete subreadAlignments[i][a]; } subreadAlignments[i].clear(); } for (i = 0, nAlignedSeq = subreads.size(); i< nAlignedSeq; i++) { subreads[i].Free(); } subreadAlignments.clear(); read.Free(); } inline void ReadAlignments::Resize(int nSeq) { subreadAlignments.resize(nSeq); subreads.resize(nSeq); } inline void ReadAlignments::CheckSeqIndex(int seqIndex) { if ( seqIndex < 0 or seqIndex >= int(subreads.size()) ) { cout << "ERROR, adding a sequence to an unallocated position." 
<< endl; assert(0); } } inline void ReadAlignments::SetSequence(int seqIndex, SMRTSequence &seq) { CheckSeqIndex(seqIndex); subreads[seqIndex] = seq; } inline void ReadAlignments::AddAlignmentForSeq(int seqIndex, T_AlignmentCandidate *alignmentPtr) { CheckSeqIndex(seqIndex); subreadAlignments[seqIndex].push_back(alignmentPtr); } inline void ReadAlignments::AddAlignmentsForSeq(int seqIndex, vector &seqAlignmentPtrs) { CheckSeqIndex(seqIndex); subreadAlignments[seqIndex].insert(subreadAlignments[seqIndex].end(), seqAlignmentPtrs.begin(), seqAlignmentPtrs.end()); } inline vector ReadAlignments::CopySubreadAlignments(int seqIndex) { vector ret; for (int i=0; iPrint(out); } } out << " read: "; read.Print(out); out << endl << endl; } inline ReadAlignments::~ReadAlignments() { read.Free(); } #endif blasr-8e668beae0dda1da6914586fb458182c6c3c7482/include/RegisterBlasrOptions.h000066400000000000000000001200671260737656700256060ustar00rootroot00000000000000/* * ============================================================================ * * Filename: RegisterOptions.hpp * * Description: * * Version: 1.0 * Created: 04/29/2015 04:48:26 PM * Revision: none * Compiler: gcc * * Author: Yuan Li (yli), yli@pacificbiosciences.com * Company: Pacific Biosciences * * ============================================================================ */ #include "libconfig.h" #include "CommandLineParser.hpp" #include "MappingParameters.h" #include "RegisterFilterOptions.h" #include using namespace std; void RegisterBlasrOptions(CommandLineParser & clp, MappingParameters & params) { int trashbinInt; float trashbinFloat; bool trashbinBool; clp.RegisterStringOption("sa", ¶ms.suffixArrayFileName, ""); clp.RegisterStringOption("ctab", ¶ms.countTableName, "" ); clp.RegisterStringOption("regionTable", ¶ms.regionTableFileName, ""); clp.RegisterStringOption("ccsFofn", ¶ms.ccsFofnFileName, ""); clp.RegisterIntOption("bestn", (int*) ¶ms.nBest, "", CommandLineParser::PositiveInteger); 
clp.RegisterIntOption("limsAlign", ¶ms.limsAlign, "", CommandLineParser::PositiveInteger); clp.RegisterFlagOption("printOnlyBest", ¶ms.printOnlyBest, ""); clp.RegisterFlagOption("outputByThread", ¶ms.outputByThread, ""); clp.RegisterFlagOption("rbao", ¶ms.refineBetweenAnchorsOnly, ""); clp.RegisterFlagOption("allowAdjacentIndels", ¶ms.forPicard, ""); clp.RegisterFlagOption("onegap", ¶ms.separateGaps, ""); clp.RegisterFlagOption("allowAdjacentIndels", ¶ms.forPicard, ""); clp.RegisterFlagOption("placeRepeatsRandomly", ¶ms.placeRandomly, ""); clp.RegisterIntOption("randomSeed", ¶ms.randomSeed, "", CommandLineParser::Integer); clp.RegisterFlagOption("extend", ¶ms.extendAlignments, ""); clp.RegisterIntOption("branchExpand", ¶ms.anchorParameters.branchExpand, "", CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("maxExtendDropoff", ¶ms.maxExtendDropoff, "", CommandLineParser::NonNegativeInteger); clp.RegisterFlagOption("nucmer", ¶ms.emulateNucmer, ""); clp.RegisterIntOption("maxExpand", ¶ms.maxExpand, "", CommandLineParser::PositiveInteger); clp.RegisterIntOption("minExpand", ¶ms.minExpand, "", CommandLineParser::NonNegativeInteger); clp.RegisterStringOption("seqdb", ¶ms.seqDBName, ""); clp.RegisterStringOption("anchors", ¶ms.anchorFileName, ""); clp.RegisterStringOption("clusters", ¶ms.clusterFileName, ""); clp.RegisterFlagOption("samplePaths", (bool*) ¶ms.samplePaths, ""); clp.RegisterFlagOption("noStoreMapQV", ¶ms.storeMapQV, ""); clp.RegisterFlagOption("nowarp", (bool*) ¶ms.nowarp, ""); clp.RegisterFlagOption("noRefineAlign", (bool*) ¶ms.refineAlign, ""); clp.RegisterFlagOption("guidedAlign", (bool*)¶ms.useGuidedAlign, ""); clp.RegisterFlagOption("useGuidedAlign", (bool*)&trashbinBool, ""); clp.RegisterFlagOption("noUseGuidedAlign", (bool*)¶ms.useGuidedAlign, ""); clp.RegisterFlagOption("header", (bool*)¶ms.printHeader, ""); clp.RegisterIntOption("bandSize", ¶ms.bandSize, "", CommandLineParser::PositiveInteger); clp.RegisterIntOption("extendBandSize", 
¶ms.extendBandSize, "", CommandLineParser::PositiveInteger); clp.RegisterIntOption("guidedAlignBandSize", ¶ms.guidedAlignBandSize, "", CommandLineParser::PositiveInteger); clp.RegisterIntOption("maxAnchorsPerPosition", ¶ms.anchorParameters.maxAnchorsPerPosition, "", CommandLineParser::PositiveInteger); clp.RegisterIntOption("stopMappingOnceUnique", (int*) ¶ms.anchorParameters.stopMappingOnceUnique, "", CommandLineParser::NonNegativeInteger); clp.RegisterStringOption("out", ¶ms.outFileName, ""); clp.RegisterIntOption("match", ¶ms.match, "", CommandLineParser::Integer); clp.RegisterIntOption("mismatch", ¶ms.mismatch, "", CommandLineParser::Integer); clp.RegisterIntOption("minMatch", ¶ms.minMatchLength, "", CommandLineParser::PositiveInteger); clp.RegisterIntOption("maxMatch", ¶ms.anchorParameters.maxLCPLength, "", CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("maxLCPLength", ¶ms.anchorParameters.maxLCPLength, "", CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("indel", ¶ms.indel, "", CommandLineParser::Integer); clp.RegisterIntOption("insertion", ¶ms.insertion, "", CommandLineParser::Integer); clp.RegisterIntOption("deletion", ¶ms.deletion, "", CommandLineParser::Integer); clp.RegisterIntOption("idsIndel", ¶ms.idsIndel, "", CommandLineParser::Integer); clp.RegisterIntOption("sdpindel", ¶ms.sdpIndel, "", CommandLineParser::Integer); clp.RegisterIntOption("sdpIns", ¶ms.sdpIns, "", CommandLineParser::Integer); clp.RegisterIntOption("sdpDel", ¶ms.sdpDel, "", CommandLineParser::Integer); clp.RegisterFloatOption("indelRate", ¶ms.indelRate, "", CommandLineParser::NonNegativeFloat); clp.RegisterFloatOption("minRatio", ¶ms.minRatio, "", CommandLineParser::NonNegativeFloat); clp.RegisterFloatOption("sdpbypass", ¶ms.sdpBypassThreshold, "", CommandLineParser::NonNegativeFloat); clp.RegisterFloatOption("minFrac", &trashbinFloat, "", CommandLineParser::NonNegativeFloat); clp.RegisterIntOption("maxScore", ¶ms.maxScore, "", CommandLineParser::Integer); 
clp.RegisterStringOption("bwt", ¶ms.bwtFileName, ""); clp.RegisterIntOption("m", ¶ms.printFormat, "", CommandLineParser::NonNegativeInteger); clp.RegisterFlagOption("sam", ¶ms.printSAM, ""); #ifdef USE_PBBAM clp.RegisterFlagOption("bam", ¶ms.printBAM, ""); #endif clp.RegisterStringOption("clipping", ¶ms.clippingString, ""); clp.RegisterIntOption("sdpTupleSize", ¶ms.sdpTupleSize, "", CommandLineParser::PositiveInteger); clp.RegisterIntOption("pvaltype", ¶ms.pValueType, "", CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("start", ¶ms.startRead, "", CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("stride", ¶ms.stride, "", CommandLineParser::NonNegativeInteger); clp.RegisterFloatOption("subsample", ¶ms.subsample, "", CommandLineParser::PositiveFloat); clp.RegisterIntOption("nproc", ¶ms.nProc, "", CommandLineParser::PositiveInteger); clp.RegisterFlagOption("sortRefinedAlignments",(bool*) ¶ms.sortRefinedAlignments, ""); clp.RegisterIntOption("quallc", ¶ms.qualityLowerCaseThreshold, "", CommandLineParser::Integer); clp.RegisterFlagOption("v", (bool*) ¶ms.verbosity, ""); clp.RegisterIntOption("V", ¶ms.verbosity, "Specify a level of verbosity.", CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("contextAlignLength", ¶ms.anchorParameters.contextAlignLength, "", CommandLineParser::PositiveInteger); clp.RegisterFlagOption("skipLookupTable", ¶ms.anchorParameters.useLookupTable, ""); clp.RegisterStringOption("metrics", ¶ms.metricsFileName, ""); clp.RegisterStringOption("lcpBounds", ¶ms.lcpBoundsFileName, ""); clp.RegisterStringOption("fullMetrics", ¶ms.fullMetricsFileName, ""); clp.RegisterIntOption("nbranch", ¶ms.anchorParameters.numBranches, "", CommandLineParser::NonNegativeInteger); clp.RegisterFlagOption("divideByAdapter", ¶ms.byAdapter, ""); clp.RegisterFlagOption("useQuality", ¶ms.ignoreQualities, ""); clp.RegisterFlagOption("noFrontAlign", ¶ms.extendFrontAlignment, ""); clp.RegisterIntOption("minReadLength", ¶ms.minReadLength, "", 
CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("maxReadLength", ¶ms.maxReadLength, "", CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("minSubreadLength", ¶ms.minSubreadLength, "", CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("minRawSubreadScore", ¶ms.minRawSubreadScore, "", CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("minAvgQual", ¶ms.minAvgQual, "", CommandLineParser::Integer); clp.RegisterFlagOption("advanceHalf", ¶ms.advanceHalf, ""); clp.RegisterIntOption("advanceExactMatches", ¶ms.anchorParameters.advanceExactMatches, "", CommandLineParser::NonNegativeInteger); clp.RegisterFlagOption("useccs", ¶ms.useCcs, ""); clp.RegisterFlagOption("useccsdenovo", ¶ms.useCcsOnly, ""); clp.RegisterFlagOption("useccsall", ¶ms.useAllSubreadsInCcs, ""); clp.RegisterFlagOption("extendDenovoCCSSubreads", ¶ms.extendDenovoCCSSubreads, ""); clp.RegisterFlagOption("noRefineAlignments", ¶ms.refineAlignments, ""); clp.RegisterIntOption("nCandidates", ¶ms.nCandidates, "", CommandLineParser::NonNegativeInteger); clp.RegisterFlagOption("useTemp", (bool*) ¶ms.tempDirectory, ""); clp.RegisterFlagOption("noSplitSubreads", ¶ms.mapSubreadsSeparately, ""); clp.RegisterFlagOption("concordant", ¶ms.concordant, ""); // When -concordant is turned on, blasr first selects a subread (e.g., the median length full-pass subread) // of a zmw as template, maps the template subread to a reference, then infers directions of all other subreads // of the same zmw based on direction of the template, and finally maps all other subreads to the same // genomic coordinates as the template. When -concordantAlignBothDirections is turned on, blasr will align // all other subreads both forwardly and backwardly, without infering their directions. This is a hidden // diagnostic option only useful for analyzing movies which have lots of un-identified or missed adapters such // that directions of subreads can not be inferred accurately. 
clp.RegisterFlagOption("concordantAlignBothDirections", ¶ms.concordantAlignBothDirections, ""); clp.RegisterIntOption("flankSize", ¶ms.flankSize, "", CommandLineParser::NonNegativeInteger); clp.RegisterStringOption("titleTable", ¶ms.titleTableName, ""); clp.RegisterFlagOption("useSensitiveSearch", ¶ms.doSensitiveSearch, ""); clp.RegisterFlagOption("ignoreRegions", ¶ms.useRegionTable, ""); clp.RegisterFlagOption("ignoreHQRegions", ¶ms.useHQRegionTable, ""); clp.RegisterFlagOption("computeAlignProbability", ¶ms.computeAlignProbability, ""); clp.RegisterStringOption("unaligned", ¶ms.unalignedFileName, ""); clp.RegisterFlagOption("global", ¶ms.doGlobalAlignment, ""); clp.RegisterIntOption("globalChainType", ¶ms.globalChainType, "", CommandLineParser::NonNegativeInteger); clp.RegisterFlagOption("noPrintSubreadTitle", (bool*) ¶ms.printSubreadTitle, ""); clp.RegisterIntOption("saLookupTableLength", ¶ms.lookupTableLength, "", CommandLineParser::PositiveInteger); clp.RegisterFlagOption("useDetailedSDP", ¶ms.detailedSDPAlignment, ""); clp.RegisterFlagOption("nouseDetailedSDP", &trashbinBool, ""); clp.RegisterIntOption("sdpFilterType", ¶ms.sdpFilterType, "", CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("scoreType", ¶ms.scoreType, "", CommandLineParser::NonNegativeInteger); clp.RegisterFlagOption("h", ¶ms.printVerboseHelp, ""); clp.RegisterFlagOption("help", ¶ms.printDiscussion, ""); clp.RegisterFloatOption("accuracyPrior", ¶ms.readAccuracyPrior, "", CommandLineParser::NonNegativeFloat); // holeNumberRangesStr is a string of comma-delimited hole number ranges, such as '1,2,3,10-15'. // Blasr only analyzes reads whose hole numbers are in the specified hole number ranges. 
clp.RegisterStringOption("holeNumbers", ¶ms.holeNumberRangesStr, ""); clp.RegisterIntOption("substitutionPrior", ¶ms.substitutionPrior, "", CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("deletionPrior", ¶ms.globalDeletionPrior, "", CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("recurseOver", ¶ms.recurseOver, "", CommandLineParser::NonNegativeInteger); clp.RegisterStringOption("scoreMatrix", ¶ms.scoreMatrixString, ""); clp.RegisterFlagOption("printDotPlots", ¶ms.printDotPlots, ""); clp.RegisterFlagOption("preserveReadTitle", ¶ms.preserveReadTitle,""); clp.RegisterFlagOption("forwardOnly", ¶ms.forwardOnly,""); clp.RegisterFlagOption("affineAlign", ¶ms.affineAlign, ""); clp.RegisterIntOption("affineOpen", ¶ms.affineOpen, "", CommandLineParser::NonNegativeInteger); clp.RegisterIntOption("affineExtend", ¶ms.affineExtend, "", CommandLineParser::NonNegativeInteger); clp.RegisterFlagOption("scaleMapQVByNClusters", ¶ms.scaleMapQVByNumSignificantClusters, "", false); clp.RegisterFlagOption("printSAMQV", ¶ms.printSAMQV, "", false); clp.RegisterFlagOption("cigarUseSeqMatch", ¶ms.cigarUseSeqMatch, ""); clp.RegisterStringListOption("samQV", ¶ms.samQV, ""); clp.RegisterFlagOption("fastMaxInterval", ¶ms.fastMaxInterval, "", false); clp.RegisterFlagOption("aggressiveIntervalCut", ¶ms.aggressiveIntervalCut, "", false); clp.RegisterFlagOption("fastSDP", ¶ms.fastSDP, "", false); clp.RegisterStringOption("concordantTemplate", ¶ms.concordantTemplate, "typicalsubread"); RegisterFilterOptions(clp, params.minAlnLength, params.minPctSimilarity, params.minPctAccuracy, params.hitPolicyStr, trashbinBool=true, trashbinInt, params.maxScore); } const string BlasrHelp(MappingParameters & params) { stringstream helpStream; helpStream << " Options for blasr " << endl << " Basic usage: 'blasr reads.{bam|fasta|bax.h5|fofn} genome.fasta [-options] " << endl << " option\tDescription (default_value)." << endl << endl << " Input Files." 
<< endl << " reads.bam is a PacBio BAM file of reads." << endl << " This is the preferred input to blasr because rich quality" << endl << " value (insertion,deletion, and substitution quality values) information is " << endl << " maintained. The extra quality information improves variant detection and mapping"< 3." << endl << " -maxMatch l (inf)" << endl << " Stop mapping a read to the genome when the lcp length reaches l. " << endl << " This is useful when the query is part of the reference, for example when " < using namespace std; /// Register options for filtering alignments. void RegisterFilterOptions(CommandLineParser & clp, int & minAlnLength, float & minPctSimilarity, float & minPctAccuracy, string & hitPolicyStr, bool & useScoreCutoff, int & scoreSignInt, int & scoreCutoff) { ScoreSign ss = static_cast(scoreSignInt); Score sc(static_cast(scoreCutoff), ss); FilterCriteria fc(static_cast(minAlnLength), minPctSimilarity, minPctAccuracy, useScoreCutoff, sc); HitPolicy hp("randombest", ScoreSign::NEGATIVE); clp.RegisterIntOption("minAlnLength", &minAlnLength, fc.MinAlnLengthHelp(), CommandLineParser::PositiveInteger); clp.RegisterIntOption("minAlignLength", &minAlnLength, "Alias of -minAlnLength", CommandLineParser::PositiveInteger); clp.RegisterIntOption("minLength", &minAlnLength, "Alias of -minAlnLength", CommandLineParser::PositiveInteger); clp.RegisterFloatOption("minPctSimilarity", &minPctSimilarity, fc.MinPctSimilarityHelp(), CommandLineParser::PositiveFloat); clp.RegisterFloatOption("minPctIdentity", &minPctSimilarity, "Alias of -minPctSimilarity", CommandLineParser::PositiveFloat); clp.RegisterFloatOption("minPctAccuracy", &minPctAccuracy, fc.MinPctAccuracyHelp(), CommandLineParser::PositiveFloat); clp.RegisterFloatOption("minAccuracy", &minPctAccuracy, "Alias of -minPctAccuracy", CommandLineParser::PositiveFloat); clp.RegisterStringOption("hitPolicy", &hitPolicyStr, hp.Help()); clp.RegisterIntOption("scoreSign", &scoreSignInt, fc.ScoreSignHelp(), 
CommandLineParser::Integer); clp.RegisterIntOption("scoreCutoff", &scoreCutoff, fc.ScoreCutoffHelp(), CommandLineParser::Integer); } blasr-8e668beae0dda1da6914586fb458182c6c3c7482/libcpp/000077500000000000000000000000001260737656700211315ustar00rootroot00000000000000blasr-8e668beae0dda1da6914586fb458182c6c3c7482/makefile000066400000000000000000000061311260737656700213610ustar00rootroot00000000000000all: SRCDIR:=$(dir $(realpath $(firstword $(MAKEFILE_LIST)))) -include ${CURDIR}/defines.mk -include ${SRCDIR}/rules.mk foo: echo $(realpath $(firstword $(MAKEFILE_LIST))) echo $(firstword $(MAKEFILE_LIST)) echo $(MAKEFILE_LIST) echo ${SRCDIR} CXXFLAGS += -O3 -g CXXOPTS += \ -std=c++0x -pedantic \ -Wall -Wuninitialized -Wno-div-by-zero \ -MMD -MP -w -fpermissive GCXXFLAGS := -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free -fno-omit-frame-pointer CXXFLAGS += ${CXXOPTS} ${GCXXFLAGS} #INC_DIRS:=${LIBBLASR_INC} ${LIBPBIHDF_INC} ${LIBPBDATA_INC} ${PBBAM_INC} ${HTSLIB_INC} ${HDF5_INC} ${ZLIB_INC} #LIB_DIRS:=${LIBBLASR_LIB} ${LIBPBIHDF_LIB} ${LIBPBDATA_LIB} ${PBBAM_LIB} ${HTSLIB_LIB} ${HDF5_LIB} ${ZLIB_LIB} #LDLIBS := \ # ${LIBBLASR_LIBFLAGS} ${LIBPBIHDF_LIBFLAGS} ${LIBPBDATA_LIBFLAGS} \ # ${PBBAM_LIBFLAGS} ${HTSLIB_LIBFLAGS} ${HDF5_LIBFLAGS} ${ZLIB_LIBFLAGS} \ # -ldl -lpthread # HDF5 needs -ldl, but mobs does not pass it in. CPPFLAGS:=-I${SRCDIR}/include ${CPPFLAGS} SRCS := Blasr.cpp OBJS := ${SRCS:.cpp=.o} DEPS := ${SRCS:.cpp=.d} LD_LIBRARY_PATH:=${HDF5_LIB}:${LIBBLASR_LIB}:${LIBPBIHDF_LIB}:${LIBPBDATA_LIB}:${LD_LIBRARY_PATH} export LD_LIBRARY_PATH # Note: On macosx, this would be DYLD_LIBRARY_PATH. vpath %.cpp ${SRCDIR} init-submodule: ${MAKE} update-submodule ${MAKE} build-submodule update-submodule: git submodule update --init build-submodule: # DON'T use pbbam which is not on github. 
cd libcpp && NOPBBAM=true HDF5_LIB=${HDF5_LIB} HDF5_INC=${HDF5_INC} ./configure.py ${MAKE} -C libcpp submodule-clean: ${RM} -r libcpp # The rules above must be run separately. all: blasr makeutils #all: makeextrautils #This would require pbbam. blasr: ${OBJS} ${CXX} -o $@ ${CXXFLAGS} ${CPPFLAGS} -MF"${@:%=%.d}" ${OBJS} ${LDFLAGS} ${LDLIBS} @echo LD_LIBRARY_PATH=${LD_LIBRARY_PATH} makeutils: ${MAKE} -C utils makeextrautils: ${MAKE} -C extrautils CTESTS := \ ctest/affineAlign.t ctest/bamOut.t ctest/ccsH5.t ctest/filtercriteria.t ctest/m0-5.t ctest/samNM.t \ ctest/aggressiveIntervalCut.t ctest/bug25328.t ctest/concordant.t ctest/fofn.t ctest/multipart.t ctest/useccsallBestN1.t \ ctest/alignScore.t ctest/bug25741.t ctest/ecoli.t ctest/hitpolicy.t ctest/noSplitSubreads.t ctest/useccsallLargeGenome.t\ ctest/bamIn.t ctest/bug25766.t ctest/fastMaxInterval.t ctest/holeNumbers.t ctest/open_fail.t ctest/verbose.t SLOW_CTESTS := ctest/bug25328.t ctest/useccsallLargeGenome.t cramtests: blasr utils cram -v --shell=/bin/bash ${CTESTS} ${MAKE} -C utils cramtests cramfast: blasr utils cram -v --shell=/bin/bash $(filter-out ${SLOW_CTESTS},${CTESTS}) ${MAKE} -C utils cramfast gtest: blasr # This requires the submodule to be configured with gtest. 
${MAKE} -C libcpp gtest check: gtest cramtests cleanall: cleanlib clean # cleanlib is only for submodule users cleanlib: libcpp/defines.mk ${MAKE} -C libcpp clean clean: ${RM} blasr ${OBJS} ${DEPS} blasr.d ${MAKE} -C utils clean ${MAKE} -C extrautils clean -include ${DEPS} blasr-8e668beae0dda1da6914586fb458182c6c3c7482/rules.mk000066400000000000000000000013471260737656700213500ustar00rootroot00000000000000INCDIRS := \ ${BLASR_INC} \ ${LIBBLASR_INC} \ ${LIBPBDATA_INC} \ ${LIBPBIHDF_INC} \ ${PBBAM_INC} \ ${HDF5_INC} \ ${HTSLIB_INC} \ ${BOOST_INC} LIBDIRS := \ ${LIBBLASR_LIB} \ ${LIBPBDATA_LIB} \ ${LIBPBIHDF_LIB} \ ${PBBAM_LIB} \ ${HDF5_LIB} \ ${HTSLIB_LIB} \ ${GCC_LIB} \ ${SZLIB_LIB} \ ${ZLIB_LIB} LDLIBS+= \ ${LIBPBIHDF_LIBFLAGS} \ ${LIBBLASR_LIBFLAGS} \ ${LIBPBDATA_LIBFLAGS} \ ${LIBPBIHDF_LIBFLAGS} \ ${PBBAM_LIBFLAGS} \ ${HDF5_LIBFLAGS} \ ${HTSLIB_LIBFLAGS} \ ${SZLIB_LIBFLAGS} \ ${ZLIB_LIBFLAGS} \ ${RT_LIBFLAGS} \ ${PTHREAD_LIBFLAGS} \ ${DL_LIBFLAGS} # We repeat LIBPBIHDF_LIBFLAGS because of a circular dependency. See #77. CPPFLAGS+=$(patsubst %,-I%,${INCDIRS}) LDFLAGS+=$(patsubst %,-L%,${LIBDIRS}) blasr-8e668beae0dda1da6914586fb458182c6c3c7482/travis.sh000077500000000000000000000005161260737656700215310ustar00rootroot00000000000000#!/usr/bin/env bash # This will not work within Travis until have have pre-compiled HDF5 # (or least headers?). But it shows the steps. set -ex # There is a bug without --shared. Working on it. See #77. 
./configure.py --shared --sub --no-pbbam HDF5_INC=${HDF5_INC} HDF5_LIB=${HDF5_LIB} make -j4 init-submodule make --debug=b -j4 all blasr-8e668beae0dda1da6914586fb458182c6c3c7482/utils/000077500000000000000000000000001260737656700210205ustar00rootroot00000000000000blasr-8e668beae0dda1da6914586fb458182c6c3c7482/utils/.gitignore000066400000000000000000000001211260737656700230020ustar00rootroot00000000000000/loadPulses /pls2fasta /samFilter /samtoh5 /samtom4 /sawriter /sdpMatcher /toAfg blasr-8e668beae0dda1da6914586fb458182c6c3c7482/utils/LoadPulses.cpp000066400000000000000000003504021260737656700236030ustar00rootroot00000000000000#define __FAST_MATH__ #include "HDFCmpFile.hpp" #include "HDFBasReader.hpp" #include "HDFPlsReader.hpp" #include "HDFCCSReader.hpp" #include "datastructures/alignment/CmpFile.hpp" #include "alignment/CmpAlignment.hpp" #include "datastructures/alignment/ByteAlignment.h" #include "datastructures/alignment/AlignmentMap.hpp" #include "reads/BaseFile.hpp" #include "reads/PulseFile.hpp" #include "reads/ReadType.hpp" #include "loadpulses/MetricField.hpp" #include "loadpulses/MovieAlnIndexLookupTable.hpp" #include "utils/FileOfFileNames.hpp" #include "utils/TimeUtils.hpp" #include "files/BaseSequenceIO.hpp" #include "CommandLineParser.hpp" #include #include #include #include #include #include #include using namespace std; typedef map MovieNameToArrayIndex; typedef map MetricOptionsMap; typedef map > RequirementMap; char VERSION[] = "v1.1.0"; char PERFORCE_VERSION_STRING[] = "$Change: 126407 $"; // define default values for metrics const float NaN = 0.0/0.0; const UChar missingQualityValue = 255; const unsigned char maxQualityValue = 100; const HalfWord missingFrameRateValue = USHRT_MAX; const unsigned int missingPulseIndex = UINT_MAX; void CapQualityValue(QualityValueVector &vect, DNALength length, unsigned char maxQualityValue=100) { unsigned int i; if (vect.data == NULL) { return; } for (i = 0; i < length; i++) { vect.data[i] = min(vect.data[i], 
maxQualityValue); } } void CapQualityValues(SMRTSequence &seq, unsigned char maxQualityValue = 100) { CapQualityValue(seq.qual, seq.length, maxQualityValue); CapQualityValue(seq.deletionQV, seq.length, maxQualityValue); CapQualityValue(seq.preBaseDeletionQV, seq.length, maxQualityValue); CapQualityValue(seq.insertionQV, seq.length, maxQualityValue); CapQualityValue(seq.substitutionQV, seq.length, maxQualityValue); CapQualityValue(seq.mergeQV, seq.length, maxQualityValue); } int CheckCmpFileFormat(CmpFile &cmpFile) { if (cmpFile.readType != ReadType::Standard) { cout << "ERROR! Reading pulse information into a cmp.h5 file generated from circular " << endl << "consensus called sequences is not supported." << endl; exit(1); } return 1; } void BuildRequirementMap(RequirementMap &fieldRequirements) { fieldRequirements["StartTimeOffset"].push_back("StartFrame"); fieldRequirements["StartTimeOffset"].push_back("NumEvent"); fieldRequirements["StartFrame"].push_back("PreBaseFrames"); fieldRequirements["StartFrame"].push_back("WidthInFrames"); fieldRequirements["PulseWidth"].push_back("WidthInFrames"); fieldRequirements["pkmid"].push_back("MidSignal"); fieldRequirements["pkmid"].push_back("NumEvent"); fieldRequirements["IPD"].push_back("StartFrame"); fieldRequirements["IPD"].push_back("NumEvent"); fieldRequirements["IPD"].push_back("PreBaseFrames"); fieldRequirements["IPD"].push_back("WidthInFrames"); fieldRequirements["Light"].push_back("MeanSignal"); fieldRequirements["Light"].push_back("NumEvent"); fieldRequirements["Light"].push_back("WidthInFrames"); // Build requirementMap for sneaky metrics fieldRequirements["StartFrameBase"].push_back("PreBaseFrames"); fieldRequirements["StartFrameBase"].push_back("WidthInFrames"); fieldRequirements["StartFramePulse"].push_back("PreBaseFrames"); fieldRequirements["StartFramePulse"].push_back("WidthInFrames"); } void ExclusivelyAdd(const char *value, vector &vect) { if (find(vect.begin(), vect.end(), value) == vect.end()) { 
vect.push_back(value); } } bool AnyFieldRequiresFrameRate(vector &fields) { int i; for (i = 0; i < fields.size(); i++ ) { if (fields[i] == "PulseWidth" or fields[i] == "IPD" or fields[i] == "Light" or fields[i] == "StartTimeOffset" or fields[i] == "StartFrame" or fields[i] == "PulseWidth" or fields[i] == "PreBaseFrames" or fields[i] == "WidthInFrames") { return true; } } return false; } template void Free(T* &buf) { if (buf != NULL){ delete[] buf; } buf = NULL; } // Return all eighteen metrics that can be loaded. // StartTimeOffset QualityValue InsertionQV MergeQV // DeletionQV DeletionTag PulseIndex SubstitutionTag // SubstitutionQV ClassifierQV StartFrame PulseWidth // PreBaseFrames WidthInFrames pkmid IPD // Light WhenStarted vector GetAllSupportedMetrics(bool isSneakyMetricsIncluded = true) { // The order of metrics matters. With -bymetric option, all fields // which are required for computing a metric are cached before WriteMetric() // and cleared afterwards. If two neighboring metrics share a subset of // required fields, then the cached fields can be re-used. Arrange metrics // in an order that maximizes reuse of cached fields. vector supportedMetrics; supportedMetrics.push_back("WhenStarted"); supportedMetrics.push_back("QualityValue"); supportedMetrics.push_back("InsertionQV"); supportedMetrics.push_back("MergeQV"); supportedMetrics.push_back("DeletionQV"); supportedMetrics.push_back("DeletionTag"); supportedMetrics.push_back("SubstitutionTag"); supportedMetrics.push_back("SubstitutionQV"); supportedMetrics.push_back("PreBaseFrames"); // Sneaky metrics for internal use Only if (isSneakyMetricsIncluded) { supportedMetrics.push_back("StartFrameBase"); } supportedMetrics.push_back("IPD"); supportedMetrics.push_back("StartFrame"); if (isSneakyMetricsIncluded) { supportedMetrics.push_back("StartFramePulse"); } // Disable metric StartTimeOffset for now. 
// StartTimeOffset is placed at the same level as AlnArray, However, the // size of StartTimeOffset is far less than AlnArray, while cmp.h5 spec // requires all datasets at that level to have the same size. // supportedMetrics.push_back("StartTimeOffset"); supportedMetrics.push_back("PulseWidth"); supportedMetrics.push_back("WidthInFrames"); supportedMetrics.push_back("Light"); supportedMetrics.push_back("pkmid"); supportedMetrics.push_back("ClassifierQV"); supportedMetrics.push_back("PulseIndex"); return supportedMetrics; } // Return metrics to load by default. vector GetDefaultMetrics() { vector defaultMetrics; defaultMetrics.push_back("QualityValue"); defaultMetrics.push_back("ClassifierQV"); defaultMetrics.push_back("StartFrame"); defaultMetrics.push_back("PulseWidth"); defaultMetrics.push_back("WidthInFrames"); defaultMetrics.push_back("pkmid"); defaultMetrics.push_back("IPD"); return defaultMetrics; } // Return metrics that can be computed from PulseCalls. vector GetPulseMetrics() { vector pulseMetrics; pulseMetrics.push_back("StartFrame"); pulseMetrics.push_back("StartTimeOffset"); pulseMetrics.push_back("ClassifierQV"); pulseMetrics.push_back("PulseWidth"); pulseMetrics.push_back("WidthInFrames"); pulseMetrics.push_back("IPD"); pulseMetrics.push_back("pkmid"); pulseMetrics.push_back("Light"); pulseMetrics.push_back("StartFramePulse"); return pulseMetrics; } // Return true if this metric can be computed from PulseCalls. bool IsPulseMetric(const string & metric) { vector pulseMetrics = GetPulseMetrics(); for (int i = 0; i < pulseMetrics.size(); i++) { if (pulseMetrics[i] == metric) return true; } return false; } // Return all metrics that are // (1) supported, // (2) requested to load, and // (3) computable with all required fields available // in either bas.h5 or pls.h5. vector GetMetricsToLoad(map & metricOptions) { vector metricsToLoad; // Get all supported metrics. 
vector supportedMetrics = GetAllSupportedMetrics(); map::iterator metricIt; for (int i = 0; i < supportedMetrics.size(); i++) { string metric = supportedMetrics[i]; metricIt = metricOptions.find(metric); if (metricIt!=metricOptions.end() and metricIt->second) { // Get metrics that are required and computable metricsToLoad.push_back(metricIt->first); } } return metricsToLoad; } void StoreDatasetFieldsFromPulseFields(MetricOptionsMap &fieldSet, RequirementMap &fieldRequirements, vector &datasetFields) { int f; int d; MetricOptionsMap::iterator optionsIt; for (optionsIt = fieldSet.begin(); optionsIt != fieldSet.end(); ++optionsIt) { if (optionsIt->second == true) { if (fieldRequirements.find(optionsIt->first) == fieldRequirements.end()) { ExclusivelyAdd(optionsIt->first.c_str(), datasetFields); } else { for (d = 0; d < fieldRequirements[optionsIt->first].size(); d++) { ExclusivelyAdd(fieldRequirements[optionsIt->first][d].c_str(), datasetFields ); } } } } } void ParseMetricsList(string metricListString, MetricOptionsMap &metricOptions) { vector metrics; Splice(metricListString, ",", metrics); int m; for (m = 0; m < metrics.size(); m++) { if (metricOptions.find(metrics[m]) != metricOptions.end()) { metricOptions[metrics[m]] = true; } else { cout << "ERROR! Metric " << metrics[m] << " is not supported." << endl; exit(1); } } } // Set default metric options to true void SetDefaultMetricOptions(map & metricOptions) { vector defaultMetrics = GetDefaultMetrics(); for (int i = 0; i < defaultMetrics.size(); i++) { metricOptions[defaultMetrics[i]] = true; } } // Initialize all supported metric options and set all to false void CreateMetricOptions(map &metricOptions) { vector supportedMetrics = GetAllSupportedMetrics(); for (int i = 0; i < supportedMetrics.size(); i++) { metricOptions[supportedMetrics[i]] = false; } } // Check whether all fields are available or not. 
bool AreAllFieldsAvailable( vector & requiredFields, HDFBasReader & hdfBasReader, HDFPlsReader & hdfPlsReader, const bool & useBaseFile, const bool & usePulseFile) { bool allAvailable = true; for (int i = 0; i < requiredFields.size(); i++) { Field field = requiredFields[i]; if (field.type == BasField) { if (!useBaseFile or !hdfBasReader.FieldIsIncluded(field.name) or !hdfBasReader.includedFields[field.name]) { allAvailable = false; break; } } else if (field.type == PlsField) { if (!usePulseFile or !hdfPlsReader.FieldIsIncluded(field.name) or !hdfPlsReader.includedFields[field.name]) { allAvailable = false; break; } } } return allAvailable; } // // Check whether a metric is computable or not. // fieldsToBeUsed = all fields that will be used for computing a metric. // If a metric can be computed from both bas and pls files (e.g. // StartFrame, IPD, PulseWidth, WidthInFrame), only compute it from pls. // bool CanThisMetricBeComputed ( const string & metricName, HDFBasReader & hdfBasReader, HDFPlsReader & hdfPlsReader, const bool & useBaseFile, const bool & usePulseFile, vector & fieldsToBeUsed) { fieldsToBeUsed.clear(); FieldsRequirement fieldsRequirement = FieldsRequirement(metricName); bool metricMayBeComputedFromPls = true; if (fieldsRequirement.fieldsUsePlsFile.size() != 0 && usePulseFile) { metricMayBeComputedFromPls = AreAllFieldsAvailable( fieldsRequirement.fieldsUsePlsFile, hdfBasReader, hdfPlsReader, useBaseFile, usePulseFile); } else { metricMayBeComputedFromPls = false; } bool metricMayBeComputedFromBas = true; if (fieldsRequirement.fieldsUseBasFile.size() != 0 && useBaseFile) { metricMayBeComputedFromBas = AreAllFieldsAvailable( fieldsRequirement.fieldsUseBasFile, hdfBasReader, hdfPlsReader, useBaseFile, usePulseFile); } else { metricMayBeComputedFromBas = false; } bool metricMayBeComputed = true; if (!metricMayBeComputedFromBas and !metricMayBeComputedFromPls) { metricMayBeComputed = false; } // Compute from pls if possible if (metricMayBeComputedFromPls) 
{ fieldsToBeUsed = fieldsRequirement.fieldsUsePlsFile; } else if (metricMayBeComputedFromBas) { fieldsToBeUsed = fieldsRequirement.fieldsUseBasFile; } if (metricName == "StartTimeOffset") { metricMayBeComputed = false; // Disable StartTimeOffset for now. } if (metricName == "WhenStarted") { // WhenStarted requires no fields from neither bas nor pls. metricMayBeComputed = true; } return metricMayBeComputed; } // // Check whether metrics are computable or not. If a metric is not // computable, disable it with a warning or exit with an error. // void CanMetricsBeComputed( MetricOptionsMap & metricOptions, HDFBasReader & hdfBasReader, HDFPlsReader & hdfPlsReader, const bool & useBaseFile, const bool & usePulseFile, const bool & failOnMissingData, const string & movieName) { map::iterator metricIt; for (metricIt = metricOptions.begin(); metricIt != metricOptions.end(); ++metricIt) { string metricName = metricIt->first; if (metricName == "") { metricIt->second == false; } if (metricIt->second == false) { continue; } vector fieldsToBeUsed; bool metricMayBeComputed = CanThisMetricBeComputed(metricName, hdfBasReader, hdfPlsReader, useBaseFile, usePulseFile, fieldsToBeUsed); if (metricMayBeComputed == false) { if (failOnMissingData) { cout << "ERROR"; } else { cout << "WARNING"; } cout << ": There is insufficient data to compute metric: " << metricName << " in the file " << movieName << " "; cout << " It will be ignored." << endl; if (failOnMissingData) { exit(1); } metricOptions[metricName] = false; } } } // Return size of a single field in KB. 
UInt ComputeRequiredMemoryForThisField( Field & thisField, HDFBasReader & hdfBasReader, HDFPlsReader & hdfPlsReader, const bool & useBaseFile, const bool & usePulseFile) { UInt memory = 0; if (thisField.type == BasField) { assert(useBaseFile); return hdfBasReader.GetFieldSize(thisField.name); } if (thisField.type == PlsField) { assert(usePulseFile); return hdfPlsReader.GetFieldSize(thisField.name); } assert(false); } // // Return estimated memory peak (in KB) for buffering all data using -bymetric. // UInt ComputeRequiredMemory( vector & metricsToLoad, HDFBasReader & hdfBasReader, HDFPlsReader & hdfPlsReader, const bool & useBaseFile, const bool & usePulseFile, HDFCmpFile & cmpReader, UInt & totalAlnLength) { UInt maxMemory = 0; for (int i = 0; i < metricsToLoad.size(); i++) { UInt memoryForThisMetric = 0; vector fieldsToBeUsed; bool canBeComputed = CanThisMetricBeComputed( metricsToLoad[i], hdfBasReader, hdfPlsReader, useBaseFile, usePulseFile, fieldsToBeUsed); for (int j = 0; j < fieldsToBeUsed.size(); j++) { UInt memoryForThisField = ComputeRequiredMemoryForThisField( fieldsToBeUsed[j], hdfBasReader, hdfPlsReader, useBaseFile, usePulseFile); memoryForThisMetric += memoryForThisField; } maxMemory = max(maxMemory, memoryForThisMetric); } // // AlnIndex will be buffered. Some other datastructures also need // to be buffered for quick look up. Approximately double the size. // UInt totalAlnIndexMem = 2 * cmpReader.alnInfoGroup.GetAlnIndexSize(); // // AlnArray and metrics to load needs to be buffered in KB. // UInt totalAlnArrayMem = totalAlnLength / 1024 * (sizeof(unsigned int) + sizeof(unsigned char)); // // It's diffcult to estimate how much memory will be used by hdf5. // Assume memory consumed by hdf5 scales with AlnIndex and AlnArray datasets. 
// UInt hdf5Mem = totalAlnIndexMem / 2 + totalAlnLength / 1024 * sizeof(unsigned int); maxMemory += totalAlnIndexMem + totalAlnArrayMem + hdf5Mem; //cout << "The estimated peak memory for buffering fields is " // << maxMemory << " KB." << endl; //cout << "The estimated memory for buffering AlnIndex related data is " // << totalAlnIndexMem << " KB."<< endl; //cout << "The estimated memory for buffering AlnArray related data is " // << totalAlnArrayMem << " KB." << endl; //cout << "The estimated memory for hdf5 is " // << hdf5Mem << " KB." << endl; //cout << "The estimated total memory is " // << maxMemory << " KB." << endl; return maxMemory; } // // Get aligned sequence for this alignment from cmpFile // string GetAlignedSequenceFromCmpFile( const HDFCmpFile & cmpReader, MovieAlnIndexLookupTable & lookupTable) { string alignedSequence; vector byteAlignment; int alignedSequenceLength = lookupTable.offsetEnd - lookupTable.offsetBegin; if (alignedSequenceLength >= 0 ) { alignedSequence.resize(alignedSequenceLength); byteAlignment.resize(alignedSequenceLength); } // // Read the alignment string. All alignments // cmpReader.refAlignGroups[lookupTable.refGroupIndex]->readGroups[lookupTable.readGroupIndex]->alignmentArray.Read( lookupTable.offsetBegin, lookupTable.offsetEnd, &byteAlignment[0]); // // Convert to something we can compare easily. // ByteAlignmentToQueryString(&byteAlignment[0], byteAlignment.size(), &alignedSequence[0]); return alignedSequence; } // // Store info necessary for loading pulses to lookupTable. // void BuildLookupTable( const int & movieAlignmentIndex, CmpFile & cmpFile, BaseFile & baseFile, const bool & usePulseFile, PulseFile & pulseFile, HDFCmpFile & cmpReader, const vector & movieAlnIndex, const vector< pair > & toFrom, const set & moviePartHoleNumbers, MovieAlnIndexLookupTable & lookupTable) { // // Query the cmp file for a way to look up a read based on // coordinate information. For Astro reads, the coords are // based on x and y. 
For Springfield, it is read index. The // base files should be able to look up reads by x,y or by // index. // if (cmpFile.platformId == Astro) { cout << "ASTRO pulse loading is deprecated." << endl; exit(1); } int alignmentIndex = movieAlnIndex[toFrom[movieAlignmentIndex].second]; // // Alignments are grouped by ref group id then movie id. // int refGroupId = cmpFile.alnInfo.alignments[alignmentIndex].GetRefGroupId(); int movieId = cmpFile.alnInfo.alignments[alignmentIndex].GetMovieId(); UInt holeNumber = cmpFile.alnInfo.alignments[alignmentIndex].GetHoleNumber(); int alnGroupId = cmpFile.alnInfo.alignments[alignmentIndex].GetAlnGroupId(); if (cmpReader.refGroupIdToArrayIndex.find(refGroupId) == cmpReader.refGroupIdToArrayIndex.end()) { cout << "ERROR! An alignment " << alignmentIndex << " is specified with reference group " << endl << refGroupId << " that is not found as an alignment group." << endl; exit(1); } int refGroupIndex = cmpReader.refGroupIdToArrayIndex[refGroupId]; // // Now find the group containing the alignment. // if (cmpReader.alnGroupIdToReadGroupName.find(alnGroupId) == cmpReader.alnGroupIdToReadGroupName.end()) { cout << "ERROR! An alignment " << alignmentIndex << " is specified with alignment group " << endl << alnGroupId << " that is not found." << endl; exit(1); } string readGroupName = cmpReader.alnGroupIdToReadGroupName[alnGroupId]; if (cmpReader.refAlignGroups[refGroupIndex]->experimentNameToIndex.find(readGroupName) == cmpReader.refAlignGroups[refGroupIndex]->experimentNameToIndex.end()) { cout << "ERROR! An alignment " << alignmentIndex << " is specified with read group name " << endl << readGroupName << " that is not found." 
<< endl; exit(1); } int readGroupIndex = cmpReader.refAlignGroups[refGroupIndex]->experimentNameToIndex[readGroupName]; UInt offsetBegin = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetBegin(); UInt offsetEnd = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetEnd(); // // First pull out the bases corresponding to this read. // int queryStart = cmpFile.alnInfo.alignments[alignmentIndex].GetQueryStart(); int queryEnd = cmpFile.alnInfo.alignments[alignmentIndex].GetQueryEnd(); bool skip = false; int readIndex, readStart, readLength, plsReadIndex; readIndex = readStart = readLength = plsReadIndex = -1; // // Since the movie may be split into multiple parts, look to see // if this hole number is one of the ones covered by this // set. If it is not, just continue. It will be loaded on // another pass through a different movie part. // if (moviePartHoleNumbers.find(holeNumber) == moviePartHoleNumbers.end()) { skip = true; } else { if (!baseFile.LookupReadIndexByHoleNumber(holeNumber, readIndex)) { cout << "ERROR! Alignment has hole number " << holeNumber << " that is not in the movie. " << endl; exit(1); } readStart = baseFile.readStartPositions[readIndex]; readLength = baseFile.readStartPositions[readIndex+1] - baseFile.readStartPositions[readIndex]; if (usePulseFile) { if (!pulseFile.LookupReadIndexByHoleNumber(holeNumber, plsReadIndex)) { cout << "ERROR! Alignment has hole number " << holeNumber << " that is not in the movie. 
" << endl; exit(1); } assert(pulseFile.holeNumbers[plsReadIndex] == baseFile.holeNumbers[readIndex]); } } // Save info to lookupTable lookupTable.SetValue(skip, // Skip processing this or not movieAlignmentIndex, alignmentIndex, refGroupIndex, readGroupIndex, holeNumber, // cmp.h5 /AlnInfo/AlnIndex column 7 offsetBegin, // cmp.h5 /AlnInfo/AlnIndex column 18 offsetEnd, // cmp.h5 /AlnInfo/AlnIndex column 19 queryStart, // cmp.h5 /AlnInfo/AlnIndex column 11 queryEnd, // cmp.h5 /AlnInfo/AlnIndex column 12 readIndex, // hole Index in BaseCalls/ZMW/HoleNumber readStart, // readStart in BaseCalls/* (e.g. *=Basecall) readLength, // readLength in BaseCalls/* plsReadIndex); // readIndex in PulseCalls/ZMW/HoleNumber } // // Map bases of a read to pulse indices. // void MapBaseToPulseIndex( BaseFile & baseFile, PulseFile & pulseFile, MovieAlnIndexLookupTable & table, vector & baseToPulseIndexMap) { baseToPulseIndexMap.resize(table.readLength); int pulseStart = pulseFile.pulseStartPositions[table.plsReadIndex]; // // Copy the subset of pulses that correspond to the ones called as bases. // int i; for (i = 0; i < table.readLength; i++) { baseToPulseIndexMap[i] = pulseStart + baseFile.pulseIndex[table.readStart + i]; } } // // Get source read from the bas/pls file. // void GetSourceRead(CmpFile & cmpFile, BaseFile & baseFile, PulseFile & pulseFile, HDFBasReader & hdfBasReader, HDFPlsReader & hdfPlsReader, HDFCCSReader & hdfCcsReader, const bool & useBaseFile, const bool & usePulseFile, const bool & useCcsOnly, //const bool & byRead, MovieAlnIndexLookupTable & table, const string & alignedSequence, SMRTSequence & sourceRead, unsigned int & numPasses) { assert(!table.skip); // // These are not allocated in the regular allocate function // since they are only used in loadPulses. (maybe I should // subclass SMRTSequence here). // //if (byRead) { // Read in the data from the bas file if it exsts. 
if (useBaseFile) { hdfBasReader.GetReadAt(table.readIndex, sourceRead); if (cmpFile.readType == ReadType::CCS or useCcsOnly) { numPasses = hdfCcsReader.GetNumPasses(table.readIndex); } } // Read in the data from the pls file if it exists. if (usePulseFile) { hdfPlsReader.GetReadAt(table.plsReadIndex, sourceRead.pulseIndex, sourceRead); } // } // else { // This is deprecated // // // // The entire base/pulse file was read in, so copy data from that into a read // // For the data used in the read, it is possible to simply // // reference the data, but for the pls file it is necessary // // to copy since there is a packing of data. // // // if (useBaseFile) { // baseFile.CopyReadAt(table.readIndex, sourceRead); // if (cmpFile.readType == ReadType::CCS or useCcsOnly) { // numPasses = hdfCcsReader.GetNumPasses(table.readIndex); // } // } // if (usePulseFile) { // vector baseToPulseIndexMap; // MapBaseToPulseIndex(baseFile, pulseFile, table, baseToPulseIndexMap); // pulseFile.CopyReadAt(table.readIndex, &baseToPulseIndexMap[0], sourceRead); // } //} CapQualityValues(sourceRead); } // // Build lookup tables for all alignments whose indices in // AlnArray are saved in movieAlnIndex. // Also check whether the bas file and the cmp file match. 
// void BuildLookupTablesAndMakeSane( CmpFile & cmpFile, BaseFile & baseFile, PulseFile & pulseFile, HDFCmpFile & cmpReader, HDFBasReader & hdfBasReader, HDFPlsReader & hdfPlsReader, HDFCCSReader & hdfCcsReader, const bool & useBaseFile, const bool & usePulseFile, const bool & useCcsOnly, const vector & movieAlnIndex, const vector< pair > & toFrom, const set & moviePartHoleNumbers, vector & lookupTables) { lookupTables.resize(movieAlnIndex.size()); int movieAlignmentIndex = 0; for (movieAlignmentIndex = 0; movieAlignmentIndex < movieAlnIndex.size(); movieAlignmentIndex++) { BuildLookupTable(movieAlignmentIndex, cmpFile, baseFile, usePulseFile, pulseFile, cmpReader, movieAlnIndex, toFrom, moviePartHoleNumbers, lookupTables[movieAlignmentIndex]); } // // Load entire Basecall from pls/bas to memory, and // check whether aligned sequences in cmp.h5 matches // sequences in pls/bas or not // hdfBasReader.ReadField(baseFile, "Basecall"); // // For each alignment, do sanity check and // cache aligned sequence in MovieAlnIndexLookupTable // for (movieAlignmentIndex = 0; movieAlignmentIndex < movieAlnIndex.size(); movieAlignmentIndex++) { MovieAlnIndexLookupTable & table = lookupTables[movieAlignmentIndex]; if (table.skip) continue; // // Get aligned sequence for this alignment from cmpFile // string alignedSequence = GetAlignedSequenceFromCmpFile(cmpReader, table); // Save the aligned sequence in the table table.alignedSequence = alignedSequence; RemoveGaps(alignedSequence, alignedSequence); // // Get sequence for this alignment from baseFile // Nucleotide * seq = new Nucleotide[table.readLength]; baseFile.CopyArray(baseFile.baseCalls, table.readStart, table.readLength, seq); string readSequence; readSequence.resize(table.queryEnd - table.queryStart); copy((char*) (seq + table.queryStart), (char*) (seq + table.queryEnd), readSequence.begin()); delete seq; // // Do a sanity check to make sure the pulses and the alignment // make sense. 
The main check is to see if the query sequence // in the alignment is the same as the query sequence in the // read. // if (alignedSequence.size() != readSequence.size() or alignedSequence != readSequence) { cout << "ERROR, the query sequence does not match the aligned query sequence." << endl << "HoleNumber: " << cmpFile.alnInfo.alignments[table.alignmentIndex].GetHoleNumber() << ", MovieName: " << baseFile.GetMovieName() << ", ReadIndex: " << table.readIndex << ", qStart: " << table.queryStart << ", qEnd: " << table.queryEnd << endl << "Aligned sequence: " << endl << alignedSequence << endl << "Original sequence: "<< endl << readSequence << endl; exit(1); } } hdfBasReader.ClearField(baseFile, "Basecall"); } // Given a vector of lookupTables in which items with the same // refGroupIndex and readGroupIndex are grouped, find index boundaries // of each group and save these boundaries to groupedLookupTablesIndexPairs // The index boundary of each group consists of: // 1, index (0 based, inclusive) of the very first item of a group // 2, index (0 based, exclusive) of the very last item of a group // // Assume that lookupTables satisfy the following criteria. // 1, items are already grouped by refGroupIndex and readGroupIndex // 2, items which have the same alnGroupIndex, should have // the same refGroupIndex and readGroupIndex // Note that: // 1, alnGroupIndex represents index of AlnGroupID, (i.e. dataset // /AlnInfo/AlnIndex column 1); // refGroupIndex represents index of RefGroupID, (i.e. dataset // /AlnInfo/AlnIndex column 3); // readGroupIndex represents index of an experiment group within // a refGroup (e.g. if a refGroup /ref0001 contains two experiment // groups /ref0001/movie1 and /ref0001/movie2, then readGroupIndex // for these two groups are 0 and 1.). // 2, within each grouped item, offsetBegin may not begin from 0, // and offsets may not be continugous. 
// void GroupLookupTables( vector & lookupTables, vector > & groupedLookupTablesIndexPairs) { vector > refGroupIndexReadGroupIndexPairs; UInt movieAlignmentIndex = 0; UInt preRefGroupIndex = 0; UInt preReadGroupIndex = 0; UInt pairFirst = 0; bool isVeryFirstGroup = true; for (movieAlignmentIndex = 0; movieAlignmentIndex < lookupTables.size(); movieAlignmentIndex++) { MovieAlnIndexLookupTable & lookupTable = lookupTables[movieAlignmentIndex]; if (isVeryFirstGroup or (lookupTable.refGroupIndex != preRefGroupIndex or lookupTable.readGroupIndex != preReadGroupIndex)) { // Find a new group if (isVeryFirstGroup) { // This is the very first group isVeryFirstGroup = false; } else if (lookupTable.refGroupIndex == preRefGroupIndex && lookupTable.readGroupIndex != preReadGroupIndex) { // Assumption (1) has been violated cout << "ERROR! lookupTables should have been sorted by reference" << "group index and read group index." << endl; exit(1); } else { // Find the first lookupTable of a new group, save indices of [first and last) // lookupTables of the last group. 
groupedLookupTablesIndexPairs.push_back(pair (pairFirst, movieAlignmentIndex)); // Save refGroupIndex and readGroupIndex of the last group pair refGroupIndexReadGroupIndexPair(preRefGroupIndex, preReadGroupIndex); refGroupIndexReadGroupIndexPairs.push_back(refGroupIndexReadGroupIndexPair); } // Store index of the first lookupTable of the new group in lookupTables pairFirst = movieAlignmentIndex; // Store refGroupIndex and readGroupIndex of the new group preRefGroupIndex = lookupTable.refGroupIndex; preReadGroupIndex = lookupTable.readGroupIndex; } } if (not isVeryFirstGroup) { // Save indices of [first and last) lookupTables of the very last group groupedLookupTablesIndexPairs.push_back(pair (pairFirst, movieAlignmentIndex)); // Save refGroupIndex and readGroupIndex of the very last group pair refGroupIndexReadGroupIndexPair(preRefGroupIndex, preReadGroupIndex); refGroupIndexReadGroupIndexPairs.push_back(refGroupIndexReadGroupIndexPair); } // Do nothing, if no lookupTable exists // Double check all assumptions are met for (int i = 0; i < refGroupIndexReadGroupIndexPairs.size(); i++) { for (int j = i+1; j < refGroupIndexReadGroupIndexPairs.size(); j++) { // Assure that assumption (1) is met. If this assertion fails, // then alignments in the input cmp.h5 are not grouped by // reference. Check /AlnInfo/AlnIndex dataset column 3. 
assert(refGroupIndexReadGroupIndexPairs[i] != refGroupIndexReadGroupIndexPairs[j]); } } assert(groupedLookupTablesIndexPairs.size() == refGroupIndexReadGroupIndexPairs.size()); int i ; for (i = 0; i < groupedLookupTablesIndexPairs.size(); i++) { UInt firstIndex = groupedLookupTablesIndexPairs[i].first; UInt lastIndex = groupedLookupTablesIndexPairs[i].second; UInt refGroupIndex = refGroupIndexReadGroupIndexPairs[i].first; UInt readGroupIndex = refGroupIndexReadGroupIndexPairs[i].second; for(UInt index = firstIndex; index < lastIndex; index++) { assert(lookupTables[index].refGroupIndex == refGroupIndex); assert(lookupTables[index].readGroupIndex == readGroupIndex); } } } // // Read all required fields for computing the specified metric into memory, // unless the fields have been cached. // void CacheRequiredFieldsForMetric( BaseFile & baseFile, PulseFile & pulseFile, HDFBasReader & hdfBasReader, HDFPlsReader & hdfPlsReader, HDFCCSReader & hdfCcsReader, const bool & useBaseFile, const bool & usePulseFile, const bool & useCcsOnly, vector & cachedFields, const string & curMetric) { vector fieldsToBeUsed; bool canBeComputed = CanThisMetricBeComputed( curMetric, hdfBasReader, hdfPlsReader, useBaseFile, usePulseFile, fieldsToBeUsed); assert(canBeComputed); // Cache all required fields for (int i = 0; i < fieldsToBeUsed.size(); i++) { bool isFieldCached = false; for (int j = 0; j < cachedFields.size(); j++) { if (fieldsToBeUsed[i] == cachedFields[j]) { isFieldCached = true; break; } } if (isFieldCached) { continue; } string & curField = fieldsToBeUsed[i].name; FieldType & fieldType= fieldsToBeUsed[i].type; if (fieldType == BasField and useBaseFile and hdfBasReader.FieldIsIncluded(curField) and hdfBasReader.includedFields[curField]) { hdfBasReader.ReadField(baseFile, curField); cachedFields.push_back(fieldsToBeUsed[i]); } else if (fieldType == PlsField and usePulseFile and hdfPlsReader.FieldIsIncluded(curField) and hdfPlsReader.includedFields[curField]) { 
hdfPlsReader.ReadField(pulseFile, curField); cachedFields.push_back(fieldsToBeUsed[i]); } } } // // Clear cached fields unless they are also required for computing // the next metric. // void ClearCachedFields( BaseFile & baseFile, PulseFile & pulseFile, HDFBasReader & hdfBasReader, HDFPlsReader & hdfPlsReader, HDFCCSReader & hdfCcsReader, const bool & useBaseFile, const bool & usePulseFile, const bool & useCcsOnly, vector & cachedFields, const string & curMetric, const string & nextMetric) { vector nextRequiredFields; if (nextMetric != "") { bool canBeComputed = CanThisMetricBeComputed( nextMetric, hdfBasReader, hdfPlsReader, useBaseFile, usePulseFile, nextRequiredFields); assert(canBeComputed); } for (int i = 0; i < cachedFields.size(); i++) { bool isRequiredForNextMetric = false; for (int j = 0; j < nextRequiredFields.size(); j++) { if (cachedFields[i] == nextRequiredFields[j]) { isRequiredForNextMetric = true; break; } } if (isRequiredForNextMetric) { continue; } string & curField = cachedFields[i].name; FieldType & fieldType= cachedFields[i].type; if (fieldType == BasField and useBaseFile and hdfBasReader.FieldIsIncluded(curField) and hdfBasReader.includedFields[curField]) { hdfBasReader.ClearField(baseFile, curField); // Remove it from cachedFields cachedFields.erase(cachedFields.begin()+i); i--; } else if (fieldType == PlsField and usePulseFile and hdfPlsReader.FieldIsIncluded(curField) and hdfPlsReader.includedFields[curField]) { if (curField == "NumEvent") { // Always keep NumEvent continue; } hdfPlsReader.ClearField(pulseFile, curField); // Remove it from cachedFields cachedFields.erase(cachedFields.begin()+i); i--; } } } // Compute StartFrame from BaseCalls only. // Return true if succeed, false otherwise. 
bool ComputeStartFrameFromBase( BaseFile & baseFile, HDFBasReader & hdfBasReader, const bool & useBaseFile, MovieAlnIndexLookupTable & lookupTable, vector & newStartFrame) { newStartFrame.resize(lookupTable.readLength); if (useBaseFile and hdfBasReader.FieldIsIncluded("PreBaseFrames") and hdfBasReader.includedFields["PreBaseFrames"] and baseFile.preBaseFrames.size() > 0) { // baseFile.preBaseFrame data type = uint16 // startFrame data type = uint32 for (int i = 0; i < lookupTable.readLength; i++) { newStartFrame[i] = baseFile.preBaseFrames[lookupTable.readStart+i]; } for (int i = 0; i < lookupTable.readLength-1; i++) { newStartFrame[i+1] += baseFile.basWidthInFrames[lookupTable.readStart+i]; } partial_sum(&newStartFrame[0], &newStartFrame[lookupTable.readLength], &newStartFrame[0]); return true; } return false; } // Compute StartFrame from PulseCalls only. // Return true if succeed, false otherwise. bool ComputeStartFrameFromPulse( PulseFile & pulseFile, HDFPlsReader & hdfPlsReader, const bool & usePulseFile, MovieAlnIndexLookupTable & lookupTable, vector & baseToPulseIndexMap, vector & newStartFrame) { newStartFrame.resize(lookupTable.readLength); if (usePulseFile) { assert(pulseFile.startFrame.size() > 0); hdfPlsReader.CopyFieldAt(pulseFile, "StartFrame", lookupTable.plsReadIndex, &baseToPulseIndexMap[0], &newStartFrame[0], lookupTable.readLength); return true; } return false; } // Compute StartFrame from either (1) BaseCalls or (2) PulseCalls. // (1) Uses baseFile.preBaseFrames and baseFile.basWidthInFrames // (2) Uses pulseFile.startFrame // In theory, the generated results using both methods should // be exactly the same. However, they can be different in practice // because PreBaseFrames is of data type uint_16, while its // value can exceed maximum uint_16 (65535). // When possible, always use PulseCalls. 
void ComputeStartFrame( BaseFile & baseFile, PulseFile & pulseFile, HDFBasReader & hdfBasReader, HDFPlsReader & hdfPlsReader, bool useBaseFile, bool usePulseFile, MovieAlnIndexLookupTable & lookupTable, vector & baseToPulseIndexMap, vector & newStartFrame) { if (!ComputeStartFrameFromPulse(pulseFile, hdfPlsReader, usePulseFile, lookupTable, baseToPulseIndexMap, newStartFrame)) { if (!ComputeStartFrameFromBase(baseFile, hdfBasReader, useBaseFile, lookupTable, newStartFrame)) { cout << "ERROR! There is insufficient data to compute metric: StartFrame." << endl; exit(1); } } } // // Compute and write an entire metric to cmp.h5. // Assume that all required fields have been loaded. // void WriteMetric( CmpFile & cmpFile, BaseFile & baseFile, PulseFile & pulseFile, HDFCmpFile & cmpReader, HDFBasReader & hdfBasReader, HDFPlsReader & hdfPlsReader, HDFCCSReader & hdfCcsReader, const bool & useBaseFile, const bool & usePulseFile, const bool & useCcsOnly, vector & lookupTables, vector > & groupedLookupTablesIndexPairs, const string & curMetric ) { int movieAlignmentIndex = 0; for (int index = 0; index < groupedLookupTablesIndexPairs.size(); index++) { // Group[index] contains all items in lookupTables[firstIndex...lastIndex) UInt firstIndex = groupedLookupTablesIndexPairs[index].first; UInt lastIndex = groupedLookupTablesIndexPairs[index].second; assert(lookupTables.size() > firstIndex); UInt refGroupIndex = lookupTables[firstIndex].refGroupIndex; UInt readGroupIndex = lookupTables[firstIndex].readGroupIndex; // Obtain alignment array length from *.cmp.h5/refGroup/readGroup/AlnArray. HDFCmpExperimentGroup* expGroup = cmpReader.refAlignGroups[refGroupIndex]->readGroups[readGroupIndex]; UInt alnArrayLength = expGroup->alignmentArray.size(); // // Compute any necessary data fields. These usually involve // using differences of pulse indices, pulse widths, etc.. // Missing fields are stored as 0's. 
// vector startTimeOffsetMetric; // pulseIndex's data type is uint16 in ICD, // but I have seen it defined as uint32 in a bas file. vector pulseMetric; vector qvMetric; vector frameRateMetric; vector timeMetric; vector tagMetric; vector floatMetric; /* if (curMetric == "StartTimeOffset") { startTimeOffsetMetric.resize(alnNum); HDFArray * data = (HDFArray*) expGroup->fields[curMetric]; if (data->IsInitialized()) { assert(data->size() == alnNum); data->UpdateH5Dataspace(); data->Read(0, alnNum-1, &StartTimeOffsetMetric[0]); } else { data->Initialize(expGroup->experimentGroup, curMetric); fill(startTimeOffsetMetric.begin(), startTimeOffsetMetric.end(), ); } } else */ if (curMetric == "QualityValue" || curMetric == "InsertionQV" || curMetric == "DeletionQV" || curMetric == "MergeQV" || curMetric == "SubstitutionQV") { qvMetric.resize(alnArrayLength); HDFArray * data = (HDFArray*) expGroup->fields[curMetric]; if (data->IsInitialized()) { assert(data->size() == alnArrayLength); data->UpdateH5Dataspace(); data->Read(0, alnArrayLength-1, &qvMetric[0]); } else { data->Initialize(expGroup->experimentGroup, curMetric); //fill(qvMetric.begin(), qvMetric.end(), missingQualityValue); } } else if (curMetric == "ClassifierQV" || curMetric == "pkmid" ) { // Note that data type of pkmid=midSignal, which is uint_8 in bas/pls files, // has been changed to float in cmp.h5. Why? 
floatMetric.resize(alnArrayLength); HDFArray * data = (HDFArray*) expGroup->fields[curMetric]; if (data->IsInitialized()) { assert(data->size() == alnArrayLength); data->UpdateH5Dataspace(); data->Read(0, alnArrayLength-1, &floatMetric[0]); } else { data->Initialize(expGroup->experimentGroup, curMetric); //fill(floatMetric.begin(), floatMetric.end(), NaN); } } else if (curMetric == "PulseIndex" ) { pulseMetric.resize(alnArrayLength); HDFArray * data = (HDFArray*) expGroup->fields[curMetric]; if (data->IsInitialized()) { assert(data->size() == alnArrayLength); data->UpdateH5Dataspace(); data->Read(0, alnArrayLength-1, &pulseMetric[0]); } else { data->Initialize(expGroup->experimentGroup, curMetric); //fill(pulseMetric.begin(), pulseMetric.end(), 0); } } else if (curMetric == "DeletionTag" || curMetric == "SubstitutionTag") { tagMetric.resize(alnArrayLength); HDFArray * data = (HDFArray*) expGroup->fields[curMetric]; if (data->IsInitialized()) { assert(data->size() == alnArrayLength); data->UpdateH5Dataspace(); data->Read(0, alnArrayLength-1, &tagMetric[0]); } else { data->Initialize(expGroup->experimentGroup, curMetric); //fill(tagMetric.begin(), tagMetric.end(), '-'); } } else if (curMetric == "StartFrame" || curMetric == "StartFrameBase" || curMetric == "StartFramePulse") { timeMetric.resize(alnArrayLength); HDFArray * data = (HDFArray*) expGroup->fields[curMetric]; if (data->IsInitialized()) { assert(data->size() == alnArrayLength); data->UpdateH5Dataspace(); data->Read(0, alnArrayLength-1, &timeMetric[0]); } else { data->Initialize(expGroup->experimentGroup, curMetric); //fill(timeMetric.begin(), timeMetric.end(), missingPulseIndex); } } else if (curMetric == "PulseWidth" || curMetric == "PreBaseFrames" || curMetric == "WidthInFrames"|| curMetric == "IPD" || curMetric == "Light") { frameRateMetric.resize(alnArrayLength); HDFArray * data = (HDFArray*) expGroup->fields[curMetric]; if (data->IsInitialized()) { assert(data->size() == alnArrayLength); 
data->UpdateH5Dataspace(); data->Read(0, alnArrayLength-1, &frameRateMetric[0]); } else { data->Initialize(expGroup->experimentGroup, curMetric); //fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue); } } else { cout << "ERROR, metric " << curMetric << " is not supported." << endl; exit(1); } for (movieAlignmentIndex = firstIndex; movieAlignmentIndex < lastIndex; movieAlignmentIndex++) { MovieAlnIndexLookupTable & lookupTable = lookupTables[movieAlignmentIndex]; if (lookupTable.skip) continue; const UInt alignedSequenceLength = lookupTable.offsetEnd - lookupTable.offsetBegin; const UInt ungappedAlignedSequenceLength = lookupTable.queryEnd - lookupTable.queryStart; const UInt & readIndex = lookupTable.readIndex; const UInt & plsReadIndex = lookupTable.plsReadIndex; const UInt & readStart = lookupTable.readStart; const UInt & readLength = lookupTable.readLength; const UInt & queryStart = lookupTable.queryStart; const UInt & offsetBegin = lookupTable.offsetBegin; const UInt & offsetEnd = lookupTable.offsetEnd; assert (offsetEnd <= alnArrayLength); assert (offsetBegin+alignedSequenceLength <= alnArrayLength); // Condense gaps and get ungapped aligned sequence. string ungappedAlignedSequence = lookupTable.alignedSequence; RemoveGaps(ungappedAlignedSequence, ungappedAlignedSequence); vector baseToAlignmentMap; // Map bases in the aligned sequence to their positions in the alignment. CreateSequenceToAlignmentMap(lookupTable.alignedSequence, baseToAlignmentMap); vector baseToPulseIndexMap; if (usePulseFile && IsPulseMetric(curMetric)) { // Map bases in the read to pulse indices. 
MapBaseToPulseIndex(baseFile, pulseFile, lookupTable, baseToPulseIndexMap); } UInt i; if (curMetric == "QualityValue") { assert(baseFile.qualityValues.size() > 0 && baseFile.qualityValues.size() >= readStart + readLength); fill(&qvMetric[offsetBegin], &qvMetric[offsetEnd], missingPulseIndex); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { // cap quality value qvMetric[offsetBegin+baseToAlignmentMap[i]] = min(maxQualityValue, baseFile.qualityValues[readStart+queryStart+i]); } qvMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "InsertionQV") { assert(baseFile.insertionQV.size() > 0 && baseFile.insertionQV.size() >= readStart + readLength); fill(&qvMetric[offsetBegin], &qvMetric[offsetEnd], missingPulseIndex); for (i = 0; i < ungappedAlignedSequenceLength; i++) { // cap quality value qvMetric[offsetBegin+baseToAlignmentMap[i]] = min(maxQualityValue, baseFile.insertionQV[readStart+queryStart+i]); } qvMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "MergeQV") { assert(baseFile.mergeQV.size() > 0 && baseFile.mergeQV.size() >= readStart + readLength); fill(&qvMetric[offsetBegin], &qvMetric[offsetEnd], missingPulseIndex); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { // cap quality value qvMetric[offsetBegin+baseToAlignmentMap[i]] = min(maxQualityValue, baseFile.mergeQV[readStart+queryStart+i]); } qvMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "DeletionQV") { assert(baseFile.deletionQV.size() > 0 && baseFile.deletionQV.size() >= readStart + readLength); fill(&qvMetric[offsetBegin], &qvMetric[offsetEnd], missingPulseIndex); for (i = 0; i < ungappedAlignedSequenceLength; i++) { // cap quality value qvMetric[offsetBegin+baseToAlignmentMap[i]] = min(maxQualityValue, baseFile.deletionQV[readStart+queryStart+i]); } qvMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "DeletionTag") { assert(baseFile.deletionTag.size() > 0 && baseFile.deletionTag.size() >= readStart + 
readLength); fill(&tagMetric[offsetBegin], &tagMetric[offsetEnd], '-'); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { assert(offsetBegin+baseToAlignmentMap[i] < tagMetric.size()); tagMetric[offsetBegin+baseToAlignmentMap[i]] = baseFile.deletionTag[readStart+queryStart+i]; } tagMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "PulseIndex") { assert(baseFile.pulseIndex.size() > 0 && baseFile.pulseIndex.size() >= readStart + readLength); fill(&pulseMetric[offsetBegin], &pulseMetric[offsetEnd], 0); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { pulseMetric[offsetBegin+baseToAlignmentMap[i]] = baseFile.pulseIndex[readStart+queryStart+i]; } pulseMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "SubstitutionTag") { assert(baseFile.substitutionTag.size() > 0 && baseFile.substitutionTag.size() >= readStart + readLength); fill(&tagMetric[offsetBegin], &tagMetric[offsetEnd], '-'); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { tagMetric[offsetBegin+baseToAlignmentMap[i]] = baseFile.substitutionTag[readStart+queryStart+i]; } tagMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "SubstitutionQV") { assert(baseFile.substitutionQV.size() > 0 && baseFile.substitutionQV.size() >= readStart + readLength); fill(&qvMetric[offsetBegin], &qvMetric[offsetEnd], missingPulseIndex); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { qvMetric[offsetBegin+baseToAlignmentMap[i]] = min(maxQualityValue, baseFile.substitutionQV[readStart+queryStart+i]); } qvMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "ClassifierQV") { assert(pulseFile.classifierQV.size() > 0 && pulseFile.classifierQV.size() >= readStart + readLength); vector newClassifierQV; newClassifierQV.resize(ungappedAlignedSequenceLength); // For the data used for this table, it is possible to simply // reference the data for the bas file, but for the pls file, // it is necessary to copy since there is a packing of data. 
hdfPlsReader.CopyFieldAt(pulseFile, "ClassifierQV", plsReadIndex, &baseToPulseIndexMap[queryStart], &newClassifierQV[0], ungappedAlignedSequenceLength); fill(&floatMetric[offsetBegin], &floatMetric[offsetEnd], NaN); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { floatMetric[offsetBegin+baseToAlignmentMap[i]] = newClassifierQV[i]; } floatMetric[offsetBegin+alignedSequenceLength] = 0; /* } else if (curMetric == "StartTimeOffset") { // StartTimeOffset is a subset of StartFrame. vector newStartFrame; ComputeStartFrame(baseFile, pulseFile, hdfBasReader, hdfPlsReader, useBaseFile, usePulseFile, lookupTable, baseToPulseIndexMap, newStartFrame); startTimeOffsetMetric[offsetBegin] = newStartFrame[queryStart]; */ } else if (curMetric == "StartFrame") { vector newStartFrame; ComputeStartFrame(baseFile, pulseFile, hdfBasReader, hdfPlsReader, useBaseFile, usePulseFile, lookupTable, baseToPulseIndexMap, newStartFrame); fill(&timeMetric[offsetBegin], &timeMetric[offsetEnd], missingPulseIndex); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { timeMetric[offsetBegin+baseToAlignmentMap[i]] = newStartFrame[queryStart+i]; } timeMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "StartFrameBase") { // Sneaky metric, compute StartFrame from BaseCalls only. vector newStartFrame; ComputeStartFrameFromBase(baseFile, hdfBasReader, useBaseFile, lookupTable, newStartFrame); fill(&timeMetric[offsetBegin], &timeMetric[offsetEnd], missingPulseIndex); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { timeMetric[offsetBegin+baseToAlignmentMap[i]] = newStartFrame[queryStart+i]; } timeMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "StartFramePulse") { // Sneaky metric, compute StartFrame from PulseCalls only. 
vector newStartFrame; ComputeStartFrameFromPulse(pulseFile, hdfPlsReader, usePulseFile, lookupTable, baseToPulseIndexMap, newStartFrame); fill(&timeMetric[offsetBegin], &timeMetric[offsetEnd], missingPulseIndex); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { timeMetric[offsetBegin+baseToAlignmentMap[i]] = newStartFrame[queryStart+i]; } timeMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "PreBaseFrames") { // Directly load baseFile.PreBaseFrames. // DON'T compute it from PulseCalls even if you can. assert(baseFile.preBaseFrames.size() > 0 && baseFile.preBaseFrames.size() >= readStart + readLength); fill(&frameRateMetric[offsetBegin], &frameRateMetric[offsetEnd], missingFrameRateValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { frameRateMetric[offsetBegin+baseToAlignmentMap[i]] = baseFile.preBaseFrames[readStart+queryStart+i]; } frameRateMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "WidthInFrames" || curMetric == "PulseWidth") { // For legacy reasons, it's possible the width in frames is // stored in the bas file. If this is the case, use the width // in frames there. Otherwise, use the width in frames stored // in the pls file. 
vector newWidthInFrames; newWidthInFrames.resize(ungappedAlignedSequenceLength); if (usePulseFile) { hdfPlsReader.CopyFieldAt(pulseFile, "WidthInFrames", plsReadIndex, &baseToPulseIndexMap[queryStart], &newWidthInFrames[0], ungappedAlignedSequenceLength); } else if (useBaseFile) { // basWidthInFrames data type uint16 copy(&baseFile.basWidthInFrames[readStart+queryStart], &baseFile.basWidthInFrames[readStart+queryStart+ungappedAlignedSequenceLength], &newWidthInFrames[0]); } fill(&frameRateMetric[offsetBegin], &frameRateMetric[offsetEnd], missingFrameRateValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { frameRateMetric[offsetBegin+baseToAlignmentMap[i]] = newWidthInFrames[i]; } frameRateMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "pkmid") { // pkmid in cmp.h5 is MidSignal in pls.h5, but // data type of MidSignal is uint16 in pls files, // data type of pkmid is float in cmp files. assert(usePulseFile); vector newMidSignal; newMidSignal.resize(ungappedAlignedSequenceLength); hdfPlsReader.CopyFieldAt(pulseFile, "MidSignal", plsReadIndex, &baseToPulseIndexMap[queryStart], &newMidSignal[0], ungappedAlignedSequenceLength, ungappedAlignedSequence); fill(&floatMetric[offsetBegin], &floatMetric[offsetEnd], NaN); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { floatMetric[offsetBegin+baseToAlignmentMap[i]] = newMidSignal[i]; } floatMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "IPD") { fill(&frameRateMetric[offsetBegin], &frameRateMetric[offsetEnd], missingFrameRateValue); // IPD can be either (1) copied from baseFile.preBaseFrames // or (2) computed from pulseFile.StartFrame and pulseFile.WidthInFrames // Always use method (2) when possible as it is more accurate. 
if (usePulseFile) { // Need to read StartFrame & WidthInFrames for the entire read, // not only for a subset of bases in the alignment assert(pulseFile.startFrame.size() > 0); assert(pulseFile.plsWidthInFrames.size() > 0); vector newStartFrame; newStartFrame.resize(readLength); hdfPlsReader.CopyFieldAt(pulseFile, "StartFrame", plsReadIndex, &baseToPulseIndexMap[0], &newStartFrame[0], readLength); vector newWidthInFrames; newWidthInFrames.resize(readLength); hdfPlsReader.CopyFieldAt(pulseFile, "WidthInFrames", plsReadIndex, &baseToPulseIndexMap[0], &newWidthInFrames[0], readLength); for (i = 0; i < ungappedAlignedSequenceLength; i++) { // The IPD is undefined for the first base in a read. if (queryStart == 0 and i == 0) { frameRateMetric[offsetBegin+baseToAlignmentMap[i]] = 0; } else { frameRateMetric[offsetBegin+baseToAlignmentMap[i]] = newStartFrame[queryStart+i] - newStartFrame[i+queryStart-1] - newWidthInFrames[i+queryStart-1]; } } } else if (useBaseFile) { assert(baseFile.preBaseFrames.size() > 0); assert(baseFile.preBaseFrames.size() >= readStart + readLength); for (i = 0; i < ungappedAlignedSequenceLength; i++) { frameRateMetric[offsetBegin+baseToAlignmentMap[i]] = baseFile.preBaseFrames[readStart+queryStart+i]; } } frameRateMetric[offsetBegin+alignedSequenceLength] = 0; } else if (curMetric == "Light") { // Light can be computed from pulseFile.meanSignal and // pulseFile.plsWidthInFrames. Might have been deprecated. 
assert(usePulseFile); fill(&frameRateMetric[offsetBegin], &frameRateMetric[offsetEnd], missingFrameRateValue); vector newMeanSignal; newMeanSignal.resize(ungappedAlignedSequenceLength); hdfPlsReader.CopyFieldAt(pulseFile, "MeanSignal", plsReadIndex, &baseToPulseIndexMap[queryStart], &newMeanSignal[0], ungappedAlignedSequenceLength, ungappedAlignedSequence); vector newWidthInFrames; newWidthInFrames.resize(ungappedAlignedSequenceLength); hdfPlsReader.CopyFieldAt(pulseFile, "WidthInFrames", plsReadIndex, &baseToPulseIndexMap[queryStart], &newWidthInFrames[0], ungappedAlignedSequenceLength); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { frameRateMetric[offsetBegin+baseToAlignmentMap[i]] = newMeanSignal[i] * newWidthInFrames[i]; } frameRateMetric[offsetBegin+alignedSequenceLength] = 0; } else { cout << "ERROR, unknown metric " << curMetric << endl; exit(1); } } // Write the computed metric to cmp.h5. /*if (curMetric == "StartTimeOffset") { expGroup->startTimeOffset.WriteToPos(&startTimeOffsetMetric[0], startTimeOffsetMetric.size(), 0); } else */ if (curMetric == "QualityValue" || curMetric == "InsertionQV" || curMetric == "DeletionQV" || curMetric == "MergeQV" || curMetric == "SubstitutionQV") { HDFArray * data = (HDFArray *) expGroup->fields[curMetric]; data->WriteToPos(&qvMetric[0], qvMetric.size(), 0); } else if (curMetric == "ClassifierQV" || curMetric == "pkmid" ) { HDFArray * data = (HDFArray *) expGroup->fields[curMetric]; data->WriteToPos(&floatMetric[0], floatMetric.size(), 0); } else if (curMetric == "PulseIndex") { HDFArray * data = (HDFArray *) expGroup->fields[curMetric]; data->WriteToPos(&pulseMetric[0], pulseMetric.size(), 0); } else if (curMetric == "DeletionTag" || curMetric == "SubstitutionTag") { HDFArray * data = (HDFArray *) expGroup->fields[curMetric]; data->WriteToPos(&tagMetric[0], tagMetric.size(), 0); } else if (curMetric == "StartFrame" || curMetric == "StartFrameBase"|| curMetric == "StartFramePulse") { HDFArray * data = (HDFArray*) 
expGroup->fields[curMetric]; data->WriteToPos(&timeMetric[0], timeMetric.size(), 0); } else if (curMetric == "PulseWidth" || curMetric == "PreBaseFrames" || curMetric == "WidthInFrames"|| curMetric == "IPD" || curMetric == "Light") { HDFArray * data = (HDFArray*) expGroup->fields[curMetric]; data->WriteToPos(&frameRateMetric[0], frameRateMetric.size(), 0); } else { cout << "ERROR, unknown metric " << curMetric << endl; exit(1); } } } // // Write "WhenStarted" from pls.h5 and write to cmp.h5 // void WriteMetricWhenStarted( HDFCmpFile & cmpReader, HDFPlsReader & hdfPlsReader, const string & movieName) { string metric = "WhenStarted"; string whenStarted; if (hdfPlsReader.scanDataReader.useWhenStarted == false) { cout << "ERROR! Attempting to read WhenStarted from " << movieName << " but the attriubte does not exist." << endl; exit(1); } hdfPlsReader.scanDataReader.ReadWhenStarted(whenStarted); if (!cmpReader.movieInfoGroup.whenStartedArray.IsInitialized()) { cmpReader.movieInfoGroup.whenStartedArray.Initialize(cmpReader.movieInfoGroup.movieInfoGroup, metric); } cmpReader.movieInfoGroup.whenStartedArray.Write(&whenStarted, 1); } // // Print metrics. // string MetricsToString(const vector & metrics) { string ret = ""; int j = 0; for (int i = 0; i < metrics.size(); i++) { ret += metrics[i]; if (i != metrics.size()-1) ret += ","; if (i % 4 == 3) ret += "\n"; } return ret; } // // Print usage. // void PrintUsage() { cout << " loadPulses - Load pulse information and quality values into a Compare file" << endl; cout << "usage: loadPulses movieFile cmpFile [-metrics m1,m2,...] [-byread]" << endl; cout << " movieFile may be a movie file or a fofn of movie file names." << endl; cout << " metrics m1,m2,... is a comma-separated list (without spaces) of metrics " << endl << " to print to the pulse file." 
<< endl; cout << " Valid metrics are: " << endl; cout << MetricsToString(GetAllSupportedMetrics(false)) << endl; // << " QualityValue, ClassifierQV, MergeQV," << endl // << " StartFrame, PulseWidth, pkmid, IPD, Light" << endl // << " WhenStarted, StartTimeOffset, PreBaseFrames," << endl // << " InsertionQV, DeletionQV, DeletionTag, SubstitutionQV" << endl // << " SubstitutionTag, PulseIndex, WidthInFrames" << endl; cout << " By default, " << MetricsToString(GetDefaultMetrics()) << " are added" << endl; // Deprecate -useccs, an option for old data. // cout << " -useccs This option is for older cmp.h5 files that do not have the read type " << endl // << " stored. Newer cmp.h5 files have a read type that indicates the cmp.h5 file " << endl // << " has alignments generated from de novo ccs sequences. Using this flag assuems"< metricOptions; int maxElements = 0; //Maximum Memory allowed for bymetric is 6 GB int maxMemory = 4; // // Default is all options are false // CreateMetricOptions(metricOptions); string metricList = ""; bool useCcsOnly = false; bool byRead = false; bool byMetric = false; bool failOnMissingData = false; CommandLineParser clp; clp.SetProgramName(program); clp.SetVersion(versionStr); clp.RegisterStringOption("basFileName", &movieFileName, "The input {bas,pls}.h5 or input.fofn.", true); clp.RegisterStringOption("cmpFileName", &cmpFileName, "The cmp.h5 file to load pulse information into.", true); clp.RegisterPreviousFlagsAsHidden(); string metricsDescription = "A comma separated list of metrics (with no spaces).\nValid options are:\n"; metricsDescription += MetricsToString(GetAllSupportedMetrics(false)); metricsDescription += "\nDefault options are:\n"; metricsDescription += MetricsToString(GetDefaultMetrics()); clp.RegisterStringOption("metrics", &metricList, metricsDescription); clp.RegisterFlagOption("failOnMissingData", &failOnMissingData, "Exit if any data fields are missing from the bas.h5 or pls.h5 " "input that are required to load a metric. 
Defualt is a warning."); clp.RegisterFlagOption("byread", &byRead, "Load pulse information by read rather than buffering metrics."); clp.RegisterFlagOption("bymetric", & byMetric, "Load pulse information by metric rather than by read. " "This uses more memory than -byread, but can be faster."); clp.RegisterIntOption("maxElements", &maxElements, "Set a limit on the size of pls/bas file to buffer in with -bymetric " "(default value: maximum int). Use -byread if the limit is exceeded.", CommandLineParser::PositiveInteger); clp.RegisterIntOption("maxMemory", & maxMemory, "Set a limit (in GB) on the memory to buffer data with -bymetric " "(default value: 4 GB). Use -byread if the limit is exceeded.", CommandLineParser::PositiveInteger); int metaNElements, rawChunkSize, rawNElements; metaNElements = rawChunkSize = metaNElements = 0; clp.RegisterIntOption("metaNElements", & metaNElements, "Set number of elements in meta data cache for reading bas/bax/pls.h5 file.", CommandLineParser::PositiveInteger); clp.RegisterIntOption("rawNElements", & rawNElements, "Set number of elements in raw data cache for reading bas/bax/pls.h5 file.", CommandLineParser::PositiveInteger); clp.RegisterIntOption("rawChunkSize", & rawChunkSize, "Set chunk size of raw data cache for reading bas/bax/pls.h5 file.", CommandLineParser::PositiveInteger); string progSummary = ("Loads pulse information such as inter pulse " "distance, or quality information into the cmp.h5 file. This allows " "one to analyze kinetic and quality information by alignment column."); clp.SetProgramSummary(progSummary); clp.ParseCommandLine(argc, argv); cerr << "[INFO] " << GetTimestamp() << " [" << program << "] started." << endl; //use byMetric by default unless byRead is specified. 
byMetric = true; if (byRead) { byMetric = false; } if (metricList == "") { SetDefaultMetricOptions(metricOptions); } else { ParseMetricsList(metricList, metricOptions); } // // Always read in basecalls since they are used to check the sanity // of the alignment indices. // metricOptions["Basecall"] = true; // // Translate from the metrics to be loaded to the ones that are // required to compute them. // Need to be refactored. // vector datasetFields; RequirementMap fieldRequirements; BuildRequirementMap(fieldRequirements); StoreDatasetFieldsFromPulseFields(metricOptions, fieldRequirements, datasetFields); //e.g. /PATH_TO_FILE/m120321_032600_42142_c100310572550000001523013208061210_s1_p0.bas.h5 // /PATH_TO_FILE/m120321_032600_42142_c100310572550000001523013208061210_s2_p0.bas.h5 vector movieFileNames; //e.g. m120321_032600_42142_c100310572550000001523013208061210_s1_p0 // m120321_032600_42142_c100310572550000001523013208061210_s2_p0 vector fofnMovieNames; FileOfFileNames::StoreFileOrFileList(movieFileName, movieFileNames); HDFBasReader hdfBasReader; HDFPlsReader hdfPlsReader; HDFCCSReader hdfCcsReader; vector baseFileFields, pulseFileFields; int fieldIndex; bool useBaseFile = false, usePulseFile = false; for (fieldIndex = 0; fieldIndex < datasetFields.size(); fieldIndex++) { if (hdfBasReader.ContainsField(datasetFields[fieldIndex])) { useBaseFile = true; baseFileFields.push_back(datasetFields[fieldIndex]); } } if (maxElements != 0) { hdfBasReader.maxAllocNElements = maxElements; hdfPlsReader.maxAllocNElements = maxElements; } // // For now, all runs will attempt to use information from a .bas // file, since it's assumed that if one has alignments, one has a // .bas file. // useBaseFile = true; // // Add some default fields. 
// hdfBasReader.IncludeField("Basecall"); hdfBasReader.IncludeField("PulseIndex"); hdfBasReader.InitializeFields(baseFileFields); for (fieldIndex = 0; fieldIndex < datasetFields.size(); fieldIndex++) { if (hdfPlsReader.ContainsField(datasetFields[fieldIndex])) { usePulseFile = true; pulseFileFields.push_back(datasetFields[fieldIndex]); } } if (usePulseFile) { // set hdfPlsReader.includedFields[fieldX] to true if fieldX is // in pulseFileFields hdfPlsReader.InitializeFields(pulseFileFields); } hdfPlsReader.IncludeField("NumEvent"); int nMovies = movieFileNames.size(); int movieIndex; MovieNameToArrayIndex movieNameMap; // // Initialize movies. This accomplishes two tasks. First, all movie // files are opened and initialized, so that if there are data // fields missing the program will exit now rather than in the // middle of loading pulses. // Next, a list of movie names is created in fofnMovieNames. The // cmp file does not necessarily index movies in the order of the // fofn, and so when loading pulses from a movie indexed by a cmp // file, one needs to look up the file name of the movie. This is // done by scanning the fofnMovieNames list in order until the movie // is found. // // h5 file access property list can be customized here. 
// H5::FileAccPropList fileAccPropList = H5::FileAccPropList::DEFAULT; // h5: number of items in meta data cache int mdc_nelmts = (metaNElements==0)?(4096):(metaNElements); // h5: number of items in raw data chunk cache size_t rdcc_nelmts = (rawNElements==0)?(4096):(rawNElements); // h5: raw data chunk cache size (in bytes) per dataset size_t rdcc_nbytes = (rawChunkSize==0)?(9192):(rawChunkSize); double rdcc_w0 = 0.75; // h5: preemption policy // fileAccPropList.getCache(mdc_nelmts, rdcc_nelmts, rdcc_nbytes, rdcc_w0); fileAccPropList.setCache(mdc_nelmts, rdcc_nelmts, rdcc_nbytes, rdcc_w0); // fileAccPropList.setCache(4096, 4096, 8388608, rdcc_w0); // If one of the h5 in the fofn is a ccs.h5 file, then only load pulse // information from group /PulseData/ConsensusBaseCalls. for (movieIndex = 0; movieIndex < nMovies; movieIndex++) { FileType fileType; BaseSequenceIO::DetermineFileTypeByExtension(movieFileNames[movieIndex], fileType, true); if (fileType == HDFCCSONLY) { useCcsOnly = true; } } for (movieIndex = 0; movieIndex < nMovies; movieIndex++) { if (useCcsOnly) { hdfCcsReader.SetReadBasesFromCCS(); hdfBasReader.SetReadBasesFromCCS(); } if (!hdfBasReader.Initialize(movieFileNames[movieIndex], fileAccPropList)) { cout << "ERROR, could not initialize HDF file " << movieFileNames[movieIndex] << " for reading bases." << endl; exit(1); } else { fofnMovieNames.push_back(hdfBasReader.GetMovieName()); movieNameMap[hdfBasReader.GetMovieName()] = movieIndex; hdfBasReader.Close(); } // // The pulse file is optional. // if (usePulseFile) { if (hdfPlsReader.Initialize(movieFileNames[movieIndex], fileAccPropList) == 0) { usePulseFile = false; } } } CmpFile cmpFile; // // These readers pull information from the same pls file. // HDFCmpFile cmpReader; if (cmpReader.Initialize(cmpFileName, H5F_ACC_RDWR) == 0) { cout << "ERROR, could not open the cmp file." << endl; exit(1); } if (cmpReader.HasNoAlignments()) { cout << "WARNING, there is no alignment in the cmp file." 
<< endl; if (useBaseFile) { hdfBasReader.Close(); } if (usePulseFile) { hdfPlsReader.Close(); } cmpReader.Close(); cerr << "[INFO] " << GetTimestamp() << " [" << program << "] ended." << endl; exit(0); } cmpReader.Read(cmpFile, false); // Sanity check: if there is a ccs.h5 file in the fofn and // cmp.h5 file's readType is not CCS, something is wrong. if (cmpFile.readType != ReadType::CCS and useCcsOnly) { cout << "ERROR, there is a ccs.h5 file in the fofn, while read type of" << " the cmp.h5 file is not CCS." << endl; exit(1); } string commandLine; clp.CommandLineToString(argc, argv, commandLine); cmpReader.fileLogGroup.AddEntry(commandLine, "Loading pulse metrics", program, GetTimestamp(), versionStr); // // Group alignment indices by movie so that they may be processed one movie at a time // later on. The movie indices set keeps track of all indices // listed in alignment files. This keeps a reference to all // alignments in memory at once. At the time of writing this, most // projects will have at most a few million alignments, and so the // size of this structure is modest. // Each movieIndexSets[$movieId] contains indices of all the alignments, which // are associated with a movie whose id in dataset /MovieInfo/ID equals $movieId // UInt alignmentIndex; map > movieIndexSets; for (alignmentIndex = 0; alignmentIndex < cmpFile.alnInfo.alignments.size(); alignmentIndex++) { movieIndexSets[cmpFile.alnInfo.alignments[alignmentIndex].GetMovieId()].push_back(alignmentIndex); } // // Load pulses from movies in order they appear in the input fofn. 
// int m; int fofnMovieIndex; for (fofnMovieIndex = 0; fofnMovieIndex < fofnMovieNames.size(); fofnMovieIndex++) { bool byMetricForThisMovie = byMetric; if (cmpFile.readType == ReadType::CCS or useCcsOnly) { hdfBasReader.SetReadBasesFromCCS(); hdfCcsReader.Initialize(movieFileNames[fofnMovieIndex], fileAccPropList); } hdfBasReader.Initialize(movieFileNames[fofnMovieIndex], fileAccPropList); BaseFile baseFile; PulseFile pulseFile; // // Deprecate reading the entire bas.h5 file. Reads are scanned // one by one or by metric, instead of caching all. // It is still necessary to read in some of the datasets entirely, // in particular the start positions and hole numbers. // hdfBasReader.ReadBaseFileInit(baseFile); set moviePartHoleNumbers; copy(baseFile.holeNumbers.begin(), baseFile.holeNumbers.end(), inserter(moviePartHoleNumbers, moviePartHoleNumbers.begin())); if (usePulseFile) { hdfPlsReader.Initialize(movieFileNames[fofnMovieIndex], fileAccPropList); hdfPlsReader.IncludeField("NumEvent"); hdfPlsReader.IncludeField("StartFrame"); // // Deprecate reading the entire pls.h5 file. // Reads are scanned by read or by metric instead of caching all. // It is still necessary to read in some of the datasets entirely, // in particular the start positions and hole numbers. // hdfPlsReader.ReadPulseFileInit(pulseFile); } string cmpFileMovieName; for (m = 0; m < cmpFile.movieInfo.name.size(); m++) { // // First find the file name for the movie 'm' // cmpFileMovieName = cmpFile.movieInfo.name[m]; if (baseFile.GetMovieName() == cmpFileMovieName) { break; } } // // If the movie specified in the input.fofn is not found in the // cmp file, that indicates something bad is happeing. Either the // input.fofn was not used to generate the cmp.h5 file, or no // alignments were found between the input bas.h5 and the // reference. That shouldn't happen. 
// if (m == cmpFile.movieInfo.name.size()) { cout << "WARNING: Could not find any alignments for file " << movieFileNames[fofnMovieIndex] << endl; continue; } // // Open the movie and load its pulses into memory. // movieIndex = cmpFile.movieInfo.id[m]; UInt movieAlignmentIndex; // // Since usePulseFile is set when the input file is a pulseFile, // and ReadType::CCS becomes the read type when the alignments are // ccs, when pulse files are specified for de novo ccs alignments, // they will be opened as pulse files. Since the de novo ccs // sequences do not have pulse file information, the auto-reading // of pulse files needs to be disabled. Do that here. // if (cmpFile.readType == ReadType::CCS or useCcsOnly) { usePulseFile = false; } // Check whether all metrics are computable or not. CanMetricsBeComputed(metricOptions, hdfBasReader, hdfPlsReader, useBaseFile, usePulseFile, failOnMissingData, movieFileNames[fofnMovieIndex]); // Get all metrics that are (1) supported, (2) required and (3) can be loaded. vector metricsToLoad = GetMetricsToLoad(metricOptions); // // An index set is a set of indices into the alignment array that // are of reads generated by this movie. Load pulses for all // alignments generated for this movie. // // Movie index sets should be sorted by alignment index. Build a lookup table for this. // std::vector > toFrom; UInt totalAlnLength = 0; for (movieAlignmentIndex = 0; movieAlignmentIndex < movieIndexSets[movieIndex].size(); movieAlignmentIndex++) { alignmentIndex = movieIndexSets[movieIndex][movieAlignmentIndex]; totalAlnLength += cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetEnd() - \ cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetBegin(); toFrom.push_back(std::pair(cmpFile.alnInfo.alignments[alignmentIndex].GetAlignmentId(), movieAlignmentIndex)); } // orders by first by default. 
std::sort(toFrom.begin(), toFrom.end()); // // Check metric dataset size in this movie and the required memory // consumption, if either limit is exceeded, switch to byread. // if (byMetricForThisMovie) { UInt requiredMem = ComputeRequiredMemory(metricsToLoad, hdfBasReader, hdfPlsReader, useBaseFile, usePulseFile, cmpReader, totalAlnLength); if (hdfBasReader.baseArray.arrayLength > hdfBasReader.maxAllocNElements or (usePulseFile and hdfPlsReader.GetStartFrameSize() > hdfPlsReader.maxAllocNElements) or ((float)requiredMem / 1024 / 1024) > maxMemory) { cout << "Either the number of elements exceeds maxElement (" << hdfPlsReader.maxAllocNElements << "). Or the estimated memory " << endl << "consumption exceeds maxMemory (" << maxMemory << " GB)." << endl << "Loading pulses from " << movieFileNames[fofnMovieIndex] << " by read." << endl; byMetricForThisMovie = false; } } if (((metricOptions.find("StartFrameBase") != metricOptions.end() and metricOptions["StartFrameBase"]) or (metricOptions.find("StartFramePulse")!= metricOptions.end() and metricOptions["StartFramePulse"])) and !byMetricForThisMovie) { // Sneaky metrics StartFrameBase and StartFramePulse can used // with -bymetric only cout << "ERROR: Internal metrics StartFrameBase and StartFramePulse " << "can only be loaded with -bymetric." << endl; exit(1); } // Load "WhenStarted" before processing the others. if (metricOptions["WhenStarted"]) { WriteMetricWhenStarted(cmpReader, hdfPlsReader, movieFileNames[fofnMovieIndex]); } // Now load frame rate. // if (AnyFieldRequiresFrameRate(datasetFields)) { // Load frame rate anyway to ensure that cmp.h5 files are consistent. if (useBaseFile) { cmpReader.movieInfoGroup.StoreFrameRate(m, baseFile.GetFrameRate()); } else if (usePulseFile) { cmpReader.movieInfoGroup.StoreFrameRate(m, pulseFile.GetFrameRate()); } // // Load metrics for alignments from movie 'movieIndex'. 
// cout << "loading " << movieIndexSets[movieIndex].size() << " alignments for movie " << movieIndex << endl; UInt i; if (byMetricForThisMovie) { // // Build lookup tables for all alignments which // are generated by the movie and check whether // pls/bas.h5 and cmp.h5 match. // vector lookupTables; BuildLookupTablesAndMakeSane(cmpFile, baseFile, pulseFile, cmpReader, hdfBasReader, hdfPlsReader, hdfCcsReader, useBaseFile, usePulseFile, useCcsOnly, movieIndexSets[movieIndex], toFrom, moviePartHoleNumbers, lookupTables); // // Group lookup tables by refGroupIndex and readGroupIndex. // vector > groupedLookupTablesIndexPairs; GroupLookupTables(lookupTables, groupedLookupTablesIndexPairs); if (cmpFile.readType == ReadType::CCS or useCcsOnly) { vector numPassesMetric; numPassesMetric.resize(lookupTables.size()); UInt index = 0; for (index = 0; index < lookupTables.size(); index++) { if (lookupTables[index].skip) { continue; } numPassesMetric[index] = hdfCcsReader.GetNumPasses(lookupTables[index].readIndex); } if (!cmpReader.alnInfoGroup.numPasses.IsInitialized()) { cmpReader.alnInfoGroup.InitializeNumPasses(); // Clear /AlnInfo/NumPasses dataset. cmpReader.alnInfoGroup.numPasses.Resize(0); } // Append numPasses of this movie to the end of /AlnInfo/NumPasses. UInt numPassesSize = cmpReader.alnInfoGroup.numPasses.size(); cmpReader.alnInfoGroup.numPasses.WriteToPos( &numPassesMetric[0], numPassesMetric.size(), numPassesSize); } // Keep a list of currently cached fields. vector cachedFields; if (usePulseFile) { // PulseCalls/ZMW/NumEvent is always cached in plsFile. cachedFields.push_back(Field("NumEvent", PlsField)); } for (int metricsToLoadIndex = 0; metricsToLoadIndex < metricsToLoad.size(); metricsToLoadIndex++) { string curMetric = metricsToLoad[metricsToLoadIndex]; // Metric "WhenStarted" should have been loaded before getting here. if (curMetric == "WhenStarted") { continue; } // Get the next metric to load. 
string nextMetric = ""; if (metricsToLoadIndex+1 < metricsToLoad.size()) { nextMetric = metricsToLoad[metricsToLoadIndex+1]; } // Cache all required data for computing this metric. CacheRequiredFieldsForMetric(baseFile, pulseFile, hdfBasReader, hdfPlsReader, hdfCcsReader, useBaseFile, usePulseFile, useCcsOnly, cachedFields, curMetric); // Compute the metric and write it to cmp.h5. WriteMetric(cmpFile, baseFile, pulseFile, cmpReader, hdfBasReader, hdfPlsReader, hdfCcsReader, useBaseFile, usePulseFile, useCcsOnly, lookupTables, groupedLookupTablesIndexPairs, curMetric); // Clear cached fields unless they are required by the next metric. ClearCachedFields(baseFile, pulseFile, hdfBasReader, hdfPlsReader, hdfCcsReader, useBaseFile, usePulseFile, useCcsOnly, cachedFields, curMetric, nextMetric); } // Clear the default field "NumEvent" if (usePulseFile) { hdfPlsReader.ClearField(pulseFile, "NumEvent"); } } else { // byRead for this movie for (movieAlignmentIndex = 0; movieAlignmentIndex < movieIndexSets[movieIndex].size(); movieAlignmentIndex++) { MovieAlnIndexLookupTable lookupTable; BuildLookupTable(movieAlignmentIndex, cmpFile, baseFile, usePulseFile, pulseFile, cmpReader, movieIndexSets[movieIndex], toFrom, moviePartHoleNumbers, lookupTable); // Skip this alignment if it is not generated by this movie if (lookupTable.skip) { continue; } UInt & alignmentIndex = lookupTable.alignmentIndex; int & refGroupIndex = lookupTable.refGroupIndex; int & readGroupIndex = lookupTable.readGroupIndex; UInt & holeNumber = lookupTable.holeNumber; int & readIndex = lookupTable.readIndex; int & queryStart = lookupTable.queryStart; int & queryEnd = lookupTable.queryEnd; int & readStart = lookupTable.readStart; int & readLength = lookupTable.readLength; UInt & offsetBegin = lookupTable.offsetBegin; UInt & offsetEnd = lookupTable.offsetEnd; string alignedSequence = GetAlignedSequenceFromCmpFile(cmpReader, lookupTable); // Create a map of where. 
vector baseToAlignmentMap; CreateSequenceToAlignmentMap(alignedSequence, baseToAlignmentMap); // Condense gaps in the alignment for easy comparison. RemoveGaps(alignedSequence, alignedSequence); // Get source read. unsigned int numPasses; SMRTSequence sourceRead; GetSourceRead(cmpFile, baseFile , pulseFile , hdfBasReader, hdfPlsReader, hdfCcsReader, useBaseFile , usePulseFile, useCcsOnly , //byRead , lookupTable , alignedSequence, sourceRead , numPasses); string readSequence; readSequence.resize(queryEnd - queryStart); copy((char*) (sourceRead.seq + queryStart), (char*) (sourceRead.seq + queryEnd), readSequence.begin()); if (alignedSequence.size() != readSequence.size() or alignedSequence != readSequence) { cout << "ERROR, the query sequence does not match the aligned query sequence." << endl; cout << "HoleNumber: "<< holeNumber << ", MovieName: " << cmpFileMovieName; cout << ", ReadIndex: " << (int) readIndex; cout << ", qStart: "<< queryStart << ", qEnd: " << queryEnd << endl; cout << "Aligned sequence: "<< endl; cout << alignedSequence << endl; cout << "Original sequence: " << endl; cout << readSequence << endl; assert(0); } // // Compute any necessary data fields. These usually involve // using differences of pulse indices, pulse widths, etc.. // Missing fields are stored as 0's. 
// vector readPulseMetric; vector floatMetric; vector qvMetric; vector frameRateMetric; vector timeMetric; int ungappedAlignedSequenceLength = alignedSequence.size(); assert(ungappedAlignedSequenceLength == queryEnd - queryStart); int alignedSequenceLength = offsetEnd - offsetBegin; readPulseMetric.resize(alignedSequenceLength+1); qvMetric.resize(alignedSequenceLength+1); frameRateMetric.resize(alignedSequenceLength+1); timeMetric.resize(alignedSequenceLength+1); UInt i; UInt pi; HDFCmpExperimentGroup* expGroup = cmpReader.refAlignGroups[refGroupIndex]->readGroups[readGroupIndex]; UInt alnArrayLength = expGroup->alignmentArray.size(); if (cmpFile.readType == ReadType::CCS or useCcsOnly) { if (!cmpReader.alnInfoGroup.numPasses.IsInitialized()) { cmpReader.alnInfoGroup.InitializeNumPasses(); } cmpReader.alnInfoGroup.numPasses.WriteToPos(&numPasses, 1, alignmentIndex); } if (metricOptions["StartTimeOffset"] == true) { if (!expGroup->startTimeOffset.IsInitialized()) { expGroup->startTimeOffset.Initialize(expGroup->experimentGroup, "StartTimeOffset"); } unsigned int readStartTimeOffset = sourceRead.startFrame[queryStart]; expGroup->startTimeOffset.WriteToPos(&readStartTimeOffset, 1, alignmentIndex); } if (metricOptions["QualityValue"] == true) { if (!expGroup->qualityValue.IsInitialized()) { expGroup->qualityValue.Initialize(expGroup->experimentGroup, "QualityValue", true, alnArrayLength); } // Store QualityValue. fill(qvMetric.begin(), qvMetric.end(), missingQualityValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { qvMetric[baseToAlignmentMap[i]] = sourceRead.qual[queryStart + i]; } qvMetric[qvMetric.size()-1] = 0; expGroup->qualityValue.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin); } if (metricOptions["InsertionQV"] == true) { if (!expGroup->insertionQV.IsInitialized()) { expGroup->insertionQV.Initialize(expGroup->experimentGroup, "InsertionQV", true, alnArrayLength); } // Store InsertionQV. 
fill(qvMetric.begin(), qvMetric.end(), missingQualityValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { qvMetric[baseToAlignmentMap[i]] = sourceRead.insertionQV[queryStart+ i]; } qvMetric[qvMetric.size()-1] = 0; expGroup->insertionQV.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin); } if (metricOptions["MergeQV"] == true) { if (!expGroup->mergeQV.IsInitialized()) { expGroup->mergeQV.Initialize(expGroup->experimentGroup, "MergeQV", true, alnArrayLength); } // Store MergeQV. fill(qvMetric.begin(), qvMetric.end(), missingQualityValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { qvMetric[baseToAlignmentMap[i]] = sourceRead.mergeQV[queryStart+ i]; } qvMetric[qvMetric.size()-1] = 0; expGroup->mergeQV.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin); } if (metricOptions["DeletionQV"] == true) { if (!expGroup->deletionQV.IsInitialized()) { expGroup->deletionQV.Initialize(expGroup->experimentGroup, "DeletionQV", true, alnArrayLength); } // Store DeletionQV. fill(qvMetric.begin(), qvMetric.end(), missingQualityValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { qvMetric[baseToAlignmentMap[i]] = sourceRead.deletionQV[queryStart+i]; } qvMetric[qvMetric.size()-1] = 0; expGroup->deletionQV.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin); } if (metricOptions["DeletionTag"] == true) { if (!expGroup->deletionTag.IsInitialized()) { expGroup->deletionTag.Initialize(expGroup->experimentGroup, "DeletionTag", true, alnArrayLength); } vector readDeletionTagMetric; readDeletionTagMetric.resize(readPulseMetric.size()); // Store DeletionTag. 
for (i = 0; i < readDeletionTagMetric.size()-1; i++ ) { readDeletionTagMetric[i] = '-'; } readDeletionTagMetric[i] = '\0'; for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { assert(baseToAlignmentMap[i] < readDeletionTagMetric.size()); readDeletionTagMetric[baseToAlignmentMap[i]] = sourceRead.deletionTag[queryStart+i]; } readDeletionTagMetric[readDeletionTagMetric.size()-1] = 0; expGroup->deletionTag.WriteToPos(&readDeletionTagMetric[0], readDeletionTagMetric.size(), offsetBegin); } if (metricOptions["PulseIndex"] == true) { if (!expGroup->pulseIndex.IsInitialized()) { expGroup->pulseIndex.Initialize(expGroup->experimentGroup, "PulseIndex", true, alnArrayLength); } vector readPulseIndexMetric; fill(readPulseIndexMetric.begin(), readPulseIndexMetric.end(), missingPulseIndex); readPulseIndexMetric.resize(readPulseMetric.size()); // Store Pulse Index. assert(readPulseIndexMetric.size() > 0); for (i = 0; i < readPulseIndexMetric.size(); i++ ) { readPulseIndexMetric[i] = 0; } for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { readPulseIndexMetric[baseToAlignmentMap[i]] = sourceRead.pulseIndex[queryStart+i]; } readPulseIndexMetric[readPulseIndexMetric.size()-1] = 0; expGroup->pulseIndex.WriteToPos(&readPulseIndexMetric[0], readPulseIndexMetric.size(), offsetBegin); } if (metricOptions["SubstitutionTag"] == true) { if (!expGroup->substitutionTag.IsInitialized()) { expGroup->substitutionTag.Initialize(expGroup->experimentGroup, "SubstitutionTag", true, alnArrayLength); } vector readSubstitutionTagMetric; readSubstitutionTagMetric.resize(readPulseMetric.size()); // Store substitutionTag for (i = 0; i < readSubstitutionTagMetric.size()-1; i++ ) { readSubstitutionTagMetric[i] = '-'; } readSubstitutionTagMetric[i] = '\0'; for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { readSubstitutionTagMetric[baseToAlignmentMap[i]] = sourceRead.substitutionTag[queryStart+i]; } readSubstitutionTagMetric[readSubstitutionTagMetric.size()-1] = 0; 
expGroup->substitutionTag.WriteToPos(&readSubstitutionTagMetric[0], readSubstitutionTagMetric.size(), offsetBegin); } if (metricOptions["SubstitutionQV"] == true) { if (!expGroup->substitutionQV.IsInitialized()) { expGroup->substitutionQV.Initialize(expGroup->experimentGroup, "SubstitutionQV", true, alnArrayLength); } // Store start time normalized to frame rate. fill(qvMetric.begin(), qvMetric.end(), missingQualityValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { qvMetric[baseToAlignmentMap[i]] = sourceRead.substitutionQV[queryStart+i]; } qvMetric[qvMetric.size()-1] = 0; expGroup->substitutionQV.WriteToPos(&qvMetric[0], qvMetric.size(), offsetBegin); } if (metricOptions["ClassifierQV"] == true) { if (!expGroup->classifierQV.IsInitialized()) { expGroup->classifierQV.Initialize(expGroup->experimentGroup, "ClassifierQV", true, alnArrayLength); } fill(floatMetric.begin(), floatMetric.end(), NaN); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { floatMetric[baseToAlignmentMap[i]] = sourceRead.classifierQV[i+queryStart]; } qvMetric[qvMetric.size()-1] = 0; expGroup->classifierQV.WriteToPos(&floatMetric[0], floatMetric.size(), offsetBegin); } if (metricOptions["StartFrame"] == true) { if (!expGroup->startTime.IsInitialized()) { expGroup->startTime.Initialize(expGroup->experimentGroup, "StartFrame", true, alnArrayLength); } // StartFrame used to be computed from baseFile.preBaseFrame and // baseFile.basWidthInFrames, whenever possible. But a more accurate // way is to obtain StartFrame directly from pulseFile.StartFrame // when a pulseFile is provided. 
if (usePulseFile) { assert(sourceRead.startFrame); } else if (useBaseFile) { if (sourceRead.startFrame) { Free(sourceRead.startFrame); } sourceRead.startFrame = new unsigned int[sourceRead.length]; copy(sourceRead.preBaseFrames, &sourceRead.preBaseFrames[sourceRead.length], sourceRead.startFrame); for (i = 0; i < sourceRead.length-1; i++) { sourceRead.startFrame[i+1] += sourceRead.widthInFrames[i]; } partial_sum(sourceRead.startFrame, &sourceRead.startFrame[sourceRead.length], sourceRead.startFrame); } fill(timeMetric.begin(), timeMetric.end(), missingPulseIndex); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { timeMetric[baseToAlignmentMap[i]] = sourceRead.startFrame[i+queryStart]; } timeMetric[timeMetric.size()-1] = 0; expGroup->startTime.WriteToPos(&timeMetric[0], timeMetric.size(), offsetBegin); } if (metricOptions["PulseWidth"] == true) { if (!expGroup->pulseWidth.IsInitialized()) { expGroup->pulseWidth.Initialize(expGroup->experimentGroup, "PulseWidth", true, alnArrayLength); } fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue); // For legacy reasons, it's possible the width in frames is // stored in the bas file. If this is the case, use the width // in frames there. Otherwise, use the width in frames stored // in the pls file. 
for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { frameRateMetric[baseToAlignmentMap[i]] = sourceRead.widthInFrames[queryStart + i]; } frameRateMetric[frameRateMetric.size()-1] = 0; expGroup->pulseWidth.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin); } if (metricOptions["PreBaseFrames"] == true) { if (!expGroup->preBaseFrames.IsInitialized()) { expGroup->preBaseFrames.Initialize(expGroup->experimentGroup, "PreBaseFrames", true, alnArrayLength); } fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { frameRateMetric[baseToAlignmentMap[i]] = sourceRead.preBaseFrames[i+queryStart]; } frameRateMetric[frameRateMetric.size()-1] = 0; expGroup->preBaseFrames.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin); } if (metricOptions["WidthInFrames"] == true) { if (!expGroup->widthInFrames.IsInitialized()) { expGroup->widthInFrames.Initialize(expGroup->experimentGroup, "WidthInFrames", true, alnArrayLength); } // Compute width in frames. 
fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { frameRateMetric[baseToAlignmentMap[i]] = sourceRead.widthInFrames[i+queryStart]; } frameRateMetric[frameRateMetric.size()-1] = 0; expGroup->widthInFrames.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin); } if (metricOptions["pkmid"] == true) { if (!expGroup->pkmid.IsInitialized()) { expGroup->pkmid.Initialize(expGroup->experimentGroup, "pkmid", true, alnArrayLength); } for (i = 0; i < readPulseMetric.size(); i++ ) { readPulseMetric[i] = NaN; } for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { readPulseMetric[baseToAlignmentMap[i]] = sourceRead.midSignal[i+queryStart]; } readPulseMetric[readPulseMetric.size()-1] = 0; expGroup->pkmid.WriteToPos(&readPulseMetric[0], readPulseMetric.size(), offsetBegin); } if (metricOptions["IPD"] == true) { if (!expGroup->ipd.IsInitialized()) { expGroup->ipd.Initialize(expGroup->experimentGroup, "IPD", true, alnArrayLength); } fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { // // The IPD is undefined for the first base in a read. 
// if (usePulseFile ) { if (queryStart == 0 and i == 0) { frameRateMetric[baseToAlignmentMap[i]] = 0; } else { frameRateMetric[baseToAlignmentMap[i]] = (sourceRead.startFrame[i+queryStart] - sourceRead.startFrame[i+queryStart-1] - sourceRead.widthInFrames[i+queryStart-1]); } } else if (useBaseFile) { frameRateMetric[baseToAlignmentMap[i]] = sourceRead.preBaseFrames[i + queryStart]; } } frameRateMetric[frameRateMetric.size()-1] = 0; expGroup->ipd.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin); } if (metricOptions["Light"] == true) { if (!expGroup->light.IsInitialized()) { expGroup->light.Initialize(expGroup->experimentGroup, "Light", true, alnArrayLength); } fill(frameRateMetric.begin(), frameRateMetric.end(), missingFrameRateValue); for (i = 0; i < ungappedAlignedSequenceLength; i++ ) { frameRateMetric[baseToAlignmentMap[i]] = sourceRead.meanSignal[i+queryStart]; frameRateMetric[baseToAlignmentMap[i]] = (frameRateMetric[baseToAlignmentMap[i]] * sourceRead.widthInFrames[i+queryStart]); } frameRateMetric[frameRateMetric.size()-1] = 0; expGroup->light.WriteToPos(&frameRateMetric[0], frameRateMetric.size(), offsetBegin); } sourceRead.Free(); Free(sourceRead.meanSignal); Free(sourceRead.maxSignal); Free(sourceRead.midSignal); Free(sourceRead.startFrame); Free(sourceRead.classifierQV); Free(sourceRead.widthInFrames); } } if (useBaseFile) { hdfBasReader.Close(); } if (cmpFile.readType == ReadType::CCS or useCcsOnly) { hdfCcsReader.Close(); } if (usePulseFile) { hdfPlsReader.Close(); } } // Done loading movies. cmpReader.Close(); cerr << "[INFO] " << GetTimestamp() << " [" << program << "] ended." 
<< endl; } blasr-8e668beae0dda1da6914586fb458182c6c3c7482/utils/PulseToFasta.cpp000066400000000000000000000306121260737656700241000ustar00rootroot00000000000000#include #include #include #include "HDFPlsReader.hpp" #include "HDFUtils.hpp" #include "HDFRegionTableReader.hpp" #include "reads/RegionTable.hpp" #include "reads/ReadInterval.hpp" #include "files/ReaderAgglomerate.hpp" #include "utils/FileOfFileNames.hpp" #include "utils/RegionUtils.hpp" #include "utils/TimeUtils.hpp" #include "SMRTSequence.hpp" #include "utils.hpp" #include "CommandLineParser.hpp" using namespace std; char VERSION[] = "v1.0.0"; char PERFORCE_VERSION_STRING[] = "$Change: 126414 $"; int main(int argc, char* argv[]) { string program = "pls2fasta"; string versionString = VERSION; AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString); string plsFileName, fastaOutName; vector plsFileNames; bool trimByRegion, maskByRegion; trimByRegion = false; maskByRegion = false; int argi = 3; RegionTable regionTable; string regionsFOFNName = ""; vector regionFileNames; bool splitSubreads = true; int minSubreadLength = 0; bool addSimulatedData = false; bool printSimulatedCoordinate = false; bool printSimulatedSequenceIndex = false; bool printFastq = false; bool printCcs = false; int lineLength = 50; int minReadScore = 0; vector holeNumbers; CommandLineParser clp; bool printOnlyBest = false; clp.SetProgramName(program); clp.SetVersion(versionString); clp.RegisterStringOption("in.bax.h5", &plsFileName, "Input plx.h5/bax.h5/fofn file.", true); clp.RegisterStringOption("out.fasta", &fastaOutName, "Output fasta/fastq file.", true); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterFlagOption("trimByRegion", &trimByRegion, "Trim away low quality regions."); clp.RegisterFlagOption("maskByRegion", &maskByRegion, "Mask low quality regions with 'N'."); clp.RegisterStringOption("regionTable", ®ionsFOFNName, "Optional HDF file with a /PulseData/Regions dataset."); clp.RegisterIntOption("minSubreadLength", 
&minSubreadLength, "Do not write subreads less than the specified length.", CommandLineParser::PositiveInteger); clp.RegisterFlagOption("noSplitSubreads", &splitSubreads, "Do not split reads on adapter sequences."); clp.RegisterIntListOption("holeNumber", &holeNumbers, "Only print this hole number (or list of numbers)."); clp.RegisterFlagOption("fastq", &printFastq, "Print in FASTQ format with quality."); clp.RegisterFlagOption("ccs", &printCcs, "Print de novo CCS sequences"); clp.RegisterIntOption("lineLength", &lineLength, "Specify fasta/fastq line length", CommandLineParser::PositiveInteger); clp.RegisterIntOption("minReadScore", &minReadScore, "Minimum read score to print a read. The score is " "a number between 0 and 1000 and represents the expected accuracy percentage * 10. " "A typical value would be between 750 and 800. This does not apply to ccs reads.", CommandLineParser::NonNegativeInteger); clp.RegisterFlagOption("best", &printOnlyBest, "If a CCS sequence exists, print this. Otherwise, print the longest" "subread. This does not support fastq."); string description = ("Converts plx.h5/bax.h5/fofn files to fasta or fastq files. Although fasta files are provided" " with every run, they are not trimmed nor split into subreads. This program takes " "additional annotation information, such as the subread coordinates and high quality regions " "and uses them to create fasta sequences that are substrings of all bases called. Most of the time " "you will want to trim low quality reads, so you should specify -trimByRegion."); clp.SetProgramSummary(description); clp.ParseCommandLine(argc, argv); cerr << "[INFO] " << GetTimestamp() << " [" << program << "] started." << endl; if (trimByRegion and maskByRegion) { cout << "ERROR! You cannot both trim and mask regions. Use one or the other." << endl; exit(1); } if (printFastq) { // Setting lineLength to 0 flags to print on one line. 
lineLength = 0; } FileOfFileNames::StoreFileOrFileList(plsFileName, plsFileNames); if (regionsFOFNName == "") { regionFileNames = plsFileNames; } else { FileOfFileNames::StoreFileOrFileList(regionsFOFNName, regionFileNames); } ofstream fastaOut; CrucialOpen(fastaOutName, fastaOut); int plsFileIndex; HDFRegionTableReader hdfRegionReader; sort(holeNumbers.begin(), holeNumbers.end()); vector pls2rgn = MapPls2Rgn(plsFileNames, regionFileNames); for (plsFileIndex = 0; plsFileIndex < plsFileNames.size(); plsFileIndex++) { if (trimByRegion or maskByRegion or splitSubreads) { hdfRegionReader.Initialize(regionFileNames[pls2rgn[plsFileIndex]]); hdfRegionReader.ReadTable(regionTable); } ReaderAgglomerate reader; HDFBasReader ccsReader; if (printOnlyBest) { ccsReader.SetReadBasesFromCCS(); ccsReader.Initialize(plsFileNames[plsFileIndex]); } if (printCcs == false) { reader.IgnoreCCS(); } else { reader.hdfBasReader.SetReadBasesFromCCS(); } if (addSimulatedData) { reader.hdfBasReader.IncludeField("SimulatedCoordinate"); reader.hdfBasReader.IncludeField("SimulatedSequenceIndex"); } if (reader.SetReadFileName(plsFileNames[plsFileIndex]) == 0) { cout << "ERROR, could not determine file type." 
<< plsFileNames[plsFileIndex] << endl; exit(1); } if (reader.Initialize() == 0) { cout << "ERROR, could not initialize file " << plsFileNames[plsFileIndex] << endl; exit(1); } DNALength simulatedCoordinate; DNALength simulatedSequenceIndex; reader.SkipReadQuality(); SMRTSequence seq; vector subreadIntervals;; SMRTSequence ccsSeq; while (reader.GetNextBases(seq, printFastq)) { if (printOnlyBest) { ccsReader.GetNext(ccsSeq); } if (holeNumbers.size() != 0 and binary_search(holeNumbers.begin(), holeNumbers.end(), seq.zmwData.holeNumber) == false) { continue; } if (seq.length == 0) { continue; } if (addSimulatedData) { reader.hdfBasReader.simulatedCoordinateArray.Read(reader.hdfBasReader.curRead-1, reader.hdfBasReader.curRead, &simulatedCoordinate); reader.hdfBasReader.simulatedSequenceIndexArray.Read(reader.hdfBasReader.curRead-1, reader.hdfBasReader.curRead, &simulatedSequenceIndex); } if (printCcs == true) { if (printFastq == false) { seq.PrintSeq(fastaOut); } else { seq.PrintFastq(fastaOut, lineLength); } continue; } // // Determine the high quality boundaries of the read. This is // the full read is no hq regions exist, or it is stated to // ignore regions. // DNALength hqReadStart, hqReadEnd; int hqRegionScore; if (GetReadTrimCoordinates(seq, seq.zmwData, regionTable, hqReadStart, hqReadEnd, hqRegionScore) == false or (trimByRegion == false and maskByRegion == false)) { hqReadStart = 0; hqReadEnd = seq.length; } // // Mask off the low quality portions of the reads. // if (maskByRegion) { if (hqReadStart > 0) { fill(&seq.seq[0], &seq.seq[hqReadStart], 'N'); } if (hqReadEnd != seq.length) { fill(&seq.seq[hqReadEnd], &seq.seq[seq.length], 'N'); } } // // Now possibly print the full read with masking. This could be handled by making a // if (splitSubreads == false) { ReadInterval wholeRead(0, seq.length); // The set of subread intervals is just the entire read. 
subreadIntervals.clear(); subreadIntervals.push_back(wholeRead); } else { // // Print subread coordinates no matter whether or not reads have subreads. // if (regionTable.HasHoleNumber(seq.HoleNumber())) { subreadIntervals = regionTable[seq.HoleNumber()].SubreadIntervals(seq.length, false, true); } else { subreadIntervals = {}; } } // // Output all subreads as separate sequences. // int intvIndex; SMRTSequence bestSubreadSequence; int bestSubreadScore = -1; int bestSubreadIndex = 0; int bestSubreadStart = 0, bestSubreadEnd = 0; SMRTSequence bestSubread; for (intvIndex = 0; intvIndex < subreadIntervals.size(); intvIndex++) { SMRTSequence subreadSequence, subreadSequenceRC; subreadSequence.SubreadStart(subreadIntervals[intvIndex].start); subreadSequence.SubreadEnd (subreadIntervals[intvIndex].end); // // When trimming by region, only output the parts of the // subread that overlap the hq region. // if (trimByRegion == true) { subreadSequence.SubreadStart(max((DNALength) subreadIntervals[intvIndex].start, hqReadStart)); subreadSequence.SubreadEnd ( min((DNALength) subreadIntervals[intvIndex].end, hqReadEnd)); } if (subreadSequence.SubreadStart() >= subreadSequence.SubreadEnd() or subreadSequence.SubreadEnd() - subreadSequence.SubreadStart() <= minSubreadLength) { // // There is no high qualty portion of this subread. Skip it. // continue; } if (hqRegionScore < minReadScore) { continue; } // // Print the subread, adding the coordinates as part of the title. // subreadSequence.ReferenceSubstring(seq, subreadSequence.SubreadStart(), subreadSequence.SubreadLength()); stringstream titleStream; titleStream << seq.title; if (splitSubreads) { // // Add the subread coordinates if splitting on subread. // titleStream << "/" << subreadSequence.SubreadStart() << "_" << subreadSequence.SubreadEnd(); } // // If running on simulated data, add where the values were simulated from. 
// if (addSimulatedData) { titleStream << ((FASTASequence*)&seq)->title << "/chrIndex_" << simulatedSequenceIndex << "/position_"<< simulatedCoordinate; ((FASTASequence*)&seq)->CopyTitle(titleStream.str()); } subreadSequence.CopyTitle(titleStream.str()); // // Eventually replace with WriterAgglomerate. // if (printOnlyBest == false) { if (subreadSequence.length > 0) { if (printFastq == false) { ((FASTASequence*)&subreadSequence)->PrintSeq(fastaOut); } else { subreadSequence.PrintFastq(fastaOut, lineLength); } } } else { int subreadWeightedScore = subreadSequence.length * hqRegionScore; if (subreadWeightedScore > bestSubreadScore) { bestSubreadIndex = intvIndex; bestSubread = subreadSequence; bestSubreadScore = subreadWeightedScore; } } } if (printOnlyBest) { if (ccsSeq.length > 0) { if (printFastq == false) { ccsSeq.PrintSeq(fastaOut); } else { ccsSeq.PrintFastq(fastaOut, ccsSeq.length); } } else { if (bestSubreadScore >= 0) { if (printFastq == false) { bestSubread.PrintSeq(fastaOut); } else { bestSubread.PrintFastq(fastaOut, bestSubread.length); } bestSubread.Free(); } } ccsSeq.Free(); } seq.Free(); } reader.Close(); hdfRegionReader.Close(); } cerr << "[INFO] " << GetTimestamp() << " [" << program << "] ended." << endl; } blasr-8e668beae0dda1da6914586fb458182c6c3c7482/utils/SAWriter.cpp000066400000000000000000000163011260737656700232250ustar00rootroot00000000000000#include #include #include "suffixarray/SuffixArray.hpp" #include "FASTASequence.hpp" #include "FASTAReader.hpp" #include "NucConversion.hpp" #include "Types.h" #include "suffixarray/ssort.hpp" #include "algorithms/sorting/qsufsort.hpp" #include "algorithms/sorting/Karkkainen.hpp" #include "CompressedSequence.hpp" void PrintUsage() { cout << "usage: sawriter saOut fastaIn [fastaIn2 fastaIn3 ...] [-blt p] [-larsson] [-4bit] [-manmy] [-kar]" << endl; cout << " or sawriter fastaIn (writes to fastIn.sa)." << endl; cout << " -blt p Build a lookup table on prefixes of length 'p'. 
This speeds " << endl << " up lookups considerably (more than the LCP table), but misses matches " << endl << " less than p when searching." << endl; cout << " -4bit Read in (one) fasta file as a compressed sequence file." << endl; cout << " -larsson (default) Uses the method of Larsson and Sadakane to build the array." << endl; cout << " -mamy Uses the method of MAnber and MYers to build the array (slower than larsson, " << endl << " and produces the same result. This is mainly for double checking"< inFiles; int doBLT = 1; int bltPrefixLength = 8; int parsingOptions = 0; SAType saBuildType = larsson; int read4BitCompressed = 0; int diffCoverSize = 0; while (argi < argc) { if (strlen(argv[argi]) > 0 and argv[argi][0] == '-'){ parsingOptions = 1; } if (!parsingOptions) { inFiles.push_back(argv[argi]); } else { if (strcmp(argv[argi], "-blt") == 0) { doBLT = 1; if (argi < argc - 1) { bltPrefixLength = atoi(argv[++argi]); if (bltPrefixLength == 0) { cout << argv[argi] << " is not a valid lookup table length." << endl; exit(1); } } else { cout << "Please specify a lookup table length." << endl; exit(1); } } else if (strcmp(argv[argi], "-mamy") == 0) { saBuildType = manmy; } else if (strcmp(argv[argi], "-larsson") == 0) { saBuildType = larsson; } else if (strcmp(argv[argi], "-mcilroy") == 0) { saBuildType = mcilroy; } else if (strcmp(argv[argi], "-slow") == 0) { saBuildType = slow; } else if (strcmp(argv[argi], "-kark") == 0) { saBuildType = kark; } else if (strcmp(argv[argi], "-mafe") == 0) { saBuildType = mafe; } else if (strcmp(argv[argi], "-welter") == 0) { saBuildType = welter; } else if (strcmp(argv[argi], "-welterweight") == 0) { if (argi < argc-1) { diffCoverSize = atoi(argv[++argi]); } else { cout << "Please specify a difference cover size. Valid values are 7,32,64,111, and 2281. Larger values use less memory but may be slower." << endl; exit(1); } if ( ! 
(diffCoverSize == 7 or diffCoverSize == 32 or diffCoverSize == 64 or diffCoverSize == 111 or diffCoverSize == 2281) ) { cout << "The difference cover size must be one of 7,32,64,111, or 2281." << endl; cout << "Larger numbers use less space but are more slow." << endl; exit(1); } } else if (strcmp(argv[argi], "-4bit") == 0) { read4BitCompressed = 1; } else { PrintUsage(); cout << "ERROR, bad option: " << argv[argi] << endl; exit(1); } } ++argi; } if (inFiles.size() == 0) { // // Special use case: the input file is a fasta file. Write to that file + .sa // inFiles.push_back(saFile); saFile = saFile + ".sa"; } VectorIndex inFileIndex; FASTASequence seq; CompressedSequence compSeq; if (read4BitCompressed == 0) { for (inFileIndex = 0; inFileIndex < inFiles.size(); ++inFileIndex) { FASTAReader reader; reader.Init(inFiles[inFileIndex]); reader.SetSpacePadding(111); if (saBuildType == kark) { // // The Karkkainen sa building method requires a little extra // space at the end of the dna sequence so that counting may // be done mod 3 without adding extra logic for boundaries. // } if (inFileIndex == 0) { reader.ReadAllSequencesIntoOne(seq); reader.Close(); } else { while(reader.ConcatenateNext(seq)) { cout << "added " << seq.title << endl; } } } seq.ToThreeBit(); //seq.ToUpper(); } else { assert(inFiles.size() == 1); cout << "reading compressed sequence." << endl; compSeq.Read(inFiles[0]); seq.seq = compSeq.seq; seq.length = compSeq.length; compSeq.RemoveCompressionCounts(); cout << "done." << endl; } // // For now, do not allow creation of suffix arrays on sequences > 4G. // if (seq.length >= UINT_MAX) { cout << "ERROR, references greater than " << UINT_MAX << " bases are not supported." << endl; cout << "Consider breaking the reference into multiple files, running alignment. " << endl; cout << "against each file, and merging the result." 
<< endl; exit(1); } vector alphabet; SuffixArray > sa; // sa.InitTwoBitDNAAlphabet(alphabet); // sa.InitAsciiCharDNAAlphabet(alphabet); sa.InitThreeBitDNAAlphabet(alphabet); if (saBuildType == manmy) { sa.MMBuildSuffixArray(seq.seq, seq.length, alphabet); } else if (saBuildType == mcilroy) { sa.index = new SAIndex[seq.length+1]; DNALength i; for (i = 0; i < seq.length; i++) { sa.index[i] = seq.seq[i] + 1;} sa.index[seq.length] = 0; ssort(sa.index, NULL); for (i = 1; i < seq.length+1; i++ ){ sa.index[i-1] = sa.index[i];}; sa.length = seq.length; } else if (saBuildType == larsson) { sa.LarssonBuildSuffixArray(seq.seq, seq.length, alphabet); } else if (saBuildType == kark) { sa.index = new SAIndex[seq.length]; seq.ToThreeBit(); DNALength p; for (p = 0; p < seq.length; p++ ){ seq.seq[p]++; } KarkkainenBuildSuffixArray(seq.seq, sa.index, seq.length, 5); sa.length = seq.length; } else if (saBuildType == mafe) { // sa.MaFeBuildSuffixArray(seq.seq, seq.length); } else if (saBuildType == welter) { if (diffCoverSize == 0) { sa.LightweightBuildSuffixArray(seq.seq, seq.length); } else { sa.LightweightBuildSuffixArray(seq.seq, seq.length, diffCoverSize); } } if (doBLT) { sa.BuildLookupTable(seq.seq, seq.length, bltPrefixLength); } sa.Write(saFile); return 0; } blasr-8e668beae0dda1da6914586fb458182c6c3c7482/utils/SDPMatcher.cpp000066400000000000000000000117071260737656700234640ustar00rootroot00000000000000#include #include #include #include #include "defs.h" #include "FASTAReader.hpp" #include "FASTASequence.hpp" #include "tuples/DNATuple.hpp" #include "tuples/TupleMetrics.hpp" #include "datastructures/alignment/Path.h" #include "datastructures/alignment/Alignment.hpp" #include "algorithms/alignment/AlignmentUtils.hpp" #include "format/StickAlignmentPrinter.hpp" #include "algorithms/alignment/SWAlign.hpp" #include "algorithms/alignment/SDPAlign.hpp" /* * Performs sparse dynamic programming (SDP) between pairs of sequences as they * are given in two FASTA files, one called for 
convenience query, the other * target. k is the size of the k-mer used for the SDP algorithm. */ void PrintUsage() { cout << "usage: sdpMatcher query target k [-indelRate delta] " "[-showalign] [-printsw] [-noRefine] [-indel i] [ -local ] " "[-match m] [-sdpIndel i]" << endl; } int main(int argc, char* argv[]) { if (argc < 4) { PrintUsage(); exit(1); } string queryName, targetName; queryName = argv[1]; targetName = argv[2]; TupleMetrics tm; tm.Initialize(atoi(argv[3])); int argi = 4; float indelRate = 0.25; int indel = 3; int match = 0; int printSW = 0; int refineAlignments = 1; int showalign = 0; int fixedTarget = 0; int sdpIndel = indel; int sdpIns = 5; int sdpDel = 5; AlignmentType alignType = Global; while (argi < argc) { if (strcmp(argv[argi], "-indelRate") == 0) { ++argi; indelRate = atof(argv[argi]); } else if (strcmp(argv[argi], "-printsw") == 0) { printSW = 1; } else if (strcmp(argv[argi], "-noRefine") == 0) { refineAlignments = 0; } else if (strcmp(argv[argi], "-indel") == 0) { indel = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-sdpIndel") == 0) { sdpIndel = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-sdpIns") == 0) { sdpIns = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-sdpDel") == 0) { sdpDel = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-showalign") == 0) { showalign = 1; } else if (strcmp(argv[argi], "-local") == 0) { alignType = Local; } else if (strcmp(argv[argi], "-match") == 0) { match = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-fixedtarget") == 0) { fixedTarget = 1; } else { PrintUsage(); cout << "Bad option: " << argv[argi] << endl; exit(1); } ++argi; } FASTASequence query, target; FASTAReader queryReader, targetReader; queryReader.Init(queryName); targetReader.Init(targetName); if (match != 0) { int i; for (i = 0; i < 4; i++ ){ LocalAlignLowMutationMatrix[i][i] = match; } } int seqIndex = 0; Alignment alignment; vector scoreMat; vector pathMat; DistanceMatrixScoreFunction distScoreFn; distScoreFn.del = 
indel; distScoreFn.ins = indel; distScoreFn.InitializeScoreMatrix(SMRTDistanceMatrix); if (fixedTarget) { targetReader.GetNext(target); } cout << "qid,tid,qstart,qend,qlen,tstart,tend,tlen,score" << endl; while (queryReader.GetNext(query) and (fixedTarget or targetReader.GetNext(target))) { if (query.length == 0 or target.length == 0) continue; alignment.blocks.clear(); int alignScore; alignScore = SDPAlign(query, target, distScoreFn, tm.tupleSize, sdpIndel, sdpIndel, indelRate, alignment, alignType, refineAlignments, false, 0); if (alignScore > 0){ // in rare cases the SDP returns positive. alignScore = 0; // this makes it more like a true local alignment } if (showalign) { StickPrintAlignment(alignment, query, target, cout); } if (printSW) { MatchedAlignment swAlignment; vector scoreMat; vector pathMat; SWAlign(query, target, scoreMat, pathMat, swAlignment, distScoreFn); StickPrintAlignment(swAlignment, query, target, cout); } cout << query.GetName() << "," << target.GetName() << "," << alignment.qPos << "," << alignment.QEnd() << "," << query.length << "," << alignment.tPos << "," << alignment.TEnd() << "," << target.length << "," << alignScore << endl; ++seqIndex; } return 0; } blasr-8e668beae0dda1da6914586fb458182c6c3c7482/utils/SamFilter.cpp000066400000000000000000000504411260737656700234160ustar00rootroot00000000000000/* * ===================================================================================== * * Filename: SAMFilter.cpp * * Description: Filter SAM Hits according to * filteration criteria * minPctSimilarity, minAccuracy, * minLength, holeNumbers * and multiple-hit policy * random : a random hit * all : all hits * allbest : all hits with the best score * randombest: a random hit selected from all the hits * that have the best score * * Version: 1.0 * Created: 03/19/2013 01:19:43 PM * Revision: none * Compiler: gcc * * Author: Yuan Li (yli), yli@pacificbiosciences.com * Company: Pacific Biosciences * * 
===================================================================================== */ #include #include #include "FASTASequence.hpp" #include "FASTAReader.hpp" #include "CommandLineParser.hpp" #include "ChangeListID.hpp" #include "utils/TimeUtils.hpp" #include "utils/RangeUtils.hpp" #include "utils/SMRTReadUtils.hpp" #include "algorithms/alignment/DistanceMatrixScoreFunction.hpp" #include "algorithms/alignment/AlignmentUtils.hpp" #include "algorithms/alignment/StringToScoreMatrix.hpp" #include "sam/SAMReader.hpp" #include "format/SAMPrinter.hpp" #include "datastructures/alignment/AlignmentCandidate.hpp" #include "datastructures/alignment/FilterCriteria.hpp" #include "metagenome/TitleTable.hpp" #include "datastructures/alignment/SAMToAlignmentCandidateAdapter.hpp" #include "GFFFile.hpp" #include "defs.h" #include "RegisterFilterOptions.h" //#define USE_GOOGLE_PROFILER #ifdef USE_GOOGLE_PROFILER #include "gperftools/profiler.h" #endif char VERSION[] = "v0.1.0"; char PERFORCE_VERSION_STRING[] = "$Change: 134995 $"; // By default negative score is better. ScoreSign scoreSign = ScoreSign::NEGATIVE; // Compare SAMAlignment objects by qName, score and // target positions. bool byQNameScoreTStart(const SAMAlignment & a, const SAMAlignment & b) { if (a.qName == b.qName) { if (a.score == b.score) return a.pos < b.pos; return Score(a.score, scoreSign).WorseThan(Score(b.score, scoreSign)); } return (a.qName < b.qName); } // Compare SAMAlignment objects by rName and qName bool byRNameQName(const SAMAlignment & a, const SAMAlignment & b) { if (a.rName == b.rName) { return a.qName < b.qName; } return (a.rName < b.rName); } // Get the next group of SAM alignments that have the same qName from // allSAMAlignments[groupBegin ... groupEnd) // Note that allSAMAlignments is already sorted by qName, score and tPos. 
void GetNextSAMAlignmentGroup(vector & allSAMAlignments, unsigned int groupBegin, unsigned int & groupEnd) { assert(groupBegin < allSAMAlignments.size()); groupEnd = groupBegin + 1; string queryName = allSAMAlignments[groupBegin].qName; while(groupEnd < allSAMAlignments.size()) { if (allSAMAlignments[groupEnd].qName == queryName) groupEnd ++; else break; } } // Get the best SAM alignments whose alignment score are the best. // Assume that alignments in allSAMAlignments[groupBegin, groupEnd) // all have the same queryName and are sorted by score and tPos // asscendingly: worst, ...., best void GetBestSAMAlignmentsInGroup(vector & allSAMAlignments, const unsigned int & groupBegin, const unsigned int & groupEnd, unsigned int & bestBegin, unsigned int & bestEnd) { assert(groupEnd <= allSAMAlignments.size() and groupBegin < groupEnd); bestEnd = groupEnd; bestBegin = groupEnd - 1; int groupBestScore = allSAMAlignments[bestBegin].score; string queryName = allSAMAlignments[bestBegin].qName; while (bestBegin >= groupBegin and bestBegin < groupEnd) { assert(allSAMAlignments[bestBegin].qName == queryName); if (allSAMAlignments[bestBegin].score == groupBestScore) bestBegin -= 1; else break; } bestBegin += 1; } // Apply hit policy to a group of SAM alignments and return indices // of the selected alignments. 
vector ApplyHitPolicy(HitPolicy & hitPolicy, vector & allSAMAlignments, const unsigned int & groupBegin, const unsigned int & groupEnd) { vector hitIndices; if (hitPolicy.IsAll()) { for(unsigned int i = groupBegin; i < groupEnd; i++){ hitIndices.push_back(i); } } else if (hitPolicy.IsRandom()) { hitIndices.push_back(rand()%(groupEnd - groupBegin) + groupBegin); } else { unsigned int bestBegin, bestEnd; GetBestSAMAlignmentsInGroup(allSAMAlignments, groupBegin, groupEnd, bestBegin, bestEnd); if (hitPolicy.IsAllbest()) { for(unsigned int i = bestBegin; i < bestEnd; i++){ hitIndices.push_back(i); } } else if (hitPolicy.IsRandombest()) { hitIndices.push_back(rand()%(bestEnd-bestBegin) + bestBegin); } else if (hitPolicy.IsLeftmost()) { hitIndices.push_back(bestBegin); } else { assert(false); } } return hitIndices; } // Convert references[...].title in reference.fasta to their corresponding // indices in the title table. void ConvertTitlesToTitleTableIndices(vector & references, string & titleTableName) { TitleTable tt; tt.Read(titleTableName); for(int i = 0; i < references.size(); i++) { string title = references[i].GetTitle(); int idx = -1; if (tt.Lookup(title, idx)) { stringstream ss; ss << idx; references[i].CopyTitle(ss.str()); } else { cout << "ERROR, reference " << title << " does not exist " << " in the title table " << titleTableName << ". The " << "reference fasta and the title table do not match." << endl; exit(1); } } tt.Free(); } // Return true if the alignment can only map to an adapter specified // in the adapter GFF file. // A sample record in adapter GFF file: // ref000001 . adapter 10955 10999 0.00 + . xxxx // ref000001 . adapter 32886 32930 0.00 + . xxxx // Note that the first field (e.g., 'ref000001') is id of sequence // in a reference repository, not sequence name, so we need to // reconstruct the mapping between sequence id and sequence name. 
bool CheckAdapterOnly(GFFFile & adapterGffFile, //Adapter gff file AlignmentCandidate<> & alignment, // An alignment map & refNameToIndex) { // Map target sequence name to its index in reference repository. if (refNameToIndex.find(alignment.tName) == refNameToIndex.end()) { // This should not happen ... cout << "ERROR, could not find alignment target name " << alignment.tName << " in the reference file." << endl; exit(1); } int refNameIndex = refNameToIndex[alignment.tName]; char buf [16]; sprintf(buf, "ref%06d", refNameIndex + 1); // Reconstruct ref id in the format "ref00000?". string refNameId(buf); int FUZZY_OVERLAP = 20; for(int eindex = 0; eindex < adapterGffFile.entries.size(); eindex++) { GFFEntry & entry = adapterGffFile.entries[eindex]; // Convert each GFF record from 1-based inclusive to // 0-based exclusive. if (entry.type == "adapter" and (entry.name == alignment.tName or entry.name == refNameId)) { UInt estart = entry.start - 1; UInt eend = entry.end; if (entry.strand == '-') { UInt tmp = estart; estart = alignment.tLength - 1 - eend; eend = alignment.tLength - 1 - tmp; } if (not (eend < alignment.GenomicTBegin() or estart > alignment.GenomicTEnd())) { int lengthUnion = max(eend, alignment.GenomicTEnd()) - min(estart, alignment.GenomicTBegin()); if (lengthUnion < eend - estart + FUZZY_OVERLAP) { return true; } } } } return false; } int main(int argc, char* argv[]) { #ifdef USE_GOOGLE_PROFILER char *profileFileName = getenv("CPUPROFILE"); if (profileFileName != NULL) { ProfilerStart(profileFileName); } else { ProfilerStart("google_profile.txt"); } #endif // Register inputs and outputs. 
string samFileName, refFileName, outFileName; CommandLineParser clp; clp.RegisterStringOption("file.sam", &samFileName, "Input SAM file."); clp.RegisterStringOption("reference.fasta", &refFileName, "Reference used to generate reads."); clp.RegisterStringOption("out.sam", &outFileName, "Output SAM file."); clp.RegisterPreviousFlagsAsHidden(); // Register filter criteria options. int minAlnLength = 50; float minPctSimilarity = 70, minPctAccuracy = 70; string hitPolicyStr = "randombest"; bool useScoreCutoff = false; int scoreCutoff = INF_INT; int scoreSignInt = -1; RegisterFilterOptions(clp, minAlnLength, minPctSimilarity, minPctAccuracy, hitPolicyStr, useScoreCutoff, scoreSignInt, scoreCutoff); int seed = 1; clp.RegisterIntOption("seed", &seed, "(1) Seed for random number generator.\n" "If seed is 0, then use current time as seed.", CommandLineParser::Integer); string holeNumberStr; Ranges holeNumberRanges; clp.RegisterStringOption("holeNumbers", &holeNumberStr, "A string of comma-delimited hole number ranges to output hits, " "such as '1,2,10-12'. " "This requires hit titles to be in SMRT read title format."); bool parseSmrtTitle = false; clp.RegisterFlagOption("smrtTitle", &parseSmrtTitle, "Use this option when filtering alignments generated by " "programs other than blasr, e.g. bwa-sw or gmap. " " Parse read coordinates from the SMRT read title. " "The title is in the format /name/hole/coordinates, where" " coordinates are in the format \\d+_\\d+, and represent " "the interval of the read that was aligned."); /* This experimental option can be useful for metagenomics, in which case * there are hundreds of sequences in the target, of which many titles are * long and may contain white spaces (e.g., ' ', '\t'). 
* In order to save disc space and avoid the (possibly) none unique mapping * between full and short reference names, one may call blasr with * -titleTable option to represent all target sequences in the output * by their indices in the title table.*/ string titleTableName = ""; clp.RegisterStringOption("titleTable", &titleTableName, "Use this experimental option when filtering alignments generated by " "blasr with -titleTable titleTableName, in which case " "reference titles in SAM are represented by their " "indices (e.g., 0, 1, 2, ...) in the title table."); string adapterGffFileName = ""; clp.RegisterStringOption("filterAdapterOnly", &adapterGffFileName, "Use this option to remove reads which can only map to adapters " "specified in the GFF file."); bool verbose = false; clp.RegisterFlagOption("v", &verbose, "Be verbose."); clp.SetExamples( "Because SAM has optional tags that have different meanings" " in different programs, careful usage is required in order " "to have proper output. The \"xs\" tag in bwa-sw is used to " "show the suboptimal score, but in PacBio SAM (blasr) it is " "defined as the start in the query sequence of the alignment.\n" "When \"-smrtTitle\" is specified, the xs tag is ignored, but " "when it is not specified, the coordinates given by the xs and " "xe tags are used to define the interval of a read that is " "aligned. The CIGAR string is relative to this interval."); clp.ParseCommandLine(argc, argv); // Set random number seed. if (seed == 0) { srand(time(NULL)); } else { srand(seed); } scoreSign = (scoreSignInt == -1)?ScoreSign::NEGATIVE:ScoreSign::POSITIVE; Score s(static_cast(scoreCutoff), scoreSign); FilterCriteria filterCriteria(minAlnLength, minPctSimilarity, minPctAccuracy, true, s); filterCriteria.Verbose(verbose); HitPolicy hitPolicy(hitPolicyStr, scoreSign); string errMsg; if (not filterCriteria.MakeSane(errMsg)) { cout << errMsg << endl; exit(1); } // Parse hole number ranges. 
if (holeNumberStr.size() != 0) { if (not holeNumberRanges.setRanges(holeNumberStr)) { cout << "Could not parse hole number ranges: " << holeNumberStr << "." << endl; exit(1); } } // Open output file. ostream * outFilePtr = &cout; ofstream outFileStrm; if (outFileName != "") { CrucialOpen(outFileName, outFileStrm, std::ios::out); outFilePtr = &outFileStrm; } GFFFile adapterGffFile; if (adapterGffFileName != "") adapterGffFile.ReadAll(adapterGffFileName); SAMReader samReader; FASTAReader fastaReader; // // Initialize samReader and fastaReader. // samReader.Initialize(samFileName); fastaReader.Initialize(refFileName); // // Configure the file log. // string command; CommandLineParser::CommandLineToString(argc, argv, command); string log = "Filter sam hits."; string program = "samFilter"; string versionString = VERSION; AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString); // // Read necessary input. // vector references; fastaReader.ReadAllSequences(references); // If the SAM file is generated by blasr with -titleTable, // then references in the SAM are represented by // their corresponding indices in the title table. // In that case, we need to convert reference titles in fasta file // to their corresponding indices in the title table, such that // references in both SAM and fasta files are represented // by title table indices and therefore can match. if (titleTableName != "") { ConvertTitlesToTitleTableIndices(references, titleTableName); } AlignmentSet alignmentSet; vector allHeaders = samReader.ReadHeader(alignmentSet); // Process SAM Header. string commandLineString; clp.CommandLineToString(argc, argv, commandLineString); allHeaders.push_back("@PG\tID:SAMFILTER\tVN:" + versionString + \ "\tCL:" + program + " " + commandLineString); for (int i = 0; i < allHeaders.size(); i++) { outFileStrm << allHeaders[i] << endl; } // // The order of references in vector references and // AlignmentSet<, , >alignmentSet.references can be different. 
// Rearrange alignmentSet.references such that they are ordered in // exactly the same way as vector references. // alignmentSet.RearrangeReferences(references); // Map reference name obtained from SAM file to indices map refNameToIndex; for (int i = 0; i < references.size(); i++) { string refName = alignmentSet.references[i].GetSequenceName(); refNameToIndex[refName] = i; } // // Store the alignments. // SAMAlignment samAlignment; int alignIndex = 0; // // For 150K, each chip produces about 300M sequences // (not including quality values and etc.). // Let's assume that the sam file and reference data can // fit in the memory. // Need to scale for larger sequal data in the future. // vector allSAMAlignments; while (samReader.GetNextAlignment(samAlignment)) { if (samAlignment.rName == "*") { continue; } if (parseSmrtTitle and holeNumberStr.size() != 0) { string movieName; int thisHoleNumber; if (not ParsePBIReadName(samAlignment.qName, movieName, thisHoleNumber)) { cout << "ERROR, could not parse SMRT title: " << samAlignment.qName << "." << endl; exit(1); } if (not holeNumberRanges.contains(UInt(thisHoleNumber))) { if (verbose) cout << thisHoleNumber << " is not in range." << endl; continue; } } if (samAlignment.cigar.find('P') != string::npos) { cout << "WARNING. Could not process SAM record with 'P' in " << "its cigar string." << endl; continue; } vector > convertedAlignments; SAMAlignmentsToCandidates(samAlignment, references, refNameToIndex, convertedAlignments, parseSmrtTitle, false); if (convertedAlignments.size() > 1) { cout << "WARNING. Ignore multiple segments." << endl; continue; } for (int i = 0; i < 1; i++) { AlignmentCandidate<> & alignment = convertedAlignments[i]; //score func does not matter DistanceMatrixScoreFunction distFunc; ComputeAlignmentStats(alignment, alignment.qAlignedSeq.seq, alignment.tAlignedSeq.seq, distFunc); // Check whether this alignment can only map to adapters in // the adapter GFF file. 
if (adapterGffFileName != "" and CheckAdapterOnly(adapterGffFile, alignment, refNameToIndex)) { if (verbose) cout << alignment.qName << " filter adapter only." << endl; continue; } // Assign score to samAlignment. samAlignment.score = samAlignment.as; if (not filterCriteria.Satisfy(static_cast *>(&alignment))) { continue; } allSAMAlignments.push_back( samAlignment ); alignment.FreeSubsequences(); } ++alignIndex; } // Sort all SAM alignments by qName, score and target position. sort(allSAMAlignments.begin(), allSAMAlignments.end(), byQNameScoreTStart); unsigned int groupBegin = 0; unsigned int groupEnd = -1; vector filteredSAMAlignments; while(groupBegin < allSAMAlignments.size()) { // Get the next group of SAM alignments which have the same qName // from allSAMAlignments[groupBegin ... groupEnd) GetNextSAMAlignmentGroup(allSAMAlignments, groupBegin, groupEnd); vector hitIndices = ApplyHitPolicy( hitPolicy, allSAMAlignments, groupBegin, groupEnd); for(unsigned int i = 0; i < hitIndices.size(); i++) { filteredSAMAlignments.push_back(allSAMAlignments[hitIndices[i]]); } groupBegin = groupEnd; } // Sort all SAM alignments by reference name and query name sort(filteredSAMAlignments.begin(), filteredSAMAlignments.end(), byRNameQName); for(unsigned int i = 0; i < filteredSAMAlignments.size(); i++) { filteredSAMAlignments[i].PrintSAMAlignment(outFileStrm); } if (outFileName != "") { outFileStrm.close(); } #ifdef USE_GOOGLE_PROFILER ProfilerStop(); #endif return 0; } blasr-8e668beae0dda1da6914586fb458182c6c3c7482/utils/SamToCmpH5.cpp000066400000000000000000000200771260737656700234120ustar00rootroot00000000000000#include #include "datastructures/alignment/AlignmentCandidate.hpp" #include "sam/SAMReader.hpp" #include "format/StickAlignmentPrinter.hpp" #include "HDFCmpFile.hpp" #include "FASTASequence.hpp" #include "FASTAReader.hpp" #include "CommandLineParser.hpp" #include "datastructures/alignmentset/AlignmentSetToCmpH5Adapter.hpp" #include 
"datastructures/alignment/SAMToAlignmentCandidateAdapter.hpp" #include "ChangeListID.hpp" #include "utils/TimeUtils.hpp" char VERSION[] = "v1.0.0"; char PERFORCE_VERSION_STRING[] = "$Change: 141782 $"; int main(int argc, char* argv[]) { string program = "samtoh5"; string versionString = VERSION; AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString); string samFileName, cmpFileName, refFileName; bool parseSmrtTitle = false; bool useShortRefName = false; bool copyQVs = false; CommandLineParser clp; string readType = "standard"; int verbosity = 0; clp.SetProgramName(program); clp.SetProgramSummary("Converts in.sam file to out.cmp.h5 file."); clp.SetVersion(versionString); clp.RegisterStringOption("in.sam", &samFileName, "Input SAM file.", true); clp.RegisterStringOption("reference.fasta", &refFileName, "Reference used to generate reads.", true); clp.RegisterStringOption("out.cmp.h5", &cmpFileName, "Output cmp.h5 file.", true); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterFlagOption("smrtTitle", &parseSmrtTitle, "Use this option when converting alignments " "generated from reads produced by the " "pls2fasta from bas.h5 files by parsing read " "coordinates from the SMRT read title. The title " "is in the format /name/hole/coordinates, where " "coordinates are in the format \\d+_\\d+, and " "represent the interval of the read that was " "aligned."); clp.RegisterStringOption("readType", &readType, "Set the read type: 'standard', 'strobe', 'CCS', " "or 'cDNA'"); clp.RegisterIntOption("verbosity", &verbosity, "Set desired verbosity.", CommandLineParser::PositiveInteger); clp.RegisterFlagOption("useShortRefName", &useShortRefName, "Use abbreviated reference names obtained " "from file.sam instead of using full names " "from reference.fasta."); clp.RegisterFlagOption("copyQVs", ©QVs, "Copy all QVs available in the SAM file into the " "cmp.h5 file. 
This includes things like InsertionQV " "and DeletionTag."); string description = ("Because SAM has optional tags that have different " "meanings in different programs, careful usage is required in order to " "have proper output. The \"xs\" tag in bwa-sw is used to show the " "suboptimal score, but in PacBio SAM (blasr) it is defined as the start " "in the query sequence of the alignment.\nWhen \"-smrtTitle\" is " "specified, the xs tag is ignored, but when it is not specified, the " "coordinates given by the xs and xe tags are used to define the interval " "of a read that is aligned. The CIGAR string is relative to this interval."); clp.SetExamples(description); clp.ParseCommandLine(argc, argv); if (readType != "standard" and readType != "strobe" and readType != "cDNA" and readType != "CCS") { cout << "ERROR. Read type '" << readType << "' must be one of either 'standard', 'strobe', 'cDNA' or 'CCS'." << endl; exit(1); } cerr << "[INFO] " << GetTimestamp() << " [" << program << "] started." << endl; SAMReader samReader; FASTAReader fastaReader; HDFCmpFile > cmpFile; // // Initialize input/output files. // samReader.Initialize(samFileName); fastaReader.Initialize(refFileName); cmpFile.Create(cmpFileName); // // Configure the file log. // string command; CommandLineParser::CommandLineToString(argc, argv, command); string log = "Convert sam to cmp.h5"; cmpFile.fileLogGroup.AddEntry(command, log, program, GetTimestamp(), versionString); // // Set the readType // cmpFile.SetReadType(readType); // // Read necessary input. // vector references; fastaReader.ReadAllSequences(references); // // This should probably be handled by the alignmentSetAdapter, but // time constraints... // AlignmentSet alignmentSet; samReader.ReadHeader(alignmentSet); // // The order of references in vector references and // AlignmentSet<, , >alignmentSet.references can be different. // Rearrange alignmentSet.references such that it is ordered in // exactly the same way as vector references. 
// alignmentSet.RearrangeReferences(references); // // Always recompute the MD5 values even if they exist in the input // sam file. Because MD5 is defined differently in sam and cmp.h5 files. // The SAM convention uppercases and normalizes before computing the MD5. // For cmp.h5, we compute the MD5 on the sequence 'as is'. // for(int i = 0; i < alignmentSet.references.size(); i++) { MakeMD5((const char*)&references[i].seq[0], (unsigned int)references[i].length, alignmentSet.references[i].md5); } // // Map short names for references obtained from file.sam to full names obtained from reference.fasta // map shortRefNameToFull; map::iterator it; assert(references.size() == alignmentSet.references.size()); if (!useShortRefName) { for (int i = 0; i < references.size(); i++) { string shortRefName = alignmentSet.references[i].GetSequenceName(); string fullRefName(references[i].title); if (shortRefNameToFull.find(shortRefName) != shortRefNameToFull.end()) { cout << "ERROR, Found more than one reference " << shortRefName << "in sam header" << endl; exit(1); } shortRefNameToFull[shortRefName] = fullRefName; alignmentSet.references[i].sequenceName = fullRefName; } } // // Start setting up the cmp.h5 file. // AlignmentSetToCmpH5Adapter > > alignmentSetAdapter; alignmentSetAdapter.Initialize(); alignmentSetAdapter.StoreReferenceInfo(alignmentSet.references, cmpFile); // // Store the alignments. // SAMAlignment samAlignment; while (samReader.GetNextAlignment(samAlignment)) { if (samAlignment.rName == "*") { continue; } if (!useShortRefName) { //convert shortRefName to fullRefName it = shortRefNameToFull.find(samAlignment.rName); if (it == shortRefNameToFull.end()) { cout << "ERROR, Could not find " << samAlignment.rName << " in the reference repository." 
<< endl; exit(1); } samAlignment.rName = (*it).second; } vector > convertedAlignments; if (verbosity > 0) { cout << "Storing alignment for " << samAlignment.qName << endl; } SAMAlignmentsToCandidates(samAlignment, // Order of references and alignmentSetAdapter.RefInfoGroup // should be exactly the same. references, alignmentSetAdapter.refNameToRefInfoIndex, convertedAlignments, parseSmrtTitle, false, copyQVs); // -1: moleculeID will be computed dynamically. // o.w., the value will be assigned as moleculeID. alignmentSetAdapter.StoreAlignmentCandidateList(convertedAlignments, cmpFile, -1, copyQVs); int a; for (a = 0; a < convertedAlignments.size(); a++) { convertedAlignments[a].FreeSubsequences(); } } cerr << "[INFO] " << GetTimestamp() << " [" << program << "] ended." << endl; return 0; } blasr-8e668beae0dda1da6914586fb458182c6c3c7482/utils/SamToM4.cpp000066400000000000000000000175651260737656700227660ustar00rootroot00000000000000/* * ===================================================================================== * * Filename: SamToM4.cpp * * Description: Convert a sam file to a blasr m4 file. 
* * Version: 1.0 * Created: 04/03/2013 01:19:43 PM * Revision: none * Compiler: gcc * * Author: Yuan Li (yli), yli@pacificbiosciences.com * Company: Pacific Biosciences * * ===================================================================================== */ #include #include "FASTASequence.hpp" #include "FASTAReader.hpp" #include "CommandLineParser.hpp" #include "ChangeListID.hpp" #include "algorithms/alignment/DistanceMatrixScoreFunction.hpp" #include "algorithms/alignment/AlignmentUtils.hpp" #include "sam/SAMReader.hpp" #include "format/IntervalPrinter.hpp" #include "datastructures/alignment/AlignmentCandidate.hpp" #include "datastructures/alignment/SAMToAlignmentCandidateAdapter.hpp" char VERSION[] = "v0.1.0"; char PERFORCE_VERSION_STRING[] = "$Change: 126414 $"; int main(int argc, char* argv[]) { string program = "samtom4"; string versionString = VERSION; AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString); string samFileName, refFileName, outFileName; bool printHeader = false; bool parseSmrtTitle = false; bool useShortRefName = false; CommandLineParser clp; clp.SetProgramName(program); clp.SetVersion(versionString); clp.SetProgramSummary("Converts a SAM file generated by blasr to M4 format."); clp.RegisterStringOption("in.sam", &samFileName, "Input SAM file, which is produced by blasr."); clp.RegisterStringOption("reference.fasta", &refFileName, "Reference used to generate file.sam."); clp.RegisterStringOption("out.m4", &outFileName, "Output in blasr M4 format."); clp.RegisterPreviousFlagsAsHidden(); clp.RegisterFlagOption("header", &printHeader, "Print M4 header."); clp.RegisterFlagOption("useShortRefName", &useShortRefName, "Use abbreviated reference names obtained " "from file.sam instead of using full names " "from reference.fasta."); //clp.SetExamples(program + " file.sam reference.fasta out.m4"); clp.ParseCommandLine(argc, argv); ostream * outFilePtr = &cout; ofstream outFileStrm; if (outFileName != "") { CrucialOpen(outFileName, 
outFileStrm, std::ios::out); outFilePtr = &outFileStrm; } SAMReader samReader; FASTAReader fastaReader; // // Initialize samReader and fastaReader. // samReader.Initialize(samFileName); fastaReader.Initialize(refFileName); // // Configure the file log. // string command; CommandLineParser::CommandLineToString(argc, argv, command); // // Read necessary input. // vector references; fastaReader.ReadAllSequences(references); AlignmentSet alignmentSet; samReader.ReadHeader(alignmentSet); // // The order of references in vector references and // AlignmentSet<, , >alignmentSet.references can be different. // Rearrange alignmentSet.references such that it is ordered in // exactly the same way as vector references. // alignmentSet.RearrangeReferences(references); // // Map short names for references obtained from file.sam to // full names obtained from reference.fasta // map shortRefNameToFull; map::iterator it; assert(references.size() == alignmentSet.references.size()); if (!useShortRefName) { for (int i = 0; i < references.size(); i++) { string shortRefName = alignmentSet.references[i].GetSequenceName(); string fullRefName(references[i].title); if (shortRefNameToFull.find(shortRefName) != shortRefNameToFull.end()) { cout << "ERROR, Found more than one reference " << shortRefName << "in sam header" << endl; exit(1); } shortRefNameToFull[shortRefName] = fullRefName; alignmentSet.references[i].sequenceName = fullRefName; } } // Map reference name obtained from SAM file to indices map refNameToIndex; for (int i = 0; i < references.size(); i++) { string refName = alignmentSet.references[i].GetSequenceName(); refNameToIndex[refName] = i; } // // Store the alignments. // SAMAlignment samAlignment; int alignIndex = 0; // // For 150K, each chip produces about 300M sequences // (not including quality values and etc.). // Let's assume that the sam file and reference data can // fit in the memory. // Need to scale for larger sequal data in the future. 
// if (printHeader) IntervalOutput::PrintHeader(*outFilePtr); // The socre matrix does not matter because we will use the // aligner's score from SAM file anyway. DistanceMatrixScoreFunction distScoreFn; while (samReader.GetNextAlignment(samAlignment)) { if (samAlignment.rName == "*") { continue; } if (!useShortRefName) { //convert shortRefName to fullRefName it = shortRefNameToFull.find(samAlignment.rName); if (it == shortRefNameToFull.end()) { cout << "ERROR, Could not find " << samAlignment.rName << " in the reference repository." << endl; exit(1); } samAlignment.rName = (*it).second; } // The padding character 'P' is not supported if (samAlignment.cigar.find('P') != string::npos) { cout << "WARNING. Could not process sam record with 'P' in its cigar string." << endl; continue; } vector > convertedAlignments; // // Keep reference as forward. // So if IsReverseComplement(sam.flag)==true, then qStrand is reverse // and tStrand is forward. // bool keepRefAsForward = false; SAMAlignmentsToCandidates(samAlignment, references, refNameToIndex, convertedAlignments, parseSmrtTitle, keepRefAsForward); if (convertedAlignments.size() > 1) { cout << "WARNING. Ignore an alignment which has multiple segments." << endl; continue; } //all alignments are unique single-ended alignments. for (int i = 0; i < 1; i++) { AlignmentCandidate<> & alignment = convertedAlignments[i]; ComputeAlignmentStats(alignment, alignment.qAlignedSeq.seq, alignment.tAlignedSeq.seq, distScoreFn); // Use aligner's score from SAM file anyway. alignment.score = samAlignment.as; alignment.mapQV = samAlignment.mapQV; // Since SAM only has the aligned sequence, many info of the // original query (e.g. the full length) is missing. // Overwrite alignment.qLength (which is length of the query // in the SAM alignment) with xq (which is the length of the // original query sequence saved by blasr) right before printing // the output so that one can reconstruct a blasr m4 record from // a blasr sam alignment. 
if (samAlignment.xq!=0) alignment.qLength = samAlignment.xq; IntervalOutput::PrintFromSAM(alignment, *outFilePtr); alignment.FreeSubsequences(); } ++alignIndex; } if (outFileName != "") { outFileStrm.close(); } return 0; } blasr-8e668beae0dda1da6914586fb458182c6c3c7482/utils/ToAfg.cpp000066400000000000000000000142361260737656700225320ustar00rootroot00000000000000#include #include #include #include #include "HDFPlsReader.hpp" #include "amos/AfgBasWriter.hpp" #include "HDFRegionTableReader.hpp" #include "reads/RegionTable.hpp" #include "reads/ReadInterval.hpp" #include "files/ReaderAgglomerate.hpp" #include "utils/FileOfFileNames.hpp" #include "utils/RegionUtils.hpp" #include "SMRTSequence.hpp" #include "utils.hpp" using namespace std; void PrintUsage() { cout << "usage: toAfg input.filetype output.filetype" << endl << " [-minSubreadLength l] " << endl << " [-regionTable regions_file] " << endl << " [-noSplitSubreads]" << endl << " [-useccsdenovo]" << endl << " [-uniformQV QV]" << endl << "Print reads stored in a file (pls|fasta|fastq) as an afg." << endl; } int main(int argc, char* argv[]) { string inputFileName, outputFileName; if (argc < 2) { PrintUsage(); exit(1); } vector inputFileNames; inputFileName = argv[1]; outputFileName = argv[2]; int argi = 3; RegionTable regionTable; string regionsFOFNName = ""; vector regionFileNames; bool splitSubreads = true; bool useCCS = false; bool useUniformQV = false; int uniformQV = 7; int minSubreadLength = 1; while (argi < argc) { if (strcmp(argv[argi], "-regionTable") == 0) { regionsFOFNName = argv[++argi]; } else if (strcmp(argv[argi], "-noSplitSubreads") == 0) { splitSubreads = false; } else if (strcmp(argv[argi], "-minSubreadLength") == 0) { minSubreadLength = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-useccsdenovo") == 0) { useCCS = true; } else if (strcmp(argv[argi], "-uniformQV") == 0) { useUniformQV = true; uniformQV = atoi(argv[++argi]); } else { PrintUsage(); cout << "ERROR! 
Option " << argv[argi] << " is not supported." << endl; } argi++; } if (FileOfFileNames::IsFOFN(inputFileName)) { FileOfFileNames::FOFNToList(inputFileName, inputFileNames); } else { inputFileNames.push_back(inputFileName); } if (regionsFOFNName == "") { regionFileNames = inputFileNames; } else { if (FileOfFileNames::IsFOFN(regionsFOFNName)) { FileOfFileNames::FOFNToList(regionsFOFNName, regionFileNames); } else { regionFileNames.push_back(regionsFOFNName); } } ofstream fastaOut; CrucialOpen(outputFileName, fastaOut); int plsFileIndex; HDFRegionTableReader hdfRegionReader; AfgBasWriter afgWriter; if (useUniformQV){ afgWriter.SetDefaultQuality(uniformQV); } afgWriter.Initialize(outputFileName); for (plsFileIndex = 0; plsFileIndex < inputFileNames.size(); plsFileIndex++) { if (splitSubreads) { hdfRegionReader.Initialize(regionFileNames[plsFileIndex]); hdfRegionReader.ReadTable(regionTable); } ReaderAgglomerate reader; // reader.SkipReadQuality(); // should have been taken care of by *Filter modules if (useCCS){ reader.UseCCS(); } else { reader.IgnoreCCS(); } reader.Initialize(inputFileNames[plsFileIndex]); CCSSequence seq; int seqIndex = 0; int numRecords = 0; vector subreadIntervals; while (reader.GetNext(seq)){ ++seqIndex; if (useUniformQV && seq.qual.data != NULL){ for (int qvIndex = 0; qvIndex < seq.length; qvIndex++){ seq.qual[qvIndex] = uniformQV; } } if (splitSubreads == false) { if (seq.length >= minSubreadLength) { afgWriter.Write(seq); } seq.Free(); continue; } DNALength hqReadStart, hqReadEnd; int score; GetReadTrimCoordinates(seq, seq.zmwData, regionTable, hqReadStart, hqReadEnd, score); if (regionTable.HasHoleNumber(seq.HoleNumber())) { subreadIntervals = regionTable[seq.HoleNumber()].SubreadIntervals(seq.length, true, true); } else { subreadIntervals = {}; } if (seq.length == 0 and subreadIntervals.size() > 0) { cout << "WARNING! A high quality interval region exists for a read of length 0." < hqReadStart ? 
subreadIntervals[intvIndex].start : hqReadStart; int subreadEnd = subreadIntervals[intvIndex].end < hqReadEnd ? subreadIntervals[intvIndex].end : hqReadEnd; int subreadLength = subreadEnd - subreadStart; if (subreadLength < minSubreadLength) continue; subreadSequence.SubreadStart(subreadStart); subreadSequence.SubreadEnd (subreadEnd); subreadSequence.ReferenceSubstring(seq, subreadStart, subreadLength); stringstream titleStream; titleStream << seq.title << "/" << subreadIntervals[intvIndex].start << "_" << subreadIntervals[intvIndex].end; subreadSequence.CopyTitle(titleStream.str()); afgWriter.Write(subreadSequence); } seq.Free(); } reader.Close(); hdfRegionReader.Close(); } } blasr-8e668beae0dda1da6914586fb458182c6c3c7482/utils/ctest/000077500000000000000000000000001260737656700221425ustar00rootroot00000000000000blasr-8e668beae0dda1da6914586fb458182c6c3c7482/utils/ctest/loadPulses.t000066400000000000000000000157611260737656700244540ustar00rootroot00000000000000Set up directories $ . $TESTDIR/setup.sh Set up the executable: loadPulses. $ EXEC=$TESTDIR/../loadPulses #Test loadPulses: input is a pls.h5 file #Test -byread and -bymetric $ PLS_IN=$DATDIR/ecoli_lp.fofn $ CMP_IN_2=$DATDIR/ecoli_lp_tiny.cmp.h5 $ CMP_STDOUT_2=$STDDIR/ecoli_lp_tiny.cmp.h5 $ CMP_OUT_byread_2=$OUTDIR/ecoli_lp_tiny.byread.cmp.h5 $ CMP_OUT_bymetric_2=$OUTDIR/ecoli_lp_tiny.bymetric.cmp.h5 $ METRICS=StartFrame,PulseWidth,WidthInFrames,pkmid,IPD,Light $ rm -f $CMP_OUT_byread_2 $ cp $CMP_IN_2 $CMP_OUT_byread_2 $ $EXEC $PLS_IN $CMP_OUT_byread_2 -metrics $METRICS -byread [INFO] * [loadPulses] started. (glob) loading 2 alignments for movie 1 loading 2 alignments for movie 2 [INFO] * [loadPulses] ended. (glob) $ h5diff -c $CMP_OUT_byread_2 $CMP_STDOUT_2 dataset: and \d+ differences found (re) dataset: and \d+ differences found (re) [1] $ rm -f $CMP_OUT_bymetric_2 $ cp $CMP_IN_2 $CMP_OUT_bymetric_2 $ $EXEC $PLS_IN $CMP_OUT_bymetric_2 -metrics $METRICS -bymetric [INFO] * [loadPulses] started. 
(glob) loading 2 alignments for movie 1 loading 2 alignments for movie 2 [INFO] * [loadPulses] ended. (glob) $ h5diff -c $CMP_OUT_bymetric_2 $CMP_STDOUT_2 dataset: and \d+ differences found (re) dataset: and \d+ differences found (re) [1] #Test loadPulses for deep sorted cmp.h5 $ FOFN_IN=$DATDIR/ecoli_lp.fofn $ CMP_IN_SORTED=$DATDIR/ecoli_lp_tiny_sorted.cmp.h5 $ CMP_STDOUT_SORTED=$STDDIR/ecoli_lp_tiny_sorted.cmp.h5 $ CMP_OUT_SORTED_bymetric=$OUTDIR/ecoli_lp_tiny_sorted_bymetric.cmp.h5 $ CMP_OUT_SORTED_byread=$OUTDIR/ecoli_lp_tiny_sorted_byread.cmp.h5 $ METRICS=StartFrame,PulseWidth,WidthInFrames,pkmid,IPD,Light,DeletionQV,InsertionQV,SubstitutionQV,MergeQV,QualityValue,DeletionTag,SubstitutionTag,ClassifierQV,PreBaseFrames,PulseIndex $ rm -f $CMP_OUT_SORTED_bymetric $ cp $CMP_IN_SORTED $CMP_OUT_SORTED_bymetric $ $EXEC $FOFN_IN $CMP_OUT_SORTED_bymetric -bymetric -metrics $METRICS > $OUTDIR/tmp.log [INFO] * [loadPulses] started. (glob) [INFO] * [loadPulses] ended. (glob) $ h5diff -c $CMP_OUT_SORTED_bymetric $CMP_STDOUT_SORTED dataset: and \d+ differences found (re) dataset: and \d+ differences found (re) [1] $ rm -f $CMP_OUT_SORTED_byread $ cp $CMP_IN_SORTED $CMP_OUT_SORTED_byread $ $EXEC $FOFN_IN $CMP_OUT_SORTED_byread -byread -metrics $METRICS > $OUTDIR/tmp.log [INFO] * [loadPulses] started. (glob) [INFO] * [loadPulses] ended. (glob) $ h5diff -c $CMP_OUT_SORTED_bymetric $CMP_STDOUT_SORTED dataset: and \d+ differences found (re) dataset: and \d+ differences found (re) [1] #Test loadPulses for a zero-alignment cmp.h5 file. $ FOFN_IN=$DATDIR/ecoli_lp.fofn $ CMP_IN_NOALN=$DATDIR/noaln_lp.cmp.h5 $ $EXEC $FOFN_IN $CMP_IN_NOALN -byread -metrics $METRICS [INFO] * [loadPulses] started. (glob) WARNING, there is no alignment in the cmp file. [INFO] * [loadPulses] ended. (glob) #Test loadPulses -byMetric with a 'large' bas.h5 file of which the dataset size is greater than maxElements. 
$ FOFN_IN=$DATDIR/ecoli_lp.fofn $ CMP_IN=$DATDIR/ecoli_lp_tiny_sorted.cmp.h5 $ CMP_OUT=$OUTDIR/ecoli_lp_maxEle.cmp.h5 $ CMP_STDOUT=$STDDIR/ecoli_lp_maxEle.cmp.h5 $ METRICS=QualityValue,MergeQV,InsertionQV,DeletionQV,DeletionTag,PulseWidth,SubstitutionQV,SubstitutionTag $ MAX_ELEMENTS=140000000 $ rm -f $CMP_OUT $ cp $CMP_IN $CMP_OUT $ $EXEC $FOFN_IN $CMP_OUT -bymetric -metrics $METRICS -maxElements $MAX_ELEMENTS [INFO] * [loadPulses] started. (glob) Either the number of elements exceeds maxElement (140000000). Or the estimated memory consumption exceeds maxMemory (4 GB). Loading pulses from .+ by read. (re) loading 2 alignments for movie 1 loading 2 alignments for movie 2 [INFO] * [loadPulses] ended. (glob) $ h5diff -c $CMP_OUT $CMP_STDOUT dataset: and \d+ differences found (re) dataset: and \d+ differences found (re) dataset: and \d+ differences found (re) [1] #Test loadPulses -byMetric on a multi-streaming job. $ FOFN_IN=$DATDIR/lambda_bax.fofn $ CMP_IN=$DATDIR/lambda_bax.cmp.h5 $ CMP_OUT=$OUTDIR/lambda_bax.cmp.h5 $ CMP_STDOUT=$STDDIR/lambda_bax.cmp.h5 $ METRICS=QualityValue,MergeQV,InsertionQV,DeletionQV,DeletionTag,PulseWidth,SubstitutionQV,SubstitutionTag $ rm -f $CMP_OUT $ cp $CMP_IN $CMP_OUT $ $EXEC $FOFN_IN $CMP_OUT -bymetric -metrics $METRICS [INFO] * [loadPulses] started. (glob) WARNING: There is insufficient data to compute metric: MergeQV in the file .+ It will be ignored. (re) loading 2 alignments for movie 1 loading 2 alignments for movie 1 [INFO] * [loadPulses] ended. (glob) $ h5diff -c $CMP_OUT $CMP_STDOUT dataset: and \d+ differences found (re) dataset: and \d+ differences found (re) dataset: and \d+ differences found (re) [1] #Test loadPulses -bymetric for a ccs cmp.h5 file generated from multiple movies. $ FOFN_IN=$DATDIR/ccs_lp.fofn $ CMP_IN=$DATDIR/ccs_lp.cmp.h5 $ CMP_OUT=$OUTDIR/ccs_lp.cmp.h5 # The original pls.h5 files disappeared, to use another dataset instead. 
$ rm -f $CMP_OUT $ cp $CMP_IN $CMP_OUT $ $EXEC $FOFN_IN $CMP_OUT -bymetric -metrics QualityValue [INFO] * [loadPulses] started. (glob) loading 100 alignments for movie 1 loading 45 alignments for movie 2 [INFO] * [loadPulses] ended. (glob) $ h5ls -r $CMP_OUT | grep "AlnInfo" /AlnInfo Group /AlnInfo/AlnIndex Dataset {145/Inf, 22} /AlnInfo/NumPasses Dataset {145/Inf} #Test loadPulses *.fofn cmp.h5 where *.fofn can either contain ccs.h5 or bas.h5 # and the cmp.h5 s readType is CCS $ CCS_FOFN=$DATDIR/test_ccs.fofn $ BAS_FOFN=$DATDIR/test_bas.fofn $ CMP_IN=$DATDIR/test_ccs_bas.cmp.h5 $ CCS_OUT=$OUTDIR/test_ccs_bas_ccs.cmp.h5 $ BAS_OUT=$OUTDIR/test_ccs_bas_bas.cmp.h5 $ cp $CMP_IN $CCS_OUT $ cp $CMP_IN $BAS_OUT $ $EXEC $CCS_FOFN $CCS_OUT -metrics QualityValue,DeletionQV,DeletionTag,InsertionQV,SubstitutionQV [INFO] * [loadPulses] started. (glob) loading 11 alignments for movie 1 [INFO] * [loadPulses] ended. (glob) $ sleep 1 $ $EXEC $BAS_FOFN $BAS_OUT -metrics QualityValue,DeletionQV,DeletionTag,InsertionQV,SubstitutionQV [INFO] * [loadPulses] started. (glob) loading 11 alignments for movie 1 [INFO] * [loadPulses] ended. (glob) $ h5diff $CCS_OUT $BAS_OUT dataset: and (glob) * differences found (glob) dataset: and * differences found (glob) [1] blasr-8e668beae0dda1da6914586fb458182c6c3c7482/utils/ctest/loadPulses_fast.t000066400000000000000000000026271260737656700254660ustar00rootroot00000000000000Set up directories $ . $TESTDIR/setup.sh Set up the executable: loadPulses. 
$ EXEC=$TESTDIR/../loadPulses #Test loadPulses: input is a bas.h5 file #Test -byread and -bymetric $ BAS_IN_1=$DATDIR/lambda_lp.fofn $ CMP_IN_1=$DATDIR/lambda_lp.cmp.h5 $ CMP_STDOUT_1=$STDDIR/lambda_lp.cmp.h5 $ CMP_OUT_byread_1=$OUTDIR/lambda.byread.cmp.h5 $ CMP_OUT_bymetric_1=$OUTDIR/lambda.bymetric.cmp.h5 $ METRICS=QualityValue,MergeQV,InsertionQV,DeletionQV,DeletionTag,PulseWidth,SubstitutionQV,SubstitutionTag $ rm -f $CMP_OUT_byread_1 $ cp $CMP_IN_1 $CMP_OUT_byread_1 $ $EXEC $BAS_IN_1 $CMP_OUT_byread_1 -metrics $METRICS -byread > $OUTDIR/tmp.log [INFO] * [loadPulses] started. (glob) [INFO] * [loadPulses] ended. (glob) $ h5diff -c $CMP_OUT_byread_1 $CMP_STDOUT_1 dataset: and \d+ differences found (re) dataset: and \d+ differences found (re) [1] $ rm -f $CMP_OUT_bymetric_1 $ cp $CMP_IN_1 $CMP_OUT_bymetric_1 $ $EXEC $BAS_IN_1 $CMP_OUT_bymetric_1 -metrics $METRICS -bymetric > $OUTDIR/tmp.log [INFO] * [loadPulses] started. (glob) [INFO] * [loadPulses] ended. (glob) $ h5diff -c $CMP_OUT_bymetric_1 $CMP_STDOUT_1 dataset: and \d+ differences found (re) dataset: and \d+ differences found (re) [1] blasr-8e668beae0dda1da6914586fb458182c6c3c7482/utils/ctest/pls2fasta.t000066400000000000000000000012351260737656700242270ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Set up the executable: pls2fasta. $ EXEC=$TESTDIR/../pls2fasta Test pls2fasta Condition: the order of region tables do not match the order bax.h5 files. $ $EXEC $DATDIR/test_pls2fasta.fofn $OUTDIR/test_pls2fasta.fa -regionTable $DATDIR/test_pls2fasta_rgn.fofn -trimByRegion [INFO] * [pls2fasta] started. (glob) [INFO] * [pls2fasta] ended. (glob) Test pls2fasta output fastq $ $EXEC $DATDIR/ecoli_lp.fofn $OUTDIR/test_pls2fasta_ecoli.fq -trimByRegion -fastq [INFO] * [pls2fasta] started. (glob) [INFO] * [pls2fasta] ended. (glob) $ echo $? 
0 $ diff $OUTDIR/test_pls2fasta_ecoli.fq $STDDIR/test_pls2fasta_ecoli.fq blasr-8e668beae0dda1da6914586fb458182c6c3c7482/utils/ctest/pls2fasta_fast.t000066400000000000000000000005641260737656700252500ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Set up the executable: pls2fasta. $ EXEC=$TESTDIR/../pls2fasta Test pls2fasta output fasta $ $EXEC $DATDIR/ecoli_lp.fofn $OUTDIR/test_pls2fasta_ecoli.fa -trimByRegion [INFO] * [pls2fasta] started. (glob) [INFO] * [pls2fasta] ended. (glob) $ echo $? 0 $ diff $OUTDIR/test_pls2fasta_ecoli.fa $STDDIR/test_pls2fasta_ecoli.fa blasr-8e668beae0dda1da6914586fb458182c6c3c7482/utils/ctest/samFilter.t000066400000000000000000000062461260737656700242650ustar00rootroot00000000000000Set up $ . $TESTDIR/setup.sh Set up the executable: samFilter. $ EXEC=$TESTDIR/../samFilter #Test samFilter with a *.sam file generated by blasr $ OUTFILE=$OUTDIR/lambda_bax_filter_1.sam $ STDFILE=$STDDIR/lambda_bax_filter_1.sam $ TMP1=$OUTDIR/$$.tmp.out $ TMP2=$OUTDIR/$$.tmp.stdout $ rm -f $OUTFILE $ $EXEC $DATDIR/lambda_bax.sam $DATDIR/lambda_ref.fasta $OUTFILE -minAccuracy 70 -minPctSimilarity 30 -hitPolicy all $ tail -n+7 $OUTFILE |sort > $TMP1 $ tail -n+7 $STDFILE |sort > $TMP2 $ diff $TMP1 $TMP2 $ rm $TMP1 $TMP2 #Test whether minAccuracy and minPctSimilarity can be float. 
# $ rm -f $OUTFILE # $ $EXEC $DATDIR/lambda_bax.sam $DATDIR/lambda_ref.fasta $OUTFILE -minAccuracy 70.0 -minPctSimilarity 30.0 -hitPolicy all # $ tail -n+7 $OUTFILE | sort > $TMP1 # $ tail -n+7 $STDFILE | sort > $TMP2 # $ diff $TMP1 $TMP2 # $ rm $TMP1 $TMP2 #Test samFilter with -hitPolicy allbest $ OUTFILE=$OUTDIR/lambda_bax_filter_2.sam $ STDFILE=$STDDIR/lambda_bax_filter_2.sam $ rm -f $OUTFILE $ $EXEC $DATDIR/lambda_bax.sam $DATDIR/lambda_ref.fasta $OUTFILE -hitPolicy allbest $ tail -n+7 $OUTFILE > $TMP1 $ tail -n+7 $STDFILE > $TMP2 $ diff $TMP1 $TMP2 $ rm $TMP1 $TMP2 #Test samFilter with -hitPolicy random $ OUTFILE=$OUTDIR/lambda_bax_filter_3.sam $ STDFILE=$STDDIR/lambda_bax_filter_3.sam $ rm -f $OUTFILE $ $EXEC $DATDIR/lambda_bax.sam $DATDIR/lambda_ref.fasta $OUTFILE -hitPolicy random $ tail -n+7 $OUTFILE > $TMP1 $ tail -n+7 $STDFILE > $TMP2 $ diff $TMP1 $TMP2 $ rm $TMP1 $TMP2 #Test samFilter with -hitPolicy randombest $ OUTFILE=$OUTDIR/lambda_bax_filter_4.sam $ STDFILE=$STDDIR/lambda_bax_filter_4.sam $ rm -f $OUTFILE $ $EXEC $DATDIR/lambda_bax.sam $DATDIR/lambda_ref.fasta $OUTFILE -hitPolicy randombest $ tail -n+7 $OUTFILE > $TMP1 $ tail -n+7 $STDFILE > $TMP2 $ diff $TMP1 $TMP2 $ rm $TMP1 $TMP2 # Test samFilter with -hitPolicy leftmost $ OUTFILE=$OUTDIR/test_leftmost_out.sam $ rm -f $OUTFILE $ $EXEC $DATDIR/test_leftmost.sam $DATDIR/test_leftmost_target.fasta $OUTFILE -hitPolicy leftmost $ tail -n+6 $OUTFILE |cut -f 4 1 $ OUTFILE=$OUTDIR/lambda_bax_filter_5.sam $ STDFILE=$STDDIR/lambda_bax_filter_5.sam $ rm -f $OUTFILE $ $EXEC $DATDIR/lambda_bax.sam $DATDIR/lambda_ref.fasta $OUTFILE $ tail -n+7 $OUTFILE > $TMP1 $ tail -n+7 $STDFILE > $TMP2 $ diff $TMP1 $TMP2 $ rm $TMP1 $TMP2 #Test samFilter with -holeNumbers $ OUTFILE=$OUTDIR/lambda_bax_filter_6.sam $ STDFILE=$STDDIR/lambda_bax_filter_6.sam $ rm -f $OUTFILE $ $EXEC $DATDIR/lambda_bax.sam $DATDIR/lambda_ref.fasta $OUTFILE -holeNumbers 101350-105000,21494 $ tail -n+7 $OUTFILE > $TMP1 $ tail -n+7 $STDFILE > $TMP2 
$ diff $TMP1 $TMP2 $ rm $TMP1 $TMP2 #Test samFilter with -filterAdapterOnly $ $EXEC $DATDIR/filterAdapterOnly.sam $DATDIR/references/H1_6_Scal_6x/sequence/H1_6_Scal_6x.fasta $OUTDIR/filterAdapterOnly.sam -filterAdapterOnly $DATDIR/references/H1_6_Scal_6x/annotations/H1_6_Scal_6x_adapters.gff $ tail -n+6 $OUTDIR/filterAdapterOnly.sam |cut -f 1,2,3,4 m130302_124313_42130_c100502672550000001523078308081365_s1_p0/10817\t16\t11k_plasmidbell_H1_6_Scal_6x\t29466 (esc) blasr-8e668beae0dda1da6914586fb458182c6c3c7482/utils/ctest/samtoh5.t000066400000000000000000000150161260737656700237120ustar00rootroot00000000000000Set up directories $ . $TESTDIR/setup.sh Set up the executable: samtoh5. $ EXEC=$TESTDIR/../samtoh5 #2014_06_05, changed cmp.h5 /version and /primaryversion from 1.3.1* to 2.0.0 #2014_10_30, changelist 141782, make cmp.h5 column 10 (0 based) moleculeID unique for zmws. #Test samtoh5 with *.sam files generated by blasr. $ rm -f $OUTDIR/ecoli.cmp.h5 $ $EXEC -useShortRefName $DATDIR/ecoli.sam $DATDIR/ecoli_reference.fasta $OUTDIR/ecoli.cmp.h5 [INFO] * [samtoh5] started. (glob) [INFO] * [samtoh5] ended. (glob) $ h5diff $OUTDIR/ecoli.cmp.h5 $STDDIR/ecoli_2014_10_30.cmp.h5 dataset: and \d+ differences found (re) dataset: and \d+ differences found (re) [1] # dataset: and # \d+ differences found (re) #Verify bug 21794 has been fixed. #samtoh5 should print the following error message. $ rm -f $OUTDIR/bug21794.cmp.h5 $ $EXEC $DATDIR/bug21794.sam $DATDIR/bug21794_reference.fasta $OUTDIR/bug21794.cmp.h5 [INFO] * [samtoh5] started. (glob) WARNING. The mapping of read m120504_033026_sherri_c100311672550000001523012508061292_s1_p0/71092/3721_4845 to reference chr4_ctg9_hap1 is out of bounds. StartPos (4294967288) + AlnLength (614) > RefLength (590426) + 2 [INFO] * [samtoh5] ended. (glob) #Test boundary case where a read exactly maps to the end of reference. 
$ rm -f $OUTDIR/bad.cmp.h5 $ $EXEC -useShortRefName $DATDIR/bad.sam $DATDIR/ecoli_mutated.fasta $OUTDIR/bad.cmp.h5 [INFO] * [samtoh5] started. (glob) [INFO] * [samtoh5] ended. (glob) $ h5diff $OUTDIR/bad.cmp.h5 $STDDIR/bad_2014_10_30.cmp.h5 dataset: and \d+ differences found (re) dataset: and \d+ differences found (re) [1] # dataset: and # \d+ differences found (re) #Test more out-of-boundary cases. samtoh5 prints warnings. $ rm -f $OUTDIR/bad2.cmp.h5 $ $EXEC $DATDIR/bad2.sam $DATDIR/ecoli_mutated.fasta $OUTDIR/bad2.cmp.h5 [INFO] * [samtoh5] started. (glob) WARNING. The mapping of read m120724_232507_ethan_c100384812550000001523033110171290_s1_p0/21020/11218_12655 to reference ecoliK12_mutated is out of bounds. StartPos (4638237) + AlnLength (1327) > RefLength (4639560) + 2 WARNING. The mapping of read m120724_232507_ethan_c100384812550000001523033110171290_s1_p0/60189/0_4202 to reference ecoliK12_mutated is out of bounds. StartPos (4639141) + AlnLength (431) > RefLength (4639560) + 2 [INFO] * [samtoh5] ended. (glob) #Test samtoh5 uses full reference names instead of short reference names, if -useShortRefName is not specified $ rm -f $OUTDIR/ecoli_fullRefName.cmp.h5 $ $EXEC $DATDIR/ecoli.sam $DATDIR/ecoli_reference.fasta $OUTDIR/ecoli_fullRefName.cmp.h5 [INFO] * [samtoh5] started. (glob) [INFO] * [samtoh5] ended. (glob) $ h5diff $OUTDIR/ecoli_fullRefName.cmp.h5 $STDDIR/ecoli_fullRefName_2014_10_30.cmp.h5 dataset: and \d+ differences found (re) dataset: and \d+ differences found (re) [1] # dataset: and # \d+ differences found (re) $ h5dump --dataset /RefInfo/FullName $OUTDIR/ecoli_fullRefName.cmp.h5 | sed -n '11p' (0): "ref000001|gi|49175990|ref|NC_000913.2| Escherichia coli str. K-12 substr. 
MG1655 chromosome, complete genome" #Compare the generated reference names with option -useShortRefName is set $ h5dump --dataset /RefInfo/FullName $OUTDIR/ecoli.cmp.h5 |sed -n '11p' (0): "ref000001|gi|49175990|ref|NC_000913.2|" #Test whether samtoh5 generates correct MD5 for the output cmp.h5 files #even if there are invalid MD5 values in the input sam file. bug 22578. $ rm -f $OUTDIR/test_MD5.cmp.h5 $ $EXEC $DATDIR/test_MD5.sam $DATDIR/test_MD5MultiContigsRef.fasta $OUTDIR/test_MD5.cmp.h5 [INFO] * [samtoh5] started. (glob) [INFO] * [samtoh5] ended. (glob) $ h5dump -d /RefInfo/MD5 $OUTDIR/test_MD5.cmp.h5 |tail -7 |head -4 (0): "3cba630ed67592e8e11fb94ef99a122a", (1): "a687c808a666ea90e0a273c4ac2591c3", (2): "81cf96e23ab1392d898a697c9c4c3acd", (3): "4f4bff70a6ac5ae926e5ed6165684dd3" #Test whether samtoh5 accepts smrtTitle movie/zmw/start_end/start2_end2 $ rm -f $OUTDIR/test_smrtTitle.cmp.h5 $ $EXEC $DATDIR/fns.sam $DATDIR/ecoli_reference.fasta $OUTDIR/test_smrtTitle.cmp.h5 [INFO] * [samtoh5] started. (glob) [INFO] * [samtoh5] ended. 
(glob) $ h5dump -d /AlnInfo/AlnIndex $OUTDIR/test_smrtTitle.cmp.h5 | sed -n '6,25p' (0,0): 1, 1, 1, 1, 4407727, 4407871, 0, 6, 0, 0, 6, 18, 165, 254, 137, 3, (0,16): 7, 4, 0, 151, 0, 0, (1,0): 2, 1, 1, 1, 4407311, 4407788, 1, 6, 0, 0, 6, 109, 657, 254, 460, 4, (1,16): 84, 13, 152, 713, 0, 0, (2,0): 3, 1, 1, 1, 4407314, 4407871, 0, 6, 0, 0, 6, 5, 641, 254, 547, 2, (2,16): 87, 8, 714, 1358, 0, 0, (3,0): 4, 1, 1, 1, 4407372, 4407876, 1, 6, 0, 0, 6, 0, 586, 254, 494, 2, (3,16): 90, 8, 1359, 1953, 0, 0, (4,0): 5, 1, 1, 1, 4407316, 4407877, 0, 6, 0, 0, 6, 7, 640, 254, 545, 4, (4,16): 84, 12, 1954, 2599, 0, 0, (5,0): 6, 1, 1, 1, 4407334, 4407870, 1, 6, 0, 0, 6, 6, 632, 254, 521, 6, (5,16): 99, 9, 2600, 3235, 0, 0, (6,0): 7, 1, 1, 1, 4407310, 4407868, 0, 6, 0, 0, 6, 3, 639, 254, 542, 8, (6,16): 86, 8, 3236, 3880, 0, 0, (7,0): 8, 1, 1, 1, 4407311, 4407815, 0, 6, 0, 0, 6, 0, 601, 254, 484, 5, (7,16): 112, 15, 3881, 4497, 0, 0, (8,0): 9, 1, 1, 1, 4407319, 4407806, 1, 6, 0, 0, 6, 87, 681, 254, 473, 6, (8,16): 115, 8, 4498, 5100, 0, 0, (9,0): 10, 1, 1, 1, 4407381, 4407592, 0, 6, 0, 0, 6, 93, 345, 254, 205, 3, (9,16): 44, 3, 5101, 5356, 0, 0 #Test whether samtoh5 mimic the behaviour of compareSequences.py and remove #reference groups which have no alignments to any movie. # $ NAME=test_rm_empty_refGroup # $ rm -f $OUTDIR/$NAME.cmp.h5 # $ $EXEC $DATDIR/$NAME.sam $DATDIR/$NAME.fasta $OUTDIR/$NAME.cmp.h5 # [INFO] * [samtoh5] started. (glob) # [INFO] * [samtoh5] ended. (glob) # $ h5dump -d /RefGroup/ID $OUTDIR/$NAME.cmp.h5 | sed -n '6,6p' # (0): 1, 2 # # $ h5dump -d /RefGroup/Path $OUTDIR/$NAME.cmp.h5 | sed -n '11,11p' # (0): "/ref000003", "/ref000005" #Test whether samtoh5 accepts PBI reads in format "movie/holeNumber" $ NAME=test_pbiname $ rm -f $OUTDIR/$NAME.cmp.h5 $ $EXEC $DATDIR/$NAME.sam $DATDIR/ecoli_reference.fasta $OUTDIR/$NAME.cmp.h5 [INFO] * [samtoh5] started. (glob) [INFO] * [samtoh5] ended. 
(glob) blasr-8e668beae0dda1da6914586fb458182c6c3c7482/utils/ctest/samtom4.t000066400000000000000000000024731260737656700237210ustar00rootroot00000000000000Set up directories $ . $TESTDIR/setup.sh Set up the executable: samtom4. $ EXEC=$TESTDIR/../samtom4 #Test samtom4 with a *.sam file generated by blasr from a fofn $ OUTFILE=$OUTDIR/lambda_bax.m4 #STDFILE has been generated by blasr -m 4 && sort $ STDFILE=$STDDIR/lambda_bax.m4 $ TMPFILE=test_samtom4_2.tmp $ rm -rf $OUTFILE $TMPFILE $ $EXEC $DATDIR/lambda_bax.sam $DATDIR/lambda_ref.fasta $TMPFILE $ sort $TMPFILE > $OUTFILE $ diff $OUTFILE $STDFILE $ rm $TMPFILE #Test samtom4 with a *.sam file generated by blasr with -noSplitSubreads from a fofn $ OUTFILE=$OUTDIR/lambda_bax_noSplitSubreads.m4 #STDFILE has been generated by blasr -m 4 && sort $ STDFILE=$STDDIR/lambda_bax_noSplitSubreads.m4 $ TMPFILE=test_samtom4_3.tmp $ rm -rf $OUTFILE $TMPFILE $ $EXEC $DATDIR/lambda_bax_noSplitSubreads.sam $DATDIR/lambda_ref.fasta $TMPFILE $ sort $TMPFILE > $OUTFILE $ diff $OUTFILE $STDFILE $ rm $TMPFILE #Test samtom4 with a *.sam file generated by blasr. $ OUTFILE=$OUTDIR/ecoli_samtom4.m4 #STDFILE has been generated by blasr -m 4 && sort $ STDFILE=$STDDIR/ecoli_sorted.m4 $ TMPFILE=$OUTDIR/test_samtom4_4.tmp $ rm -rf $OUTFILE $TMPFILE $ $EXEC $DATDIR/ecoli.sam $DATDIR/ecoli_reference.fasta $OUTFILE -useShortRefName $ sort -n $OUTFILE > $TMPFILE $ diff $TMPFILE $STDFILE blasr-8e668beae0dda1da6914586fb458182c6c3c7482/utils/ctest/sawriter.t000066400000000000000000000013651260737656700241740ustar00rootroot00000000000000Set up directories $ CURDIR=$TESTDIR $ REMOTEDIR=/mnt/secondary-siv/testdata/BlasrTestData/ctest $ DATDIR=$REMOTEDIR/data $ OUTDIR=$CURDIR/out $ STDDIR=$REMOTEDIR/stdout Set up the executable: sawriter. 
$ EXEC=$TESTDIR/../sawriter Define tmporary files $ TMP1=$OUTDIR/$$.tmp.out $ TMP2=$OUTDIR/$$.tmp.stdout Make OUTDIR $ mkdir -p $OUTDIR $ $EXEC $OUTDIR/ecoli_larsson.sa $DATDIR/ecoli_reference.fasta -blt 11 -larsson $ echo $? 0 $ $EXEC $OUTDIR/ecoli_welter.sa $DATDIR/ecoli_reference.fasta -blt 11 -welter 2>$OUTDIR/sawriter.log $ echo $? 0 $ md5sum $OUTDIR/ecoli_larsson.sa |cut -f 1 -d ' ' e23b6afe6ddd74b2656e36bf93f6840c $ md5sum $OUTDIR/ecoli_welter.sa |cut -f 1 -d ' ' e23b6afe6ddd74b2656e36bf93f6840c blasr-8e668beae0dda1da6914586fb458182c6c3c7482/utils/ctest/sdpMatcher.t000066400000000000000000000010211260737656700244130ustar00rootroot00000000000000Set up directories $ CURDIR=$TESTDIR $ REMOTEDIR=/mnt/secondary-siv/testdata/BlasrTestData/ctest $ DATDIR=$REMOTEDIR/data $ OUTDIR=$CURDIR/out $ STDDIR=$REMOTEDIR/stdout Set up the executable: sdpMatcher. $ EXEC=$TESTDIR/../sdpMatcher Define tmporary files $ TMP1=$OUTDIR/$$.tmp.out $ TMP2=$OUTDIR/$$.tmp.stdout Make OUTDIR $ mkdir -p $OUTDIR $ FA=$DATDIR/ecoli_subset.fasta $ $EXEC $FA $FA 10 -local > $OUTDIR/sdpMatcher.out $ echo $? 0 $ diff $OUTDIR/sdpMatcher.out $STDDIR/sdpMatcher.stdout blasr-8e668beae0dda1da6914586fb458182c6c3c7482/utils/ctest/setup.sh000077500000000000000000000004171260737656700236430ustar00rootroot00000000000000# Set up directories CURDIR=$TESTDIR REMOTEDIR=/mnt/secondary-siv/testdata/BlasrTestData/ctest DATDIR=$REMOTEDIR/data OUTDIR=$CURDIR/out STDDIR=$REMOTEDIR/stdout # Define tmporary files TMP1=$OUTDIR/$$.tmp.out TMP2=$OUTDIR/$$.tmp.stdout # Make OUTDIR mkdir -p $OUTDIR blasr-8e668beae0dda1da6914586fb458182c6c3c7482/utils/ctest/toAfg.t000066400000000000000000000021461260737656700233720ustar00rootroot00000000000000Set up directories $ . $TESTDIR/setup.sh Set up the executable: toAfg. $ EXEC=$TESTDIR/../toAfg test basic FASTA use case $ $EXEC $DATDIR/read.fasta output.afg -noSplitSubreads $ head output.afg {UNV iid:1 com: generated by AfgBasWriter Mon Jun 28 14:43:52 2010 . 
} {LIB iid:1 {DST $ tail output.afg qlt: 55555555555555555555555555555555555555555555555555555555555555555555555555555555 55555555555555555555 . } {FRG iid:24 lib:1 typ:I } test uniform QV use case $ $EXEC $DATDIR/test_toafg.fastq output.afg -noSplitSubreads -uniformQV 3 $ head output.afg {UNV iid:1 com: generated by AfgBasWriter Mon Jun 28 14:43:52 2010 . } {LIB iid:1 {DST test uniform QV FASTA use case $ $EXEC $DATDIR/read.fasta output.afg -noSplitSubreads -uniformQV 7 $ head output.afg {UNV iid:1 com: generated by AfgBasWriter Mon Jun 28 14:43:52 2010 . } {LIB iid:1 {DST $ tail output.afg qlt: 77777777777777777777777777777777777777777777777777777777777777777777777777777777 77777777777777777777 . } {FRG iid:24 lib:1 typ:I } blasr-8e668beae0dda1da6914586fb458182c6c3c7482/utils/makefile000066400000000000000000000023421260737656700225210ustar00rootroot00000000000000.PHONY=all cramtests SRCDIR:=$(dir $(realpath $(lastword $(MAKEFILE_LIST)))) -include ${CURDIR}/../defines.mk include ${SRCDIR}/../rules.mk CXXOPTS := -std=c++0x -pedantic \ -Wall -Wuninitialized -Wno-div-by-zero \ -MMD -MP -w -fpermissive CXXFLAGS += ${CXXOPTS} ${GCXXFLAGS} EXE = loadPulses pls2fasta samtoh5 samtom4 samFilter toAfg sawriter sdpMatcher LD_LIBRARY_PATH=${HDF5_LIB}:${LIBBLASR_LIB}:${LIBPBIHDF_LIB}:${LIBPBDATA_LIB} export LD_LIBRARY_PATH vpath %.cpp ${SRCDIR} all: ${EXE} ${EXE}: ${CXX} -o $@ $< ${CXXFLAGS} ${CPPFLAGS} -MF"${@:%=%.d}" ${STATIC} ${LDFLAGS} ${LDLIBS} loadPulses: LoadPulses.o pls2fasta: PulseToFasta.o samtoh5: SamToCmpH5.o samtom4: SamToM4.o samFilter: SamFilter.o toAfg: ToAfg.o sawriter: SAWriter.o sdpMatcher: SDPMatcher.o CTESTS := \ ctest/loadPulses_fast.t ctest/pls2fasta_fast.t ctest/samFilter.t ctest/samtom4.t ctest/sdpMatcher.t \ ctest/loadPulses.t ctest/pls2fasta.t ctest/samtoh5.t ctest/sawriter.t ctest/toAfg.t SLOW_CTESTS := ctest/loadPulses.t ctest/pls2fasta.t cramtests: ${EXE} cram -v --shell=/bin/bash ${CTESTS} cramfast: ${EXE} cram -v --shell=/bin/bash ${filter-out 
${SLOW_CTESTS}, ${CTESTS}} clean: @rm -f ${EXE} @rm -f *.d *.o