probalign1.4/0000755013460500000360000000000011475736045011752 5ustar usmanafsprobalign1.4/FileBuffer.h0000755013460500000360000000506610532124320014122 0ustar usmanafs///////////////////////////////////////////////////////////////// // FileBuffer.h // // Buffered file reading. ///////////////////////////////////////////////////////////////// #ifndef FILEBUFFER_H #define FILEBUFFER_H #include #include #include using namespace std; const int BufferSize = 1000; ///////////////////////////////////////////////////////////////// // FileBuffer // // Class for buffering file reading. ///////////////////////////////////////////////////////////////// class FileBuffer { ifstream file; char buffer[BufferSize]; int currPos; int size; bool isEOF; bool isValid; bool canUnget; public: // Some common routines FileBuffer (const char *filename) : file (filename), currPos (0), size (0), isEOF (false), isValid (!file.fail()), canUnget (false){} ~FileBuffer (){ close(); } bool fail () const { return !isValid; } bool eof () const { return (!isValid || isEOF); } void close(){ file.close(); isValid = false; } ///////////////////////////////////////////////////////////////// // FileBuffer::Get() // // Retrieve a character from the file buffer. Returns true if // and only if a character is read. ///////////////////////////////////////////////////////////////// bool Get (char &ch){ // check to make sure that there's more stuff in the file if (!isValid || isEOF) return false; // if the buffer is empty, it's time to reload it if (currPos == size){ file.read (buffer, BufferSize); size = file.gcount(); isEOF = (size == 0); currPos = 0; if (isEOF) return false; } // store the read character ch = buffer[currPos++]; canUnget = true; return true; } ///////////////////////////////////////////////////////////////// // FileBuffer::UnGet() // // Unretrieve the most recently read character from the file // buffer. Note that this allows only a one-level undo. 
///////////////////////////////////////////////////////////////// void UnGet (){ assert (canUnget); assert (isValid); assert (currPos > 0); currPos--; assert (currPos < size); isEOF = false; canUnget = false; } ///////////////////////////////////////////////////////////////// // FileBuffer::GetLine() // // Retrieve characters of text until a newline character is // encountered. Terminates properly on end-of-file condition. ///////////////////////////////////////////////////////////////// void GetLine (string &s){ char ch; s = ""; while (Get (ch) && ch != '\n') s += ch; } }; #endif probalign1.4/Main.cc0000755013460500000360000011074411475547702013157 0ustar usmanafs///////////////////////////////////////////////////////////////// // Main.cc // // Main routines for PROBALIGN 1.4 program (Nov 2010). // ///////////////////////////////////////////////////////////////// #include "SafeVector.h" #include "MultiSequence.h" #include "EvolutionaryTree.h" #include "SparseMatrix.h" #include #include #include #include #include #include //struct for column reliability typedef struct { int columnNo; float probProduct; }columnReliability; columnReliability *column; //Usman string posteriorProbsFilename = ""; string parametersInputFilename = ""; string parametersOutputFilename = "no training"; string annotationFilename = ""; bool allscores = true; bool enableVerbose = false; bool enableAllPairs = false; bool enableAnnotation = false; bool enableClustalWOutput = false; bool enableAlignOrder = false; int numConsistencyReps = 2; int numIterativeRefinementReps = 100; float cutoff = 0; float gapOpenPenalty = 0; float gapContinuePenalty = 0; const int MIN_CONSISTENCY_REPS = 0; const int MAX_CONSISTENCY_REPS = 5; const int MIN_ITERATIVE_REFINEMENT_REPS = 0; const int MAX_ITERATIVE_REFINEMENT_REPS = 1000; string infilename; int flag_gui=0; //0: no gui related o/p //1: gui related o/p generated int flag_ppscore=0; //0: no pp score sequence added to o/p fasta alignment //1: pp score seq added 
to o/p fasta alignment /////////////////////////////// // global scoring matrix variables ////////////////////////////// float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2; char *aminos, *bases, matrixtype[20] = "gonnet_160"; int subst_index[26]; float sub_matrix[26][26]; float scorez_matrix[26][26]; int firstread = 0; //this makes sure that matrices are read only once float TEMPERATURE = 5; int MATRIXTYPE = 160; int prot_nuc = 0; //0=prot, 1=nucleotide float GAPOPEN = 0; float GAPEXT = 0; ///////////////////////////////////////////////////////////////// //extern function prototypes //////////////////////////////////////////////////////////////// extern pair < SafeVector < char >*, float >ComputeAlignment(int seq1Length, int seq2Length, const VF & posterior); extern pair < SafeVector < char >*, float >ComputeAlignmentWithGapPenalties(MultiSequence * align1, MultiSequence * align2, const VF & posterior, int numSeqs1, int numSeqs2, float gapOpenPenalty, float gapContinuePenalty); extern VF *BuildPosterior(MultiSequence * align1, MultiSequence * align2, const SafeVector < SafeVector < SparseMatrix * > >&sparseMatrices, float cutoff = 0.0f); extern VF *ComputePostProbs(int a, int b, string s1, string s2); //argument support typedef struct { char input[30]; int matrix; int N; float T; float beta; char opt; //can be 'P' or 'M' float gapopen; float gapext; } argument_decl; argument_decl argument; extern inline void read_sustitution_matrix(char *fileName); extern void setmatrixtype(int le); extern inline int matrixtype_to_int(); extern inline void read_dna_matrix(); extern inline void read_vtml_la_matrix(); extern void init_arguments(); ///////////////////////////////////////////////////////////////// // Function prototypes ///////////////////////////////////////////////////////////////// void PrintHeading(); MultiSequence *DoAlign(MultiSequence * sequence); SafeVector < string > ParseParams(int argc, char **argv); MultiSequence *ComputeFinalAlignment(const TreeNode * 
tree, MultiSequence * sequences, const SafeVector < SafeVector < SparseMatrix * > >&sparseMatrices); MultiSequence *AlignAlignments(MultiSequence * align1, MultiSequence * align2, const SafeVector < SafeVector < SparseMatrix * > >&sparseMatrices); SafeVector < SafeVector < SparseMatrix * > >DoRelaxation(MultiSequence * sequences, SafeVector < SafeVector < SparseMatrix * > >&sparseMatrices); void Relax(SparseMatrix * matXZ, SparseMatrix * matZY, VF & posterior); void Relax1(SparseMatrix * matXZ, SparseMatrix * matZY, VF & posterior); set < int >GetSubtree(const TreeNode * tree); void TreeBasedBiPartitioning(const SafeVector < SafeVector < SparseMatrix * > >&sparseMatrices, MultiSequence * &alignment, const TreeNode * tree); void DoIterativeRefinement(const SafeVector < SafeVector < SparseMatrix * > >&sparseMatrices, MultiSequence * &alignment); //java gui related change void WriteAnnotation(MultiSequence * alignment, const SafeVector < SafeVector < SparseMatrix * > >&sparseMatrices); //java gui related change float ComputeScore(const SafeVector < pair < int, int > >&active, const SafeVector < SafeVector < SparseMatrix * > >&sparseMatrices); ///////////////////////////////////////////////////////////////// // main() // // Calls all initialization routines and runs the PROBCONS // aligner. ///////////////////////////////////////////////////////////////// int main(int argc, char **argv) { // print PROBCONS heading PrintHeading(); // parse program parameters SafeVector < string > sequenceNames = ParseParams(argc, argv); infilename = sequenceNames[0]; // now, we'll process all the files given as input. If we are given // several filenames as input, then we'll load all of those sequences // simultaneously. 
// load all files together MultiSequence *sequences = new MultiSequence(); assert(sequences); for (int i = 0; i < (int) sequenceNames.size(); i++) { cerr << "Loading sequence file: " << sequenceNames[i] << endl; sequences->LoadMFA(sequenceNames[i], true); } // now, we can perform the alignments and write them out MultiSequence *alignment = DoAlign(sequences); if (!enableAllPairs) { if (enableClustalWOutput) alignment->WriteALN(cout); else alignment->WriteMFA(cout); } delete alignment; delete sequences; } ///////////////////////////////////////////////////////////////// // PrintHeading() // // Prints heading for PROBCONS program. ///////////////////////////////////////////////////////////////// void PrintHeading() { cerr << endl << "PROBALIGN Version 1.4 (Nov 2010) "<< "aligns multiple protein sequences and prints to the"<GetNumSequences(); VVF distances(numSeqs, VF(numSeqs, 0)); SafeVector < SafeVector < SparseMatrix * > >sparseMatrices(numSeqs, SafeVector < SparseMatrix * >(numSeqs, NULL)); //initialize arguments init_arguments(); int a, b, c=1; if(flag_gui==1) { cout<GetSequence(a); Sequence *seq2 = sequences->GetSequence(b); // verbose output if (enableVerbose) { cerr << "Computing posterior matrix: (" << a + 1 << ") " << seq1->GetHeader() << " vs. " << "(" << b + 1 << ") " << seq2->GetHeader() << " -- " << endl; cerr << a << " " << strlen(seq1->GetString(). c_str()) << endl; cerr << b << " " << strlen(seq2->GetString(). 
c_str()) << endl << "-------------" << endl; } // if we are training, then we'll simply want to compute the // expected counts for each region within the matrix separately; // otherwise, we'll need to put all of the regions together and // assemble a posterior probability match matrix VF *posterior; posterior = ComputePostProbs(a, b, seq1->GetString(), seq2->GetString()); assert(posterior); // compute sparse representations sparseMatrices[a][b] = new SparseMatrix(seq1->GetLength(), seq2->GetLength(), *posterior); sparseMatrices[b][a] = NULL; // perform the pairwise sequence alignment pair < SafeVector < char >*, float >alignment = ComputeAlignment(seq1->GetLength(), seq2->GetLength(), *posterior); // compute "expected accuracy" distance for evolutionary tree computation float distance = alignment.second / min(seq1->GetLength(), seq2->GetLength()); distances[a][b] = distances[b][a] = distance; if (enableVerbose) cerr << setprecision(10) << distance << endl; delete alignment.first; delete posterior; } } if(flag_gui==1) { cout<<"@-"< >newSparseMatrices = DoRelaxation(sequences, sparseMatrices); // now replace the old posterior matrices for (int i = 0; i < numSeqs; i++) { for (int j = 0; j < numSeqs; j++) { delete sparseMatrices[i][j]; sparseMatrices[i][j] = newSparseMatrices[i][j]; } } } MultiSequence *finalAlignment = NULL; // now if we still need to do a final multiple alignment if (enableVerbose) cerr << endl; // compute the evolutionary tree TreeNode *tree = TreeNode::ComputeTree(distances); tree->Print(cerr, sequences); cerr << endl; // make the final alignment finalAlignment = ComputeFinalAlignment(tree, sequences, sparseMatrices); // build annotation if (enableAnnotation) { WriteAnnotation(finalAlignment, sparseMatrices); } delete tree; //CLEANUP // delete sparse matrices for (int a = 0; a < numSeqs - 1; a++) { for (int b = a + 1; b < numSeqs; b++) { delete sparseMatrices[a][b]; delete sparseMatrices[b][a]; } } return finalAlignment; return NULL; } 
///////////////////////////////////////////////////////////////// // GetInteger() // // Attempts to parse an integer from the character string given. // Returns true only if no parsing error occurs. ///////////////////////////////////////////////////////////////// bool GetInteger(char *data, int *val) { char *endPtr; long int retVal; assert(val); errno = 0; retVal = strtol(data, &endPtr, 0); if (retVal == 0 && (errno != 0 || data == endPtr)) return false; if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN)) return false; if (retVal < (long) INT_MIN || retVal > (long) INT_MAX) return false; *val = (int) retVal; return true; } ///////////////////////////////////////////////////////////////// // GetFloat() // // Attempts to parse a float from the character string given. // Returns true only if no parsing error occurs. ///////////////////////////////////////////////////////////////// bool GetFloat(char *data, float *val) { char *endPtr; double retVal; assert(val); errno = 0; retVal = strtod(data, &endPtr); if (retVal == 0 && (errno != 0 || data == endPtr)) return false; if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0)) return false; *val = (float) retVal; return true; } ///////////////////////////////////////////////////////////////// // ParseParams() // // Parse all command-line options. ///////////////////////////////////////////////////////////////// SafeVector < string > ParseParams(int argc, char **argv) { if (argc < 2) { cerr << "PROBALIGN 1.4 comes with ABSOLUTELY NO WARRANTY." << endl << "This is free software, and you are welcome to redistribute it under certain conditions." << endl << "See the README file for details." << endl << endl << "Usage:" << endl << " probalign [OPTION]... [MFAFILE]..." 
<< endl << endl << "Description:" << endl << " Align sequences in MFAFILE(s) and print result to standard output" << endl << endl << " -clustalw" << endl << " use CLUSTALW output format instead of MFA" << endl << endl << " -v, --verbose" << endl << " report progress while aligning (default: " << (enableVerbose ? "on" : "off") << ")" << endl << endl << " -a, --alignment-order" << endl << " print sequences in alignment order rather than input order (default: " << (enableAlignOrder ? "on" : "off") << ")" << endl << endl << " -T, -temperature" << endl << " Sets the thermodynamic temperature parameter "< sequenceNames; float tempFloat; int tempInt; for (int i = 1; i < argc; i++) { if (argv[i][0] == '-') { if (!strcmp(argv[i], "-p") || !strcmp(argv[i], "--paramfile")) { if (i < argc - 1) parametersInputFilename = string(argv[++i]); else { cerr << "ERROR: Filename expected for option " << argv[i] << endl; exit(1); } } else if (!strcmp(argv[i], "-nuc")) { prot_nuc = 1; if (!strcmp(matrixtype, "gonnet_160")) strcpy(matrixtype, "nuc_simple"); if (GAPOPEN == 0) GAPOPEN = 3; if (GAPEXT == 0) GAPEXT = 0.25; if (TEMPERATURE == 5) TEMPERATURE = 1; } else if (!strcmp(argv[i], "-prot")) { prot_nuc = 0; if (GAPOPEN == 0) GAPOPEN = 22; if (GAPEXT == 0) GAPEXT = 1; } // number of randomized partitioning iterative refinement passes //uncomment to make value of numIterativeRefinementReps modifiable else if (!strcmp (argv[i], "-ir") || !strcmp (argv[i], "--iterative-refinement")) { if (i < argc - 1) { if (!GetInteger (argv[++i], &tempInt)) { cerr << "ERROR: Invalid integer following option " << argv[i - 1] << ": " << argv[i] << endl; exit (1); } else { if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS || tempInt > MAX_ITERATIVE_REFINEMENT_REPS) { cerr << "ERROR: For option " << argv[i - 1] << ", integer must be between " << MIN_ITERATIVE_REFINEMENT_REPS << " and " << MAX_ITERATIVE_REFINEMENT_REPS << "." 
<< endl; exit (1); } else numIterativeRefinementReps = tempInt; } } else { cerr << "ERROR: Integer expected for option " << argv[i] << endl; exit (1); } } // gap open penalty else if (!strcmp(argv[i], "-go") || !strcmp(argv[i], "--gap-open")) { if (i < argc - 1) { if (!GetFloat(argv[++i], &tempFloat)) { cerr << "ERROR: Invalid floating-point value following option " << argv[i - 1] << ": " << argv[i] << endl; exit(1); } else { if (tempFloat < 0) { cerr << "ERROR: For option " << argv[i - 1] << ", floating-point value must be positive." << endl; exit(1); } else GAPOPEN = tempFloat; } } else { cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl; exit(1); } } // gap extension penalty else if (!strcmp(argv[i], "-ge") || !strcmp(argv[i], "--gap-extension")) { if (i < argc - 1) { if (!GetFloat(argv[++i], &tempFloat)) { cerr << "ERROR: Invalid floating-point value following option " << argv[i - 1] << ": " << argv[i] << endl; exit(1); } else { if (tempFloat < 0) { cerr << "ERROR: For option " << argv[i - 1] << ", floating-point value must be positive." 
<< endl; exit(1); } else GAPEXT = tempFloat; } } else { cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl; exit(1); } } // feeds the java gui else if (!strcmp (argv[i], "-gui")){ flag_gui=1; enableAnnotation = true; } // add pp score seq to output alignment else if (!strcmp (argv[i], "-showPP")){ flag_ppscore=1; enableAnnotation = true; } // generate column scores else if (!strcmp (argv[i], "-columnscore")){ enableAnnotation = true; } // clustalw output format else if (!strcmp(argv[i], "-clustalw")) { enableClustalWOutput = true; } // verbose reporting else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--verbose")) { enableVerbose = true; } // alignment order else if (!strcmp(argv[i], "-a") || !strcmp(argv[i], "--alignment-order")) { enableAlignOrder = true; } else if (!strcmp(argv[i], "-T") || !strcmp(argv[i], "--temperature")) { if (i < argc - 1) { if (!GetFloat(argv[++i], &tempFloat)) { cerr << "ERROR: Invalid floating-point value following option " << argv[i - 1] << ": " << argv[i] << endl; exit(1); } else { if (tempFloat == 0) { cerr << "ERROR: Non-Zero Integer expected for option " << argv[i] << endl; exit(1); } else TEMPERATURE = tempFloat; } } else { cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl; exit(1); } } //matrix filenames are read by this option else if (!strcmp(argv[i], "-score_matrix") || !strcmp(argv[i], "--score_matrix")) { if (i < argc - 1) { strcpy(matrixtype, argv[++i]); } else { cerr << "ERROR: Value expected for option " << argv[i] << endl; exit(1); } } // bad arguments else { cerr << "ERROR: Unrecognized option: " << argv[i] << endl; exit(1); } } else { sequenceNames.push_back(string(argv[i])); } } return sequenceNames; } ///////////////////////////////////////////////////////////////// // ProcessTree() // // Process the tree recursively. Returns the aligned sequences // corresponding to a node or leaf of the tree. 
///////////////////////////////////////////////////////////////// MultiSequence *ProcessTree(const TreeNode * tree, MultiSequence * sequences, const SafeVector < SafeVector < SparseMatrix * > >&sparseMatrices) { MultiSequence *result; // check if this is a node of the alignment tree if (tree->GetSequenceLabel() == -1) { MultiSequence *alignLeft = ProcessTree(tree->GetLeftChild(), sequences, sparseMatrices); MultiSequence *alignRight = ProcessTree(tree->GetRightChild(), sequences, sparseMatrices); assert(alignLeft); assert(alignRight); result = AlignAlignments(alignLeft, alignRight, sparseMatrices); assert(result); delete alignLeft; delete alignRight; } // otherwise, this is a leaf of the alignment tree else { result = new MultiSequence(); assert(result); result->AddSequence(sequences-> GetSequence(tree->GetSequenceLabel())-> Clone()); } return result; } ///////////////////////////////////////////////////////////////// // ComputeFinalAlignment() // // Compute the final alignment by calling ProcessTree(), then // performing iterative refinement as needed. ///////////////////////////////////////////////////////////////// MultiSequence *ComputeFinalAlignment(const TreeNode * tree, MultiSequence * sequences, const SafeVector < SafeVector < SparseMatrix * > >&sparseMatrices) { MultiSequence *alignment = ProcessTree(tree, sequences, sparseMatrices); if (enableAlignOrder) { alignment->SaveOrdering(); enableAlignOrder = false; } // iterative refinement for (int i = 0; i < numIterativeRefinementReps; i++) DoIterativeRefinement(sparseMatrices, alignment); cerr << endl; // return final alignment return alignment; } ///////////////////////////////////////////////////////////////// // AlignAlignments() // // Returns the alignment of two MultiSequence objects. 
/////////////////////////////////////////////////////////////////

// align1, align2  the two groups of already-aligned sequences to merge
// sparseMatrices  pairwise posterior matrices, passed to BuildPosterior()
//
// Returns a newly allocated MultiSequence holding every sequence of
// align1 and align2 with gaps inserted; the inputs are not deleted here.
MultiSequence *AlignAlignments(MultiSequence * align1, MultiSequence * align2, const SafeVector < SafeVector < SparseMatrix * > >&sparseMatrices)
{
  // print some info about the alignment
  if (enableVerbose) {
    for (int i = 0; i < align1->GetNumSequences(); i++)
      cerr << ((i == 0) ? "[" : ",") << align1->GetSequence(i)-> GetLabel();
    cerr << "] vs. ";
    for (int i = 0; i < align2->GetNumSequences(); i++)
      cerr << ((i == 0) ? "[" : ",") << align2->GetSequence(i)-> GetLabel();
    cerr << "]: ";
  }

  // combined posterior matrix for the two groups (deleted below)
  VF *posterior = BuildPosterior(align1, align2, sparseMatrices, cutoff);
  pair < SafeVector < char >*, float >alignment;

  // choose the alignment routine depending on the "cosmetic" gap penalties used
  if (gapOpenPenalty == 0 && gapContinuePenalty == 0)
    alignment = ComputeAlignment(align1->GetSequence(0)->GetLength(), align2->GetSequence(0)->GetLength(), *posterior);
  else
    alignment = ComputeAlignmentWithGapPenalties(align1, align2, *posterior, align1->GetNumSequences(), align2->GetNumSequences(), gapOpenPenalty, gapContinuePenalty);

  delete posterior;

  if (enableVerbose) {
    // compute total length of sequences
    // (sum, over every cross-group pair, of the shorter sequence length)
    int totLength = 0;
    for (int i = 0; i < align1->GetNumSequences(); i++)
      for (int j = 0; j < align2->GetNumSequences(); j++)
        totLength += min(align1->GetSequence(i)->GetLength(), align2->GetSequence(j)->GetLength());

    // give an "accuracy" measure for the alignment
    cerr << alignment.second / totLength << endl;
  }

  // now build final alignment
  // ('X' is passed for sequences of align1 and 'Y' for those of align2)
  MultiSequence *result = new MultiSequence();
  for (int i = 0; i < align1->GetNumSequences(); i++)
    result->AddSequence(align1->GetSequence(i)-> AddGaps(alignment.first, 'X'));
  for (int i = 0; i < align2->GetNumSequences(); i++)
    result->AddSequence(align2->GetSequence(i)-> AddGaps(alignment.first, 'Y'));
  if (!enableAlignOrder)
    result->SortByLabel();

  // free temporary alignment
  delete alignment.first;

  return result;
}

/////////////////////////////////////////////////////////////////
// DoRelaxation()
//
// Performs one round of the consistency transformation.  The
// formula used is:
//                     1
//    P'(x[i]-y[j]) = ---  sum   sum P(x[i]-z[k]) P(z[k]-y[j])
//                    |S| z in S  k
//
// where S = {x, y, all other sequences...}
//
// Returns a fresh numSeqs x numSeqs table in which entry [i][j]
// (i < j) is a newly allocated SparseMatrix and entry [j][i] is
// NULL.  The input matrices are read but not deleted here.
/////////////////////////////////////////////////////////////////

SafeVector < SafeVector < SparseMatrix * > >DoRelaxation(MultiSequence * sequences, SafeVector < SafeVector < SparseMatrix * > >&sparseMatrices)
{
  const int numSeqs = sequences->GetNumSequences();

  SafeVector < SafeVector < SparseMatrix * > >newSparseMatrices(numSeqs, SafeVector < SparseMatrix * >(numSeqs, NULL));

  // for every pair of sequences
  for (int i = 0; i < numSeqs; i++) {
    for (int j = i + 1; j < numSeqs; j++) {
      Sequence *seq1 = sequences->GetSequence(i);
      Sequence *seq2 = sequences->GetSequence(j);

      if (enableVerbose)
        cerr << "Relaxing (" << i + 1 << ") " << seq1->GetHeader() << " vs. " << "(" << j + 1 << ") " << seq2->GetHeader() << ": ";

      // get the original posterior matrix
      // (a dense working copy; it is deleted at the end of this iteration)
      VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior();
      assert(posteriorPtr);
      VF & posterior = *posteriorPtr;

      const int seq1Length = seq1->GetLength();
      const int seq2Length = seq2->GetLength();

      // contribution from the summation where z = x and z = y
      // (each of the two terms equals the original value, hence the doubling)
      for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++)
        posterior[k] += posterior[k];

      if (enableVerbose)
        cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";

      // contribution from all other sequences
      // Only entries [a][b] with a < b are dereferenced, so the three
      // cases below depend on where k falls relative to i and j.
      for (int k = 0; k < numSeqs; k++)
        if (k != i && k != j) {
          // k < i < j: both matrices have z as their first sequence
          if (k < i)
            Relax1(sparseMatrices[k][i], sparseMatrices[k][j], posterior);
          // i < k < j: matrices are already oriented as (x,z) and (z,y)
          else if (k > i && k < j)
            Relax(sparseMatrices[i][k], sparseMatrices[k][j], posterior);
          // i < j < k: the (y,z) matrix is transposed to get (z,y)
          else {
            SparseMatrix *temp = sparseMatrices[j][k]->ComputeTranspose();
            Relax(sparseMatrices[i][k], temp, posterior);
            delete temp;
          }
        }

      // now renormalization
      for (int k = 0; k < (seq1Length + 1) * (seq2Length + 1); k++)
        posterior[k] /= numSeqs;

      // mask out positions not originally in the posterior matrix
      SparseMatrix *matXY = sparseMatrices[i][j];

      // row 0 is cleared entirely
      for (int y = 0; y <= seq2Length; y++)
        posterior[y] = 0;

      for (int x = 1; x <= seq1Length; x++) {
        SafeVector < PIF >::iterator XYptr = matXY->GetRowPtr(x);
        SafeVector < PIF >::iterator XYend = XYptr + matXY->GetRowSize(x);
        VF::iterator base = posterior.begin() + x * (seq2Length + 1);
        int curr = 0;
        // NOTE(review): this walk relies on the row entries being in
        // increasing column order -- confirm against SparseMatrix
        while (XYptr != XYend) {
          // zero out all cells until the first filled column
          while (curr < XYptr->first) {
            base[curr] = 0;
            curr++;
          }
          // now, skip over this column
          curr++;
          ++XYptr;
        }

        // zero out cells after last column
        while (curr <= seq2Length) {
          base[curr] = 0;
          curr++;
        }
      }

      // save the new posterior matrix
      newSparseMatrices[i][j] = new SparseMatrix(seq1->GetLength(), seq2->GetLength(), posterior);
      newSparseMatrices[j][i] = NULL;

      if (enableVerbose)
        cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";

      delete posteriorPtr;

      if (enableVerbose)
        cerr << "done." << endl;
    }
  }

  return newSparseMatrices;
}

/////////////////////////////////////////////////////////////////
// Relax()
//
// Computes the consistency transformation for a single sequence
// z, and adds the transformed matrix to "posterior".
///////////////////////////////////////////////////////////////// void Relax(SparseMatrix * matXZ, SparseMatrix * matZY, VF & posterior) { assert(matXZ); assert(matZY); int lengthX = matXZ->GetSeq1Length(); int lengthY = matZY->GetSeq2Length(); assert(matXZ->GetSeq2Length() == matZY->GetSeq1Length()); // for every x[i] for (int i = 1; i <= lengthX; i++) { SafeVector < PIF >::iterator XZptr = matXZ->GetRowPtr(i); SafeVector < PIF >::iterator XZend = XZptr + matXZ->GetRowSize(i); VF::iterator base = posterior.begin() + i * (lengthY + 1); // iterate through all x[i]-z[k] while (XZptr != XZend) { SafeVector < PIF >::iterator ZYptr = matZY->GetRowPtr(XZptr->first); SafeVector < PIF >::iterator ZYend = ZYptr + matZY->GetRowSize(XZptr->first); const float XZval = XZptr->second; // iterate through all z[k]-y[j] while (ZYptr != ZYend) { base[ZYptr->first] += XZval * ZYptr->second; ZYptr++; } XZptr++; } } } ///////////////////////////////////////////////////////////////// // Relax1() // // Computes the consistency transformation for a single sequence // z, and adds the transformed matrix to "posterior". 
///////////////////////////////////////////////////////////////// void Relax1(SparseMatrix * matZX, SparseMatrix * matZY, VF & posterior) { assert(matZX); assert(matZY); int lengthZ = matZX->GetSeq1Length(); int lengthY = matZY->GetSeq2Length(); // for every z[k] for (int k = 1; k <= lengthZ; k++) { SafeVector < PIF >::iterator ZXptr = matZX->GetRowPtr(k); SafeVector < PIF >::iterator ZXend = ZXptr + matZX->GetRowSize(k); // iterate through all z[k]-x[i] while (ZXptr != ZXend) { SafeVector < PIF >::iterator ZYptr = matZY->GetRowPtr(k); SafeVector < PIF >::iterator ZYend = ZYptr + matZY->GetRowSize(k); const float ZXval = ZXptr->second; VF::iterator base = posterior.begin() + ZXptr->first * (lengthY + 1); // iterate through all z[k]-y[j] while (ZYptr != ZYend) { base[ZYptr->first] += ZXval * ZYptr->second; ZYptr++; } ZXptr++; } } } ///////////////////////////////////////////////////////////////// // GetSubtree // // Returns set containing all leaf labels of the current subtree. ///////////////////////////////////////////////////////////////// set < int >GetSubtree(const TreeNode * tree) { set < int >s; if (tree->GetSequenceLabel() == -1) { s = GetSubtree(tree->GetLeftChild()); set < int >t = GetSubtree(tree->GetRightChild()); for (set < int >::iterator iter = t.begin(); iter != t.end(); ++iter) s.insert(*iter); } else { s.insert(tree->GetSequenceLabel()); } return s; } ///////////////////////////////////////////////////////////////// // TreeBasedBiPartitioning // // Uses the iterative refinement scheme from MUSCLE. 
///////////////////////////////////////////////////////////////// void TreeBasedBiPartitioning(const SafeVector < SafeVector < SparseMatrix * > >&sparseMatrices, MultiSequence * &alignment, const TreeNode * tree) { // check if this is a node of the alignment tree if (tree->GetSequenceLabel() == -1) { TreeBasedBiPartitioning(sparseMatrices, alignment, tree->GetLeftChild()); TreeBasedBiPartitioning(sparseMatrices, alignment, tree->GetRightChild()); set < int >leftSubtree = GetSubtree(tree->GetLeftChild()); set < int >rightSubtree = GetSubtree(tree->GetRightChild()); set < int >leftSubtreeComplement, rightSubtreeComplement; // calculate complement of each subtree for (int i = 0; i < alignment->GetNumSequences(); i++) { if (leftSubtree.find(i) == leftSubtree.end()) leftSubtreeComplement.insert(i); if (rightSubtree.find(i) == rightSubtree.end()) rightSubtreeComplement.insert(i); } // perform realignments for edge to left child if (!leftSubtree.empty() && !leftSubtreeComplement.empty()) { MultiSequence *groupOneSeqs = alignment->Project(leftSubtree); assert(groupOneSeqs); MultiSequence *groupTwoSeqs = alignment->Project(leftSubtreeComplement); assert(groupTwoSeqs); delete alignment; alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices); } // perform realignments for edge to right child if (!rightSubtree.empty() && !rightSubtreeComplement.empty()) { MultiSequence *groupOneSeqs = alignment->Project(rightSubtree); assert(groupOneSeqs); MultiSequence *groupTwoSeqs = alignment->Project(rightSubtreeComplement); assert(groupTwoSeqs); delete alignment; alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices); } } } ///////////////////////////////////////////////////////////////// // DoIterativeRefinement() // // Performs a single round of randomized partionining iterative // refinement. 
///////////////////////////////////////////////////////////////// void DoIterativeRefinement(const SafeVector < SafeVector < SparseMatrix * > >&sparseMatrices, MultiSequence * &alignment) { set < int >groupOne, groupTwo; // create two separate groups for (int i = 0; i < alignment->GetNumSequences(); i++) { if (rand() % 2) groupOne.insert(i); else groupTwo.insert(i); } if (groupOne.empty() || groupTwo.empty()) return; // project into the two groups MultiSequence *groupOneSeqs = alignment->Project(groupOne); assert(groupOneSeqs); MultiSequence *groupTwoSeqs = alignment->Project(groupTwo); assert(groupTwoSeqs); delete alignment; // realign alignment = AlignAlignments(groupOneSeqs, groupTwoSeqs, sparseMatrices); delete groupOneSeqs; delete groupTwoSeqs; } ///////////////////////////////////////////////////////////////// // WriteAnnotation() // // Computes annotation for multiple alignment and write values // to a file. ///////////////////////////////////////////////////////////////// void WriteAnnotation(MultiSequence * alignment, const SafeVector < SafeVector < SparseMatrix * > >&sparseMatrices) { float probprodct=0; const int alignLength = alignment->GetSequence(0)->GetLength(); const int numSeqs = alignment->GetNumSequences(); int i,j; SafeVector < int >position(numSeqs, 0); SafeVector < SafeVector < char >::iterator > seqs(numSeqs); for (i = 0; i < numSeqs; i++) seqs[i] = alignment->GetSequence(i)->GetDataPtr(); SafeVector < pair < int, int > >active; active.reserve(numSeqs); column=new columnReliability[alignLength+1]; column[0].columnNo=0; if(flag_ppscore==1) { cout<<"> Posterior_Probabilities"<= 1) { probprodct = 0.99999; } cout< >&active, const SafeVector < SafeVector < SparseMatrix * > >&sparseMatrices) { if (active.size() <= 1) return 0; // ALTERNATIVE #1: Compute the average alignment score. 
float prob_product=0; for (int i = 0; i < (int) active.size(); i++) { for (int j = i + 1; j < (int) active.size(); j++) { prob_product+= sparseMatrices[active[i].first][active[j].first]-> GetValue(active[i].second, active[j].second); if(enableVerbose) printf("%d-%d %d-%d %1.3f %f\n",active[i].first,active[i].second,active[j].first,active[j].second,sparseMatrices[active[i].first][active[j].first]->GetValue(active[i].second, active[j].second), prob_product); } } if(enableVerbose) printf("active size= %d \n",(int)active.size()); return 2*prob_product/((int) active.size() * ((int) active.size() - 1)); } probalign1.4/README0000755013460500000360000000516711472772647012653 0ustar usmanafsPROBALIGN Version 1.4 (Nov 2010) Written by Satish Chikkagoudar and Usman Roshan using PROBCONS 1.1 code (written by Chuong Do) and based upon probA (written by Ulrike Muckstein). ----------------------------------------------------------------- PROBALIGN has been made freely available as PUBLIC DOMAIN software and hence is not subject to copyright in the United States. This system and/or any portion of the source code may be used, modified, or redistributed without restrictions. PROBALIGN is distributed WITHOUT WARRANTY, express or implied. The authors accept NO LEGAL LIABILITY OR RESPONSIBILITY for loss due to reliance on the program. ----------------------------------------------------------------- PROBALIGN aligns inputted sequences and displays the output to the screen. USAGE: ------ ./probalign [file containing sequences in MFA format] [-T temperature] [-v|--verbose] [-a|--alignment-order] [-clustalw] [-prot|-nuc] [-go|--gap-open gap_open] [-ge|--gap-extension gap_ext] [-score_matrix scoring_matrix] OPTION DETAILS: --------------- -T This option is used to specify the thermodynamic temperature. ( Default: 5 when -prot option is used: 5 when -nuc option is used : 1 ). -score_matrix This option can be used to specify the scoring matrix. 
The values that can be entered under this option are: gonnet_160 = Gonnet 160 [DEFAULT] nuc_simple = Identity nucleotide scoring matrix [DEFAULT for RNA/DNA] -clustalw use CLUSTALW output format instead of MFA -v, --verbose report progress while aligning (default: off) -a, --alignment-order print sequences in alignment order rather than input order (default: off) -go, --gap-open This option can be used to specify the gap open parameter. Default for Gonnet 160 (protein) is 22 and nucleotide (simple matrix) is 4. -ge, --gap-extension This option can be used to specify the gap extension parameter. Default for Gonnet 160 (protein) is 1 and nucleotide (simple matrix) is 0.25. -nuc Specify this option to indicate that inputted sequences are nucleotide sequences. -prot Specify this option to indicate that inputted sequences are protein sequences [DEFAULT MODE] -showPP Outputs the posterior probabilities of alignment columns as a new sequence named Posterior_Probabilities (the probability values are scaled to integers between 0 and 9). probalign1.4/RELEASE_NOTES0000755013460500000360000000124111472775656013736 0ustar usmanafsCode last updated Nov 2010 by Usman Roshan and Meera Prasad Code last updated Nov 9 2006 420 PM Code last checked on Cipres10, Oct 25 Testing on RV11/12 DONE indenting to be done using command: indent -kr -bl -nce -bli 0 bug associated with 0th row/ 0th column of posterior resolved on sept 16 (1510) removed external matrix file reading module sept 17(0100) integration with Dr.
Usman's code done on oct 19 for scol scoring module Implemented a low memory revpart function that also computes probabilities while eliminating redundancies in earlier module added -showPP option to print new sequence of probability values added -gui to support java gui probalign1.4/ComputeAlignment.cc0000755013460500000360000003156710532124320015527 0ustar usmanafs///////////////////////////////////////////////////////////////// // ComputeAlignment.cc // // Routines for (1) maximum weight trace alignment // ///////////////////////////////////////////////////////////////// #include #include #include "SafeVector.h" #include "SparseMatrix.h" #include "MultiSequence.h" #include #include #include using namespace std; const float LOG_ZERO = -2e20; const float LOG_ONE = 0.0; ///////////////////////////////////////////////////////////////// // ChooseBestOfThree() // // Store the largest of three values x1, x2, and x3 in *x. Also // if xi is the largest value, then store bi in *b. ///////////////////////////////////////////////////////////////// inline void ChooseBestOfThree(float x1, float x2, float x3, char b1, char b2, char b3, float *x, char *b) { if (x1 >= x2) { if (x1 >= x3) { *x = x1; *b = b1; return; } *x = x3; *b = b3; return; } if (x2 >= x3) { *x = x2; *b = b2; return; } *x = x3; *b = b3; } ///////////////////////////////////////////////////////////////// // ComputeAlignment() // // Computes an alignment based on the given posterior matrix. // This is done by finding the maximum summing path (or // maximum weight trace) through the posterior matrix. The // final alignment is returned as a pair consisting of: // (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and // denote insertions in one of the two sequences and // B's denote that both sequences are present (i.e. // matches). 
// (2) a float indicating the sum achieved ///////////////////////////////////////////////////////////////// pair < SafeVector < char >*, float >ComputeAlignment(int seq1Length, int seq2Length, const VF & posterior) { float *twoRows = new float[(seq2Length + 1) * 2]; assert(twoRows); float *oldRow = twoRows; float *newRow = twoRows + seq2Length + 1; char *tracebackMatrix = new char[(seq1Length + 1) * (seq2Length + 1)]; assert(tracebackMatrix); char *tracebackPtr = tracebackMatrix; VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1; // initialization for (int i = 0; i <= seq2Length; i++) { oldRow[i] = 0; *(tracebackPtr++) = 'L'; } // fill in matrix for (int i = 1; i <= seq1Length; i++) { // initialize left column newRow[0] = 0; posteriorPtr++; *(tracebackPtr++) = 'U'; // fill in rest of row for (int j = 1; j <= seq2Length; j++) { ChooseBestOfThree(*(posteriorPtr++) + oldRow[j - 1], newRow[j - 1], oldRow[j], 'D', 'L', 'U', &newRow[j], tracebackPtr++); } // swap rows float *temp = oldRow; oldRow = newRow; newRow = temp; } // store best score float total = oldRow[seq2Length]; delete[]twoRows; // compute traceback SafeVector < char >*alignment = new SafeVector < char >; assert(alignment); int r = seq1Length, c = seq2Length; while (r != 0 || c != 0) { char ch = tracebackMatrix[r * (seq2Length + 1) + c]; switch (ch) { case 'L': c--; alignment->push_back('Y'); break; case 'U': r--; alignment->push_back('X'); break; case 'D': c--; r--; alignment->push_back('B'); break; default: assert(false); } } delete[]tracebackMatrix; reverse(alignment->begin(), alignment->end()); return make_pair(alignment, total); } ///////////////////////////////////////////////////////////////// // ComputeAlignmentWithGapPenalties() // // Similar to ComputeAlignment() except with gap penalties. 
///////////////////////////////////////////////////////////////// pair < SafeVector < char >*, float >ComputeAlignmentWithGapPenalties(MultiSequence * align1, MultiSequence * align2, const VF & posterior, int numSeqs1, int numSeqs2, float gapOpenPenalty, float gapContinuePenalty) { int seq1Length = align1->GetSequence(0)->GetLength(); int seq2Length = align2->GetSequence(0)->GetLength(); SafeVector < SafeVector < char >::iterator > dataPtrs1(align1->GetNumSequences()); SafeVector < SafeVector < char >::iterator > dataPtrs2(align2->GetNumSequences()); // grab character data for (int i = 0; i < align1->GetNumSequences(); i++) dataPtrs1[i] = align1->GetSequence(i)->GetDataPtr(); for (int i = 0; i < align2->GetNumSequences(); i++) dataPtrs2[i] = align2->GetSequence(i)->GetDataPtr(); // the number of active sequences at any given column is defined to be the // number of non-gap characters in that column; the number of gap opens at // any given column is defined to be the number of gap characters in that // column where the previous character in the respective sequence was not // a gap SafeVector < int >numActive1(seq1Length + 1), numGapOpens1(seq1Length + 1); SafeVector < int >numActive2(seq2Length + 1), numGapOpens2(seq2Length + 1); // compute number of active sequences and gap opens for each group for (int i = 0; i < align1->GetNumSequences(); i++) { SafeVector < char >::iterator dataPtr = align1->GetSequence(i)->GetDataPtr(); numActive1[0] = numGapOpens1[0] = 0; for (int j = 1; j <= seq1Length; j++) { if (dataPtr[j] != '-') { numActive1[j]++; numGapOpens1[j] += (j != 1 && dataPtr[j - 1] != '-'); } } } for (int i = 0; i < align2->GetNumSequences(); i++) { SafeVector < char >::iterator dataPtr = align2->GetSequence(i)->GetDataPtr(); numActive2[0] = numGapOpens2[0] = 0; for (int j = 1; j <= seq2Length; j++) { if (dataPtr[j] != '-') { numActive2[j]++; numGapOpens2[j] += (j != 1 && dataPtr[j - 1] != '-'); } } } VVF openingPenalty1(numSeqs1 + 1, VF(numSeqs2 + 1)); VF 
continuingPenalty1(numSeqs1 + 1); VVF openingPenalty2(numSeqs1 + 1, VF(numSeqs2 + 1)); VF continuingPenalty2(numSeqs2 + 1); // precompute penalties for (int i = 0; i <= numSeqs1; i++) for (int j = 0; j <= numSeqs2; j++) openingPenalty1[i][j] = i * (gapOpenPenalty * j + gapContinuePenalty * (numSeqs2 - j)); for (int i = 0; i <= numSeqs1; i++) continuingPenalty1[i] = i * gapContinuePenalty * numSeqs2; for (int i = 0; i <= numSeqs2; i++) for (int j = 0; j <= numSeqs1; j++) openingPenalty2[i][j] = i * (gapOpenPenalty * j + gapContinuePenalty * (numSeqs1 - j)); for (int i = 0; i <= numSeqs2; i++) continuingPenalty2[i] = i * gapContinuePenalty * numSeqs1; float *twoRows = new float[6 * (seq2Length + 1)]; assert(twoRows); float *oldRowMatch = twoRows; float *newRowMatch = twoRows + (seq2Length + 1); float *oldRowInsertX = twoRows + 2 * (seq2Length + 1); float *newRowInsertX = twoRows + 3 * (seq2Length + 1); float *oldRowInsertY = twoRows + 4 * (seq2Length + 1); float *newRowInsertY = twoRows + 5 * (seq2Length + 1); char *tracebackMatrix = new char[3 * (seq1Length + 1) * (seq2Length + 1)]; assert(tracebackMatrix); char *tracebackPtr = tracebackMatrix; VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1; // initialization for (int i = 0; i <= seq2Length; i++) { oldRowMatch[i] = oldRowInsertX[i] = (i == 0) ? 0 : LOG_ZERO; oldRowInsertY[i] = (i == 0) ? 
0 : oldRowInsertY[i - 1] + continuingPenalty2[numActive2[i]]; *(tracebackPtr) = *(tracebackPtr + 1) = *(tracebackPtr + 2) = 'Y'; tracebackPtr += 3; } // fill in matrix for (int i = 1; i <= seq1Length; i++) { // initialize left column newRowMatch[0] = newRowInsertY[0] = LOG_ZERO; newRowInsertX[0] = oldRowInsertX[0] + continuingPenalty1[numActive1[i]]; posteriorPtr++; *(tracebackPtr) = *(tracebackPtr + 1) = *(tracebackPtr + 2) = 'X'; tracebackPtr += 3; // fill in rest of row for (int j = 1; j <= seq2Length; j++) { // going to MATCH state ChooseBestOfThree(oldRowMatch[j - 1], oldRowInsertX[j - 1], oldRowInsertY[j - 1], 'M', 'X', 'Y', &newRowMatch[j], tracebackPtr++); newRowMatch[j] += *(posteriorPtr++); // going to INSERT X state ChooseBestOfThree(oldRowMatch[j] + openingPenalty1[numActive1[i]][numGapOpens2 [j]], oldRowInsertX[j] + continuingPenalty1[numActive1[i]], oldRowInsertY[j] + openingPenalty1[numActive1[i]][numGapOpens2 [j]], 'M', 'X', 'Y', &newRowInsertX[j], tracebackPtr++); // going to INSERT Y state ChooseBestOfThree(newRowMatch[j - 1] + openingPenalty2[numActive2[j]][numGapOpens1 [i]], newRowInsertX[j - 1] + openingPenalty2[numActive2[j]][numGapOpens1 [i]], newRowInsertY[j - 1] + continuingPenalty2[numActive2[j]], 'M', 'X', 'Y', &newRowInsertY[j], tracebackPtr++); } // swap rows float *temp; temp = oldRowMatch; oldRowMatch = newRowMatch; newRowMatch = temp; temp = oldRowInsertX; oldRowInsertX = newRowInsertX; newRowInsertX = temp; temp = oldRowInsertY; oldRowInsertY = newRowInsertY; newRowInsertY = temp; } // store best score float total; char matrix; ChooseBestOfThree(oldRowMatch[seq2Length], oldRowInsertX[seq2Length], oldRowInsertY[seq2Length], 'M', 'X', 'Y', &total, &matrix); delete[]twoRows; // compute traceback SafeVector < char >*alignment = new SafeVector < char >; assert(alignment); int r = seq1Length, c = seq2Length; while (r != 0 || c != 0) { int offset = (matrix == 'M') ? 0 : (matrix == 'X') ? 
1 : 2; char ch = tracebackMatrix[(r * (seq2Length + 1) + c) * 3 + offset]; switch (matrix) { case 'Y': c--; alignment->push_back('Y'); break; case 'X': r--; alignment->push_back('X'); break; case 'M': c--; r--; alignment->push_back('B'); break; default: assert(false); } matrix = ch; } delete[]tracebackMatrix; reverse(alignment->begin(), alignment->end()); return make_pair(alignment, 1.0f); } ///////////////////////////////////////////////////////////////// // BuildPosterior() // // Builds a posterior probability matrix needed to align a pair // of alignments. Mathematically, the returned matrix M is // defined as follows: // M[i,j] = sum sum f(s,t,i,j) // s in align1 t in align2 // where // [ P(s[i'] <--> t[j']) // [ if s[i'] is a letter in the ith column of align1 and // [ t[j'] it a letter in the jth column of align2 // f(s,t,i,j) = [ // [ 0 otherwise // ///////////////////////////////////////////////////////////////// VF *BuildPosterior(MultiSequence * align1, MultiSequence * align2, const SafeVector < SafeVector < SparseMatrix * > >&sparseMatrices, float cutoff = 0.0f) { const int seq1Length = align1->GetSequence(0)->GetLength(); const int seq2Length = align2->GetSequence(0)->GetLength(); VF *posteriorPtr = new VF((seq1Length + 1) * (seq2Length + 1), 0); assert(posteriorPtr); VF & posterior = *posteriorPtr; // VF::iterator postPtr = posterior.begin(); // for each s in align1 for (int i = 0; i < align1->GetNumSequences(); i++) { int first = align1->GetSequence(i)->GetLabel(); SafeVector < int >*mapping1 = align1->GetSequence(i)->GetMapping(); // for each t in align2 for (int j = 0; j < align2->GetNumSequences(); j++) { int second = align2->GetSequence(j)->GetLabel(); SafeVector < int >*mapping2 = align2->GetSequence(j)->GetMapping(); if (first < second) { // get the associated sparse matrix SparseMatrix *matrix = sparseMatrices[first][second]; for (int ii = 1; ii <= matrix->GetSeq1Length(); ii++) { SafeVector < PIF >::iterator row = matrix->GetRowPtr(ii); int 
base = (*mapping1)[ii] * (seq2Length + 1); int rowSize = matrix->GetRowSize(ii); // add in all relevant values for (int jj = 0; jj < rowSize; jj++) posterior[base + (*mapping2)[row[jj].first]] += row[jj].second; // subtract cutoff for (int jj = 0; jj < matrix->GetSeq2Length(); jj++) posterior[base + (*mapping2)[jj]] -= cutoff; } } else { // get the associated sparse matrix SparseMatrix *matrix = sparseMatrices[second][first]; for (int jj = 1; jj <= matrix->GetSeq1Length(); jj++) { SafeVector < PIF >::iterator row = matrix->GetRowPtr(jj); int base = (*mapping2)[jj]; int rowSize = matrix->GetRowSize(jj); // add in all relevant values for (int ii = 0; ii < rowSize; ii++) posterior[base + (*mapping1)[row[ii].first] * (seq2Length + 1)] += row[ii].second; // subtract cutoff for (int ii = 0; ii < matrix->GetSeq2Length(); ii++) posterior[base + (*mapping1)[ii] * (seq2Length + 1)] -= cutoff; } } delete mapping2; } delete mapping1; } return posteriorPtr; } probalign1.4/EvolutionaryTree.h0000755013460500000360000001364410532124320015432 0ustar usmanafs///////////////////////////////////////////////////////////////// // EvolutionaryTree.h // // Utilities for reading/writing multiple sequence data. ///////////////////////////////////////////////////////////////// #ifndef EVOLUTIONARYTREE_H #define EVOLUTIONARYTREE_H #include #include #include #include "SafeVector.h" #include "MultiSequence.h" #include "Sequence.h" using namespace std; ///////////////////////////////////////////////////////////////// // TreeNode // // The fundamental unit for representing an alignment tree. The // guide tree is represented as a binary tree. 
///////////////////////////////////////////////////////////////// class TreeNode { int sequenceLabel; // sequence label TreeNode *left, *right, *parent; // pointers to left, right children ///////////////////////////////////////////////////////////////// // TreeNode::PrintNode() // // Internal routine used to print out the sequence comments // associated with the evolutionary tree, using a hierarchical // parenthesized format. ///////////////////////////////////////////////////////////////// void PrintNode (ostream &outfile, const MultiSequence *sequences) const { // if this is a leaf node, print out the associated sequence comment if (sequenceLabel >= 0) outfile << sequences->GetSequence (sequenceLabel)->GetHeader(); // otherwise, it must have two children; print out their subtrees recursively else { assert (left); assert (right); outfile << "("; left->PrintNode (outfile, sequences); outfile << " "; right->PrintNode (outfile, sequences); outfile << ")"; } } public: ///////////////////////////////////////////////////////////////// // TreeNode::TreeNode() // // Constructor for a tree node. Note that sequenceLabel = -1 // implies that the current node is not a leaf in the tree. ///////////////////////////////////////////////////////////////// TreeNode (int sequenceLabel) : sequenceLabel (sequenceLabel), left (NULL), right (NULL), parent (NULL) { assert (sequenceLabel >= -1); } ///////////////////////////////////////////////////////////////// // TreeNode::~TreeNode() // // Destructor for a tree node. Recursively deletes all children. 
///////////////////////////////////////////////////////////////// ~TreeNode (){ if (left){ delete left; left = NULL; } if (right){ delete right; right = NULL; } parent = NULL; } // getters int GetSequenceLabel () const { return sequenceLabel; } TreeNode *GetLeftChild () const { return left; } TreeNode *GetRightChild () const { return right; } TreeNode *GetParent () const { return parent; } // setters void SetSequenceLabel (int sequenceLabel){ this->sequenceLabel = sequenceLabel; assert (sequenceLabel >= -1); } void SetLeftChild (TreeNode *left){ this->left = left; } void SetRightChild (TreeNode *right){ this->right = right; } void SetParent (TreeNode *parent){ this->parent = parent; } ///////////////////////////////////////////////////////////////// // TreeNode::ComputeTree() // // Routine used to compute an evolutionary tree based on the // given distance matrix. We assume the distance matrix has the // form, distMatrix[i][j] = expected accuracy of aligning i with j. ///////////////////////////////////////////////////////////////// static TreeNode *ComputeTree (const VVF &distMatrix){ int numSeqs = distMatrix.size(); // number of sequences in distance matrix VVF distances (numSeqs, VF (numSeqs)); // a copy of the distance matrix SafeVector nodes (numSeqs, NULL); // list of nodes for each sequence SafeVector valid (numSeqs, 1); // valid[i] tells whether or not the ith // nodes in the distances and nodes array // are valid // initialization: make a copy of the distance matrix for (int i = 0; i < numSeqs; i++) for (int j = 0; j < numSeqs; j++) distances[i][j] = distMatrix[i][j]; // initialization: create all the leaf nodes for (int i = 0; i < numSeqs; i++){ nodes[i] = new TreeNode (i); assert (nodes[i]); } // repeat until only a single node left for (int numNodesLeft = numSeqs; numNodesLeft > 1; numNodesLeft--){ float bestProb = -1; pair bestPair; // find the closest pair for (int i = 0; i < numSeqs; i++) if (valid[i]){ for (int j = i+1; j < numSeqs; j++) if 
(valid[j]){ if (distances[i][j] > bestProb){ bestProb = distances[i][j]; bestPair = make_pair(i, j); } } } // merge the closest pair TreeNode *newParent = new TreeNode (-1); newParent->SetLeftChild (nodes[bestPair.first]); newParent->SetRightChild (nodes[bestPair.second]); nodes[bestPair.first]->SetParent (newParent); nodes[bestPair.second]->SetParent (newParent); nodes[bestPair.first] = newParent; nodes[bestPair.second] = NULL; // now update the distance matrix for (int i = 0; i < numSeqs; i++) if (valid[i]){ distances[bestPair.first][i] = distances[i][bestPair.first] = (distances[i][bestPair.first] + distances[i][bestPair.second]) * bestProb / 2; } // finally, mark the second node entry as no longer valid valid[bestPair.second] = 0; } assert (nodes[0]); return nodes[0]; } ///////////////////////////////////////////////////////////////// // TreeNode::Print() // // Print out the subtree associated with this node in a // parenthesized representation. ///////////////////////////////////////////////////////////////// void Print (ostream &outfile, const MultiSequence *sequences) const { outfile << "Alignment tree: "; PrintNode (outfile, sequences); outfile << endl; } }; #endif probalign1.4/Makefile0000755013460500000360000000057211472772452013420 0ustar usmanafsCXX = g++ CXXFLAGS = -O3 -W -Wall -funroll-loops TARGET = probalign all : $(TARGET) probalign : MultiSequence.h Sequence.h FileBuffer.h SparseMatrix.h EvolutionaryTree.h SafeVector.h Main.cc Sequence.h PostProbs.cc ComputeAlignment.cc ReadMatrix.cc Matrix.h $(CXX) $(CXXFLAGS) -lm -o $(TARGET) Main.cc PostProbs.cc ComputeAlignment.cc ReadMatrix.cc clean: rm -f $(TARGET) probalign1.4/Matrix.h0000755013460500000360000001113011463614010013346 0ustar usmanafs///////////////////////////////////////////////////////////////// // Matrix.h // // Specifies scoring matrices and their structure // // // ///////////////////////////////////////////////////////////////// typedef struct{ char monomers[26]; /* amino or nucleic 
acid order */ float matrix[676]; /* entries of the score matix, 26*26=676 */ } score_matrix; //default protein sequence scoring matrix as well as default scoring matrix of the PROBALIGN //also used when -prot option is used score_matrix gonnet_160 = { "ABCDEFGHIKLMNPQRSTVWXYZ", { 4.6, 0.0, 0.0, 0.3, 0.0, 13.5, -1.1, 0.0, -5.3, 7.0, -0.4, 0.0, -5.2, 3.4, 5.9, -3.8, 0.0, -1.8, -7.0, -6.2, 9.1, 0.2, 0.0, -3.4, -0.7, -2.1, -7.6, 8.2, -1.8, 0.0, -2.3, -0.1, -0.1, -0.7, -2.7, 9.3, -1.8, 0.0, -2.5, -6.2, -4.3, 0.3, -7.0, -3.7, 5.9, -1.2, 0.0, -4.8, -0.1, 1.3, -5.3, -2.4, 0.2, -3.5, 5.5, -2.2, 0.0, -2.9, -6.5, -4.5, 1.9, -6.7, -3.2, 3.0, -3.4, 5.7, -1.2, 0.0, -1.9, -5.0, -3.1, 1.4, -5.2, -2.1, 2.9, -2.1, 3.4, 7.6, -1.2, 0.0, -3.1, 2.6, 0.5, -4.7, -0.2, 1.5, -4.4, 0.8, -4.8, -3.6, 6.5, -0.1, 0.0, -5.2, -1.9, -1.4, -5.8, -3.0, -2.2, -4.3, -1.6, -3.5, -4.2, -2.2, 9.6, -0.7, 0.0, -4.2, 0.6, 2.3, -4.1, -2.1, 1.7, -3.2, 2.0, -2.4, -1.2, 0.5, -0.8, 5.6, -1.6, 0.0, -3.5, -1.6, -0.3, -5.3, -2.1, 0.3, -4.1, 3.5, -3.5, -2.9, -0.4, -2.1, 1.7, 7.1, 1.6, 0.0, -0.2, 0.0, -0.3, -4.5, -0.1, -0.8, -3.3, -0.4, -3.6, -2.3, 1.1, 0.0, -0.2, -0.9, 4.4, 0.5, 0.0, -1.4, -0.6, -0.8, -3.6, -2.4, -0.8, -1.2, -0.2, -2.4, -1.1, 0.3, -0.4, -0.4, -0.9, 2.3, 5.0, 0.1, 0.0, -0.6, -4.9, -3.0, -0.8, -5.2, -3.5, 4.0, -3.0, 1.7, 1.4, -3.8, -3.2, -2.7, -3.4, -2.0, 0.0, 5.3, -5.5, 0.0, -2.1, -7.8, -6.4, 3.2, -5.5, -1.9, -3.4, -5.4, -2.0, -2.2, -5.5, -7.4, -4.0, -2.4, -4.7, -5.4, -4.5, 15.8, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -3.7, 0.0, -1.3, -4.2, -4.4, 5.6, -6.0, 2.7, -2.0, -3.5, -1.1, -1.3, -2.2, -4.8, -2.9, -2.9, -2.8, -3.2, -2.4, 3.8, 0.0, 10.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 } }; //default nucleotide sequence scoring matrix //used when -nuc option is used score_matrix nuc_simple= { "ABCDGHKMNRSTUVWXY", /* { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } */ { 5, 0, 0, -4, 0, 5, 0, 0, 0, 0, -4, 0, -4, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, 0, -4, 0, -4, 0, 0, 0, 0, 0, 0, 5, -4, 0, -4, 0, -4, 0, 0, 0, 0, 0, 0, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } //Ribosum85-60 /* { 2.22, 0, 0, -1.86, 0, 1.16, 0, 0, 0, 0, -1.46, 0, -2.48, 0, 1.03, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1.39, 0, -1.05, 0, -1.74, 0, 0, 0, 0, 0, 0, 1.65, -1.39, 0, -1.05, 0, -1.74, 0, 0, 0, 0, 0, 0, 0, 1.65, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } */ }; probalign1.4/MultiSequence.h0000755013460500000360000005106011472772563014715 0ustar usmanafs//////////////////////////////////////////////////////////////// // MultiSequence.h // // Utilities for reading/writing multiple sequence data. 
///////////////////////////////////////////////////////////////// #ifndef MULTISEQUENCE_H #define MULTISEQUENCE_H #include #include #include #include #include #include #include #include "SafeVector.h" #include "Sequence.h" #include "FileBuffer.h" ///////////////////////////////////////////////////////////////// // MultiSequence // // Class for multiple sequence alignment input/output. ///////////////////////////////////////////////////////////////// class MultiSequence { SafeVector *sequences; public: ///////////////////////////////////////////////////////////////// // MultiSequence::MultiSequence() // // Default constructor. ///////////////////////////////////////////////////////////////// MultiSequence () : sequences (NULL) {} ///////////////////////////////////////////////////////////////// // MultiSequence::MultiSequence() // // Constructor. Load MFA from a FileBuffer object. ///////////////////////////////////////////////////////////////// MultiSequence (FileBuffer &infile) : sequences (NULL) { LoadMFA (infile); } ///////////////////////////////////////////////////////////////// // MultiSequence::MultiSequence() // // Constructor. Load MFA from a filename. ///////////////////////////////////////////////////////////////// MultiSequence (const string &filename) : sequences (NULL){ LoadMFA (filename); } ///////////////////////////////////////////////////////////////// // MultiSequence::~MultiSequence() // // Destructor. Gets rid of sequence objects contained in the // multiple alignment. ///////////////////////////////////////////////////////////////// ~MultiSequence(){ // if sequences allocated if (sequences){ // free all sequences for (SafeVector::iterator iter = sequences->begin(); iter != sequences->end(); ++iter){ assert (*iter); delete *iter; *iter = NULL; } // free sequence vector delete sequences; sequences = NULL; } } ///////////////////////////////////////////////////////////////// // MultiSequence::LoadMFA() // // Load MFA from a filename. 
///////////////////////////////////////////////////////////////// void LoadMFA (const string &filename, bool stripGaps = false){ // try opening file FileBuffer infile (filename.c_str()); if (infile.fail()){ cerr << "ERROR: Could not open file '" << filename << "' for reading." << endl; exit (1); } // if successful, then load using other LoadMFA() routine LoadMFA (infile, stripGaps); infile.close(); } ///////////////////////////////////////////////////////////////// // MultiSequence::LoadMFA() // // Load MSF from a FileBuffer object. ///////////////////////////////////////////////////////////////// void ParseMSF (FileBuffer &infile, string header, bool stripGaps = false){ SafeVector *> seqData; SafeVector seqNames; SafeVector seqLengths; istringstream in; bool valid = true; bool missingHeader = false; bool clustalW = false; // read until data starts while (!infile.eof() && header.find ("..", 0) == string::npos){ if (header.find ("CLUSTAL", 0) == 0 || header.find ("PROBCONS", 0) == 0){ clustalW = true; break; } infile.GetLine (header); if (header.find ("//", 0) != string::npos){ missingHeader = true; break; } } // read until end-of-file while (valid){ infile.GetLine (header); if (infile.eof()) break; string word; in.clear(); in.str(header); // check if there's anything on this line if (in >> word){ // clustalw name parsing if (clustalW){ if (!isspace(header[0]) && find (seqNames.begin(), seqNames.end(), word) == seqNames.end()){ seqNames.push_back (word); seqData.push_back (new SafeVector()); seqLengths.push_back (0); seqData[(int) seqData.size() - 1]->push_back ('@'); } } // look for new sequence label if (word == string ("Name:")){ if (in >> word){ seqNames.push_back (word); seqData.push_back (new SafeVector()); seqLengths.push_back (0); seqData[(int) seqData.size() - 1]->push_back ('@'); } else valid = false; } // check if this is sequence data else if (find (seqNames.begin(), seqNames.end(), word) != seqNames.end()){ int index = find (seqNames.begin(), 
seqNames.end(), word) - seqNames.begin(); // read all remaining characters on the line char ch; while (in >> ch){ if (isspace (ch)) continue; if (ch >= 'a' && ch <= 'z') ch = ch - 'a' + 'A'; if (ch == '.') ch = '-'; if (stripGaps && ch == '-') continue; if (!((ch >= 'A' && ch <= 'Z') || ch == '*' || ch == '-')){ cerr << "ERROR: Unknown character encountered: " << ch << endl; exit (1); } // everything's ok so far, so just store this character. seqData[index]->push_back (ch); seqLengths[index]++; } } else if (missingHeader){ seqNames.push_back (word); seqData.push_back (new SafeVector()); seqLengths.push_back (0); seqData[(int) seqData.size() - 1]->push_back ('@'); int index = (int) seqNames.size() - 1; // read all remaining characters on the line char ch; while (in >> ch){ if (isspace (ch)) continue; if (ch >= 'a' && ch <= 'z') ch = ch - 'a' + 'A'; if (ch == '.') ch = '-'; if (stripGaps && ch == '-') continue; if (!((ch >= 'A' && ch <= 'Z') || ch == '*' || ch == '-')){ cerr << "ERROR: Unknown character encountered: " << ch << endl; exit (1); } // everything's ok so far, so just store this character. seqData[index]->push_back (ch); seqLengths[index]++; } } } } // check for errors if (seqNames.size() == 0){ cerr << "ERROR: No sequences read!" << endl; exit (1); } assert (!sequences); sequences = new SafeVector; for (int i = 0; i < (int) seqNames.size(); i++){ if (seqLengths[i] == 0){ cerr << "ERROR: Sequence of zero length!" << endl; exit (1); } Sequence *seq = new Sequence (seqData[i], seqNames[i], seqLengths[i], i, i); sequences->push_back (seq); } } ///////////////////////////////////////////////////////////////// // MultiSequence::LoadMFA() // // Load MFA from a FileBuffer object. ///////////////////////////////////////////////////////////////// void LoadMFA (FileBuffer &infile, bool stripGaps = false){ // check to make sure that file reading is ok if (infile.fail()){ cerr << "ERROR: Error reading file." 
<< endl; exit (1); } // read all sequences while (true){ // get the sequence label as being the current # of sequences // NOTE: sequence labels here are zero-based int index = (!sequences) ? 0 : sequences->size(); // read the sequence Sequence *seq = new Sequence (infile, stripGaps); if (seq->Fail()){ // check if alternative file format (i.e. not MFA) if (index == 0){ string header = seq->GetHeader(); if (header.length() > 0 && header[0] != '>'){ // try MSF format ParseMSF (infile, header); break; } } delete seq; break; } seq->SetLabel (index); // add the sequence to the list of current sequences if (!sequences) sequences = new SafeVector; sequences->push_back (seq); } // make sure at least one sequence was read if (!sequences){ cerr << "ERROR: No sequences read." << endl; exit (1); } } ///////////////////////////////////////////////////////////////// // MultiSequence::AddSequence() // // Add another sequence to an existing sequence list ///////////////////////////////////////////////////////////////// void AddSequence (Sequence *sequence){ assert (sequence); assert (!sequence->Fail()); // add sequence if (!sequences) sequences = new SafeVector; sequences->push_back (sequence); } ///////////////////////////////////////////////////////////////// // MultiSequence::RemoveSequence() // // Remove a sequence from the MultiSequence ///////////////////////////////////////////////////////////////// void RemoveSequence (int index){ assert (sequences); assert (index >= 0 && index < (int) sequences->size()); delete (*sequences)[index]; sequences->erase (sequences->begin() + index); } ///////////////////////////////////////////////////////////////// // MultiSequence::WriteMFA() // // Write MFA to the outfile. Allows the user to specify the // number of columns for the output. Also, useIndices determines // whether or not the actual sequence comments will be printed // out or whether the artificially assigned sequence labels will // be used instead. 
///////////////////////////////////////////////////////////////// void WriteMFA (ostream &outfile, int numColumns = 60, bool useIndices = false){ if (!sequences) return; // loop through all sequences and write them out for (SafeVector::iterator iter = sequences->begin(); iter != sequences->end(); ++iter){ (*iter)->WriteMFA (outfile, numColumns, useIndices); } } ///////////////////////////////////////////////////////////////// // MultiSequence::GetAnnotationChar() // // Return CLUSTALW annotation for column. ///////////////////////////////////////////////////////////////// char GetAnnotationChar (SafeVector &column){ SafeVector counts (256, 0); int allChars = (int) column.size(); for (int i = 0; i < allChars; i++){ counts[(unsigned char) toupper(column[i])]++; } allChars -= counts[(unsigned char) '-']; if (allChars == 1) return ' '; for (int i = 0; i < 256; i++) if ((char) i != '-' && counts[i] == allChars) return '*'; if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'T'] + counts[(unsigned char) 'A'] == allChars) return ':'; if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'E'] + counts[(unsigned char) 'Q'] + counts[(unsigned char) 'K'] == allChars) return ':'; if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'H'] + counts[(unsigned char) 'Q'] + counts[(unsigned char) 'K'] == allChars) return ':'; if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'D'] + counts[(unsigned char) 'E'] + counts[(unsigned char) 'Q'] == allChars) return ':'; if (counts[(unsigned char) 'Q'] + counts[(unsigned char) 'H'] + counts[(unsigned char) 'R'] + counts[(unsigned char) 'K'] == allChars) return ':'; if (counts[(unsigned char) 'M'] + counts[(unsigned char) 'I'] + counts[(unsigned char) 'L'] + counts[(unsigned char) 'V'] == allChars) return ':'; if (counts[(unsigned char) 'M'] + counts[(unsigned char) 'I'] + counts[(unsigned char) 'L'] + counts[(unsigned char) 'F'] == allChars) return ':'; if (counts[(unsigned char) 'H'] + counts[(unsigned char) 'Y'] == 
allChars) return ':'; if (counts[(unsigned char) 'F'] + counts[(unsigned char) 'Y'] + counts[(unsigned char) 'W'] == allChars) return ':'; if (counts[(unsigned char) 'C'] + counts[(unsigned char) 'S'] + counts[(unsigned char) 'A'] == allChars) return '.'; if (counts[(unsigned char) 'A'] + counts[(unsigned char) 'T'] + counts[(unsigned char) 'V'] == allChars) return '.'; if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'A'] + counts[(unsigned char) 'G'] == allChars) return '.'; if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'T'] + counts[(unsigned char) 'N'] + counts[(unsigned char) 'K'] == allChars) return '.'; if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'T'] + counts[(unsigned char) 'P'] + counts[(unsigned char) 'A'] == allChars) return '.'; if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'G'] + counts[(unsigned char) 'N'] + counts[(unsigned char) 'D'] == allChars) return '.'; if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'N'] + counts[(unsigned char) 'D'] + counts[(unsigned char) 'E'] + counts[(unsigned char) 'Q'] + counts[(unsigned char) 'K'] == allChars) return '.'; if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'D'] + counts[(unsigned char) 'E'] + counts[(unsigned char) 'Q'] + counts[(unsigned char) 'H'] + counts[(unsigned char) 'K'] == allChars) return '.'; if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'E'] + counts[(unsigned char) 'H'] + counts[(unsigned char) 'Q'] + counts[(unsigned char) 'R'] + counts[(unsigned char) 'K'] == allChars) return '.'; if (counts[(unsigned char) 'F'] + counts[(unsigned char) 'V'] + counts[(unsigned char) 'L'] + counts[(unsigned char) 'I'] + counts[(unsigned char) 'M'] == allChars) return '.'; if (counts[(unsigned char) 'H'] + counts[(unsigned char) 'F'] + counts[(unsigned char) 'Y'] == allChars) return '.'; return ' '; } ///////////////////////////////////////////////////////////////// // MultiSequence::WriteALN() // // Write ALN to the outfile. 
Allows the user to specify the // number of columns for the output. ///////////////////////////////////////////////////////////////// void WriteALN (ostream &outfile, int numColumns = 60){ if (!sequences) return; outfile << "Probalign version 1.4 multiple sequence alignment" << endl; int longestComment = 0; SafeVector::iterator> ptrs (GetNumSequences()); SafeVector lengths (GetNumSequences()); for (int i = 0; i < GetNumSequences(); i++){ ptrs[i] = GetSequence (i)->GetDataPtr(); lengths[i] = GetSequence (i)->GetLength(); longestComment = max (longestComment, (int) GetSequence(i)->GetName().length()); } longestComment += 4; int writtenChars = 0; bool allDone = false; while (!allDone){ outfile << endl; allDone = true; // loop through all sequences and write them out for (int i = 0; i < GetNumSequences(); i++){ if (writtenChars < lengths[i]){ outfile << GetSequence(i)->GetName(); for (int j = 0; j < longestComment - (int) GetSequence(i)->GetName().length(); j++) outfile << ' '; for (int j = 0; j < numColumns; j++){ if (writtenChars + j < lengths[i]) outfile << ptrs[i][writtenChars + j + 1]; else break; } outfile << endl; if (writtenChars + numColumns < lengths[i]) allDone = false; } } // write annotation line for (int j = 0; j < longestComment; j++) outfile << ' '; for (int j = 0; j < numColumns; j++){ SafeVector column; for (int i = 0; i < GetNumSequences(); i++) if (writtenChars + j < lengths[i]) column.push_back (ptrs[i][writtenChars + j + 1]); if (column.size() > 0) outfile << GetAnnotationChar (column); } outfile << endl; writtenChars += numColumns; } } ///////////////////////////////////////////////////////////////// // MultiSequence::GetSequence() // // Retrieve a sequence from the MultiSequence object. 
///////////////////////////////////////////////////////////////// Sequence* GetSequence (int i){ assert (sequences); assert (0 <= i && i < (int) sequences->size()); return (*sequences)[i]; } ///////////////////////////////////////////////////////////////// // MultiSequence::GetSequence() // // Retrieve a sequence from the MultiSequence object // (const version). ///////////////////////////////////////////////////////////////// const Sequence* GetSequence (int i) const { assert (sequences); assert (0 <= i && i < (int) sequences->size()); return (*sequences)[i]; } ///////////////////////////////////////////////////////////////// // MultiSequence::GetNumSequences() // // Returns the number of sequences in the MultiSequence. ///////////////////////////////////////////////////////////////// int GetNumSequences () const { if (!sequences) return 0; return (int) sequences->size(); } ///////////////////////////////////////////////////////////////// // MultiSequence::SortByHeader() // // Organizes the sequences according to their sequence headers // in ascending order. ///////////////////////////////////////////////////////////////// void SortByHeader () { assert (sequences); // a quick and easy O(n^2) sort for (int i = 0; i < (int) sequences->size()-1; i++){ for (int j = i+1; j < (int) sequences->size(); j++){ if ((*sequences)[i]->GetHeader() > (*sequences)[j]->GetHeader()) swap ((*sequences)[i], (*sequences)[j]); } } } ///////////////////////////////////////////////////////////////// // MultiSequence::SortByLabel() // // Organizes the sequences according to their sequence labels // in ascending order. 
///////////////////////////////////////////////////////////////// void SortByLabel () { assert (sequences); // a quick and easy O(n^2) sort for (int i = 0; i < (int) sequences->size()-1; i++){ for (int j = i+1; j < (int) sequences->size(); j++){ if ((*sequences)[i]->GetSortLabel() > (*sequences)[j]->GetSortLabel()) swap ((*sequences)[i], (*sequences)[j]); } } } ///////////////////////////////////////////////////////////////// // MultiSequence::SaveOrdering() // // Relabels sequences so as to preserve the current ordering. ///////////////////////////////////////////////////////////////// void SaveOrdering () { assert (sequences); for (int i = 0; i < (int) sequences->size(); i++) (*sequences)[i]->SetSortLabel (i); } ///////////////////////////////////////////////////////////////// // MultiSequence::Project() // // Given a set of indices, extract all sequences from the current // MultiSequence object whose index is included in the set. // Then, project the multiple alignments down to the desired // subset, and return the projection as a new MultiSequence // object. 
///////////////////////////////////////////////////////////////// MultiSequence *Project (const set &indices){ SafeVector::iterator> oldPtrs (indices.size()); SafeVector *> newPtrs (indices.size()); assert (indices.size() != 0); // grab old data int i = 0; for (set::const_iterator iter = indices.begin(); iter != indices.end(); ++iter){ oldPtrs[i++] = GetSequence (*iter)->GetDataPtr(); } // compute new length int oldLength = GetSequence (*indices.begin())->GetLength(); int newLength = 0; for (i = 1; i <= oldLength; i++){ // check to see if there is a gap in every sequence of the set bool found = false; for (int j = 0; !found && j < (int) indices.size(); j++) found = (oldPtrs[j][i] != '-'); // if not, then this column counts towards the sequence length if (found) newLength++; } // build new alignments for (i = 0; i < (int) indices.size(); i++){ newPtrs[i] = new SafeVector(); assert (newPtrs[i]); newPtrs[i]->push_back ('@'); } // add all needed columns for (i = 1; i <= oldLength; i++){ // make sure column is not gapped in all sequences in the set bool found = false; for (int j = 0; !found && j < (int) indices.size(); j++) found = (oldPtrs[j][i] != '-'); // if not, then add it if (found){ for (int j = 0; j < (int) indices.size(); j++) newPtrs[j]->push_back (oldPtrs[j][i]); } } // wrap sequences in MultiSequence object MultiSequence *ret = new MultiSequence(); i = 0; for (set::const_iterator iter = indices.begin(); iter != indices.end(); ++iter){ ret->AddSequence (new Sequence (newPtrs[i++], GetSequence (*iter)->GetHeader(), newLength, GetSequence (*iter)->GetSortLabel(), GetSequence (*iter)->GetLabel())); } return ret; } }; #endif probalign1.4/PostProbs.cc0000755013460500000360000004677211475726767014251 0ustar usmanafs#include "SafeVector.h" #include #include #include #include #include #include #include #include #define TRACE 0 // 0: NOTRACE 1: TRACE //proba like settings #define endgaps 1 // 1: engap penaties enabled 0: disabled #define PART_FULL_MEMORY 0 //0: LOW 
// MEM OPTION   (tail of the comment on the PART_FULL_MEMORY define above)

// Selects the memory layout used by revers_partf(): non-zero allocates
// full matrices, 0 uses two rows per matrix.
#define REVPART_FULL_MEMORY 0	//0: LOW MEM OPTION

using namespace std;

///////////////////////////////////////////////////////////////////////////////////////////////////
//
// stochastic alignment modules : PostProbs.cc
//
// Version : Nov 2010
// ---------------------------
//
// Updates:
// scoring matrix module relocated, replaced by static matrices
// Low memory integration of posterior probability module
// and reverse partition function implemented
//
// Satish Chikkagoudar, Dept of CS, NJIT 2005-06
////////////////////////////////////////////////////////////////////////////////////////////////////////////

// Run-time parameters of the partition-function module.  The single
// instance `argument` is declared extern below; ComputePostProbs()
// reads gapopen, gapext, N, matrix and beta from it.
// NOTE(review): ReadMatrix.cc repeats this typedef; the two copies
// must be kept identical.
typedef struct {
    char input[30];
    int matrix;			// matrix code; ComputePostProbs() only uses it for trace output
    int N;
    float T;
    float beta;			// passed as `beta` to partf() / revers_partf()
    char opt;			//can be 'P' or 'M'
    float gapopen;		// passed as the gap-open term `d`
    float gapext;		// passed as the gap-extension term `e`
} argument_decl;

// One input sequence as used by partf() / revers_partf().
typedef struct sequence {
    char *title;
    char *text;			// NUL-terminated residues (strlen() is applied to it)
    int length;			// ComputePostProbs() sets this to strlen(text)
} fasta;

// Not referenced in the visible part of this file.
typedef struct alignment {
    char *title;
    char *text;
    int length;
} align;

// The pair of sequences being aligned; filled by ComputePostProbs()
// ([0] from seq1, [1] from seq2).
fasta sequences[2];

// NOTE(review): proteins, dna, prob_flag and the hydro_seq pointers are
// not referenced in the visible part of this file.
char proteins[20];
int dna[4];
int prob_flag;

// End-gap open/extend terms; ComputePostProbs() sets both to 0.0 before
// calling partf() / revers_partf().
float termgapopen, termgapextend;
float *hydro_seq1, *hydro_seq2;

////////////////////////////////////////////////////////
//externs related to scoring matrix and input arguments
///////////////////////////////////////////////////////////

extern float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2;
// NOTE(review): ReadMatrix.cc declares aminos and bases as `char *`,
// not as arrays -- confirm which matches the actual definition.
extern char aminos[26], matrixtype[20], bases[26];
extern float sub_matrix[26][26];
extern float scorez_matrix[26][26];	// indexed [Si][Tj] by partf() / revers_partf()
extern int subst_index[26];		// indexed by (residue - 'A')
extern float TEMPERATURE;
extern int MATRIXTYPE;
extern float GAPOPEN;
extern float GAPEXT;
extern argument_decl argument;

//////////////////////////////////////////////////////////
//rest of the structures
//////////////////////////////////////////////////////////

// NOTE(review): none of the four variables below is referenced in the
// visible part of this file.
int number_of_sequences = 0;
int monomers = 0;
float beta_ln;
int DNA_flag = 0;		//input is (0:protein 1: DNA)

//////////////////////////////////////////////////////////////////////////////
//calculates reverse partition function values based on z matrices
//and also simultaneously calculates
the propability of each basepair //or aminoacid residue pair i,j ////////////////////////////////////////////////////////////////////////////// VF *revers_partf(long double **Zfm, float d, float e, float beta) { // printf("revpart\n"); //rest of the declarations int i, j; long double **Zm = NULL; long double **Ze = NULL; long double **Zf = NULL; int len0, len1; float probability; long double tempvar; float open0, extend0, open1, extend1; open0 = open1 = d; extend0 = extend1 = e; const long double beta_d = exp(beta * d); const long double beta_e = exp(beta * e); long double beta_open0 = exp(beta * open0); long double beta_open1 = exp(beta * open1); long double beta_extend0 = exp(beta * extend0); long double beta_extend1 = exp(beta * extend1); int Si, Tj; float endgapopen, endgapextend; FILE *fo; //Init lengths of sequences len0 = strlen(sequences[0].text); len1 = strlen(sequences[1].text); //Safe vector declared VF *posteriorPtr = new VF((len0 + 1) * (len1 + 1)); VF & posterior = *posteriorPtr; VF::iterator ptr = posterior.begin(); if (TRACE) //open the trace file fo = fopen("revpartdump", "a"); //default: endgapopen = termgapopen; endgapextend = termgapextend; const long double beta_endgapopen = exp(beta * endgapopen); const long double beta_endgapextend = exp(beta * endgapextend); //instantiate the z matrix if (REVPART_FULL_MEMORY) { Ze = new long double *[sequences[1].length + 1]; Zf = new long double *[sequences[1].length + 1]; Zm = new long double *[sequences[1].length + 1]; if (TRACE) printf("\n\n %e %e\n", d, e); //DYNAMICALLY GROW 2D Zm Zf Ze MARICES (long double) for (i = 0; i <= sequences[1].length; i++) { Ze[i] = new long double[sequences[0].length + 1]; Zf[i] = new long double[sequences[0].length + 1]; Zm[i] = new long double[sequences[0].length + 1]; } } else { Zm = new long double *[2]; Ze = new long double *[2]; Zf = new long double *[2]; for (i = 0; i <= 1; i++) { Zm[i] = new long double[sequences[0].length + 1]; Ze[i] = new long 
double[sequences[0].length + 1]; Zf[i] = new long double[sequences[0].length + 1]; } } if (TRACE) { printf("in rev partf---"); printf("\n\n"); } if (REVPART_FULL_MEMORY) { for (i = 0; i <= len1; i++) for (j = 0; j <= len0; j++) { Zm[i][j] = 0.0; Zf[i][j] = 0.0; Ze[i][j] = 0.0; } } else { for (j = 0; j <= len0; j++) { Zm[0][j] = 0; Zf[0][j] = 0; Ze[0][j] = 0; Zf[1][j] = 0; Ze[1][j] = 0; Zm[1][j] = 0; } } //fill the probability matrix with 0s for (i = 0; i <= len1; i++) for (j = 0; j <= len0; j++) ptr[j * (len1 + 1) + i] = 0; if (endgaps == 0) { Zm[len1][len0] = 1; Ze[len1][len0] = Zf[len1][len0] = 0; Zf[len1 - 1][len0] = Zm[len1][len0] * beta_d; Ze[len1][len0 - 1] = Zm[len1][len0] * beta_d; //>=2ND ROW INIT if (REVPART_FULL_MEMORY) { for (i = len1 - 2; i >= 0; i--) { Zf[i][len0] = Zf[i + 1][len0] * beta_e; } } //>=2ND COL INIT if (REVPART_FULL_MEMORY) { for (j = len0 - 2; j >= 0; j--) { Ze[len1][j] = Ze[len1][j + 1] * beta_e; } } else { for (j = len0 - 2; j >= 0; j--) { Ze[0][j] = Ze[0][j + 1] * beta_e; } } } else { if (REVPART_FULL_MEMORY) { Zm[len1][len0] = 1; Ze[len1][len0] = Zf[len1][len0] = 0; Zf[len1 - 1][len0] = Zm[len1][len0] * beta_endgapopen; Ze[len1][len0 - 1] = Zm[len1][len0] * beta_endgapopen; //>=2ND ROW INIT for (i = len1 - 2; i >= 0; i--) { Zf[i][len0] = Zf[i + 1][len0] * beta_endgapextend; } //M Iy= d+j*e //>=2ND COL INIT for (j = len0 - 2; j >= 0; j--) { Ze[len1][j] = Ze[len1][j + 1] * beta_endgapextend; } } else { //in Zm //let: // Zm(0) be the current row being filled/computed // Zm(1) be the previous row Zm[1][len0] = 1; Ze[0][len0] = Zf[0][len0] = 0; Zf[1][len0] = Zm[1][len0] * beta_endgapopen; Ze[0][len0 - 1] = Zm[1][len0] * beta_endgapopen; //>=2ND COL INIT for (j = len0 - 2; j >= 0; j--) { Ze[0][j] = Ze[0][j + 1] * beta_endgapextend; } } //END ELSE } //END FULL MEMORY and GAP enablement IF STATEMENT long double zz = 0; long double beta_scorez; for (i = len1 - 1; i >= 0; i--) { for (j = len0 - 1; j >= 0; j--) { Si = 
subst_index[sequences[1].text[i] - 'A']; Tj = subst_index[sequences[0].text[j] - 'A']; // scorez = sub_matrix[Si][Tj]; beta_scorez = scorez_matrix[Si][Tj]; //endgaps modification aug 10 // float open0, extend0, open1, extend1; // open0 = open1 = d; // extend0 = extend1 = e; beta_open0 = beta_d; beta_extend0 = beta_e; beta_open1 = beta_d; beta_extend1 = beta_e; if (endgaps == 1) { //check to see if one of the 2 sequences or both reach the end if (i == 0) { // open0 = endgapopen; // extend0 = endgapextend; beta_open0 = beta_endgapopen; beta_extend0 = beta_endgapextend; } if (j == 0) { // open1 = endgapopen; // extend1 = endgapextend; beta_open1 = beta_endgapopen; beta_extend1 = beta_endgapextend; } } if (REVPART_FULL_MEMORY) { //z computation Ze[i][j] = Zm[i][j + 1] * beta_open0 + Ze[i][j + 1] * beta_extend0; Zf[i][j] = Zm[i + 1][j] * beta_open1 + Zf[i + 1][j] * beta_extend1; Zm[i][j] = (Zm[i + 1][j + 1] + Zf[i + 1][j + 1] + Ze[i + 1][j + 1]) * beta_scorez; zz = Zm[i][j] + Zf[i][j] + Ze[i][j]; } else { //2 ROW zE zF ALGORITHM GOES...: //Ze[1][j] =Zm[i][j + 1] * exp(beta * open0) + Ze[1][j + 1] *exp(beta * extend0); //Zf[1][j] = Zm[i + 1][j] * exp(beta * open1) + Zf[0][j] * exp(beta * extend1); //Zm[i][j] = (Zm[i + 1][j + 1] + Zf[0][j + 1] + Ze[0][j + 1]) * exp(beta * scorez); //zz = Zm[0][j] + Zf[1][j] + Ze[1][j]; //lowmem code for merging probability calculating module //Here we make use of Zm as a 2 row matrix Zf[1][j] = Zm[1][j] * beta_open1 + Zf[0][j] * beta_extend1; Ze[1][j] = Zm[0][j + 1] * beta_open0 + Ze[1][j + 1] * beta_extend0; Zm[0][j] = (Zm[1][j + 1] + Zf[0][j + 1] + Ze[0][j + 1]) * beta_scorez; tempvar = Zfm[i + 1][j + 1] * Zm[0][j]; //divide P(i,j) i.e. pairwise probability by denominator tempvar /= (beta_scorez * Zfm[0][0]); probability = (float) tempvar; //store only noticable probabilities //Usman Dec 1st 2010: Not doing the above anymore. Store all probabilities. { //algorithm goes... 
//validprob[i + 1][j + 1] = probability; ptr[(j + 1) * (len1 + 1) + (i + 1)] = probability; } //lowmem code ends here } } //end of for if (REVPART_FULL_MEMORY == 0) { for (int t = 0; t <= sequences[0].length; t++) { Ze[0][t] = Ze[1][t]; Ze[1][t] = 0; Zf[0][t] = Zf[1][t]; Zf[1][t] = 0; Zm[1][t] = Zm[0][t]; Zm[0][t] = 0; } Zf[0][len0] = 1; } } //end of for if(TRACE) { printf("\n\nrM:....\n\n"); if (REVPART_FULL_MEMORY) { for (i = 0; i <= len1; i++) { for (j = 0; j <= len0; j++) printf("%.2Le ", Zm[i][j]); printf("\n"); } printf("\n\nrE:....\n\n"); for (i = 0; i <= len1; i++) { for (j = 0; j <= len0; j++) printf("%.2Le ", Ze[i][j]); printf("\n"); } printf("\n\nrF:....\n\n"); for (i = 0; i <= len1; i++) { for (j = 0; j <= len0; j++) printf("%.2Le ", Zf[i][j]); printf("\n"); } } } if (TRACE) { fprintf(fo, "\n"); fclose(fo); } //delete unused memory if (REVPART_FULL_MEMORY) { for (i = 0; i <= len1; i++) { delete(Zm[i]); delete(Zf[i]); delete(Ze[i]); } } else { delete(Zf[0]); delete(Ze[0]); delete(Zm[0]); delete(Zm[1]); delete(Zf[1]); delete(Ze[1]); } for (i = 0; i <= len1; i++) { delete(Zfm[i]); } if (Zf != NULL) delete(Zf); if (Ze != NULL) delete(Ze); if (Zm != NULL) delete(Zm); if (Zfm != NULL) delete(Zfm); posterior[0] = 0; return (posteriorPtr); } ////////////////////////////////////////////////////////////// //forward partition function ///////////////////////////////////////////////////////////// long double **partf(float d, float e, float beta) { //printf("partf\n"); int i, j, len1, len0; long double **Zm = NULL, **Zf = NULL, **Ze = NULL, zz = 0; float endgapopen, endgapextend; //default: endgapopen = termgapopen; endgapextend = termgapextend; float open0, extend0, open1, extend1; open0 = open1 = d; extend0 = extend1 = e; const long double beta_d = exp(beta * d); const long double beta_e = exp(beta * e); long double beta_open0 = exp(beta * open0); long double beta_open1 = exp(beta * open1); long double beta_extend0 = exp(beta * extend0); long double beta_extend1 = 
exp(beta * extend1); const long double beta_endgapopen = exp(beta * endgapopen); const long double beta_endgapextend = exp(beta * endgapextend); //the flag endgaps is set at the #define section if (PART_FULL_MEMORY) { Zf = new long double *[sequences[1].length + 1]; Ze = new long double *[sequences[1].length + 1]; Zm = new long double *[sequences[1].length + 1]; //comment if (TRACE) printf("\nPARTF:====\n"); //DYNAMICALLY GROW 2D M,IX,IY,PIX,PIY MARICES for (i = 0; i <= sequences[1].length; i++) { Zf[i] = new long double[sequences[0].length + 1]; Ze[i] = new long double[sequences[0].length + 1]; Zm[i] = new long double[sequences[0].length + 1]; } } else { Zm = new long double *[sequences[1].length + 1]; Ze = new long double *[2]; Zf = new long double *[2]; for (i = 0; i <= sequences[1].length; i++) { Zm[i] = new long double[sequences[0].length + 1]; } Ze[0] = new long double[sequences[0].length + 1]; Zf[0] = new long double[sequences[0].length + 1]; Ze[1] = new long double[sequences[0].length + 1]; Zf[1] = new long double[sequences[0].length + 1]; } len0 = strlen(sequences[0].text); len1 = strlen(sequences[1].text); if (PART_FULL_MEMORY) { for (i = 0; i <= sequences[1].length; i++) for (j = 0; j <= sequences[0].length; j++) { Zm[i][j] = 0.00; Zf[i][j] = 0.00; Ze[i][j] = 0.00; } } else { for (i = 0; i <= len1; i++) { for (j = 0; j <= len0; j++) { Zm[i][j] = 0; } } for (j = 0; j <= len0; j++) { Zf[0][j] = 0; Ze[0][j] = 0; Zf[1][j] = 0; Ze[1][j] = 0; } } //INTITIALIZE THE DP if (endgaps == 0) { Zm[0][0] = 1.00; Zf[0][0] = Ze[0][0] = 0; Zf[1][0] = Zm[0][0] * beta_d; Ze[0][1] = Zm[0][0] * beta_d; //>=2ND ROW INIT if (PART_FULL_MEMORY) { for (i = 2; i <= sequences[1].length; i++) { Zf[i][0] = Zf[i - 1][0] * beta_e; } } //>=2ND COL INIT for (j = 2; j <= sequences[0].length; j++) { Ze[0][j] = Ze[0][j - 1] * beta_e; } } else { //init z Zm[0][0] = 1.00; Zf[0][0] = Ze[0][0] = 0; Zf[1][0] = Zm[0][0] * beta_endgapopen; Ze[0][1] = Zm[0][0] * beta_endgapopen; //>=2ND ROW INIT if 
(PART_FULL_MEMORY) { for (i = 2; i <= sequences[1].length; i++) { Zf[i][0] = Zf[i - 1][0] * beta_endgapextend; } } //>=2ND COL INIT for (j = 2; j <= sequences[0].length; j++) { Ze[0][j] = Ze[0][j - 1] * beta_endgapextend; } } //1ST ROW/COL INIT int Si, Tj; long double beta_score; for (i = 1; i <= sequences[1].length; i++) { for (j = 1; j <= sequences[0].length; j++) { Si = subst_index[sequences[1].text[i - 1] - 'A']; Tj = subst_index[sequences[0].text[j - 1] - 'A']; // score = sub_matrix[Si][Tj]; beta_score = scorez_matrix[Si][Tj]; beta_open0 = beta_d; beta_extend0 = beta_e; beta_open1 = beta_d; beta_extend1 = beta_e; if (endgaps == 1) { //check to see if one of the 2 sequences or both reach the end if (i == sequences[1].length) { // open0 = endgapopen; // extend0 = endgapextend; beta_open0 = beta_endgapopen; beta_extend0 = beta_endgapextend; } if (j == sequences[0].length) { // open1 = endgapopen; // extend1 = endgapextend; beta_open1 = beta_endgapopen; beta_extend1 = beta_endgapextend; } } // //z computation using open and extend temp vars //open0 is gap open in seq0 and open1 is gap open in seq1 //entend0 is gap extend in seq0 and extend1 is gap extend in seq1 if (PART_FULL_MEMORY) { Ze[i][j] = Zm[i][j - 1] * beta_open0 + Ze[i][j - 1] * beta_extend0; if (Ze[i][j] >= HUGE_VALL) { printf("ERROR: huge val error for Ze\n"); exit(1); } Zf[i][j] = Zm[i - 1][j] * beta_open1 + Zf[i - 1][j] * beta_extend1; if (Zf[i][j] >= HUGE_VALL) { printf("ERROR: huge val error for Zf\n"); exit(1); } Zm[i][j] = (Zm[i - 1][j - 1] + Ze[i - 1][j - 1] + Zf[i - 1][j - 1]) * beta_score; if (Zm[i][j] >= HUGE_VALL) { printf("ERROR: huge val error for Zm\n"); exit(1); } zz = Zm[i][j] + Ze[i][j] + Zf[i][j]; } else { Ze[1][j] = Zm[i][j - 1] * beta_open0 + Ze[1][j - 1] * beta_extend0; if (Ze[1][j] >= HUGE_VALL) { printf("ERROR: huge val error for zE\n"); exit(1); } Zf[1][j] = Zm[i - 1][j] * beta_open1 + Zf[0][j] * beta_extend1; if (Zf[1][j] >= HUGE_VALL) { printf("ERROR: huge val error for 
zF\n"); exit(1); } Zm[i][j] = (Zm[i - 1][j - 1] + Ze[0][j - 1] + Zf[0][j - 1]) * beta_score; if (Zm[i][j] >= HUGE_VALL) { printf("ERROR: huge val error for zM\n"); exit(1); } zz = Zm[i][j] + Ze[1][j] + Zf[1][j]; } } //end for if (!PART_FULL_MEMORY) { for (int t = 0; t <= sequences[0].length; t++) { Ze[0][t] = Ze[1][t]; Ze[1][t] = 0; Zf[0][t] = Zf[1][t]; Zf[1][t] = 0; } Zf[1][0] = 1; } } //end for //store the sum of zm zf ze (m,n)s in zm's 0,0 th position Zm[0][0] = zz; if (TRACE) { //debug code aug 3 //print the 3 Z matrices namely Zm Zf and Ze printf("\n\nFINAL Zm:\n"); for (i = 0; i <= sequences[1].length; i++) { for (j = 0; j <= sequences[0].length; j++) printf("%.2Le ", Zm[i][j]); printf("\n"); } printf("FINAL Zf \n"); for (i = 0; i <= sequences[1].length; i++) { for (j = 0; j <= sequences[0].length; j++) printf("%.2Le ", Zf[i][j]); printf("\n"); } printf("FINAL Ze \n"); for (i = 0; i <= sequences[1].length; i++) { for (j = 0; j <= sequences[0].length; j++) printf("%.2Le ", Ze[i][j]); printf("\n"); } //end debug dump code } if (PART_FULL_MEMORY) { for (i = 0; i <= sequences[1].length; i++) { delete(Zf[i]); delete(Ze[i]); } } else { delete(Zf[0]); delete(Ze[0]); delete(Zf[1]); delete(Ze[1]); } delete(Zf); delete(Ze); return Zm; } //end of forward partition function ///////////////////////////////////////////////////////////////////////////////////////// //entry point (was the main function) , returns the posterior probability safe vector //////////////////////////////////////////////////////////////////////////////////////// VF *ComputePostProbs(int a, int b, string seq1, string seq2) { //printf("probamod\n"); float gap_open = -22, gap_ext = -1, beta = .2; int stock_loop = 1; int le = 160; //init and parse the arguments termgapopen = 0.0; termgapextend = 0.0; sequences[0].length = strlen((char *) seq1.c_str()); sequences[0].text = (char *) seq1.c_str(); sequences[0].title = new char[10]; strcpy(sequences[0].title, "seq0"); sequences[1].length = strlen((char *) 
seq2.c_str()); sequences[1].text = (char *) seq2.c_str(); sequences[1].title = new char[10]; strcpy(sequences[1].title, "seq1"); if (TRACE) { printf("%d %d %s\n%d %d %s\n--\n", a, sequences[0].length, sequences[0].text, b, sequences[1].length, sequences[1].text); printf("after init\n"); FILE *dump1 = fopen("dump1", "a"); fprintf(dump1, "%d %d %s\n%d %d %s\n--\n", a, sequences[0].length, sequences[0].text, b, sequences[1].length, sequences[1].text); fclose(dump1); } gap_open = argument.gapopen; gap_ext = argument.gapext; stock_loop = argument.N; le = argument.matrix; beta = argument.beta; if (TRACE) printf("%f %f %f %d\n", gap_open, gap_ext, beta, le); //call for calculating the posterior probabilities // 1. call partition function partf // 2. calculate revpartition using revers_parf // 3. calculate probabilities /// MODIFICATION... POPULATE SAFE VECTOR long double **MAT1; MAT1 = partf(gap_open, gap_ext, beta); return revers_partf(MAT1, gap_open, gap_ext, beta); } //end of posterior probability module probalign1.4/ReadMatrix.cc0000755013460500000360000001051211467040512014307 0ustar usmanafs///////////////////////////////////////////////////////////////// // ReadMatrix.cc // // routines for reading substitution matrix ///////////////////////////////////////////////////////////////// #include "SafeVector.h" #include #include #include #include #include "Matrix.h" #define TRACE 0 //////////////////////////////////////////////////////////// // extern variables for scoring matrix data //////////////////////////////////////////////////////////// extern float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2; extern char *aminos, *bases, matrixtype[20]; extern int subst_index[26]; extern float sub_matrix[26][26]; extern float scorez_matrix[26][26]; extern float TEMPERATURE; extern int MATRIXTYPE; extern float GAPOPEN; extern float GAPEXT; typedef struct { char input[30]; int matrix; int N; float T; float beta; char opt; //can be 'P' or 'M' float gapopen; float gapext; } 
argument_decl; //argument support extern argument_decl argument; ///////////////////////////////////////////////////////// //sets substitution matrix type //////////////////////////////////////////////////////// void setmatrixtype(int le) { switch (le) { case 160: strcpy(matrixtype, "gonnet_160"); break; case 4: strcpy(matrixtype, "nuc_simple"); break; default: strcpy(matrixtype, "CUSTOM"); break; }; } /////////////////////////////////////////////////////////////////// //sets matrix flag /////////////////////////////////////////////////////////////////// inline int matrixtype_to_int() { if (!strcmp(matrixtype, "nuc_simple")) return 4; else if (!strcmp(matrixtype, "gonnet_160")) return 160; else return 1000; } ///////////////////////////////////////////////////////////////// // // Can read any scoring matrix as long as it is defined in Matrix.h // AND it is a lower triangular // AND the order of amino acids/bases is mentioned ///////////////////////////////////////////////////////////////// inline void read_matrix(score_matrix matrx) { int i, j, basecount,position=0; bases=(char *)matrx.monomers; basecount = strlen(bases); for (i = 0; i < basecount; i++) subst_index[i] = -1; for (i = 0; i < basecount; i++) subst_index[bases[i] - 'A'] = i; if (TRACE == 1) printf("\nbases read: %d\n", basecount); for (i = 0; i < basecount; i++) for (j = 0; j <= i; j++) { sub_matrix[i][j]=matrx.matrix[position++]; sub_matrix[j][i] = sub_matrix[i][j]; scorez_matrix[i][j]= exp(sub_matrix[i][j]*argument.beta); scorez_matrix[j][i] = exp(sub_matrix[j][i]*argument.beta); } if (TRACE) for (i = 0; i < basecount; i++) { for (j = 0; j < basecount; j++) printf(" %f ", sub_matrix[i][j]); printf("\n"); } } ////////////////////////////////////////////////////////////////////////////////// //intialize the arguments (default values) ////////////////////////////////////////////////////////////////////////////////// void init_arguments() { float gap_open = 0, gap_ext = 0; int le; le = 
matrixtype_to_int(); argument.N = 1; strcpy(argument.input, "tempin"); argument.matrix = le; argument.gapopen = GAPOPEN; argument.gapext = GAPEXT; argument.T = TEMPERATURE; argument.beta = 1.0 / TEMPERATURE; argument.opt = 'P'; if (le == 4) //NUC OPTION :default is nuc_simple { read_matrix(nuc_simple); gap_open = -4; gap_ext = -0.25; } else if (le == 160) //PROT option: default is gonnet_160 { if (TRACE) printf("read matrix\n"); read_matrix(gonnet_160); gap_open = -22; gap_ext = -1; } else if (le == 1000) //Error handling { printf("Error: enter a valid matrix type\n"); exit(1); //additional matrices can only be lower triangular } //now override the gapopen and gapext if (argument.gapopen != 0.0 || argument.gapext != 0.00) { gap_open = -argument.gapopen; gap_ext = -argument.gapext; } if (TRACE) printf("%f %f %f %d\n",argument.T, gap_open, gap_ext, le); argument.gapopen = gap_open; argument.gapext = gap_ext; argument.opt = 'P'; } //end of init ///////////////////////////////////////////////////////////////////////////////// //END OF MATRIX MODULE /////////////////////////////////////////////////////////////////////////////// probalign1.4/SafeVector.h0000755013460500000360000000276010532124320014150 0ustar usmanafs///////////////////////////////////////////////////////////////// // SafeVector.h // // STL vector with array bounds checking. To enable bounds // checking, #define ENABLE_CHECKS. ///////////////////////////////////////////////////////////////// #ifndef SAFEVECTOR_H #define SAFEVECTOR_H #include #include ///////////////////////////////////////////////////////////////// // SafeVector // // Class derived from the STL std::vector for bounds checking. 
/////////////////////////////////////////////////////////////////
// SafeVector
//
// Class derived from the STL std::vector for bounds checking.
// Without ENABLE_CHECKS it behaves exactly like std::vector.  With
// ENABLE_CHECKS defined, operator[] takes an int and asserts that
// the index lies in [0, size()).
/////////////////////////////////////////////////////////////////

template<class TYPE>
class SafeVector : public std::vector<TYPE>{
 public:

  // miscellaneous constructors
  SafeVector() : std::vector<TYPE>() {}
  SafeVector (size_t size) : std::vector<TYPE>(size) {}
  SafeVector (size_t size, const TYPE &value) : std::vector<TYPE>(size, value) {}
  SafeVector (const SafeVector &source) : std::vector<TYPE>(source) {}

#ifdef ENABLE_CHECKS

  // size() lives in a dependent base class, so it has to be reached
  // through this-> ; an unqualified call is rejected by conforming
  // compilers as soon as ENABLE_CHECKS is switched on.

  // [] array bounds checking
  TYPE &operator[](int index){
    assert (index >= 0 && index < (int) this->size());
    return std::vector<TYPE>::operator[] ((size_t) index);
  }

  // [] const array bounds checking
  const TYPE &operator[] (int index) const {
    assert (index >= 0 && index < (int) this->size());
    return std::vector<TYPE>::operator[] ((size_t) index);
  }

#endif

};

// some commonly used vector types
typedef SafeVector<int> VI;
typedef SafeVector<VI> VVI;
typedef SafeVector<VVI> VVVI;
typedef SafeVector<float> VF;
typedef SafeVector<VF> VVF;
typedef SafeVector<VVF> VVVF;
///////////////////////////////////////////////////////////////// class Sequence { bool isValid; // a boolean indicating whether the sequence data is valid or not string header; // string containing the comment line of the FASTA file SafeVector *data; // pointer to character data int length; // length of the sequence int sequenceLabel; // integer sequence label, typically to indicate the ordering of sequences // in a Multi-FASTA file int inputLabel; // position of sequence in original input ///////////////////////////////////////////////////////////////// // Sequence::Sequence() // // Default constructor. Does nothing. ///////////////////////////////////////////////////////////////// Sequence () : isValid (false), header (""), data (NULL), length (0), sequenceLabel (0), inputLabel (0) {} public: ///////////////////////////////////////////////////////////////// // Sequence::Sequence() // // Constructor. Reads the sequence from a FileBuffer. ///////////////////////////////////////////////////////////////// Sequence (FileBuffer &infile, bool stripGaps = false) : isValid (false), header ("~"), data (NULL), length(0), sequenceLabel (0), inputLabel (0) { // read until the first non-blank line while (!infile.eof()){ infile.GetLine (header); if (header.length() != 0) break; } // check to make sure that it is a correct header line if (header[0] == '>'){ // if so, remove the leading ">" header = header.substr (1); // remove any leading or trailing white space in the header comment while (header.length() > 0 && isspace (header[0])) header = header.substr (1); while (header.length() > 0 && isspace (header[header.length() - 1])) header = header.substr(0, header.length() - 1); // get ready to read the data[] array; note that data[0] is always '@' char ch; data = new SafeVector; assert (data); data->push_back ('@'); // get a character from the file while (infile.Get(ch)){ // if we've reached a new comment line, put the character back and stop if (ch == '>'){ infile.UnGet(); 
break; } // skip whitespace if (isspace (ch)) continue; // substitute gap character if (ch == '.') ch = '-'; if (stripGaps && ch == '-') continue; // check for known characters if (!((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '*' || ch == '-')){ cerr << "ERROR: Unknown character encountered: " << ch << endl; exit (1); } // everything's ok so far, so just store this character. data->push_back(ch); ++length; } // sequence must contain data in order to be valid isValid = length > 0; if (!isValid){ delete data; data = NULL; } } } ///////////////////////////////////////////////////////////////// // Sequence::Sequence() // // Constructor. Builds a sequence from existing data. Note // that the data must use one-based indexing where data[0] should // be set to '@'. ///////////////////////////////////////////////////////////////// Sequence (SafeVector *data, string header, int length, int sequenceLabel, int inputLabel) : isValid (data != NULL), header(header), data(data), length (length), sequenceLabel (sequenceLabel), inputLabel (inputLabel) { assert (data); assert ((*data)[0] == '@'); } ///////////////////////////////////////////////////////////////// // Sequence::Sequence() // // Destructor. Release allocated memory. ///////////////////////////////////////////////////////////////// ~Sequence (){ if (data){ assert (isValid); delete data; data = NULL; isValid = false; } } ///////////////////////////////////////////////////////////////// // Sequence::GetHeader() // // Return the string comment associated with this sequence. ///////////////////////////////////////////////////////////////// string GetHeader () const { return header; } ///////////////////////////////////////////////////////////////// // Sequence::GetName() // // Return the first word of the string comment associated with this sequence. 
///////////////////////////////////////////////////////////////// string GetName () const { char name[1024]; sscanf (header.c_str(), "%s", name); return string(name); } ///////////////////////////////////////////////////////////////// // Sequence::GetDataPtr() // // Return the iterator to data associated with this sequence. ///////////////////////////////////////////////////////////////// SafeVector::iterator GetDataPtr(){ assert (isValid); assert (data); return data->begin(); } ///////////////////////////////////////////////////////////////// // Sequence::GetPosition() // // Return the character at position i. Recall that the character // data is stored with one-based indexing. ///////////////////////////////////////////////////////////////// char GetPosition (int i) const { assert (isValid); assert (data); assert (i >= 1 && i <= length); return (*data)[i]; } ///////////////////////////////////////////////////////////////// // Sequence::SetLabel() // // Sets the sequence label to i. ///////////////////////////////////////////////////////////////// void SetLabel (int i){ assert (isValid); sequenceLabel = i; inputLabel = i; } ///////////////////////////////////////////////////////////////// // Sequence::SetSortLabel() // // Sets the sequence sorting label to i. ///////////////////////////////////////////////////////////////// void SetSortLabel (int i){ assert (isValid); sequenceLabel = i; } ///////////////////////////////////////////////////////////////// // Sequence::GetLabel() // // Retrieves the input label. ///////////////////////////////////////////////////////////////// int GetLabel () const { assert (isValid); return inputLabel; } ///////////////////////////////////////////////////////////////// // Sequence::GetSortLabel() // // Retrieves the sorting label. 
///////////////////////////////////////////////////////////////// int GetSortLabel () const { assert (isValid); return sequenceLabel; } ///////////////////////////////////////////////////////////////// // Sequence::Fail() // // Checks to see if the sequence successfully loaded. ///////////////////////////////////////////////////////////////// bool Fail () const { return !isValid; } ///////////////////////////////////////////////////////////////// // Sequence::Length() // // Returns the length of the sequence. ///////////////////////////////////////////////////////////////// int GetLength () const { assert (isValid); assert (data); return length; } ///////////////////////////////////////////////////////////////// // Sequence::WriteMFA() // // Writes the sequence to outfile in MFA format. Uses numColumns // columns per line. If useIndex is set to false, then the // header is printed as normal, but if useIndex is true, then // ">S###" is printed where ### represents the sequence label. ///////////////////////////////////////////////////////////////// void WriteMFA (ostream &outfile, int numColumns, bool useIndex = false) const { assert (isValid); assert (data); assert (!outfile.fail()); // print out heading if (useIndex) outfile << ">S" << GetLabel() << endl; else outfile << ">" << header << endl; // print out character data int ct = 1; for (; ct <= length; ct++){ outfile << (*data)[ct]; if (ct % numColumns == 0) outfile << endl; } if ((ct-1) % numColumns != 0) outfile << endl; } ///////////////////////////////////////////////////////////////// // Sequence::Clone() // // Returns a new deep copy of the seqeuence. 
///////////////////////////////////////////////////////////////// Sequence *Clone () const { Sequence *ret = new Sequence(); assert (ret); ret->isValid = isValid; ret->header = header; ret->data = new SafeVector; assert (ret->data); *(ret->data) = *data; ret->length = length; ret->sequenceLabel = sequenceLabel; ret->inputLabel = inputLabel; return ret; } ///////////////////////////////////////////////////////////////// // Sequence::GetRange() // // Returns a new sequence object consisting of a range of // characters from the current seuquence. ///////////////////////////////////////////////////////////////// Sequence *GetRange (int start, int end) const { Sequence *ret = new Sequence(); assert (ret); assert (start >= 1 && start <= length); assert (end >= 1 && end <= length); assert (start <= end); ret->isValid = isValid; ret->header = header; ret->data = new SafeVector; assert (ret->data); ret->data->push_back ('@'); for (int i = start; i <= end; i++) ret->data->push_back ((*data)[i]); ret->length = end - start + 1; ret->sequenceLabel = sequenceLabel; ret->inputLabel = inputLabel; return ret; } ///////////////////////////////////////////////////////////////// // Sequence::AddGaps() // // Given an SafeVector containing the skeleton for an // alignment and the identity of the current character, this // routine will create a new sequence with all necesssary gaps added. 
// For instance, // alignment = "XXXBBYYYBBYYXX" // id = 'X' // will perform the transformation // "ATGCAGTCA" --> "ATGCC---GT--CA" // (XXXBBYYYBBYYXX) ///////////////////////////////////////////////////////////////// Sequence *AddGaps (SafeVector *alignment, char id){ Sequence *ret = new Sequence(); assert (ret); ret->isValid = isValid; ret->header = header; ret->data = new SafeVector; assert (ret->data); ret->length = (int) alignment->size(); ret->sequenceLabel = sequenceLabel; ret->inputLabel = inputLabel; ret->data->push_back ('@'); SafeVector::iterator dataIter = data->begin() + 1; for (SafeVector::iterator iter = alignment->begin(); iter != alignment->end(); ++iter){ if (*iter == 'B' || *iter == id){ ret->data->push_back (*dataIter); ++dataIter; } else ret->data->push_back ('-'); } return ret; } ///////////////////////////////////////////////////////////////// // Sequence::GetString() // // Returns the sequence as a string with gaps removed. ///////////////////////////////////////////////////////////////// string GetString (){ string s = ""; for (int i = 1; i <= length; i++){ if ((*data)[i] != '-') s += (*data)[i]; } return s; } ///////////////////////////////////////////////////////////////// // Sequence::GetMapping() // // Returns a SafeVector containing the indices of every // character in the sequence. For instance, if the data is // "ATGCC---GT--CA", the method returns {1,2,3,4,5,9,10,13,14}. ///////////////////////////////////////////////////////////////// SafeVector *GetMapping () const { SafeVector *ret = new SafeVector(1, 0); for (int i = 1; i <= length; i++){ if ((*data)[i] != '-') ret->push_back (i); } return ret; } ///////////////////////////////////////////////////////////////// // Sequence::Highlight() // // Changes all positions with score >= cutoff to upper case and // all positions with score < cutoff to lower case. 
///////////////////////////////////////////////////////////////// void Highlight (const SafeVector &scores, const float cutoff){ for (int i = 1; i <= length; i++){ if (scores[i-1] >= cutoff) (*data)[i] = toupper ((*data)[i]); else (*data)[i] = tolower ((*data)[i]); } } }; #endif probalign1.4/SparseMatrix.h0000755013460500000360000001771610532124321014541 0ustar usmanafs///////////////////////////////////////////////////////////////// // SparseMatrix.h // // Sparse matrix computations ///////////////////////////////////////////////////////////////// #ifndef SPARSEMATRIX_H #define SPARSEMATRIX_H #include using namespace std; const float POSTERIOR_CUTOFF = 0.01; // minimum posterior probability // value that is maintained in the // sparse matrix representation typedef pair PIF; // Sparse matrix entry type // first --> column // second --> value ///////////////////////////////////////////////////////////////// // SparseMatrix // // Class for sparse matrix computations ///////////////////////////////////////////////////////////////// class SparseMatrix { int seq1Length, seq2Length; // dimensions of matrix VI rowSize; // rowSize[i] = # of cells in row i SafeVector data; // data values SafeVector::iterator> rowPtrs; // pointers to the beginning of each row ///////////////////////////////////////////////////////////////// // SparseMatrix::SparseMatrix() // // Private constructor. ///////////////////////////////////////////////////////////////// SparseMatrix (){} public: ///////////////////////////////////////////////////////////////// // SparseMatrix::SparseMatrix() // // Constructor. Builds a sparse matrix from a posterior matrix. // Note that the expected format for the posterior matrix is as // a (seq1Length+1) x (seq2Length+1) matrix where the 0th row // and 0th column are ignored (they should contain all zeroes). 
///////////////////////////////////////////////////////////////// SparseMatrix (int seq1Length, int seq2Length, const VF &posterior) : seq1Length (seq1Length), seq2Length (seq2Length) { int numCells = 0; assert (seq1Length > 0); assert (seq2Length > 0); // calculate memory required; count the number of cells in the // posterior matrix above the threshold VF::const_iterator postPtr = posterior.begin(); for (int i = 0; i <= seq1Length; i++){ for (int j = 0; j <= seq2Length; j++){ if (*(postPtr++) >= POSTERIOR_CUTOFF){ assert (i != 0 && j != 0); numCells++; } } } // allocate memory data.resize(numCells); rowSize.resize (seq1Length + 1); rowSize[0] = -1; rowPtrs.resize (seq1Length + 1); rowPtrs[0] = data.end(); // build sparse matrix postPtr = posterior.begin() + seq2Length + 1; // note that we're skipping the first row here SafeVector::iterator dataPtr = data.begin(); for (int i = 1; i <= seq1Length; i++){ postPtr++; // and skipping the first column of each row rowPtrs[i] = dataPtr; for (int j = 1; j <= seq2Length; j++){ if (*postPtr >= POSTERIOR_CUTOFF){ dataPtr->first = j; dataPtr->second = *postPtr; dataPtr++; } postPtr++; } rowSize[i] = dataPtr - rowPtrs[i]; } } ///////////////////////////////////////////////////////////////// // SparseMatrix::GetRowPtr() // // Returns the pointer to a particular row in the sparse matrix. ///////////////////////////////////////////////////////////////// SafeVector::iterator GetRowPtr (int row) const { assert (row >= 1 && row <= seq1Length); return rowPtrs[row]; } ///////////////////////////////////////////////////////////////// // SparseMatrix::GetValue() // // Returns value at a particular row, column. 
///////////////////////////////////////////////////////////////// float GetValue (int row, int col){ assert (row >= 1 && row <= seq1Length); assert (col >= 1 && col <= seq2Length); for (int i = 0; i < rowSize[row]; i++){ if (rowPtrs[row][i].first == col) return rowPtrs[row][i].second; } return 0; } ///////////////////////////////////////////////////////////////// // SparseMatrix::GetRowSize() // // Returns the number of entries in a particular row. ///////////////////////////////////////////////////////////////// int GetRowSize (int row) const { assert (row >= 1 && row <= seq1Length); return rowSize[row]; } ///////////////////////////////////////////////////////////////// // SparseMatrix::GetSeq1Length() // // Returns the first dimension of the matrix. ///////////////////////////////////////////////////////////////// int GetSeq1Length () const { return seq1Length; } ///////////////////////////////////////////////////////////////// // SparseMatrix::GetSeq2Length() // // Returns the second dimension of the matrix. ///////////////////////////////////////////////////////////////// int GetSeq2Length () const { return seq2Length; } ///////////////////////////////////////////////////////////////// // SparseMatrix::GetRowPtr // // Returns the pointer to a particular row in the sparse matrix. ///////////////////////////////////////////////////////////////// int GetNumCells () const { return data.size(); } ///////////////////////////////////////////////////////////////// // SparseMatrix::Print() // // Prints out a sparse matrix. 
///////////////////////////////////////////////////////////////// void Print (ostream &outfile) const { outfile << "Sparse Matrix:" << endl; for (int i = 1; i <= seq1Length; i++){ outfile << " " << i << ":"; for (int j = 0; j < rowSize[i]; j++){ outfile << " (" << rowPtrs[i][j].first << "," << rowPtrs[i][j].second << ")"; } outfile << endl; } } ///////////////////////////////////////////////////////////////// // SparseMatrix::ComputeTranspose() // // Returns a new sparse matrix containing the transpose of the // current matrix. ///////////////////////////////////////////////////////////////// SparseMatrix *ComputeTranspose () const { // create a new sparse matrix SparseMatrix *ret = new SparseMatrix(); int numCells = data.size(); ret->seq1Length = seq2Length; ret->seq2Length = seq1Length; // allocate memory ret->data.resize (numCells); ret->rowSize.resize (seq2Length + 1); ret->rowSize[0] = -1; ret->rowPtrs.resize (seq2Length + 1); ret->rowPtrs[0] = ret->data.end(); // compute row sizes for (int i = 1; i <= seq2Length; i++) ret->rowSize[i] = 0; for (int i = 0; i < numCells; i++) ret->rowSize[data[i].first]++; // compute row ptrs for (int i = 1; i <= seq2Length; i++){ ret->rowPtrs[i] = (i == 1) ? ret->data.begin() : ret->rowPtrs[i-1] + ret->rowSize[i-1]; } // now fill in data SafeVector::iterator> currPtrs = ret->rowPtrs; for (int i = 1; i <= seq1Length; i++){ SafeVector::iterator row = rowPtrs[i]; for (int j = 0; j < rowSize[i]; j++){ currPtrs[row[j].first]->first = i; currPtrs[row[j].first]->second = row[j].second; currPtrs[row[j].first]++; } } return ret; } ///////////////////////////////////////////////////////////////// // SparseMatrix::GetPosterior() // // Return the posterior representation of the sparse matrix. 
///////////////////////////////////////////////////////////////// VF *GetPosterior () const { // create a new posterior matrix VF *posteriorPtr = new VF((seq1Length+1) * (seq2Length+1)); assert (posteriorPtr); VF &posterior = *posteriorPtr; // build the posterior matrix for (int i = 0; i < (seq1Length+1) * (seq2Length+1); i++) posterior[i] = 0; for (int i = 1; i <= seq1Length; i++){ VF::iterator postPtr = posterior.begin() + i * (seq2Length+1); for (int j = 0; j < rowSize[i]; j++){ postPtr[rowPtrs[i][j].first] = rowPtrs[i][j].second; } } return posteriorPtr; } }; #endif