probcons/0000755000175000017500000000000010604511152013074 5ustar chuongdochuongdoprobcons/CompareToRef.cc0000644000175000017500000002202210075047142015734 0ustar chuongdochuongdo///////////////////////////////////////////////////////////////// // CompareToRef.cc // // Program for scoring alignments according to the SUM-OF-PAIRS // or COLUMN score. ///////////////////////////////////////////////////////////////// #include "SafeVector.h" #include "MultiSequence.h" #include #include #include #include #include #include #include #include #include #include #include const char CORE_BLOCK = 'h'; typedef pair PII; bool useCoreBlocks = false; bool useColScore = false; bool useCaps = false; bool useBaliAnnot = false; bool makeAnnot = false; ///////////////////////////////////////////////////////////////// // Function prototypes ///////////////////////////////////////////////////////////////// set ComputePairs (MultiSequence *align, bool isRef); set ComputeColumns (MultiSequence *align, bool isRef); string GetName (string s); set coreCols; set refCols, testCols; set refPairs, testPairs; VI annotation; ///////////////////////////////////////////////////////////////// // main() // // Main program. ///////////////////////////////////////////////////////////////// int main (int argc, char **argv){ // check arguments if (argc < 3){ cerr << "Usage: score TEST_ALIGNMENT REFERENCE_ALIGNMENT [BALIBASE_ANNOT_FILE] [-col] [-core] [-caps] [-annot FILENAME]" << endl; exit (1); } // try opening file FileBuffer infile (argv[1]); MultiSequence *testAlign; if (infile.fail()){ cerr << "ERROR: Could not open file '" << argv[1] << "' for reading." << endl; testAlign = NULL; } else { testAlign = new MultiSequence(); assert (testAlign); testAlign->LoadMFA (infile); } infile.close(); MultiSequence *refAlign = new MultiSequence (string (argv[2])); assert (refAlign); string outFilename = ""; for (int i = 3; i < argc; i++){ if (strcmp (argv[i], "-core") == 0) useCoreBlocks = true; else if (strcmp (argv[i], "-col") == 0) useColScore = true; else if (strcmp (argv[i], "-caps") == 0) useCaps = true; else if (strcmp (argv[i], "-annot") == 0){ makeAnnot = true; outFilename = string (argv[++i]); } else { // annotation file useBaliAnnot = true; ifstream annotFile (argv[i]); if (annotFile.fail()){ cerr << "ERROR: Could not read BAliBASE annotation file." << endl; exit (1); } SafeVector *indices = refAlign->GetSequence(0)->GetMapping(); char buffer[10000]; while (annotFile.getline (buffer, 10000)){ istringstream ss; ss.str (string (buffer)); string s; if ((ss >> s) && s == string ("BPOS")){ while (ss >> s){ int begin=-1, end=-1; if (sscanf (s.c_str(), "%d=%d", &begin, &end) == 2){ for (int i = (*indices)[begin]; i <= (*indices)[end]; i++) coreCols.insert (i); } } } } delete indices; annotFile.close(); } } if (useColScore) makeAnnot = false; if (testAlign){ for (int i = 0; i < testAlign->GetNumSequences(); i++){ bool found = false; for (int j = 0; !found && j < refAlign->GetNumSequences(); j++){ if (testAlign->GetSequence(i)->GetHeader() == refAlign->GetSequence(j)->GetHeader()) found = true; } if (!found){ testAlign->RemoveSequence (i); i--; } } for (int i = 0; i < refAlign->GetNumSequences(); i++){ bool found = false; for (int j = 0; !found && j < testAlign->GetNumSequences(); j++){ if (refAlign->GetSequence(i)->GetHeader() == testAlign->GetSequence(j)->GetHeader()) found = true; } if (!found){ refAlign->RemoveSequence (i); i--; } } testAlign->SortByHeader(); refAlign->SortByHeader(); } int TP = 0; int TPFN = 0; int TPFP = 0; double FD, FM; if (useColScore){ refCols = ComputeColumns (refAlign, true); if (testAlign) testCols = ComputeColumns (testAlign, false); set colIntersect; insert_iterator > colIntersectIter (colIntersect, colIntersect.begin()); set_intersection (testCols.begin(), testCols.end(), refCols.begin(), refCols.end(), colIntersectIter); TP = (int) colIntersect.size(); TPFN = (int) refCols.size(); if (testAlign) TPFP = (int) testCols.size(); } else { refPairs = ComputePairs (refAlign, true); if (testAlign) testPairs = ComputePairs (testAlign, false); set pairIntersect; insert_iterator > pairIntersectIter (pairIntersect, pairIntersect.begin()); set_intersection (testPairs.begin(), testPairs.end(), refPairs.begin(), refPairs.end(), pairIntersectIter); TP = (int) pairIntersect.size(); TPFN = (int) refPairs.size(); if (testAlign) TPFP = (int) testPairs.size(); } FD = (double) TP / TPFN; FM = (double) TP / TPFP; cout << GetName(string (argv[2])) << " " << TP << " " << TPFN << " " << TPFP << " " << FD << " " << FM << endl; if (makeAnnot){ ofstream outfile (outFilename.c_str()); for (int i = 0; i < (int) annotation.size(); i++){ outfile << annotation[i] << endl; } outfile.close(); } if (testAlign) delete testAlign; delete refAlign; } int GetOffset (Sequence *testSeq, Sequence *refSeq){ string test = testSeq->GetString(); string ref = refSeq->GetString(); for (int i = 0; i < (int) test.length(); i++) test[i] = toupper(test[i]); for (int i = 0; i < (int) ref.length(); i++) ref[i] = toupper(ref[i]); size_t offset = test.find (ref, 0); if (offset == string::npos){ cerr << "ERROR: Reference string not found in original sequence!" << endl; cerr << " test = " << test << endl; cerr << " ref = " << ref << endl; exit (1); } cerr << "Offset found: " << offset << endl; return (int) offset; } string GetName (string s){ size_t index1 = s.rfind ('/'); size_t index2 = s.rfind ('.'); if (index1 == string::npos) index1 = 0; else index1++; if (index2 == string::npos) index2 = s.length(); if (index2 < index1) index2 = s.length(); return s.substr (index1, index2 - index1); } bool isCore (char ch, int col){ if (ch == '-') return false; if (useBaliAnnot){ return coreCols.find (col) != coreCols.end(); } if (useCaps){ return ch >= 'A' && ch <= 'Z'; } return ch == CORE_BLOCK; } ///////////////////////////////////////////////////////////////// // ComputePairs // // Returns the set of all matching pairs. ///////////////////////////////////////////////////////////////// set ComputePairs (MultiSequence *align, bool isRef){ int N = align->GetNumSequences(); int L = align->GetSequence(0)->GetLength(); // retrieve all sequence data pointers SafeVector::iterator> seqs (N); for (int i = 0; i < N; i++){ seqs[i] = align->GetSequence(i)->GetDataPtr(); assert (align->GetSequence(i)->GetLength() == L); } set ret; VI ctr(N); // compute pairs for (int i = 1; i <= L; i++){ // ctr keeps track of the current position in each sequence for (int j = 0; j < N; j++){ ctr[j] += (seqs[j][i] != '-'); } int good = 0; int ct = 0; // check for all matching pairs for (int j = 0; j < N - 1; j++){ for (int k = j + 1; k < N; k++){ // skip if one of the sequences is gapped if (seqs[j][i] == '-' || seqs[k][i] == '-') continue; // check for core blocks in the reference sequence if (isRef && useCoreBlocks) if (!isCore (seqs[j][i], i) || !isCore (seqs[k][i], i)) continue; // if all ok, then add pair to list of pairs pair p (10000 * j + ctr[j], 10000 * k + ctr[k]); // if we're making an annotation, compute annotation statistics if (makeAnnot && !isRef){ ct++; if (refPairs.find (p) != refPairs.end()) good++; } ret.insert (p); } } // build annotation if (makeAnnot && !isRef){ annotation.push_back ((ct == 0) ? 0 : 100 * good / ct); } } return ret; } ///////////////////////////////////////////////////////////////// // ComputeColumns // // Returns the set of all columns. ///////////////////////////////////////////////////////////////// set ComputeColumns (MultiSequence *align, bool isRef){ int N = align->GetNumSequences(); int L = align->GetSequence(0)->GetLength(); // retrieve all sequence data pointers SafeVector::iterator> seqs (N); for (int i = 0; i < N; i++){ seqs[i] = align->GetSequence(i)->GetDataPtr(); } set ret; VI ctr(N); // compute pairs for (int i = 1; i <= L; i++){ // ctr keeps track of the current position in each sequence for (int j = 0; j < N; j++){ ctr[j] += (seqs[j][i] != '-'); } // add column, pick only positions that are matched SafeVector column (N); bool useThisColumn = !useCoreBlocks; for (int j = 0; j < N; j++){ if (isCore (seqs[j][i], i)) useThisColumn = true; column[j] = (seqs[j][i] == '-') ? -1 : ctr[j]; } if (useThisColumn || !isRef) ret.insert (column); } return ret; } probcons/Defaults.h0000644000175000017500000001270610075047142015027 0ustar chuongdochuongdo///////////////////////////////////////////////////////////////// // Defaults.h // // Default constants for use in PROBCONS. The emission // probabilities were computed using the program used to build // the BLOSUM62 matrix from the BLOCKS 5.0 dataset. Transition // parameters were obtained via unsupervised EM training on the // BALIBASE 2.0 benchmark alignment database. ///////////////////////////////////////////////////////////////// #ifndef DEFAULTS_H #define DEFAULTS_H #include using namespace std; /* float initDistrib1Default[] = { 0.3202854395, 0.3398572505, 0.3398572505 }; float gapOpen1Default[] = { 0.1375414133, 0.1375414133 }; float gapExtend1Default[] = { 0.7832147479, 0.7832147479 }; */ float initDistrib1Default[] = { 0.6080327034f, 0.1959836632f, 0.1959836632f }; float gapOpen1Default[] = { 0.01993141696f, 0.01993141696f }; float gapExtend1Default[] = { 0.7943345308f, 0.7943345308f }; float initDistrib2Default[] = { 0.6814756989f, 8.615339902e-05f, 8.615339902e-05f, 0.1591759622f, 0.1591759622 }; float gapOpen2Default[] = { 0.0119511066f, 0.0119511066f, 0.008008334786f, 0.008008334786 }; float gapExtend2Default[] = { 0.3965826333f, 0.3965826333f, 0.8988758326f, 0.8988758326 }; string alphabetDefault = "ARNDCQEGHILKMFPSTWYV"; float emitSingleDefault[20] = { 0.07831005f, 0.05246024f, 0.04433257f, 0.05130349f, 0.02189704f, 0.03585766f, 0.05615771f, 0.07783433f, 0.02601093f, 0.06511648f, 0.09716489f, 0.05877077f, 0.02438117f, 0.04463228f, 0.03940142f, 0.05849916f, 0.05115306f, 0.01203523f, 0.03124726f, 0.07343426f }; float emitPairsDefault[20][20] = { {0.02373072f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, {0.00244502f, 0.01775118f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, {0.00210228f, 0.00207782f, 0.01281864f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, {0.00223549f, 0.00161657f, 0.00353540f, 0.01911178f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, {0.00145515f, 0.00044701f, 0.00042479f, 0.00036798f, 0.01013470f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, {0.00219102f, 0.00253532f, 0.00158223f, 0.00176784f, 0.00032102f, 0.00756604f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, {0.00332218f, 0.00268865f, 0.00224738f, 0.00496800f, 0.00037956f, 0.00345128f, 0.01676565f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, {0.00597898f, 0.00194865f, 0.00288882f, 0.00235249f, 0.00071206f, 0.00142432f, 0.00214860f, 0.04062876f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, {0.00114353f, 0.00132105f, 0.00141205f, 0.00097077f, 0.00026421f, 0.00113901f, 0.00131767f, 0.00103704f, 0.00867996f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, {0.00318853f, 0.00138145f, 0.00104273f, 0.00105355f, 0.00094040f, 0.00100883f, 0.00124207f, 0.00142520f, 0.00059716f, 0.01778263f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, {0.00449576f, 0.00246811f, 0.00160275f, 0.00161966f, 0.00138494f, 0.00180553f, 0.00222063f, 0.00212853f, 0.00111754f, 0.01071834f, 0.03583921f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, {0.00331693f, 0.00595650f, 0.00257310f, 0.00252518f, 0.00046951f, 0.00312308f, 0.00428420f, 0.00259311f, 0.00121376f, 0.00157852f, 0.00259626f, 0.01612228f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, {0.00148878f, 0.00076734f, 0.00063401f, 0.00047808f, 0.00037421f, 0.00075546f, 0.00076105f, 0.00066504f, 0.00042237f, 0.00224097f, 0.00461939f, 0.00096120f, 0.00409522f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, {0.00165004f, 0.00090768f, 0.00084658f, 0.00069041f, 0.00052274f, 0.00059248f, 0.00078814f, 0.00115204f, 0.00072545f, 0.00279948f, 0.00533369f, 0.00087222f, 0.00116111f, 0.01661038f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, {0.00230618f, 0.00106268f, 0.00100282f, 0.00125381f, 0.00034766f, 0.00090111f, 0.00151550f, 0.00155601f, 0.00049078f, 0.00103767f, 0.00157310f, 0.00154836f, 0.00046718f, 0.00060701f, 0.01846071f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, {0.00631752f, 0.00224540f, 0.00301397f, 0.00285226f, 0.00094867f, 0.00191155f, 0.00293898f, 0.00381962f, 0.00116422f, 0.00173565f, 0.00250962f, 0.00312633f, 0.00087787f, 0.00119036f, 0.00180037f, 0.01346609f, 0.0f, 0.0f, 0.0f, 0.0f}, {0.00389995f, 0.00186053f, 0.00220144f, 0.00180488f, 0.00073798f, 0.00154526f, 0.00216760f, 0.00214841f, 0.00077747f, 0.00248968f, 0.00302273f, 0.00250862f, 0.00093371f, 0.00107595f, 0.00147982f, 0.00487295f, 0.01299436f, 0.0f, 0.0f, 0.0f}, {0.00039119f, 0.00029139f, 0.00021006f, 0.00016015f, 0.00010666f, 0.00020592f, 0.00023815f, 0.00038786f, 0.00019097f, 0.00039549f, 0.00076736f, 0.00028448f, 0.00016253f, 0.00085751f, 0.00015674f, 0.00026525f, 0.00024961f, 0.00563625f, 0.0f, 0.0f}, {0.00131840f, 0.00099430f, 0.00074960f, 0.00066005f, 0.00036626f, 0.00070192f, 0.00092548f, 0.00089301f, 0.00131038f, 0.00127857f, 0.00219713f, 0.00100817f, 0.00054105f, 0.00368739f, 0.00047608f, 0.00102648f, 0.00094759f, 0.00069226f, 0.00999315f, 0.0f}, {0.00533241f, 0.00169359f, 0.00136609f, 0.00127915f, 0.00119152f, 0.00132844f, 0.00178697f, 0.00194579f, 0.00071553f, 0.01117956f, 0.00914460f, 0.00210897f, 0.00197461f, 0.00256159f, 0.00135781f, 0.00241601f, 0.00343452f, 0.00038538f, 0.00148001f, 0.02075171f} }; #endif probcons/EvolutionaryTree.h0000644000175000017500000001364410075047142016602 0ustar chuongdochuongdo///////////////////////////////////////////////////////////////// // EvolutionaryTree.h // // Utilities for reading/writing multiple sequence data. ///////////////////////////////////////////////////////////////// #ifndef EVOLUTIONARYTREE_H #define EVOLUTIONARYTREE_H #include #include #include #include "SafeVector.h" #include "MultiSequence.h" #include "Sequence.h" using namespace std; ///////////////////////////////////////////////////////////////// // TreeNode // // The fundamental unit for representing an alignment tree. The // guide tree is represented as a binary tree. ///////////////////////////////////////////////////////////////// class TreeNode { int sequenceLabel; // sequence label TreeNode *left, *right, *parent; // pointers to left, right children ///////////////////////////////////////////////////////////////// // TreeNode::PrintNode() // // Internal routine used to print out the sequence comments // associated with the evolutionary tree, using a hierarchical // parenthesized format. ///////////////////////////////////////////////////////////////// void PrintNode (ostream &outfile, const MultiSequence *sequences) const { // if this is a leaf node, print out the associated sequence comment if (sequenceLabel >= 0) outfile << sequences->GetSequence (sequenceLabel)->GetHeader(); // otherwise, it must have two children; print out their subtrees recursively else { assert (left); assert (right); outfile << "("; left->PrintNode (outfile, sequences); outfile << " "; right->PrintNode (outfile, sequences); outfile << ")"; } } public: ///////////////////////////////////////////////////////////////// // TreeNode::TreeNode() // // Constructor for a tree node. Note that sequenceLabel = -1 // implies that the current node is not a leaf in the tree. ///////////////////////////////////////////////////////////////// TreeNode (int sequenceLabel) : sequenceLabel (sequenceLabel), left (NULL), right (NULL), parent (NULL) { assert (sequenceLabel >= -1); } ///////////////////////////////////////////////////////////////// // TreeNode::~TreeNode() // // Destructor for a tree node. Recursively deletes all children. ///////////////////////////////////////////////////////////////// ~TreeNode (){ if (left){ delete left; left = NULL; } if (right){ delete right; right = NULL; } parent = NULL; } // getters int GetSequenceLabel () const { return sequenceLabel; } TreeNode *GetLeftChild () const { return left; } TreeNode *GetRightChild () const { return right; } TreeNode *GetParent () const { return parent; } // setters void SetSequenceLabel (int sequenceLabel){ this->sequenceLabel = sequenceLabel; assert (sequenceLabel >= -1); } void SetLeftChild (TreeNode *left){ this->left = left; } void SetRightChild (TreeNode *right){ this->right = right; } void SetParent (TreeNode *parent){ this->parent = parent; } ///////////////////////////////////////////////////////////////// // TreeNode::ComputeTree() // // Routine used to compute an evolutionary tree based on the // given distance matrix. We assume the distance matrix has the // form, distMatrix[i][j] = expected accuracy of aligning i with j. ///////////////////////////////////////////////////////////////// static TreeNode *ComputeTree (const VVF &distMatrix){ int numSeqs = distMatrix.size(); // number of sequences in distance matrix VVF distances (numSeqs, VF (numSeqs)); // a copy of the distance matrix SafeVector nodes (numSeqs, NULL); // list of nodes for each sequence SafeVector valid (numSeqs, 1); // valid[i] tells whether or not the ith // nodes in the distances and nodes array // are valid // initialization: make a copy of the distance matrix for (int i = 0; i < numSeqs; i++) for (int j = 0; j < numSeqs; j++) distances[i][j] = distMatrix[i][j]; // initialization: create all the leaf nodes for (int i = 0; i < numSeqs; i++){ nodes[i] = new TreeNode (i); assert (nodes[i]); } // repeat until only a single node left for (int numNodesLeft = numSeqs; numNodesLeft > 1; numNodesLeft--){ float bestProb = -1; pair bestPair; // find the closest pair for (int i = 0; i < numSeqs; i++) if (valid[i]){ for (int j = i+1; j < numSeqs; j++) if (valid[j]){ if (distances[i][j] > bestProb){ bestProb = distances[i][j]; bestPair = make_pair(i, j); } } } // merge the closest pair TreeNode *newParent = new TreeNode (-1); newParent->SetLeftChild (nodes[bestPair.first]); newParent->SetRightChild (nodes[bestPair.second]); nodes[bestPair.first]->SetParent (newParent); nodes[bestPair.second]->SetParent (newParent); nodes[bestPair.first] = newParent; nodes[bestPair.second] = NULL; // now update the distance matrix for (int i = 0; i < numSeqs; i++) if (valid[i]){ distances[bestPair.first][i] = distances[i][bestPair.first] = (distances[i][bestPair.first] + distances[i][bestPair.second]) * bestProb / 2; } // finally, mark the second node entry as no longer valid valid[bestPair.second] = 0; } assert (nodes[0]); return nodes[0]; } ///////////////////////////////////////////////////////////////// // TreeNode::Print() // // Print out the subtree associated with this node in a // parenthesized representation. ///////////////////////////////////////////////////////////////// void Print (ostream &outfile, const MultiSequence *sequences) const { outfile << "Alignment tree: "; PrintNode (outfile, sequences); outfile << endl; } }; #endif probcons/FileBuffer.h0000644000175000017500000000506610075047142015272 0ustar chuongdochuongdo///////////////////////////////////////////////////////////////// // FileBuffer.h // // Buffered file reading. ///////////////////////////////////////////////////////////////// #ifndef FILEBUFFER_H #define FILEBUFFER_H #include #include #include using namespace std; const int BufferSize = 1000; ///////////////////////////////////////////////////////////////// // FileBuffer // // Class for buffering file reading. ///////////////////////////////////////////////////////////////// class FileBuffer { ifstream file; char buffer[BufferSize]; int currPos; int size; bool isEOF; bool isValid; bool canUnget; public: // Some common routines FileBuffer (const char *filename) : file (filename), currPos (0), size (0), isEOF (false), isValid (!file.fail()), canUnget (false){} ~FileBuffer (){ close(); } bool fail () const { return !isValid; } bool eof () const { return (!isValid || isEOF); } void close(){ file.close(); isValid = false; } ///////////////////////////////////////////////////////////////// // FileBuffer::Get() // // Retrieve a character from the file buffer. Returns true if // and only if a character is read. ///////////////////////////////////////////////////////////////// bool Get (char &ch){ // check to make sure that there's more stuff in the file if (!isValid || isEOF) return false; // if the buffer is empty, it's time to reload it if (currPos == size){ file.read (buffer, BufferSize); size = file.gcount(); isEOF = (size == 0); currPos = 0; if (isEOF) return false; } // store the read character ch = buffer[currPos++]; canUnget = true; return true; } ///////////////////////////////////////////////////////////////// // FileBuffer::UnGet() // // Unretrieve the most recently read character from the file // buffer. Note that this allows only a one-level undo. ///////////////////////////////////////////////////////////////// void UnGet (){ assert (canUnget); assert (isValid); assert (currPos > 0); currPos--; assert (currPos < size); isEOF = false; canUnget = false; } ///////////////////////////////////////////////////////////////// // FileBuffer::GetLine() // // Retrieve characters of text until a newline character is // encountered. Terminates properly on end-of-file condition. ///////////////////////////////////////////////////////////////// void GetLine (string &s){ char ch; s = ""; while (Get (ch) && ch != '\n') s += ch; } }; #endif probcons/FixRef.cc0000644000175000017500000010526010075047142014577 0ustar chuongdochuongdo///////////////////////////////////////////////////////////////// // Main.cc ///////////////////////////////////////////////////////////////// #include "SafeVector.h" #include "MultiSequence.h" #include "Defaults.h" #include "ScoreType.h" #include "ProbabilisticModel.h" #include "EvolutionaryTree.h" #include "SparseMatrix.h" #include #include #include #include #include #include #include #include #include #include string matrixFilename = ""; string parametersInputFilename = ""; string parametersOutputFilename = "no training"; bool enableTraining = false; bool enableVerbose = false; int numConsistencyReps = 2; int numPreTrainingReps = 0; int numIterativeRefinementReps = 100; float gapOpenPenalty = 0; float gapContinuePenalty = 0; VF initDistrib (NumMatrixTypes); VF gapOpen (2*NumInsertStates); VF gapExtend (2*NumInsertStates); SafeVector alphabet; VVF emitPairs; VF emitSingle; const int MIN_PRETRAINING_REPS = 0; const int MAX_PRETRAINING_REPS = 20; const int MIN_CONSISTENCY_REPS = 0; const int MAX_CONSISTENCY_REPS = 5; const int MIN_ITERATIVE_REFINEMENT_REPS = 0; const int MAX_ITERATIVE_REFINEMENT_REPS = 1000; ///////////////////////////////////////////////////////////////// // Function prototypes ///////////////////////////////////////////////////////////////// void PrintHeading(); void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen, const VF &gapExtend, const char *filename); MultiSequence *DoAlign (MultiSequence *sequence, const ProbabilisticModel &model); SafeVector ParseParams (int argc, char **argv); void ReadParameters (); MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences, const SafeVector > &sparseMatrices, const ProbabilisticModel &model); MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2, const SafeVector > &sparseMatrices, const ProbabilisticModel &model); void DoRelaxation (MultiSequence *sequences, SafeVector > &sparseMatrices); void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior); void DoIterativeRefinement (const SafeVector > &sparseMatrices, const ProbabilisticModel &model, MultiSequence* &alignment); //float ScoreAlignment (MultiSequence *alignment, MultiSequence *sequences, SparseMatrix **sparseMatrices, const int numSeqs); ///////////////////////////////////////////////////////////////// // main() // // Calls all initialization routines and runs the PROBCONS // aligner. ///////////////////////////////////////////////////////////////// int main (int argc, char **argv){ if (argc != 3){ cerr << "Usage: FixRef inputfile reffile" << endl; exit (1); } string inputFilename = string (argv[1]); string refFilename = string (argv[2]); ReadParameters(); // build new model for aligning ProbabilisticModel model (initDistrib, gapOpen, gapExtend, alphabet, emitPairs, emitSingle); MultiSequence *inputSeq = new MultiSequence(); inputSeq->LoadMFA (inputFilename); MultiSequence *refSeq = new MultiSequence(); refSeq->LoadMFA (refFilename); SafeVector *ali = new SafeVector; if (refSeq->GetNumSequences() != 2){ cerr << "ERROR: Expected two sequences in reference alignment." << endl; exit (1); } set s; s.insert (0); MultiSequence *ref1 = refSeq->Project (s); s.clear(); s.insert (1); MultiSequence *ref2 = refSeq->Project (s); for (int i = 0; i < inputSeq->GetNumSequences(); i++){ if (inputSeq->GetSequence(i)->GetHeader() == ref1->GetSequence(0)->GetHeader()){ ref1->AddSequence (inputSeq->GetSequence(i)->Clone()); } if (inputSeq->GetSequence(i)->GetHeader() == ref2->GetSequence(0)->GetHeader()) ref2->AddSequence (inputSeq->GetSequence(i)->Clone()); } if (ref1->GetNumSequences() != 2){ cerr << "ERROR: Expected two sequences in reference1 alignment." << endl; exit (1); } if (ref2->GetNumSequences() != 2){ cerr << "ERROR: Expected two sequences in reference2 alignment." << endl; exit (1); } ref1->GetSequence(0)->SetLabel(0); ref2->GetSequence(0)->SetLabel(0); ref1->GetSequence(1)->SetLabel(1); ref2->GetSequence(1)->SetLabel(1); // cerr << "Aligning..." << endl; // now, we can perform the alignments and write them out MultiSequence *alignment1 = DoAlign (ref1, ProbabilisticModel (initDistrib, gapOpen, gapExtend, alphabet, emitPairs, emitSingle)); //cerr << "Aligning second..." << endl; MultiSequence *alignment2 = DoAlign (ref2, ProbabilisticModel (initDistrib, gapOpen, gapExtend, alphabet, emitPairs, emitSingle)); SafeVector::iterator iter1 = alignment1->GetSequence(0)->GetDataPtr(); SafeVector::iterator iter2 = alignment1->GetSequence(1)->GetDataPtr(); for (int i = 1; i <= alignment1->GetSequence(0)->GetLength(); i++){ if (islower(iter1[i])) iter2[i] = tolower(iter2[i]); if (isupper(iter1[i])) iter2[i] = toupper(iter2[i]); } iter1 = alignment2->GetSequence(0)->GetDataPtr(); iter2 = alignment2->GetSequence(1)->GetDataPtr(); for (int i = 1; i <= alignment2->GetSequence(0)->GetLength(); i++){ if (islower(iter1[i])) iter2[i] = tolower(iter2[i]); if (isupper(iter1[i])) iter2[i] = toupper(iter2[i]); } //alignment1->WriteMFA (cout); //alignment2->WriteMFA (cout); int a1 = 0, a = 0; int b1 = 0, b = 0; for (int i = 1; i <= refSeq->GetSequence(0)->GetLength(); i++){ // catch up in filler sequences if (refSeq->GetSequence(0)->GetPosition(i) != '-'){ while (true){ a++; if (alignment1->GetSequence(0)->GetPosition(a) != '-') break; ali->push_back ('X'); } } if (refSeq->GetSequence(1)->GetPosition(i) != '-'){ while (true){ b++; if (alignment2->GetSequence(0)->GetPosition(b) != '-') break; ali->push_back ('Y'); } } if (refSeq->GetSequence(0)->GetPosition(i) != '-' && refSeq->GetSequence(1)->GetPosition(i) != '-'){ //cerr << "M: " << refSeq->GetSequence(0)->GetPosition(i) << refSeq->GetSequence(1)->GetPosition(i) << endl; ali->push_back ('B'); } else if (refSeq->GetSequence(0)->GetPosition(i) != '-'){ //cerr << "X" << endl; ali->push_back ('X'); } else if (refSeq->GetSequence(1)->GetPosition(i) != '-'){ //cerr << "Y" << endl; ali->push_back ('Y'); } } while (a < alignment1->GetSequence(0)->GetLength()){ a++; ali->push_back ('X'); if (alignment1->GetSequence(0)->GetPosition(a) != '-') a1++; } while (b < alignment2->GetSequence(0)->GetLength()){ b++; ali->push_back ('Y'); if (alignment2->GetSequence(0)->GetPosition(b) != '-') b1++; } Sequence *seq1 = alignment1->GetSequence(1)->AddGaps (ali, 'X'); Sequence *seq2 = alignment2->GetSequence(1)->AddGaps (ali, 'Y'); seq1->WriteMFA (cout, 60); seq2->WriteMFA (cout, 60); delete seq1; delete seq2; delete ali; delete alignment1; delete alignment2; delete inputSeq; delete refSeq; } ///////////////////////////////////////////////////////////////// // PrintHeading() // // Prints heading for PROBCONS program. ///////////////////////////////////////////////////////////////// void PrintHeading (){ cerr << endl << "PROBCONS version 1.02 - align multiple protein sequences and print to standard output" << endl << "Copyright (C) 2004 Chuong Ba Do" << endl << endl; } ///////////////////////////////////////////////////////////////// // PrintParameters() // // Prints PROBCONS parameters to STDERR. If a filename is // specified, then the parameters are also written to the file. ///////////////////////////////////////////////////////////////// void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen, const VF &gapExtend, const char *filename){ // print parameters to the screen cerr << message << endl << " initDistrib[] = { "; for (int i = 0; i < NumMatrixTypes; i++) cerr << setprecision (10) << initDistrib[i] << " "; cerr << "}" << endl << " gapOpen[] = { "; for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapOpen[i] << " "; cerr << "}" << endl << " gapExtend[] = { "; for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapExtend[i] << " "; cerr << "}" << endl << endl; // if a file name is specified if (filename){ // attempt to open the file for writing FILE *file = fopen (filename, "w"); if (!file){ cerr << "ERROR: Unable to write parameter file: " << filename << endl; exit (1); } // if successful, then write the parameters to the file for (int i = 0; i < NumMatrixTypes; i++) fprintf (file, "%.10f ", initDistrib[i]); fprintf (file, "\n"); for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapOpen[i]); fprintf (file, "\n"); for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapExtend[i]); fprintf (file, "\n"); fclose (file); } } ///////////////////////////////////////////////////////////////// // DoAlign() // // First computes all pairwise posterior probability matrices. // Then, computes new parameters if training, or a final // alignment, otherwise. ///////////////////////////////////////////////////////////////// MultiSequence *DoAlign (MultiSequence *sequences, const ProbabilisticModel &model){ assert (sequences); const int numSeqs = sequences->GetNumSequences(); VVF distances (numSeqs, VF (numSeqs, 0)); SafeVector > sparseMatrices (numSeqs, SafeVector(numSeqs, NULL)); // do all pairwise alignments for (int a = 0; a < numSeqs-1; a++){ for (int b = a+1; b < numSeqs; b++){ Sequence *seq1 = sequences->GetSequence (a); Sequence *seq2 = sequences->GetSequence (b); // verbose output if (enableVerbose) cerr << "(" << a+1 << ") " << seq1->GetHeader() << " vs. " << "(" << b+1 << ") " << seq2->GetHeader() << ": "; // compute forward and backward probabilities VF *forward = model.ComputeForwardMatrix (seq1, seq2); assert (forward); VF *backward = model.ComputeBackwardMatrix (seq1, seq2); assert (backward); // if we are training, then we'll simply want to compute the // expected counts for each region within the matrix separately; // otherwise, we'll need to put all of the regions together and // assemble a posterior probability match matrix // compute posterior probability matrix VF *posterior = model.ComputePosteriorMatrix (seq1, seq2, *forward, *backward); assert (posterior); // compute "expected accuracy" distance for evolutionary tree computation pair *, float> alignment = model.ComputeAlignment (seq1->GetLength(), seq2->GetLength(), *posterior); float distance = alignment.second / min (seq1->GetLength(), seq2->GetLength()); if (enableVerbose) cerr << setprecision (10) << distance << endl; // save posterior probability matrices in sparse format distances[a][b] = distances[b][a] = distance; sparseMatrices[a][b] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), *posterior); sparseMatrices[b][a] = sparseMatrices[a][b]->ComputeTranspose(); delete alignment.first; delete posterior; delete forward; delete backward; } } if (!enableTraining){ if (enableVerbose) cerr << endl; // now, perform the consistency transformation the desired number of times for (int i = 0; i < numConsistencyReps; i++) DoRelaxation (sequences, sparseMatrices); // compute the evolutionary tree TreeNode *tree = TreeNode::ComputeTree (distances); //tree->Print (cerr, sequences); //cerr << endl; // make the final alignment MultiSequence *alignment = ComputeFinalAlignment (tree, sequences, sparseMatrices, model); delete tree; return alignment; } return NULL; } ///////////////////////////////////////////////////////////////// // GetInteger() // // Attempts to parse an integer from the character string given. // Returns true only if no parsing error occurs. ///////////////////////////////////////////////////////////////// bool GetInteger (char *data, int *val){ char *endPtr; long int retVal; assert (val); errno = 0; retVal = strtol (data, &endPtr, 0); if (retVal == 0 && (errno != 0 || data == endPtr)) return false; if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN)) return false; if (retVal < (long) INT_MIN || retVal > (long) INT_MAX) return false; *val = (int) retVal; return true; } ///////////////////////////////////////////////////////////////// // GetFloat() // // Attempts to parse a float from the character string given. // Returns true only if no parsing error occurs. ///////////////////////////////////////////////////////////////// bool GetFloat (char *data, float *val){ char *endPtr; double retVal; assert (val); errno = 0; retVal = strtod (data, &endPtr); if (retVal == 0 && (errno != 0 || data == endPtr)) return false; if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0)) return false; *val = (float) retVal; return true; } ///////////////////////////////////////////////////////////////// // ParseParams() // // Parse all command-line options. ///////////////////////////////////////////////////////////////// SafeVector ParseParams (int argc, char **argv){ if (argc < 2){ cerr << "PROBCONS comes with ABSOLUTELY NO WARRANTY. This is free software, and" << endl << "you are welcome to redistribute it under certain conditions. See the" << endl << "file COPYING.txt for details." << endl << endl << "Usage:" << endl << " probcons [OPTION]... [MFAFILE]..." << endl << endl << "Description:" << endl << " Align sequences in MFAFILE(s) and print result to standard output" << endl << endl << " -t, --train FILENAME" << endl << " compute EM transition probabilities, store in FILENAME (default: " << parametersOutputFilename << ")" << endl << endl << " -m, --matrixfile FILENAME" << endl << " read transition parameters from FILENAME (default: " << matrixFilename << ")" << endl << endl << " -p, --paramfile FILENAME" << endl << " read scoring matrix probabilities from FILENAME (default: " << parametersInputFilename << ")" << endl << endl << " -c, --consistency REPS" << endl << " use " << MIN_CONSISTENCY_REPS << " <= REPS <= " << MAX_CONSISTENCY_REPS << " (default: " << numConsistencyReps << ") passes of consistency transformation" << endl << endl << " -ir, --iterative-refinement REPS" << endl << " use " << MIN_ITERATIVE_REFINEMENT_REPS << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS << " (default: " << numIterativeRefinementReps << ") passes of iterative-refinement" << endl << endl << " -pre, --pre-training REPS" << endl << " use " << MIN_PRETRAINING_REPS << " <= REPS <= " << MAX_PRETRAINING_REPS << " (default: " << numPreTrainingReps << ") rounds of pretraining" << endl << endl << " -go, --gap-open VALUE" << endl << " gap opening penalty of VALUE <= 0 (default: " << gapOpenPenalty << ")" << endl << endl << " -ge, --gap-extension VALUE" << endl << " gap extension penalty of VALUE <= 0 (default: " << gapContinuePenalty << ")" << endl << endl << " -v, --verbose" << endl << " report progress while aligning (default: " << (enableVerbose ? "on" : "off") << ")" << endl << endl; exit (1); } SafeVector sequenceNames; int tempInt; float tempFloat; for (int i = 1; i < argc; i++){ if (argv[i][0] == '-'){ // training if (!strcmp (argv[i], "-t") || !strcmp (argv[i], "--train")){ enableTraining = true; if (i < argc - 1) parametersOutputFilename = string (argv[++i]); else { cerr << "ERROR: Filename expected for option " << argv[i] << endl; exit (1); } } // scoring matrix file else if (!strcmp (argv[i], "-m") || !strcmp (argv[i], "--matrixfile")){ if (i < argc - 1) matrixFilename = string (argv[++i]); else { cerr << "ERROR: Filename expected for option " << argv[i] << endl; exit (1); } } // transition/initial distribution parameter file else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){ if (i < argc - 1) parametersInputFilename = string (argv[++i]); else { cerr << "ERROR: Filename expected for option " << argv[i] << endl; exit (1); } } // number of consistency transformations else if (!strcmp (argv[i], "-c") || !strcmp (argv[i], "--consistency")){ if (i < argc - 1){ if (!GetInteger (argv[++i], &tempInt)){ cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl; exit (1); } else { if (tempInt < MIN_CONSISTENCY_REPS || tempInt > MAX_CONSISTENCY_REPS){ cerr << "ERROR: For option " << argv[i-1] << ", integer must be between " << MIN_CONSISTENCY_REPS << " and " << MAX_CONSISTENCY_REPS << "." << endl; exit (1); } else numConsistencyReps = tempInt; } } else { cerr << "ERROR: Integer expected for option " << argv[i] << endl; exit (1); } } // number of randomized partitioning iterative refinement passes else if (!strcmp (argv[i], "-ir") || !strcmp (argv[i], "--iterative-refinement")){ if (i < argc - 1){ if (!GetInteger (argv[++i], &tempInt)){ cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl; exit (1); } else { if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS || tempInt > MAX_ITERATIVE_REFINEMENT_REPS){ cerr << "ERROR: For option " << argv[i-1] << ", integer must be between " << MIN_ITERATIVE_REFINEMENT_REPS << " and " << MAX_ITERATIVE_REFINEMENT_REPS << "." << endl; exit (1); } else numIterativeRefinementReps = tempInt; } } else { cerr << "ERROR: Integer expected for option " << argv[i] << endl; exit (1); } } // number of EM pre-training rounds else if (!strcmp (argv[i], "-pre") || !strcmp (argv[i], "--pre-training")){ if (i < argc - 1){ if (!GetInteger (argv[++i], &tempInt)){ cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl; exit (1); } else { if (tempInt < MIN_PRETRAINING_REPS || tempInt > MAX_PRETRAINING_REPS){ cerr << "ERROR: For option " << argv[i-1] << ", integer must be between " << MIN_PRETRAINING_REPS << " and " << MAX_PRETRAINING_REPS << "." << endl; exit (1); } else numPreTrainingReps = tempInt; } } else { cerr << "ERROR: Integer expected for option " << argv[i] << endl; exit (1); } } // gap open penalty else if (!strcmp (argv[i], "-go") || !strcmp (argv[i], "--gap-open")){ if (i < argc - 1){ if (!GetFloat (argv[++i], &tempFloat)){ cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl; exit (1); } else { if (tempFloat > 0){ cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl; exit (1); } else gapOpenPenalty = tempFloat; } } else { cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl; exit (1); } } // gap extension penalty else if (!strcmp (argv[i], "-ge") || !strcmp (argv[i], "--gap-extension")){ if (i < argc - 1){ if (!GetFloat (argv[++i], &tempFloat)){ cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl; exit (1); } else { if (tempFloat > 0){ cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl; exit (1); } else gapContinuePenalty = tempFloat; } } else { cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl; exit (1); } } // verbose reporting else if (!strcmp (argv[i], "-v") || !strcmp (argv[i], "--verbose")){ enableVerbose = true; } // bad arguments else { cerr << "ERROR: Unrecognized option: " << argv[i] << endl; exit (1); } } else { sequenceNames.push_back (string (argv[i])); } } return sequenceNames; } ///////////////////////////////////////////////////////////////// // ReadParameters() // // Read initial distribution, transition, and emission // parameters from a file. ///////////////////////////////////////////////////////////////// void ReadParameters (){ ifstream data; // read initial state distribution and transition parameters if (parametersInputFilename == string ("")){ if (NumInsertStates == 1){ for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib1Default[i]; for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen1Default[i]; for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend1Default[i]; } else if (NumInsertStates == 2){ for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib2Default[i]; for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen2Default[i]; for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend2Default[i]; } else { cerr << "ERROR: No default initial distribution/parameter settings exist" << endl << " for " << NumInsertStates << " pairs of insert states. Use --paramfile." << endl; exit (1); } } else { data.open (parametersInputFilename.c_str()); if (data.fail()){ cerr << "ERROR: Unable to read parameter file: " << parametersInputFilename << endl; exit (1); } for (int i = 0; i < NumMatrixTypes; i++) data >> initDistrib[i]; for (int i = 0; i < 2*NumInsertStates; i++) data >> gapOpen[i]; for (int i = 0; i < 2*NumInsertStates; i++) data >> gapExtend[i]; data.close(); } // read emission parameters int alphabetSize = 20; // allocate memory alphabet = SafeVector(alphabetSize); emitPairs = VVF (alphabetSize, VF (alphabetSize, 0)); emitSingle = VF (alphabetSize); if (matrixFilename == string ("")){ for (int i = 0; i < alphabetSize; i++) alphabet[i] = alphabetDefault[i]; for (int i = 0; i < alphabetSize; i++){ emitSingle[i] = emitSingleDefault[i]; for (int j = 0; j <= i; j++){ emitPairs[i][j] = emitPairs[j][i] = (i == j); } } } else { data.open (matrixFilename.c_str()); if (data.fail()){ cerr << "ERROR: Unable to read scoring matrix file: " << matrixFilename << endl; exit (1); } for (int i = 0; i < alphabetSize; i++) data >> alphabet[i]; for (int i = 0; i < alphabetSize; i++){ for (int j = 0; j <= i; j++){ data >> emitPairs[i][j]; emitPairs[j][i] = emitPairs[i][j]; } } for (int i = 0; i < alphabetSize; i++){ char ch; data >> ch; assert (ch == alphabet[i]); } for (int i = 0; i < alphabetSize; i++) data >> emitSingle[i]; data.close(); } } ///////////////////////////////////////////////////////////////// // ProcessTree() // // Process the tree recursively. Returns the aligned sequences // corresponding to a node or leaf of the tree. ///////////////////////////////////////////////////////////////// MultiSequence *ProcessTree (const TreeNode *tree, MultiSequence *sequences, const SafeVector > &sparseMatrices, const ProbabilisticModel &model){ MultiSequence *result; // check if this is a node of the alignment tree if (tree->GetSequenceLabel() == -1){ MultiSequence *alignLeft = ProcessTree (tree->GetLeftChild(), sequences, sparseMatrices, model); MultiSequence *alignRight = ProcessTree (tree->GetRightChild(), sequences, sparseMatrices, model); assert (alignLeft); assert (alignRight); result = AlignAlignments (alignLeft, alignRight, sparseMatrices, model); assert (result); delete alignLeft; delete alignRight; } // otherwise, this is a leaf of the alignment tree else { result = new MultiSequence(); assert (result); result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone()); } return result; } ///////////////////////////////////////////////////////////////// // ComputeFinalAlignment() // // Compute the final alignment by calling ProcessTree(), then // performing iterative refinement as needed. ///////////////////////////////////////////////////////////////// MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences, const SafeVector > &sparseMatrices, const ProbabilisticModel &model){ MultiSequence *alignment = ProcessTree (tree, sequences, sparseMatrices, model); // iterative refinement for (int i = 0; i < numIterativeRefinementReps; i++) DoIterativeRefinement (sparseMatrices, model, alignment); cerr << endl; // return final alignment return alignment; } ///////////////////////////////////////////////////////////////// // AlignAlignments() // // Returns the alignment of two MultiSequence objects. ///////////////////////////////////////////////////////////////// MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2, const SafeVector > &sparseMatrices, const ProbabilisticModel &model){ // print some info about the alignment if (enableVerbose){ for (int i = 0; i < align1->GetNumSequences(); i++) cerr << ((i==0) ? "[" : ",") << align1->GetSequence(i)->GetLabel(); cerr << "] vs. "; for (int i = 0; i < align2->GetNumSequences(); i++) cerr << ((i==0) ? "[" : ",") << align2->GetSequence(i)->GetLabel(); cerr << "]: "; } VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices); pair *, float> alignment; // choose the alignment routine depending on the "cosmetic" gap penalties used if (gapOpenPenalty == 0 && gapContinuePenalty == 0) alignment = model.ComputeAlignment (align1->GetSequence(0)->GetLength(), align2->GetSequence(0)->GetLength(), *posterior); else alignment = model.ComputeAlignmentWithGapPenalties (align1, align2, *posterior, align1->GetNumSequences(), align2->GetNumSequences(), gapOpenPenalty, gapContinuePenalty); delete posterior; if (enableVerbose){ // compute total length of sequences int totLength = 0; for (int i = 0; i < align1->GetNumSequences(); i++) for (int j = 0; j < align2->GetNumSequences(); j++) totLength += min (align1->GetSequence(i)->GetLength(), align2->GetSequence(j)->GetLength()); // give an "accuracy" measure for the alignment cerr << alignment.second / totLength << endl; } // now build final alignment MultiSequence *result = new MultiSequence(); for (int i = 0; i < align1->GetNumSequences(); i++) result->AddSequence (align1->GetSequence(i)->AddGaps(alignment.first, 'X')); for (int i = 0; i < align2->GetNumSequences(); i++) result->AddSequence (align2->GetSequence(i)->AddGaps(alignment.first, 'Y')); result->SortByLabel(); // free temporary alignment delete alignment.first; return result; } ///////////////////////////////////////////////////////////////// // DoRelaxation() // // Performs one round of the consistency transformation. The // formula used is: // 1 // P'(x[i]-y[j]) = --- sum sum P(x[i]-z[k]) P(z[k]-y[j]) // |S| z in S k // // where S = {x, y, all other sequences...} // ///////////////////////////////////////////////////////////////// void DoRelaxation (MultiSequence *sequences, SafeVector > &sparseMatrices){ const int numSeqs = sequences->GetNumSequences(); SafeVector > newSparseMatrices (numSeqs, SafeVector(numSeqs, NULL)); // for every pair of sequences for (int i = 0; i < numSeqs; i++){ for (int j = i+1; j < numSeqs; j++){ Sequence *seq1 = sequences->GetSequence (i); Sequence *seq2 = sequences->GetSequence (j); if (enableVerbose) cerr << "Relaxing (" << i+1 << ") " << seq1->GetHeader() << " vs. " << "(" << j+1 << ") " << seq2->GetHeader() << ": "; // get the original posterior matrix VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior(); assert (posteriorPtr); VF &posterior = *posteriorPtr; const int seq1Length = seq1->GetLength(); const int seq2Length = seq2->GetLength(); // contribution from the summation where z = x and z = y for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] += posterior[k]; if (enableVerbose) cerr << sparseMatrices[i][j]->GetNumCells() << " --> "; // contribution from all other sequences for (int k = 0; k < numSeqs; k++) if (k != i && k != j){ Relax (sparseMatrices[i][k], sparseMatrices[k][j], posterior); } // now renormalization for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] /= numSeqs; // save the new posterior matrix newSparseMatrices[i][j] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), posterior); newSparseMatrices[j][i] = newSparseMatrices[i][j]->ComputeTranspose(); if (enableVerbose) cerr << newSparseMatrices[i][j]->GetNumCells() << " -- "; delete posteriorPtr; if (enableVerbose) cerr << "done." << endl; } } // now replace the old posterior matrices for (int i = 0; i < numSeqs; i++){ for (int j = 0; j < numSeqs; j++){ delete sparseMatrices[i][j]; sparseMatrices[i][j] = newSparseMatrices[i][j]; } } } ///////////////////////////////////////////////////////////////// // DoRelaxation() // // Computes the consistency transformation for a single sequence // z, and adds the transformed matrix to "posterior". ///////////////////////////////////////////////////////////////// void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior){ assert (matXZ); assert (matZY); int lengthX = matXZ->GetSeq1Length(); int lengthY = matZY->GetSeq2Length(); assert (matXZ->GetSeq2Length() == matZY->GetSeq1Length()); // for every x[i] for (int i = 1; i <= lengthX; i++){ SafeVector::iterator XZptr = matXZ->GetRowPtr(i); SafeVector::iterator XZend = XZptr + matXZ->GetRowSize(i); VF::iterator base = posterior.begin() + i * (lengthY + 1); // iterate through all x[i]-z[k] while (XZptr != XZend){ SafeVector::iterator ZYptr = matZY->GetRowPtr(XZptr->first); SafeVector::iterator ZYend = ZYptr + matZY->GetRowSize(XZptr->first); const float XZval = XZptr->second; // iterate through all z[k]-y[j] while (ZYptr != ZYend){ base[ZYptr->first] += XZval * ZYptr->second;; ZYptr++; } XZptr++; } } } ///////////////////////////////////////////////////////////////// // DoIterativeRefinement() // // Performs a single round of randomized partionining iterative // refinement. ///////////////////////////////////////////////////////////////// void DoIterativeRefinement (const SafeVector > &sparseMatrices, const ProbabilisticModel &model, MultiSequence* &alignment){ set groupOne, groupTwo; // create two separate groups for (int i = 0; i < alignment->GetNumSequences(); i++){ if (random() % 2) groupOne.insert (i); else groupTwo.insert (i); } if (groupOne.empty() || groupTwo.empty()) return; // project into the two groups MultiSequence *groupOneSeqs = alignment->Project (groupOne); assert (groupOneSeqs); MultiSequence *groupTwoSeqs = alignment->Project (groupTwo); assert (groupTwoSeqs); delete alignment; // realign alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model); } /* float ScoreAlignment (MultiSequence *alignment, MultiSequence *sequences, SparseMatrix **sparseMatrices, const int numSeqs){ int totLength = 0; float score = 0; for (int a = 0; a < alignment->GetNumSequences(); a++){ for (int b = a+1; b < alignment->GetNumSequences(); b++){ Sequence *seq1 = alignment->GetSequence(a); Sequence *seq2 = alignment->GetSequence(b); const int seq1Length = sequences->GetSequence(seq1->GetLabel())->GetLength(); const int seq2Length = sequences->GetSequence(seq2->GetLabel())->GetLength(); totLength += min (seq1Length, seq2Length); int pos1 = 0, pos2 = 0; for (int i = 1; i <= seq1->GetLength(); i++){ char ch1 = seq1->GetPosition(i); char ch2 = seq2->GetPosition(i); if (ch1 != '-') pos1++; if (ch2 != '-') pos2++; if (ch1 != '-' && ch2 != '-'){ score += sparseMatrices[a * numSeqs + b]->GetValue (pos1, pos2); } } } } return score / totLength; } */ probcons/Main.cc0000644000175000017500000014737110604510314014303 0ustar chuongdochuongdo///////////////////////////////////////////////////////////////// // Main.cc // // Main routines for PROBCONS program. ///////////////////////////////////////////////////////////////// #include "SafeVector.h" #include "MultiSequence.h" #include "Defaults.h" #include "ScoreType.h" #include "ProbabilisticModel.h" #include "EvolutionaryTree.h" #include "SparseMatrix.h" #include #include #include #include #include #include #include #include #include #include #include #include string parametersInputFilename = ""; string parametersOutputFilename = "no training"; string annotationFilename = ""; bool enableTraining = false; bool enableVerbose = false; bool enableAllPairs = false; bool enableAnnotation = false; bool enableViterbi = false; bool enableClustalWOutput = false; bool enableTrainEmissions = false; bool enableAlignOrder = false; int numConsistencyReps = 2; int numPreTrainingReps = 0; int numIterativeRefinementReps = 100; float cutoff = 0; float gapOpenPenalty = 0; float gapContinuePenalty = 0; VF initDistrib (NumMatrixTypes); VF gapOpen (2*NumInsertStates); VF gapExtend (2*NumInsertStates); VVF emitPairs (256, VF (256, 1e-10)); VF emitSingle (256, 1e-5); string alphabet = alphabetDefault; const int MIN_PRETRAINING_REPS = 0; const int MAX_PRETRAINING_REPS = 20; const int MIN_CONSISTENCY_REPS = 0; const int MAX_CONSISTENCY_REPS = 5; const int MIN_ITERATIVE_REFINEMENT_REPS = 0; const int MAX_ITERATIVE_REFINEMENT_REPS = 1000; ///////////////////////////////////////////////////////////////// // Function prototypes ///////////////////////////////////////////////////////////////// void PrintHeading(); void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename); MultiSequence *DoAlign (MultiSequence *sequence, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen, VF &gapExtend, VVF &emitPairs, VF &emitSingle); SafeVector ParseParams (int argc, char **argv); void ReadParameters (); MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences, const SafeVector > &sparseMatrices, const ProbabilisticModel &model); MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2, const SafeVector > &sparseMatrices, const ProbabilisticModel &model); SafeVector > DoRelaxation (MultiSequence *sequences, SafeVector > &sparseMatrices); void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior); void Relax1 (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior); set GetSubtree (const TreeNode *tree); void TreeBasedBiPartitioning (const SafeVector > &sparseMatrices, const ProbabilisticModel &model, MultiSequence* &alignment, const TreeNode *tree); void DoIterativeRefinement (const SafeVector > &sparseMatrices, const ProbabilisticModel &model, MultiSequence* &alignment); void WriteAnnotation (MultiSequence *alignment, const SafeVector > &sparseMatrices); int ComputeScore (const SafeVector > &active, const SafeVector > &sparseMatrices); ///////////////////////////////////////////////////////////////// // main() // // Calls all initialization routines and runs the PROBCONS // aligner. ///////////////////////////////////////////////////////////////// int main (int argc, char **argv){ // print PROBCONS heading PrintHeading(); // parse program parameters SafeVector sequenceNames = ParseParams (argc, argv); ReadParameters(); PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL); // now, we'll process all the files given as input. If we are given // several filenames as input, then we'll load all of those sequences // simultaneously, as long as we're not training. On the other hand, // if we are training, then we'll treat each file as a separate // training instance // if we are training if (enableTraining){ // build new model for aligning ProbabilisticModel model (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle); // prepare to average parameters for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0; for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0; for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0; if (enableTrainEmissions){ for (int i = 0; i < (int) emitPairs.size(); i++) for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0; for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0; } // align each file individually for (int i = 0; i < (int) sequenceNames.size(); i++){ VF thisInitDistrib (NumMatrixTypes); VF thisGapOpen (2*NumInsertStates); VF thisGapExtend (2*NumInsertStates); VVF thisEmitPairs (256, VF (256, 1e-10)); VF thisEmitSingle (256, 1e-5); // load sequence file MultiSequence *sequences = new MultiSequence(); assert (sequences); cerr << "Loading sequence file: " << sequenceNames[i] << endl; sequences->LoadMFA (sequenceNames[i], true); // align sequences DoAlign (sequences, model, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle); // add in contribution of the derived parameters for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i]; for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i]; for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i]; if (enableTrainEmissions){ for (int i = 0; i < (int) emitPairs.size(); i++) for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j]; for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i]; } delete sequences; } // compute new parameters and print them out for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= (int) sequenceNames.size(); for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= (int) sequenceNames.size(); for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= (int) sequenceNames.size(); if (enableTrainEmissions){ for (int i = 0; i < (int) emitPairs.size(); i++) for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= (int) sequenceNames.size(); for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= sequenceNames.size(); } PrintParameters ("Trained parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, parametersOutputFilename.c_str()); } // if we are not training, we must simply want to align some sequences else { // load all files together MultiSequence *sequences = new MultiSequence(); assert (sequences); for (int i = 0; i < (int) sequenceNames.size(); i++){ cerr << "Loading sequence file: " << sequenceNames[i] << endl; sequences->LoadMFA (sequenceNames[i], true); } // do all "pre-training" repetitions first for (int ct = 0; ct < numPreTrainingReps; ct++){ enableTraining = true; // build new model for aligning ProbabilisticModel model (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle); // do initial alignments DoAlign (sequences, model, initDistrib, gapOpen, gapExtend, emitPairs, emitSingle); // print new parameters PrintParameters ("Recomputed parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL); enableTraining = false; } // now, we can perform the alignments and write them out MultiSequence *alignment = DoAlign (sequences, ProbabilisticModel (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle), initDistrib, gapOpen, gapExtend, emitPairs, emitSingle); if (!enableAllPairs){ if (enableClustalWOutput) alignment->WriteALN (cout); else alignment->WriteMFA (cout); } delete alignment; delete sequences; } } ///////////////////////////////////////////////////////////////// // PrintHeading() // // Prints heading for PROBCONS program. ///////////////////////////////////////////////////////////////// void PrintHeading (){ cerr << endl << "PROBCONS version " << VERSION << " - align multiple protein sequences and print to standard output" << endl << "Written by Chuong Do" << endl << endl; } ///////////////////////////////////////////////////////////////// // PrintParameters() // // Prints PROBCONS parameters to STDERR. If a filename is // specified, then the parameters are also written to the file. ///////////////////////////////////////////////////////////////// void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename){ // print parameters to the screen cerr << message << endl << " initDistrib[] = { "; for (int i = 0; i < NumMatrixTypes; i++) cerr << setprecision (10) << initDistrib[i] << " "; cerr << "}" << endl << " gapOpen[] = { "; for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapOpen[i] << " "; cerr << "}" << endl << " gapExtend[] = { "; for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapExtend[i] << " "; cerr << "}" << endl << endl; /* for (int i = 0; i < 5; i++){ for (int j = 0; j <= i; j++){ cerr << emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]] << " "; } cerr << endl; }*/ // if a file name is specified if (filename){ // attempt to open the file for writing FILE *file = fopen (filename, "w"); if (!file){ cerr << "ERROR: Unable to write parameter file: " << filename << endl; exit (1); } // if successful, then write the parameters to the file for (int i = 0; i < NumMatrixTypes; i++) fprintf (file, "%.10f ", initDistrib[i]); fprintf (file, "\n"); for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapOpen[i]); fprintf (file, "\n"); for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapExtend[i]); fprintf (file, "\n"); fprintf (file, "%s\n", alphabet.c_str()); for (int i = 0; i < (int) alphabet.size(); i++){ for (int j = 0; j <= i; j++) fprintf (file, "%.10f ", emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]); fprintf (file, "\n"); } for (int i = 0; i < (int) alphabet.size(); i++) fprintf (file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]); fprintf (file, "\n"); fclose (file); } } ///////////////////////////////////////////////////////////////// // DoAlign() // // First computes all pairwise posterior probability matrices. // Then, computes new parameters if training, or a final // alignment, otherwise. ///////////////////////////////////////////////////////////////// MultiSequence *DoAlign (MultiSequence *sequences, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen, VF &gapExtend, VVF &emitPairs, VF &emitSingle){ assert (sequences); const int numSeqs = sequences->GetNumSequences(); VVF distances (numSeqs, VF (numSeqs, 0)); SafeVector > sparseMatrices (numSeqs, SafeVector(numSeqs, NULL)); if (enableTraining){ // prepare to average parameters for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0; for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0; for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0; if (enableTrainEmissions){ for (int i = 0; i < (int) emitPairs.size(); i++) for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0; for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0; } } // skip posterior calculations if we just want to do Viterbi alignments if (!enableViterbi){ // do all pairwise alignments for posterior probability matrices for (int a = 0; a < numSeqs-1; a++){ for (int b = a+1; b < numSeqs; b++){ Sequence *seq1 = sequences->GetSequence (a); Sequence *seq2 = sequences->GetSequence (b); // verbose output if (enableVerbose) cerr << "Computing posterior matrix: (" << a+1 << ") " << seq1->GetHeader() << " vs. " << "(" << b+1 << ") " << seq2->GetHeader() << " -- "; // compute forward and backward probabilities VF *forward = model.ComputeForwardMatrix (seq1, seq2); assert (forward); VF *backward = model.ComputeBackwardMatrix (seq1, seq2); assert (backward); // if we are training, then we'll simply want to compute the // expected counts for each region within the matrix separately; // otherwise, we'll need to put all of the regions together and // assemble a posterior probability match matrix // so, if we're training if (enableTraining){ // compute new parameters VF thisInitDistrib (NumMatrixTypes); VF thisGapOpen (2*NumInsertStates); VF thisGapExtend (2*NumInsertStates); VVF thisEmitPairs (256, VF (256, 1e-10)); VF thisEmitSingle (256, 1e-5); model.ComputeNewParameters (seq1, seq2, *forward, *backward, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle, enableTrainEmissions); // add in contribution of the derived parameters for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i]; for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i]; for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i]; if (enableTrainEmissions){ for (int i = 0; i < (int) emitPairs.size(); i++) for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j]; for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i]; } // let us know that we're done. if (enableVerbose) cerr << "done." << endl; } else { // compute posterior probability matrix VF *posterior = model.ComputePosteriorMatrix (seq1, seq2, *forward, *backward); assert (posterior); // compute sparse representations sparseMatrices[a][b] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), *posterior); sparseMatrices[b][a] = NULL; if (!enableAllPairs){ // perform the pairwise sequence alignment pair *, float> alignment = model.ComputeAlignment (seq1->GetLength(), seq2->GetLength(), *posterior); // compute "expected accuracy" distance for evolutionary tree computation float distance = alignment.second / min (seq1->GetLength(), seq2->GetLength()); distances[a][b] = distances[b][a] = distance; if (enableVerbose) cerr << setprecision (10) << distance << endl; delete alignment.first; } else { // let us know that we're done. if (enableVerbose) cerr << "done." << endl; } delete posterior; } delete forward; delete backward; } } } // now average out parameters derived if (enableTraining){ // compute new parameters for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= numSeqs * (numSeqs - 1) / 2; for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= numSeqs * (numSeqs - 1) / 2; for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= numSeqs * (numSeqs - 1) / 2; if (enableTrainEmissions){ for (int i = 0; i < (int) emitPairs.size(); i++) for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= numSeqs * (numSeqs - 1) / 2; for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= numSeqs * (numSeqs - 1) / 2; } } // see if we still want to do some alignments else { if (!enableViterbi){ // perform the consistency transformation the desired number of times for (int r = 0; r < numConsistencyReps; r++){ SafeVector > newSparseMatrices = DoRelaxation (sequences, sparseMatrices); // now replace the old posterior matrices for (int i = 0; i < numSeqs; i++){ for (int j = 0; j < numSeqs; j++){ delete sparseMatrices[i][j]; sparseMatrices[i][j] = newSparseMatrices[i][j]; } } } } MultiSequence *finalAlignment = NULL; if (enableAllPairs){ for (int a = 0; a < numSeqs-1; a++){ for (int b = a+1; b < numSeqs; b++){ Sequence *seq1 = sequences->GetSequence (a); Sequence *seq2 = sequences->GetSequence (b); if (enableVerbose) cerr << "Performing pairwise alignment: (" << a+1 << ") " << seq1->GetHeader() << " vs. " << "(" << b+1 << ") " << seq2->GetHeader() << " -- "; // perform the pairwise sequence alignment pair *, float> alignment; if (enableViterbi) alignment = model.ComputeViterbiAlignment (seq1, seq2); else { // build posterior matrix VF *posterior = sparseMatrices[a][b]->GetPosterior(); assert (posterior); int length = (seq1->GetLength() + 1) * (seq2->GetLength() + 1); for (int i = 0; i < length; i++) (*posterior)[i] -= cutoff; alignment = model.ComputeAlignment (seq1->GetLength(), seq2->GetLength(), *posterior); delete posterior; } // write pairwise alignments string name = seq1->GetHeader() + "-" + seq2->GetHeader() + (enableClustalWOutput ? ".aln" : ".fasta"); ofstream outfile (name.c_str()); MultiSequence *result = new MultiSequence(); result->AddSequence (seq1->AddGaps(alignment.first, 'X')); result->AddSequence (seq2->AddGaps(alignment.first, 'Y')); if (enableClustalWOutput) result->WriteALN (outfile); else result->WriteMFA (outfile); outfile.close(); delete alignment.first; } } } // now if we still need to do a final multiple alignment else { if (enableVerbose) cerr << endl; // compute the evolutionary tree TreeNode *tree = TreeNode::ComputeTree (distances); tree->Print (cerr, sequences); cerr << endl; // make the final alignment finalAlignment = ComputeFinalAlignment (tree, sequences, sparseMatrices, model); // build annotation if (enableAnnotation){ WriteAnnotation (finalAlignment, sparseMatrices); } delete tree; } if (!enableViterbi){ // delete sparse matrices for (int a = 0; a < numSeqs-1; a++){ for (int b = a+1; b < numSeqs; b++){ delete sparseMatrices[a][b]; delete sparseMatrices[b][a]; } } } return finalAlignment; } return NULL; } ///////////////////////////////////////////////////////////////// // GetInteger() // // Attempts to parse an integer from the character string given. // Returns true only if no parsing error occurs. ///////////////////////////////////////////////////////////////// bool GetInteger (char *data, int *val){ char *endPtr; long int retVal; assert (val); errno = 0; retVal = strtol (data, &endPtr, 0); if (retVal == 0 && (errno != 0 || data == endPtr)) return false; if (errno != 0 && (retVal == LONG_MAX || retVal == LONG_MIN)) return false; if (retVal < (long) INT_MIN || retVal > (long) INT_MAX) return false; *val = (int) retVal; return true; } ///////////////////////////////////////////////////////////////// // GetFloat() // // Attempts to parse a float from the character string given. // Returns true only if no parsing error occurs. ///////////////////////////////////////////////////////////////// bool GetFloat (char *data, float *val){ char *endPtr; double retVal; assert (val); errno = 0; retVal = strtod (data, &endPtr); if (retVal == 0 && (errno != 0 || data == endPtr)) return false; if (errno != 0 && (retVal >= 1000000.0 || retVal <= -1000000.0)) return false; *val = (float) retVal; return true; } ///////////////////////////////////////////////////////////////// // ParseParams() // // Parse all command-line options. ///////////////////////////////////////////////////////////////// SafeVector ParseParams (int argc, char **argv){ if (argc < 2){ cerr << "PROBCONS comes with ABSOLUTELY NO WARRANTY. This is free software, and" << endl << "you are welcome to redistribute it under certain conditions. See the" << endl << "file COPYING.txt for details." << endl << endl << "Usage:" << endl << " probcons [OPTION]... [MFAFILE]..." << endl << endl << "Description:" << endl << " Align sequences in MFAFILE(s) and print result to standard output" << endl << endl << " -clustalw" << endl << " use CLUSTALW output format instead of MFA" << endl << endl << " -c, --consistency REPS" << endl << " use " << MIN_CONSISTENCY_REPS << " <= REPS <= " << MAX_CONSISTENCY_REPS << " (default: " << numConsistencyReps << ") passes of consistency transformation" << endl << endl << " -ir, --iterative-refinement REPS" << endl << " use " << MIN_ITERATIVE_REFINEMENT_REPS << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS << " (default: " << numIterativeRefinementReps << ") passes of iterative-refinement" << endl << endl << " -pre, --pre-training REPS" << endl << " use " << MIN_PRETRAINING_REPS << " <= REPS <= " << MAX_PRETRAINING_REPS << " (default: " << numPreTrainingReps << ") rounds of pretraining" << endl << endl << " -pairs" << endl << " generate all-pairs pairwise alignments" << endl << endl << " -viterbi" << endl << " use Viterbi algorithm to generate all pairs (automatically enables -pairs)" << endl << endl << " -v, --verbose" << endl << " report progress while aligning (default: " << (enableVerbose ? "on" : "off") << ")" << endl << endl << " -annot FILENAME" << endl << " write annotation for multiple alignment to FILENAME" << endl << endl << " -t, --train FILENAME" << endl << " compute EM transition probabilities, store in FILENAME (default: " << parametersOutputFilename << ")" << endl << endl << " -e, --emissions" << endl << " also reestimate emission probabilities (default: " << (enableTrainEmissions ? "on" : "off") << ")" << endl << endl << " -p, --paramfile FILENAME" << endl << " read parameters from FILENAME (default: " << parametersInputFilename << ")" << endl << endl << " -a, --alignment-order" << endl << " print sequences in alignment order rather than input order (default: " << (enableAlignOrder ? "on" : "off") << ")" << endl << endl; // << " -go, --gap-open VALUE" << endl // << " gap opening penalty of VALUE <= 0 (default: " << gapOpenPenalty << ")" << endl // << endl // << " -ge, --gap-extension VALUE" << endl // << " gap extension penalty of VALUE <= 0 (default: " << gapContinuePenalty << ")" << endl // << endl // << " -co, --cutoff CUTOFF" << endl // << " subtract 0 <= CUTOFF <= 1 (default: " << cutoff << ") from all posterior values before final alignment" << endl // << endl exit (1); } SafeVector sequenceNames; int tempInt; float tempFloat; for (int i = 1; i < argc; i++){ if (argv[i][0] == '-'){ // training if (!strcmp (argv[i], "-t") || !strcmp (argv[i], "--train")){ enableTraining = true; if (i < argc - 1) parametersOutputFilename = string (argv[++i]); else { cerr << "ERROR: Filename expected for option " << argv[i] << endl; exit (1); } } // emission training else if (!strcmp (argv[i], "-e") || !strcmp (argv[i], "--emissions")){ enableTrainEmissions = true; } // parameter file else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--paramfile")){ if (i < argc - 1) parametersInputFilename = string (argv[++i]); else { cerr << "ERROR: Filename expected for option " << argv[i] << endl; exit (1); } } // number of consistency transformations else if (!strcmp (argv[i], "-c") || !strcmp (argv[i], "--consistency")){ if (i < argc - 1){ if (!GetInteger (argv[++i], &tempInt)){ cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl; exit (1); } else { if (tempInt < MIN_CONSISTENCY_REPS || tempInt > MAX_CONSISTENCY_REPS){ cerr << "ERROR: For option " << argv[i-1] << ", integer must be between " << MIN_CONSISTENCY_REPS << " and " << MAX_CONSISTENCY_REPS << "." << endl; exit (1); } else numConsistencyReps = tempInt; } } else { cerr << "ERROR: Integer expected for option " << argv[i] << endl; exit (1); } } // number of randomized partitioning iterative refinement passes else if (!strcmp (argv[i], "-ir") || !strcmp (argv[i], "--iterative-refinement")){ if (i < argc - 1){ if (!GetInteger (argv[++i], &tempInt)){ cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl; exit (1); } else { if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS || tempInt > MAX_ITERATIVE_REFINEMENT_REPS){ cerr << "ERROR: For option " << argv[i-1] << ", integer must be between " << MIN_ITERATIVE_REFINEMENT_REPS << " and " << MAX_ITERATIVE_REFINEMENT_REPS << "." << endl; exit (1); } else numIterativeRefinementReps = tempInt; } } else { cerr << "ERROR: Integer expected for option " << argv[i] << endl; exit (1); } } // number of EM pre-training rounds else if (!strcmp (argv[i], "-pre") || !strcmp (argv[i], "--pre-training")){ if (i < argc - 1){ if (!GetInteger (argv[++i], &tempInt)){ cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl; exit (1); } else { if (tempInt < MIN_PRETRAINING_REPS || tempInt > MAX_PRETRAINING_REPS){ cerr << "ERROR: For option " << argv[i-1] << ", integer must be between " << MIN_PRETRAINING_REPS << " and " << MAX_PRETRAINING_REPS << "." << endl; exit (1); } else numPreTrainingReps = tempInt; } } else { cerr << "ERROR: Integer expected for option " << argv[i] << endl; exit (1); } } // gap open penalty else if (!strcmp (argv[i], "-go") || !strcmp (argv[i], "--gap-open")){ if (i < argc - 1){ if (!GetFloat (argv[++i], &tempFloat)){ cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl; exit (1); } else { if (tempFloat > 0){ cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl; exit (1); } else gapOpenPenalty = tempFloat; } } else { cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl; exit (1); } } // gap extension penalty else if (!strcmp (argv[i], "-ge") || !strcmp (argv[i], "--gap-extension")){ if (i < argc - 1){ if (!GetFloat (argv[++i], &tempFloat)){ cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl; exit (1); } else { if (tempFloat > 0){ cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl; exit (1); } else gapContinuePenalty = tempFloat; } } else { cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl; exit (1); } } // all-pairs pairwise alignments else if (!strcmp (argv[i], "-pairs")){ enableAllPairs = true; } // all-pairs pairwise Viterbi alignments else if (!strcmp (argv[i], "-viterbi")){ enableAllPairs = true; enableViterbi = true; } // annotation files else if (!strcmp (argv[i], "-annot")){ enableAnnotation = true; if (i < argc - 1) annotationFilename = argv[++i]; else { cerr << "ERROR: FILENAME expected for option " << argv[i] << endl; exit (1); } } // clustalw output format else if (!strcmp (argv[i], "-clustalw")){ enableClustalWOutput = true; } // cutoff else if (!strcmp (argv[i], "-co") || !strcmp (argv[i], "--cutoff")){ if (i < argc - 1){ if (!GetFloat (argv[++i], &tempFloat)){ cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl; exit (1); } else { if (tempFloat < 0 || tempFloat > 1){ cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must be between 0 and 1." << endl; exit (1); } else cutoff = tempFloat; } } else { cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl; exit (1); } } // verbose reporting else if (!strcmp (argv[i], "-v") || !strcmp (argv[i], "--verbose")){ enableVerbose = true; } // alignment order else if (!strcmp (argv[i], "-a") || !strcmp (argv[i], "--alignment-order")){ enableAlignOrder = true; } // bad arguments else { cerr << "ERROR: Unrecognized option: " << argv[i] << endl; exit (1); } } else { sequenceNames.push_back (string (argv[i])); } } if (enableTrainEmissions && !enableTraining){ cerr << "ERROR: Training emissions (-e) requires training (-t)" << endl; exit (1); } return sequenceNames; } ///////////////////////////////////////////////////////////////// // ReadParameters() // // Read initial distribution, transition, and emission // parameters from a file. ///////////////////////////////////////////////////////////////// void ReadParameters (){ ifstream data; emitPairs = VVF (256, VF (256, 1e-10)); emitSingle = VF (256, 1e-5); // read initial state distribution and transition parameters if (parametersInputFilename == string ("")){ if (NumInsertStates == 1){ for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib1Default[i]; for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen1Default[i]; for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend1Default[i]; } else if (NumInsertStates == 2){ for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib2Default[i]; for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen2Default[i]; for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend2Default[i]; } else { cerr << "ERROR: No default initial distribution/parameter settings exist" << endl << " for " << NumInsertStates << " pairs of insert states. Use --paramfile." << endl; exit (1); } alphabet = alphabetDefault; for (int i = 0; i < (int) alphabet.length(); i++){ emitSingle[(unsigned char) tolower(alphabet[i])] = emitSingleDefault[i]; emitSingle[(unsigned char) toupper(alphabet[i])] = emitSingleDefault[i]; for (int j = 0; j <= i; j++){ emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j]; emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j]; emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j]; emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j]; emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j]; emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j]; emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j]; emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j]; } } } else { data.open (parametersInputFilename.c_str()); if (data.fail()){ cerr << "ERROR: Unable to read parameter file: " << parametersInputFilename << endl; exit (1); } string line[3]; for (int i = 0; i < 3; i++){ if (!getline (data, line[i])){ cerr << "ERROR: Unable to read transition parameters from parameter file: " << parametersInputFilename << endl; exit (1); } } istringstream data2; data2.clear(); data2.str (line[0]); for (int i = 0; i < NumMatrixTypes; i++) data2 >> initDistrib[i]; data2.clear(); data2.str (line[1]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapOpen[i]; data2.clear(); data2.str (line[2]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapExtend[i]; if (!getline (data, line[0])){ cerr << "ERROR: Unable to read alphabet from scoring matrix file: " << parametersInputFilename << endl; exit (1); } // read alphabet as concatenation of all characters on alphabet line alphabet = ""; string token; data2.clear(); data2.str (line[0]); while (data2 >> token) alphabet += token; for (int i = 0; i < (int) alphabet.size(); i++){ for (int j = 0; j <= i; j++){ float val; data >> val; emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val; emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val; emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val; emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val; emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val; emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val; emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val; emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val; } } for (int i = 0; i < (int) alphabet.size(); i++){ float val; data >> val; emitSingle[(unsigned char) tolower(alphabet[i])] = val; emitSingle[(unsigned char) toupper(alphabet[i])] = val; } data.close(); } } ///////////////////////////////////////////////////////////////// // ProcessTree() // // Process the tree recursively. Returns the aligned sequences // corresponding to a node or leaf of the tree. ///////////////////////////////////////////////////////////////// MultiSequence *ProcessTree (const TreeNode *tree, MultiSequence *sequences, const SafeVector > &sparseMatrices, const ProbabilisticModel &model){ MultiSequence *result; // check if this is a node of the alignment tree if (tree->GetSequenceLabel() == -1){ MultiSequence *alignLeft = ProcessTree (tree->GetLeftChild(), sequences, sparseMatrices, model); MultiSequence *alignRight = ProcessTree (tree->GetRightChild(), sequences, sparseMatrices, model); assert (alignLeft); assert (alignRight); result = AlignAlignments (alignLeft, alignRight, sparseMatrices, model); assert (result); delete alignLeft; delete alignRight; } // otherwise, this is a leaf of the alignment tree else { result = new MultiSequence(); assert (result); result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone()); } return result; } ///////////////////////////////////////////////////////////////// // ComputeFinalAlignment() // // Compute the final alignment by calling ProcessTree(), then // performing iterative refinement as needed. ///////////////////////////////////////////////////////////////// MultiSequence *ComputeFinalAlignment (const TreeNode *tree, MultiSequence *sequences, const SafeVector > &sparseMatrices, const ProbabilisticModel &model){ MultiSequence *alignment = ProcessTree (tree, sequences, sparseMatrices, model); SafeVector oldOrdering; if (enableAlignOrder){ for (int i = 0; i < alignment->GetNumSequences(); i++) oldOrdering.push_back (alignment->GetSequence(i)->GetSortLabel()); alignment->SaveOrdering(); enableAlignOrder = false; } // tree-based refinement // TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree); // iterative refinement for (int i = 0; i < numIterativeRefinementReps; i++) DoIterativeRefinement (sparseMatrices, model, alignment); cerr << endl; if (oldOrdering.size() > 0){ for (int i = 0; i < (int) oldOrdering.size(); i++){ alignment->GetSequence(i)->SetSortLabel(oldOrdering[i]); } } // return final alignment return alignment; } ///////////////////////////////////////////////////////////////// // AlignAlignments() // // Returns the alignment of two MultiSequence objects. ///////////////////////////////////////////////////////////////// MultiSequence *AlignAlignments (MultiSequence *align1, MultiSequence *align2, const SafeVector > &sparseMatrices, const ProbabilisticModel &model){ // print some info about the alignment if (enableVerbose){ for (int i = 0; i < align1->GetNumSequences(); i++) cerr << ((i==0) ? "[" : ",") << align1->GetSequence(i)->GetLabel(); cerr << "] vs. "; for (int i = 0; i < align2->GetNumSequences(); i++) cerr << ((i==0) ? "[" : ",") << align2->GetSequence(i)->GetLabel(); cerr << "]: "; } VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff); pair *, float> alignment; // choose the alignment routine depending on the "cosmetic" gap penalties used if (gapOpenPenalty == 0 && gapContinuePenalty == 0) alignment = model.ComputeAlignment (align1->GetSequence(0)->GetLength(), align2->GetSequence(0)->GetLength(), *posterior); else alignment = model.ComputeAlignmentWithGapPenalties (align1, align2, *posterior, align1->GetNumSequences(), align2->GetNumSequences(), gapOpenPenalty, gapContinuePenalty); delete posterior; if (enableVerbose){ // compute total length of sequences int totLength = 0; for (int i = 0; i < align1->GetNumSequences(); i++) for (int j = 0; j < align2->GetNumSequences(); j++) totLength += min (align1->GetSequence(i)->GetLength(), align2->GetSequence(j)->GetLength()); // give an "accuracy" measure for the alignment cerr << alignment.second / totLength << endl; } // now build final alignment MultiSequence *result = new MultiSequence(); for (int i = 0; i < align1->GetNumSequences(); i++) result->AddSequence (align1->GetSequence(i)->AddGaps(alignment.first, 'X')); for (int i = 0; i < align2->GetNumSequences(); i++) result->AddSequence (align2->GetSequence(i)->AddGaps(alignment.first, 'Y')); if (!enableAlignOrder) result->SortByLabel(); // free temporary alignment delete alignment.first; return result; } ///////////////////////////////////////////////////////////////// // DoRelaxation() // // Performs one round of the consistency transformation. The // formula used is: // 1 // P'(x[i]-y[j]) = --- sum sum P(x[i]-z[k]) P(z[k]-y[j]) // |S| z in S k // // where S = {x, y, all other sequences...} // ///////////////////////////////////////////////////////////////// SafeVector > DoRelaxation (MultiSequence *sequences, SafeVector > &sparseMatrices){ const int numSeqs = sequences->GetNumSequences(); SafeVector > newSparseMatrices (numSeqs, SafeVector(numSeqs, NULL)); // for every pair of sequences for (int i = 0; i < numSeqs; i++){ for (int j = i+1; j < numSeqs; j++){ Sequence *seq1 = sequences->GetSequence (i); Sequence *seq2 = sequences->GetSequence (j); if (enableVerbose) cerr << "Relaxing (" << i+1 << ") " << seq1->GetHeader() << " vs. " << "(" << j+1 << ") " << seq2->GetHeader() << ": "; // get the original posterior matrix VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior(); assert (posteriorPtr); VF &posterior = *posteriorPtr; const int seq1Length = seq1->GetLength(); const int seq2Length = seq2->GetLength(); // contribution from the summation where z = x and z = y for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] += posterior[k]; if (enableVerbose) cerr << sparseMatrices[i][j]->GetNumCells() << " --> "; // contribution from all other sequences for (int k = 0; k < numSeqs; k++) if (k != i && k != j){ if (k < i) Relax1 (sparseMatrices[k][i], sparseMatrices[k][j], posterior); else if (k > i && k < j) Relax (sparseMatrices[i][k], sparseMatrices[k][j], posterior); else { SparseMatrix *temp = sparseMatrices[j][k]->ComputeTranspose(); Relax (sparseMatrices[i][k], temp, posterior); delete temp; } } // now renormalization for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] /= numSeqs; // mask out positions not originally in the posterior matrix SparseMatrix *matXY = sparseMatrices[i][j]; for (int y = 0; y <= seq2Length; y++) posterior[y] = 0; for (int x = 1; x <= seq1Length; x++){ SafeVector::iterator XYptr = matXY->GetRowPtr(x); SafeVector::iterator XYend = XYptr + matXY->GetRowSize(x); VF::iterator base = posterior.begin() + x * (seq2Length + 1); int curr = 0; while (XYptr != XYend){ // zero out all cells until the first filled column while (curr < XYptr->first){ base[curr] = 0; curr++; } // now, skip over this column curr++; ++XYptr; } // zero out cells after last column while (curr <= seq2Length){ base[curr] = 0; curr++; } } // save the new posterior matrix newSparseMatrices[i][j] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), posterior); newSparseMatrices[j][i] = NULL; if (enableVerbose) cerr << newSparseMatrices[i][j]->GetNumCells() << " -- "; delete posteriorPtr; if (enableVerbose) cerr << "done." << endl; } } return newSparseMatrices; } ///////////////////////////////////////////////////////////////// // Relax() // // Computes the consistency transformation for a single sequence // z, and adds the transformed matrix to "posterior". ///////////////////////////////////////////////////////////////// void Relax (SparseMatrix *matXZ, SparseMatrix *matZY, VF &posterior){ assert (matXZ); assert (matZY); int lengthX = matXZ->GetSeq1Length(); int lengthY = matZY->GetSeq2Length(); assert (matXZ->GetSeq2Length() == matZY->GetSeq1Length()); // for every x[i] for (int i = 1; i <= lengthX; i++){ SafeVector::iterator XZptr = matXZ->GetRowPtr(i); SafeVector::iterator XZend = XZptr + matXZ->GetRowSize(i); VF::iterator base = posterior.begin() + i * (lengthY + 1); // iterate through all x[i]-z[k] while (XZptr != XZend){ SafeVector::iterator ZYptr = matZY->GetRowPtr(XZptr->first); SafeVector::iterator ZYend = ZYptr + matZY->GetRowSize(XZptr->first); const float XZval = XZptr->second; // iterate through all z[k]-y[j] while (ZYptr != ZYend){ base[ZYptr->first] += XZval * ZYptr->second; ZYptr++; } XZptr++; } } } ///////////////////////////////////////////////////////////////// // Relax1() // // Computes the consistency transformation for a single sequence // z, and adds the transformed matrix to "posterior". ///////////////////////////////////////////////////////////////// void Relax1 (SparseMatrix *matZX, SparseMatrix *matZY, VF &posterior){ assert (matZX); assert (matZY); int lengthZ = matZX->GetSeq1Length(); int lengthY = matZY->GetSeq2Length(); // for every z[k] for (int k = 1; k <= lengthZ; k++){ SafeVector::iterator ZXptr = matZX->GetRowPtr(k); SafeVector::iterator ZXend = ZXptr + matZX->GetRowSize(k); // iterate through all z[k]-x[i] while (ZXptr != ZXend){ SafeVector::iterator ZYptr = matZY->GetRowPtr(k); SafeVector::iterator ZYend = ZYptr + matZY->GetRowSize(k); const float ZXval = ZXptr->second; VF::iterator base = posterior.begin() + ZXptr->first * (lengthY + 1); // iterate through all z[k]-y[j] while (ZYptr != ZYend){ base[ZYptr->first] += ZXval * ZYptr->second; ZYptr++; } ZXptr++; } } } ///////////////////////////////////////////////////////////////// // GetSubtree // // Returns set containing all leaf labels of the current subtree. ///////////////////////////////////////////////////////////////// set GetSubtree (const TreeNode *tree){ set s; if (tree->GetSequenceLabel() == -1){ s = GetSubtree (tree->GetLeftChild()); set t = GetSubtree (tree->GetRightChild()); for (set::iterator iter = t.begin(); iter != t.end(); ++iter) s.insert (*iter); } else { s.insert (tree->GetSequenceLabel()); } return s; } ///////////////////////////////////////////////////////////////// // TreeBasedBiPartitioning // // Uses the iterative refinement scheme from MUSCLE. ///////////////////////////////////////////////////////////////// void TreeBasedBiPartitioning (const SafeVector > &sparseMatrices, const ProbabilisticModel &model, MultiSequence* &alignment, const TreeNode *tree){ // check if this is a node of the alignment tree if (tree->GetSequenceLabel() == -1){ TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetLeftChild()); TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetRightChild()); set leftSubtree = GetSubtree (tree->GetLeftChild()); set rightSubtree = GetSubtree (tree->GetRightChild()); set leftSubtreeComplement, rightSubtreeComplement; // calculate complement of each subtree for (int i = 0; i < alignment->GetNumSequences(); i++){ if (leftSubtree.find(i) == leftSubtree.end()) leftSubtreeComplement.insert (i); if (rightSubtree.find(i) == rightSubtree.end()) rightSubtreeComplement.insert (i); } // perform realignments for edge to left child if (!leftSubtree.empty() && !leftSubtreeComplement.empty()){ MultiSequence *groupOneSeqs = alignment->Project (leftSubtree); assert (groupOneSeqs); MultiSequence *groupTwoSeqs = alignment->Project (leftSubtreeComplement); assert (groupTwoSeqs); delete alignment; alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model); } // perform realignments for edge to right child if (!rightSubtree.empty() && !rightSubtreeComplement.empty()){ MultiSequence *groupOneSeqs = alignment->Project (rightSubtree); assert (groupOneSeqs); MultiSequence *groupTwoSeqs = alignment->Project (rightSubtreeComplement); assert (groupTwoSeqs); delete alignment; alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model); } } } ///////////////////////////////////////////////////////////////// // DoIterativeRefinement() // // Performs a single round of randomized partionining iterative // refinement. ///////////////////////////////////////////////////////////////// void DoIterativeRefinement (const SafeVector > &sparseMatrices, const ProbabilisticModel &model, MultiSequence* &alignment){ set groupOne, groupTwo; // create two separate groups for (int i = 0; i < alignment->GetNumSequences(); i++){ if (rand() % 2) groupOne.insert (i); else groupTwo.insert (i); } if (groupOne.empty() || groupTwo.empty()) return; // project into the two groups MultiSequence *groupOneSeqs = alignment->Project (groupOne); assert (groupOneSeqs); MultiSequence *groupTwoSeqs = alignment->Project (groupTwo); assert (groupTwoSeqs); delete alignment; // realign alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model); delete groupOneSeqs; delete groupTwoSeqs; } ///////////////////////////////////////////////////////////////// // WriteAnnotation() // // Computes annotation for multiple alignment and write values // to a file. ///////////////////////////////////////////////////////////////// void WriteAnnotation (MultiSequence *alignment, const SafeVector > &sparseMatrices){ ofstream outfile (annotationFilename.c_str()); if (outfile.fail()){ cerr << "ERROR: Unable to write annotation file." << endl; exit (1); } const int alignLength = alignment->GetSequence(0)->GetLength(); const int numSeqs = alignment->GetNumSequences(); SafeVector position (numSeqs, 0); SafeVector::iterator> seqs (numSeqs); for (int i = 0; i < numSeqs; i++) seqs[i] = alignment->GetSequence(i)->GetDataPtr(); SafeVector > active; active.reserve (numSeqs); SafeVector lab; for (int i = 0; i < numSeqs; i++) lab.push_back(alignment->GetSequence(i)->GetSortLabel()); // for every column for (int i = 1; i <= alignLength; i++){ // find all aligned residues in this particular column active.clear(); for (int j = 0; j < numSeqs; j++){ if (seqs[j][i] != '-'){ active.push_back (make_pair(lab[j], ++position[j])); } } sort (active.begin(), active.end()); outfile << setw(4) << ComputeScore (active, sparseMatrices) << endl; } outfile.close(); } ///////////////////////////////////////////////////////////////// // ComputeScore() // // Computes the annotation score for a particular column. ///////////////////////////////////////////////////////////////// int ComputeScore (const SafeVector > &active, const SafeVector > &sparseMatrices){ if (active.size() <= 1) return 0; // ALTERNATIVE #1: Compute the average alignment score. float val = 0; for (int i = 0; i < (int) active.size(); i++){ for (int j = i+1; j < (int) active.size(); j++){ val += sparseMatrices[active[i].first][active[j].first]->GetValue(active[i].second, active[j].second); } } return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1))); } probcons/Makefile0000644000175000017500000000417710604510513014545 0ustar chuongdochuongdo################################################################################ # Makefile for probcons ################################################################################ ################################################################################ # 1) Choose C++ compiler. ################################################################################ CXX = g++ ################################################################################ # 2) Set C++ flags. # a) DEBUG mode -- no optimizations, enable SafeVector checking, no inlining # b) PROFILE mode -- for gprof # c) RELEASE mode ################################################################################ OTHERFLAGS = -DNumInsertStates=2 -DVERSION="1.12" # debug mode #CXXFLAGS = -g -W -Wall -pedantic -DENABLE_CHECKS -fno-inline $(OTHERFLAGS) # profile mode #CXXFLAGS = -pg -W -Wall -pedantic $(OTHERFLAGS) # release mode #CXXFLAGS = -O3 -W -Wall -pedantic -DNDEBUG $(OTHERFLAGS) -mmmx -msse -msse2 -mfpmath=sse -march=pentium4 -mcpu=pentium4 -funroll-loops -fomit-frame-pointer CXXFLAGS = -O3 -W -Wall -pedantic -DNDEBUG $(OTHERFLAGS) -funroll-loops ################################################################################ # 3) Dependencies ################################################################################ TARGETS = probcons compare project makegnuplot .PHONY : all all : $(TARGETS) probcons : MultiSequence.h ProbabilisticModel.h ScoreType.h Sequence.h FileBuffer.h SparseMatrix.h EvolutionaryTree.h Defaults.h SafeVector.h Main.cc $(CXX) $(CXXFLAGS) -lm -o probcons Main.cc compare : MultiSequence.h Sequence.h FileBuffer.h SafeVector.h CompareToRef.cc $(CXX) $(CXXFLAGS) -o compare CompareToRef.cc fixref : MultiSequence.h ProbabilisticModel.h ScoreType.h Sequence.h FileBuffer.h SparseMatrix.h EvolutionaryTree.h Defaults.h SafeVector.h FixRef.cc $(CXX) $(CXXFLAGS) -o fixref FixRef.cc project : MultiSequence.h Sequence.h SafeVector.h ProjectPairwise.cc $(CXX) $(CXXFLAGS) -o project ProjectPairwise.cc makegnuplot : MakeGnuPlot.cc $(CXX) $(CXXFLAGS) -o makegnuplot MakeGnuPlot.cc .PHONY : clean clean: rm -f $(TARGETS) probcons/MakeGnuPlot.cc0000644000175000017500000000306210604510337015576 0ustar chuongdochuongdo///////////////////////////////////////////////////////////////// // MakeGnuPlot.cc ///////////////////////////////////////////////////////////////// #include #include #include using namespace std; int main (int argc, char **argv){ if (argc == 1 || argc > 3){ cerr << "Usage: makegnuplot annotscores [refscores]" << endl; exit (1); } ifstream data (argv[1]); if (data.fail()){ cerr << "ERROR: Could not open file " << argv[1] << endl; exit (1); } int x, ct = 0; while (data >> x) ct++; data.close(); ofstream out ("temporary_gnuplot_script"); if (out.fail()){ cerr << "ERROR: Could not create temporary file." << endl; exit (1); } out << "set title \"Column Reliability Scores\"" << endl << "set xlabel \"Alignment Position\"" << endl << "set ylabel \"Column Reliability\"" << endl << "set xr [1:" << ct << "]" << endl << "set term postscript enhanced color" << endl << "set output \"reliability.ps\"" << endl; if (argc == 3){ out << "set style fill solid 0.5 noborder" << endl << "plot \"" << argv[2] << "\" title \"actual\" with boxes lt 2, \\" << endl << " \"" << argv[1] << "\" title \"predicted\" with histeps lt 1 lw 3" << endl; } else out << "plot \"" << argv[1] << "\" title \"predicted\" with histeps lt 1 lw 3" << endl; out.close(); if (system ("gnuplot temporary_gnuplot_script") == -1){ cerr << "ERROR: Could not run Gnuplot correctly." << endl; exit (1); } //system ("rm temporary_gnuplot_script"); } probcons/MultiSequence.h0000644000175000017500000005107510115263352016043 0ustar chuongdochuongdo//////////////////////////////////////////////////////////////// // MultiSequence.h // // Utilities for reading/writing multiple sequence data. ///////////////////////////////////////////////////////////////// #ifndef MULTISEQUENCE_H #define MULTISEQUENCE_H #include #include #include #include #include #include #include #include "SafeVector.h" #include "Sequence.h" #include "FileBuffer.h" ///////////////////////////////////////////////////////////////// // MultiSequence // // Class for multiple sequence alignment input/output. ///////////////////////////////////////////////////////////////// class MultiSequence { SafeVector *sequences; public: ///////////////////////////////////////////////////////////////// // MultiSequence::MultiSequence() // // Default constructor. ///////////////////////////////////////////////////////////////// MultiSequence () : sequences (NULL) {} ///////////////////////////////////////////////////////////////// // MultiSequence::MultiSequence() // // Constructor. Load MFA from a FileBuffer object. ///////////////////////////////////////////////////////////////// MultiSequence (FileBuffer &infile) : sequences (NULL) { LoadMFA (infile); } ///////////////////////////////////////////////////////////////// // MultiSequence::MultiSequence() // // Constructor. Load MFA from a filename. ///////////////////////////////////////////////////////////////// MultiSequence (const string &filename) : sequences (NULL){ LoadMFA (filename); } ///////////////////////////////////////////////////////////////// // MultiSequence::~MultiSequence() // // Destructor. Gets rid of sequence objects contained in the // multiple alignment. ///////////////////////////////////////////////////////////////// ~MultiSequence(){ // if sequences allocated if (sequences){ // free all sequences for (SafeVector::iterator iter = sequences->begin(); iter != sequences->end(); ++iter){ assert (*iter); delete *iter; *iter = NULL; } // free sequence vector delete sequences; sequences = NULL; } } ///////////////////////////////////////////////////////////////// // MultiSequence::LoadMFA() // // Load MFA from a filename. ///////////////////////////////////////////////////////////////// void LoadMFA (const string &filename, bool stripGaps = false){ // try opening file FileBuffer infile (filename.c_str()); if (infile.fail()){ cerr << "ERROR: Could not open file '" << filename << "' for reading." << endl; exit (1); } // if successful, then load using other LoadMFA() routine LoadMFA (infile, stripGaps); infile.close(); } ///////////////////////////////////////////////////////////////// // MultiSequence::LoadMFA() // // Load MSF from a FileBuffer object. ///////////////////////////////////////////////////////////////// void ParseMSF (FileBuffer &infile, string header, bool stripGaps = false){ SafeVector *> seqData; SafeVector seqNames; SafeVector seqLengths; istringstream in; bool valid = true; bool missingHeader = false; bool clustalW = false; // read until data starts while (!infile.eof() && header.find ("..", 0) == string::npos){ if (header.find ("CLUSTAL", 0) == 0 || header.find ("PROBCONS", 0) == 0){ clustalW = true; break; } infile.GetLine (header); if (header.find ("//", 0) != string::npos){ missingHeader = true; break; } } // read until end-of-file while (valid){ infile.GetLine (header); if (infile.eof()) break; string word; in.clear(); in.str(header); // check if there's anything on this line if (in >> word){ // clustalw name parsing if (clustalW){ if (!isspace(header[0]) && find (seqNames.begin(), seqNames.end(), word) == seqNames.end()){ seqNames.push_back (word); seqData.push_back (new SafeVector()); seqLengths.push_back (0); seqData[(int) seqData.size() - 1]->push_back ('@'); } } // look for new sequence label if (word == string ("Name:")){ if (in >> word){ seqNames.push_back (word); seqData.push_back (new SafeVector()); seqLengths.push_back (0); seqData[(int) seqData.size() - 1]->push_back ('@'); } else valid = false; } // check if this is sequence data else if (find (seqNames.begin(), seqNames.end(), word) != seqNames.end()){ int index = find (seqNames.begin(), seqNames.end(), word) - seqNames.begin(); // read all remaining characters on the line char ch; while (in >> ch){ if (isspace (ch)) continue; if (ch >= 'a' && ch <= 'z') ch = ch - 'a' + 'A'; if (ch == '.') ch = '-'; if (stripGaps && ch == '-') continue; if (!((ch >= 'A' && ch <= 'Z') || ch == '*' || ch == '-')){ cerr << "ERROR: Unknown character encountered: " << ch << endl; exit (1); } // everything's ok so far, so just store this character. seqData[index]->push_back (ch); seqLengths[index]++; } } else if (missingHeader){ seqNames.push_back (word); seqData.push_back (new SafeVector()); seqLengths.push_back (0); seqData[(int) seqData.size() - 1]->push_back ('@'); int index = (int) seqNames.size() - 1; // read all remaining characters on the line char ch; while (in >> ch){ if (isspace (ch)) continue; if (ch >= 'a' && ch <= 'z') ch = ch - 'a' + 'A'; if (ch == '.') ch = '-'; if (stripGaps && ch == '-') continue; if (!((ch >= 'A' && ch <= 'Z') || ch == '*' || ch == '-')){ cerr << "ERROR: Unknown character encountered: " << ch << endl; exit (1); } // everything's ok so far, so just store this character. seqData[index]->push_back (ch); seqLengths[index]++; } } } } // check for errors if (seqNames.size() == 0){ cerr << "ERROR: No sequences read!" << endl; exit (1); } assert (!sequences); sequences = new SafeVector; for (int i = 0; i < (int) seqNames.size(); i++){ if (seqLengths[i] == 0){ cerr << "ERROR: Sequence of zero length!" << endl; exit (1); } Sequence *seq = new Sequence (seqData[i], seqNames[i], seqLengths[i], i, i); sequences->push_back (seq); } } ///////////////////////////////////////////////////////////////// // MultiSequence::LoadMFA() // // Load MFA from a FileBuffer object. ///////////////////////////////////////////////////////////////// void LoadMFA (FileBuffer &infile, bool stripGaps = false){ // check to make sure that file reading is ok if (infile.fail()){ cerr << "ERROR: Error reading file." << endl; exit (1); } // read all sequences while (true){ // get the sequence label as being the current # of sequences // NOTE: sequence labels here are zero-based int index = (!sequences) ? 0 : sequences->size(); // read the sequence Sequence *seq = new Sequence (infile, stripGaps); if (seq->Fail()){ // check if alternative file format (i.e. not MFA) if (index == 0){ string header = seq->GetHeader(); if (header.length() > 0 && header[0] != '>'){ // try MSF format ParseMSF (infile, header); break; } } delete seq; break; } seq->SetLabel (index); // add the sequence to the list of current sequences if (!sequences) sequences = new SafeVector; sequences->push_back (seq); } // make sure at least one sequence was read if (!sequences){ cerr << "ERROR: No sequences read." << endl; exit (1); } } ///////////////////////////////////////////////////////////////// // MultiSequence::AddSequence() // // Add another sequence to an existing sequence list ///////////////////////////////////////////////////////////////// void AddSequence (Sequence *sequence){ assert (sequence); assert (!sequence->Fail()); // add sequence if (!sequences) sequences = new SafeVector; sequences->push_back (sequence); } ///////////////////////////////////////////////////////////////// // MultiSequence::RemoveSequence() // // Remove a sequence from the MultiSequence ///////////////////////////////////////////////////////////////// void RemoveSequence (int index){ assert (sequences); assert (index >= 0 && index < (int) sequences->size()); delete (*sequences)[index]; sequences->erase (sequences->begin() + index); } ///////////////////////////////////////////////////////////////// // MultiSequence::WriteMFA() // // Write MFA to the outfile. Allows the user to specify the // number of columns for the output. Also, useIndices determines // whether or not the actual sequence comments will be printed // out or whether the artificially assigned sequence labels will // be used instead. ///////////////////////////////////////////////////////////////// void WriteMFA (ostream &outfile, int numColumns = 60, bool useIndices = false){ if (!sequences) return; // loop through all sequences and write them out for (SafeVector::iterator iter = sequences->begin(); iter != sequences->end(); ++iter){ (*iter)->WriteMFA (outfile, numColumns, useIndices); } } ///////////////////////////////////////////////////////////////// // MultiSequence::GetAnnotationChar() // // Return CLUSTALW annotation for column. ///////////////////////////////////////////////////////////////// char GetAnnotationChar (SafeVector &column){ SafeVector counts (256, 0); int allChars = (int) column.size(); for (int i = 0; i < allChars; i++){ counts[(unsigned char) toupper(column[i])]++; } allChars -= counts[(unsigned char) '-']; if (allChars == 1) return ' '; for (int i = 0; i < 256; i++) if ((char) i != '-' && counts[i] == allChars) return '*'; if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'T'] + counts[(unsigned char) 'A'] == allChars) return ':'; if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'E'] + counts[(unsigned char) 'Q'] + counts[(unsigned char) 'K'] == allChars) return ':'; if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'H'] + counts[(unsigned char) 'Q'] + counts[(unsigned char) 'K'] == allChars) return ':'; if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'D'] + counts[(unsigned char) 'E'] + counts[(unsigned char) 'Q'] == allChars) return ':'; if (counts[(unsigned char) 'Q'] + counts[(unsigned char) 'H'] + counts[(unsigned char) 'R'] + counts[(unsigned char) 'K'] == allChars) return ':'; if (counts[(unsigned char) 'M'] + counts[(unsigned char) 'I'] + counts[(unsigned char) 'L'] + counts[(unsigned char) 'V'] == allChars) return ':'; if (counts[(unsigned char) 'M'] + counts[(unsigned char) 'I'] + counts[(unsigned char) 'L'] + counts[(unsigned char) 'F'] == allChars) return ':'; if (counts[(unsigned char) 'H'] + counts[(unsigned char) 'Y'] == allChars) return ':'; if (counts[(unsigned char) 'F'] + counts[(unsigned char) 'Y'] + counts[(unsigned char) 'W'] == allChars) return ':'; if (counts[(unsigned char) 'C'] + counts[(unsigned char) 'S'] + counts[(unsigned char) 'A'] == allChars) return '.'; if (counts[(unsigned char) 'A'] + counts[(unsigned char) 'T'] + counts[(unsigned char) 'V'] == allChars) return '.'; if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'A'] + counts[(unsigned char) 'G'] == allChars) return '.'; if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'T'] + counts[(unsigned char) 'N'] + counts[(unsigned char) 'K'] == allChars) return '.'; if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'T'] + counts[(unsigned char) 'P'] + counts[(unsigned char) 'A'] == allChars) return '.'; if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'G'] + counts[(unsigned char) 'N'] + counts[(unsigned char) 'D'] == allChars) return '.'; if (counts[(unsigned char) 'S'] + counts[(unsigned char) 'N'] + counts[(unsigned char) 'D'] + counts[(unsigned char) 'E'] + counts[(unsigned char) 'Q'] + counts[(unsigned char) 'K'] == allChars) return '.'; if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'D'] + counts[(unsigned char) 'E'] + counts[(unsigned char) 'Q'] + counts[(unsigned char) 'H'] + counts[(unsigned char) 'K'] == allChars) return '.'; if (counts[(unsigned char) 'N'] + counts[(unsigned char) 'E'] + counts[(unsigned char) 'H'] + counts[(unsigned char) 'Q'] + counts[(unsigned char) 'R'] + counts[(unsigned char) 'K'] == allChars) return '.'; if (counts[(unsigned char) 'F'] + counts[(unsigned char) 'V'] + counts[(unsigned char) 'L'] + counts[(unsigned char) 'I'] + counts[(unsigned char) 'M'] == allChars) return '.'; if (counts[(unsigned char) 'H'] + counts[(unsigned char) 'F'] + counts[(unsigned char) 'Y'] == allChars) return '.'; return ' '; } ///////////////////////////////////////////////////////////////// // MultiSequence::WriteALN() // // Write ALN to the outfile. Allows the user to specify the // number of columns for the output. ///////////////////////////////////////////////////////////////// void WriteALN (ostream &outfile, int numColumns = 60){ if (!sequences) return; outfile << "PROBCONS version " << VERSION << " multiple sequence alignment" << endl; int longestComment = 0; SafeVector::iterator> ptrs (GetNumSequences()); SafeVector lengths (GetNumSequences()); for (int i = 0; i < GetNumSequences(); i++){ ptrs[i] = GetSequence (i)->GetDataPtr(); lengths[i] = GetSequence (i)->GetLength(); longestComment = max (longestComment, (int) GetSequence(i)->GetName().length()); } longestComment += 4; int writtenChars = 0; bool allDone = false; while (!allDone){ outfile << endl; allDone = true; // loop through all sequences and write them out for (int i = 0; i < GetNumSequences(); i++){ if (writtenChars < lengths[i]){ outfile << GetSequence(i)->GetName(); for (int j = 0; j < longestComment - (int) GetSequence(i)->GetName().length(); j++) outfile << ' '; for (int j = 0; j < numColumns; j++){ if (writtenChars + j < lengths[i]) outfile << ptrs[i][writtenChars + j + 1]; else break; } outfile << endl; if (writtenChars + numColumns < lengths[i]) allDone = false; } } // write annotation line for (int j = 0; j < longestComment; j++) outfile << ' '; for (int j = 0; j < numColumns; j++){ SafeVector column; for (int i = 0; i < GetNumSequences(); i++) if (writtenChars + j < lengths[i]) column.push_back (ptrs[i][writtenChars + j + 1]); if (column.size() > 0) outfile << GetAnnotationChar (column); } outfile << endl; writtenChars += numColumns; } } ///////////////////////////////////////////////////////////////// // MultiSequence::GetSequence() // // Retrieve a sequence from the MultiSequence object. ///////////////////////////////////////////////////////////////// Sequence* GetSequence (int i){ assert (sequences); assert (0 <= i && i < (int) sequences->size()); return (*sequences)[i]; } ///////////////////////////////////////////////////////////////// // MultiSequence::GetSequence() // // Retrieve a sequence from the MultiSequence object // (const version). ///////////////////////////////////////////////////////////////// const Sequence* GetSequence (int i) const { assert (sequences); assert (0 <= i && i < (int) sequences->size()); return (*sequences)[i]; } ///////////////////////////////////////////////////////////////// // MultiSequence::GetNumSequences() // // Returns the number of sequences in the MultiSequence. ///////////////////////////////////////////////////////////////// int GetNumSequences () const { if (!sequences) return 0; return (int) sequences->size(); } ///////////////////////////////////////////////////////////////// // MultiSequence::SortByHeader() // // Organizes the sequences according to their sequence headers // in ascending order. ///////////////////////////////////////////////////////////////// void SortByHeader () { assert (sequences); // a quick and easy O(n^2) sort for (int i = 0; i < (int) sequences->size()-1; i++){ for (int j = i+1; j < (int) sequences->size(); j++){ if ((*sequences)[i]->GetHeader() > (*sequences)[j]->GetHeader()) swap ((*sequences)[i], (*sequences)[j]); } } } ///////////////////////////////////////////////////////////////// // MultiSequence::SortByLabel() // // Organizes the sequences according to their sequence labels // in ascending order. ///////////////////////////////////////////////////////////////// void SortByLabel () { assert (sequences); // a quick and easy O(n^2) sort for (int i = 0; i < (int) sequences->size()-1; i++){ for (int j = i+1; j < (int) sequences->size(); j++){ if ((*sequences)[i]->GetSortLabel() > (*sequences)[j]->GetSortLabel()) swap ((*sequences)[i], (*sequences)[j]); } } } ///////////////////////////////////////////////////////////////// // MultiSequence::SaveOrdering() // // Relabels sequences so as to preserve the current ordering. ///////////////////////////////////////////////////////////////// void SaveOrdering () { assert (sequences); for (int i = 0; i < (int) sequences->size(); i++) (*sequences)[i]->SetSortLabel (i); } ///////////////////////////////////////////////////////////////// // MultiSequence::Project() // // Given a set of indices, extract all sequences from the current // MultiSequence object whose index is included in the set. // Then, project the multiple alignments down to the desired // subset, and return the projection as a new MultiSequence // object. ///////////////////////////////////////////////////////////////// MultiSequence *Project (const set &indices){ SafeVector::iterator> oldPtrs (indices.size()); SafeVector *> newPtrs (indices.size()); assert (indices.size() != 0); // grab old data int i = 0; for (set::const_iterator iter = indices.begin(); iter != indices.end(); ++iter){ oldPtrs[i++] = GetSequence (*iter)->GetDataPtr(); } // compute new length int oldLength = GetSequence (*indices.begin())->GetLength(); int newLength = 0; for (i = 1; i <= oldLength; i++){ // check to see if there is a gap in every sequence of the set bool found = false; for (int j = 0; !found && j < (int) indices.size(); j++) found = (oldPtrs[j][i] != '-'); // if not, then this column counts towards the sequence length if (found) newLength++; } // build new alignments for (i = 0; i < (int) indices.size(); i++){ newPtrs[i] = new SafeVector(); assert (newPtrs[i]); newPtrs[i]->push_back ('@'); } // add all needed columns for (i = 1; i <= oldLength; i++){ // make sure column is not gapped in all sequences in the set bool found = false; for (int j = 0; !found && j < (int) indices.size(); j++) found = (oldPtrs[j][i] != '-'); // if not, then add it if (found){ for (int j = 0; j < (int) indices.size(); j++) newPtrs[j]->push_back (oldPtrs[j][i]); } } // wrap sequences in MultiSequence object MultiSequence *ret = new MultiSequence(); i = 0; for (set::const_iterator iter = indices.begin(); iter != indices.end(); ++iter){ ret->AddSequence (new Sequence (newPtrs[i++], GetSequence (*iter)->GetHeader(), newLength, GetSequence (*iter)->GetSortLabel(), GetSequence (*iter)->GetLabel())); } return ret; } }; #endif probcons/ProbabilisticModel.h0000644000175000017500000011636710216071454017037 0ustar chuongdochuongdo///////////////////////////////////////////////////////////////// // ProbabilisticModel.h // // Routines for (1) posterior probability computations // (2) chained anchoring // (3) maximum weight trace alignment ///////////////////////////////////////////////////////////////// #ifndef PROBABILISTICMODEL_H #define PROBABILISTICMODEL_H #include #include #include #include "SafeVector.h" #include "ScoreType.h" #include "SparseMatrix.h" #include "MultiSequence.h" using namespace std; const int NumMatchStates = 1; // note that in this version the number // of match states is fixed at 1...will // change in future versions const int NumMatrixTypes = NumMatchStates + NumInsertStates * 2; ///////////////////////////////////////////////////////////////// // ProbabilisticModel // // Class for storing the parameters of a probabilistic model and // performing different computations based on those parameters. // In particular, this class handles the computation of // posterior probabilities that may be used in alignment. ///////////////////////////////////////////////////////////////// class ProbabilisticModel { float initialDistribution[NumMatrixTypes]; // holds the initial probabilities for each state float transProb[NumMatrixTypes][NumMatrixTypes]; // holds all state-to-state transition probabilities float matchProb[256][256]; // emission probabilities for match states float insProb[256][NumMatrixTypes]; // emission probabilities for insert states public: ///////////////////////////////////////////////////////////////// // ProbabilisticModel::ProbabilisticModel() // // Constructor. Builds a new probabilistic model using the // given parameters. ///////////////////////////////////////////////////////////////// ProbabilisticModel (const VF &initDistribMat, const VF &gapOpen, const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle){ // build transition matrix VVF transMat (NumMatrixTypes, VF (NumMatrixTypes, 0.0f)); transMat[0][0] = 1; for (int i = 0; i < NumInsertStates; i++){ transMat[0][2*i+1] = gapOpen[2*i]; transMat[0][2*i+2] = gapOpen[2*i+1]; transMat[0][0] -= (gapOpen[2*i] + gapOpen[2*i+1]); assert (transMat[0][0] > 0); transMat[2*i+1][2*i+1] = gapExtend[2*i]; transMat[2*i+2][2*i+2] = gapExtend[2*i+1]; transMat[2*i+1][2*i+2] = 0; transMat[2*i+2][2*i+1] = 0; transMat[2*i+1][0] = 1 - gapExtend[2*i]; transMat[2*i+2][0] = 1 - gapExtend[2*i+1]; } // create initial and transition probability matrices for (int i = 0; i < NumMatrixTypes; i++){ initialDistribution[i] = LOG (initDistribMat[i]); for (int j = 0; j < NumMatrixTypes; j++) transProb[i][j] = LOG (transMat[i][j]); } // create insertion and match probability matrices for (int i = 0; i < 256; i++){ for (int j = 0; j < NumMatrixTypes; j++) insProb[i][j] = LOG (emitSingle[i]); for (int j = 0; j < 256; j++) matchProb[i][j] = LOG (emitPairs[i][j]); } } ///////////////////////////////////////////////////////////////// // ProbabilisticModel::ComputeForwardMatrix() // // Computes a set of forward probability matrices for aligning // seq1 and seq2. // // For efficiency reasons, a single-dimensional floating-point // array is used here, with the following indexing scheme: // // forward[i + NumMatrixTypes * (j * (seq2Length+1) + k)] // refers to the probability of aligning through j characters // of the first sequence, k characters of the second sequence, // and ending in state i. ///////////////////////////////////////////////////////////////// VF *ComputeForwardMatrix (Sequence *seq1, Sequence *seq2) const { assert (seq1); assert (seq2); const int seq1Length = seq1->GetLength(); const int seq2Length = seq2->GetLength(); // retrieve the points to the beginning of each sequence SafeVector::iterator iter1 = seq1->GetDataPtr(); SafeVector::iterator iter2 = seq2->GetDataPtr(); // create matrix VF *forwardPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO); assert (forwardPtr); VF &forward = *forwardPtr; // initialization condition forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] = initialDistribution[0] + matchProb[(unsigned char) iter1[1]][(unsigned char) iter2[1]]; for (int k = 0; k < NumInsertStates; k++){ forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] = initialDistribution[2*k+1] + insProb[(unsigned char) iter1[1]][k]; forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] = initialDistribution[2*k+2] + insProb[(unsigned char) iter2[1]][k]; } // remember offset for each index combination int ij = 0; int i1j = -seq2Length - 1; int ij1 = -1; int i1j1 = -seq2Length - 2; ij *= NumMatrixTypes; i1j *= NumMatrixTypes; ij1 *= NumMatrixTypes; i1j1 *= NumMatrixTypes; // compute forward scores for (int i = 0; i <= seq1Length; i++){ unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i]; for (int j = 0; j <= seq2Length; j++){ unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j]; if (i > 1 || j > 1){ if (i > 0 && j > 0){ forward[0 + ij] = forward[0 + i1j1] + transProb[0][0]; for (int k = 1; k < NumMatrixTypes; k++) LOG_PLUS_EQUALS (forward[0 + ij], forward[k + i1j1] + transProb[k][0]); forward[0 + ij] += matchProb[c1][c2]; } if (i > 0){ for (int k = 0; k < NumInsertStates; k++) forward[2*k+1 + ij] = insProb[c1][k] + LOG_ADD (forward[0 + i1j] + transProb[0][2*k+1], forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1]); } if (j > 0){ for (int k = 0; k < NumInsertStates; k++) forward[2*k+2 + ij] = insProb[c2][k] + LOG_ADD (forward[0 + ij1] + transProb[0][2*k+2], forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2]); } } ij += NumMatrixTypes; i1j += NumMatrixTypes; ij1 += NumMatrixTypes; i1j1 += NumMatrixTypes; } } return forwardPtr; } ///////////////////////////////////////////////////////////////// // ProbabilisticModel::ComputeBackwardMatrix() // // Computes a set of backward probability matrices for aligning // seq1 and seq2. // // For efficiency reasons, a single-dimensional floating-point // array is used here, with the following indexing scheme: // // backward[i + NumMatrixTypes * (j * (seq2Length+1) + k)] // refers to the probability of starting in state i and // aligning from character j+1 to the end of the first // sequence and from character k+1 to the end of the second // sequence. ///////////////////////////////////////////////////////////////// VF *ComputeBackwardMatrix (Sequence *seq1, Sequence *seq2) const { assert (seq1); assert (seq2); const int seq1Length = seq1->GetLength(); const int seq2Length = seq2->GetLength(); SafeVector::iterator iter1 = seq1->GetDataPtr(); SafeVector::iterator iter2 = seq2->GetDataPtr(); // create matrix VF *backwardPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO); assert (backwardPtr); VF &backward = *backwardPtr; // initialization condition for (int k = 0; k < NumMatrixTypes; k++) backward[NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1) + k] = initialDistribution[k]; // remember offset for each index combination int ij = (seq1Length+1) * (seq2Length+1) - 1; int i1j = ij + seq2Length + 1; int ij1 = ij + 1; int i1j1 = ij + seq2Length + 2; ij *= NumMatrixTypes; i1j *= NumMatrixTypes; ij1 *= NumMatrixTypes; i1j1 *= NumMatrixTypes; // compute backward scores for (int i = seq1Length; i >= 0; i--){ unsigned char c1 = (i == seq1Length) ? '~' : (unsigned char) iter1[i+1]; for (int j = seq2Length; j >= 0; j--){ unsigned char c2 = (j == seq2Length) ? '~' : (unsigned char) iter2[j+1]; if (i < seq1Length && j < seq2Length){ const float ProbXY = backward[0 + i1j1] + matchProb[c1][c2]; for (int k = 0; k < NumMatrixTypes; k++) LOG_PLUS_EQUALS (backward[k + ij], ProbXY + transProb[k][0]); } if (i < seq1Length){ for (int k = 0; k < NumInsertStates; k++){ LOG_PLUS_EQUALS (backward[0 + ij], backward[2*k+1 + i1j] + insProb[c1][k] + transProb[0][2*k+1]); LOG_PLUS_EQUALS (backward[2*k+1 + ij], backward[2*k+1 + i1j] + insProb[c1][k] + transProb[2*k+1][2*k+1]); } } if (j < seq2Length){ for (int k = 0; k < NumInsertStates; k++){ LOG_PLUS_EQUALS (backward[0 + ij], backward[2*k+2 + ij1] + insProb[c2][k] + transProb[0][2*k+2]); LOG_PLUS_EQUALS (backward[2*k+2 + ij], backward[2*k+2 + ij1] + insProb[c2][k] + transProb[2*k+2][2*k+2]); } } ij -= NumMatrixTypes; i1j -= NumMatrixTypes; ij1 -= NumMatrixTypes; i1j1 -= NumMatrixTypes; } } return backwardPtr; } ///////////////////////////////////////////////////////////////// // ProbabilisticModel::ComputeTotalProbability() // // Computes the total probability of an alignment given // the forward and backward matrices. ///////////////////////////////////////////////////////////////// float ComputeTotalProbability (int seq1Length, int seq2Length, const VF &forward, const VF &backward) const { // compute total probability float totalForwardProb = LOG_ZERO; float totalBackwardProb = LOG_ZERO; for (int k = 0; k < NumMatrixTypes; k++){ LOG_PLUS_EQUALS (totalForwardProb, forward[k + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] + backward[k + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]); } totalBackwardProb = forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] + backward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)]; for (int k = 0; k < NumInsertStates; k++){ LOG_PLUS_EQUALS (totalBackwardProb, forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] + backward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)]); LOG_PLUS_EQUALS (totalBackwardProb, forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] + backward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)]); } // cerr << totalForwardProb << " " << totalBackwardProb << endl; return (totalForwardProb + totalBackwardProb) / 2; } ///////////////////////////////////////////////////////////////// // ProbabilisticModel::ComputePosteriorMatrix() // // Computes the posterior probability matrix based on // the forward and backward matrices. ///////////////////////////////////////////////////////////////// VF *ComputePosteriorMatrix (Sequence *seq1, Sequence *seq2, const VF &forward, const VF &backward) const { assert (seq1); assert (seq2); const int seq1Length = seq1->GetLength(); const int seq2Length = seq2->GetLength(); float totalProb = ComputeTotalProbability (seq1Length, seq2Length, forward, backward); // compute posterior matrices VF *posteriorPtr = new VF((seq1Length+1) * (seq2Length+1)); assert (posteriorPtr); VF &posterior = *posteriorPtr; int ij = 0; VF::iterator ptr = posterior.begin(); for (int i = 0; i <= seq1Length; i++){ for (int j = 0; j <= seq2Length; j++){ *(ptr++) = EXP (min (LOG_ONE, forward[ij] + backward[ij] - totalProb)); ij += NumMatrixTypes; } } posterior[0] = 0; return posteriorPtr; } /* ///////////////////////////////////////////////////////////////// // ProbabilisticModel::ComputeExpectedCounts() // // Computes the expected counts for the various transitions. ///////////////////////////////////////////////////////////////// VVF *ComputeExpectedCounts () const { assert (seq1); assert (seq2); const int seq1Length = seq1->GetLength(); const int seq2Length = seq2->GetLength(); SafeVector::iterator iter1 = seq1->GetDataPtr(); SafeVector::iterator iter2 = seq2->GetDataPtr(); // compute total probability float totalProb = ComputeTotalProbability (seq1Length, seq2Length, forward, backward); // initialize expected counts VVF *countsPtr = new VVF(NumMatrixTypes + 1, VF(NumMatrixTypes, LOG_ZERO)); assert (countsPtr); VVF &counts = *countsPtr; // remember offset for each index combination int ij = 0; int i1j = -seq2Length - 1; int ij1 = -1; int i1j1 = -seq2Length - 2; ij *= NumMatrixTypes; i1j *= NumMatrixTypes; ij1 *= NumMatrixTypes; i1j1 *= NumMatrixTypes; // compute expected counts for (int i = 0; i <= seq1Length; i++){ unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i]; for (int j = 0; j <= seq2Length; j++){ unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j]; if (i > 0 && j > 0){ for (int k = 0; k < NumMatrixTypes; k++) LOG_PLUS_EQUALS (counts[k][0], forward[k + i1j1] + transProb[k][0] + matchProb[c1][c2] + backward[0 + ij]); } if (i > 0){ for (int k = 0; k < NumInsertStates; k++){ LOG_PLUS_EQUALS (counts[0][2*k+1], forward[0 + i1j] + transProb[0][2*k+1] + insProb[c1][k] + backward[2*k+1 + ij]); LOG_PLUS_EQUALS (counts[2*k+1][2*k+1], forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] + insProb[c1][k] + backward[2*k+1 + ij]); } } if (j > 0){ for (int k = 0; k < NumInsertStates; k++){ LOG_PLUS_EQUALS (counts[0][2*k+2], forward[0 + ij1] + transProb[0][2*k+2] + insProb[c2][k] + backward[2*k+2 + ij]); LOG_PLUS_EQUALS (counts[2*k+2][2*k+2], forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] + insProb[c2][k] + backward[2*k+2 + ij]); } } ij += NumMatrixTypes; i1j += NumMatrixTypes; ij1 += NumMatrixTypes; i1j1 += NumMatrixTypes; } } // scale all expected counts appropriately for (int i = 0; i < NumMatrixTypes; i++) for (int j = 0; j < NumMatrixTypes; j++) counts[i][j] -= totalProb; } */ ///////////////////////////////////////////////////////////////// // ProbabilisticModel::ComputeNewParameters() // // Computes a new parameter set based on the expected counts // given. ///////////////////////////////////////////////////////////////// void ComputeNewParameters (Sequence *seq1, Sequence *seq2, const VF &forward, const VF &backward, VF &initDistribMat, VF &gapOpen, VF &gapExtend, VVF &emitPairs, VF &emitSingle, bool enableTrainEmissions) const { assert (seq1); assert (seq2); const int seq1Length = seq1->GetLength(); const int seq2Length = seq2->GetLength(); SafeVector::iterator iter1 = seq1->GetDataPtr(); SafeVector::iterator iter2 = seq2->GetDataPtr(); // compute total probability float totalProb = ComputeTotalProbability (seq1Length, seq2Length, forward, backward); // initialize expected counts VVF transCounts (NumMatrixTypes, VF (NumMatrixTypes, LOG_ZERO)); VF initCounts (NumMatrixTypes, LOG_ZERO); VVF pairCounts (256, VF (256, LOG_ZERO)); VF singleCounts (256, LOG_ZERO); // remember offset for each index combination int ij = 0; int i1j = -seq2Length - 1; int ij1 = -1; int i1j1 = -seq2Length - 2; ij *= NumMatrixTypes; i1j *= NumMatrixTypes; ij1 *= NumMatrixTypes; i1j1 *= NumMatrixTypes; // compute initial distribution posteriors initCounts[0] = LOG_ADD (forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] + backward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)], forward[0 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] + backward[0 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]); for (int k = 0; k < NumInsertStates; k++){ initCounts[2*k+1] = LOG_ADD (forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] + backward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)], forward[2*k+1 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] + backward[2*k+1 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]); initCounts[2*k+2] = LOG_ADD (forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] + backward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)], forward[2*k+2 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] + backward[2*k+2 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]); } // compute expected counts for (int i = 0; i <= seq1Length; i++){ unsigned char c1 = (i == 0) ? '~' : (unsigned char) toupper(iter1[i]); for (int j = 0; j <= seq2Length; j++){ unsigned char c2 = (j == 0) ? '~' : (unsigned char) toupper(iter2[j]); if (i > 0 && j > 0){ if (enableTrainEmissions && i == 1 && j == 1){ LOG_PLUS_EQUALS (pairCounts[c1][c2], initialDistribution[0] + matchProb[c1][c2] + backward[0 + ij]); LOG_PLUS_EQUALS (pairCounts[c2][c1], initialDistribution[0] + matchProb[c2][c1] + backward[0 + ij]); } for (int k = 0; k < NumMatrixTypes; k++){ LOG_PLUS_EQUALS (transCounts[k][0], forward[k + i1j1] + transProb[k][0] + matchProb[c1][c2] + backward[0 + ij]); if (enableTrainEmissions && i != 1 || j != 1){ LOG_PLUS_EQUALS (pairCounts[c1][c2], forward[k + i1j1] + transProb[k][0] + matchProb[c1][c2] + backward[0 + ij]); LOG_PLUS_EQUALS (pairCounts[c2][c1], forward[k + i1j1] + transProb[k][0] + matchProb[c2][c1] + backward[0 + ij]); } } } if (i > 0){ for (int k = 0; k < NumInsertStates; k++){ LOG_PLUS_EQUALS (transCounts[0][2*k+1], forward[0 + i1j] + transProb[0][2*k+1] + insProb[c1][k] + backward[2*k+1 + ij]); LOG_PLUS_EQUALS (transCounts[2*k+1][2*k+1], forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] + insProb[c1][k] + backward[2*k+1 + ij]); if (enableTrainEmissions){ if (i == 1 && j == 0){ LOG_PLUS_EQUALS (singleCounts[c1], initialDistribution[2*k+1] + insProb[c1][k] + backward[2*k+1 + ij]); } else { LOG_PLUS_EQUALS (singleCounts[c1], forward[0 + i1j] + transProb[0][2*k+1] + insProb[c1][k] + backward[2*k+1 + ij]); LOG_PLUS_EQUALS (singleCounts[c1], forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] + insProb[c1][k] + backward[2*k+1 + ij]); } } } } if (j > 0){ for (int k = 0; k < NumInsertStates; k++){ LOG_PLUS_EQUALS (transCounts[0][2*k+2], forward[0 + ij1] + transProb[0][2*k+2] + insProb[c2][k] + backward[2*k+2 + ij]); LOG_PLUS_EQUALS (transCounts[2*k+2][2*k+2], forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] + insProb[c2][k] + backward[2*k+2 + ij]); if (enableTrainEmissions){ if (i == 0 && j == 1){ LOG_PLUS_EQUALS (singleCounts[c2], initialDistribution[2*k+2] + insProb[c2][k] + backward[2*k+2 + ij]); } else { LOG_PLUS_EQUALS (singleCounts[c2], forward[0 + ij1] + transProb[0][2*k+2] + insProb[c2][k] + backward[2*k+2 + ij]); LOG_PLUS_EQUALS (singleCounts[c2], forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] + insProb[c2][k] + backward[2*k+2 + ij]); } } } } ij += NumMatrixTypes; i1j += NumMatrixTypes; ij1 += NumMatrixTypes; i1j1 += NumMatrixTypes; } } // scale all expected counts appropriately for (int i = 0; i < NumMatrixTypes; i++){ initCounts[i] -= totalProb; for (int j = 0; j < NumMatrixTypes; j++) transCounts[i][j] -= totalProb; } if (enableTrainEmissions){ for (int i = 0; i < 256; i++){ for (int j = 0; j < 256; j++) pairCounts[i][j] -= totalProb; singleCounts[i] -= totalProb; } } // compute new initial distribution float totalInitDistribCounts = 0; for (int i = 0; i < NumMatrixTypes; i++) totalInitDistribCounts += exp (initCounts[i]); // should be 2 initDistribMat[0] = min (1.0f, max (0.0f, (float) exp (initCounts[0]) / totalInitDistribCounts)); for (int k = 0; k < NumInsertStates; k++){ float val = (exp (initCounts[2*k+1]) + exp (initCounts[2*k+2])) / 2; initDistribMat[2*k+1] = initDistribMat[2*k+2] = min (1.0f, max (0.0f, val / totalInitDistribCounts)); } // compute total counts for match state float inMatchStateCounts = 0; for (int i = 0; i < NumMatrixTypes; i++) inMatchStateCounts += exp (transCounts[0][i]); for (int i = 0; i < NumInsertStates; i++){ // compute total counts for gap state float inGapStateCounts = exp (transCounts[2*i+1][0]) + exp (transCounts[2*i+1][2*i+1]) + exp (transCounts[2*i+2][0]) + exp (transCounts[2*i+2][2*i+2]); gapOpen[2*i] = gapOpen[2*i+1] = (exp (transCounts[0][2*i+1]) + exp (transCounts[0][2*i+2])) / (2 * inMatchStateCounts); gapExtend[2*i] = gapExtend[2*i+1] = (exp (transCounts[2*i+1][2*i+1]) + exp (transCounts[2*i+2][2*i+2])) / inGapStateCounts; } if (enableTrainEmissions){ float totalPairCounts = 0; float totalSingleCounts = 0; for (int i = 0; i < 256; i++){ for (int j = 0; j <= i; j++) totalPairCounts += exp (pairCounts[j][i]); totalSingleCounts += exp (singleCounts[i]); } for (int i = 0; i < 256; i++) if (!islower ((char) i)){ int li = (int)((unsigned char) tolower ((char) i)); for (int j = 0; j <= i; j++) if (!islower ((char) j)){ int lj = (int)((unsigned char) tolower ((char) j)); emitPairs[i][j] = emitPairs[i][lj] = emitPairs[li][j] = emitPairs[li][lj] = emitPairs[j][i] = emitPairs[j][li] = emitPairs[lj][i] = emitPairs[lj][li] = exp(pairCounts[j][i]) / totalPairCounts; } emitSingle[i] = emitSingle[li] = exp(singleCounts[i]) / totalSingleCounts; } } } ///////////////////////////////////////////////////////////////// // ProbabilisticModel::ComputeAlignment() // // Computes an alignment based on the given posterior matrix. // This is done by finding the maximum summing path (or // maximum weight trace) through the posterior matrix. The // final alignment is returned as a pair consisting of: // (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and // denote insertions in one of the two sequences and // B's denote that both sequences are present (i.e. // matches). // (2) a float indicating the sum achieved ///////////////////////////////////////////////////////////////// pair *, float> ComputeAlignment (int seq1Length, int seq2Length, const VF &posterior) const { float *twoRows = new float[(seq2Length+1)*2]; assert (twoRows); float *oldRow = twoRows; float *newRow = twoRows + seq2Length + 1; char *tracebackMatrix = new char[(seq1Length+1)*(seq2Length+1)]; assert (tracebackMatrix); char *tracebackPtr = tracebackMatrix; VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1; // initialization for (int i = 0; i <= seq2Length; i++){ oldRow[i] = 0; *(tracebackPtr++) = 'L'; } // fill in matrix for (int i = 1; i <= seq1Length; i++){ // initialize left column newRow[0] = 0; posteriorPtr++; *(tracebackPtr++) = 'U'; // fill in rest of row for (int j = 1; j <= seq2Length; j++){ ChooseBestOfThree (*(posteriorPtr++) + oldRow[j-1], newRow[j-1], oldRow[j], 'D', 'L', 'U', &newRow[j], tracebackPtr++); } // swap rows float *temp = oldRow; oldRow = newRow; newRow = temp; } // store best score float total = oldRow[seq2Length]; delete [] twoRows; // compute traceback SafeVector *alignment = new SafeVector; assert (alignment); int r = seq1Length, c = seq2Length; while (r != 0 || c != 0){ char ch = tracebackMatrix[r*(seq2Length+1) + c]; switch (ch){ case 'L': c--; alignment->push_back ('Y'); break; case 'U': r--; alignment->push_back ('X'); break; case 'D': c--; r--; alignment->push_back ('B'); break; default: assert (false); } } delete [] tracebackMatrix; reverse (alignment->begin(), alignment->end()); return make_pair(alignment, total); } ///////////////////////////////////////////////////////////////// // ProbabilisticModel::ComputeAlignmentWithGapPenalties() // // Similar to ComputeAlignment() except with gap penalties. ///////////////////////////////////////////////////////////////// pair *, float> ComputeAlignmentWithGapPenalties (MultiSequence *align1, MultiSequence *align2, const VF &posterior, int numSeqs1, int numSeqs2, float gapOpenPenalty, float gapContinuePenalty) const { int seq1Length = align1->GetSequence(0)->GetLength(); int seq2Length = align2->GetSequence(0)->GetLength(); SafeVector::iterator > dataPtrs1 (align1->GetNumSequences()); SafeVector::iterator > dataPtrs2 (align2->GetNumSequences()); // grab character data for (int i = 0; i < align1->GetNumSequences(); i++) dataPtrs1[i] = align1->GetSequence(i)->GetDataPtr(); for (int i = 0; i < align2->GetNumSequences(); i++) dataPtrs2[i] = align2->GetSequence(i)->GetDataPtr(); // the number of active sequences at any given column is defined to be the // number of non-gap characters in that column; the number of gap opens at // any given column is defined to be the number of gap characters in that // column where the previous character in the respective sequence was not // a gap SafeVector numActive1 (seq1Length+1), numGapOpens1 (seq1Length+1); SafeVector numActive2 (seq2Length+1), numGapOpens2 (seq2Length+1); // compute number of active sequences and gap opens for each group for (int i = 0; i < align1->GetNumSequences(); i++){ SafeVector::iterator dataPtr = align1->GetSequence(i)->GetDataPtr(); numActive1[0] = numGapOpens1[0] = 0; for (int j = 1; j <= seq1Length; j++){ if (dataPtr[j] != '-'){ numActive1[j]++; numGapOpens1[j] += (j != 1 && dataPtr[j-1] != '-'); } } } for (int i = 0; i < align2->GetNumSequences(); i++){ SafeVector::iterator dataPtr = align2->GetSequence(i)->GetDataPtr(); numActive2[0] = numGapOpens2[0] = 0; for (int j = 1; j <= seq2Length; j++){ if (dataPtr[j] != '-'){ numActive2[j]++; numGapOpens2[j] += (j != 1 && dataPtr[j-1] != '-'); } } } VVF openingPenalty1 (numSeqs1+1, VF (numSeqs2+1)); VF continuingPenalty1 (numSeqs1+1); VVF openingPenalty2 (numSeqs1+1, VF (numSeqs2+1)); VF continuingPenalty2 (numSeqs2+1); // precompute penalties for (int i = 0; i <= numSeqs1; i++) for (int j = 0; j <= numSeqs2; j++) openingPenalty1[i][j] = i * (gapOpenPenalty * j + gapContinuePenalty * (numSeqs2 - j)); for (int i = 0; i <= numSeqs1; i++) continuingPenalty1[i] = i * gapContinuePenalty * numSeqs2; for (int i = 0; i <= numSeqs2; i++) for (int j = 0; j <= numSeqs1; j++) openingPenalty2[i][j] = i * (gapOpenPenalty * j + gapContinuePenalty * (numSeqs1 - j)); for (int i = 0; i <= numSeqs2; i++) continuingPenalty2[i] = i * gapContinuePenalty * numSeqs1; float *twoRows = new float[6*(seq2Length+1)]; assert (twoRows); float *oldRowMatch = twoRows; float *newRowMatch = twoRows + (seq2Length+1); float *oldRowInsertX = twoRows + 2*(seq2Length+1); float *newRowInsertX = twoRows + 3*(seq2Length+1); float *oldRowInsertY = twoRows + 4*(seq2Length+1); float *newRowInsertY = twoRows + 5*(seq2Length+1); char *tracebackMatrix = new char[3*(seq1Length+1)*(seq2Length+1)]; assert (tracebackMatrix); char *tracebackPtr = tracebackMatrix; VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1; // initialization for (int i = 0; i <= seq2Length; i++){ oldRowMatch[i] = oldRowInsertX[i] = (i == 0) ? 0 : LOG_ZERO; oldRowInsertY[i] = (i == 0) ? 0 : oldRowInsertY[i-1] + continuingPenalty2[numActive2[i]]; *(tracebackPtr) = *(tracebackPtr+1) = *(tracebackPtr+2) = 'Y'; tracebackPtr += 3; } // fill in matrix for (int i = 1; i <= seq1Length; i++){ // initialize left column newRowMatch[0] = newRowInsertY[0] = LOG_ZERO; newRowInsertX[0] = oldRowInsertX[0] + continuingPenalty1[numActive1[i]]; posteriorPtr++; *(tracebackPtr) = *(tracebackPtr+1) = *(tracebackPtr+2) = 'X'; tracebackPtr += 3; // fill in rest of row for (int j = 1; j <= seq2Length; j++){ // going to MATCH state ChooseBestOfThree (oldRowMatch[j-1], oldRowInsertX[j-1], oldRowInsertY[j-1], 'M', 'X', 'Y', &newRowMatch[j], tracebackPtr++); newRowMatch[j] += *(posteriorPtr++); // going to INSERT X state ChooseBestOfThree (oldRowMatch[j] + openingPenalty1[numActive1[i]][numGapOpens2[j]], oldRowInsertX[j] + continuingPenalty1[numActive1[i]], oldRowInsertY[j] + openingPenalty1[numActive1[i]][numGapOpens2[j]], 'M', 'X', 'Y', &newRowInsertX[j], tracebackPtr++); // going to INSERT Y state ChooseBestOfThree (newRowMatch[j-1] + openingPenalty2[numActive2[j]][numGapOpens1[i]], newRowInsertX[j-1] + openingPenalty2[numActive2[j]][numGapOpens1[i]], newRowInsertY[j-1] + continuingPenalty2[numActive2[j]], 'M', 'X', 'Y', &newRowInsertY[j], tracebackPtr++); } // swap rows float *temp; temp = oldRowMatch; oldRowMatch = newRowMatch; newRowMatch = temp; temp = oldRowInsertX; oldRowInsertX = newRowInsertX; newRowInsertX = temp; temp = oldRowInsertY; oldRowInsertY = newRowInsertY; newRowInsertY = temp; } // store best score float total; char matrix; ChooseBestOfThree (oldRowMatch[seq2Length], oldRowInsertX[seq2Length], oldRowInsertY[seq2Length], 'M', 'X', 'Y', &total, &matrix); delete [] twoRows; // compute traceback SafeVector *alignment = new SafeVector; assert (alignment); int r = seq1Length, c = seq2Length; while (r != 0 || c != 0){ int offset = (matrix == 'M') ? 0 : (matrix == 'X') ? 1 : 2; char ch = tracebackMatrix[(r*(seq2Length+1) + c) * 3 + offset]; switch (matrix){ case 'Y': c--; alignment->push_back ('Y'); break; case 'X': r--; alignment->push_back ('X'); break; case 'M': c--; r--; alignment->push_back ('B'); break; default: assert (false); } matrix = ch; } delete [] tracebackMatrix; reverse (alignment->begin(), alignment->end()); return make_pair(alignment, 1.0f); } ///////////////////////////////////////////////////////////////// // ProbabilisticModel::ComputeViterbiAlignment() // // Computes the highest probability pairwise alignment using the // probabilistic model. The final alignment is returned as a // pair consisting of: // (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and // denote insertions in one of the two sequences and // B's denote that both sequences are present (i.e. // matches). // (2) a float containing the log probability of the best // alignment (not used) ///////////////////////////////////////////////////////////////// pair *, float> ComputeViterbiAlignment (Sequence *seq1, Sequence *seq2) const { assert (seq1); assert (seq2); const int seq1Length = seq1->GetLength(); const int seq2Length = seq2->GetLength(); // retrieve the points to the beginning of each sequence SafeVector::iterator iter1 = seq1->GetDataPtr(); SafeVector::iterator iter2 = seq2->GetDataPtr(); // create viterbi matrix VF *viterbiPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO); assert (viterbiPtr); VF &viterbi = *viterbiPtr; // create traceback matrix VI *tracebackPtr = new VI (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), -1); assert (tracebackPtr); VI &traceback = *tracebackPtr; // initialization condition for (int k = 0; k < NumMatrixTypes; k++) viterbi[k] = initialDistribution[k]; // remember offset for each index combination int ij = 0; int i1j = -seq2Length - 1; int ij1 = -1; int i1j1 = -seq2Length - 2; ij *= NumMatrixTypes; i1j *= NumMatrixTypes; ij1 *= NumMatrixTypes; i1j1 *= NumMatrixTypes; // compute viterbi scores for (int i = 0; i <= seq1Length; i++){ unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i]; for (int j = 0; j <= seq2Length; j++){ unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j]; if (i > 0 && j > 0){ for (int k = 0; k < NumMatrixTypes; k++){ float newVal = viterbi[k + i1j1] + transProb[k][0] + matchProb[c1][c2]; if (viterbi[0 + ij] < newVal){ viterbi[0 + ij] = newVal; traceback[0 + ij] = k; } } } if (i > 0){ for (int k = 0; k < NumInsertStates; k++){ float valFromMatch = insProb[c1][k] + viterbi[0 + i1j] + transProb[0][2*k+1]; float valFromIns = insProb[c1][k] + viterbi[2*k+1 + i1j] + transProb[2*k+1][2*k+1]; if (valFromMatch >= valFromIns){ viterbi[2*k+1 + ij] = valFromMatch; traceback[2*k+1 + ij] = 0; } else { viterbi[2*k+1 + ij] = valFromIns; traceback[2*k+1 + ij] = 2*k+1; } } } if (j > 0){ for (int k = 0; k < NumInsertStates; k++){ float valFromMatch = insProb[c2][k] + viterbi[0 + ij1] + transProb[0][2*k+2]; float valFromIns = insProb[c2][k] + viterbi[2*k+2 + ij1] + transProb[2*k+2][2*k+2]; if (valFromMatch >= valFromIns){ viterbi[2*k+2 + ij] = valFromMatch; traceback[2*k+2 + ij] = 0; } else { viterbi[2*k+2 + ij] = valFromIns; traceback[2*k+2 + ij] = 2*k+2; } } } ij += NumMatrixTypes; i1j += NumMatrixTypes; ij1 += NumMatrixTypes; i1j1 += NumMatrixTypes; } } // figure out best terminating cell float bestProb = LOG_ZERO; int state = -1; for (int k = 0; k < NumMatrixTypes; k++){ float thisProb = viterbi[k + NumMatrixTypes * ((seq1Length+1)*(seq2Length+1) - 1)] + initialDistribution[k]; if (bestProb < thisProb){ bestProb = thisProb; state = k; } } assert (state != -1); delete viterbiPtr; // compute traceback SafeVector *alignment = new SafeVector; assert (alignment); int r = seq1Length, c = seq2Length; while (r != 0 || c != 0){ int newState = traceback[state + NumMatrixTypes * (r * (seq2Length+1) + c)]; if (state == 0){ c--; r--; alignment->push_back ('B'); } else if (state % 2 == 1){ r--; alignment->push_back ('X'); } else { c--; alignment->push_back ('Y'); } state = newState; } delete tracebackPtr; reverse (alignment->begin(), alignment->end()); return make_pair(alignment, bestProb); } ///////////////////////////////////////////////////////////////// // ProbabilisticModel::BuildPosterior() // // Builds a posterior probability matrix needed to align a pair // of alignments. Mathematically, the returned matrix M is // defined as follows: // M[i,j] = sum sum f(s,t,i,j) // s in align1 t in align2 // where // [ P(s[i'] <--> t[j']) // [ if s[i'] is a letter in the ith column of align1 and // [ t[j'] it a letter in the jth column of align2 // f(s,t,i,j) = [ // [ 0 otherwise // ///////////////////////////////////////////////////////////////// VF *BuildPosterior (MultiSequence *align1, MultiSequence *align2, const SafeVector > &sparseMatrices, float cutoff = 0.0f) const { const int seq1Length = align1->GetSequence(0)->GetLength(); const int seq2Length = align2->GetSequence(0)->GetLength(); VF *posteriorPtr = new VF((seq1Length+1) * (seq2Length+1), 0); assert (posteriorPtr); VF &posterior = *posteriorPtr; VF::iterator postPtr = posterior.begin(); // for each s in align1 for (int i = 0; i < align1->GetNumSequences(); i++){ int first = align1->GetSequence(i)->GetLabel(); SafeVector *mapping1 = align1->GetSequence(i)->GetMapping(); // for each t in align2 for (int j = 0; j < align2->GetNumSequences(); j++){ int second = align2->GetSequence(j)->GetLabel(); SafeVector *mapping2 = align2->GetSequence(j)->GetMapping(); if (first < second){ // get the associated sparse matrix SparseMatrix *matrix = sparseMatrices[first][second]; for (int ii = 1; ii <= matrix->GetSeq1Length(); ii++){ SafeVector::iterator row = matrix->GetRowPtr(ii); int base = (*mapping1)[ii] * (seq2Length+1); int rowSize = matrix->GetRowSize(ii); // add in all relevant values for (int jj = 0; jj < rowSize; jj++) posterior[base + (*mapping2)[row[jj].first]] += row[jj].second; // subtract cutoff for (int jj = 0; jj < matrix->GetSeq2Length(); jj++) posterior[base + (*mapping2)[jj]] -= cutoff; } } else { // get the associated sparse matrix SparseMatrix *matrix = sparseMatrices[second][first]; for (int jj = 1; jj <= matrix->GetSeq1Length(); jj++){ SafeVector::iterator row = matrix->GetRowPtr(jj); int base = (*mapping2)[jj]; int rowSize = matrix->GetRowSize(jj); // add in all relevant values for (int ii = 0; ii < rowSize; ii++) posterior[base + (*mapping1)[row[ii].first] * (seq2Length + 1)] += row[ii].second; // subtract cutoff for (int ii = 0; ii < matrix->GetSeq2Length(); ii++) posterior[base + (*mapping1)[ii] * (seq2Length + 1)] -= cutoff; } } delete mapping2; } delete mapping1; } return posteriorPtr; } }; #endif probcons/ProjectPairwise.cc0000644000175000017500000000324310075047142016524 0ustar chuongdochuongdo///////////////////////////////////////////////////////////////// // ProjectPairwise // // Program for projecting multiple alignments to all pairwise // alignments. ///////////////////////////////////////////////////////////////// #include "SafeVector.h" #include "MultiSequence.h" #include #include #include #include #include #include #include #include #include #include #include bool compressGaps = true; ///////////////////////////////////////////////////////////////// // main() // // Main program. ///////////////////////////////////////////////////////////////// int main (int argc, char **argv){ // check arguments if (argc < 2){ cerr << "Usage: project ALIGNMENT [-nocompressgaps]" << endl; exit (1); } for (int i = 2; i < argc; i++){ if (strcmp (argv[i], "-nocompressgaps") == 0) compressGaps = false; else { cerr << "Unrecognized option: " << argv[i] << endl; exit (1); } } MultiSequence *align = new MultiSequence (string (argv[1])); assert (align); int N = align->GetNumSequences(); for (int i = 0; i < N; i++){ for (int j = i+1; j < N; j++){ string name = align->GetSequence(i)->GetHeader() + "-" + align->GetSequence(j)->GetHeader() + ".fasta"; ofstream outfile (name.c_str()); if (compressGaps){ set s; s.insert (i); s.insert (j); MultiSequence *proj = align->Project (s); proj->WriteMFA (outfile); delete proj; } else { align->GetSequence(i)->WriteMFA (outfile, 60); align->GetSequence(j)->WriteMFA (outfile, 60); } outfile.close(); } } delete align; } probcons/README0000644000175000017500000001034310604510502013753 0ustar chuongdochuongdo PROBCONS ~~~~~~~~ Probabilistic consistency-based multiple sequence alignment ----------------------------------------------------------------- PROBCONS is a novel tool for generating multiple alignments of protein sequences. Using a combination of probabilistic modeling and consistency-based alignment techniques, PROBCONS has achieved the highest accuracy of all alignment methods to date. PROBCONS was developed by Chuong B. Do in collaboration with Michael Brudno in the research group of Serafim Batzoglou, Department of Computer Science, Stanford University. For more information on the algorithms, please see Do, C.B., Brudno, M., and Batzoglou, S. (2004) PROBCONS: Probabilistic Consistency-based Multiple Alignment of Amino Acid Sequences. 12th International Conference on Intelligent Systems for Molecular Biology. In press. and Do, C.B., Brudno, M., and Batzoglou, S. (2004) PROBCONS: Probabilistic Consistency-based Multiple Alignment of Amino Acid Sequences. The 19th National Conference on Artificial Intelligence (AAAI-04). In press. ----------------------------------------------------------------- PROBCONS has been made freely available as PUBLIC DOMAIN software and hence is not subject to copyright in the United States. This system and/or any portion of the source code may be used, modified, or redistributed without restrictions. PROBCONS is distributed WITHOUT WARRANTY, express or implied. The authors accept NO LEGAL LIABILITY OR RESPONSIBILITY for loss due to reliance on the program. ----------------------------------------------------------------- Version History 1.0, 3/23/2004 (Chuong Do) -- initial release 1.01, 3/25/2004 (Chuong Do) -- fixed error in training procedure -- retrained default parameters for 1 and 2 pairs of insert states 1.02, 4/17/2004 (Chuong Do) -- replaced LOG_ADD and EXP routines -- added support for reading MSF format files -- added two extra utilities for scoring PROBCONS alignments (for benchmarking purposes) -- added the "compare" program for scoring alignments according to a reference alignment with respect to sum-of-pairs and column scores -- added the "fixref" program for adjusting PREFAB alignments to contain all letters of the input sequences; basically the main program for PROBCONS "hacked" to get the job done 1.03, 5/3/2004 (Chuong Do) -- added option to do all-pairs pairwise alignments instead of constructing a full multiple alignment -- added support for reading DIALIGN style files -- enabled support for using BAliBASE annotations for scoring BAliBASE alignments -- several minor bug fixes thanks to Bob Edgar -- added "project" program to project multiple alignment to pairwise alignments 1.04, 5/9/2004 (Chuong Do) -- switched over to default of one-insert state pair -- retrained default parameters -- added annotation scores -- small changes to model topology to make end gaps symmetrical -- added makegnuplot utility to plot annotation scores 1.05, 5/26/2004 (Chuong Do) -- added cutoff filtering for posterior scores -- made small corrections to recurrences for computing alignments -- added CLUSTALW output support 1.06, 7/13/2004 (Chuong Do) -- ProbCons is now PUBLIC DOMAIN software. 1.07, 8/30/2004 (Chuong Do) -- Fixed CLUSTALW output for sequence names (thanks to John Calley for pointing this out) 1.08, 8/31/2004 (Chuong Do) -- Added option for alignment order output (-a). 1.09, 9/1/2004 (Chuong Do) -- PROBCONS now allows input files with existing gaps -- these are automatically stripped before alignment. 1.10, 3/16/2005 (Chuong Do) -- Reduced memory consumption by -- not storing posterior matrix transposes -- restricting consistency-derived posterior matrices to original posterior matrix 1.11, 12/1/2006 (Chuong Do) -- Fixed bug with conflict between --annot and -a options 1.12, 4/1/2006 (Chuong Do) -- Fixed GCC 4.3 compatibility problems (thanks to Martin Michlmayr and Charles Plessy)probcons/SafeVector.h0000644000175000017500000000276010217640624015322 0ustar chuongdochuongdo///////////////////////////////////////////////////////////////// // SafeVector.h // // STL vector with array bounds checking. To enable bounds // checking, #define ENABLE_CHECKS. ///////////////////////////////////////////////////////////////// #ifndef SAFEVECTOR_H #define SAFEVECTOR_H #include #include ///////////////////////////////////////////////////////////////// // SafeVector // // Class derived from the STL std::vector for bounds checking. ///////////////////////////////////////////////////////////////// template class SafeVector : public std::vector{ public: // miscellaneous constructors SafeVector() : std::vector() {} SafeVector (size_t size) : std::vector(size) {} SafeVector (size_t size, const TYPE &value) : std::vector(size, value) {} SafeVector (const SafeVector &source) : std::vector(source) {} #ifdef ENABLE_CHECKS // [] array bounds checking TYPE &operator[](int index){ assert (index >= 0 && index < (int) size()); return std::vector::operator[] ((size_t) index); } // [] const array bounds checking const TYPE &operator[] (int index) const { assert (index >= 0 && index < (int) size()); return std::vector::operator[] ((size_t) index) ; } #endif }; // some commonly used vector types typedef SafeVector VI; typedef SafeVector VVI; typedef SafeVector VVVI; typedef SafeVector VF; typedef SafeVector VVF; typedef SafeVector VVVF; #endif probcons/ScoreType.h0000644000175000017500000002534310075047142015176 0ustar chuongdochuongdo///////////////////////////////////////////////////////////////// // ScoreType.h // // Routines for doing math operations in PROBCONS. ///////////////////////////////////////////////////////////////// #ifndef SCORETYPE_H #define SCORETYPE_H #include #include #include typedef float ScoreType; const float LOG_ZERO = -2e20; const float LOG_ONE = 0.0; ///////////////////////////////////////////////////////////////// // LOG() // // Compute the logarithm of x. ///////////////////////////////////////////////////////////////// inline ScoreType LOG (ScoreType x){ return log (x); } ///////////////////////////////////////////////////////////////// // EXP() // // Computes exp(x). ///////////////////////////////////////////////////////////////// inline ScoreType EXP (ScoreType x){ //return exp(x); if (x > -2){ if (x > -0.5){ if (x > 0) return exp(x); return (((0.03254409303190190000*x + 0.16280432765779600000)*x + 0.49929760485974900000)*x + 0.99995149601363700000)*x + 0.99999925508501600000; } if (x > -1) return (((0.01973899026052090000*x + 0.13822379685007000000)*x + 0.48056651562365000000)*x + 0.99326940370383500000)*x + 0.99906756856399500000; return (((0.00940528203591384000*x + 0.09414963667859410000)*x + 0.40825793595877300000)*x + 0.93933625499130400000)*x + 0.98369508190545300000; } if (x > -8){ if (x > -4) return (((0.00217245711583303000*x + 0.03484829428350620000)*x + 0.22118199801337800000)*x + 0.67049462206469500000)*x + 0.83556950223398500000; return (((0.00012398771025456900*x + 0.00349155785951272000)*x + 0.03727721426017900000)*x + 0.17974997741536900000)*x + 0.33249299994217400000; } if (x > -16) return (((0.00000051741713416603*x + 0.00002721456879608080)*x + 0.00053418601865636800)*x + 0.00464101989351936000)*x + 0.01507447981459420000; return 0; } /* ///////////////////////////////////////////////////////////////// // LOOKUP() // // Computes log (exp (x) + 1), for 0 <= x <= 7.5. ///////////////////////////////////////////////////////////////// inline ScoreType LOOKUP (ScoreType x){ //return log (exp(x) + 1); if (x < 2){ if (x < 0.5){ if (x < 0) return log (exp(x) + 1); return (((-0.00486373205785640000*x - 0.00020245408813934800)*x + 0.12504222666029800000)*x + 0.49999685320563000000)*x + 0.69314723138948900000; } if (x < 1) return (((-0.00278634205460548000*x - 0.00458097251248546000)*x + 0.12865849880472500000)*x + 0.49862228499205200000)*x + 0.69334810088688000000; return (((0.00059633755154209200*x - 0.01918996666063320000)*x + 0.15288232492093800000)*x + 0.48039958825756900000)*x + 0.69857578503189200000; } if (x < 8){ if (x < 4) return (((0.00135958539181047000*x - 0.02329807659316430000)*x + 0.15885799609532100000)*x + 0.48167498563270800000)*x + 0.69276185058669200000; return (((0.00011992394456683500*x - 0.00338464503306568000)*x + 0.03622746366545470000)*x + 0.82481250248383700000)*x + 0.32507892994863100000; } if (x < 16) return (((0.00000051726300753785*x - 0.00002720671238876090)*x + 0.00053403733818413500)*x + 0.99536021775747900000)*x + 0.01507065715532010000; return x; } ///////////////////////////////////////////////////////////////// // LOOKUP_SLOW() // // Computes log (exp (x) + 1). ///////////////////////////////////////////////////////////////// inline ScoreType LOOKUP_SLOW (ScoreType x){ return log (exp (x) + 1); } ///////////////////////////////////////////////////////////////// // MAX() // // Compute max of three numbers ///////////////////////////////////////////////////////////////// inline ScoreType MAX (ScoreType x, ScoreType y, ScoreType z){ if (x >= y){ if (x >= z) return x; return z; } if (y >= z) return y; return z; } ///////////////////////////////////////////////////////////////// // LOG_PLUS_EQUALS() // // Add two log probabilities and store in the first argument ///////////////////////////////////////////////////////////////// inline void LOG_PLUS_EQUALS (ScoreType &x, ScoreType y){ if (x < y) x = (x <= LOG_ZERO) ? y : LOOKUP(y-x) + x; else x = (y <= LOG_ZERO) ? x : LOOKUP(x-y) + y; } ///////////////////////////////////////////////////////////////// // LOG_PLUS_EQUALS_SLOW() // // Add two log probabilities and store in the first argument ///////////////////////////////////////////////////////////////// inline void LOG_PLUS_EQUALS_SLOW (ScoreType &x, ScoreType y){ if (x < y) x = (x <= LOG_ZERO) ? y : LOOKUP_SLOW(y-x) + x; else x = (y <= LOG_ZERO) ? x : LOOKUP_SLOW(x-y) + y; } ///////////////////////////////////////////////////////////////// // LOG_ADD() // // Add two log probabilities ///////////////////////////////////////////////////////////////// inline ScoreType LOG_ADD (ScoreType x, ScoreType y){ if (x < y) return (x <= LOG_ZERO) ? y : LOOKUP(y-x) + x; return (y <= LOG_ZERO) ? x : LOOKUP(x-y) + y; } */ /* ///////////////////////////////////////////////////////////////// // LOG() // // Compute the logarithm of x. ///////////////////////////////////////////////////////////////// inline float LOG (float x){ return log (x); } ///////////////////////////////////////////////////////////////// // EXP() // // Computes exp(x), fr -4.6 <= x <= 0. ///////////////////////////////////////////////////////////////// inline float EXP (float x){ assert (x <= 0.00f); if (x < EXP_UNDERFLOW_THRESHOLD) return 0.0f; return (((0.006349841068584 * x + 0.080775412572352) * x + 0.397982026296272) * x + 0.95279335963787f) * x + 0.995176455837312f; //return (((0.00681169825657f * x + 0.08386267698832f) * x + 0.40413983195844f) * x + 0.95656674979767f) * x + 0.99556744049130f; } */ const float EXP_UNDERFLOW_THRESHOLD = -4.6; const float LOG_UNDERFLOW_THRESHOLD = 7.5; ///////////////////////////////////////////////////////////////// // LOOKUP() // // Computes log (exp (x) + 1), for 0 <= x <= 7.5. ///////////////////////////////////////////////////////////////// inline float LOOKUP (float x){ assert (x >= 0.00f); assert (x <= LOG_UNDERFLOW_THRESHOLD); //return ((-0.00653779113685f * x + 0.09537236626558f) * x + 0.55317574459331f) * x + 0.68672959851568f; if (x <= 1.00f) return ((-0.009350833524763f * x + 0.130659527668286f) * x + 0.498799810682272f) * x + 0.693203116424741f; if (x <= 2.50f) return ((-0.014532321752540f * x + 0.139942324101744f) * x + 0.495635523139337f) * x + 0.692140569840976f; if (x <= 4.50f) return ((-0.004605031767994f * x + 0.063427417320019f) * x + 0.695956496475118f) * x + 0.514272634594009f; assert (x <= LOG_UNDERFLOW_THRESHOLD); return ((-0.000458661602210f * x + 0.009695946122598f) * x + 0.930734667215156f) * x + 0.168037164329057f; //return (((0.00089738532761f * x - 0.01859488697982f) * x + 0.14415772028626f) * x + 0.49515490689159f) * x + 0.69311928966454f; } ///////////////////////////////////////////////////////////////// // LOOKUP_SLOW() // // Computes log (exp (x) + 1). ///////////////////////////////////////////////////////////////// inline float LOOKUP_SLOW (float x){ return log (exp (x) + 1); } ///////////////////////////////////////////////////////////////// // MAX() // // Compute max of three numbers ///////////////////////////////////////////////////////////////// inline float MAX (float x, float y, float z){ if (x >= y){ if (x >= z) return x; return z; } if (y >= z) return y; return z; } ///////////////////////////////////////////////////////////////// // LOG_PLUS_EQUALS() // // Add two log probabilities and store in the first argument ///////////////////////////////////////////////////////////////// inline void LOG_PLUS_EQUALS (float &x, float y){ if (x < y) x = (x == LOG_ZERO || y - x >= LOG_UNDERFLOW_THRESHOLD) ? y : LOOKUP(y-x) + x; else x = (y == LOG_ZERO || x - y >= LOG_UNDERFLOW_THRESHOLD) ? x : LOOKUP(x-y) + y; } ///////////////////////////////////////////////////////////////// // LOG_PLUS_EQUALS_SLOW() // // Add two log probabilities and store in the first argument ///////////////////////////////////////////////////////////////// inline void LOG_PLUS_EQUALS_SLOW (float &x, float y){ if (x < y) x = (x == LOG_ZERO) ? y : LOOKUP_SLOW(y-x) + x; else x = (y == LOG_ZERO) ? x : LOOKUP_SLOW(x-y) + y; } ///////////////////////////////////////////////////////////////// // LOG_ADD() // // Add two log probabilities ///////////////////////////////////////////////////////////////// inline float LOG_ADD (float x, float y){ if (x < y) return (x == LOG_ZERO || y - x >= LOG_UNDERFLOW_THRESHOLD) ? y : LOOKUP(y-x) + x; return (y == LOG_ZERO || x - y >= LOG_UNDERFLOW_THRESHOLD) ? x : LOOKUP(x-y) + y; } ///////////////////////////////////////////////////////////////// // LOG_ADD() // // Add three log probabilities ///////////////////////////////////////////////////////////////// inline float LOG_ADD (float x1, float x2, float x3){ return LOG_ADD (x1, LOG_ADD (x2, x3)); } ///////////////////////////////////////////////////////////////// // LOG_ADD() // // Add four log probabilities ///////////////////////////////////////////////////////////////// inline float LOG_ADD (float x1, float x2, float x3, float x4){ return LOG_ADD (x1, LOG_ADD (x2, LOG_ADD (x3, x4))); } ///////////////////////////////////////////////////////////////// // LOG_ADD() // // Add five log probabilities ///////////////////////////////////////////////////////////////// inline float LOG_ADD (float x1, float x2, float x3, float x4, float x5){ return LOG_ADD (x1, LOG_ADD (x2, LOG_ADD (x3, LOG_ADD (x4, x5)))); } ///////////////////////////////////////////////////////////////// // LOG_ADD() // // Add siz log probabilities ///////////////////////////////////////////////////////////////// inline float LOG_ADD (float x1, float x2, float x3, float x4, float x5, float x6){ return LOG_ADD (x1, LOG_ADD (x2, LOG_ADD (x3, LOG_ADD (x4, LOG_ADD (x5, x6))))); } ///////////////////////////////////////////////////////////////// // LOG_ADD() // // Add seven log probabilities ///////////////////////////////////////////////////////////////// inline float LOG_ADD (float x1, float x2, float x3, float x4, float x5, float x6, float x7){ return LOG_ADD (x1, LOG_ADD (x2, LOG_ADD (x3, LOG_ADD (x4, LOG_ADD (x5, LOG_ADD (x6, x7)))))); } ///////////////////////////////////////////////////////////////// // ChooseBestOfThree() // // Store the largest of three values x1, x2, and x3 in *x. Also // if xi is the largest value, then store bi in *b. ///////////////////////////////////////////////////////////////// inline void ChooseBestOfThree (float x1, float x2, float x3, char b1, char b2, char b3, float *x, char *b){ if (x1 >= x2){ if (x1 >= x3){ *x = x1; *b = b1; return; } *x = x3; *b = b3; return; } if (x2 >= x3){ *x = x2; *b = b2; return; } *x = x3; *b = b3; } #endif probcons/Sequence.h0000644000175000017500000003067310115263265015034 0ustar chuongdochuongdo///////////////////////////////////////////////////////////////// // Sequence.h // // Class for reading/manipulating single sequence character data. ///////////////////////////////////////////////////////////////// #ifndef SEQUENCE_H #define SEQUENCE_H #include #include #include #include #include #include "SafeVector.h" #include "FileBuffer.h" ///////////////////////////////////////////////////////////////// // Sequence // // Class for storing sequence information. ///////////////////////////////////////////////////////////////// class Sequence { bool isValid; // a boolean indicating whether the sequence data is valid or not string header; // string containing the comment line of the FASTA file SafeVector *data; // pointer to character data int length; // length of the sequence int sequenceLabel; // integer sequence label, typically to indicate the ordering of sequences // in a Multi-FASTA file int inputLabel; // position of sequence in original input ///////////////////////////////////////////////////////////////// // Sequence::Sequence() // // Default constructor. Does nothing. ///////////////////////////////////////////////////////////////// Sequence () : isValid (false), header (""), data (NULL), length (0), sequenceLabel (0), inputLabel (0) {} public: ///////////////////////////////////////////////////////////////// // Sequence::Sequence() // // Constructor. Reads the sequence from a FileBuffer. ///////////////////////////////////////////////////////////////// Sequence (FileBuffer &infile, bool stripGaps = false) : isValid (false), header ("~"), data (NULL), length(0), sequenceLabel (0), inputLabel (0) { // read until the first non-blank line while (!infile.eof()){ infile.GetLine (header); if (header.length() != 0) break; } // check to make sure that it is a correct header line if (header[0] == '>'){ // if so, remove the leading ">" header = header.substr (1); // remove any leading or trailing white space in the header comment while (header.length() > 0 && isspace (header[0])) header = header.substr (1); while (header.length() > 0 && isspace (header[header.length() - 1])) header = header.substr(0, header.length() - 1); // get ready to read the data[] array; note that data[0] is always '@' char ch; data = new SafeVector; assert (data); data->push_back ('@'); // get a character from the file while (infile.Get(ch)){ // if we've reached a new comment line, put the character back and stop if (ch == '>'){ infile.UnGet(); break; } // skip whitespace if (isspace (ch)) continue; // substitute gap character if (ch == '.') ch = '-'; if (stripGaps && ch == '-') continue; // check for known characters if (!((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '*' || ch == '-')){ cerr << "ERROR: Unknown character encountered: " << ch << endl; exit (1); } // everything's ok so far, so just store this character. data->push_back(ch); ++length; } // sequence must contain data in order to be valid isValid = length > 0; if (!isValid){ delete data; data = NULL; } } } ///////////////////////////////////////////////////////////////// // Sequence::Sequence() // // Constructor. Builds a sequence from existing data. Note // that the data must use one-based indexing where data[0] should // be set to '@'. ///////////////////////////////////////////////////////////////// Sequence (SafeVector *data, string header, int length, int sequenceLabel, int inputLabel) : isValid (data != NULL), header(header), data(data), length (length), sequenceLabel (sequenceLabel), inputLabel (inputLabel) { assert (data); assert ((*data)[0] == '@'); } ///////////////////////////////////////////////////////////////// // Sequence::Sequence() // // Destructor. Release allocated memory. ///////////////////////////////////////////////////////////////// ~Sequence (){ if (data){ assert (isValid); delete data; data = NULL; isValid = false; } } ///////////////////////////////////////////////////////////////// // Sequence::GetHeader() // // Return the string comment associated with this sequence. ///////////////////////////////////////////////////////////////// string GetHeader () const { return header; } ///////////////////////////////////////////////////////////////// // Sequence::GetName() // // Return the first word of the string comment associated with this sequence. ///////////////////////////////////////////////////////////////// string GetName () const { char name[1024]; sscanf (header.c_str(), "%s", name); return string(name); } ///////////////////////////////////////////////////////////////// // Sequence::GetDataPtr() // // Return the iterator to data associated with this sequence. ///////////////////////////////////////////////////////////////// SafeVector::iterator GetDataPtr(){ assert (isValid); assert (data); return data->begin(); } ///////////////////////////////////////////////////////////////// // Sequence::GetPosition() // // Return the character at position i. Recall that the character // data is stored with one-based indexing. ///////////////////////////////////////////////////////////////// char GetPosition (int i) const { assert (isValid); assert (data); assert (i >= 1 && i <= length); return (*data)[i]; } ///////////////////////////////////////////////////////////////// // Sequence::SetLabel() // // Sets the sequence label to i. ///////////////////////////////////////////////////////////////// void SetLabel (int i){ assert (isValid); sequenceLabel = i; inputLabel = i; } ///////////////////////////////////////////////////////////////// // Sequence::SetSortLabel() // // Sets the sequence sorting label to i. ///////////////////////////////////////////////////////////////// void SetSortLabel (int i){ assert (isValid); sequenceLabel = i; } ///////////////////////////////////////////////////////////////// // Sequence::GetLabel() // // Retrieves the input label. ///////////////////////////////////////////////////////////////// int GetLabel () const { assert (isValid); return inputLabel; } ///////////////////////////////////////////////////////////////// // Sequence::GetSortLabel() // // Retrieves the sorting label. ///////////////////////////////////////////////////////////////// int GetSortLabel () const { assert (isValid); return sequenceLabel; } ///////////////////////////////////////////////////////////////// // Sequence::Fail() // // Checks to see if the sequence successfully loaded. ///////////////////////////////////////////////////////////////// bool Fail () const { return !isValid; } ///////////////////////////////////////////////////////////////// // Sequence::Length() // // Returns the length of the sequence. ///////////////////////////////////////////////////////////////// int GetLength () const { assert (isValid); assert (data); return length; } ///////////////////////////////////////////////////////////////// // Sequence::WriteMFA() // // Writes the sequence to outfile in MFA format. Uses numColumns // columns per line. If useIndex is set to false, then the // header is printed as normal, but if useIndex is true, then // ">S###" is printed where ### represents the sequence label. ///////////////////////////////////////////////////////////////// void WriteMFA (ostream &outfile, int numColumns, bool useIndex = false) const { assert (isValid); assert (data); assert (!outfile.fail()); // print out heading if (useIndex) outfile << ">S" << GetLabel() << endl; else outfile << ">" << header << endl; // print out character data int ct = 1; for (; ct <= length; ct++){ outfile << (*data)[ct]; if (ct % numColumns == 0) outfile << endl; } if ((ct-1) % numColumns != 0) outfile << endl; } ///////////////////////////////////////////////////////////////// // Sequence::Clone() // // Returns a new deep copy of the seqeuence. ///////////////////////////////////////////////////////////////// Sequence *Clone () const { Sequence *ret = new Sequence(); assert (ret); ret->isValid = isValid; ret->header = header; ret->data = new SafeVector; assert (ret->data); *(ret->data) = *data; ret->length = length; ret->sequenceLabel = sequenceLabel; ret->inputLabel = inputLabel; return ret; } ///////////////////////////////////////////////////////////////// // Sequence::GetRange() // // Returns a new sequence object consisting of a range of // characters from the current seuquence. ///////////////////////////////////////////////////////////////// Sequence *GetRange (int start, int end) const { Sequence *ret = new Sequence(); assert (ret); assert (start >= 1 && start <= length); assert (end >= 1 && end <= length); assert (start <= end); ret->isValid = isValid; ret->header = header; ret->data = new SafeVector; assert (ret->data); ret->data->push_back ('@'); for (int i = start; i <= end; i++) ret->data->push_back ((*data)[i]); ret->length = end - start + 1; ret->sequenceLabel = sequenceLabel; ret->inputLabel = inputLabel; return ret; } ///////////////////////////////////////////////////////////////// // Sequence::AddGaps() // // Given an SafeVector containing the skeleton for an // alignment and the identity of the current character, this // routine will create a new sequence with all necesssary gaps added. // For instance, // alignment = "XXXBBYYYBBYYXX" // id = 'X' // will perform the transformation // "ATGCAGTCA" --> "ATGCC---GT--CA" // (XXXBBYYYBBYYXX) ///////////////////////////////////////////////////////////////// Sequence *AddGaps (SafeVector *alignment, char id){ Sequence *ret = new Sequence(); assert (ret); ret->isValid = isValid; ret->header = header; ret->data = new SafeVector; assert (ret->data); ret->length = (int) alignment->size(); ret->sequenceLabel = sequenceLabel; ret->inputLabel = inputLabel; ret->data->push_back ('@'); SafeVector::iterator dataIter = data->begin() + 1; for (SafeVector::iterator iter = alignment->begin(); iter != alignment->end(); ++iter){ if (*iter == 'B' || *iter == id){ ret->data->push_back (*dataIter); ++dataIter; } else ret->data->push_back ('-'); } return ret; } ///////////////////////////////////////////////////////////////// // Sequence::GetString() // // Returns the sequence as a string with gaps removed. ///////////////////////////////////////////////////////////////// string GetString (){ string s = ""; for (int i = 1; i <= length; i++){ if ((*data)[i] != '-') s += (*data)[i]; } return s; } ///////////////////////////////////////////////////////////////// // Sequence::GetMapping() // // Returns a SafeVector containing the indices of every // character in the sequence. For instance, if the data is // "ATGCC---GT--CA", the method returns {1,2,3,4,5,9,10,13,14}. ///////////////////////////////////////////////////////////////// SafeVector *GetMapping () const { SafeVector *ret = new SafeVector(1, 0); for (int i = 1; i <= length; i++){ if ((*data)[i] != '-') ret->push_back (i); } return ret; } ///////////////////////////////////////////////////////////////// // Sequence::Highlight() // // Changes all positions with score >= cutoff to upper case and // all positions with score < cutoff to lower case. ///////////////////////////////////////////////////////////////// void Highlight (const SafeVector &scores, const float cutoff){ for (int i = 1; i <= length; i++){ if (scores[i-1] >= cutoff) (*data)[i] = toupper ((*data)[i]); else (*data)[i] = tolower ((*data)[i]); } } }; #endif probcons/SparseMatrix.h0000644000175000017500000001771610217637711015715 0ustar chuongdochuongdo///////////////////////////////////////////////////////////////// // SparseMatrix.h // // Sparse matrix computations ///////////////////////////////////////////////////////////////// #ifndef SPARSEMATRIX_H #define SPARSEMATRIX_H #include using namespace std; const float POSTERIOR_CUTOFF = 0.01; // minimum posterior probability // value that is maintained in the // sparse matrix representation typedef pair PIF; // Sparse matrix entry type // first --> column // second --> value ///////////////////////////////////////////////////////////////// // SparseMatrix // // Class for sparse matrix computations ///////////////////////////////////////////////////////////////// class SparseMatrix { int seq1Length, seq2Length; // dimensions of matrix VI rowSize; // rowSize[i] = # of cells in row i SafeVector data; // data values SafeVector::iterator> rowPtrs; // pointers to the beginning of each row ///////////////////////////////////////////////////////////////// // SparseMatrix::SparseMatrix() // // Private constructor. ///////////////////////////////////////////////////////////////// SparseMatrix (){} public: ///////////////////////////////////////////////////////////////// // SparseMatrix::SparseMatrix() // // Constructor. Builds a sparse matrix from a posterior matrix. // Note that the expected format for the posterior matrix is as // a (seq1Length+1) x (seq2Length+1) matrix where the 0th row // and 0th column are ignored (they should contain all zeroes). ///////////////////////////////////////////////////////////////// SparseMatrix (int seq1Length, int seq2Length, const VF &posterior) : seq1Length (seq1Length), seq2Length (seq2Length) { int numCells = 0; assert (seq1Length > 0); assert (seq2Length > 0); // calculate memory required; count the number of cells in the // posterior matrix above the threshold VF::const_iterator postPtr = posterior.begin(); for (int i = 0; i <= seq1Length; i++){ for (int j = 0; j <= seq2Length; j++){ if (*(postPtr++) >= POSTERIOR_CUTOFF){ assert (i != 0 && j != 0); numCells++; } } } // allocate memory data.resize(numCells); rowSize.resize (seq1Length + 1); rowSize[0] = -1; rowPtrs.resize (seq1Length + 1); rowPtrs[0] = data.end(); // build sparse matrix postPtr = posterior.begin() + seq2Length + 1; // note that we're skipping the first row here SafeVector::iterator dataPtr = data.begin(); for (int i = 1; i <= seq1Length; i++){ postPtr++; // and skipping the first column of each row rowPtrs[i] = dataPtr; for (int j = 1; j <= seq2Length; j++){ if (*postPtr >= POSTERIOR_CUTOFF){ dataPtr->first = j; dataPtr->second = *postPtr; dataPtr++; } postPtr++; } rowSize[i] = dataPtr - rowPtrs[i]; } } ///////////////////////////////////////////////////////////////// // SparseMatrix::GetRowPtr() // // Returns the pointer to a particular row in the sparse matrix. ///////////////////////////////////////////////////////////////// SafeVector::iterator GetRowPtr (int row) const { assert (row >= 1 && row <= seq1Length); return rowPtrs[row]; } ///////////////////////////////////////////////////////////////// // SparseMatrix::GetValue() // // Returns value at a particular row, column. ///////////////////////////////////////////////////////////////// float GetValue (int row, int col){ assert (row >= 1 && row <= seq1Length); assert (col >= 1 && col <= seq2Length); for (int i = 0; i < rowSize[row]; i++){ if (rowPtrs[row][i].first == col) return rowPtrs[row][i].second; } return 0; } ///////////////////////////////////////////////////////////////// // SparseMatrix::GetRowSize() // // Returns the number of entries in a particular row. ///////////////////////////////////////////////////////////////// int GetRowSize (int row) const { assert (row >= 1 && row <= seq1Length); return rowSize[row]; } ///////////////////////////////////////////////////////////////// // SparseMatrix::GetSeq1Length() // // Returns the first dimension of the matrix. ///////////////////////////////////////////////////////////////// int GetSeq1Length () const { return seq1Length; } ///////////////////////////////////////////////////////////////// // SparseMatrix::GetSeq2Length() // // Returns the second dimension of the matrix. ///////////////////////////////////////////////////////////////// int GetSeq2Length () const { return seq2Length; } ///////////////////////////////////////////////////////////////// // SparseMatrix::GetRowPtr // // Returns the pointer to a particular row in the sparse matrix. ///////////////////////////////////////////////////////////////// int GetNumCells () const { return data.size(); } ///////////////////////////////////////////////////////////////// // SparseMatrix::Print() // // Prints out a sparse matrix. ///////////////////////////////////////////////////////////////// void Print (ostream &outfile) const { outfile << "Sparse Matrix:" << endl; for (int i = 1; i <= seq1Length; i++){ outfile << " " << i << ":"; for (int j = 0; j < rowSize[i]; j++){ outfile << " (" << rowPtrs[i][j].first << "," << rowPtrs[i][j].second << ")"; } outfile << endl; } } ///////////////////////////////////////////////////////////////// // SparseMatrix::ComputeTranspose() // // Returns a new sparse matrix containing the transpose of the // current matrix. ///////////////////////////////////////////////////////////////// SparseMatrix *ComputeTranspose () const { // create a new sparse matrix SparseMatrix *ret = new SparseMatrix(); int numCells = data.size(); ret->seq1Length = seq2Length; ret->seq2Length = seq1Length; // allocate memory ret->data.resize (numCells); ret->rowSize.resize (seq2Length + 1); ret->rowSize[0] = -1; ret->rowPtrs.resize (seq2Length + 1); ret->rowPtrs[0] = ret->data.end(); // compute row sizes for (int i = 1; i <= seq2Length; i++) ret->rowSize[i] = 0; for (int i = 0; i < numCells; i++) ret->rowSize[data[i].first]++; // compute row ptrs for (int i = 1; i <= seq2Length; i++){ ret->rowPtrs[i] = (i == 1) ? ret->data.begin() : ret->rowPtrs[i-1] + ret->rowSize[i-1]; } // now fill in data SafeVector::iterator> currPtrs = ret->rowPtrs; for (int i = 1; i <= seq1Length; i++){ SafeVector::iterator row = rowPtrs[i]; for (int j = 0; j < rowSize[i]; j++){ currPtrs[row[j].first]->first = i; currPtrs[row[j].first]->second = row[j].second; currPtrs[row[j].first]++; } } return ret; } ///////////////////////////////////////////////////////////////// // SparseMatrix::GetPosterior() // // Return the posterior representation of the sparse matrix. ///////////////////////////////////////////////////////////////// VF *GetPosterior () const { // create a new posterior matrix VF *posteriorPtr = new VF((seq1Length+1) * (seq2Length+1)); assert (posteriorPtr); VF &posterior = *posteriorPtr; // build the posterior matrix for (int i = 0; i < (seq1Length+1) * (seq2Length+1); i++) posterior[i] = 0; for (int i = 1; i <= seq1Length; i++){ VF::iterator postPtr = posterior.begin() + i * (seq2Length+1); for (int j = 0; j < rowSize[i]; j++){ postPtr[rowPtrs[i][j].first] = rowPtrs[i][j].second; } } return posteriorPtr; } }; #endif